Пример #1
0
    def __init__(self, data, archive_dir="archives", mbox=False):
        """
        Initialize an Archive object from a DataFrame or from a path.

        The behavior of the constructor depends on the type
        of its first argument, data.

        If data is a Pandas DataFrame, it is treated as a representation of
        email messages with columns for Message-ID, From, Date, In-Reply-To,
        References, and Body. The created Archive becomes a wrapper around a
        copy of the input DataFrame, so the caller's frame is not modified.

        If data is a string, it is handed to mailman.load_data together with
        archive_dir and mbox, and the frame it returns is wrapped. How the
        string is interpreted is decided by mailman.load_data; presumably
        mbox=True means a single .mbox file rather than a directory of them
        (confirm against mailman.load_data).

        Upon initialization the Date column is parsed as UTC datetimes,
        duplicate rows and rows without a Date are dropped, remaining nulls
        are replaced with None, Message-ID becomes the index (when it is
        still a column) and the rows are sorted by Date.

        Raises:
            TypeError: if data is neither a DataFrame nor a string.
        """

        if isinstance(data, pd.DataFrame):
            self.data = data.copy()
        elif isinstance(data, str):
            self.data = mailman.load_data(data,
                                          archive_dir=archive_dir,
                                          mbox=mbox)
        else:
            # Fail here with a clear message instead of later with an
            # unrelated error caused by self.data never being assigned.
            raise TypeError(
                "data must be a pandas DataFrame or a string, got %s"
                % type(data).__name__)

        self.data['Date'] = pd.to_datetime(self.data['Date'], utc=True)

        self.data.drop_duplicates(inplace=True)

        # Drops any entries with no Date field.
        # It may be wiser to optionally
        # do interpolation here.
        self.data.dropna(subset=['Date'], inplace=True)

        # convert any null fields to None -- csv saves these as nan sometimes
        self.data = self.data.where(pd.notnull(self.data), None)

        try:
            # set the index to be the Message-ID column
            self.data.set_index('Message-ID', inplace=True)
        except KeyError:
            # will get KeyError if Message-ID is already index
            pass

        # DataFrame.sort(columns=...) no longer exists in current pandas;
        # sort_values is the supported spelling of the same operation.
        self.data.sort_values(by='Date', inplace=True)
Пример #2
0
    def __init__(self, data, archive_dir="archives", mbox=False):
        """
        Initialize an Archive object from a DataFrame or from a path.

        The behavior of the constructor depends on the type
        of its first argument, data.

        If data is a Pandas DataFrame, it is treated as a representation of
        email messages with columns for Message-ID, From, Date, In-Reply-To,
        References, and Body. The created Archive becomes a wrapper around a
        copy of the input DataFrame, so the caller's frame is not modified.

        If data is a string, it is handed to mailman.load_data together with
        archive_dir and mbox, and the frame it returns is wrapped. How the
        string is interpreted is decided by mailman.load_data; presumably
        mbox=True means a single .mbox file rather than a directory of them
        (confirm against mailman.load_data).

        Upon initialization the Date column is parsed as UTC datetimes,
        duplicate rows and rows without a Date are dropped, remaining nulls
        are replaced with None, Message-ID becomes the index (when it is
        still a column) and the rows are sorted by Date.

        Raises:
            TypeError: if data is neither a DataFrame nor a string.
        """

        if isinstance(data, pd.DataFrame):
            frame = data.copy()
        elif isinstance(data, str):
            frame = mailman.load_data(data, archive_dir=archive_dir, mbox=mbox)
        else:
            # Reject unsupported inputs up front; otherwise self.data would
            # never be assigned and the failure would surface further down.
            raise TypeError(
                "data must be a pandas DataFrame or a string, got %s"
                % type(data).__name__)
        self.data = frame

        self.data['Date'] = pd.to_datetime(self.data['Date'], utc=True)

        self.data.drop_duplicates(inplace=True)

        # Drops any entries with no Date field.
        # It may be wiser to optionally
        # do interpolation here.
        self.data.dropna(subset=['Date'], inplace=True)

        # convert any null fields to None -- csv saves these as nan sometimes
        self.data = self.data.where(pd.notnull(self.data), None)

        try:
            # set the index to be the Message-ID column
            self.data.set_index('Message-ID', inplace=True)
        except KeyError:
            # will get KeyError if Message-ID is already index
            pass

        # sort_values replaces DataFrame.sort(columns=...), which is no
        # longer available in current pandas.
        self.data.sort_values(by='Date', inplace=True)
Пример #3
0
    def __init__(self, data, archive_dir=CONFIG.mail_path, mbox=False):
        """
        Initialize an Archive object from a DataFrame or from a path.

        The behavior of the constructor depends on the type
        of its first argument, data.

        If data is a Pandas DataFrame, it is treated as a representation of
        email messages with columns for Message-ID, From, Date, In-Reply-To,
        References, and Body. The created Archive becomes a wrapper around a
        copy of the input DataFrame, so the caller's frame is not modified.

        If data is a string, it is handed to mailman.load_data together with
        archive_dir and mbox, and the frame it returns is wrapped. How the
        string is interpreted is decided by mailman.load_data; presumably
        mbox=True means a single .mbox file rather than a directory of them
        (confirm against mailman.load_data).

        Upon initialization the Date column is parsed as UTC datetimes
        (unparseable values are coerced to null), duplicate rows and rows
        without a Date are dropped, remaining nulls are replaced with None,
        Message-ID becomes the index (when it is still a column), rows whose
        Date cannot be compared with the current UTC time are dropped, and
        the rows are sorted by Date. Failures while dropping duplicates or
        sorting are logged and otherwise ignored.

        Raises:
            TypeError: if data is neither a DataFrame nor a string.
            mailman.MissingDataException: if no rows remain after cleaning.
        """

        if isinstance(data, pd.DataFrame):
            self.data = data.copy()
        elif isinstance(data, str):
            self.data = mailman.load_data(data, archive_dir=archive_dir, mbox=mbox)
        else:
            # Fail here with a clear message instead of later with an
            # unrelated error caused by self.data never being assigned.
            raise TypeError(
                "data must be a pandas DataFrame or a string, got %s"
                % type(data).__name__)

        try:
            # NOTE(review): infer_datetime_format is deprecated in recent
            # pandas; kept so parsing on older versions is unchanged.
            self.data['Date'] = pd.to_datetime(self.data['Date'], errors='coerce', infer_datetime_format=True, utc=True)
        except Exception:
            #TODO: writing a CSV file was for debugging purposes, should be removed
            out_path = 'datetime-exception.csv'
            with open(out_path, 'w') as f:
                self.data.to_csv(f, encoding='utf-8')

            logging.error('Error while converting to datetime, despite coerce mode.')
            raise

        # Best effort: a failure here is logged and the data is kept as is.
        try:
            self.data.drop_duplicates(inplace=True)
        except Exception:
            logging.error('Error while removing duplicate messages, maybe timezone issues?', exc_info=True)

        # Drops any entries with no Date field.
        # It may be wiser to optionally
        # do interpolation here.
        if self.data['Date'].isnull().any():
            # boolean mask instead of dropna(subset=['Date']):
            # workaround for https://github.com/pandas-dev/pandas/issues/13407
            self.data = self.data[self.data['Date'].notnull()]

        # convert any null fields to None -- csv saves these as nan sometimes
        self.data = self.data.where(pd.notnull(self.data), None)

        try:
            # set the index to be the Message-ID column
            self.data.set_index('Message-ID', inplace=True)
        except KeyError:
            # will get KeyError if Message-ID is already index
            pass

        # let's do a pass to try to find bad tzinfo's: any Date that cannot
        # be compared against an aware "now" is considered broken.
        now_utc = datetime.datetime.now(pytz.utc)
        bad_indices = []
        for i, date in self.data['Date'].items():
            try:
                # Only whether the comparison raises matters, not its result.
                date < now_utc
            except Exception:
                logging.error('Error timezone issues while detecting bad rows', exc_info=True)
                bad_indices.append(i)
                logging.info('Bad timezone on %s', date)
        if bad_indices:
            # drop those rows that threw an error
            self.data = self.data.drop(bad_indices)
            logging.info('Dropped %d rows', len(bad_indices))

        # Best effort: a failed sort is logged and the order is left as is.
        try:
            self.data.sort_values(by='Date', inplace=True)
        except Exception:
            logging.error('Error while sorting, maybe timezone issues?', exc_info=True)

        if self.data.empty:
            raise mailman.MissingDataException('Archive after initial processing is empty. Was data collected properly?')
Пример #4
0
    def __init__(self, data, archive_dir=CONFIG.mail_path, mbox=False):
        """
        Initialize an Archive object from a DataFrame or from a path.

        The behavior of the constructor depends on the type
        of its first argument, data.

        If data is a Pandas DataFrame, it is treated as a representation of
        email messages with columns for Message-ID, From, Date, In-Reply-To,
        References, and Body. The created Archive becomes a wrapper around a
        copy of the input DataFrame, so the caller's frame is not modified.

        If data is a string, it is handed to mailman.load_data together with
        archive_dir and mbox, and the frame it returns is wrapped. How the
        string is interpreted is decided by mailman.load_data; presumably
        mbox=True means a single .mbox file rather than a directory of them
        (confirm against mailman.load_data).

        Upon initialization the Date column is parsed as UTC datetimes
        (unparseable values are coerced to null), duplicate rows and rows
        without a Date are dropped, remaining nulls are replaced with None,
        Message-ID becomes the index (when it is still a column), rows whose
        Date cannot be compared with the current UTC time are dropped, and
        the rows are sorted by Date. Failures while dropping duplicates or
        sorting are logged and otherwise ignored.

        Raises:
            TypeError: if data is neither a DataFrame nor a string.
            mailman.MissingDataException: if no rows remain after cleaning.
        """

        if isinstance(data, pd.DataFrame):
            self.data = data.copy()
        elif isinstance(data, str):
            self.data = mailman.load_data(data,
                                          archive_dir=archive_dir,
                                          mbox=mbox)
        else:
            # Reject unsupported inputs up front; otherwise self.data would
            # never be assigned and the failure would surface further down.
            raise TypeError(
                "data must be a pandas DataFrame or a string, got %s" %
                type(data).__name__)

        try:
            # NOTE(review): infer_datetime_format is deprecated in recent
            # pandas; kept so parsing on older versions is unchanged.
            self.data['Date'] = pd.to_datetime(self.data['Date'],
                                               errors='coerce',
                                               infer_datetime_format=True,
                                               utc=True)
        except Exception:
            #TODO: writing a CSV file was for debugging purposes, should be removed
            out_path = 'datetime-exception.csv'
            with open(out_path, 'w') as f:
                self.data.to_csv(f, encoding='utf-8')

            logging.error(
                'Error while converting to datetime, despite coerce mode.')
            raise

        # Best effort: a failure here is logged and the data is kept as is.
        try:
            self.data.drop_duplicates(inplace=True)
        except Exception:
            logging.error(
                'Error while removing duplicate messages, maybe timezone issues?',
                exc_info=True)

        # Drops any entries with no Date field.
        # It may be wiser to optionally
        # do interpolation here.
        has_date = self.data['Date'].notnull()
        if not has_date.all():
            # boolean mask instead of dropna(subset=['Date']):
            # workaround for https://github.com/pandas-dev/pandas/issues/13407
            self.data = self.data[has_date]

        # convert any null fields to None -- csv saves these as nan sometimes
        self.data = self.data.where(pd.notnull(self.data), None)

        try:
            # set the index to be the Message-ID column
            self.data.set_index('Message-ID', inplace=True)
        except KeyError:
            # will get KeyError if Message-ID is already index
            pass

        # let's do a pass to try to find bad tzinfo's: any Date that cannot
        # be compared against an aware "now" is considered broken.
        now_utc = datetime.datetime.now(pytz.utc)
        bad_indices = []
        for i, date in self.data['Date'].items():
            try:
                # Only whether the comparison raises matters, not its result.
                date < now_utc
            except Exception:
                logging.error('Error timezone issues while detecting bad rows',
                              exc_info=True)
                bad_indices.append(i)
                logging.info('Bad timezone on %s', date)
        if bad_indices:
            # drop those rows that threw an error
            self.data = self.data.drop(bad_indices)
            logging.info('Dropped %d rows', len(bad_indices))

        # Best effort: a failed sort is logged and the order is left as is.
        try:
            self.data.sort_values(by='Date', inplace=True)
        except Exception:
            logging.error('Error while sorting, maybe timezone issues?',
                          exc_info=True)

        if self.data.empty:
            raise mailman.MissingDataException(
                'Archive after initial processing is empty. Was data collected properly?'
            )
Пример #5
0
    def __init__(self, data, archive_dir="archives", mbox=False):
        """
        Initialize an Archive object from a DataFrame or from a path.

        The behavior of the constructor depends on the type
        of its first argument, data.

        If data is a Pandas DataFrame, it is treated as a representation of
        email messages with columns for Message-ID, From, Date, In-Reply-To,
        References, and Body. The created Archive becomes a wrapper around a
        copy of the input DataFrame, so the caller's frame is not modified.

        If data is a string, it is handed to mailman.load_data together with
        archive_dir and mbox, and the frame it returns is wrapped. How the
        string is interpreted is decided by mailman.load_data; presumably
        mbox=True means a single .mbox file rather than a directory of them
        (confirm against mailman.load_data).

        Upon initialization the Date column is parsed as UTC datetimes
        (unparseable values are coerced to null), duplicate rows and rows
        without a Date are dropped, remaining nulls are replaced with None,
        Message-ID becomes the index (when it is still a column) and the
        rows are sorted by Date. A failure while dropping duplicates is
        logged and otherwise ignored.

        Raises:
            TypeError: if data is neither a DataFrame nor a string.
        """

        if isinstance(data, pd.DataFrame):
            self.data = data.copy()
        elif isinstance(data, str):
            self.data = mailman.load_data(data, archive_dir=archive_dir, mbox=mbox)
        else:
            # Fail here with a clear message instead of later with an
            # unrelated error caused by self.data never being assigned.
            raise TypeError(
                "data must be a pandas DataFrame or a string, got %s"
                % type(data).__name__)

        try:
            # NOTE(review): infer_datetime_format is deprecated in recent
            # pandas; kept so parsing on older versions is unchanged.
            self.data['Date'] = pd.to_datetime(self.data['Date'], errors='coerce', infer_datetime_format=True, utc=True)
        except Exception:
            # Dump the offending frame next to the process for post-mortem
            # inspection, then let the original error propagate.
            out_path = 'datetime-exception.csv'

            with open(out_path, 'w') as f:
                self.data.to_csv(f, encoding='utf-8')

            logging.error('Error while converting to datetime, despite coerce mode.')
            raise

        # Best effort: a failure here is logged and the data is kept as is.
        try:
            self.data.drop_duplicates(inplace=True)
        except Exception:
            logging.error('Error while removing duplicate messages, maybe timezone issues?', exc_info=True)

        # Drops any entries with no Date field.
        # It may be wiser to optionally
        # do interpolation here.
        if self.data['Date'].isnull().any():
            # boolean mask instead of dropna(subset=['Date']):
            # workaround for https://github.com/pandas-dev/pandas/issues/13407
            self.data = self.data[self.data['Date'].notnull()]

        # convert any null fields to None -- csv saves these as nan sometimes
        self.data = self.data.where(pd.notnull(self.data), None)

        try:
            # set the index to be the Message-ID column
            self.data.set_index('Message-ID', inplace=True)
        except KeyError:
            # will get KeyError if Message-ID is already index
            pass

        self.data.sort_values(by='Date', inplace=True)