def __init__(self, data, archive_dir="archives", mbox=False):
    """
    Initialize an Archive object.

    The behavior of the constructor depends on the type of its first
    argument, data.

    If data is a Pandas DataFrame, it is treated as a representation of
    email messages with columns for Message-ID, From, Date, In-Reply-To,
    References, and Body. The created Archive becomes a wrapper around a
    copy of the input DataFrame.

    If data is a string, it is interpreted as a path to either a single
    .mbox file (if the optional argument mbox is True) or else to a
    directory of .mbox files. Note that the file extensions need not be
    .mbox; frequently they will be .txt.

    Upon initialization, the Archive object drops duplicate entries
    and sorts its member variable *data* by Date.
    """
    if isinstance(data, pd.core.frame.DataFrame):
        self.data = data.copy()
    elif isinstance(data, str):
        self.data = mailman.load_data(data, archive_dir=archive_dir, mbox=mbox)

    # Normalize all dates to timezone-aware UTC so they sort consistently.
    self.data['Date'] = pd.to_datetime(self.data['Date'], utc=True)
    self.data.drop_duplicates(inplace=True)

    # Drops any entries with no Date field.
    # It may be wiser to optionally do interpolation here.
    self.data.dropna(subset=['Date'], inplace=True)

    # Convert any null fields to None -- csv saves these as nan sometimes.
    self.data = self.data.where(pd.notnull(self.data), None)

    try:
        # Set the index to be the Message-ID column.
        self.data.set_index('Message-ID', inplace=True)
    except KeyError:
        # Will get KeyError if Message-ID is already the index.
        pass

    # BUG FIX: DataFrame.sort(columns=...) was deprecated in pandas 0.17
    # and removed in 0.20; sort_values(by=...) is the supported equivalent.
    self.data.sort_values(by='Date', inplace=True)
def __init__(self, data, archive_dir="archives", mbox=False):
    """
    Initialize an Archive object.

    The behavior of the constructor depends on the type of its first
    argument, data.

    If data is a Pandas DataFrame, it is treated as a representation of
    email messages with columns for Message-ID, From, Date, In-Reply-To,
    References, and Body. The created Archive becomes a wrapper around a
    copy of the input DataFrame.

    If data is a string, it is interpreted as a path to either a single
    .mbox file (if the optional argument mbox is True) or else to a
    directory of .mbox files. Note that the file extensions need not be
    .mbox; frequently they will be .txt.

    Upon initialization, the Archive object drops duplicate entries
    and sorts its member variable *data* by Date.
    """
    if isinstance(data, pd.core.frame.DataFrame):
        self.data = data.copy()
    elif isinstance(data, str):
        self.data = mailman.load_data(
            data, archive_dir=archive_dir, mbox=mbox)

    # Parse dates as timezone-aware UTC for consistent comparison/sorting.
    self.data['Date'] = pd.to_datetime(self.data['Date'], utc=True)
    self.data.drop_duplicates(inplace=True)

    # Drops any entries with no Date field.
    # It may be wiser to optionally do interpolation here.
    self.data.dropna(subset=['Date'], inplace=True)

    # Convert any null fields to None -- csv saves these as nan sometimes.
    self.data = self.data.where(pd.notnull(self.data), None)

    try:
        # Set the index to be the Message-ID column.
        self.data.set_index('Message-ID', inplace=True)
    except KeyError:
        # Will get KeyError if Message-ID is already the index.
        pass

    # BUG FIX: DataFrame.sort(columns=...) was removed in pandas 0.20;
    # sort_values(by=...) is the supported replacement.
    self.data.sort_values(by='Date', inplace=True)
def __init__(self, data, archive_dir=CONFIG.mail_path, mbox=False):
    """
    Initialize an Archive object.

    The behavior of the constructor depends on the type of its first
    argument, data.

    If data is a Pandas DataFrame, it is treated as a representation of
    email messages with columns for Message-ID, From, Date, In-Reply-To,
    References, and Body. The created Archive becomes a wrapper around a
    copy of the input DataFrame.

    If data is a string, it is interpreted as a path to either a single
    .mbox file (if the optional argument mbox is True) or else to a
    directory of .mbox files. Note that the file extensions need not be
    .mbox; frequently they will be .txt.

    Upon initialization, the Archive object drops duplicate entries
    and sorts its member variable *data* by Date.

    Raises mailman.MissingDataException if the archive is empty after
    initial processing.
    """
    if isinstance(data, pd.core.frame.DataFrame):
        self.data = data.copy()
    elif isinstance(data, str):
        self.data = mailman.load_data(data, archive_dir=archive_dir, mbox=mbox)

    try:
        # errors='coerce' turns unparseable dates into NaT rather than raising.
        self.data['Date'] = pd.to_datetime(self.data['Date'],
                                           errors='coerce',
                                           infer_datetime_format=True,
                                           utc=True)
    # FIX: was a bare `except:`, which also swallows KeyboardInterrupt and
    # SystemExit; narrowed to Exception (the failure is still re-raised).
    except Exception:
        # TODO: writing a CSV file was for debugging purposes, should be removed
        out_path = 'datetime-exception.csv'
        with open(out_path, 'w') as f:
            self.data.to_csv(f, encoding='utf-8')
        logging.error('Error while converting to datetime, despite coerce mode.')
        raise

    try:
        self.data.drop_duplicates(inplace=True)
    except Exception:  # FIX: narrowed from bare except
        logging.error('Error while removing duplicate messages, maybe timezone issues?',
                      exc_info=True)

    # Drops any entries with no Date field.
    # It may be wiser to optionally do interpolation here.
    if self.data['Date'].isnull().any():
        # Boolean-mask filter instead of dropna as a
        # workaround for https://github.com/pandas-dev/pandas/issues/13407
        self.data = self.data[self.data['Date'].notnull()]

    # Convert any null fields to None -- csv saves these as nan sometimes.
    self.data = self.data.where(pd.notnull(self.data), None)

    try:
        # Set the index to be the Message-ID column.
        self.data.set_index('Message-ID', inplace=True)
    except KeyError:
        # Will get KeyError if Message-ID is already the index.
        pass

    # Do a pass to find rows whose Date has a tzinfo that breaks comparison
    # with an aware datetime; such rows are dropped.
    bad_indices = []
    # FIX: hoist the loop-invariant "now" out of the per-row loop.
    now_utc = datetime.datetime.now(pytz.utc)
    for i, row in self.data.iterrows():
        try:
            # Comparison only probes for tz errors; result is discarded
            # (FIX: removed unused `future_comparison` local).
            row['Date'] < now_utc
        except Exception:  # FIX: narrowed from bare except
            logging.error('Error timezone issues while detecting bad rows',
                          exc_info=True)
            bad_indices.append(i)
            logging.info('Bad timezone on %s', row['Date'])
    if len(bad_indices) > 0:
        # Drop those rows that threw an error.
        self.data = self.data.drop(bad_indices)
        logging.info('Dropped %d rows', len(bad_indices))

    try:
        self.data.sort_values(by='Date', inplace=True)
    except Exception:  # FIX: narrowed from bare except
        logging.error('Error while sorting, maybe timezone issues?', exc_info=True)

    if self.data.empty:
        raise mailman.MissingDataException(
            'Archive after initial processing is empty. Was data collected properly?')
def __init__(self, data, archive_dir=CONFIG.mail_path, mbox=False):
    """
    Initialize an Archive object.

    The behavior of the constructor depends on the type of its first
    argument, data.

    If data is a Pandas DataFrame, it is treated as a representation of
    email messages with columns for Message-ID, From, Date, In-Reply-To,
    References, and Body. The created Archive becomes a wrapper around a
    copy of the input DataFrame.

    If data is a string, it is interpreted as a path to either a single
    .mbox file (if the optional argument mbox is True) or else to a
    directory of .mbox files. Note that the file extensions need not be
    .mbox; frequently they will be .txt.

    Upon initialization, the Archive object drops duplicate entries
    and sorts its member variable *data* by Date.

    Raises mailman.MissingDataException if the archive is empty after
    initial processing.
    """
    if isinstance(data, pd.core.frame.DataFrame):
        self.data = data.copy()
    elif isinstance(data, str):
        self.data = mailman.load_data(
            data, archive_dir=archive_dir, mbox=mbox)

    try:
        # errors='coerce' maps unparseable dates to NaT instead of raising.
        self.data['Date'] = pd.to_datetime(
            self.data['Date'],
            errors='coerce',
            infer_datetime_format=True,
            utc=True)
    # FIX: replaced bare `except:` (which also catches KeyboardInterrupt /
    # SystemExit) with `except Exception:`; the error is still re-raised.
    except Exception:
        # TODO: writing a CSV file was for debugging purposes, should be removed
        out_path = 'datetime-exception.csv'
        with open(out_path, 'w') as f:
            self.data.to_csv(f, encoding='utf-8')
        logging.error(
            'Error while converting to datetime, despite coerce mode.')
        raise

    try:
        self.data.drop_duplicates(inplace=True)
    except Exception:  # FIX: narrowed from bare except
        logging.error(
            'Error while removing duplicate messages, maybe timezone issues?',
            exc_info=True)

    # Drops any entries with no Date field.
    # It may be wiser to optionally do interpolation here.
    if self.data['Date'].isnull().any():
        # Filter with a boolean mask rather than dropna as a
        # workaround for https://github.com/pandas-dev/pandas/issues/13407
        self.data = self.data[self.data['Date'].notnull()]

    # Convert any null fields to None -- csv saves these as nan sometimes.
    self.data = self.data.where(pd.notnull(self.data), None)

    try:
        # Set the index to be the Message-ID column.
        self.data.set_index('Message-ID', inplace=True)
    except KeyError:
        # Will get KeyError if Message-ID is already the index.
        pass

    # Scan for rows whose Date carries a tzinfo that cannot be compared
    # against an aware datetime; collect and drop them.
    bad_indices = []
    # FIX: compute the reference "now" once instead of once per row.
    now_utc = datetime.datetime.now(pytz.utc)
    for i, row in self.data.iterrows():
        try:
            # The comparison is only a probe for tz errors; its value is
            # discarded (FIX: removed unused `future_comparison` local).
            row['Date'] < now_utc
        except Exception:  # FIX: narrowed from bare except
            logging.error(
                'Error timezone issues while detecting bad rows',
                exc_info=True)
            bad_indices.append(i)
            logging.info('Bad timezone on %s', row['Date'])
    if len(bad_indices) > 0:
        # Drop those rows that threw an error.
        self.data = self.data.drop(bad_indices)
        logging.info('Dropped %d rows', len(bad_indices))

    try:
        self.data.sort_values(by='Date', inplace=True)
    except Exception:  # FIX: narrowed from bare except
        logging.error('Error while sorting, maybe timezone issues?',
                      exc_info=True)

    if self.data.empty:
        raise mailman.MissingDataException(
            'Archive after initial processing is empty. Was data collected properly?'
        )
def __init__(self, data, archive_dir="archives", mbox=False):
    """
    Initialize an Archive object.

    The behavior of the constructor depends on the type of its first
    argument, data.

    If data is a Pandas DataFrame, it is treated as a representation of
    email messages with columns for Message-ID, From, Date, In-Reply-To,
    References, and Body. The created Archive becomes a wrapper around a
    copy of the input DataFrame.

    If data is a string, it is interpreted as a path to either a single
    .mbox file (if the optional argument mbox is True) or else to a
    directory of .mbox files. Note that the file extensions need not be
    .mbox; frequently they will be .txt.

    Upon initialization, the Archive object drops duplicate entries
    and sorts its member variable *data* by Date.
    """
    if isinstance(data, pd.core.frame.DataFrame):
        self.data = data.copy()
    elif isinstance(data, str):
        self.data = mailman.load_data(data, archive_dir=archive_dir, mbox=mbox)

    try:
        # errors='coerce' converts unparseable dates to NaT instead of raising.
        self.data['Date'] = pd.to_datetime(self.data['Date'],
                                           errors='coerce',
                                           infer_datetime_format=True,
                                           utc=True)
    # FIX: was a bare `except:`, which also swallows KeyboardInterrupt and
    # SystemExit; narrowed to Exception (failure is still re-raised).
    except Exception:
        # Debug dump of the offending frame before re-raising.
        out_path = 'datetime-exception.csv'
        with open(out_path, 'w') as f:
            self.data.to_csv(f, encoding='utf-8')
        logging.error('Error while converting to datetime, despite coerce mode.')
        raise

    try:
        self.data.drop_duplicates(inplace=True)
    except Exception:  # FIX: narrowed from bare except
        logging.error('Error while removing duplicate messages, maybe timezone issues?',
                      exc_info=True)

    # Drops any entries with no Date field.
    # It may be wiser to optionally do interpolation here.
    if self.data['Date'].isnull().any():
        # Boolean-mask filter instead of dropna as a
        # workaround for https://github.com/pandas-dev/pandas/issues/13407
        self.data = self.data[self.data['Date'].notnull()]

    # Convert any null fields to None -- csv saves these as nan sometimes.
    self.data = self.data.where(pd.notnull(self.data), None)

    try:
        # Set the index to be the Message-ID column.
        self.data.set_index('Message-ID', inplace=True)
    except KeyError:
        # Will get KeyError if Message-ID is already the index.
        pass

    self.data.sort_values(by='Date', inplace=True)