def ztest_few_days_syncer_with_deletion(self): #pylint:disable-msg=C0103
    """
    Check that a sync run with db_cleaning removes a stale local email.

    A fixture email (data + metadata) is copied into the 2011-11 storage
    dir, the 1-Nov-2011 window is synced with db_cleaning=True, and the
    test asserts that the fixture files are gone while the files of
    email 1384313269332005293 are present.
    """
    db_dir = '/tmp/gmail_bk'
    fixture_dir = '../etc/tests/test_few_days_syncer'
    stale_files = ('2384403887202624608.eml.gz', '2384403887202624608.meta')
    expected_files = ('1384313269332005293.meta', '1384313269332005293.eml.gz')

    # start from an empty db dir
    delete_db_dir(db_dir)

    # seed the month dir with the email that has to disappear
    storage_dir = "%s/db/%s" % (db_dir, '2011-11')
    gmvault_utils.makedirs(storage_dir)
    for fname in stale_files:
        shutil.copyfile('%s/%s' % (fixture_dir, fname),
                        '%s/%s' % (storage_dir, fname))

    syncer = gmvault.GMVaulter(db_dir, 'imap.gmail.com', 993,
                               self.login, self.passwd)
    syncer.sync(imap_req="Since 1-Nov-2011 Before 2-Nov-2011",
                db_cleaning=True)

    for fname in stale_files:
        self.assertFalse(os.path.exists('%s/%s' % (storage_dir, fname)))

    for fname in expected_files:
        self.assertTrue(os.path.exists('%s/%s' % (storage_dir, fname)))
def _create_dirs(self, working_dir, nb_dirs, nb_files_per_dir):
    """
    Create nb_dirs directories filled with dummy data and metadata files.

    Arguments:
     working_dir      : directory under which dir_0 .. dir_<nb_dirs-1> are created
     nb_dirs          : number of directories to create
     nb_files_per_dir : number of (.eml, .meta) file pairs written in each directory

    Files are named dir_<nb>_<file_id>.eml and dir_<nb>_<file_id>.meta.
    """
    for nb in xrange(0, nb_dirs):
        dir_name = 'dir_%d' % (nb)

        #make dir
        the_dir = '%s/%s' % (working_dir, dir_name)
        gmvault_utils.makedirs(the_dir)

        for file_id in xrange(0, nb_files_per_dir):
            #create data file (with-block closes it even if write fails)
            data_path = '%s/%s_%s' % (the_dir, dir_name, '%d.eml' % (file_id))
            with open(data_path, 'w') as data_fd:
                data_fd.write("something")

            #create metadata file
            meta_path = '%s/%s_%s' % (the_dir, dir_name, '%d.meta' % (file_id))
            with open(meta_path, 'w') as meta_fd:
                meta_fd.write("another info something")
def _init_sub_chats_dir(self):
    """
    Initialise the sub chats bookkeeping from what is already on disk.

    Every entry of the chats dir must be named <prefix>-<number>; an
    Exception is raised otherwise. With no entry, subchats-1 is created
    and the counters start at (nb=0, inc=1). Otherwise the entry with the
    highest number becomes the current sub chats dir and the number of
    chats it holds is taken as half the number of files in it.

    NOTE(review): nesting was reconstructed from a whitespace-collapsed
    source; nothing is initialised when the chats dir is missing -- confirm.
    """
    LOG.debug("LIMIT_PER_CHAT_DIR = %s" % (self._limit_per_chat_dir) )

    if not os.path.exists(self._chats_dir):
        return

    # map sub dir number -> sub dir name
    nb_to_dir = {}
    for entry in os.listdir(self._chats_dir):
        parts = entry.split("-")
        if len(parts) != 2:
            raise Exception("Should get 2 elements in %s" % (parts))
        nb_to_dir[int(parts[1])] = entry

    if len(nb_to_dir) != 0:
        # resume in the sub dir with the highest number
        highest = max(nb_to_dir)
        current_name = nb_to_dir[highest]
        content = os.listdir("%s/%s" % (self._chats_dir, current_name))
        self._sub_chats_nb = len(content)/2
        self._sub_chats_inc = highest
        self._sub_chats_dir = self.SUB_CHAT_AREA % current_name
        return

    # no sub dir yet. Set it up
    self._sub_chats_nb = 0
    self._sub_chats_inc = 1
    self._sub_chats_dir = self.SUB_CHAT_AREA % ("subchats-%s" % (self._sub_chats_inc))
    gmvault_utils.makedirs("%s/%s" % (self._db_dir, self._sub_chats_dir))
def _init_sub_chats_dir(self):
    """
    Read the existing sub chats dirs and set the sub chats counters.

    Sets _sub_chats_nb, _sub_chats_inc and _sub_chats_dir. Raises an
    Exception when an entry of the chats dir does not split into exactly
    two parts around "-".

    NOTE(review): block structure was rebuilt from a source that lost its
    indentation; the counters are left untouched when the chats dir does
    not exist -- confirm.
    """
    LOG.debug("LIMIT_PER_CHAT_DIR = %s" % self._limit_per_chat_dir)

    if os.path.exists(self._chats_dir):
        # index the sub dirs by the number found after the dash
        nb_to_dir = {}
        for name in os.listdir(self._chats_dir):
            pieces = name.split("-")
            if len(pieces) == 2:
                nb_to_dir[int(pieces[1])] = name
            else:
                raise Exception("Should get 2 elements in %s" % pieces)

        if nb_to_dir:
            # carry on with the last (highest numbered) sub dir
            the_max = max(nb_to_dir)
            last_dir = "%s/%s" % (self._chats_dir, nb_to_dir[the_max])
            self._sub_chats_nb = len(os.listdir(last_dir))/2
            self._sub_chats_inc = the_max
            self._sub_chats_dir = self.SUB_CHAT_AREA % nb_to_dir[the_max]
        else:
            # no sub dir yet. Create the first one
            self._sub_chats_nb = 0
            self._sub_chats_inc = 1
            first_name = "subchats-%s" % (self._sub_chats_inc)
            self._sub_chats_dir = self.SUB_CHAT_AREA % (first_name)
            gmvault_utils.makedirs("%s/%s" % (self._db_dir, self._sub_chats_dir))
def bury_metadata(self, email_info, local_dir=None, extra_labels=()):
    """
    Write the metadata of an email in its .meta file and register it in GMSQL.

    Arguments:
     email_info  : metadata info, indexed with the imap_utils.GIMAPFetcher keys
     local_dir   : intermediary dir (month dir) under the db dir, created if needed
     extra_labels: labels appended after the labels found in email_info

    Returns the gmail id of the email.
    """
    fetcher = imap_utils.GIMAPFetcher

    the_dir = self._db_dir
    if local_dir:
        the_dir = '%s/%s' % (self._db_dir, local_dir)
        gmvault_utils.makedirs(the_dir)

    gmail_id = email_info[fetcher.GMAIL_ID]

    with open(self.METADATA_FNAME % (the_dir, gmail_id), 'w') as meta_desc:
        # parse header fields to extract subject, msgid, sender ...
        (subject, u_subject, msgid,
         received, h_from, h_to) = self.parse_header_fields(
             email_info[fetcher.IMAP_HEADER_FIELDS_KEY])

        # labels made only of digits come back from imap_lib as numbers:
        # turn them into strings before cleaning
        labels = [
            unicode(gmvault_utils.remove_consecutive_spaces_and_strip(
                str(a_label) if isinstance(a_label, (int, long, float, complex))
                else a_label))
            for a_label in email_info[fetcher.GMAIL_LABELS]
        ]
        labels.extend(extra_labels) #add extra labels

        #create json structure for metadata
        meta_obj = {
            self.ID_K : gmail_id,
            self.LABELS_K : labels,
            self.FLAGS_K : email_info[fetcher.IMAP_FLAGS],
            self.THREAD_IDS_K : email_info[fetcher.GMAIL_THREAD_ID],
            self.INT_DATE_K : gmvault_utils.datetime2e(
                email_info[fetcher.IMAP_INTERNALDATE]),
            self.SUBJECT_K : subject,
            self.MSGID_K : msgid,
            self.XGM_RECV_K : received,
        }

        json.dump(meta_obj, meta_desc)
        meta_desc.flush()

    # NOTE(review): position of this call relative to the with-block was
    # reconstructed from a whitespace-collapsed source -- confirm.
    gmsql.GMSQL.store_email(
        gmail_id,
        email_info[fetcher.GMAIL_THREAD_ID],
        h_from,
        h_to,
        u_subject, # unicode
        email_info[fetcher.IMAP_INTERNALDATE],
        labels
    )

    return gmail_id
def bury_email(self, email_info, local_dir=None, compress=False, extra_labels=()):
    """
    store all email info in 2 files (.meta and .eml files)

    Arguments:
     email_info  : the email content, indexed with the imap_utils.GIMAPFetcher keys
     local_dir   : intermediary dir (month dir)
     compress    : if compress is True, use gzip compression
     extra_labels: extra labels passed on to bury_metadata

    The data file name gets a .crypt suffix when encryption is on and a
    .gz suffix when compress is True.

    Returns the gmail id of the email.

    NOTE(review): the original text of this method stopped after
    data_desc.flush(); the closing finally/return were reconstructed -- confirm.
    """
    if local_dir:
        the_dir = '%s/%s' % (self._db_dir, local_dir)
        gmvault_utils.makedirs(the_dir)
    else:
        the_dir = self._db_dir

    data_path = self.DATA_FNAME % (
        the_dir, email_info[imap_utils.GIMAPFetcher.GMAIL_ID])

    # if the data has to be encrypted
    if self._encrypt_data:
        data_path = '%s.crypt' % data_path

    if compress:
        data_path = '%s.gz' % data_path
        data_desc = gzip.open(data_path, 'wb')
    else:
        data_desc = open(data_path, 'wb')

    try:
        if self._encrypt_data:
            # need to be done for every encryption
            cipher = self.get_encryption_cipher()
            cipher.initCTR()
            data = cipher.encryptCTR(
                email_info[imap_utils.GIMAPFetcher.EMAIL_BODY])

            # ciphertext is binary: decoding and re-encoding it would
            # corrupt it, so write it untouched in chunks of 1 MB
            for chunk in gmvault_utils.chunker(data, 1048576):
                data_desc.write(chunk)
        else:
            data = email_info[imap_utils.GIMAPFetcher.EMAIL_BODY]

            # write in chunks of 1 MB, normalised to utf-8
            for chunk in gmvault_utils.chunker(data, 1048576):
                # bound before the try so the handler can always log it
                detection = None
                try:
                    detection = chardet.detect(chunk)
                    #try to convert to unicode with the guessed encoding
                    u_chunk = unicode(chunk, encoding=detection['encoding'])
                except Exception as err: #pylint:disable=W0703
                    # best effort: any failure of the guess falls back to utf-8
                    LOG.critical(err)
                    LOG.critical(
                        "Warning: Guessed encoding = %s. Ignore those characters"
                        % (detection,))
                    #try utf-8
                    u_chunk = unicode(chunk, encoding="utf-8", errors='replace')

                if u_chunk:
                    data_desc.write(u_chunk.encode('utf-8'))
                else:
                    raise Exception("error cannot write %s" % (chunk))

        self.bury_metadata(email_info, local_dir, extra_labels)

        data_desc.flush()
    finally:
        # always release the descriptor, even when a write fails
        data_desc.close()

    return email_info[imap_utils.GIMAPFetcher.GMAIL_ID]
def delete_emails(self, emails_info, msg_type):
    """
    Delete all emails and metadata with ids

    Arguments:
     emails_info: iterable of (id, sub dir) pairs
     msg_type   : 'email' to work in the db dir, any other value to work
                  in the chats dir

    When the "keep_in_bin" option of the [General] conf section is true the
    files are moved to the bin dir instead of being removed. Each id is
    also removed from GMSQL.
    """
    if msg_type == 'email':
        db_dir = self._db_dir
    else:
        db_dir = self._chats_dir

    move_to_bin = gmvault_utils.get_conf_defaults().get_boolean(
        "General", "keep_in_bin" , False)

    if move_to_bin:
        LOG.critical("Move emails to the bin:%s" % self._bin_dir)

    # every suffix a data file can carry: plain, compressed, encrypted,
    # encrypted and compressed
    data_suffixes = ('', '.gz', '.crypt', '.crypt.gz')

    for (a_id, date_dir) in emails_info:

        the_dir = '%s/%s' % (db_dir, date_dir)

        data_p = self.DATA_FNAME % (the_dir, a_id)
        metadata_p = self.METADATA_FNAME % (the_dir, a_id)

        if move_to_bin:
            #move files to the bin
            gmvault_utils.makedirs(self._bin_dir)

            # create bin filenames
            bin_p = self.DATA_FNAME % (self._bin_dir, a_id)
            metadata_bin_p = self.METADATA_FNAME % (self._bin_dir, a_id)

            # move every existing variant so that none is left behind
            for suffix in data_suffixes:
                if os.path.exists(data_p + suffix):
                    os.rename(data_p + suffix, bin_p + suffix)

            if os.path.exists(metadata_p):
                os.rename(metadata_p, metadata_bin_p)
        else:
            #delete files if they exists
            for suffix in data_suffixes:
                if os.path.exists(data_p + suffix):
                    os.remove(data_p + suffix)

            if os.path.exists(metadata_p):
                os.remove(metadata_p)

        # NOTE(review): placed at loop level (runs for binned and removed
        # emails alike); the collapsed source does not show its nesting -- confirm.
        gmsql.GMSQL.delete_email(a_id)
def delete_emails(self, emails_info, msg_type):
    """
    Delete all emails and metadata with ids

    Arguments:
     emails_info: iterable of (id, sub dir) pairs
     msg_type   : 'email' to work in the db dir, any other value to work
                  in the chats dir

    When the "keep_in_bin" option of the [General] conf section is true the
    files are moved to the bin dir instead of being removed.
    """
    if msg_type == 'email':
        db_dir = self._db_dir
    else:
        db_dir = self._chats_dir

    move_to_bin = gmvault_utils.get_conf_defaults().get_boolean(
        "General", "keep_in_bin" , False)

    if move_to_bin:
        LOG.critical("Move emails to the bin:%s" % self._bin_dir)

    # suffixes a stored data file can carry; '.crypt' alone (encrypted but
    # not compressed) was not handled before
    data_suffixes = ('', '.gz', '.crypt', '.crypt.gz')

    for (a_id, date_dir) in emails_info:

        the_dir = '%s/%s' % (db_dir, date_dir)

        data_p = self.DATA_FNAME % (the_dir, a_id)
        metadata_p = self.METADATA_FNAME % (the_dir, a_id)

        # data file variants of this email that are really on disk
        found = [suffix for suffix in data_suffixes
                 if os.path.exists(data_p + suffix)]

        if move_to_bin:
            #move files to the bin
            gmvault_utils.makedirs(self._bin_dir)

            # create bin filenames
            bin_p = self.DATA_FNAME % (self._bin_dir, a_id)
            metadata_bin_p = self.METADATA_FNAME % (self._bin_dir, a_id)

            for suffix in found:
                os.rename(data_p + suffix, bin_p + suffix)

            if os.path.exists(metadata_p):
                os.rename(metadata_p, metadata_bin_p)
        else:
            #delete files if they exists
            for suffix in found:
                os.remove(data_p + suffix)

            if os.path.exists(metadata_p):
                os.remove(metadata_p)
def bury_metadata(self, email_info, local_dir = None, extra_labels = ()):
    """
    Store metadata info in .meta file

    Arguments:
     email_info  : metadata info, indexed with the imap_utils.GIMAPFetcher keys
     local_dir   : intermediary dir (month dir), created if needed
     extra_labels: labels appended after the labels found in email_info

    Returns the gmail id of the email.
    """
    if local_dir:
        the_dir = '%s/%s' % (self._db_dir, local_dir)
        gmvault_utils.makedirs(the_dir)
    else:
        the_dir = self._db_dir

    meta_path = self.METADATA_FNAME % (the_dir, email_info[imap_utils.GIMAPFetcher.GMAIL_ID])

    # with-block: the descriptor is closed even if parsing or dumping fails
    with open(meta_path, 'w') as meta_desc:

        # parse header fields to extract subject and msgid
        subject, msgid, received = self.parse_header_fields(
            email_info[imap_utils.GIMAPFetcher.IMAP_HEADER_FIELDS_KEY])

        # need to convert labels that are number as string
        # come from imap_lib when label is a number
        labels = []
        for label in email_info[imap_utils.GIMAPFetcher.GMAIL_LABELS]:
            if isinstance(label, (int, long, float, complex)):
                label = str(label)

            labels.append(unicode(gmvault_utils.remove_consecutive_spaces_and_strip(label)))

        labels.extend(extra_labels) #add extra labels

        #create json structure for metadata (FLAGS_K used to be listed twice)
        meta_obj = {
            self.ID_K : email_info[imap_utils.GIMAPFetcher.GMAIL_ID],
            self.LABELS_K : labels,
            self.FLAGS_K : email_info[imap_utils.GIMAPFetcher.IMAP_FLAGS],
            self.THREAD_IDS_K : email_info[imap_utils.GIMAPFetcher.GMAIL_THREAD_ID],
            self.INT_DATE_K : gmvault_utils.datetime2e(
                email_info[imap_utils.GIMAPFetcher.IMAP_INTERNALDATE]),
            self.SUBJECT_K : subject,
            self.MSGID_K : msgid,
            self.XGM_RECV_K : received
        }

        json.dump(meta_obj, meta_desc)

        meta_desc.flush()

    return email_info[imap_utils.GIMAPFetcher.GMAIL_ID]
def bury_email(self, email_info, local_dir = None, compress = False, extra_labels = ()):
    """
    store all email info in 2 files (.meta and .eml files)

    Arguments:
     email_info  : the email content, indexed with the imap_utils.GIMAPFetcher keys
     local_dir   : intermediary dir (month dir)
     compress    : if compress is True, use gzip compression
     extra_labels: extra labels passed on to bury_metadata

    Returns the gmail id of the email.
    """
    if local_dir:
        the_dir = '%s/%s' % (self._db_dir, local_dir)
        gmvault_utils.makedirs(the_dir)
    else:
        the_dir = self._db_dir

    data_path = self.DATA_FNAME % (the_dir, email_info[imap_utils.GIMAPFetcher.GMAIL_ID])

    # if the data has to be encrypted
    if self._encrypt_data:
        data_path = '%s.crypt' % (data_path)

    if compress:
        data_path = '%s.gz' % (data_path)
        data_desc = gzip.open(data_path, 'wb')
    else:
        data_desc = open(data_path, 'wb')

    try:
        if self._encrypt_data:
            # need to be done for every encryption
            cipher = self.get_encryption_cipher()
            cipher.initCTR()
            data = cipher.encryptCTR(email_info[imap_utils.GIMAPFetcher.EMAIL_BODY])
        else:
            data = email_info[imap_utils.GIMAPFetcher.EMAIL_BODY]

        # content above 4 MB goes through the buffered writer
        if len(data) > 4194304:
            gmvault_utils.buffered_write(data_desc, data)
        else:
            data_desc.write(data)

        self.bury_metadata(email_info, local_dir, extra_labels)

        data_desc.flush()
    finally:
        # release the descriptor even when writing or burying metadata fails
        data_desc.close()

    return email_info[imap_utils.GIMAPFetcher.GMAIL_ID]
def get_sub_chats_dir(self):
    """
    Return the sub chats dir to use and count one more chat in it.

    The bookkeeping is initialised on first use (_sub_chats_inc == -1).
    Once the current dir has reached _limit_per_chat_dir chats, the next
    "subchats-<n>" dir is created and becomes the current one.
    """
    if self._sub_chats_inc == -1:
        self._init_sub_chats_dir()

    # still room in the current dir: count the chat and reuse it
    if self._sub_chats_nb < self._limit_per_chat_dir:
        self._sub_chats_nb += 1
        return self._sub_chats_dir

    # current dir is full: switch to a fresh one holding this first chat
    self._sub_chats_inc += 1
    self._sub_chats_nb = 1
    self._sub_chats_dir = self.SUB_CHAT_AREA % ("subchats-%s" % (self._sub_chats_inc))
    gmvault_utils.makedirs('%s/%s' % (self._db_dir, self._sub_chats_dir))
    return self._sub_chats_dir
def _make_new_chat_dir(self):
    """
    Return the sub chats dir for a new chat, creating a new dir when needed.

    Initialises the bookkeeping on first use (_sub_chats_inc == -1). When
    the current dir already holds _limit_per_chat_dir chats, the counter
    is incremented and the matching "subchats-<n>" dir is created;
    otherwise the chat is counted in the current dir.
    """
    if self._sub_chats_inc == -1:
        self._init_sub_chats_dir()

    dir_is_full = self._sub_chats_nb >= self._limit_per_chat_dir

    if not dir_is_full:
        self._sub_chats_nb += 1
    else:
        self._sub_chats_inc += 1
        self._sub_chats_nb = 1

        #Beware use double string substitution here. We are in the sub chats area
        sub_name = "subchats-%s" % (self._sub_chats_inc)
        self._sub_chats_dir = self.SUB_CHAT_AREA % (sub_name)

        LOG.debug("_make_new_char_dir. Making dir %s/%s" % (self._db_dir, self._sub_chats_dir))
        gmvault_utils.makedirs('%s/%s' % (self._db_dir, self._sub_chats_dir))

    return self._sub_chats_dir
def _create_dirs(self, working_dir, nb_dirs, nb_files_per_dir):
    """
    create all the dirs and files

    Arguments:
     working_dir      : parent directory of the created dir_<nb> directories
     nb_dirs          : number of directories to create
     nb_files_per_dir : number of data/metadata file pairs per directory

    Each pair is written as dir_<nb>_<file_id>.eml and
    dir_<nb>_<file_id>.meta inside its directory.
    """
    dirname = 'dir_%d'
    # (file name pattern, content) for the two files of a pair
    file_specs = (('%d.eml', "something"),
                  ('%d.meta', "another info something"))

    for nb in xrange(0, nb_dirs):
        #make dir
        the_dir = '%s/%s' % (working_dir, dirname % (nb))
        gmvault_utils.makedirs(the_dir)

        for file_id in xrange(0, nb_files_per_dir):
            for (pattern, content) in file_specs:
                path = '%s/%s_%s' % (the_dir, dirname % (nb), pattern % (file_id))
                # the with-block closes the file even when write raises
                with open(path, 'w') as a_fd:
                    a_fd.write(content)
def ztest_few_days_syncer_with_deletion(self): #pylint:disable-msg=C0103
    """
    check that there was a deletion

    An email copied by hand in the 2011-11 dir must be removed by a sync
    of 1-Nov-2011 run with db_cleaning, while email 1384313269332005293
    must be present afterwards.
    """
    db_dir = '/tmp/gmail_bk'

    #clean db dir
    delete_db_dir(db_dir)

    #copy test email in dest dir
    storage_dir = "%s/db/%s" % (db_dir, '2011-11')
    gmvault_utils.makedirs(storage_dir)

    copies = [
        ('../etc/tests/test_few_days_syncer/2384403887202624608.eml.gz',
         '%s/2384403887202624608.eml.gz' % (storage_dir)),
        ('../etc/tests/test_few_days_syncer/2384403887202624608.meta',
         '%s/2384403887202624608.meta' % (storage_dir)),
    ]
    for (src, dest) in copies:
        shutil.copyfile(src, dest)

    syncer = gmvault.GMVaulter(db_dir, 'imap.gmail.com', 993, self.login, self.passwd)
    syncer.sync(imap_req = "Since 1-Nov-2011 Before 2-Nov-2011", db_cleaning = True)

    # the hand copied email is gone ...
    for (_, dest) in copies:
        self.assertFalse(os.path.exists(dest))

    # ... and the email of that day is there
    meta_path = '%s/1384313269332005293.meta' % (storage_dir)
    data_path = '%s/1384313269332005293.eml.gz' % (storage_dir)
    self.assertTrue(os.path.exists(meta_path))
    self.assertTrue(os.path.exists(data_path))
def __init__(self, a_storage_dir, encrypt_data=False):
    """
    Set up the on-disk storage rooted at a_storage_dir.

    args:
     a_storage_dir: Storage directory
     encrypt_data : stored in _encrypt_data; the encryption key and
                    cipher attributes start as None

    Connects GMSQL to <a_storage_dir>/meta.db, creates the db, chats,
    quarantine and info dirs and finally calls
    _create_gmvault_db_version().
    """
    def _under(parent, area):
        """ join a parent dir and an area name """
        return '%s/%s' % (parent, area)

    self._top_dir = a_storage_dir

    self._db_dir = _under(a_storage_dir, GmailStorer.DB_AREA)
    self._quarantine_dir = _under(a_storage_dir, GmailStorer.QUARANTINE_AREA)
    self._info_dir = _under(a_storage_dir, GmailStorer.INFO_AREA)
    self._chats_dir = _under(self._db_dir, GmailStorer.CHATS_AREA)
    self._bin_dir = _under(a_storage_dir, GmailStorer.BIN_AREA)

    gmsql.GMSQL.connect('%s/meta.db' % (a_storage_dir))

    # sub chats bookkeeping: -1 means "not initialised yet"
    self._sub_chats_dir = None
    self._sub_chats_inc = -1
    self._sub_chats_nb = -1

    conf = gmvault_utils.get_conf_defaults()
    self._limit_per_chat_dir = conf.getint("General", "limit_per_chat_dir", 1500)

    #make dirs
    if not os.path.exists(self._db_dir):
        LOG.critical("No Storage DB in %s. Create it.\n" % a_storage_dir)

    # NOTE(review): makedirs calls are assumed to sit outside the check
    # above (indentation was lost in the source) -- confirm.
    for a_dir in (self._db_dir, self._chats_dir,
                  self._quarantine_dir, self._info_dir):
        gmvault_utils.makedirs(a_dir)

    self.fsystem_info_cache = {}

    self._encrypt_data = encrypt_data
    self._encryption_key = None
    self._cipher = None

    #add version if it is needed to migrate gmvault-db in the future
    self._create_gmvault_db_version()
def __init__(self, a_storage_dir, encrypt_data=False):
    """
    Store on disks

    args:
     a_storage_dir: Storage directory
     encrypt_data : kept in _encrypt_data; key and cipher attributes are
                    initialised to None

    Creates the db, chats, quarantine and info dirs and calls
    _create_gmvault_db_version() last.
    """
    top = a_storage_dir
    db_dir = '%s/%s' % (top, GmailStorer.DB_AREA)

    self._top_dir = top
    self._db_dir = db_dir
    self._chats_dir = '%s/%s' % (db_dir, GmailStorer.CHATS_AREA)
    self._quarantine_dir = '%s/%s' % (top, GmailStorer.QUARANTINE_AREA)
    self._info_dir = '%s/%s' % (top, GmailStorer.INFO_AREA)
    self._bin_dir = '%s/%s' % (top, GmailStorer.BIN_AREA)

    # -1 marks the sub chats counters as not initialised
    self._sub_chats_dir = None
    self._sub_chats_inc = self._sub_chats_nb = -1

    self._limit_per_chat_dir = gmvault_utils.get_conf_defaults().getint(
        "General", "limit_per_chat_dir", 1500)

    #make dirs
    db_exists = os.path.exists(db_dir)
    if not db_exists:
        LOG.critical("No Storage DB in %s. Create it.\n" % a_storage_dir)

    # NOTE(review): the four makedirs calls are assumed to run whether or
    # not the db dir exists (source indentation was lost) -- confirm.
    gmvault_utils.makedirs(db_dir)
    gmvault_utils.makedirs(self._chats_dir)
    gmvault_utils.makedirs(self._quarantine_dir)
    gmvault_utils.makedirs(self._info_dir)

    self.fsystem_info_cache = {}

    self._encrypt_data = encrypt_data
    self._encryption_key = None
    self._cipher = None

    #add version if it is needed to migrate gmvault-db in the future
    self._create_gmvault_db_version()
def bury_email(self, email_info, local_dir=None, compress=False, extra_labels=()):
    """
    Write the content of an email on disk, then its metadata.

    Arguments:
     email_info  : the email content, indexed with the imap_utils.GIMAPFetcher keys
     local_dir   : intermediary dir (month dir), created if needed
     compress    : if compress is True, use gzip compression
     extra_labels: extra labels passed on to bury_metadata

    The data file name gets a .crypt suffix when encryption is on and a
    .gz suffix when compress is True. Returns the gmail id of the email.
    """
    fetcher = imap_utils.GIMAPFetcher

    the_dir = self._db_dir
    if local_dir:
        the_dir = '%s/%s' % (self._db_dir, local_dir)
        gmvault_utils.makedirs(the_dir)

    gmail_id = email_info[fetcher.GMAIL_ID]
    data_path = self.DATA_FNAME % (the_dir, gmail_id)

    # TODO: First compress then encrypt: compress into an in-memory
    # file object, chunk write, then encrypt the result if required.

    # if the data has to be encrypted
    if self._encrypt_data:
        data_path = '%s.crypt' % data_path

    if compress:
        data_path = '%s.gz' % data_path

    opener = gzip.open if compress else open
    data_desc = opener(data_path, 'wb')

    try:
        if not self._encrypt_data:
            #no encryption then utf-8 encode and write
            text = gmvault_utils.convert_to_unicode(email_info[fetcher.EMAIL_BODY])

            # write in chunks of one 1 MB
            for piece in gmvault_utils.chunker(text, 1048576):
                data_desc.write(piece.encode('utf-8'))
        else:
            # need to be done for every encryption
            cipher = self.get_encryption_cipher()
            cipher.initCTR()
            encrypted = cipher.encryptCTR(email_info[fetcher.EMAIL_BODY])
            LOG.debug("Encrypt data.")

            #write encrypted data without encoding
            data_desc.write(encrypted)

        #store metadata info
        self.bury_metadata(email_info, local_dir, extra_labels)
        data_desc.flush()
    finally:
        data_desc.close()

    return gmail_id
def setUp(self): #pylint:disable-msg=C0103
    """
    Create the output dir, a MaildirMock on it and the exporter to test.
    """
    out_dir = self.output_dir
    gmvault_utils.makedirs(out_dir)

    maildir = MaildirMock(out_dir)
    self.maildir = maildir
    self.gexp = gmvault_export.GMVaultExporter(self.db_dir, maildir)
def setUp(self): #pylint:disable-msg=C0103
    """
    Create the output dir and the gmvault_export.MBox built on it.
    """
    out_dir = self.output_dir
    gmvault_utils.makedirs(out_dir)
    self.mbox = gmvault_export.MBox(out_dir)
def bury_email(self, email_info, local_dir=None, compress=False, extra_labels=()):
    """
    store all email info in 2 files (.meta and .eml files)

    Arguments:
     email_info  : the email content, indexed with the imap_utils.GIMAPFetcher keys
     local_dir   : intermediary dir (month dir)
     compress    : if compress is True, use gzip compression
     extra_labels: extra labels passed on to bury_metadata

    Encrypted content is written as is; clear content is converted to
    unicode chunk by chunk and written utf-8 encoded.

    Returns the gmail id of the email.

    NOTE(review): the original text of this method ended right after
    data_desc.flush(); the finally/return tail was reconstructed -- confirm.
    """
    def _to_utf8(chunk):
        """ decode chunk with the guessed charset (utf-8 fallback), return utf-8 bytes """
        # bound first so that the handler can log it whatever fails
        detection = None
        try:
            detection = chardet.detect(chunk)
            #try to convert to unicode with the guessed encoding
            u_chunk = unicode(chunk, encoding=detection['encoding'])
        except Exception as err: #pylint:disable=W0703
            # best effort: whatever goes wrong with the guess, use utf-8
            LOG.critical(err)
            LOG.critical(
                "Warning: Guessed encoding = %s. Ignore those characters"
                % (detection,))
            #try utf-8
            u_chunk = unicode(chunk, encoding="utf-8", errors='replace')

        if not u_chunk:
            raise Exception("error cannot write %s" % (chunk))

        return u_chunk.encode('utf-8')

    if local_dir:
        the_dir = '%s/%s' % (self._db_dir, local_dir)
        gmvault_utils.makedirs(the_dir)
    else:
        the_dir = self._db_dir

    data_path = self.DATA_FNAME % (
        the_dir, email_info[imap_utils.GIMAPFetcher.GMAIL_ID])

    # if the data has to be encrypted
    if self._encrypt_data:
        data_path = '%s.crypt' % data_path

    if compress:
        data_path = '%s.gz' % data_path
        data_desc = gzip.open(data_path, 'wb')
    else:
        data_desc = open(data_path, 'wb')

    try:
        if self._encrypt_data:
            # need to be done for every encryption
            cipher = self.get_encryption_cipher()
            cipher.initCTR()
            data = cipher.encryptCTR(
                email_info[imap_utils.GIMAPFetcher.EMAIL_BODY])
        else:
            data = email_info[imap_utils.GIMAPFetcher.EMAIL_BODY]

        # write in chunks of one 1 MB
        for chunk in gmvault_utils.chunker(data, 1048576):
            if self._encrypt_data:
                # ciphertext is binary: re-encoding it would corrupt it
                data_desc.write(chunk)
            else:
                data_desc.write(_to_utf8(chunk))

        self.bury_metadata(email_info, local_dir, extra_labels)

        data_desc.flush()
    finally:
        # the descriptor is released even when a write fails
        data_desc.close()

    return email_info[imap_utils.GIMAPFetcher.GMAIL_ID]