def fetch():
    while True:
        email = None
        task = None
        gc.collect()
        task = db.popOutlookTask()
        if task is None:
            time.sleep(3)
            continue
        if task['timestamp'] > datetime.now(pytz.UTC):
            # if the task is to be served in the future
            db.pushOutlookTaskObject(task)
            time.sleep(3)
            continue
        try:
            email = task['email']
            print 'processing', email  # , 'that was queued at', task['timestamp']
            state = db.getState(email, None)
            print "Using fetcher", state
            if state is None:
                continue
            lastuid = state['lastuid']
            credentials = state['credentials']
            access_token = credentials['access_token']
            version = int(state['version'])
            # # renew access token if expired
            # if credentials.access_token_expired:
            #     credentials.refresh(httplib2.Http())
        except KeyboardInterrupt:
            # add the task again for fetching
            if email:
                db.pushOutlookTask(email, None)
            print 'interrupted'
            # kill all running processes of outlookfetcher.py
            p = subprocess.Popen(['ps', '-A'], stdout=subprocess.PIPE)
            out, err = p.communicate()
            for line in out.splitlines():
                if 'outlookfetcher' in line:
                    pid = int(line.split(None, 1)[0])
                    os.kill(pid, signal.SIGKILL)
            return
        except:
            db.log(email=email, ip=None, module="fetcher", msg=traceback.format_exc(), level=logging.ERROR)
            continue
        try:
            all = get_me(access_token)
            if all is None:
                db.log(email=email, ip=None, module="fetcher", msg="all mail not enabled")
                db.storeState(email, None, state)
                # add the fetching task again to the queue with a 3-minute delay
                task['timestamp'] = datetime.now(pytz.UTC) + timedelta(minutes=3)
                db.pushOutlookTaskObject(task)
                continue
            state['working'] = True
            db.storeState(email, None, state)
            append = False
            firstTime = False
            if lastuid > 0:
                emails = db.getEmails(email, version, None)
                append = True
            else:
                emails = []
                firstTime = True
            email_ids = get_my_messages_ids(access_token, 0, email)
            # print email_ids
            uids = [d['id'] for d in email_ids['value']]
            while '@odata.nextLink' in email_ids:
                skip = email_ids['@odata.nextLink'][email_ids['@odata.nextLink'].index('skip=') + 5:email_ids['@odata.nextLink'].index('&$order')]
                print 'getting ids new skip', skip
                email_ids = get_my_messages_ids(access_token, skip, email)
                for d in email_ids['value']:
                    uids.append(d['id'])
                print "email ids length", len(uids)
            # if '@odata.nextLink' not in email_ids:
            #     skip = int(skip) + len(email_ids['value'])
            #     email_ids = get_my_messages_ids_less(access_token, skip, email)
            #     for d in email_ids['value']:
            #         uids.append(d['id'])
            #     print "email ids length less", len(uids)
            uids = uids[-LIMIT_NEMAILS:]
            # print "lastuid", lastuid
            base_skip = 0
            if lastuid > 0 and lastuid in uids:
                base_skip = uids.index(lastuid) + 1
                uids = uids[uids.index(lastuid) + 1:]
            # ignore if the last uid is less or equal to the result
            if len(uids) == 0:
                uids = []
            total = len(uids)
            if total > LIMIT_NEMAILS:
                base_skip += (total - LIMIT_NEMAILS)
            db.log(email=email, ip=None, module="fetcher", msg=str(total) + " new emails since last login")
            loaded = 0
            start = 0
            print "total", total
            fetchtime = 0
            parsingtime = 0
            skip = base_skip
            email_data0 = None
            first_time = True
            if len(emails) >= REFRESH_NETWORK:
                emails = []
                append = False
            while loaded < total:
                tmptime = time.time()
                if email_data0 is not None and '@odata.nextLink' in email_data0:
                    # print email_data0['@odata.nextLink']
                    first_time = False
                    skip = email_data0['@odata.nextLink'][email_data0['@odata.nextLink'].index('skip=') + 5:]
                    if skip.find('&') != -1:
                        skip = skip[:skip.index('&')]
                # elif email_data0 is not None:
                #     # see if it's really no more emails or just api error
                #     first_time = False
                #     skip = int(skip) + len(email_data)
                #     email_data0 = get_my_messages_less(access_token, email, skip, total, base_skip, first_time)
                #     email_data = email_data0['value']
                #     print "email_data less", len(email_data)
                # # fetchtime += (time.time() - tmptime)
                # # for each email
                # tmptime = time.time()
                # if append == False:
                #     for i in xrange(0, len(email_data)):
                #         loaded += 1
                #         # print email_data[i]
                #         parsed_email = db.formatOutlookEmails(email, email_data[i])
                #         # print parsed_email
                #         if parsed_email != None:
                #             emails.append(parsed_email)
                # else:
                #     for i in xrange(0, len(email_data)):
                #         loaded += 1
                #         parsed_email = db.formatOutlookEmails(email, email_data[i])
                #         # print parsed_email
                #         emails.append(parsed_email)
                # parsingtime += (time.time() - tmptime)
                print "getting messages, skip now is", skip
                email_data0 = get_my_messages(access_token, email, skip, total, base_skip, first_time)
                email_data = email_data0['value']
                print "email_data", len(email_data)
                fetchtime += (time.time() - tmptime)
                # for each email
                tmptime = time.time()
                if append == False:
                    for i in xrange(0, len(email_data)):
                        loaded += 1
                        # print email_data[i]
                        parsed_email = db.formatOutlookEmails(email, email_data[i])
                        # print parsed_email
                        if parsed_email != None:
                            emails.append(parsed_email)
                else:
                    for i in xrange(0, len(email_data)):
                        loaded += 1
                        parsed_email = db.formatOutlookEmails(email, email_data[i])
                        # print parsed_email
                        emails.append(parsed_email)
                parsingtime += (time.time() - tmptime)
                perc = (loaded * 100.0) / total
                if len(emails) >= REFRESH_NETWORK or loaded >= total:
                    if append:
                        db.storeEmails(email, emails, version, None)
                        append = False
                    else:
                        # store the file
                        db.storeEmails(email, emails, version + 1, None)
                        state['version'] = version + 1
                        version += 1
                    # update state
                    print "emails length", len(emails)
                    state['lastuid'] = uids[min(start + JUMP - 1, len(uids) - 1)]
                    db.storeState(email, None, state)
                    emails = []
                    db.log(email=email, ip=None, module="fetcher", msg="new version %s stored in the db" % version)
                start += JUMP
            if firstTime:
                db.log(email=email, ip=None, module="fetcher", msg="marked for email")
            db.log(email=email, ip=None, module="fetcher", msg="done fetching. Network time: %ds. Parsing time: %ds." % (fetchtime, parsingtime))
            # state = db.getState(email)
            if 'working' in state:
                del state['working']
            # delete the refresh tokens for security reasons
            if 'credentials' in state:
                del state['credentials']
            db.storeState(email, None, state)
        except KeyboardInterrupt:
            # add the task again for fetching
            if email:
                db.pushOutlookTask(email, None)
            print 'interrupted'
            # kill all running processes of outlookfetcher.py
            p = subprocess.Popen(['ps', '-A'], stdout=subprocess.PIPE)
            out, err = p.communicate()
            for line in out.splitlines():
                if 'outlookfetcher' in line:
                    pid = int(line.split(None, 1)[0])
                    os.kill(pid, signal.SIGKILL)
            return
        except:
            db.log(email=email, ip=None, module="fetcher", msg=traceback.format_exc(), level=logging.ERROR)
            # add the task again for fetching
            if email:
                db.pushOutlookTask(email, None)
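# --- Sketch (not used above): a more defensive way to read the OData paging offset ---
# fetch() slices @odata.nextLink with raw string indexes, which assumes 'skip=' and
# '&$order' always appear in that order. The helper below is an illustrative
# alternative only, not part of the original fetcher; the name _parse_skip is
# hypothetical and the '$skip' key is assumed from the Outlook REST API's paging.
from urlparse import urlparse, parse_qs  # Python 2 stdlib


def _parse_skip(next_link):
    # parse_qs percent-decodes keys, so '%24skip' shows up as '$skip'
    query = parse_qs(urlparse(next_link).query)
    values = query.get('$skip') or query.get('skip')
    return values[0] if values else None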
def fetchGmail():
    while True:
        email = None
        task = None
        gc.collect()
        task = db.popTask()
        if task is None:
            time.sleep(3)
            continue
        if task['timestamp'] > datetime.now(pytz.UTC):
            # if the task is to be served in the future
            db.pushTaskObject(task)
            time.sleep(3)
            continue
        try:
            email = task['email']
            imap_conn = None
            print 'processing', email  # , 'that was queued at', task['timestamp']
            state = db.getState(email, None)
            print "Using fetcher", state
            if state is None:
                continue
            lastuid = int(state['lastuid'])
            credentials = oauth2client.client.OAuth2Credentials.from_json(state['credentials'])
            version = int(state['version'])
            # renew access token if expired
            if credentials.access_token_expired:
                credentials.refresh(httplib2.Http())
            authstring = GenerateOAuth2String(email, credentials.access_token, base64_encode=False)
            imap_conn = ADV_IMAP4_SSL('imap.gmail.com')
            imap_conn.authenticate('XOAUTH2', lambda x: authstring)
        except KeyboardInterrupt:
            # add the task again for fetching
            if email:
                db.pushTask(email, None)
            print 'interrupted'
            return
        except:
            db.log(email=email, ip=None, module="fetcher", msg=traceback.format_exc(), level=logging.ERROR)
            if imap_conn:
                imap_conn.logout()
            continue
        try:
            all = getAllMailMailbox(imap_conn)
            if all is None:
                imap_conn.logout()
                db.log(email=email, ip=None, module="fetcher", msg="all mail not enabled")
                state['imap'] = True
                db.storeState(email, None, state)
                # add the fetching task again to the queue with a 3-minute delay
                task['timestamp'] = datetime.now(pytz.UTC) + timedelta(minutes=3)
                db.pushTaskObject(task)
                continue
            elif 'imap' in state:
                del state['imap']
                db.storeState(email, None, state)
            # db.markTaskForImap(email)
            # db.markTaskForEmail(email)
            imap_conn.select(all)
            state['working'] = True
            db.storeState(email, None, state)
            append = False
            firstTime = False
            if lastuid > 0:
                emails = db.getEmails(email, version, None)
                append = True
            else:
                emails = []
                firstTime = True
            ok, data = imap_conn.uid('search', None, 'UID', str(lastuid + 1) + ':*')
            uids = [int(d) for d in data[0].split()]
            uids = uids[-LIMIT_NEMAILS:]
            # ignore if the last uid is less or equal to the result
            if len(uids) == 1 and lastuid >= uids[0]:
                uids = []
            total = len(uids)
            db.log(email=email, ip=None, module="fetcher", msg=str(total) + " new emails since last login")
            loaded = 0
            start = 0
            fetchtime = 0
            parsingtime = 0
            while loaded < total:
                tmptime = time.time()
                # print str(uids[min(start, len(uids)-1)]) + ":" + str(uids[min(start+JUMP-1, len(uids)-1)])
                ok, data = imap_conn.uid('fetch',
                                         str(uids[min(start, len(uids) - 1)]) + ":" + str(uids[min(start + JUMP - 1, len(uids) - 1)]),
                                         '(UID X-GM-LABELS FLAGS X-GM-THRID BODY.PEEK[HEADER.FIELDS (FROM TO CC Date)])')
                fetchtime += (time.time() - tmptime)
                # for each email
                tmptime = time.time()
                for i in xrange(0, len(data), 2):
                    loaded += 1
                    emails.append(data[i])
                parsingtime += (time.time() - tmptime)
                perc = (loaded * 100.0) / total
                if len(emails) >= REFRESH_NETWORK or loaded >= total:
                    if append:
                        db.storeEmails(email, emails, version, None)
                        append = False
                    else:
                        # store the file
                        db.storeEmails(email, emails, version + 1, None)
                        state['version'] = version + 1
                        version += 1
                    # update state
                    state['lastuid'] = uids[min(start + JUMP - 1, len(uids) - 1)]
                    db.storeState(email, None, state)
                    emails = []
                    db.log(email=email, ip=None, module="fetcher", msg="new version %s stored in the db" % version)
                start += JUMP
            imap_conn.logout()
            if firstTime:
                db.pushNotifyDone(email)
                db.log(email=email, ip=None, module="fetcher", msg="marked for email")
            db.log(email=email, ip=None, module="fetcher", msg="done fetching. Network time: %ds. Parsing time: %ds." % (fetchtime, parsingtime))
            # state = db.getState(email)
            if 'working' in state:
                del state['working']
            # delete the refresh tokens for security reasons
            if 'credentials' in state:
                del state['credentials']
            db.storeState(email, None, state)
        except KeyboardInterrupt:
            # add the task again for fetching
            if email:
                db.pushTask(email, None)
            print 'interrupted'
            return
        except:
            db.log(email=email, ip=None, module="fetcher", msg=traceback.format_exc(), level=logging.ERROR)
            if imap_conn:
                imap_conn.logout()
            # add the task again for fetching
            if email:
                db.pushTask(email, None)
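# --- Sketch: what GenerateOAuth2String is assumed to produce ---
# fetchGmail() relies on a helper (defined elsewhere in the repo) that builds the
# SASL XOAUTH2 string Gmail's IMAP server expects: 'user=<email>\1auth=Bearer <token>\1\1'.
# The function below is a minimal reference sketch under that assumption, with a
# hypothetical name; it is not the project's actual implementation.
import base64


def _generate_oauth2_string_sketch(username, access_token, base64_encode=True):
    # build the XOAUTH2 auth string; \1 is the required field separator
    auth_string = 'user=%s\1auth=Bearer %s\1\1' % (username, access_token)
    if base64_encode:
        auth_string = base64.b64encode(auth_string)
    return auth_string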
f.close()
print len(firstnames)

# load last names
lastnames = []
f = open('lastnames.csv', 'rU')
reader = csv.reader(f)
for row in reader:
    lastnames.append(row[0].strip())
f.close()
print len(lastnames)

## read demodata
data = []
for ver in range(version + 1):
    data += db.getEmails(orig_email, ver)
print len(data), 'emails loaded from db'


def getAddresses(field):
    field = field.replace('\r\n', ' ').replace('\n', ' ')
    addrs = em.utils.getaddresses([field])
    result = []
    for name, email in addrs:
        ss = decode_header(name)
        decod_name = ""
        for s, encoding in ss:
            if encoding is None:
                encoding = 'ascii'
            try:
                decod_name += s.decode(encoding)
            except: