def tweet_processing(tweetFile):
    tweet_file = open(tweetFile)  # Open the file for reading
    tweet_hash = {}  # Dictionary keeping the hashtags as values and their id as keys
    tweet_id = {}    # Dictionary keeping the created_at as values and their id as keys
    first_tweet = True
    latest_date = None
    print("Started reading tweets")
    for tweet_line in tweet_file:  # Loop for every tweet in the tweets file
        tweet = json.loads(tweet_line)  # convert the JSON string to a dictionary object
        if "entities" in tweet.keys():  # Check whether the entities tag is present
            hashtags = tweet["entities"]["hashtags"]  # if present, extract the hashtags
            if hashtags:
                if first_tweet:
                    # set latest_date to the current datetime if this is the first tweet
                    latest_date = datetime.datetime.fromtimestamp(
                        time.mktime(datetime.datetime.now().timetuple()), pytz.utc)
                    # latest_date = datetime.datetime.fromtimestamp(
                    #     mktime_tz(parsedate_tz("Fri Oct 30 15:29:44 +0000 2015")), pytz.utc)
                    first_tweet = False
                # extract the hashtags, clean them and keep only those longer than 1 character
                tweet_hash[tweet["id"]] = remove_dups(
                    ['#' + str(remove_unicode(ht['text'])).lower()
                     for ht in hashtags
                     if ht is not None and len(remove_unicode(ht['text'])) > 1])
                tweet_id[tweet["id"]] = [
                    datetime.datetime.fromtimestamp(
                        mktime_tz(parsedate_tz(tweet['created_at'])), pytz.utc), 0]
                create_graph(tweet_hash[tweet["id"]])  # calls the graph builder for this tweet's hashtags
                for key, value in tweet_id.items():  # check for old tweets; if found, evict them
                    if (latest_date - value[0]).total_seconds() > 60:
                        if len(tweet_hash[key]) >= 2:
                            evict_graph(tweet_hash[key], key)
                # for i in older_tweet:  # removes old tweets from tweet_id
                #     if i in tweet_id.keys(): del tweet_id[i]
                tweet_date = datetime.datetime.fromtimestamp(
                    mktime_tz(parsedate_tz(tweet['created_at'])), pytz.utc)
                if tweet_date >= latest_date:
                    latest_date = tweet_date
    feature2 = open(outfile, 'w')
    for degree in rolling_degree:
        feature2.write(str(degree) + '\n')  # write into the output file
    print("Processing is completed!!!")
def main():
    ts = 'Fri Dec 07 16:12:48 +0100 2012'
    dt = int(mktime_tz(parsedate_tz(ts.strip())))
    print dt
    ts = 'Fri Dec 07 16:12:48 +0000 2012'
    dt = int(mktime_tz(parsedate_tz(ts.strip())))
    print dt
def handle(self, **options):
    count = TLE.objects.count()
    if count:
        last_TLE = TLE.objects.first()
        last_mktime = mktime_tz(parsedate_tz(last_TLE.datetime_in_lines.ctime()))
    else:
        last_mktime = 0
    url = 'http://www.celestrak.com/NORAD/elements/stations.txt'
    resp = get(url)
    url_mktime = mktime_tz(parsedate_tz(resp.headers['last-modified']))
    url_datetime = datetime.utcfromtimestamp(url_mktime)
    if url_mktime > last_mktime:
        self.stdout.write('Date and time of creation TLE: {}'.format(url_datetime.isoformat()))
        self.stdout.write('New TLE found, downloading...')
        result = urlretrieve(url, 'TLE.txt')
        fh = open(result[0], 'r', encoding='utf8')
        lines = fh.readlines()[:3]
        fh.close()
        os.remove(result[0])
        title = lines[0].strip()
        norad_id = int(lines[1][2:7])
        sat, status = Satellite.objects.get_or_create(title=title, satellite_number=norad_id)
        try:
            self.stdout.write('Start create and save object - ' + sat.title + '\n')
            TLE.objects.bulk_create(unique_tle(lines, sat))
            self.stdout.write('Finished create and save object - ' + sat.title + '\n')
        except:
            self.stdout.write('Fail create and save object - ' + sat.title + '\n')
    else:
        self.stdout.write('No new TLE. A new attempt after 5 minutes...')
def parse_pubdate(text):
    """Parse a date string into a Unix timestamp

    >>> parse_pubdate('Fri, 21 Nov 1997 09:55:06 -0600')
    880127706

    >>> parse_pubdate('2003-12-13T00:00:00+02:00')
    1071266400

    >>> parse_pubdate('2003-12-13T18:30:02Z')
    1071340202

    >>> parse_pubdate('Mon, 02 May 1960 09:05:01 +0100')
    -305049299

    >>> parse_pubdate('')
    0

    >>> parse_pubdate('unknown')
    0
    """
    if not text:
        return 0

    parsed = parsedate_tz(text)
    if parsed is not None:
        try:
            pubtimeseconds = int(mktime_tz(parsed))
            return pubtimeseconds
        except (OverflowError, ValueError):
            logger.warning('bad pubdate %s is before epoch or after end of time (2038)', parsed)
            return 0

    try:
        parsed = time.strptime(text[:19], '%Y-%m-%dT%H:%M:%S')
        if parsed is not None:
            m = re.match(r'^(?:Z|([+-])([0-9]{2})[:]([0-9]{2}))$', text[19:])
            if m:
                parsed = list(iter(parsed))
                if m.group(1):
                    offset = 3600 * int(m.group(2)) + 60 * int(m.group(3))
                    if m.group(1) == '-':
                        offset = 0 - offset
                else:
                    offset = 0
                parsed.append(offset)
                return int(mktime_tz(tuple(parsed)))
            else:
                return int(time.mktime(parsed))
    except Exception:
        pass

    logger.error('Cannot parse date: %s', repr(text))
    return 0
def _retrieve_mails(uri):
    LOG.debug('Retrieving mail archive from uri: %s', uri)
    content = utils.read_uri(uri)
    if not content:
        LOG.error('Error reading mail archive from uri: %s', uri)
        return
    content = utils.gzip_decompress(content)
    LOG.debug('Mail archive is loaded, start processing')

    content += TRAILING_RECORD

    for rec in re.finditer(MAIL_BOX_PATTERN, content):
        email = rec.groupdict()
        email['author_email'] = email['author_email'].replace(' at ', '@', 1)
        if not utils.check_email_validity(email['author_email']):
            continue

        email['date'] = int(email_utils.mktime_tz(
            email_utils.parsedate_tz(email['date'])))

        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            collection = set()
            for item in re.finditer(pattern, email['body']):
                groups = item.groupdict()
                item_id = groups['id']
                if 'module' in groups:
                    item_id = groups['module'] + ':' + item_id
                    email['module'] = groups['module']
                collection.add(item_id)
            email[pattern_name] = list(collection)

        yield email
def fetch_photos(self, user):
    """Fetch up to 40 photos from the tumbleblog of a specified user"""
    user_cache_dir = os.path.join(self.cache_dir, user)
    if not os.path.exists(user_cache_dir):
        os.mkdir(user_cache_dir)
    feed = etree.parse(RSS_URL.format(user=user))
    for item in ITEM_XPATH(feed):
        (guid,) = GUID_XPATH(item)
        guid = guid.rsplit(':', 1)[1]
        cache_file_name = os.path.join(user_cache_dir, guid) + '.json'
        if os.path.exists(cache_file_name):
            continue
        attrs = json.loads(ATTR_XPATH(item)[0])
        if attrs['type'] != 'image':
            continue
        (datestr,) = DATE_XPATH(item)
        timestamp = int(mktime_tz(parsedate_tz(datestr)))
        image_url = attrs['url']
        image_file = image_url.rsplit('/', 1)[1]
        local_image = os.path.join(user_cache_dir, image_file)
        urlretrieve(image_url, local_image)
        details = dict(image_file=image_file, timestamp=timestamp, body=attrs['body'])
        with file(cache_file_name, 'wb') as descriptor:
            json.dump(details, descriptor)
def refresh(self, now=None):
    """
    This fairly complex and heuristic function refreshes a server
    response for replay.

        - It adjusts date, expires and last-modified headers.
        - It adjusts cookie expiration.
    """
    if not now:
        now = time.time()
    delta = now - self.timestamp_start
    refresh_headers = [
        "date",
        "expires",
        "last-modified",
    ]
    for i in refresh_headers:
        if i in self.headers:
            d = parsedate_tz(self.headers[i][0])
            if d:
                new = mktime_tz(d) + delta
                self.headers[i] = [formatdate(new)]
    c = []
    for i in self.headers["set-cookie"]:
        c.append(self._refresh_cookie(i, delta))
    if c:
        self.headers["set-cookie"] = c
def was_modified_since(self, header=None, mtime=0, size=0):
    """
    Was something modified since the user last downloaded it?

    header
        This is the value of the If-Modified-Since header. If this is None,
        I'll just return True.

    mtime
        This is the modification time of the item we're talking about.

    size
        This is the size of the item we're talking about.
    """
    try:
        if header is None:
            raise ValueError
        matches = re.match(r"^([^;]+)(; length=([0-9]+))?$", header, re.IGNORECASE)
        header_mtime = mktime_tz(parsedate_tz(matches.group(1)))
        header_len = matches.group(3)
        if header_len and int(header_len) != size:
            raise ValueError()
        if mtime > header_mtime:
            raise ValueError()
    except (AttributeError, ValueError, OverflowError):
        return True
    return False
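# Hedged usage sketch (not part of the snippet above): formatdate(..., usegmt=True)
# produces the RFC 1123 style date an If-Modified-Since header carries, and
# parsedate_tz()/mktime_tz() turn it back into the epoch seconds that
# was_modified_since() compares against the resource mtime.
from email.utils import formatdate, mktime_tz, parsedate_tz

mtime = 1700000000
header = formatdate(mtime, usegmt=True)          # 'Tue, 14 Nov 2023 22:13:20 GMT'
assert mktime_tz(parsedate_tz(header)) == mtime  # the round trip is lossless to the second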
def parse_prod(logdate):
    global stats
    global users
    maildate = ''.join([x[-2:] for x in logdate.split('-')])
    mailarchive = join(utils.get_conf()['Dir::Base'], 'mail/archive',
                       'mail-%s.xz' % maildate)
    if not isfile(mailarchive):
        return
    (fd, tmpfile) = utils.temp_filename(utils.get_conf()['Dir::TempPath'])
    system('xzcat %s > %s' % (mailarchive, tmpfile))
    for message in mbox(tmpfile):
        if (message['subject'] and
                message['subject'].startswith('Comments regarding')):
            try:
                member = users[' '.join(message['From'].split()[:-1])]
            except KeyError:
                continue
            ts = mktime_tz(parsedate_tz(message['date']))
            timestamp = datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S")
            date = parse_timestamp(timestamp)
            if date not in stats:
                stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0, 'REJECT': 0, 'PROD': 0},
                               'members': {}}
            if member not in stats[date]['members']:
                stats[date]['members'][member] = {'ACCEPT': 0, 'REJECT': 0, 'PROD': 0}
            if member not in stats['history']['members']:
                stats['history']['members'][member] = {'ACCEPT': 0, 'REJECT': 0, 'PROD': 0}
            stats[date]['stats']['PROD'] += 1
            stats[date]['members'][member]['PROD'] += 1
            stats['history']['stats']['PROD'] += 1
            stats['history']['members'][member]['PROD'] += 1
    unlink(tmpfile)
def datetime_from_rfc822(value):
    '''
    Turns an RFC822 formatted date into a datetime object.

    Example::

        inputs.datetime_from_rfc822('Wed, 02 Oct 2002 08:00:00 EST')

    :param str value: The RFC822-complying string to transform
    :return: The parsed datetime
    :rtype: datetime
    :raises ValueError: if value is an invalid date literal
    '''
    raw = value
    if not time_regex.search(value):
        value = ' '.join((value, '00:00:00'))
    try:
        timetuple = parsedate_tz(value)
        timestamp = mktime_tz(timetuple)
        if timetuple[-1] is None:
            return datetime.fromtimestamp(timestamp).replace(tzinfo=pytz.utc)
        else:
            return datetime.fromtimestamp(timestamp, pytz.utc)
    except Exception:
        raise ValueError('Invalid date literal "{0}"'.format(raw))
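# A minimal illustration (mine, not from the project above) of the branch on
# timetuple[-1]: parsedate_tz() returns a 10-tuple whose last element is the UTC
# offset in seconds, or None when the input carries no recognizable zone, in
# which case mktime_tz() silently assumes local time.
from email.utils import parsedate_tz

print(parsedate_tz('Wed, 02 Oct 2002 08:00:00 -0500')[-1])  # -18000
print(parsedate_tz('Wed, 02 Oct 2002 08:00:00')[-1])        # None (no zone given)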
def get_mail_queue(site):
    """ Get a list of files that are still in the NEW mail_queue folder """
    join = os.path.join
    queue_path = os.environ.get('NAAYA_MAIL_QUEUE', None)
    if queue_path is None:
        return []
    mail_queue = []
    new_queue_path = join(queue_path, 'new')
    if os.path.isdir(new_queue_path):
        # Get all messages files
        messages = [join(new_queue_path, filename)
                    for filename in sorted(os.listdir(new_queue_path))]
        for message in messages:
            message_file = open(message, 'r+')
            mail = message_from_file(message_file)
            message_file.close()
            # Prepare the date to be formatted with utShowFullDateTime
            date = email_utils.parsedate_tz(mail.get('Date', ''))
            date = email_utils.mktime_tz(date)
            date = datetime.fromtimestamp(date)
            mail_queue.append({
                'subject': mail.get('Subject', '(no-subject)'),
                'content': mail.get_payload(decode=True),
                'recipients': mail.get_all('To'),
                'sender': mail.get('From'),
                'date': date,
                'filename': os.path.split(message)[-1]
            })
    return mail_queue
def parse_item(self, item):
    ep = Storage()
    ep.title = item.findtext('title')
    match = self.title_strip_quality.search(ep.title)
    if match:
        title_ = match.group(1)
    else:
        title_ = ep.title
    info = self.parse_title(title_)
    ep.update(info)
    ep.description = item.findtext('description')
    ep.link = item.find('enclosure').get('url')
    ep.pubdate = item.findtext('pubDate')
    ep.filename = item.findtext('title')
    ep.pubdate = datetime.datetime.utcfromtimestamp(mktime_tz(parsedate_tz(ep.pubdate)))
    ep.filterwith = ep.title
    ep.size = item.find('enclosure').get('length')
    try:
        ep.size = int(ep.size)
    except:
        ep.size = 0
    if ep.size < 100*1024*1024:
        ep.size = 300*1024*1024
    ep.guid = item.findtext('guid')
    return ep
def parse_item(self, item):
    ep = Storage()
    ep.filename = item.findtext('{http://xmlns.ezrss.it/0.1/}torrent/{http://xmlns.ezrss.it/0.1/}fileName')
    info = self.parse_title(ep.filename)
    ep.update(info)
    ep.title = item.findtext('title')
    ep.link = item.findtext('link')
    ep.description = item.findtext('description')
    ep.guid = item.findtext('guid')
    ep.pubdate = item.findtext('pubDate')
    ep.magnet = item.findtext('{http://xmlns.ezrss.it/0.1/}torrent/{http://xmlns.ezrss.it/0.1/}magnetURI')
    ep.filename = item.findtext('{http://xmlns.ezrss.it/0.1/}torrent/{http://xmlns.ezrss.it/0.1/}fileName')
    ep.pubdate = datetime.datetime.utcfromtimestamp(mktime_tz(parsedate_tz(ep.pubdate)))
    ep.size = item.find('enclosure').get('length')
    try:
        ep.size = int(ep.size)
    except:
        ep.size = 0
    if ep.size < 100*1024*1024:
        ep.size = 300*1024*1024
    if ep.magnet == 'magnet:?xt=urn:btih:&dn=':
        # check at least for a bt-chat info_hash
        btchat = self.magnetr.search(ep.link)
        if btchat:
            hash = btchat.group(1)
            ep.magnet = 'magnet:?xt=urn:btih:%s' % hash
            ep.link = None
    if not ep.guid:
        ep.guid = ep.description
    ep.filterwith = ep.title
    return ep
def get_webex_email(site, filename, where_to_read="sent-webex"):
    """ Show a specific webex email saved on the disk """
    save_path = get_log_dir(site)
    join = os.path.join
    if save_path:
        save_path = join(save_path, where_to_read)
        if os.path.isdir(save_path):
            message_path = join(save_path, filename)
            try:
                message_file = open(message_path, "r+")
            except IOError:
                return None
            mail = message_from_file(message_file)
            message_file.close()
            # Prepare the date to be formatted with utShowFullDateTime
            date = email_utils.parsedate_tz(mail.get("Date", ""))
            date = email_utils.mktime_tz(date)
            date = datetime.fromtimestamp(date)
            return {
                "subject": mail.get("Subject", "(no-subject)"),
                "content": mail.get_payload(decode=True).replace("\n\n", "</p><p>").replace("\n", "<br/>"),
                "recipients": mail.get_all("To"),
                "sender": mail.get("From"),
                "date": date,
                "webex": mail.get("X-Accept-Webex-Data", ""),
            }
def dbopen(environ, db_basename):
    stderr = environ['wsgi.errors']
    if not db_basename in databases.databases:
        db_filename = os.path.join(environ["DATADIR"], db_basename)
        stderr.write("db_filename: %s\n" % db_filename)
        conn = sqlite3.connect(db_filename)
        conn.enable_load_extension(True)
        conn.load_extension("mod_spatialite")
        conn.enable_load_extension(False)
        conn.row_factory = sqlite3.Row
        databases.databases[db_basename] = (conn.cursor(),
                                            int(os.path.getmtime(db_filename)))
    (cursor, last_modified) = databases.databases[db_basename]
    time_now = time.time()
    response_headers = [
        ('Date', formatdate(time_now, usegmt=True)),
        ('Last-Modified', formatdate(last_modified, usegmt=True)),
        ('Cache-Control', 'public,max-age=86400'),
    ]
    if_modified_since = environ.get("HTTP_IF_MODIFIED_SINCE")
    if if_modified_since is not None:
        stderr.write("If-Modified-Since: %s\n" % if_modified_since)
        stderr.write("Last-Modified: %s\n" % formatdate(last_modified, usegmt=True))
        if_modified_since = mktime_tz(parsedate_tz(if_modified_since))
        if last_modified <= if_modified_since:
            stderr.write("304 Not Modified\n")
            return (None, response_headers)
    return (cursor, response_headers)
def get_bulk_email(site, filename, where_to_read="sent-bulk", message_file_path=None):
    """ Show a specific bulk email saved on the disk """
    try:
        if not message_file_path:
            save_path = _get_message_path(site, where_to_read)
            message_file_path = os.path.join(save_path, filename)
        message_file = open(message_file_path, "r+")
    except (IOError, TypeError, AttributeError):
        return None
    mail = message_from_file(message_file)
    message_file.close()
    # Prepare the date to be formatted with utShowFullDateTime
    date = email_utils.parsedate_tz(mail.get("Date", ""))
    date = email_utils.mktime_tz(date)
    date = datetime.fromtimestamp(date)
    r = {
        "subject": mail.get("Subject", "(no-subject)"),
        "content": mail.get_payload(decode=True).replace("\n\n", "</p><p>").replace("\n", "<br/>"),
        "recipients": mail.get_all("To"),
        "cc_recipients": mail.get_all("Cc"),
        "sender": mail.get("From"),
        "date": date,
        "webex": mail.get("X-Accept-Webex-Data", ""),
    }
    return r
def get_bulk_email(site, filename, where_to_read='sent-bulk', message_file_path=None):
    """ Show a specific bulk email saved on the disk """
    try:
        if not message_file_path:
            save_path = _get_message_path(site, where_to_read)
            message_file_path = os.path.join(save_path, filename)
        message_file = open(message_file_path, 'r+')
    except (IOError, TypeError, AttributeError):
        return None
    mail = message_from_file(message_file)
    message_file.close()
    # Prepare the date to be formatted with utShowFullDateTime
    date = email_utils.parsedate_tz(mail.get('Date', ''))
    date = email_utils.mktime_tz(date)
    date = datetime.fromtimestamp(date)
    r = {
        'subject': mail.get('Subject', '(no-subject)'),
        'content': mail.get_payload(decode=True).replace(
            '\n\n', '</p><p>').replace('\n', '<br/>'),
        'recipients': mail.get_all('To'),
        'cc_recipients': mail.get_all('Cc'),
        'sender': mail.get('From'),
        'date': date,
        'webex': mail.get('X-Accept-Webex-Data', '')
    }
    return r
def get_bulk_email(site, filename):
    """ Show a specific bulk email saved on the disk """
    save_path = get_log_dir(site)
    join = os.path.join
    if save_path:
        save_path = join(save_path, 'sent-bulk')
        if os.path.isdir(save_path):
            message_path = join(save_path, filename)
            try:
                message_file = open(message_path, 'r+')
            except IOError:
                return None
            mail = message_from_file(message_file)
            message_file.close()
            # Prepare the date to be formatted with utShowFullDateTime
            date = email_utils.parsedate_tz(mail.get('Date', ''))
            date = email_utils.mktime_tz(date)
            date = datetime.fromtimestamp(date)
            return {
                'subject': mail.get('Subject', '(no-subject)'),
                'content': mail.get_payload(decode=True).replace(
                    '\n\n', '</p><p>').replace('\n', '<br/>'),
                'recipients': mail.get_all('To'),
                'sender': mail.get('From'),
                'date': date,
            }
def __init__(self, mbFile, listAddrProg, startDate=None, endDate=None):
    self.mbFile = mbFile
    mb = mailbox.UnixMailbox(file(mbFile, "r"), email.message_from_file)
    checked = set()
    count = 0
    config = yaml.load(open("game.yaml"))
    tz = pytz.timezone(config.get("timezone", "UTC"))
    for msg in mb:
        try:
            if not listAddrProg.search(msg["to"]):
                continue
            timetuple = parsedate_tz(msg["date"])
            timestamp = mktime_tz(timetuple)
            date = datetime.fromtimestamp(timestamp)
            date = date.replace(tzinfo=tz)
            if startDate > date or date > endDate:
                continue
            mail = Mail(msg, date)
            self.msgs.append(mail)
            # count = count + 1
            # print "count...", count
            # if count == 50:
            #     break
        except Exception as e:
            print "failed analyzing a message!", e
def _refresh_cookie(self, c, delta):
    """
    Takes a cookie string c and a time delta in seconds, and returns
    a refreshed cookie string.
    """
    try:
        c = Cookie.SimpleCookie(str(c))
    except Cookie.CookieError:
        raise ValueError("Invalid Cookie")
    for i in c.values():
        if "expires" in i:
            d = parsedate_tz(i["expires"])
            if d:
                d = mktime_tz(d) + delta
                i["expires"] = formatdate(d)
            else:
                # This can happen when the expires tag is invalid.
                # reddit.com sends an expires tag like this: "Thu, 31 Dec
                # 2037 23:59:59 GMT", which is valid RFC 1123, but not
                # strictly correct according to the cookie spec. Browsers
                # appear to parse this tolerantly - maybe we should too.
                # For now, we just ignore this.
                del i["expires"]
    ret = c.output(header="").strip()
    if not ret:
        raise ValueError("Invalid Cookie")
    return ret
def refresh_set_cookie_header(c, delta):
    """
    Args:
        c: A Set-Cookie string
        delta: Time delta in seconds
    Returns:
        A refreshed Set-Cookie string
    """
    name, value, attrs = parse_set_cookie_header(c)
    if not name or not value:
        raise ValueError("Invalid Cookie")

    if "expires" in attrs:
        e = parsedate_tz(attrs["expires"])
        if e:
            f = mktime_tz(e) + delta
            attrs = attrs.with_set_all("expires", [formatdate(f)])
        else:
            # This can happen when the expires tag is invalid.
            # reddit.com sends an expires tag like this: "Thu, 31 Dec
            # 2037 23:59:59 GMT", which is valid RFC 1123, but not
            # strictly correct according to the cookie spec. Browsers
            # appear to parse this tolerantly - maybe we should too.
            # For now, we just ignore this.
            attrs = attrs.with_delitem("expires")
    ret = format_set_cookie_header(name, value, attrs)
    if not ret:
        raise ValueError("Invalid Cookie")
    return ret
def setUp(self):
    """
    Reads an arbitrary number of mail messages and stores them in a brand
    new messages table. DANGER: Any existing message table WILL be lost.
    """
    curs.execute("DROP TABLE IF EXISTS message")
    conn.commit()
    curs.execute(TBLDEF)
    conn.commit()
    files = glob(FILESPEC)
    self.msgids = {}       # Keyed by message_id
    self.message_ids = {}  # Keyed by id
    self.msgdates = []
    self.rowcount = 0
    for f in files:
        ff = open(f)
        text = ff.read()
        msg = message_from_string(text)
        id = self.msgids[msg['message-id']] = maildb.store(msg)
        self.message_ids[id] = msg['message-id']
        date = msg['date']
        self.msgdates.append(datetime.datetime.fromtimestamp(mktime_tz(parsedate_tz(date))))
        self.rowcount += 1  # Assuming no duplicated Message-IDs
def fixDir(dname):
    for fname in os.listdir(dname):
        fname = os.path.join(dname, fname)
        fp = open(fname)
        msg = Parser().parse(fp, True)
        if 'Date' in msg:
            date = parsedate_tz(msg['Date'])
            if date:
                # Ok I had some old emails with messed up Date headers as so:
                #   Date: Sun, 22 Aug US/E 13:01:00 -0400
                # I knew these were briefly from '99-'00 so I manually fix that here.
                '''
                if date[0] < 1900:
                    if date[1] < 3:
                        year = 2000
                    else:
                        year = 1999
                    date = (year,) + date[1:]
                    print >> sys.stderr, "Fixing up year '%s' => '%s' for %s" % (msg['Date'], date, fname)
                '''
                try:
                    timestamp = mktime_tz(date)
                    os.utime(fname, (timestamp, timestamp))
                except ValueError:
                    print >> sys.stderr, "Invalid date '%s' for %s: %s" % (msg['Date'], fname, date)
            else:
                print >> sys.stderr, "Could not parse date '%s' for %s" % (msg['Date'], fname)
        else:
            print >> sys.stderr, 'No Date header in %s' % (fname)
def handler(doc):
    # a 'rfc822' stores 'headers' as a dict, with each entry being a list.
    # We only care about headers which rfc5322 says must appear 0 or 1 times,
    # so flatten the header values here...
    headers = dict((k, v[0]) for (k, v) in doc["headers"].iteritems())

    # for now, 'from' etc are all tuples of [identity_type, identity_id]
    callbacks = []
    ret = {}
    if "from" in headers:
        name, addr = parseaddr(headers["from"])
        ret["from"] = ["email", addr.lower()]
        ret["from_display"] = name

    if "to" in headers:
        id_list = ret["to"] = []
        disp_list = ret["to_display"] = []
        fill_identity_info(headers["to"], id_list, disp_list)

    if "cc" in headers:
        id_list = ret["cc"] = []
        disp_list = ret["cc_display"] = []
        fill_identity_info(headers["cc"], id_list, disp_list)

    if "subject" in headers:
        ret["subject"] = headers["subject"]

    if "date" in headers:
        dval = headers["date"]
        if dval:
            try:
                ret["timestamp"] = mktime_tz(parsedate_tz(dval))
            except (ValueError, TypeError), exc:
                logger.debug("Failed to parse date %r in doc %r: %s",
                             dval, doc["_id"], exc)
                # later extensions will get upset if no attr exists
                # XXX - is this still true? We should fix those extensions!
                ret["timestamp"] = 0
def refresh(self, now=None):
    """
    This fairly complex and heuristic function refreshes a server
    response for replay.

        - It adjusts date, expires and last-modified headers.
        - It adjusts cookie expiration.
    """
    if not now:
        now = time.time()
    delta = now - self.timestamp_start
    refresh_headers = [
        "date",
        "expires",
        "last-modified",
    ]
    for i in refresh_headers:
        if i in self.headers:
            d = parsedate_tz(self.headers[i])
            if d:
                new = mktime_tz(d) + delta
                self.headers[i] = formatdate(new)
    c = []
    for set_cookie_header in self.headers.get_all("set-cookie"):
        try:
            refreshed = self._refresh_cookie(set_cookie_header, delta)
        except ValueError:
            refreshed = set_cookie_header
        c.append(refreshed)
    if c:
        self.headers.set_all("set-cookie", c)
def setUp(self):
    """
    Reads an arbitrary number of mail messages and stores them in a
    brand new messages table.
    """
    self.conn = msc.Connect(**loginInfo)
    self.curs = self.conn.cursor()
    self.curs.execute("DROP TABLE IF EXISTS {0}".format(TBLNM))
    self.conn.commit()
    curs.execute(TBLDEF)
    conn.commit()
    files = glob(FILESPEC)
    self.msgIds = {}      # Keyed by messageId
    self.messageIds = {}  # Keyed by id
    self.msgdates = []
    self.rowcount = 0
    for f in files:
        ff = open(f)
        text = ff.read()
        msg = message_from_string(text)
        iD = self.msgIds[msg['message-id']] = maildb.store(msg, self.conn, self.curs, TBLNM)
        self.messageIds[iD] = msg['message-id']
        date = msg['date']
        self.msgdates.append(datetime.datetime.fromtimestamp(mktime_tz(parsedate_tz(date))))
        self.rowcount += 1  # Assuming no duplicated Message-IDs
        ff.close()
def parse_redis_message(payload):
    try:
        headers, body = payload.split('\r\n' * 2, 1)
        headers = dict(map(lambda h: h.split(': ', 1), headers.split('\r\n')))
        return Message(mktime_tz(parsedate_tz(headers['Last-Modified'])),
                       int(headers['Etag']),
                       headers['Content-Type'],
                       body)
    except:
        raise Message.Invalid()
def moveByLabel(self):
    from email.utils import parsedate_tz, mktime_tz
    from mailbox import NoSuchMailboxError
    for folder in self.oldBox.list_folders():
        _c_moved = 0
        _c_rej = 0
        _c_total = self.oldBox.get_folder(folder).__len__()
        print("\n[I] Folder " + folder + "", end="")
        for key, msg in self.oldBox.get_folder(folder).iteritems():
            _date = msg['Date']
            if _date:
                if (mktime_tz(parsedate_tz(_date)) - self.deltaT) < 0:
                    if _c_moved == 0:
                        # To detect whether nothing has been moved yet, so this can be a new folder
                        try:
                            self.newBox.get_folder(folder)
                        except NoSuchMailboxError:
                            print("[I]\tCreating in new: %s" % folder)
                            self.newBox.add_folder(folder)
                    # Mooooooooooooo'ving!
                    self.newBox.get_folder(folder).add(msg)
                    self.oldBox.get_folder(folder).remove(key)
                    _c_moved += 1
                    print("\r[I]\tStats: Not moved (Bad Mail): %d/%d // Moved: %d/%d"
                          % (_c_rej, _c_total, _c_moved, _c_total), end="")
            else:
                _c_rej += 1
        if _c_moved >= _c_total:
            print("\n[W]\tRemoving folder %s" % folder, end="")
    print("")
def _parse_sibling(self, sibling, headers, data):
    """
    Parses a single sibling out of a response.
    """
    sibling.exists = True

    # Parse the headers...
    for header, value in headers:
        header = header.lower()
        if header == 'content-type':
            sibling.content_type, sibling.charset = \
                self._parse_content_type(value)
        elif header == 'etag':
            sibling.etag = value
        elif header == 'link':
            sibling.links = self._parse_links(value)
        elif header == 'last-modified':
            sibling.last_modified = mktime_tz(parsedate_tz(value))
        elif header.startswith('x-riak-meta-'):
            metakey = header.replace('x-riak-meta-', '')
            sibling.usermeta[metakey] = value
        elif header.startswith('x-riak-index-'):
            field = header.replace('x-riak-index-', '')
            reader = csv.reader([value], skipinitialspace=True)
            for line in reader:
                for token in line:
                    token = decode_index_value(field, token)
                    sibling.add_index(field, token)
        elif header == 'x-riak-deleted':
            sibling.exists = False

    sibling.encoded_data = data

    return sibling
def extract_file(self):
    extracted_file = {}
    i = 0  # Used to count actual JSON messages (with time and hashtags information)
    # j = 0  # Used to count API messages about connection and rate limits (without time and hashtags information)
    for line in self.my_file:
        temp = json.loads(line)  # read data line by line
        if 'created_at' in temp.keys():  # select the actual JSON messages
            temp_time = temp['created_at']  # extract time
            temp_tag = temp['entities']['hashtags']  # extract hashtags
            # transfer the time string to a datetime object
            # temp_time = datetime.strptime(temp_time, '%a %b %d %H:%M:%S %z %Y')
            # strptime is Python platform only, do not use the above line if run by shell
            temp_time = datetime(1970, 1, 1) + timedelta(seconds=mktime_tz(parsedate_tz(temp_time)))
            # store time, hashtag information, and the hashtag words extracted below
            extracted_file[i] = [temp_time, temp_tag, []]
            # extract the hashtag words
            if temp_tag:
                for tag in temp_tag:
                    extracted_file[i][2].append(tag['text'])
            else:
                # no hashtags
                pass
            i += 1
        else:
            # these messages are Twitter API notices resulting from the rate limit;
            # they could be stored in apifile for future use.
            # here we remove these messages from the dataset
            pass
            # apifile[j] = temp
            # j += 1
    return extracted_file
def parse_date_time(stamp):
    ts = parsedate_tz(stamp)
    ts = mktime_tz(ts)
    return datetime.fromtimestamp(ts)
def _parse_date(date):
    tm = parsedate_tz(date)
    if tm:
        return mktime_tz(tm)
    return 0
def from_value(cls, value):
    dtime = datetime.datetime.fromtimestamp(eut.mktime_tz(eut.parsedate_tz(value)))
    return cls(dtime, value)
def mail_date(mail):
    t = parsedate_tz(mail.get('Date', ''))
    if not t:
        return datetime.datetime.utcnow()
    return datetime.datetime.utcfromtimestamp(mktime_tz(t))
def run(self): params = self.cfg_params now_datetime = datetime.datetime.now() mailbox = None try: logging.info('Logging into the mailbox...') mailbox = poplib.POP3(params['server']) mailbox.user(params['username']) mailbox.pass_(params['#password']) except poplib.error_proto as error: raise RuntimeError( 'Unable to connect to the server. Please check: server, username and password' ) except socket.gaierror as error: raise RuntimeError('Unable to resolve the server name') for i in reversed(range(len(mailbox.list()[1]))): logging.info("Reading an email from the mailbox...") lines = mailbox.retr(i + 1)[1] msg_content = b'\r\n'.join(lines).decode('utf-8') email = Parser().parsestr(msg_content) email_datetime = datetime.datetime.fromtimestamp( mktime_tz(parsedate_tz(email.get_all('Date')[0]))) if now_datetime - timedelta( hours=params['accept_timedelta_hours']) > email_datetime: logging.info("Email is older than 'accept_timedelta_hours'. " "The email is ignored and extracting is done") break if not (params['accept_from'] in email.get_all('From')[0]): logging.info( "Email is not from the 'accept_from' address (<%s>) but from <%s>. " "The email is ignored" % (params['accept_from'], email.get_all('From')[0])) continue logging.info("Parsing the content of the email...") for part in email.walk(): if (part.get_filename() is None): continue filename = None raw_filename = part.get_filename() if (raw_filename is not None) and (decode_header(raw_filename)[0][1] is not None): filename = decode_header(raw_filename)[0][0].decode( decode_header(raw_filename)[0][1]) else: filename = raw_filename if (('accept_filename' in params) and (filename != params['accept_filename'])): logging.info( "Email attachment is not the name that is accepted but '%s'. " "The attachment is ignored" % (filename)) continue elif (('accept_re_filename' in params) and (re.match(params['accept_re_filename'], part.get_filename())) is None): logging.info( "Email attachment is not accepted by RE: '%s'. " "The attachment is ignored" % (filename)) continue logging.info("Valid email attachment found, downloading...") output_filename = None if ('accept_filename' in params): output_filename = '%s/out/files/%s' % (os.getenv( 'KBC_DATADIR', '.'), self.cfg_params['accept_filename']) elif ('accept_re_filename' in params): output_filename = '%s/out/files/%s' % (os.getenv( 'KBC_DATADIR', '.'), filename) if (output_filename is None): logging.info( "Unable to determine the name of the output file") fp = open(output_filename, 'wb') fp.write(part.get_payload(decode=True)) fp.close() logging.info('Logging out...') mailbox.quit()
def rfc2822_to_epoch(datestr):
    """Given rfc2822 date/time format, return seconds since epoch"""
    return mktime_tz(parsedate_tz(datestr))
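# A small defensive variant (hypothetical, not from the source above):
# parsedate_tz() returns None for unparseable input, and mktime_tz(None) raises
# TypeError, so callers that cannot trust their input usually guard the call.
from email.utils import mktime_tz, parsedate_tz

def rfc2822_to_epoch_safe(datestr, default=None):
    parsed = parsedate_tz(datestr)
    if parsed is None:
        return default
    return mktime_tz(parsed)

# rfc2822_to_epoch_safe('Fri, 21 Nov 1997 09:55:06 -0600') -> 880127706
# rfc2822_to_epoch_safe('not a date') -> None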
def has_been_modified(self, request, response, spider): """Return whether the response was modified since last seen. We check against the database here. If the response has been modified, we update the database. If there is no stored last modified date, we save one. """ if hasattr(spider.scanner.scan_object, 'filescan'): try: # Removes unneeded prefix file_path = response.url.replace('file://', '') # Transform URL string into normal string file_path = unquote(file_path) # Retrieves file timestamp from mounted drive last_modified = datetime.datetime.fromtimestamp( os.path.getmtime(file_path), tz=pytz.utc) except OSError as e: logging.error( 'Error occured while getting last modified for file %s' % file_path) logging.error('Error message %s' % e) else: # Check the Last-Modified header to see if the content has been # updated since the last time we checked it. last_modified_header = response.headers.get("Last-Modified", None) if last_modified_header is not None: last_modified_header_date = datetime.datetime.fromtimestamp( mktime_tz( parsedate_tz(last_modified_header.decode('utf-8'))), tz=pytz.utc) else: last_modified_header_date = None if last_modified_header_date is None and request.method == 'GET': content_type_header = response.headers.get( "Content-Type", None).decode('utf-8') if content_type_header.startswith("text/html"): # TODO: Check meta tag. # TODO: This is correct, but find out where it goes :-) try: body_html = html.fromstring(response.body) except: logging.info('error occured.') meta_dict = { list(el.values())[0]: list(el.values())[1] for el in body_html.findall('head/meta') } if 'last-modified' in meta_dict: lm = meta_dict['last-modified'] try: last_modified_header_date = arrow.get(lm).datetime except: logging.error( "Date format error on last modied: {0}".format( lm)) # lastmod comes from a sitemap.xml file sitemap_lastmod_date = request.meta.get("lastmod", None) if sitemap_lastmod_date is None: last_modified = last_modified_header_date logging.debug("Using header's last-modified date: %s" % last_modified) else: if last_modified_header_date is None: # No Last-Modified header, use the lastmod from the sitemap last_modified = sitemap_lastmod_date logging.debug("Using lastmod from sitemap %s" % last_modified) else: # Take the most recent of the two logging.debug( "Taking most recent of (header) %sand (sitemap) %s" % (last_modified_header_date, sitemap_lastmod_date)) last_modified = max(last_modified_header_date, sitemap_lastmod_date) logging.debug("Last modified %s" % last_modified) if last_modified is not None: # Check against the database canonical_url = canonicalize_url(response.url) try: url_last_modified = UrlLastModified.objects.get( url=canonical_url, scanner=self.get_scanner_object(spider)) stored_last_modified = url_last_modified.last_modified logging.info("Comparing header %s against stored %s" % (last_modified, stored_last_modified)) if (stored_last_modified is not None and last_modified == stored_last_modified): return False else: # Update last-modified date in database url_last_modified.last_modified = last_modified url_last_modified.save() return True except UrlLastModified.DoesNotExist: logging.debug("No stored Last-Modified header found.") url_last_modified = UrlLastModified( url=canonical_url, last_modified=last_modified, scanner=self.get_scanner_object(spider)) logging.debug("Saving new last-modified value %s" % url_last_modified) url_last_modified.save() return True else: # If there is no Last-Modified header, we have to assume it has # been modified. 
logging.debug('No Last-Modified header found at all.') return True
__project__ = "Veles Machine Learning Platform"
__versioninfo__ = 0, 9, 2
__version__ = ".".join(map(str, __versioninfo__))
__license__ = "Apache 2.0"
__copyright__ = u"© 2013-2015 Samsung Electronics Co., Ltd."
__authors__ = [
    "Gennady Kuznetsov", "Vadim Markovtsev", "Alexey Kazantsev",
    "Lyubov Podoynitsina", "Denis Seresov", "Dmitry Senin",
    "Alexey Golovizin", "Egor Bulychev", "Ernesto Sanches"
]
__contact__ = "Gennady Kuznetsov <*****@*****.**>"
__plugins__ = set()

try:
    __git__ = "$Commit$"
    __date__ = mktime_tz(parsedate_tz("$Date$"))
except Exception as ex:
    warn("Cannot expand variables generated by Git, setting them to None")
    __git__ = None
    __date__ = None

__logo_ext__ = ("Copyright %s" % __copyright__,
                "Released under Apache 2.0 license.",
                "https://velesnet.ml",
                "https://github.com/samsung/veles/issues")

__logo__ = \
    r" _ _ _____ _ _____ _____ " "\n" \
    r"| | | | ___| | | ___/ ___| " + \
    (" Version %s." % __version__) + \
    (" %s\n" % formatdate(__date__, True)) + \
    r"| | | | |__ | | | |__ \ `--. " + \
def get_timestamp(string_date):
    tt = parsedate_tz(string_date)
    return mktime_tz(tt)
def generate_api(user, config): #global user_infos global auths user_infos = {} auth_key = config["consumer_key"] + config["consumer_secret"] +\ config["access_token"] + config["access_secret"] if auth_key in auths: api = auths[auth_key]["api"] else: auth = tweepy.OAuthHandler(config["consumer_key"], config["consumer_secret"]) auth.set_access_token(config["access_token"], config["access_secret"]) api = tweepy.API(auth) auths[auth_key] = { "auth": auth, "api": api } if user not in user_infos: user_infos[user] = api.get_user(id=user) user_info = user_infos[user] username = user_info.screen_name.lower() title = "@" + user_info.screen_name if not config["author_username"] and "name" in user_info.__dict__ and len(user_info.name) > 0: title = user_info.name if "description" in user_info.__dict__ and len(user_info.description) > 0: description = user_info.description else: description = "%s's twitter" % title feed = { "title": title, "description": description, "author": username, "url": "https://twitter.com/" + username, "social": True, "entries": [] } tl = [] if config["count"] == -1: maxid = None while True: temp_tl = api.user_timeline(id=user, max_id=maxid, count=200, tweet_mode="extended") if not temp_tl: break tl = tl + temp_tl maxid = tl[-1].id - 1 sys.stderr.write("\r" + str(len(tl)) + " / " + str(user_info.statuses_count)) else: tl = api.user_timeline(id=user, count=config["count"], tweet_mode="extended") if not tl: return None for obj in tl: #caption = xml.sax.saxutils.unescape(re.sub(" *http[^ ]*t\.co/[^ ]*", "", obj.text)) #caption = xml.sax.saxutils.unescape(obj.text) #pprint.pprint(obj.__dict__) is_retweeted = False if "retweeted_status" in obj.__dict__ and obj.retweeted_status: is_retweeted = True if is_retweeted and not config["with_retweets"]: continue origcaption = obj.full_text.replace("\r", "\n") newcaption = origcaption if "entities" in obj.__dict__: if "urls" in obj.entities: for url in obj.entities["urls"]: newcaption = newcaption.replace(url["url"], url["expanded_url"]) caption = xml.sax.saxutils.unescape(re.sub(" *https?://t\.co/[^ ]*", "", newcaption)) #caption = xml.sax.saxutils.unescape(newcaption) date = rssit.util.localize_datetime(datetime.datetime.fromtimestamp(mktime_tz(parsedate_tz(obj._json["created_at"])))) entrydict = { "url": "https://twitter.com/" + obj.author.screen_name + "/status/" + obj.id_str, "caption": caption, "date": date, "updated_date": date, "author": obj.author.screen_name.lower(), "images": [], "videos": [] } #pprint.pprint(obj.__dict__) if "extended_entities" in obj.__dict__: for media in obj.__dict__["extended_entities"]["media"]: if media["type"] == "photo": url = media["media_url"] url = get_orig_image(url) entrydict["images"].append(url) #entrydict["images"].append(media["media_url"]) elif media["type"] == "video" or media["type"] == "animated_gif": videodict = { "image": media["media_url"] } variants = media["video_info"]["variants"] max_bitrate = -1 curr = None for variant in variants: if "bitrate" in variant and variant["bitrate"] > max_bitrate: curr = variant if not curr: curr = variants[0] videodict["video"] = curr["url"] entrydict["videos"].append(videodict) feed["entries"].append(entrydict) return feed
def parse_date(self, value):
    try:
        return datetime.utcfromtimestamp(mktime_tz(parsedate_tz(value)))
    except (TypeError, OverflowError):
        raise RuntimeError("Received an ill-formed timestamp")
def rfc1123_to_epoch(date_str):
    try:
        date_str = to_unicode(date_str, encoding='ascii')
        return mktime_tz(parsedate_tz(date_str))
    except Exception:
        return None
""" Copyright 2011 by Brian C. Lane """ import sys import email raw_msg = sys.stdin.read() msg = email.message_from_string(raw_msg) date = msg.get('Date', None) if date: from email.utils import mktime_tz, parsedate_tz, formatdate try: # Convert to local TZ tz_tuple = parsedate_tz(date) epoch_time = mktime_tz(tz_tuple) msg.add_header('X-Date', formatdate( epoch_time, localtime=True )) from cStringIO import StringIO from email.generator import Generator fp = StringIO() g = Generator(fp, mangle_from_=False, maxheaderlen=200) g.flatten(msg) sys.stdout.write(fp.getvalue()) except: import traceback print traceback.format_exc() sys.stdout.write(raw_msg) else: # just write it out sys.stdout.write(raw_msg)
def fetch(self, url): """Attempts to fetch the URL requested which should refer to a robots.txt file, e.g. http://example.com/robots.txt. """ # ISO-8859-1 is the default encoding for text files per the specs for # HTTP 1.0 (RFC 1945 sec 3.6.1) and HTTP 1.1 (RFC 2616 sec 3.7.1). # ref: http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1 encoding = "iso-8859-1" content = "" expires_header = None content_type_header = None self._response_code = None self._source_url = url if self.user_agent: req = urllib_request.Request(url, None, {'User-Agent': self.user_agent}) else: req = urllib_request.Request(url) try: f = urllib_request.urlopen(req) content = f.read(MAX_FILESIZE) # As of Python 2.5, f.info() looks like it returns the HTTPMessage # object created during the connection. expires_header = f.info().get("expires") content_type_header = f.info().get("Content-Type") # As of Python 2.4, this file-like object reports the response # code, too. if hasattr(f, "code"): self._response_code = f.code else: self._response_code = 200 f.close() except urllib_error.URLError: # This is a slightly convoluted way to get the error instance, # but it works under Python 2 & 3. error_instance = sys.exc_info() if len(error_instance) > 1: error_instance = error_instance[1] if hasattr(error_instance, "code"): self._response_code = error_instance.code # MK1996 section 3.4 says, "...robots should take note of Expires # header set by the origin server. If no cache-control directives # are present robots should default to an expiry of 7 days". # This code is lazy and looks at the Expires header but not # Cache-Control directives. self.expiration_date = None if self._response_code >= 200 and self._response_code < 300: # All's well. if expires_header: self.expiration_date = email_utils.parsedate_tz(expires_header) if self.expiration_date: # About time zones -- the call to parsedate_tz() returns a # 10-tuple with the time zone offset in the 10th element. # There are 3 valid formats for HTTP dates, and one of # them doesn't contain time zone information. (UTC is # implied since all HTTP header dates are UTC.) When given # a date that lacks time zone information, parsedate_tz() # returns None in the 10th element. mktime_tz() interprets # None in the 10th (time zone) element to mean that the # date is *local* time, not UTC. # Therefore, if the HTTP timestamp lacks time zone info # and I run that timestamp through parsedate_tz() and pass # it directly to mktime_tz(), I'll get back a local # timestamp which isn't what I want. To fix this, I simply # convert a time zone of None to zero. It's much more # difficult to explain than to fix. =) # ref: http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3.1 if self.expiration_date[9] == None: self.expiration_date = self.expiration_date[:9] + (0, ) self.expiration_date = email_utils.mktime_tz( self.expiration_date) if self.use_local_time: # I have to do a little more converting to get this # UTC timestamp into localtime. self.expiration_date = time.mktime( time.gmtime(self.expiration_date)) #else: # The expires header was garbage. if not self.expiration_date: self.expiration_date = self._now() + SEVEN_DAYS if (self._response_code >= 200) and (self._response_code < 300): # All's well. media_type, encoding = _parse_content_type_header( content_type_header) # RFC 2616 sec 3.7.1 -- # When no explicit charset parameter is provided by the sender, # media subtypes of the "text" type are defined to have a default # charset value of "ISO-8859-1" when received via HTTP. 
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1 if not encoding: encoding = "iso-8859-1" elif self._response_code in (401, 403): # 401 or 403 ==> Go away or I will taunt you a second time! # (according to MK1996) content = "User-agent: *\nDisallow: /\n" elif self._response_code == 404: # No robots.txt ==> everyone's welcome content = "" else: # Uh-oh. I punt this up to the caller. #raise urllib_error.URLError, self._response_code, None raise if ((PY_MAJOR_VERSION == 2) and isinstance(content, str)) or \ ((PY_MAJOR_VERSION > 2) and (not isinstance(content, str))): # This ain't Unicode yet! It needs to be. # Unicode decoding errors are another point of failure that I punt # up to the caller. try: content = content.decode(encoding) except UnicodeError: _raise_error( UnicodeError, "Robots.txt contents are not in the encoding expected (%s)." % encoding) except (LookupError, ValueError): # LookupError ==> Python doesn't have a decoder for that encoding. # One can also get a ValueError here if the encoding starts with # a dot (ASCII 0x2e). See Python bug 1446043 for details. This # bug was supposedly fixed in Python 2.5. _raise_error( UnicodeError, "I don't understand the encoding \"%s\"." % encoding) # Now that I've fetched the content and turned it into Unicode, I # can parse it. self.parse(content)
def parse(self, m, prefix=None): """Parse messages sent by the 'buildbot-cvs-mail' program. """ # The mail is sent from the person doing the checkin. Assume that the # local username is enough to identify them (this assumes a one-server # cvs-over-rsh environment rather than the server-dirs-shared-over-NFS # model) _, addr = parseaddr(m["from"]) if not addr: # no From means this message isn't from buildbot-cvs-mail return None at = addr.find("@") if at == -1: author = addr # might still be useful else: author = addr[:at] author = util.bytes2unicode(author, encoding="ascii") # CVS accepts RFC822 dates. buildbot-cvs-mail adds the date as # part of the mail header, so use that. # This assumes cvs is being access via ssh or pserver, so the time # will be the CVS server's time. # calculate a "revision" based on that timestamp, or the current time # if we're unable to parse the date. log.msg('Processing CVS mail') dateTuple = parsedate_tz(m["date"]) if dateTuple is None: when = util.now() else: when = mktime_tz(dateTuple) theTime = datetime.datetime.utcfromtimestamp(float(when)) rev = theTime.strftime('%Y-%m-%d %H:%M:%S') catRE = re.compile(r'^Category:\s*(\S.*)') cvsRE = re.compile(r'^CVSROOT:\s*(\S.*)') cvsmodeRE = re.compile(r'^Cvsmode:\s*(\S.*)') filesRE = re.compile(r'^Files:\s*(\S.*)') modRE = re.compile(r'^Module:\s*(\S.*)') pathRE = re.compile(r'^Path:\s*(\S.*)') projRE = re.compile(r'^Project:\s*(\S.*)') singleFileRE = re.compile(r'(.*) (NONE|\d(\.|\d)+) (NONE|\d(\.|\d)+)') tagRE = re.compile(r'^\s+Tag:\s*(\S.*)') updateRE = re.compile(r'^Update of:\s*(\S.*)') comments = "" branch = None cvsroot = None fileList = None files = [] isdir = 0 path = None project = None lines = list(body_line_iterator(m)) while lines: line = lines.pop(0) m = catRE.match(line) if m: category = m.group(1) continue m = cvsRE.match(line) if m: cvsroot = m.group(1) continue m = cvsmodeRE.match(line) if m: cvsmode = m.group(1) continue m = filesRE.match(line) if m: fileList = m.group(1) continue m = modRE.match(line) if m: # We don't actually use this # module = m.group(1) continue m = pathRE.match(line) if m: path = m.group(1) continue m = projRE.match(line) if m: project = m.group(1) continue m = tagRE.match(line) if m: branch = m.group(1) continue m = updateRE.match(line) if m: # We don't actually use this # updateof = m.group(1) continue if line == "Log Message:\n": break # CVS 1.11 lists files as: # repo/path file,old-version,new-version file2,old-version,new-version # Version 1.12 lists files as: # file1 old-version new-version file2 old-version new-version # # files consists of tuples of 'file-name old-version new-version' # The versions are either dotted-decimal version numbers, ie 1.1 # or NONE. New files are of the form 'NONE NUMBER', while removed # files are 'NUMBER NONE'. 'NONE' is a literal string # Parsing this instead of files list in 'Added File:' etc # makes it possible to handle files with embedded spaces, though # it could fail if the filename was 'bad 1.1 1.2' # For cvs version 1.11, we expect # my_module new_file.c,NONE,1.1 # my_module removed.txt,1.2,NONE # my_module modified_file.c,1.1,1.2 # While cvs version 1.12 gives us # new_file.c NONE 1.1 # removed.txt 1.2 NONE # modified_file.c 1.1,1.2 if fileList is None: log.msg('CVSMaildirSource Mail with no files. Ignoring') return None # We don't have any files. Email not from CVS if cvsmode == '1.11': # Please, no repo paths with spaces! 
m = re.search('([^ ]*) ', fileList) if m: path = m.group(1) else: log.msg( 'CVSMaildirSource can\'t get path from file list. Ignoring mail' ) return None fileList = fileList[len(path):].strip() singleFileRE = re.compile( r'(.+?),(NONE|(?:\d+\.(?:\d+\.\d+\.)*\d+)),(NONE|(?:\d+\.(?:\d+\.\d+\.)*\d+))(?: |$)') # noqa pylint: disable=line-too-long elif cvsmode == '1.12': singleFileRE = re.compile( r'(.+?) (NONE|(?:\d+\.(?:\d+\.\d+\.)*\d+)) (NONE|(?:\d+\.(?:\d+\.\d+\.)*\d+))(?: |$)') # noqa pylint: disable=line-too-long if path is None: raise ValueError( 'CVSMaildirSource cvs 1.12 require path. Check cvs loginfo config' ) else: raise ValueError(f'Expected cvsmode 1.11 or 1.12. got: {cvsmode}') log.msg(f"CVSMaildirSource processing filelist: {fileList}") while fileList: m = singleFileRE.match(fileList) if m: curFile = path + '/' + m.group(1) files.append(curFile) fileList = fileList[m.end():] else: log.msg('CVSMaildirSource no files matched regex. Ignoring') return None # bail - we couldn't parse the files that changed # Now get comments while lines: line = lines.pop(0) comments += line comments = comments.rstrip() + "\n" if comments == '\n': comments = None return ('cvs', dict(author=author, committer=None, files=files, comments=comments, isdir=isdir, when=when, branch=branch, revision=rev, category=category, repository=cvsroot, project=project, properties=self.properties))
def rfc1123_to_epoch(date_str):
    try:
        return mktime_tz(parsedate_tz(date_str))
    except Exception:
        return None
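# Hedged usage sketch for the helper above, using a typical HTTP Last-Modified
# value; malformed input falls through the except branch and yields None.
assert rfc1123_to_epoch('Wed, 21 Oct 2015 07:28:00 GMT') == 1445412480
assert rfc1123_to_epoch('this is not a date') is None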
def parse_date_str(self, date_string):
    return datetime.datetime.fromtimestamp(
        mktime_tz(parsedate_tz(date_string)))
def handle(self, *args, **options): if not args[0]: raise CommandError('Need xml file') if len(args) > 1: if args[1] == 'delete': print('Flushing all posts ... ', end='') models.Post.objects.all().delete() print('Done!') xmldoc = minidom.parse(args[0]) authors = xmldoc.getElementsByTagName('wp:author') for author in authors: username = author.getElementsByTagName( 'wp:author_login')[0].childNodes[0].nodeValue email = author.getElementsByTagName( 'wp:author_email')[0].childNodes[0].nodeValue first_name_nodes = author.getElementsByTagName( 'wp:author_first_name')[0].childNodes first_name = '' if first_name_nodes: first_name = first_name_nodes[0].nodeValue last_name_nodes = author.getElementsByTagName( 'wp:author_first_name')[0].childNodes last_name = '' if last_name_nodes: last_name = first_name_nodes[0].nodeValue User.objects.get_or_create(username=username, email=email, first_name=first_name, last_name=last_name) posts = xmldoc.getElementsByTagName('item') for field in models.Post._meta.local_fields: if field.name == "created_at": field.auto_now_add = False elif field.name == "updated_at": field.auto_now_add = False field.auto_now = False for post in posts: title = post.getElementsByTagName( 'title')[0].childNodes[0].nodeValue slug_nodes = post.getElementsByTagName( 'wp:post_name')[0].childNodes slug = None if len(slug_nodes): slug = slug_nodes[0].nodeValue pub_date = post.getElementsByTagName( 'pubDate')[0].childNodes[0].nodeValue pub_date = datetime.fromtimestamp(mktime_tz( parsedate_tz(pub_date))) pub_date = timezone.make_aware(pub_date, timezone.get_current_timezone()) creator = post.getElementsByTagName( 'dc:creator')[0].childNodes[0].nodeValue creator = User.objects.get(username=creator) status = post.getElementsByTagName( 'wp:status')[0].childNodes[0].nodeValue published = status == 'publish' content = post.getElementsByTagName( 'content:encoded')[0].childNodes[0].nodeValue print('Adding "{}" ... '.format(title), end='') post = models.Post.objects.create(title=title, slug=slug, published=published, body=content, created_at=pub_date, updated_at=pub_date) if post.published: post.published_at = pub_date post.edited_by.add(creator) print('Done!')
def fromRfc2822(date):
    py_time = mktime_tz(parsedate_tz(str(date)))
    return time.strftime("%d %b %Y, %I:%M %P", time.localtime(py_time))
def conv_time(some_time):
    timestamp = mktime_tz(parsedate_tz(some_time))
    return (datetime(1970, 1, 1) + timedelta(seconds=timestamp)).replace(tzinfo=pytz.UTC)
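# Hedged equivalence note (my sketch, not from the original code): the value from
# mktime_tz() is already UTC-based epoch seconds, so the same aware datetime can
# be built directly with datetime.fromtimestamp() and an explicit timezone; the
# epoch-plus-timedelta form above is mostly a portability habit.
from datetime import datetime, timezone
from email.utils import mktime_tz, parsedate_tz

def conv_time_alt(some_time):
    return datetime.fromtimestamp(mktime_tz(parsedate_tz(some_time)), tz=timezone.utc)

# conv_time('Fri, 21 Nov 1997 09:55:06 -0600') and conv_time_alt(...) denote the
# same instant and compare equal, even though one carries pytz.UTC and the other
# datetime.timezone.utc.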
def main(): logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser( description='AWS S3 website deployment tool') parser.add_argument('-f', '--force', action='store_true', dest='force', help='force upload of all files') parser.add_argument('-n', '--dry-run', action='store_true', dest='dry', help='run without uploading any files') parser.add_argument( 'path', help='the .s3_website.yaml configuration file or directory', default='.', nargs='?') args = parser.parse_args() # Open configuration file conf, base_path = config.load_config_file(args.path) bucket_name = conf['s3_bucket'] cache_rules = conf.get('cache_rules', []) if 's3_reduced_redundancy' in conf.keys(): reduced_redundancy = conf['s3_reduced_redundancy'] else: reduced_redundancy = False logger.info('Connecting to bucket {}...'.format(bucket_name)) conn = S3Connection(calling_format=OrdinaryCallingFormat()) bucket = conn.get_bucket(bucket_name, validate=False) site_dir = os.path.join(base_path, conf['site']) logger.info('Site: {}'.format(site_dir)) processed_keys = set() updated_keys = set() for key in bucket: processed_keys.add(key.key) path = os.path.join(site_dir, key.key) # Delete keys that have been deleted locally if not os.path.isfile(path): logger.info('Deleting {}...'.format(key.key)) if not args.dry: key.delete() updated_keys.add(key.key) continue # Skip keys that have not been updated mtime = int(os.path.getmtime(path)) if not args.force: # Update key metadata if not available. # The bucket list() call that is executed through the bucket # iteration above actually does obtain the last modified date # from the server, but boto currently does not update the key # variables based on that. We need to do an additional get_key() # request to get the field populated. key = bucket.get_key(key.key) key_mtime = mktime_tz(parsedate_tz(key.last_modified)) if mtime <= key_mtime: logger.info('Not modified, skipping {}.'.format(key.key)) continue upload_key(key, path, cache_rules, args.dry, replace=True, reduced_redundancy=reduced_redundancy) updated_keys.add(key.key) for dirpath, dirnames, filenames in os.walk(site_dir): key_base = os.path.relpath(dirpath, site_dir) for name in filenames: path = os.path.join(dirpath, name) key_name = key_name_from_path(os.path.join(key_base, name)) if key_name in processed_keys: continue # Create new key key = Key(bucket) key.key = key_name logger.info('Creating key {}...'.format(key_name)) upload_key(key, path, cache_rules, args.dry, replace=False, reduced_redundancy=reduced_redundancy) updated_keys.add(key_name) logger.info('Bucket update done.') # Invalidate files in cloudfront distribution if 'cloudfront_distribution_id' in conf: logger.info('Connecting to Cloudfront distribution {}...'.format( conf['cloudfront_distribution_id'])) index_pattern = None if 'index_document' in conf: index_doc = conf['index_document'] index_pattern = r'(^(?:.*/)?)' + re.escape(index_doc) + '$' def path_from_key_name(key_name): if index_pattern is not None: m = re.match(index_pattern, key_name) if m: return m.group(1) return key_name t = PrefixCoverTree() for key_name in updated_keys: t.include(path_from_key_name(key_name)) for key_name in processed_keys - updated_keys: t.exclude(path_from_key_name(key_name)) paths = [] for prefix, exact in t.matches(): path = '/' + prefix + ('' if exact else '*') logger.info('Preparing to invalidate {}...'.format(path)) paths.append(path) conn = boto.connect_cloudfront() if len(paths) > 0: dist_id = conf['cloudfront_distribution_id'] if not args.dry: 
logger.info('Creating invalidation request...') conn.create_invalidation_request(dist_id, paths) else: logger.info('Nothing updated, skipping invalidation...') logger.info('Cloudfront invalidation done.')
def make_call(self, path, body=None, delete=False): """ Make a single UMAPI call with error handling and retry on temporary failure. :param path: the string endpoint path for the call :param body: (optional) list of dictionaries to be serialized into the request body :return: the requests.result object (on 200 response), raise error otherwise """ if body: request_body = json.dumps(body) def call(): return self.session.post(self.endpoint + path, auth=self.auth, data=request_body, timeout=self.timeout, verify=self.ssl_verify) else: if not delete: def call(): return self.session.get(self.endpoint + path, auth=self.auth, timeout=self.timeout, verify=self.ssl_verify) else: def call(): return self.session.delete(self.endpoint + path, auth=self.auth, timeout=self.timeout, verify=self.ssl_verify) start_time = time() result = None for num_attempts in range(1, self.retry_max_attempts + 1): try: result = call() if result.status_code in [200, 201, 204]: return result elif result.status_code in [429, 502, 503, 504]: if self.logger: self.logger.warning( "UMAPI timeout...service unavailable (code %d on try %d)", result.status_code, num_attempts) retry_wait = 0 if "Retry-After" in result.headers: advice = result.headers["Retry-After"] advised_time = parsedate_tz(advice) if advised_time is not None: # header contains date retry_wait = int(mktime_tz(advised_time) - time()) else: # header contains delta seconds retry_wait = int(advice) if retry_wait <= 0: # use exponential back-off with random delay delay = randint(0, self.retry_random_delay) retry_wait = (int(pow(2, num_attempts - 1)) * self.retry_first_delay) + delay elif 201 <= result.status_code < 400: raise ClientError( "Unexpected HTTP Status {:d}: {}".format( result.status_code, result.text), result) elif 400 <= result.status_code < 500: raise RequestError(result) else: raise ServerError(result) except requests.Timeout: if self.logger: self.logger.warning( "UMAPI connection timeout...(%d seconds on try %d)", self.timeout, num_attempts) retry_wait = 0 result = None if num_attempts < self.retry_max_attempts: if retry_wait > 0: if self.logger: self.logger.warning("Next retry in %d seconds...", retry_wait) sleep(retry_wait) else: if self.logger: self.logger.warning("Immediate retry...") total_time = int(time() - start_time) if self.logger: self.logger.error( "UMAPI timeout...giving up after %d attempts (%d seconds).", self.retry_max_attempts, total_time) raise UnavailableError(self.retry_max_attempts, total_time, result)
message_id = parsed.headers.get('Message-ID')
subject = parsed.headers.get('Subject').strip('Re: ')
sender = parsed.headers.get('Sender')
delivered_to = parsed.headers.get('Delivered-To')

_to = parsed.headers.get('To')
to_addr = parse_email_address_list(_to)[0][1]
_from = parsed.headers.get('From')
from_addr = parse_email_address_list(_from)[0][1]

date = parsed.headers.get('Date')
parsed_date = parsedate_tz(date)
timestamp = mktime_tz(parsed_date)
received_date = datetime.fromtimestamp(timestamp)

# We have to hard-code these values unfortunately
msg_id = 2
thread_id = 2

mailing_list_headers = {
    "List-Id": "<golang-nuts.googlegroups.com>",
    "List-Post": "<http://groups.google.com/group/golang-nuts/post>, <mailto:[email protected]>",
    "List-Owner": None,
    "List-Subscribe": "<http://groups.google.com/group/golang-nuts/subscribe>, <mailto:[email protected]>",
    "List-Unsubscribe": "<http://groups.google.com/group/golang-nuts/subscribe>, <mailto:[email protected]>",
    "List-Archive": "<http://groups.google.com/group/golang-nuts>",
    "List-Help": "<http://groups.google.com/support/>, <mailto:[email protected]>"
}
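One caveat in the subject handling above: str.strip('Re: ') removes any of the characters 'R', 'e', ':' and ' ' from both ends of the string, not the literal "Re: " prefix (for example, 'Release notes' would lose its leading 'Re'). If removing a single reply prefix is what is intended, a prefix-aware variant could look like this (hypothetical helper, not part of the original snippet):

def strip_reply_prefix(subject):
    # Remove one leading "Re: " only, leave everything else untouched.
    return subject[4:] if subject.startswith('Re: ') else subject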
def genESDoc(self, msg, timeStampFromResponse=False):
    httpService = msg.getHttpService()
    doc = DocHTTPRequestResponse(protocol=httpService.getProtocol(),
                                 host=httpService.getHost(),
                                 port=httpService.getPort())
    doc.meta.index = self.confESIndex

    request = msg.getRequest()
    response = msg.getResponse()

    if request:
        iRequest = self.helpers.analyzeRequest(msg)
        doc.request.method = iRequest.getMethod()
        doc.request.url = iRequest.getUrl().toString()

        headers = iRequest.getHeaders()
        for header in headers:
            try:
                doc.add_request_header(header)
            except:
                doc.request.requestline = header

        parameters = iRequest.getParameters()
        for parameter in parameters:
            ptype = parameter.getType()
            if ptype == IParameter.PARAM_URL:
                typename = "url"
            elif ptype == IParameter.PARAM_BODY:
                typename = "body"
            elif ptype == IParameter.PARAM_COOKIE:
                typename = "cookie"
            elif ptype == IParameter.PARAM_XML:
                typename = "xml"
            elif ptype == IParameter.PARAM_XML_ATTR:
                typename = "xmlattr"
            elif ptype == IParameter.PARAM_MULTIPART_ATTR:
                typename = "multipartattr"
            elif ptype == IParameter.PARAM_JSON:
                typename = "json"
            else:
                typename = "unknown"

            name = parameter.getName()
            value = parameter.getValue()
            doc.add_request_parameter(typename, name, value)

        ctype = iRequest.getContentType()
        if ctype == IRequestInfo.CONTENT_TYPE_NONE:
            doc.request.content_type = "none"
        elif ctype == IRequestInfo.CONTENT_TYPE_URL_ENCODED:
            doc.request.content_type = "urlencoded"
        elif ctype == IRequestInfo.CONTENT_TYPE_MULTIPART:
            doc.request.content_type = "multipart"
        elif ctype == IRequestInfo.CONTENT_TYPE_XML:
            doc.request.content_type = "xml"
        elif ctype == IRequestInfo.CONTENT_TYPE_JSON:
            doc.request.content_type = "json"
        elif ctype == IRequestInfo.CONTENT_TYPE_AMF:
            doc.request.content_type = "amf"
        else:
            doc.request.content_type = "unknown"

        bodyOffset = iRequest.getBodyOffset()
        doc.request.body = request[bodyOffset:].tostring().decode("ascii", "replace")

    if response:
        iResponse = self.helpers.analyzeResponse(response)

        doc.response.status = iResponse.getStatusCode()
        doc.response.content_type = iResponse.getStatedMimeType()
        doc.response.inferred_content_type = iResponse.getInferredMimeType()

        headers = iResponse.getHeaders()
        dateHeader = None
        for header in headers:
            try:
                doc.add_response_header(header)
                match = reDateHeader.match(header)
                if match:
                    dateHeader = match.group(1)
            except:
                doc.response.responseline = header

        cookies = iResponse.getCookies()
        for cookie in cookies:
            expCookie = cookie.getExpiration()
            expiration = None
            if expCookie:
                try:
                    expiration = str(datetime.fromtimestamp(expCookie.time / 1000))
                except:
                    pass
            doc.add_response_cookie(cookie.getName(), cookie.getValue(),
                                    cookie.getDomain(), cookie.getPath(), expiration)

        bodyOffset = iResponse.getBodyOffset()
        doc.response.body = response[bodyOffset:].tostring().decode("ascii", "replace")

        if timeStampFromResponse:
            if dateHeader:
                try:
                    # try to use date from response header "Date"
                    doc.timestamp = datetime.fromtimestamp(mktime_tz(parsedate_tz(dateHeader)), tz)
                    self.lastTimestamp = doc.timestamp
                except:
                    # fallback: last stored timestamp. Else: now
                    doc.timestamp = self.lastTimestamp

    return doc
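The method above relies on two names defined elsewhere in the extension: reDateHeader (a regex that captures the value of the Date response header) and tz (a tzinfo used to make the timestamp aware). A plausible, assumed setup is sketched below; the actual extension may define these differently.

import re
import pytz

# Assumed definitions, for illustration only
reDateHeader = re.compile(r"^Date:\s*(.*)$", re.IGNORECASE)
tz = pytz.utc   # any tzinfo object works with datetime.fromtimestamp(ts, tz)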
def parse_datetime(value):
    time_tuple = parsedate_tz(value)
    timestamp = mktime_tz(time_tuple)
    return datetime.datetime.fromtimestamp(timestamp)
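Worth noting about parse_datetime(): datetime.datetime.fromtimestamp() applies the local UTC offset, so the result is a naive local-time datetime. If a UTC value is wanted, utcfromtimestamp() (as in rfc2822_to_datetime further below) avoids that conversion. A quick usage sketch:

dt = parse_datetime('Fri, 21 Nov 1997 09:55:06 -0600')
print(dt)   # on a machine whose local timezone is UTC: 1997-11-21 15:55:06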
def _onsuccess(boto_key):
    checksum = boto_key.etag.strip('"')
    last_modified = boto_key.last_modified
    modified_tuple = parsedate_tz(last_modified)
    modified_stamp = int(mktime_tz(modified_tuple))
    return {'checksum': checksum, 'last_modified': modified_stamp}
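Because last_modified is converted to plain integer seconds since the epoch, freshness checks against the returned dict reduce to arithmetic. A hypothetical usage (stat and max_age_seconds are illustrative names, not part of the snippet above):

import time

def is_stale(stat, max_age_seconds):
    # stat is the dict returned by _onsuccess()
    return time.time() - stat['last_modified'] > max_age_seconds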
def parse_email(self, message_string, existing_email=None):
    """ Creates or replaces an email from a string """
    parsed_email = email.message_from_string(message_string)
    body = None
    error_description = None

    def get_payload(message):
        """ Returns the first text/html body, and falls back to text/plain body """

        def process_part(part, default_charset, text_part, html_part):
            """ Returns the first text/plain body as a unicode object, and the first text/html body """
            if part.is_multipart():
                for part in part.get_payload():
                    charset = part.get_content_charset(default_charset)
                    (text_part, html_part) = process_part(
                        part, charset, text_part, html_part)
            else:
                charset = part.get_content_charset(default_charset)
                decoded_part = part.get_payload(decode=True)
                decoded_part = decoded_part.decode(charset, 'replace')
                if part.get_content_type() == 'text/plain' and text_part is None:
                    text_part = decoded_part
                elif part.get_content_type() == 'text/html' and html_part is None:
                    html_part = decoded_part
            return (text_part, html_part)

        html_part = None
        text_part = None
        default_charset = message.get_charset() or 'ISO-8859-1'
        (text_part, html_part) = process_part(
            message, default_charset, text_part, html_part)
        if html_part:
            return ('text/html',
                    sanitize_html(AbstractMailbox.strip_full_message_quoting_html(html_part)))
        elif text_part:
            return ('text/plain',
                    AbstractMailbox.strip_full_message_quoting_plaintext(text_part))
        else:
            return ('text/plain',
                    u"Sorry, no assembl-supported mime type found in message parts")

    (mimeType, body) = get_payload(parsed_email)

    def email_header_to_unicode(header_string, join_crlf=True):
        decoded_header = decode_email_header(header_string)
        default_charset = 'ASCII'
        text = ''.join([unicode(t[0], t[1] or default_charset)
                        for t in decoded_header])
        if join_crlf:
            text = u''.join(text.split('\r\n'))
        return text

    new_message_id = parsed_email.get('Message-ID', None)
    if new_message_id:
        new_message_id = self.clean_angle_brackets(
            email_header_to_unicode(new_message_id))
    else:
        error_description = "Unable to parse the Message-ID for message string: \n%s" % message_string
        return (None, None, error_description)
    assert new_message_id

    new_in_reply_to = parsed_email.get('In-Reply-To', None)
    if new_in_reply_to:
        new_in_reply_to = self.clean_angle_brackets(
            email_header_to_unicode(new_in_reply_to))

    sender = email_header_to_unicode(parsed_email.get('From'))
    sender_name, sender_email = parseaddr(sender)
    sender_email_account = EmailAccount.get_or_make_profile(
        self.db, sender_email, sender_name)
    creation_date = datetime.utcfromtimestamp(
        mktime_tz(parsedate_tz(parsed_email['Date'])))
    subject = email_header_to_unicode(parsed_email['Subject'], False)
    recipients = email_header_to_unicode(parsed_email['To'])
    body = body.strip()

    # Try/except for a normal situation is an anti-pattern,
    # but sqlalchemy doesn't have a function that returns
    # 0, 1 result or an exception
    try:
        email_object = self.db.query(Email).filter(
            Email.source_post_id == new_message_id,
            Email.discussion_id == self.discussion_id,
            Email.source == self).one()
        if existing_email and existing_email != email_object:
            raise ValueError(
                "The existing object isn't the same as the one found by message id")
        email_object.recipients = recipients
        email_object.sender = sender
        email_object.creation_date = creation_date
        email_object.source_post_id = new_message_id
        email_object.in_reply_to = new_in_reply_to
        email_object.body_mime_type = mimeType
        email_object.imported_blob = message_string
        # TODO MAP: Make this nilpotent.
        email_object.subject = LangString.create(subject)
        email_object.body = LangString.create(body)
    except NoResultFound:
        email_object = Email(discussion=self.discussion,
                             source=self,
                             recipients=recipients,
                             sender=sender,
                             subject=LangString.create(subject),
                             creation_date=creation_date,
                             source_post_id=new_message_id,
                             in_reply_to=new_in_reply_to,
                             body=LangString.create(body),
                             body_mime_type=mimeType,
                             imported_blob=message_string)
    except MultipleResultsFound:
        """To find duplicates (this should no longer happen, but in case it ever does...):

        SELECT * FROM post WHERE id IN (
            SELECT MAX(post.id) AS max_post_id
            FROM imported_post
            JOIN post ON (post.id = imported_post.id)
            GROUP BY message_id, source_id
            HAVING COUNT(post.id) > 1)

        To kill them:

        USE assembl;
        UPDATE post p SET parent_id = (
            SELECT new_post_parent.id AS new_post_parent_id
            FROM post AS post_to_correct
            JOIN post AS bad_post_parent
                ON (post_to_correct.parent_id = bad_post_parent.id)
            JOIN post AS new_post_parent
                ON (new_post_parent.message_id = bad_post_parent.message_id
                    AND new_post_parent.id <> bad_post_parent.id)
            WHERE post_to_correct.parent_id IN (
                SELECT MAX(post.id) AS max_post_id
                FROM imported_post
                JOIN post ON (post.id = imported_post.id)
                GROUP BY message_id, source_id
                HAVING COUNT(post.id) > 1)
            AND p.id = post_to_correct.id
        )

        USE assembl;
        DELETE FROM post WHERE post.id IN (
            SELECT MAX(post.id) AS max_post_id
            FROM imported_post
            JOIN post ON (post.id = imported_post.id)
            GROUP BY message_id, source_id
            HAVING COUNT(post.id) > 1)
        """
        raise MultipleResultsFound("ID %s has duplicates in source %d" % (new_message_id, self.id))

    email_object.creator = sender_email_account.profile
    # email_object = self.db.merge(email_object)
    email_object.guess_languages()
    return (email_object, parsed_email, error_description)
def parseRFC2616Date(s):
    """returns seconds since unix epoch representing UTC from the
    HTTP-compatible time specification s.
    """
    parts = emailutils.parsedate_tz(s)
    return emailutils.mktime_tz(parts)
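parsedate_tz() returns None when it cannot parse the input, and mktime_tz(None) then raises a TypeError, so parseRFC2616Date() will fail on a malformed header value. A slightly defensive variant, for illustration only (not part of the original code):

def parseRFC2616DateSafe(s, default=None):
    parts = emailutils.parsedate_tz(s)
    if parts is None:
        return default
    return emailutils.mktime_tz(parts)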
def create_title_from_libsyn_rss(rss_feed_url):
    """Parses a libsyn-generated RSS feed"""
    if rss_feed_url.startswith('http'):
        feed = urllib.urlopen(rss_feed_url)
        feed_tree = ElementTree.parse(feed).getroot()
        libsyn_slug = re.search('//(.*).podiobooks', rss_feed_url).group(1)
    else:
        # Only unit tests hit this side
        feed_tree = ElementTree.parse(rss_feed_url).getroot()
        libsyn_slug = 'linus'

    if feed_tree is None:
        return None
    feed_tree = feed_tree.find('channel')

    title = Title()
    title.name = feed_tree.find('title').text
    title.slug = slugify(title.name)

    existing_slug_count = Title.objects.all().filter(slug=title.slug).count()
    if existing_slug_count > 0:
        title.slug += "---CHANGEME--" + str(time.time())

    title.old_slug = title.slug
    title.libsyn_slug = libsyn_slug
    title.description = strip_tags(feed_tree.find('description').text).strip()

    if feed_tree.find('{http://www.itunes.com/dtds/podcast-1.0.dtd}explicit').text == 'yes':
        title.is_explicit = True

    title.deleted = True
    title.libsyn_cover_image_url = feed_tree.find('image').find('url').text

    default_license = License.objects.get(slug='by-nc-nd')
    title.license = default_license

    title.save()

    items = feed_tree.findall('item')
    start_date = datetime.datetime.now(timezone.utc)
    for item in items:
        episode = Episode()
        episode.title = title
        episode.name = item.find('title').text
        episode.description = strip_tags(item.find('description').text).strip()
        episode.filesize = item.find('enclosure').get('length')
        episode.url = item.find('enclosure').get('url').replace(
            'traffic.libsyn.com', 'media.podiobooks.com')
        episode.duration = item.find(
            '{http://www.itunes.com/dtds/podcast-1.0.dtd}duration').text
        episode.media_date_created = datetime.datetime.fromtimestamp(
            mktime_tz(parsedate_tz(item.find('pubDate').text)), timezone.utc)
        try:
            # Use URL File Name to Calc Seq
            episode.sequence = int(
                episode.url[episode.url.rfind('.') - 2:episode.url.rfind('.')])
            episode.media_date_created = start_date + datetime.timedelta(10, episode.sequence)
        except ValueError:
            print(episode.url)
            episode.sequence = 0
        episode.save()

    return title
def rfc2822_to_datetime(rfc_date):
    """Converts an RFC 2822 date string to a Python datetime"""
    timestamp = mktime_tz(parsedate_tz(rfc_date))
    raw_dt = datetime.datetime.utcfromtimestamp(timestamp)
    return raw_dt.replace(tzinfo=pytz.utc)
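Usage sketch: because the result carries tzinfo=pytz.utc, it is timezone-aware and can be compared directly with other aware datetimes.

dt = rfc2822_to_datetime('Fri, 21 Nov 1997 09:55:06 -0600')
print(dt.isoformat())   # 1997-11-21T15:55:06+00:00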