def parseDate(date):
    """ Parse a TextMate date (YYYY-MM-DD HH:MM:SS, no time zone) """
    m = _textmate_date_re.match(date)
    if not m:
        return time.mktime(feedparser._parse_date(date))
    isodate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % {
        'year': m.group(1), 'month': m.group(2), 'day': m.group(3),
        'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),
        'zonediff': '+00:00'}
    return time.mktime(feedparser._parse_date(isodate))
def from_xml(self, x):
    AtomItem.from_xml(self, x)
    updated = unicode(x.updated)
    published = unicode(x.published)
    updated_parsed = feedparser._parse_date(updated)
    published_parsed = feedparser._parse_date(published)
    activity_type = unicode(x.category.attrib.get('label'))
    verb = unicode(
        getattr(x, '{%s}verb' % constants.NS.ActivityStreams, None))
    object_tags = list(
        getattr(x, '{%s}object' % constants.NS.ActivityStreams, None))
    objects = []
    for object_tag in object_tags:
        if object_tag is not None:
            object = ActivityObject()
            object.populate(object_tag, InputType.XML)
            objects.append(object)
            source = object.source
        else:
            object = None
            source = None

    # context things
    # TODO: get an example and parse out all the values from a location
    # so we don't pass around lxml objects
    #location = getattr(x, '{%s}location', None)
    mood = getattr(x, '{%s}mood' % constants.NS.ActivityContext, None)
    if mood is None:
        mood_text = mood_icon = None
    else:
        mood_text = unicode(mood)
        mood_icon = unicode(mood.attrib.get('icon'))

    object.mood_text = mood_text
    object.mood_icon = mood_icon

    self.__dict__.update(
        verb=verb,
        source=source,
        objects=[],
        updated=updated,
        published=published,
        updated_parsed=int(calendar.timegm(updated_parsed)) + TIMEZONE_FIX,
        published_parsed=int(calendar.timegm(published_parsed)) + TIMEZONE_FIX,
        activity_type=activity_type,
    )
def parse_date(self, node, feed, ns=''):
    value = unicoder(node.text)
    feed['updated'] = value
    date = feedparser._parse_date(value)
    if self.unix_timestamp and date:
        date = time.mktime(date)
    feed['updated_parsed'] = date
def _filter_entry(self, entry):
    keys = entry.keys()
    for field in ('date_parsed', 'updated_parsed', 'published_parsed'):
        if field in keys:
            del entry[field]
    for field in ('date', 'updated', 'published'):
        if field in keys:
            parsed = _parse_date(entry[field])
            if parsed is None:
                del entry[field]
            else:
                entry[field] = datetime(*parsed[:6])
    if 'date' not in keys and 'published' in keys:
        entry['date'] = entry['published']
    entry['links'] = [l for l in
                      [link.get('href') for link in entry['links']]
                      if l is not None]
    if 'summary' not in entry:
        if 'content' in entry:
            entry['summary'] = entry['content'][0]['value']
    if self.name is not None:
        entry['title'] = u'[%s] %s' % (self.name, entry['title'])
    entry['root_link'] = self.url
    return entry
def str2datetime(string):
    import feedparser
    from datetime import datetime
    try:
        return datetime(*(feedparser._parse_date(string)[:6]))
    except Exception as e:
        logging.debug("Failed to convert %s into datetime" % string)
def get(self, slug):
    logging.debug('PipesHandler.get')
    self.response.headers["Content-Type"] = "text/xml"
    if slug[0] == '/':
        slug = slug[1:]
    if slug:
        logging.debug('slug is ' + slug)
        #req_path = self.request.path
        feed_xml_memcache_key = PIPE_OUTPUT_FEED_XML_MEMCACHE_KEY + '_' + slug
        feed_xml = memcache.get(feed_xml_memcache_key)
        updated_time_str = self.request.get('updated_time')
        updated_time = None
        if updated_time_str:
            updated_time = datetime.datetime.fromtimestamp(
                time.mktime(feedparser._parse_date(updated_time_str)))
        if not feed_xml:
            logging.debug('feed_xml not found in memcache, query entries')
            pipe = get_pipe(slug)
            if pipe:
                try:
                    feed_xml = self.get_feed_xml(pipe, updated_time)
                    memcache.add(feed_xml_memcache_key, feed_xml, 60 * 30)  # cache for thirty minutes
                except Exception, e:
                    logging.exception(e)
                    return self.response.set_status(500)
            else:
                return self.response.set_status(404)
        if feed_xml:
            logging.debug('the len(feed_xml) is %d', len(feed_xml))
            self.response.out.write(feed_xml)
def get_pubdate(entry):
    """Try to determine the real pubDate of a feedparser entry

    This basically takes the updated_parsed value, but also uses some more
    advanced techniques to work around various issues with ugly feeds.

    "published" now also takes precedence over "updated" (with updated used as
    a fallback if published is not set/available). RSS' "pubDate" element is
    "updated", and will only be used if published_parsed is not available.
    """
    pubdate = entry.get('published_parsed', None)

    if pubdate is None:
        pubdate = entry.get('updated_parsed', None)

    if pubdate is None:
        # See http://code.google.com/p/feedparser/issues/detail?id=327
        updated = entry.get('published', entry.get('updated', None))
        if updated is not None:
            # FIXME: This is kludgy. We should write our own date handler
            # and register it with feedparser.registerDateHandler() and/or
            # wait for feedparser to add support for this bogus date format.
            pubdate = feedparser._parse_date(updated.replace(',', ''))

    if pubdate is None:
        # Cannot determine pubdate - party like it's 1970!
        return 0

    return mktime_tz(pubdate + (0,))
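# A minimal sketch of the custom date handler hinted at in the FIXME above, not
# the snippet author's actual implementation: feedparser.registerDateHandler()
# accepts a callable that takes the raw date string and returns a UTC 9-tuple
# or None. This handler simply retries feedparser's own parser after stripping
# the stray comma that trips it up.
import feedparser

def _parse_date_no_comma(date_string):
    # Give up quietly so the other registered handlers get a chance.
    if ',' not in date_string:
        return None
    return feedparser._parse_date(date_string.replace(',', ''))

feedparser.registerDateHandler(_parse_date_no_comma)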
def _check_date(self, func, dtstring, dttuple):
    try:
        tup = func(dtstring)
    except (OverflowError, ValueError):
        tup = None
    self.assertEqual(tup, dttuple)
    self.assertEqual(tup, feedparser._parse_date(dtstring))
def _compute_rss_stats(self, rsslink, fr):
    if fr is not None and not (
            "application/xml" in fr.headers["content-type"]
            or "text/xml" in fr.headers["content-type"]
            or "application/rss+xml" in fr.headers["content-type"]):
        return (None, None)
    try:
        rss = self.wrapper.pq(fr.webpage)
    except (lxml.etree.XMLSyntaxError, lxml.etree.ParserError):
        return (rsslink, 0)

    # Now let's get the most recent and oldest item dates in the stream
    first = last = None
    count = 0
    for entry in rss("item").items():
        count += 1
        date = feedparser._parse_date(entry("pubDate").text())
        if date is not None:
            publication = time.mktime(date)
            if first is None or first < publication:
                first = publication
            if last is None or last > publication:
                last = publication

    # Compute ratio of items per week
    if first is not None and last is not None:
        timedelta = first - last
        if timedelta > 0:
            weekratio = count / (timedelta / (7 * 24 * 60 * 60))
            return (rsslink, weekratio)

    return (rsslink, 0)
def parse_http_datetime(s):
    try:
        if INT_PATTERN.match(s):
            return int(time.time()) + int(s)
        return int(calendar.timegm(feedparser._parse_date(s)))
    except:
        return 0
def _patch_feedparser(self, path='quotes_app.services.feedparser.parse', image=True):
    feed_dict = {
        'title': self.expected_feed_title,
        'description': self.expected_feed_description,
        'link': self.expected_feed_homepage,
        'tags': self.expected_feed_tags
    }

    if image == True:
        feed_dict['image'] = MicroMock(url=self.expected_feed_image_url)

    mock_feedparser_results = MicroMock(
        feed=MicroMock(**feed_dict),
        entries=[MicroMock(
            title="Why is yoda so old?",
            publication_date="Thu, 04 Aug 2005 17:02:29 -0400",
            description="Lets find out why yoda won't die quickly.",
            link="http://starwars.fke/ep/40",
            guid="http://starwars.fke/ep/40",
            published_parsed=feedparser._parse_date("Thu, 04 Aug 2005 17:02:29 -0400")
        )]
    )

    patcher = patch(path, return_value=mock_feedparser_results)
    self.parse_spy = patcher.start()
    self.addCleanup(patcher.stop)
def makeSolrDate(self, datestr):
    """ Solr is very particular about the date format it can handle """
    d = feedparser._parse_date(datestr)
    date = datetime.datetime(d.tm_year, d.tm_mon, d.tm_mday,
                             d.tm_hour, d.tm_min, d.tm_sec)
    return date.isoformat() + 'Z'
def date(self):
    #TODO use http://labix.org/python-dateutil instead of feedparser
    d = self._get("DC.date", "date")
    if d:
        #TODO ??? timezone
        timetuple = feedparser._parse_date(d)
        # only year..second; slicing further would pass tm_wday as microseconds
        return datetime.datetime(*timetuple[:6])
    return None
def _end_updated(self):
    value = self.pop('updated')
    parsed_value = feedparser._parse_date(value)
    overwrite = ('youtube.com' not in self.baseuri)
    try:
        self._save('updated_parsed', parsed_value, overwrite=overwrite)
    except TypeError, te:
        logger.warn('Your feedparser version is too old: %s', te)
def parseDate(date):
    """ Parse a TextMate date (YYYY-MM-DD HH:MM:SS, no time zone, assume it's always localtime) """
    m = _textmate_date_re.match(date)
    if not m:
        return time.mktime(feedparser._parse_date(date))
    return time.mktime(time.localtime(calendar.timegm(time.gmtime(
        time.mktime(time.strptime(date, "%Y-%m-%d %H:%M:%S"))))))
def testdate(self):
    dates0 = '2010/07/07 14:40:24 +0800'
    #print feedparser._parse_date(dates0)
    #dates1 = '2010-07-07T14:49:24 +0800'
    #print feedparser._parse_date(dates1)
    date = feedparser._parse_date(dates0)
    t = datetime.datetime.fromtimestamp(time.mktime(date))
    print t.strftime("%Y-%m-%dT%H:%M:%SZ"), time.asctime(date)
def fetch_channel(self, channel):
    etag = channel.http_etag
    modified = feedparser._parse_date(channel.http_last_modified)
    # If we have a username or password, rebuild the url with them included
    # Note: using a HTTPBasicAuthHandler would be pain because we need to
    # know the realm. It can be done, but I think this method works, too
    url = channel.authenticate_url(channel.url)
    for handler in self.custom_handlers:
        custom_feed = handler.handle_url(url)
        if custom_feed is not None:
            return feedcore.Result(feedcore.CUSTOM_FEED, custom_feed)
    return self.fetch(url, etag, modified)
def parse_date(date):
    """Parse a TextMate date (YYYY-MM-DD HH:MM:SS, no time zone,
    assume it's always localtime)"""
    m = _textmate_date_re.match(date)
    try:
        from feedparser import _parse_date
        if not m:
            return time.mktime(_parse_date(date))
    except:
        pass
    return time.mktime(time.localtime(calendar.timegm(time.gmtime(
        time.mktime(time.strptime(date, '%Y-%m-%d %H:%M:%S'))))))
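# The TextMate helpers above all rely on a module-level _textmate_date_re that
# these snippets do not show. A minimal sketch of what it presumably looks like,
# with six capture groups (year, month, day, hour, minute, second) to match the
# m.group(1)..m.group(6) usage in the first variant:
import re

_textmate_date_re = re.compile(
    r'^(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})$')

# parse_date('2010-07-07 14:40:24') would then return a local Unix timestamp,
# while anything the regex rejects falls through to feedparser._parse_date().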
def get_tornado_warnings(feed):
    """Get a list of the current tornado warnings in effect"""
    state_list = ['AL', 'AK', 'AS', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC',
                  'FM', 'FL', 'GA', 'GU', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS',
                  'KY', 'LA', 'ME', 'MH', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO',
                  'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'MP',
                  'OH', 'OK', 'OR', 'PW', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN',
                  'TX', 'UT', 'VT', 'VA', 'VI', 'WA', 'WV', 'WI', 'WY']

    print_debug('Parsing entries')

    # Trim down the list of alerts to actual tornado warnings
    tornado_alerts = [
        x for x in feed.entries
        if re.search('tornado', x['cap_event'], re.IGNORECASE)
        and x['cap_status'] == 'Actual'
    ]

    for entry in tornado_alerts:
        affected_counties = entry['cap_areadesc'].split('; ')
        if not affected_counties:
            print_debug('Counties list empty:\n' + entry)
            debug_mail('Counties empty', entry)

        affected_state = re.search('\?x=(..)', entry['id']).group(1)
        if affected_state not in state_list:
            print_debug('State not found:\n' + entry)
            debug_mail('State not found', entry)
            continue

        if entry.cap_event == 'Tornado Watch':
            alert_type = 'watch'
        elif entry.cap_event == 'Tornado Warning':
            alert_type = 'warning'
        else:
            alert_type = 'Unknown'

        starttime = time.mktime(feedparser._parse_date(entry['cap_effective'])) - time.timezone
        endtime = time.mktime(feedparser._parse_date(entry['cap_expires'])) - time.timezone

        for affected_county in affected_counties:
            yield affected_county, affected_state, starttime, endtime, alert_type
def _start_newznab_attr(self, attrsD):
    context = self._getContext()

    # Add the dict
    if "newznab" not in context:
        context["newznab"] = {}

    # Don't crash when it fails
    try:
        # Add keys
        context["newznab"][attrsD["name"]] = attrsD["value"]
        # Try to get date-object
        if attrsD["name"] == "usenetdate":
            context["newznab"][attrsD["name"] + "_parsed"] = feedparser._parse_date(attrsD["value"])
    except KeyError:
        pass
def run(self, saved_state):
    feed = feedparser.parse(self.url)
    artifacts = []
    for item in list(reversed(feed['items'])):
        # Only new items.
        published_parsed = item.get('published_parsed') or item.get('updated_parsed')
        if published_parsed and published_parsed <= feedparser._parse_date(saved_state or '0001-01-01'):
            continue

        try:
            soup = bs4.BeautifulSoup(item['content'][0]['value'], 'html.parser')
        except KeyError:
            try:
                soup = bs4.BeautifulSoup(item['summary'], 'html.parser')
            except KeyError:
                # Can't find any feed content, just skip this entry.
                continue

        # do some preprocessing to remove common obfuscation methods
        [x.unwrap() for x in soup.find_all('strong')]
        [x.unwrap() for x in soup.find_all('b')]
        [x.unwrap() for x in soup.find_all('em')]
        [x.unwrap() for x in soup.find_all('i')]
        soup = bs4.BeautifulSoup(soup.decode(), 'html.parser')

        text = ''
        if self.feed_type == 'afterioc':
            text = soup.get_text(separator=' ').split(AFTERIOC)[-1]
            artifacts += self.process_element(text, item.get('link') or self.url,
                                              include_nonobfuscated=True)
        elif self.feed_type == 'clean':
            text = soup.get_text(separator=' ')
            artifacts += self.process_element(text, item.get('link') or self.url,
                                              include_nonobfuscated=True)
        else:
            # Default: self.feed_type == 'messy'.
            text = soup.get_text(separator=' ')
            artifacts += self.process_element(text, item.get('link') or self.url)

        saved_state = item.get('published') or item.get('updated')

    return saved_state, artifacts
def parse_by_url(url):
    try:
        objs = []
        data = feedparser.parse(url)
        for item in data['entries']:
            objs.append({
                'title': strip_tags(item['title']),
                'published': feedparser._parse_date(item['published']),
                'link': item['link']
            })
        objs.sort(key=lambda x: x['published'])
        objs = objs[::-1]
    except:  # anything can happen here
        objs = []
    return objs
def to_datetime(t, tzinfo=None):
    if not t:
        return None
    if isinstance(t, str):
        t = datetime.datetime(*feedparser._parse_date(t)[:6], tzinfo=UTC)
    tz = tzinfo or LocalTimezone()
    if isinstance(t, (tuple, time.struct_time)):
        t = datetime.datetime(*t[:6], tzinfo=tz)
    if isinstance(t, (int, float)):
        t = datetime.datetime.fromtimestamp(t, tz=tz)
    if not isinstance(t, datetime.datetime):
        raise ValueError(repr(t))
    if not t.tzinfo:
        t = datetime.datetime(*t.timetuple()[:6], tzinfo=tz)
    return t
def __init__(self, data):
    self.userid = data.author.id
    self.text = data.body
    self.commentId = data.commentId
    if getattr(data, "postedDate", None) is not None:
        self.postedDate = data.postedDate
        self.postedDate_parsed = int(calendar.timegm(feedparser._parse_date(self.postedDate)))
    else:
        self.postedDate_parsed = data.postedDate_parsed
    try:
        int(self.userid)
    except ValueError:
        pass
    else:
        self.userid = "myspace.com.person.%s" % self.userid
def _new_with_headers_expires(self, url, stale_feed):
    """
    >>> from datetime import datetime, timedelta
    >>> now = datetime.utcnow()
    >>> stale = {'headers': {'expires': 'Wed, 18 Aug 2010 06:06:11 GMT'}}
    >>> url = 'http://feedparser.org/docs/examples/atom10.xml'
    >>> res = _new_with_headers_expires(url, stale)
    >>> res.status
    200
    >>> stale = {'headers': {
    ...     'expires': datetime.isoformat(now + timedelta(minutes=30))}}
    >>> res2 = _new_with_headers_expires(url, stale)
    >>> res == res2
    False
    """
    exp = fp._parse_date(stale_feed['headers']['expires'])
    if time.time() > exp:
        return retrieve_feed(url)
def relatize(value):
    """
    Returns the relative time of each request. Another feature stolen from
    github.

    How it works:
        get the date from value - use _parse_date from feedparser
        get current utc time.
        compare current utc time and output relative time
    """
    date_struct = _parse_date(value)[0:6]
    the_date = datetime(*date_struct)
    now = datetime.utcnow()
    if time.daylight:
        now = now + timedelta(hours=1)
    time_difference = now - the_date

    if time_difference.days < 0:
        return 'sometime in the near future'  # just in case the time screws up
    if time_difference.days > 356:
        return 'about %d years ago' % (time_difference.days / 356)
    elif time_difference.days > 60:
        return 'about %d months ago' % (time_difference.days / 30)
    elif time_difference.days > 30:
        return 'about a month ago'
    elif time_difference.days > 1:
        return 'about %d days ago' % time_difference.days
    elif time_difference.days > 0:
        return 'about a day ago'
    elif time_difference.seconds > 7200:
        return 'about %d hours ago' % (time_difference.seconds / 3600)
    elif time_difference.seconds > 3600:
        return 'about an hour ago'
    elif time_difference.seconds > 120:
        return 'about %d minutes ago' % (time_difference.seconds / 60)
    elif time_difference.seconds > 60:
        return 'about a minute ago'
    elif time_difference.seconds < 60 or time_difference.days < 1:
        return 'just now'
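# Usage sketch for relatize() above: it accepts any date string that
# feedparser's _parse_date() understands and buckets the difference from the
# current UTC time into a GitHub-style phrase. The exact wording depends on the
# clock at call time; a 2010 timestamp, for instance, lands in the years bucket.
example = relatize('Wed, 18 Aug 2010 06:06:11 GMT')   # e.g. 'about N years ago'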
def __real_check_now_cb(self):
    rss_file = os.path.join(os.environ["HOME"], ".movistar_desktop/", "rss.xml")
    if not os.path.exists(rss_file):
        return True

    d = feedparser.parse(rss_file)
    os.system("rm %s" % rss_file)

    if (len(d['entries']) < 1):
        print _(u"No entries in RSS")
        return True

    # Check date with saved feed
    new_feed_date = md5.new(d.entries[0].date).hexdigest()
    saved_feed_date = md5.new(self.conf.get_updater_feed_date()).hexdigest()
    release_date_parsed = feedparser._parse_date(self.conf.get_release_date())

    # Debug
    print "RSS-----------------"
    print d.entries[0]
    print "--------------------"
    print "new_feed_date %s" % new_feed_date
    print "saved_feed_date %s" % saved_feed_date
    print "date: %s" % d.entries[0].updated_parsed
    print "release: %s" % release_date_parsed
    # End debug

    if ((new_feed_date != saved_feed_date) and
            (release_date_parsed < d.entries[0].updated_parsed)):
        self.uw_dialog.set_title(d.entries[0].title)
        self.uw_label.set_text(d.entries[0].description)
        self.uw_dialog.show_all()
        result = self.uw_dialog.run()
        if (result == gtk.RESPONSE_OK):
            os.system("gnome-open %s" % d.entries[0].link)
            self.conf.set_updater_feed_date(d.entries[0].date)
            self.conf.save_conf()
        elif (result == gtk.RESPONSE_NO):
            self.conf.set_updater_feed_date(d.entries[0].date)
            self.conf.save_conf()
        self.uw_dialog.hide()
    return True
def rdfaparse(content):
    resources = []
    triples = rdfascrape.rdfascrape(content)
    for count, (s, p, o, dt) in enumerate(triples):
        obj = {}
        obj['label'] = '_' + str(count)
        obj['id'] = '_' + str(count)
        pred = p.split('/')[-1].split('#')[-1]
        if pred == u'dc:date' or dt in [u'xsd:date', u'xs:date',
                                        u'http://www.w3.org/2001/XMLSchema' + u'date']:
            # feedparser's internal date parser robustly handles different
            # time formats and returns a 9-tuple
            import feedparser
            normalizeddate = feedparser._parse_date(o)
            obj[pred] = time.strftime("%Y-%m-%dT%H:%M:%S", normalizeddate)
            obj[pred + u'localized'] = time.strftime("%a, %d %b %Y %H:%M:%S", normalizeddate)
        else:
            obj[pred] = o
        resources.append(obj)
    return resources
def discovery():
    '''
    Sample query:
    curl "http://localhost:8880/osci.jove.discovery"
    '''
    doc = '''<?xml version="1.0" encoding="UTF-8"?>
<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/" xmlns:osci="%(oscibase)s/content/jove/datamodel#">
    <ShortName>JoVE</ShortName>
    <LongName>JoVE OSCI adapter</LongName>
    <Description>JoVE</Description>
    <Contact>%(admin)s</Contact>
    <Url type="application/atom+xml" rel="results" template="%(oscibase)s/content/jove?search={searchTerms}"/>
    <Url type="application/atom+xml" rel="http://purl.zepheira.com/osci/content/model#id" template="%(oscibase)s/content/jove?id={searchTerms}"/>
    <Attribution>© 2009 Zepheira, LLC</Attribution>
    <osci:metadata-profile href="%(oscibase)s/content/jove/metadata-profile"/>
</OpenSearchDescription>
''' % {'admin': ADMIN_EMAIL, 'oscibase': OSCI_BASE}
    #Check XML
    amara.parse(doc)
    return doc

# --- %< ---

entries = []
for it in islice(feed.rss.channel.item, 0, 3):
    entry = {}
    print >> sys.stderr, "processing", unicode(it.link)
    entry['id'] = unicode(it.link)
    entry['label'] = entry['id']
    entry['title'] = unicode(it.title)
    desc = unicode(it.description)
    entry['description'] = desc[:desc.find(u'<div class="feedflare">')]
    #print htmlparse(str(it.description))
    #Above hack will do for now ;)
    entry['link'] = unicode(it.origLink)
    entry['pubDate'] = time.strftime("%Y-%m-%dT%H:%M:%S",
                                     feedparser._parse_date(str(it.pubDate)))
    entry['categories'] = [unicode(c).strip() for c in it.category]
    entry['snapshot'] = MOCKUP_IMAGES[unicode(it.link)]
    entry['icon'] = MOCKUP_ICON
    entry = get_data_from_page(entry, str(it.origLink))
    entries.append(entry)

print simplejson.dumps({'items': entries}, indent=4)
def from_json(self, js):
    # hooray for utc
    updated = self.updated = self.published = getattr(
        js, 'moodLastUpdated', getattr(js, 'moodStatusLastUpdated', 0))

    if self.updated == 0:
        self.updated_parsed = self.published_parsed = updated
    else:
        try:
            self.updated_parsed = self.published_parsed = int(updated)
        except (ValueError, TypeError):
            updated_parsed = feedparser._parse_date(updated)
            self.updated_parsed = self.published_parsed = int(
                calendar.timegm(updated_parsed))

    user = getattr(js, 'user', None)
    if user is None:
        self.author_id = js.userId
    else:
        self.author_id = user.userId

    log.info_s("status json: %r", js)
    self.id = js.statusId
    moodimage_url = getattr(js, 'moodPictureUrl', getattr(js, 'moodimageurl', None))
    if moodimage_url == self.SPACER_URL:
        moodimage_url = None

    #self.icon_url = user.image
    #self.icon_url = moodimage_url
    self.icon_url = None
    self.contents = [(u'xhtml', js.status)]
    self.body = js.status
    self.mood_text = getattr(js, 'moodName', getattr(js, 'mood', None))
    self.mood_icon = moodimage_url
    self._numComments = 0
    try:
        self._numComments = int(
            getattr(js, 'numComments', None) or getattr(js, '_numComments', None))
    except (AttributeError, ValueError, TypeError):
        self._numComments = None

    self.comments = map(MyspaceComment.from_json, js.get('comments', []))
def str2datetime(string: str) -> datetime:
    """
    >>> str2datetime("01.01.1990")
    datetime.datetime(1990, 1, 1, 0, 0)
    >>> str2datetime("25 AUG 2012")
    datetime.datetime(2012, 8, 25, 0, 0)
    >>> str2datetime("18 APR 1973")
    datetime.datetime(1973, 4, 18, 0, 0)
    >>> str2datetime("1968")
    datetime.datetime(1968, 1, 1, 0, 0)
    """
    try:
        return datetime(*(feedparser._parse_date(string)[:6]))
    except:
        logging.error("failed to parse %s as date" % repr(string))
        raise
def get_datetime(unparsed_date):
    """ string to datetime """
    parsed_date = feedparser._parse_date(unparsed_date)
    if not parsed_date:
        return datetime.datetime.min
    if isinstance(parsed_date, dict):
        return datetime.datetime(parsed_date['year'], parsed_date['month'],
                                 parsed_date['day'], parsed_date['hour'],
                                 parsed_date['min'], parsed_date['sec'],
                                 tzinfo=None)
    else:
        return datetime.datetime(parsed_date[0], parsed_date[1], parsed_date[2],
                                 parsed_date[3], parsed_date[4], parsed_date[5],
                                 tzinfo=None)
def get_metadata(url):
    """Get file download metadata

    Returns a (size, type, date, name) tuple for the given download URL. Will
    use the network connection to determine the metadata via the HTTP header
    fields.
    """
    track_fp = util.urlopen(url)
    headers = track_fp.info()
    filesize = headers['content-length'] or '0'
    filetype = headers['content-type'] or 'application/octet-stream'
    if 'last-modified' in headers:
        parsed_date = feedparser._parse_date(headers['last-modified'])
        filedate = time.mktime(parsed_date)
    else:
        filedate = None
    filename = os.path.basename(os.path.dirname(url))
    track_fp.close()
    return filesize, filetype, filedate, filename
def GetTorrents(feed_list):
    torrents = []
    for feed in feed_list:
        last_sync = feedparser._parse_date(feed['last_sync'])
        feedparser_dict = feedparser.parse(feed['link'])

        for entry in feedparser_dict.entries:
            # Torrent links are stored as a link element or as an enclosure
            if entry.published_parsed > last_sync:
                if '.torrent' in entry.link:
                    torrents.append({"link": entry.link,
                                     "date": entry.published,
                                     "date_parsed": entry.published_parsed})
                elif (len(entry.enclosures) and
                      entry.enclosures[0]['type'] == 'application/x-bittorrent'):
                    torrents.append({"link": entry.enclosures[0]['href'],
                                     "date": entry.published,
                                     "date_parsed": entry.published_parsed})

    # Get highest date of this feed, update json, and return only torrents
    UpdateSyncDates(torrents)
    torrents = [torrent["link"] for torrent in torrents]
    return torrents
def test_None(self):
    self.assertTrue(feedparser._parse_date(None) is None)
def get(self, save, **kwargs):
    '''Document collected via {} feed reader'''.format(self.doctype)

    # This RSS-scraper is a generic fallback option in case we do not have
    # any specific one. Therefore, only use the following generic values
    # if we do not have any more specific info already
    if 'rss_url' in kwargs:
        RSS_URL = kwargs['rss_url']
    else:
        try:
            RSS_URL = self.rss_url
        except:
            RSS_URL = 'N/A'

    assert RSS_URL != 'N/A', 'You need to specify the feed URL. Example: rss_url="http://www.nu.nl/rss"'

    if type(RSS_URL) is str:
        RSS_URL = [RSS_URL]

    for thisurl in RSS_URL:
        rss_body = self.get_page_body(thisurl)
        d = feedparser.parse(rss_body)
        for post in d.entries:
            try:
                _id = post.id
            except:
                _id = post.link
            if _id == None:
                _id = post.link
            link = re.sub("/$", "", self.getlink(post.link))
            # By now, we have retrieved the RSS feed. We now have to determine for the item that
            # we are currently processing (post in d.entries), whether we want to follow its
            # link and actually get the full text and process it. If we already have it,
            # we do not need to (therefore check_exists). But also, if we do not want to
            # work with the database backend (as indicated by save=False), we probably also
            # do not want to look something up in the database. We therefore also retrieve it in
            # that case.
            if save == False or check_exists(_id)[0] == False:
                try:
                    req = urllib2.Request(link, headers={'User-Agent': "Wget/1.9"})
                    htmlsource = urllib2.urlopen(req).read().decode(
                        encoding="utf-8", errors="ignore")
                except:
                    htmlsource = None
                    logger.info('Could not open link - will not retrieve full article, '
                                'but will give it another try with different User Agent')
                # Some (few) scrapers seem to block certain user agents. Therefore, if the code
                # above did not succeed, try fetching the article pretending to use Firefox on Windows
                if not htmlsource or htmlsource == "":
                    try:
                        req = urllib2.Request(link, headers={
                            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"})
                        htmlsource = urllib2.urlopen(req).read().decode(
                            encoding="utf-8", errors="ignore")
                    except:
                        htmlsource = None
                        logger.info('Could not open link - will not retrieve full article')
                try:
                    teaser = re.sub(r"\n|\r\|\t", " ", post.description)
                except:
                    teaser = ""
                try:
                    datum = datetime.datetime(
                        *feedparser._parse_date(post.published)[:6])
                except:
                    try:
                        # alternative date format as used by nos.nl
                        datum = datetime.datetime(
                            *feedparser._parse_date(post.published[5:16])[:6])
                    except:
                        #print("Couldn't parse publishing date")
                        datum = None
                doc = {
                    "_id": _id,
                    "title_rss": post.title,
                    "teaser_rss": teaser,
                    "publication_date": datum,
                    "htmlsource": htmlsource,
                    "feedurl": thisurl,
                    "url": re.sub("/$", "", post.link)
                }
                if htmlsource is not None:
                    # TODO: CHECK IF PARSEHTML returns None, if so, raise custom exception
                    parsed = self.parsehtml(doc['htmlsource'])
                    if parsed is None or parsed == {}:
                        try:
                            raise UnparsableException
                        except UnparsableException:
                            pass
                    else:
                        doc.update(parsed)
                parsedurl = self.parseurl(link)
                doc.update(parsedurl)
                docnoemptykeys = {k: v for k, v in doc.items() if v or v == False}
                yield docnoemptykeys
def update():
    VALID_EXT = debrid.VALID_EXT
    rsslist = rssList()
    sourceList = []
    if len(rsslist) > 0:
        control.infoDialog('Checking RSS Lists...')
        for x in rsslist:
            u = x['rss']
            timeNow = datetime.datetime.utcnow()
            timeOffset = int(x['offset'])
            timeOffset = (timeNow - datetime.timedelta(days=int(timeOffset))).strftime('%Y%m%d')
            html = requests.get(u).content
            r = BeautifulSoup(html, "html.parser")
            soup = r.find_all('channel')[0]
            soup = soup.find_all('item')
            for item in soup:
                try:
                    title = item.find_all('title')[0].getText().strip()
                    link = item.find_all('link')[0].getText().strip()
                    checkDB = rssDB(mode='check', link=link, title=title)
                    if checkDB == True:
                        print("[REALIZER RSS MANAGER] TORRENT ALREADY ADDED: %s" % title)
                        raise Exception()
                    try:
                        date = item.find_all('pubdate')[0].getText().strip()
                    except:
                        date = item.find_all('pubDate')[0].getText().strip()
                    dateString = feedparser._parse_date(date)
                    dt = datetime.datetime.fromtimestamp(mktime(dateString))
                    pubDate = dt.strftime('%Y%m%d')
                    strDate = dt.strftime('%Y-%m-%d')
                    if int(pubDate) >= int(timeOffset):
                        r = debrid.realdebrid().addtorrent(link)
                        id = r['id']
                        select = debrid.realdebrid().torrentInfo(id)
                        status = str(select['status'])
                        print("[REALIZER RSS MANAGER] REALDEBRID STATUS", status)
                        if (cleantitle.get(status) != 'waitingfilesselection'
                                and cleantitle.get(status) != 'downloaded'):
                            debrid.realdebrid().delete(id, type='torrents')
                            raise Exception()
                        files = select['files']
                        filesIDs = [i['id'] for i in files
                                    if i['path'].split('.')[-1].lower() in VALID_EXT]
                        if len(filesIDs) < 1 or filesIDs == []:
                            debrid.realdebrid().delete(id, type='torrents')
                            raise Exception()
                        r = debrid.realdebrid().selectTorrentList(id, filesIDs)
                        source = {'title': title, 'link': link, 'id': id, 'date': str(strDate)}
                        sourceList.append(source)
                except:
                    pass
        control.infoDialog('RSS Lists check completed')
        rssDB(data=sourceList)
def t(dateString):
    t = feedparser._parse_date(dateString)
    return datetimefromparsed(t).isoformat() if t else None
def parse_rfc3339(s: str) -> datetime.datetime:
    # pylint: disable=protected-access
    struct = feedparser._parse_date(s)
    return ts2dt(int(timegm(struct)))
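# A hedged stdlib-only alternative for comparison (Python 3.7+), sketched here
# to avoid the protected feedparser helper; it is not the snippet's own code.
# datetime.fromisoformat() does not accept a trailing 'Z' before Python 3.11,
# hence the replace().
import datetime

def parse_rfc3339_stdlib(s: str) -> datetime.datetime:
    # Returns an aware datetime; the feedparser-based version above goes
    # through a Unix timestamp (ts2dt) instead.
    return datetime.datetime.fromisoformat(s.replace('Z', '+00:00'))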
#Daily Beast
dBeast = []

sourceList = [nytimes, washPo, dBeast]

feeds = []
for source in sourceList:
    for feed in source:
        feeds.append(feedparser.parse(feed))

#The set of all articles in Newspaper format.
entries = []

#runs through each feed, taking each item / article, and adding its nArticle to the entries object.
for feed in feeds:
    for item in feed["items"]:
        if "published" in item:
            pubDate = item["published"]
        elif "pubDate" in item:
            pubDate = item["pubDate"]
        else:
            pubDate = "??"
        print calendar.timegm(time.gmtime()) - calendar.timegm(
            feedparser._parse_date(pubDate))
        entries.append(nArticle(item["link"]))

#From here there seems to be a set of entries containing a ton of Articles using the URLs from the RSS feed.
pdfkit.from_url(entries[1].url, 'out.pdf')
def getLink(self):
    """Reads the HTML page and extracts the link, title and body."""
    if not self.children.intersection(self.attrs):
        return  # mandatory child element missing
    self.loadCache()
    try:
        f = feedparser._open_resource(self.uri, self.etag, self.modified,
                                      USER_AGENT, None, [], {})
        html = f.read()
    except Exception as e:
        sys.stderr.write('Getting page %s: %s\n' % (self.uri, e))
        return
    if getattr(f, 'status', None) == 304 or not html:
        # not modified or empty page
        return

    # save HTTP headers
    if hasattr(f, 'info'):
        info = f.info()
        etag = info.getheader('ETag')
        modified = info.getheader('Last-Modified')
        if modified:
            modified = feedparser._parse_date(modified)
        self.saveCache(etag, modified)

        # if the page is compressed, decompress it
        ce = info.getheader('Content-Encoding', '')
        if ce == 'gzip':
            try:
                import gzip
                import StringIO
                html = gzip.GzipFile(fileobj=StringIO.StringIO(html)).read()
            except Exception as e:
                sys.stderr.write('Unzipping page %s: %s\n' % (self.uri, e))
                return
        elif ce == 'deflate':
            try:
                import zlib
                html = zlib.decompress(html, -zlib.MAX_WBITS)
            except Exception as e:
                sys.stderr.write('Inflating page %s: %s\n' % (self.uri, e))
                return

    # resolve relative URIs
    html = feedparser._resolveRelativeURIs(html, self.uri, self.encoding, 'text/html')

    if hasattr(f, 'headers'):
        charsets = [c for c in feedparser._getCharacterEncoding(f.headers, html) if c]
    else:
        charsets = [self.encoding]
    for charset in charsets:
        try:
            html = html.decode(charset)
            break
        except UnicodeDecodeError:
            pass
        except LookupError:
            pass

    if 'regex' in self.attrs:
        self.match_regex(html)
    else:
        self.match_xpath(html)
        data = ''
elif zlib and f.headers.get('content-encoding', '') == 'deflate':
    try:
        data = zlib.decompress(data, -zlib.MAX_WBITS)
    except Exception, e:
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = ''

# save HTTP headers
if hasattr(f, 'info'):
    info = f.info()
    result['etag'] = info.getheader('ETag')
    last_modified = info.getheader('Last-Modified')
    if last_modified:
        result['modified'] = _parse_date(last_modified)
if hasattr(f, 'url'):
    result['href'] = f.url
    result['status'] = 200
if hasattr(f, 'status'):
    result['status'] = f.status
if hasattr(f, 'headers'):
    result['headers'] = f.headers.dict
if hasattr(f, 'close'):
    f.close()

# there are four encodings to keep track of:
# - http_encoding is the encoding declared in the Content-Type HTTP header
# - xml_encoding is the encoding declared in the <?xml declaration
# - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
# - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    data = feedparser.parse(feed_xml)
    entries = get_entries(data, pipe)
    logging.debug('get %d entries', len(entries))
    key_names = []
    for e in entries:
        key_names.append(e['key_name'])
    entries = filter_entries(entries, pipe)
    logging.debug('get %d entries', len(entries))
    for e in entries:
        logging.debug(' e.key().id_or_name() = %s e.link = %s ',
                      str(e.key().id_or_name()), str(e.link))
elif type == 'time':
    oldest_update_time = datetime.datetime.fromtimestamp(
        time.mktime(feedparser._parse_date('2010/07/15 07:29:25 +0800')))
    logging.debug('the oldest_update_time is %s', str(oldest_update_time))
elif type == 'query':
    feed_xml = self.get_xml(pipe)
    logging.debug('the len(feed_xml) is %d', len(feed_xml))
    data = feedparser.parse(feed_xml)
    entries = get_entries(data, pipe)
    logging.debug('start filter %d entries', len(entries))
    oldest_update_time = get_oldest_update_time(entries)
    logging.debug('the oldest_update_time is %s', str(oldest_update_time))
    db_entries = model.FeedEntry.gql(
        "WHERE pipe = :1 AND updated_time >=:2 ORDER BY updated_time DESC",
        pipe, oldest_update_time).fetch(200)
    logging.debug('query finished. get %d entries', len(db_entries))