def ParseFeedUrls(self): """ return list like [(section,title,url),..] """ urls = [] for feed in self.feeds: section, url = feed[0], feed[1] isfulltext = feed[2] if len(feed) > 2 else False timeout = CONNECTION_TIMEOUT+15 if isfulltext else CONNECTION_TIMEOUT opener = URLOpener(self.host, timeout=timeout) result = opener.open(url) if result.status_code == 200 and result.content: if self.feed_encoding: feed = feedparser.parse(result.content.decode(self.feed_encoding)) else: feed = feedparser.parse(AutoDecoder().decode(result.content)) urladded = [] # 防止部分RSS产生重复文章 for e in feed['entries'][:self.max_articles_per_feed]: url = e.link if url not in urladded: if isfulltext: desc = e.content[0].value if hasattr(e, 'content') and e.content[0].value else e.summary urls.append((section, e.title, url, desc if desc else u'Has no summary, is it fulltext feed?')) else: urls.append((section, e.title, url, None)) urladded.append(url) else: self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url)) return urls
def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] for feed in self.feeds: section, url = feed[0], feed[1] isfulltext = feed[2] if len(feed) > 2 else False timeout = self.timeout+10 if isfulltext else self.timeout opener = URLOpener(self.host, timeout=timeout) result = opener.open(url) if result.status_code == 200 and result.content: if self.feed_encoding: feed = feedparser.parse(result.content.decode(self.feed_encoding)) else: feed = feedparser.parse(AutoDecoder().decode(result.content)) urladded = set() # 防止部分RSS产生重复文章 for e in feed['entries'][:self.max_articles_per_feed]: #支持HTTPS urlfeed = e.link.replace('http://','https://') if url.startswith('https://') else e.link if urlfeed not in urladded: desc = None if isfulltext: if hasattr(e, 'content') and e.content[0].value: desc = e.content[0].value elif hasattr(e, 'summary'): desc = e.summary else: self.log.warn('feed item invalid,link to webpage for article.(%s)'%e.title) urls.append((section, e.title, urlfeed, desc)) urladded.add(urlfeed) else: self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url)) return urls
def updateFeed(feed):
    """ function to update the content of a feed """
    feedData = feedparser.parse(feed.url, etag=feed.etag,
                                modified=time.localtime(feed.lastModified))
    try:
        if feedData.status == 301:
            print "feed url modified. trying the new url..."
            feedData = feedparser.parse(feedData.url, etag=feed.etag,
                                        modified=time.localtime(feed.lastModified))
        if feedData.status == 304:
            print "No updates"
        else:
            print feedData.status
            lastModified = time.localtime(feed.lastModified)
            feed.lastModified = time.mktime(feedData.modified)
            for item in feedData.entries:
                if item.updated_parsed > lastModified:
                    _addItem(feed, item)
                    print "Added %s to the database." % item.title  # comment this later
    except AttributeError:
        print "Error fetching feeds, Network error???"
def parsefeed(self, feed, retires=1): """parse feed using feedparser""" try: # 访问feed,自动尝试在地址结尾加上或去掉'/' feed_data = feedparser.parse(feed.encode('utf-8')) if not feed_data.feed.has_key('title'): if feed[-1] == '/': feed_data = feedparser.parse(feed[0:-1].encode('utf-8')) elif feed[-1] != '/': feed_data = feedparser.parse((feed + '/').encode('utf-8')) if not feed_data.feed.has_key('title'): raise UserWarning("read error") else: return feed_data else: return feed_data except UserWarning: logging.error("fail({}): {}".format(feed, "read error")) return None except Exception, e: if retires > 0: logging.error("error({}): {} , retry".format(feed, e)) return self.parsefeed(feed, retires - 1) # 如果读取错误,重试一次 else: logging.error("fail({}): {}".format(feed, e)) return None
def ParseFeedUrls(self): """ return list like [(section,title,url),..] """ urls = [] opener = URLOpener(self.host) decoder = AutoDecoder() for section, url in self.feeds: result = opener.open(url) if result.status_code == 200: if self.feed_encoding: feed = feedparser.parse(result.content.decode(self.feed_encoding)) else: feed = feedparser.parse(decoder.decode(result.content)) for e in feed['entries'][:self.max_articles_per_feed]: urls.append((section,e.title,e.link)) return urls
def main():
    optp = OptionParser()
    optp.add_option("-a", "--auth", dest="auth", help="GMail http basic auth hash")
    optp.add_option("-b", "--botjid", dest="botJid", help="bot's jabber id")
    optp.add_option("-p", "--botpass", dest="botPass", help="bot's jabber password")
    optp.add_option("-t", "--targetjid", dest="targetJid", help="message recipient jid")
    opts, args = optp.parse_args()

    request = urllib2.Request("https://mail.google.com/mail/feed/atom")
    request.add_header("Authorization", "Basic " + opts.auth)
    result = urllib2.urlopen(request)
    rss = feedparser.parse(result.read())

    if len(rss['entries']) > 0:
        f = open(os.path.dirname(__file__) + '/msgs.list', 'r+a')
        seens = f.read().split("\n")
        for item in rss['entries']:
            if item['id'] not in seens and item['author'].find(opts.botJid) < 0:
                sendJabberMessage("*%s* \n%s\n\n%s" % (item['title'], item['author'], item['summary']), opts)
                f.write(item['id'] + "\n")
def get(self):
    feed_id, feed_url = self.request.get('feed_id'), \
        self.request.get('feed_url')
    url_result = urllib2.urlopen(feed_url)
    feed_result = fd.parse(url_result)
    if feed_result.bozo == 1:
        logging.error('fetch error, id: %s, url: %s, error: %s',
                      feed_id, feed_url, feed_result.bozo_exception)
        return
    feed_update_time = feed_result.get('updated', datetime.utcnow())
    has_update = True
    f = Feed.get_by_id(int(feed_id))
    if f.is_allow_fetch(feed_update_time):
        for entry in feed_result.entries:
            if entry.published_parsed <= f.lastedPublishedTime:
                logging.info('no updated, id: %s, url: %s', feed_id, feed_url)
                has_update = False
                break
            e = Entry(title=entry.title,
                      url=entry.link,
                      author=entry.author,
                      content=entry.content,
                      publishedTime=entry.published_parsed)
            e.put()
            logging.debug('fetch entry, url: %s', entry.link)
def feed_segments(self): segment_props = [] response = fetch_url("http://podcastrss.play.it/the-sports-junkies_mp3_128.xml") if response and response.status_code == 200: feed = feedparser.parse(response.content) feed_items = feed.get('items') feed_items.reverse() classic_segment_count = 30 for item in feed_items: url = str(item['enclosures'][0]['href']).split('?')[0].replace('http://www.podtrac.com/pts/redirect.mp3/', 'http://') props = { 'description': item.get('description'), \ 'duration': int(item.get('itunes_duration', 0)), \ 'date': datetime.fromtimestamp(mktime(item['updated_parsed'])).date(), \ 'url': url, } if is_classic_segment(item): props['is_classic'] = True props['num'] = classic_segment_count classic_segment_count += 1 else: props['is_classic'] = False props['num'] = parse_segment_number(item['title']) printer(self.response.out, "%s\n" % props) segment_props.append(props) return segment_props
def addFeed(feedUrl):
    """ Function to add a new feed to the database. """
    try:
        feedData = feedparser.parse(feedUrl)
    except:
        # this never occurs since parser does not raise any exceptions when invalid url is sent
        print "Invalid feed Url!"
    else:
        try:
            newFeed = Feed(url=unicode(feedUrl), title=feedData.feed.title,
                           lastModified=time.mktime(feedData.modified),
                           etag=unicode(feedData.etag))
            session.commit()
        except AttributeError:
            session.rollback()
            print "Error! Invalid feed URL"
        except:
            session.rollback()
            print "%s \t Feed already subscribed" % (feedData.feed.title)
        else:
            print "Subscribed to \t %s " % (feedData.feed.title)
            fetchFeeds(newFeed, feedData)
def get_messages(self, account, url): user = self.session.get_user() response, content = oauth.CybozuliveHandler.request(user, account, url) if response["status"] != "200": raise Exception(response["status"] + " failed to get messages. : " + url) result = feedparser.parse(content) messages = [] for entry in result.entries: messages.append({ "id": entry.id, "title": entry.title, "link": entry.link, "author": entry.author, "summary": re.sub("\n", '<br/>', utils.escape_html(entry.summary)) if hasattr(entry, "summary") else "", "updated": datetime.datetime( *entry.updated_parsed[:6]).strftime("%a %b %d %H:%M:%S %Y") }) template_values = { 'service': 'cybozulive', "title": result.feed.title, "link": result.feed.link, "feed_url": url, 'messages': messages } return template_values
def get(self, action=""): if action == "messages": url = self.request.get('type') d = memcache.get(url) if d is None: result = urllib.urlopen(url) d = feedparser.parse(result) memcache.set(url, d, 2*60) #2分キャッシュ #RSSの形式が規格外の場合 #if d.bozo == 1: # raise Exception("Can not parse given URL.") response = { "title": d.feed.get("title"), "link": d.feed.get("link"), "feed_url": url, "messages": [] } for entry in d.entries: response["messages"].append({ "title": entry.get("title"), "link": entry.get("link"), "updated": datetime.datetime(*entry.updated_parsed[:6]).strftime("%a %b %d %H:%M:%S %Y") }) feed_json = simplejson.dumps(response) self.response.headers["Cache-Control"] = "public, max-age=120" self.response.headers["Content-Type"] = "application/json" return self.response.out.write(feed_json) elif action == "add_column": tmpl = os.path.join(os.path.dirname(__file__), "../view/rss_add_column.html") return self.response.out.write(template.render(tmpl, {})) self.error(400)
def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] tnow = datetime.datetime.utcnow() urladded = set() for feed in self.feeds: section, url = feed[0], feed[1].replace('gzh', 'gzhjs') isfulltext = feed[2] if len(feed) > 2 else False timeout = self.timeout+10 if isfulltext else self.timeout opener = URLOpener(self.host, timeout=timeout) result = opener.open(url) if result.status_code == 200 and result.content: if self.feed_encoding: try: content = result.content.decode(self.feed_encoding) except UnicodeDecodeError: content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers) else: content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers) content = content[content.index('{'):content.index('}')+1] content = json.loads(content) for e in content['items'][:self.max_articles_per_feed]: e = feedparser.parse(e)['entries'][0] updated = None if hasattr(e, 'lastmodified') and e.lastmodified: updated = float(e.lastmodified) if self.oldest_article > 0 and updated: updated = datetime.datetime.utcfromtimestamp(updated) delta = tnow - updated if self.oldest_article > 365: threshold = self.oldest_article #以秒为单位 else: threshold = 86400*self.oldest_article #以天为单位 if delta.days*86400+delta.seconds > threshold: self.log.info("Skip old article(%s): %s" % (updated.strftime('%Y-%m-%d %H:%M:%S'),e.href)) continue #支持HTTPS if hasattr(e, 'href'): if url.startswith('https://'): urlfeed = e.href.replace('http://','https://') else: urlfeed = e.href if urlfeed in urladded: continue else: urlfeed = '' desc = None urls.append((section, e.title, urlfeed, desc)) urladded.add(urlfeed) else: self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url)) return urls
def blog_search(query):
    cache_key = "newsy-blog-" + sha.new(query).hexdigest()
    entries = cache.get(cache_key)
    if entries is None:
        url = "http://blogsearch.google.co.jp/blogsearch_feeds?"
        params = {'hl': 'en', 'q': query, 'lr': 'lang_en',
                  'ie': 'utf-8', 'num': 25, 'output': 'rss'}
        d = feedparser.parse(url + urllib.urlencode(params))
        entries = []
        for e in d.entries:
            # some results expose 'summary', others only 'description'
            if 'summary' in e:
                summary = e.summary
            else:
                summary = e.description
            entries.append({'title': e.title,
                            'link': e.link,
                            'description': summary})
        # store the parsed results so repeated queries can reuse the cache lookup above
        cache.set(cache_key, entries)
    return entries
def get_profiles(accounts): profiles = [] max_results = 100 for account in accounts: if account.service != "cybozulive" or account.access_token == None: continue # TODO 必ずプロフィール情報を更新するのはやめたい start_index = 0 while True: response, content = oauth.CybozuliveHandler.request_with_account( account, "https://api.cybozulive.com/api/group/V2?max-results=%s&start-index=%s" % (str(max_results), str(start_index))) account.account_info = unicode(content, 'utf-8') account.put() account_info = feedparser.parse(account.account_info) for group in account_info.entries: profiles.append({ "service": "cybozulive", "account_name": account.account_name, "url": "cybozulive/post/" + account.account_name + "/" + group.id.split(",")[1], "name": group.title + "/" + account.account_name }) if len(account_info.entries) < max_results: break start_index += max_results return profiles
def fetch(self):
    try:
        result = urlfetch.fetch(self.uri.encode('utf-8'))
    except:
        self.error = 'Can’t Fetch'
        return None
    if result.status_code != 200:
        self.error = 'Can’t Fetch (%d)' % result.status_code
        return None
    try:
        rss = feedparser.parse(result.content)
    except:
        self.error = 'Wrong RSS Format'
        return None
    if not rss or rss.bozo == 1:
        self.error = 'Wrong RSS Format'
        return None
    # keep only the URL, title and date of each entry
    self.error = ''
    self.title = rss.channel.title
    self.entries = []
    for entry in rss.entries:
        e = Entry()
        e.title = entry.title
        e.link = entry.link
        e.updated = entry.updated
        self.entries.append(e)
    return self
def import_posts(commit=True): ideas = Idea.all().fetch(1000) #ideas = [Idea.get_by_id(9)] print 'Importing posts for %s idea(s)...' % len(ideas) to_put = [] for idea in ideas: soup = make_soup(idea.source_url) # We get the idea's actual body from the RSS feed rss = feedparser.parse(idea_feed_url(idea)) body = rss.feed.subtitle.replace( '\nFeed Created by spigit.com feed manager.', '') idea.body = clean_body(body) to_put.append(idea) headers = soup.find('td', 'main')\ .findAll('div', 'commentheader', recursive=False) for header in headers: content = header.findNextSiblings('div', limit=1)[0] post = make_post(idea, header, content, commit=False) to_put.extend(post) to_put = filter(None, to_put) if commit: db.put(to_put) return to_put
def get_rssfeed_parsed(self, rssfeed_data, cookies=None, cookie_header={}):
    """
    rssfeed_data: A dictionary containing rss feed data as stored in the YaRSS2 config.
    cookies: A dictionary of cookie values as stored in the YaRSS2 config.
             If given, the cookie_header parameter will not be used.
    cookie_header: A dictionary of cookie values as returned by yarss2.http.get_cookie_header.
    """
    return_dict = {}
    rssfeeds_dict = {}
    if cookies:
        cookie_header = http.get_cookie_header(cookies, rssfeed_data["site"])
    self.log.info("Fetching RSS Feed: '%s' with Cookie: '%s'." % (rssfeed_data["name"], cookie_header))
    # Will abort after 10 seconds if the server doesn't answer
    try:
        parsed_feeds = feedparser.parse(rssfeed_data["url"], request_headers=cookie_header,
                                        agent=self.agent, timeout=10)
    except Exception, e:
        self.log.warn("Exception occurred in feedparser: " + str(e))
        self.log.warn("Feedparser was called with url: '%s' and header: '%s'" % (rssfeed_data["url"], cookie_header))
        self.log.warn("Stacktrace: " + common.get_exception_string())
        return None
def post(self): stream = FeedStream.get(db.Key(self.request.POST.get("key"))) if stream is None: logging.warn("feedstream not found for subscription request") self.response.out.write("feedstream not found for subscription request") self.error(404) return feed = feedparser.parse(stream.url) if hasattr(feed, 'feed') and hasattr(feed.feed, 'links'): hub_url = find_feed_url('hub', feed.feed.links) if hub_url is None: logging.info("no hub found for: %s" % stream.url) self.response.out.write('no hub found') return else: logging.info("sending pshb subscription request for: %s" % stream.url) stream.pshb_hub_url = hub_url stream.put() self.subscribe_to_topic(stream, hub_url) self.response.out.write('sent subscription request') return logging.warn('could not parse feed unable to initiate subscription') self.response.out.write('could not parse feed unable to initiate subscription') self.error(400)
def get_feed(channel_id):
    feedurl = 'https://www.youtube.com/feeds/videos.xml?channel_id=' + str(channel_id)
    try:
        d = feedparser.parse(feedurl)
    except Exception as e:
        logging.info('exception caught {}'.format(e))
def main(bot, user, target, msg):
    feed = feedparser.parse(bot.memory['feeds']['civfanatics']['url'])
    bot.message(
        user, target,
        feed.feed.title + ' :: ' + feed.entries[0].title + ': ' +
        feed.entries[0].summary_detail.value + ' - ' +
        feed.entries[0].updated + ' - ' +
        xrl.xrl_encoder(feed.entries[0].link))
    bot.memory['feeds']['civfanatics']['last_title'] = feed.feed.title
def parse_feed(self, feed):
    """Helper method to handle conditional HTTP stuff"""
    try:
        logging.info("Requesting Feed for: %s" % feed.url)
        if feed.http_etag is not None and len(feed.http_etag) > 0 and feed.http_last_modified is not None:
            # give feedparser back what it pulled originally, a time.struct_time object
            return feedparser.parse(feed.url, etag=feed.http_etag,
                                    modified=feed.http_last_modified.timetuple())
        if feed.http_etag is not None and len(feed.http_etag) > 0:
            return feedparser.parse(feed.url, etag=feed.http_etag)
        if feed.http_last_modified is not None:
            # give feedparser back what it pulled originally, a time.struct_time object
            return feedparser.parse(feed.url, modified=feed.http_last_modified.timetuple())
        else:
            return feedparser.parse(feed.url)
    except UnicodeDecodeError:
        logging.error("Unicode error parsing feed: %s" % feed.url)
        return None
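# For reference, feedparser surfaces the conditional-GET handshake on the parsed result
# itself ('status', 'etag' and 'modified' when the server supplied them), so a caller of a
# helper like parse_feed above typically checks for a 304 before touching the entries.
# The sketch below is illustrative only: FeedRecord and refresh() are hypothetical names,
# not part of the original code, and the persistence step is left out.
import feedparser

class FeedRecord(object):
    """Illustrative stand-in for a stored feed with cached HTTP validators."""
    def __init__(self, url):
        self.url = url
        self.http_etag = None
        self.http_last_modified = None

def refresh(record):
    d = feedparser.parse(record.url,
                         etag=record.http_etag,
                         modified=record.http_last_modified)
    # a 304 means the cached copy is still current, so there is nothing new to process
    if getattr(d, 'status', None) == 304:
        return []
    # remember whichever validators the server handed back for the next request
    record.http_etag = getattr(d, 'etag', None)
    record.http_last_modified = getattr(d, 'modified', None)
    return d.entries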
def get_account_info(self):
    resp, content = self.oauth_request("https://api.cybozulive.com/api/group/V2")
    if resp["status"] == "200":
        content = unicode(content, 'utf-8')
        d = feedparser.parse(content)
        return d.feed.author_detail.email, None, None, content
    else:
        raise Exception("failed to verify credentials")
def GOODREADS(host=None, feednr=None, priority=0, dispname=None, test=False): """ Goodreads RSS query function, return all the results in a list, can handle multiple wishlists but expects goodreads format (looks for goodreads category names) """ results = [] basehost = host if not str(host)[:4] == "http": host = 'http://' + host URL = host result, success = fetchURL(URL) if test: return success if success: data = feedparser.parse(result) else: logger.error('Error fetching data from %s: %s' % (host, result)) BlockProvider(basehost, result) return [] if data: logger.debug('Parsing results from %s' % URL) provider = data['feed']['link'] if not dispname: dispname = provider logger.debug("RSS %s returned %i result%s" % (provider, len(data.entries), plural(len(data.entries)))) for post in data.entries: title = '' book_id = '' author_name = '' isbn = '' if 'title' in post: title = post.title if 'book_id' in post: book_id = post.book_id if 'author_name' in post: author_name = post.author_name if 'isbn' in post: isbn = post.isbn if title and author_name: results.append({ 'rss_prov': provider, 'rss_feed': feednr, 'rss_title': title, 'rss_author': author_name, 'rss_bookid': book_id, 'rss_isbn': isbn, 'priority': priority, 'dispname': dispname }) else: logger.debug('No data returned from %s' % host) return results
def request_feed(url, **kwargs):
    """
    Wrapper for `request_response', which will return a feed object.
    """
    response = request_response(url, **kwargs)
    if response is not None:
        return feedparser.parse(response.content)
def add_channel_to_db(url): used_groups = created_groups = 0 # debug info feed = feedparser.parse(url) if feed['bozo'] == 1: raise "RSS is not well-formed XML" items = feed['items'] db = DataBase() group_name = db.get_last_group_id() messages = db.get_news() for item in items: item_hash = get_hashes(plain_from_html(item['summary'])) equal = message_id = 0 for message in messages: # convert string to hash mess_hash = [] for x in re.split(' ', message['hashes']): if x != '': mess_hash.append(int(x)) t_equal = compare_hashes(item_hash, mess_hash) if t_equal > equal: equal = t_equal message_id = message['message_id'] # convert hashes to string str_hash = u"" for h in item_hash: str_hash += " %i" % h if equal > 0: used_groups = used_groups + 1 # debug # add new message db.add_message(feed['channel']['title'], plain_from_html(item['title']), plain_from_html(item['summary']), str_hash, db.get_message_group(message_id)) else: created_groups = created_groups + 1 # debug # create new group db.add_new_group("Topic %i" % group_name) # add new message db.add_message(feed['channel']['title'], plain_from_html(item['title']), plain_from_html(item['summary']), str_hash, group_name) # increment group id group_name = group_name+1 if __debug__: print "new groups %i, used groups %i" % \ (created_groups, used_groups)
def Items(self): itemsprocessed = [] cnt4debug = 0 opener = URLOpener(self.host) decoder = AutoDecoder() for section, url in self.feeds: content = None cnt4debug += 1 if IsRunInLocal and cnt4debug > 1: break result = opener.open(url) status_code, content = result.status_code, result.content if status_code != 200 and content: logging.error('err(%d) to fetch %s.' % (status_code,url)) continue if self.feed_encoding: content = content.decode(self.feed_encoding) else: content = decoder.decode(content) content = self.preprocess(content) feed = feedparser.parse(content) for e in feed['entries']: # 全文RSS中如果有广告或其他不需要的内容,可以在postprocess去掉 desc = self.postprocess(e.description) desc = self.FragToXhtml(desc, e.title, self.feed_encoding) if self.keep_image: soup = BeautifulSoup(content) self.soupbeforeimage(soup) for img in soup.findAll('img'): imgurl = img['src'] if not imgurl.startswith('http') and not imgurl.startswith('www'): imgurl = self.urljoin(url, imgurl) imgresult = opener.open(imgurl) imgcontent = imgresult.content if imgresult.status_code == 200 else None if imgcontent: imgtype = imghdr.what(None, imgcontent) if imgtype: imgmime = r"image/" + imgtype if imgtype == 'jpeg': fnimg = "%d.jpg" % random.randint(10000,99999999) else: fnimg = "%d.%s" % (random.randint(10000,99999999), imgtype) img['src'] = fnimg yield (imgmime, imgurl, fnimg, imgcontent) self.soupprocessex(soup) desc = soup.renderContents('utf-8').decode('utf-8') soup = None if e.title not in itemsprocessed and desc: itemsprocessed.append(e.title) yield (section, e.link, e.title, desc)
def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] tnow = datetime.datetime.utcnow() urladded = set() for feed in self.feeds: section, url = feed[0], feed[1] isfulltext = feed[2] if len(feed) > 2 else False timeout = self.timeout+10 if isfulltext else self.timeout opener = URLOpener(self.host, timeout=timeout) result = opener.open(url) if result.status_code == 200 and result.content: if self.feed_encoding: try: content = result.content.decode(self.feed_encoding) except UnicodeDecodeError: content = AutoDecoder(True).decode(result.content,url) else: content = AutoDecoder(True).decode(result.content,url) feed = feedparser.parse(content) for e in feed['entries'][:self.max_articles_per_feed]: updated = None if hasattr(e, 'updated_parsed') and e.updated_parsed: updated = e.updated_parsed elif hasattr(e, 'published_parsed') and e.published_parsed: updated = e.published_parsed elif hasattr(e, 'created_parsed'): updated = e.created_parsed if self.oldest_article > 0 and updated: delta = tnow - datetime.datetime(*(updated[0:6])) if delta.days*86400+delta.seconds > 86400*self.oldest_article: self.log.info("Skip old article: %s" % e.link) continue #支持HTTPS urlfeed = e.link.replace('http://','https://') if url.startswith('https://') else e.link if urlfeed in urladded: continue desc = None if isfulltext: if hasattr(e, 'description'): desc = e.description elif hasattr(e, 'content') and e.content[0]['value']: desc = e.content[0]['value'] else: self.log.warn('fulltext feed item no has desc,link to webpage for article.(%s)'%e.title) urls.append((section, e.title, urlfeed, desc)) urladded.add(urlfeed) else: self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url)) return urls
def ParseFeedUrls(self): #解析xml,返回相关信息 """ return list like [(section,title,url,desc),..] """ urls = [] tnow = datetime.datetime.utcnow() urladded = set() for feed in self.feeds: section, url = feed[0], feed[1] isfulltext = feed[2] if len(feed) > 2 else False timeout = self.timeout + 10 if isfulltext else self.timeout opener = URLOpener(self.host, timeout=timeout) result = opener.open(url) if result.code == 200 and result.content: if self.feed_encoding: content = result.content.decode(self.feed_encoding) else: content = AutoDecoder(True).decode(result.content, url) feed = feedparser.parse(content) #进行解析 #分解得到的内容 for e in feed['entries'][:self. max_articles_per_feed]: #取相应数量的feed if self.oldest_article > 0 and hasattr( e, 'updated_parsed'): #是否有更新 updated = e.updated_parsed if updated: delta = tnow - datetime.datetime(*(updated[0:6])) #根据时间来判断要取的文章 if delta.days * 86400 + delta.seconds > 86400 * self.oldest_article: self.log.info("Skip old article: %s" % e.link) continue #支持HTTPS urlfeed = e.link.replace( 'http://', 'https://') if url.startswith('https://') else e.link if urlfeed in urladded: continue desc = None if isfulltext: if hasattr(e, 'content') and e.content[0]['value']: desc = e.content[0]['value'] elif hasattr(e, 'description'): desc = e.description else: self.log.warn( 'fulltext feed item no has desc,link to webpage for article.(%s)' % e.title) urls.append((section, e.title, urlfeed, desc)) urladded.add(urlfeed) else: self.log.warn('fetch rss failed(%d):%s' % (result.code, url)) return urls
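# The age filter in the two ParseFeedUrls variants above boils down to: take whichever of
# updated/published/created is available on the entry and skip it if it is older than
# oldest_article days. A standalone sketch of just that check, assuming oldest_article is a
# day count as in the variants above (the function name is illustrative, not from the
# original classes):
import datetime

def is_too_old(entry, oldest_article_days):
    """Return True if the entry's newest timestamp is older than the configured limit."""
    updated = None
    for attr in ('updated_parsed', 'published_parsed', 'created_parsed'):
        value = getattr(entry, attr, None)
        if value:
            updated = value
            break
    if oldest_article_days <= 0 or updated is None:
        return False  # no limit configured, or no usable timestamp: keep the entry
    delta = datetime.datetime.utcnow() - datetime.datetime(*updated[:6])
    return delta.days * 86400 + delta.seconds > 86400 * oldest_article_days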
def cache(self):
    """cache newest feed"""
    xdm.common.MM.clearRole('news')
    self.news = []
    self.feed = feedparser.parse(XDM_FEED_URL)
    for e in self.feed.entries:
        tags = []
        for tag in e.tags:
            tags.append(tag['term'])
        self.news.append(SimpleNews(e.summary_detail.value, e.link, tags))
def get(self):
    rawdata = """
    <rss version="2.0">
    <channel>
    <title>Sample feed</title>
    </channel>
    </rss>
    """
    d = feedparser.parse(rawdata)
    self.response.write(d['feed']['title'])
    self.response.write('Hello world!')
def autofeed(bot):
    for feed in feeds:
        rss = feedparser.parse(feeds[feed]['url'])
        if bot.memory['feeds'][feed]['last_title'] == rss.feed.title:
            pass
        else:
            bot.msg(
                feeds[feed],
                rss.feed.title + ' :: ' + rss.entries[0].title + ': ' +
                rss.entries[0].summary_detail.value + ' - ' +
                rss.entries[0].updated + ' - ' +
                xrl.xrl_encoder(rss.entries[0].link))
            feeds[feed]['last_title'] = rss.feed.title
def get(self): # Incoming url parameters # iGoogle prepends var names with 'up_' for some reason... subreddits = self.request.get("up_subreddits") # Pipe separated list of subreddits for top menu width = self.request.get("up_width", "500") # Truncate headline chars, default 500 imgur_switch = self.request.get("up_imgur", 1) # Imgur mirror, 1=imgur, 2=mirur, 3=filmot feed = self.request.get("r", "all") # Current requested subreddit feed try: # Fetch and parse the feed rss = urlfetch.fetch(self.feed_to_url(feed), headers={"Cache-Control": "max-age=0"}) parsed = feedparser.parse(rss.content) stories = [] for entry in parsed.entries: comment_count = self.get_comment_count(entry.summary_detail) # Extract the external link url, and transform imgur links to requested mirror external_link = self.transform_url(self.get_external_link(entry.summary_detail), int(imgur_switch)) # Build a hash object for each story... parsed_story_hash = { "full_title": entry.title_detail.value, # Full, non-truncated title just in case "fixed_width_title": self.truncate(entry.title_detail.value, int(width)), "external_link": external_link, "comment_link": entry.link, "comment_count": comment_count, } # ... and append to the stories list stories.append(parsed_story_hash) # The main data is 'stories'. Everything else is # there to persist the URL parameters. template_data = { "subreddits": subreddits.split("|"), "link_subreddits": subreddits, "link_imgur": imgur_switch, "width": width, "current_feed": str(feed), "stories": stories, } # Finally, render the template with the data path = os.path.join(os.path.dirname(__file__), "index.html") self.response.out.write(template.render(path, template_data)) except: self.response.out.write( '<a style="font-size:1em;font-weight:bold;text-decoration:none;" href="http://www.downforeveryoneorjustme.com/reddit.com">is reddit down?</a>' )
def print_channel_info(url, num=0):
    feed = feedparser.parse(url)
    if feed['bozo'] == 1:
        raise Exception("RSS is not well-formed XML")
    items = feed["items"]
    print "------------------------------------------------------------"
    print feed["channel"]["title"], feed["channel"]["link"], " (", \
        len(items), ")"
    print "------------------------------------------------------------"
    print "First message:"
    print "+ ", items[num]["title"], " +"
    print plain_from_html(items[num]["summary"])
def get(self):
    breakingstories = getAllBreakingStories("bbc")
    feed = feedparser.parse('http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/breaking_news/rss.xml')
    articles = feed['entries']
    for a in articles:
        if not headlineInList(a['title'], breakingstories):
            breakingstory = BreakingStory()
            breakingstory.source = 'bbc'
            breakingstory.url = a['link']
            breakingstory.headline = a['title']
            breakingstory.description = a['description']
            breakingstory.put()
            self.response.out.write("added story: " + a['title'] + ' (' + a['link'] + ') <br />')
def fetch_feeds(request): feed_key = request.POST.get('feed_key', None) if feed_key is None: logging.error('missing parameter') raise TypeError('missing parameter') feed = Feed.get(feed_key) if feed is None: logging.error('Feed object not found: %s', feed_key) raise TypeError('Feed object not found') parser = feedparser.parse(feed.url) # check if feed exists if hasattr(feed, 'bozo_exception'): feed.is_valid = False logging.warn('Invalid feed: %s;;%s', feed.id, feed.url) feed.put() return # setup feed title if does not exist if not feed.title: feed.title = parser.feed.title rd = ReadyData.gql("WHERE data_type = :1 AND owner = :2 LIMIT 1", 'feed', feed.owner).get() if rd is None: rd = ReadyData(owner=feed.owner, data_type='feed') rd.content = '' for e in parser['entries']: # TODO - check the date article = '<h1>%(title)s</h1>' % e for content in e['content']: article += content['value'] rd.content += article rd.merged += 1 rd.put() feed.put() params = {'ready_data_key': rd.key()} taskqueue.add(url=reverse('fetcher-send'), params=params) return True
def update(user, auto_save=True): logging.debug('Checking in for %s' % user.username) poster = audioscrobbler.AudioScrobblerPost(username=user.username, password=user.password, password_is_md5=True, verbose=True) poster.auth() contents = urlfetch.fetch(user.rss_url).content # hack around the namespacing. contents = contents.replace('<rhap:', '<').replace('</rhap:', '</') tracks_played = feedparser.parse(contents) for i in xrange(len(tracks_played['entries']) - 1, -1, -1): # skip a track we've already submitted # see this date ridiculousness? Bleh. played_at = datetime.datetime( *tracks_played['entries'][i]['updated_parsed'][0:6]) if user.last_updated >= played_at: continue track = dict(artist_name=tracks_played['entries'][i]['artist'], song_title=tracks_played['entries'][i]['track'], length=tracks_played['entries'][i]['duration'], date_played=int( time.mktime( tracks_played['entries'][i]['updated_parsed'])), album=tracks_played['entries'][i]['album']) poster.add_track(**track) user.submitted_tracks.insert( 0, '%s - %s' % (track['artist_name'], track['song_title'])) # bulk submit num_submitted = poster.flush_cache() total_submitted = num_submitted + user.num_submitted logging.debug('\tSubmitted %d tracks (%d total)' % (num_submitted, total_submitted)) user.submitted_tracks = user.submitted_tracks[0:50] user.num_submitted = total_submitted if auto_save: user.put()
def addFeed(feedUrl): """ Function to add a new feed to the database. """ try: feedUrl = autorss.getRSSLink(feedUrl) feedData = feedparser.parse(feedUrl) except: #this never occurs since parser does not raise any exceptions when invalid url is sent print "Invalid feed Url!" raise FeedError else: try: newFeed = Feed(url = unicode(feedUrl), title = feedData.feed.title, lastModified = time.mktime(feedData.modified), etag = unicode(feedData.etag)) session.commit() except AttributeError: session.rollback() print "Error! Invalid feed URL" raise FeedError except: session.rollback() print "%s \t Feed already subscribed" % (feedData.feed.title) raise FeedError else: try: # Get the topics list and assign the feed to all the available topics. topicsList = Topic.query.all() for topic in topicsList: setFeedTopic(newFeed, topic, False) session.commit() print "Added %s to all topics" % newFeed.title except: session.rollback() print "Error setting up topics to the Feed" raise FeedError print "Subscribed to \t %s " % (feedData.feed.title) fetchFeeds(newFeed, feedData) topicsList = Topic.query.all()
def nzbs(provider=None, forcerss=False): feedthis = [] def _parse_feed(site, url, verify): logger.fdebug('[RSS] Fetching items from ' + site) payload = None headers = {'User-Agent': str(mylar.USER_AGENT)} try: r = requests.get(url, params=payload, verify=verify, headers=headers) except Exception, e: logger.warn('Error fetching RSS Feed Data from %s: %s' % (site, e)) return feedme = feedparser.parse(r.content) feedthis.append({"site": site, "feed": feedme})
def get_train_info(list_lines: list) -> list:
    """
    Get information for the given lines from the tetsudo.com RSS feed.
    NOTE: the title seems to come out as 【{{railway operator?}}】 followed by the line names
          joined with "・" (as of 2020/07/05), but the pattern still needs to be confirmed.
          ex) 【JR九州】鹿児島本線・肥薩線・指宿枕崎線・日豊本線・日南線・吉都線・特急列車
          We match against it with `in`, so an operator name also works and matching is fuzzy.
    """
    RSS_URL = "http://api.tetsudo.com/traffic/rss20.xml"
    JST = timezone(timedelta(hours=+9))
    now = datetime.now(JST)
    rss_data = feedparser.parse(RSS_URL)
    # NOTE: we take a fixed window of the last hour instead of the time elapsed since the
    #       previous run, so some items may be missed depending on when `now` falls.
    #       That is accepted here to avoid having to store state somewhere.
    recent_entry = filter(
        lambda entry:
        # TODO: a fixed 1 hour is used here; widen the acceptance window a bit later
        now - datetime_from_rfc822(entry['published']) < timedelta(hours=1)
        and any(line in entry['title'] for line in list_lines),
        rss_data['entries'])
    for entry in recent_entry:
        # TODO: remove later (kept around for a while for verification)
        print(entry)
        yield entry
def Startit(searchName, searchIssue, searchYear, ComicVersion, IssDateFix): #searchName = "Uncanny Avengers" #searchIssue = "01" #searchYear = "2012" #clean up searchName due to webparse. searchName = searchName.replace("%20", " ") if "," in searchName: searchName = searchName.replace(",", "") logger.fdebug("name:" + str(searchName)) logger.fdebug("issue:" + str(searchIssue)) logger.fdebug("year:" + str(searchYear)) splitSearch = searchName.split(" ") joinSearch = "+".join(splitSearch) + "+" + searchIssue searchIsOne = "0" + searchIssue searchIsTwo = "00" + searchIssue if "-" in searchName: searchName = searchName.replace("-", '((\\s)?[-:])?(\\s)?') regexName = searchName.replace(" ", '((\\s)?[-:])?(\\s)?') #logger.fdebug('searchName:' + searchName) #logger.fdebug('regexName:' + regexName) if mylar.USE_MINSIZE: size_constraints = "minsize=" + str(mylar.MINSIZE) else: size_constraints = "minsize=10" if mylar.USE_MAXSIZE: size_constraints = size_constraints + "&maxsize=" + str(mylar.MAXSIZE) if mylar.USENET_RETENTION != None: max_age = "&age=" + str(mylar.USENET_RETENTION) feeds = [] feeds.append( feedparser.parse( "http://nzbindex.nl/rss/alt.binaries.comics.dcp/?sort=agedesc&" + str(size_constraints) + str(max_age) + "&dq=%s&max=50&more=1" % joinSearch)) if mylar.ALTEXPERIMENTAL: feeds.append( feedparser.parse( "http://nzbindex.nl/rss/?dq=%s&g[]=41&g[]=510&sort=agedesc&hidespam=0&max=&more=1" % joinSearch)) entries = [] mres = {} tallycount = 0 for feed in feeds: totNum = len(feed.entries) tallycount += len(feed.entries) #keyPair = {} keyPair = [] regList = [] countUp = 0 logger.fdebug(str(totNum) + " results") while countUp < totNum: urlParse = feed.entries[countUp].enclosures[0] #keyPair[feed.entries[countUp].title] = feed.entries[countUp].link #keyPair[feed.entries[countUp].title] = urlParse["href"] keyPair.append({ "title": feed.entries[countUp].title, "link": urlParse["href"], "length": urlParse["length"], "pubdate": feed.entries[countUp].updated }) countUp = countUp + 1 # thanks to SpammyHagar for spending the time in compiling these regEx's! regExTest = "" regEx = "(%s\\s*(0)?(0)?%s\\s*\\(%s\\))" % (regexName, searchIssue, searchYear) regExOne = "(%s\\s*(0)?(0)?%s\\s*\\(.*?\\)\\s*\\(%s\\))" % ( regexName, searchIssue, searchYear) #Sometimes comics aren't actually published the same year comicVine says - trying to adjust for these cases regExTwo = "(%s\\s*(0)?(0)?%s\\s*\\(%s\\))" % (regexName, searchIssue, int(searchYear) + 1) regExThree = "(%s\\s*(0)?(0)?%s\\s*\\(%s\\))" % ( regexName, searchIssue, int(searchYear) - 1) regExFour = "(%s\\s*(0)?(0)?%s\\s*\\(.*?\\)\\s*\\(%s\\))" % ( regexName, searchIssue, int(searchYear) + 1) regExFive = "(%s\\s*(0)?(0)?%s\\s*\\(.*?\\)\\s*\\(%s\\))" % ( regexName, searchIssue, int(searchYear) - 1) regexList = [ regEx, regExOne, regExTwo, regExThree, regExFour, regExFive ] except_list = [ 'releases', 'gold line', 'distribution', '0-day', '0 day' ] for entry in keyPair: title = entry['title'] #logger.fdebug("titlesplit: " + str(title.split("\""))) splitTitle = title.split("\"") noYear = 'False' for subs in splitTitle: #logger.fdebug('sub:' + subs) regExCount = 0 if len(subs) > 10 and not any(d in subs.lower() for d in except_list): #Looping through dictionary to run each regEx - length + regex is determined by regexList up top. 
# while regExCount < len(regexList): # regExTest = re.findall(regexList[regExCount], subs, flags=re.IGNORECASE) # regExCount = regExCount +1 # if regExTest: # logger.fdebug(title) # entries.append({ # 'title': subs, # 'link': str(link) # }) if IssDateFix != "no": if IssDateFix == "01" or IssDateFix == "02": ComicYearFix = str(int(searchYear) - 1) else: ComicYearFix = str(int(searchYear) + 1) else: ComicYearFix = searchYear if searchYear not in subs and ComicYearFix not in subs: noYear = 'True' noYearline = subs if (searchYear in subs or ComicYearFix in subs) and noYear == 'True': #this would occur on the next check in the line, if year exists and #the noYear check in the first check came back valid append it subs = noYearline + ' (' + searchYear + ')' noYear = 'False' if noYear == 'False': entries.append({ 'title': subs, 'link': entry['link'], 'pubdate': entry['pubdate'], 'length': entry['length'] }) break # break out so we don't write more shit. # if len(entries) >= 1: if tallycount >= 1: mres['entries'] = entries return mres # print("Title: "+regList[0]) # print("Link: "+keyPair[regList[0]]) else: logger.fdebug("No Results Found") return "no results"
def torrents(pickfeed=None, seriesname=None, issue=None): if pickfeed is None: pickfeed = 1 #else: # print "pickfeed is " + str(pickfeed) passkey = mylar.CBT_PASSKEY srchterm = None if seriesname: srchterm = re.sub(' ', '%20', seriesname) if issue: srchterm += '%20' + str(issue) if mylar.KAT_PROXY: if mylar.KAT_PROXY.endswith('/'): kat_url = mylar.KAT_PROXY else: kat_url = mylar.KAT_PROXY + '/' else: kat_url = 'http://kat.ph/' if pickfeed == "1": # cbt rss feed based on followlist feed = "http://comicbt.com/rss.php?action=browse&passkey=" + str( passkey) + "&type=dl" elif pickfeed == "2" and srchterm is not None: # kat.ph search feed = kat_url + "usearch/" + str( srchterm) + "%20category%3Acomics%20seeds%3A1/?rss=1" elif pickfeed == "3": # kat.ph rss feed feed = kat_url + "usearch/category%3Acomics%20seeds%3A1/?rss=1" elif pickfeed == "4": #cbt follow link feed = "http://comicbt.com/rss.php?action=follow&passkey=" + str( passkey) + "&type=dl" elif pickfeed == "5": # cbt series link # seriespage = "http://comicbt.com/series.php?passkey=" + str(passkey) feed = "http://comicbt.com/rss.php?action=series&series=" + str( seriesno) + "&passkey=" + str(passkey) else: logger.error('invalid pickfeed denoted...') return title = [] link = [] description = [] seriestitle = [] if pickfeed == "5": # we need to get the series # first seriesSearch(seriespage, seriesname) feedme = feedparser.parse(feed) i = 0 feeddata = [] myDB = db.DBConnection() torthekat = [] katinfo = {} for entry in feedme['entries']: if pickfeed == "3": tmpsz = feedme.entries[i].enclosures[0] feeddata.append({ 'Site': 'KAT', 'Title': feedme.entries[i].title, 'Link': tmpsz['url'], 'Pubdate': feedme.entries[i].updated, 'Size': tmpsz['length'] }) elif pickfeed == "2": tmpsz = feedme.entries[i].enclosures[0] torthekat.append({ 'site': 'KAT', 'title': feedme.entries[i].title, 'link': tmpsz['url'], 'pubdate': feedme.entries[i].updated, 'length': tmpsz['length'] }) #print ("Site: KAT") #print ("Title: " + str(feedme.entries[i].title)) #print ("Link: " + str(tmpsz['url'])) #print ("pubdate: " + str(feedme.entries[i].updated)) #print ("size: " + str(tmpsz['length'])) elif pickfeed == "1" or pickfeed == "4": # tmpsz = feedme.entries[i].enclosures[0] feeddata.append({ 'Site': 'CBT', 'Title': feedme.entries[i].title, 'Link': feedme.entries[i].link, 'Pubdate': feedme.entries[i].updated # 'Size': tmpsz['length'] }) #print ("Site: CBT") #print ("Title: " + str(feeddata[i]['Title'])) #print ("Link: " + str(feeddata[i]['Link'])) #print ("pubdate: " + str(feeddata[i]['Pubdate'])) i += 1 logger.fdebug('there were ' + str(i) + ' results..') if not seriesname: rssdbupdate(feeddata, i, 'torrent') else: katinfo['entries'] = torthekat return katinfo return
def nzbs(provider=None): nzbprovider = [] nzbp = 0 if mylar.NZBSU == 1: nzbprovider.append('nzb.su') nzbp += 1 if mylar.DOGNZB == 1: nzbprovider.append('dognzb') nzbp += 1 # -------- # Xperimental if mylar.EXPERIMENTAL == 1: nzbprovider.append('experimental') nzbp += 1 newznabs = 0 newznab_hosts = [] if mylar.NEWZNAB == 1: for newznab_host in mylar.EXTRA_NEWZNABS: if newznab_host[4] == '1' or newznab_host[4] == 1: newznab_hosts.append(newznab_host) nzbprovider.append('newznab') newznabs += 1 logger.fdebug('newznab name:' + str(newznab_host[0]) + ' - enabled: ' + str(newznab_host[4])) # -------- providercount = int(nzbp + newznabs) logger.fdebug('there are : ' + str(providercount) + ' RSS search providers you have enabled.') nzbpr = providercount - 1 if nzbpr < 0: nzbpr == 0 feeddata = [] feedthis = [] ft = 0 totNum = 0 nonexp = "no" while (nzbpr >= 0): if nzbprovider[nzbpr] == 'experimental': feed = feedparser.parse( "http://nzbindex.nl/rss/alt.binaries.comics.dcp/?sort=agedesc&max=50&more=1" ) totNum = len(feed.entries) site = 'experimental' keyPair = {} regList = [] entries = [] mres = {} countUp = 0 i = 0 for entry in feed['entries']: tmpsz = feed.entries[i].enclosures[0] feeddata.append({ 'Site': site, 'Title': feed.entries[i].title, 'Link': tmpsz['url'], #feed.entries[i].link, 'Pubdate': feed.entries[i].updated, 'Size': tmpsz['length'] }) # print ("Site:" + str(site)) # print ("Title:" + str(feed.entries[i].title)) # print ("Link:" + str(feed.entries[i].link)) # print ("Pubdate:" + str(feed.entries[i].updated)) # print ("Size:" + str(tmpsz['length'])) i += 1 logger.info(str(i) + ' results from Experimental feed indexed.') nzbpr -= 1 else: if nzbprovider[nzbpr] == 'newznab': for newznab_host in newznab_hosts: if newznab_host[3] is None: newznabuid = '1' newznabcat = '7030' else: if '#' not in newznab_host[3]: newznabuid = newznab_host[3] newznabcat = '7030' else: newzst = newznab_host[3].find('#') newznabuid = newznab_host[3][:newzst] newznabcat = newznab_host[3][newzst + 1:] feed = newznab_host[1].rstrip() + '/rss?t=' + str( newznabcat) + '&dl=1&i=' + str( newznabuid) + '&r=' + newznab_host[2].rstrip() feedme = feedparser.parse(feed) site = newznab_host[0].rstrip() feedthis.append({"feed": feedme, "site": site}) totNum += len(feedme.entries) ft += 1 nonexp = "yes" nzbpr -= 1 elif nzbprovider[nzbpr] == 'nzb.su': if mylar.NZBSU_UID is None: mylar.NZBSU_UID = '1' feed = 'http://api.nzb.su/rss?t=7030&dl=1&i=' + mylar.NZBSU_UID + '&r=' + mylar.NZBSU_APIKEY feedme = feedparser.parse(feed) site = nzbprovider[nzbpr] feedthis.append({"feed": feedme, "site": site}) totNum += len(feedme.entries) ft += 1 nonexp = "yes" nzbpr -= 1 elif nzbprovider[nzbpr] == 'dognzb': if mylar.DOGNZB_UID is None: mylar.DOGNZB_UID = '1' feed = 'https://dognzb.cr/rss.cfm?r=' + mylar.DOGNZB_APIKEY + '&t=7030' feedme = feedparser.parse(feed) site = nzbprovider[nzbpr] ft += 1 nonexp = "yes" feedthis.append({"feed": feedme, "site": site}) totNum += len(feedme.entries) nzbpr -= 1 i = 0 if nonexp == "yes": #print str(ft) + " sites checked. There are " + str(totNum) + " entries to be updated." 
#print feedme for ft in feedthis: sitei = 0 site = ft['site'] logger.fdebug(str(site) + " now being updated...") logger.fdebug('feedthis:' + str(ft)) for entry in ft['feed'].entries: if site == 'dognzb': #because the rss of dog doesn't carry the enclosure item, we'll use the newznab size value tmpsz = 0 #for attr in entry['newznab:attrib']: # if attr('@name') == 'size': # tmpsz = attr['@value'] # logger.fdebug('size retrieved as ' + str(tmpsz)) # break feeddata.append({ 'Site': site, 'Title': entry.title, #ft['feed'].entries[i].title, 'Link': entry.link, #ft['feed'].entries[i].link, 'Pubdate': entry.updated, #ft['feed'].entries[i].updated, 'Size': tmpsz }) else: #this should work for all newznabs (nzb.su included) #only difference is the size of the file between this and above (which is probably the same) tmpsz = entry.enclosures[ 0] #ft['feed'].entries[i].enclosures[0] feeddata.append({ 'Site': site, 'Title': entry.title, #ft['feed'].entries[i].title, 'Link': entry.link, #ft['feed'].entries[i].link, 'Pubdate': entry.updated, #ft['feed'].entries[i].updated, 'Size': tmpsz['length'] }) #logger.fdebug("Site: " + str(feeddata[i]['Site'])) #logger.fdebug("Title: " + str(feeddata[i]['Title'])) #logger.fdebug("Link: " + str(feeddata[i]['Link'])) #logger.fdebug("pubdate: " + str(feeddata[i]['Pubdate'])) #logger.fdebug("size: " + str(feeddata[i]['Size'])) sitei += 1 logger.info(str(site) + ' : ' + str(sitei) + ' entries indexed.') i += sitei logger.info( '[RSS] ' + str(i) + ' entries have been indexed and are now going to be stored for caching.' ) rssdbupdate(feeddata, i, 'usenet') return
def TDL(book=None, test=False): errmsg = '' provider = "torrentdownloads" host = lazylibrarian.CONFIG['TDL_HOST'] if not host.startswith('http'): host = 'http://' + host providerurl = url_fix(host) params = {"type": "search", "cid": "2", "search": book['searchterm']} searchURL = providerurl + "/rss.xml?%s" % urlencode(params) sterm = makeUnicode(book['searchterm']) data, success = fetchURL(searchURL) if not success: # may return 404 if no results, not really an error if '404' in data: logger.debug("No results found from %s for %s" % (provider, sterm)) success = True else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, data)) errmsg = data data = False if test: return success results = [] minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 if data: logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) d = feedparser.parse(data) if len(d.entries): for item in d.entries: try: title = item['title'] seeders = int(item['seeders']) link = item['link'] size = int(item['size']) url = None if link and minimumseeders < int(seeders): # no point requesting the magnet link if not enough seeders # TDL gives us a relative link result, success = fetchURL(providerurl + link) if success: new_soup = BeautifulSoup(result, 'html5lib') for link in new_soup.find_all('a'): output = link.get('href') if output and output.startswith('magnet'): url = output break if not url or not title: logger.debug('Missing url or title') else: results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': 'magnet', 'priority': lazylibrarian.CONFIG['TDL_DLPRIORITY'] }) logger.debug('Found %s. Size: %s' % (title, size)) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error("An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm)) return results, errmsg
def LIME(book=None, test=False): errmsg = '' provider = "Limetorrent" host = lazylibrarian.CONFIG['LIME_HOST'] if not host.startswith('http'): host = 'http://' + host params = {"q": book['searchterm']} providerurl = url_fix(host + "/searchrss/other") searchURL = providerurl + "?%s" % urlencode(params) sterm = makeUnicode(book['searchterm']) data, success = fetchURL(searchURL) if not success: # may return 404 if no results, not really an error if '404' in data: logger.debug("No results found from %s for %s" % (provider, sterm)) success = True else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, data)) errmsg = data data = False if test: return success results = [] minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 if data: logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) d = feedparser.parse(data) if len(d.entries): for item in d.entries: try: title = unaccented(item['title']) try: seeders = item['description'] seeders = int( seeders.split('Seeds:')[1].split(',')[0].strip()) except (IndexError, ValueError): seeders = 0 size = item['size'] try: size = int(size) except ValueError: size = 0 url = None for link in item['links']: if 'x-bittorrent' in link['type']: url = link['url'] if not url or not title: logger.debug('No url or title found') elif minimumseeders < int(seeders): results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': 'torrent', 'priority': lazylibrarian.CONFIG['LIME_DLPRIORITY'] }) logger.debug('Found %s. Size: %s' % (title, size)) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: if 'forbidden' in str(e).lower(): # may have ip based access limits logger.error( 'Access forbidden. Please wait a while before trying %s again.' % provider) else: logger.error("An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm)) return results, errmsg
def ZOO(book=None, test=False): errmsg = '' provider = "zooqle" host = lazylibrarian.CONFIG['ZOO_HOST'] if not host.startswith('http'): host = 'http://' + host providerurl = url_fix(host + "/search") params = {"q": book['searchterm'], "category": "books", "fmt": "rss"} searchURL = providerurl + "?%s" % urlencode(params) sterm = makeUnicode(book['searchterm']) data, success = fetchURL(searchURL) if not success: # may return 404 if no results, not really an error if '404' in data: logger.debug("No results found from %s for %s" % (provider, sterm)) success = True else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, data)) errmsg = data data = False if test: return success results = [] minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 if data: logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) d = feedparser.parse(data) if len(d.entries): for item in d.entries: try: title = unaccented(item['title']) seeders = int(item['torrent_seeds']) link = item['links'][1]['href'] size = int(item['links'][1]['length']) magnet = item['torrent_magneturi'] url = None mode = 'torrent' if link: url = link mode = 'torrent' if magnet: if not url or (url and lazylibrarian.CONFIG['PREFER_MAGNET']): url = magnet mode = 'magnet' if not url or not title: logger.debug('No url or title found') elif minimumseeders < int(seeders): results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': mode, 'priority': lazylibrarian.CONFIG['ZOO_DLPRIORITY'] }) logger.debug('Found %s. Size: %s' % (title, size)) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: if 'forbidden' in str(e).lower(): # looks like zooqle has ip based access limits logger.error( 'Access forbidden. Please wait a while before trying %s again.' % provider) else: logger.error("An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm)) return results, errmsg
def EXTRA(book=None, test=False): errmsg = '' provider = "Extratorrent" host = lazylibrarian.CONFIG['EXTRA_HOST'] if not host.startswith('http'): host = 'http://' + host providerurl = url_fix(host + "/rss") params = {"type": "search", "s_cat": "2", "search": book['searchterm']} searchURL = providerurl + "/?%s" % urlencode(params) sterm = makeUnicode(book['searchterm']) data, success = fetchURL(searchURL) if not success: # may return 404 if no results, not really an error if '404' in data: logger.debug("No results found from %s for %s" % (provider, sterm)) success = True else: logger.debug('Error fetching data from %s: %s' % (provider, data)) errmsg = data data = False if test: return success results = [] minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 if data: logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) d = feedparser.parse(data) if len(d.entries): for item in d.entries: try: title = unaccented(item['title']) try: seeders = int(item['seeders']) except ValueError: seeders = 0 try: size = int(item['size']) except ValueError: size = 0 url = None for link in item['links']: if 'x-bittorrent' in link['type']: url = link['href'] if not url or not title: logger.debug('No url or title found') elif minimumseeders < int(seeders): results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': 'torrent', 'priority': lazylibrarian.CONFIG['EXTRA_DLPRIORITY'] }) logger.debug('Found %s. Size: %s' % (title, size)) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error("An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm)) return results, errmsg
def fetch(self): import BlogPost # don't fetch internally hosted blogs if not self.from_feed: return events = [] # parse and iterate the feed entries = feedparser.parse(self.rss).entries for post in entries: try: date = dateutil.parser.parse(post.date).replace(tzinfo=None) except: date = datetime.datetime.utcnow() # don't re-add old posts if self.most_recent_date >= date: continue try: content = post.content[0].value except: content = post.description try: author_name = post.author_detail["name"] except: author_name = None # sanitize the post's content content = sanitize(content, [ "h1", "h2", "h3", "h4", "h5", "h6", "a:href", "p", "ul", "ol", "li", "br", "div", "img:src:alt:title", "b", "i", "u", "strong", "em", "table", "tbody", "td", "th", "thead", "tfoot", "pre", "tt", "code" ]) # format a summary for the post summary = sanitize(content, [], strip_tags=[ "h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "ol", "li", "br", "div", 'a', "b", "i", "u", "strong", "em", "pre", "tt", "code" ]) if len(summary) > 500: summary = summary[0:500] + u"..." summary = "<p>" + summary + "</p>" events.append( self.add_event(BlogPost.BlogPost, title=post.title, summary=summary, from_feed=True, author_name=author_name, date=date, extra_args={ "external_link": post.link, "content": content, "blog_id": self.id })) # find the new most recent date dates = [event.date for event in events if event is not None] dates.append(self.most_recent_date) self.most_recent_date = max(dates) self.save()
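# Hedged sketch of the fallback order fetch() applies to each feedparser entry:
# prefer the full content payload, then the description, then an empty string.
def entry_body(post):
    if getattr(post, 'content', None):
        return post.content[0].value
    return getattr(post, 'description', u'')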
def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] tnow = datetime.datetime.utcnow() urladded = set() for feed in self.feeds: section, url = feed[0], feed[1].replace('gzh', 'gzhjs') isfulltext = feed[2] if len(feed) > 2 else False timeout = self.timeout + 10 if isfulltext else self.timeout opener = URLOpener(self.host, timeout=timeout) result = opener.open(url) if result.status_code == 200 and result.content: if self.feed_encoding: try: content = result.content.decode(self.feed_encoding) except UnicodeDecodeError: content = AutoDecoder(True).decode(result.content, opener.realurl, result.headers) else: content = AutoDecoder(True).decode(result.content, opener.realurl, result.headers) content = content[content.index('{'):content.index('}') + 1] content = json.loads(content) for e in content['items'][:self.max_articles_per_feed]: e = feedparser.parse(e)['entries'][0] updated = None if hasattr(e, 'lastmodified') and e.lastmodified: updated = float(e.lastmodified) if self.oldest_article > 0 and updated: updated = datetime.datetime.utcfromtimestamp(updated) delta = tnow - updated if self.oldest_article > 365: threshold = self.oldest_article # in seconds else: threshold = 86400 * self.oldest_article # in days if delta.days * 86400 + delta.seconds > threshold: self.log.info("Skip old article(%s): %s" % (updated.strftime('%Y-%m-%d %H:%M:%S'), e.href)) continue # support HTTPS if hasattr(e, 'href'): if url.startswith('https://'): urlfeed = e.href.replace('http://', 'https://') else: urlfeed = e.href if urlfeed in urladded: continue else: urlfeed = '' desc = None urls.append((section, e.title, urlfeed, desc)) urladded.add(urlfeed) else: self.log.warn('fetch rss failed(%d):%s' % (result.status_code, url)) return urls
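# Hedged sketch of the age filter above, as a standalone check: oldest_article
# values above 365 are treated as seconds, smaller values as days, matching the
# threshold logic in ParseFeedUrls().
import datetime

def is_too_old(updated, oldest_article):
    if oldest_article <= 0 or updated is None:
        return False
    delta = datetime.datetime.utcnow() - updated
    threshold = oldest_article if oldest_article > 365 else 86400 * oldest_article
    return delta.days * 86400 + delta.seconds > threshold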
def RSS(host=None, feednr=None, priority=0, test=False): """ Generic RSS query function, just return all the results from the RSS feed in a list """ results = [] URL = host if not str(URL)[:4] == "http": URL = 'http://' + URL result, success = fetchURL(URL) if test: return success if success: data = feedparser.parse(result) else: logger.error('Error fetching data from %s: %s' % (host, result)) BlockProvider(host, result) data = None if data: # to debug because of api logger.debug('Parsing results from %s' % URL) provider = data['feed']['link'] logger.debug("RSS %s returned %i result%s" % (provider, len(data.entries), plural(len(data.entries)))) for post in data.entries: title = None magnet = None size = None torrent = None nzb = None url = None tortype = 'torrent' if 'title' in post: title = post.title if 'links' in post: for f in post.links: if 'x-bittorrent' in f['type']: size = f['length'] torrent = f['href'] break if 'x-nzb' in f['type']: size = f['length'] nzb = f['href'] break if 'torrent_magneturi' in post: magnet = post.torrent_magneturi if torrent: url = torrent tortype = 'torrent' if magnet: if not url or (url and lazylibrarian.CONFIG['PREFER_MAGNET']): url = magnet tortype = 'magnet' if nzb: # prefer nzb over torrent/magnet url = nzb tortype = 'nzb' if not url: if 'link' in post: url = post.link tor_date = 'Fri, 01 Jan 1970 00:00:00 +0100' if 'newznab_attr' in post: if post.newznab_attr['name'] == 'usenetdate': tor_date = post.newznab_attr['value'] if not size: size = 1000 if title and url: results.append({ 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_date': tor_date, 'tor_feed': feednr, 'tor_type': tortype, 'priority': priority }) else: logger.debug('No data returned from %s' % host) return results
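# Hedged usage sketch, not from the original source: RSS() returns a flat list of
# dicts, so a caller can split it by the 'tor_type' the function assigned; nzb is
# already preferred over magnet/torrent inside RSS() itself.
def split_rss_results(host, feednr):
    entries = RSS(host=host, feednr=feednr)
    nzbs = [e for e in entries if e['tor_type'] == 'nzb']
    torrent_links = [e for e in entries if e['tor_type'] in ('torrent', 'magnet')]
    return nzbs, torrent_links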
def torrents(pickfeed=None, seriesname=None, issue=None, feedinfo=None): if pickfeed is None: return srchterm = None if seriesname: srchterm = re.sub(' ', '%20', seriesname) if issue: srchterm += '%20' + str(issue) if mylar.KAT_PROXY: if mylar.KAT_PROXY.endswith('/'): kat_url = mylar.KAT_PROXY else: kat_url = mylar.KAT_PROXY + '/' else: #switched to https. kat_url = 'https://kat.cr/' if pickfeed == 'KAT': #we need to cycle through both categories (comics & other) - so we loop. loopit = 2 else: loopit = 1 lp = 0 totalcount = 0 title = [] link = [] description = [] seriestitle = [] feeddata = [] myDB = db.DBConnection() torthekat = [] torthe32p = [] torinfo = {} while (lp < loopit): if lp == 0 and loopit == 2: pickfeed = '2' elif lp == 1 and loopit == 2: pickfeed = '5' feedtype = None if pickfeed == "1" and mylar.ENABLE_32P: # 32pages new releases feed. feed = 'https://32pag.es/feeds.php?feed=torrents_all&user=' + feedinfo['user'] + '&auth=' + feedinfo['auth'] + '&passkey=' + feedinfo['passkey'] + '&authkey=' + feedinfo['authkey'] feedtype = ' from the New Releases RSS Feed for comics' verify = bool(mylar.VERIFY_32P) elif pickfeed == "2" and srchterm is not None: # kat.ph search feed = kat_url + "usearch/" + str(srchterm) + "%20category%3Acomics%20seeds%3A" + str(mylar.MINSEEDS) + "/?rss=1" verify = bool(mylar.KAT_VERIFY) elif pickfeed == "3": # kat.ph rss feed feed = kat_url + "usearch/category%3Acomics%20seeds%3A" + str(mylar.MINSEEDS) + "/?rss=1" feedtype = ' from the New Releases RSS Feed for comics' verify = bool(mylar.KAT_VERIFY) elif pickfeed == "4": #32p search if any([mylar.USERNAME_32P is None, mylar.USERNAME_32P == '', mylar.PASSWORD_32P is None, mylar.PASSWORD_32P == '']): logger.error('[RSS] Warning - you NEED to enter in your 32P Username and Password to use this option.') lp += 1 continue if mylar.MODE_32P == 0: logger.warn('[32P] Searching is not available in 32p Legacy mode. Switch to Auth mode to use the search functionality.') lp += 1 continue return elif pickfeed == "5" and srchterm is not None: # kat.ph search (category:other since some 0-day comics initially get thrown there until categorized) feed = kat_url + "usearch/" + str(srchterm) + "%20category%3Aother%20seeds%3A1/?rss=1" verify = bool(mylar.KAT_VERIFY) elif pickfeed == "6": # kat.ph rss feed (category:other so that we can get them quicker if need-be) feed = kat_url + "usearch/.cbr%20category%3Aother%20seeds%3A" + str(mylar.MINSEEDS) + "/?rss=1" feedtype = ' from the New Releases for category Other RSS Feed that contain comics' verify = bool(mylar.KAT_VERIFY) elif int(pickfeed) >= 7 and feedinfo is not None: #personal 32P notification feeds.
#get the info here feed = 'https://32pag.es/feeds.php?feed=' + feedinfo['feed'] + '&user=' + feedinfo['user'] + '&auth=' + feedinfo['auth'] + '&passkey=' + feedinfo['passkey'] + '&authkey=' + feedinfo['authkey'] + '&name=' + feedinfo['feedname'] feedtype = ' from your Personal Notification Feed : ' + feedinfo['feedname'] verify = bool(mylar.VERIFY_32P) else: logger.error('invalid pickfeed denoted...') return if pickfeed == "3" or pickfeed == "6" or pickfeed == "2" or pickfeed == "5": picksite = 'KAT' elif pickfeed == "1" or pickfeed == "4" or int(pickfeed) > 7: picksite = '32P' if pickfeed != '4': payload = None try: r = requests.get(feed, params=payload, verify=verify) except Exception, e: logger.warn('Error fetching RSS Feed Data from %s: %s' % (picksite, e)) return feedme = feedparser.parse(r.content) #feedme = feedparser.parse(feed) i = 0 if pickfeed == '4': for entry in searchresults['entries']: justdigits = entry['file_size'] #size not available in follow-list rss feed seeddigits = entry['seeders'] #number of seeders not available in follow-list rss feed if int(seeddigits) >= int(mylar.MINSEEDS): torthe32p.append({ 'site': picksite, 'title': entry['torrent_seriesname'].lstrip() + ' ' + entry['torrent_seriesvol'] + ' #' + entry['torrent_seriesiss'], 'volume': entry['torrent_seriesvol'], # not stored by mylar yet. 'issue': entry['torrent_seriesiss'], # not stored by mylar yet. 'link': entry['torrent_id'], #just the id for the torrent 'pubdate': entry['pubdate'], 'size': entry['file_size'], 'seeders': entry['seeders'], 'files': entry['num_files'] }) i += 1 else: for entry in feedme['entries']: if any([pickfeed == "3", pickfeed == "6"]): tmpsz = feedme.entries[i].enclosures[0] feeddata.append({ 'site': picksite, 'title': feedme.entries[i].title, 'link': tmpsz['url'], 'pubdate': feedme.entries[i].updated, 'size': tmpsz['length'] }) elif any([pickfeed == "2", pickfeed == "5"]): tmpsz = feedme.entries[i].enclosures[0] torthekat.append({ 'site': picksite, 'title': feedme.entries[i].title, 'link': tmpsz['url'], 'pubdate': feedme.entries[i].updated, 'size': tmpsz['length'] }) elif pickfeed == "1" or int(pickfeed) > 7: tmpdesc = feedme.entries[i].description st_pub = feedme.entries[i].title.find('(') st_end = feedme.entries[i].title.find(')') pub = feedme.entries[i].title[st_pub + 1:st_end] # +1 to not include ( #logger.fdebug('publisher: ' + re.sub("'",'', pub).strip()) #publisher sometimes is given within quotes for some reason, strip 'em. vol_find = feedme.entries[i].title.find('vol.') series = feedme.entries[i].title[st_end + 1:vol_find].strip() series = re.sub('&amp;', '&', series).strip() #logger.fdebug('series title: ' + series) iss_st = feedme.entries[i].title.find(' - ', vol_find) vol = re.sub('\.', '', feedme.entries[i].title[vol_find:iss_st]).strip() #logger.fdebug('volume #: ' + str(vol)) issue = feedme.entries[i].title[iss_st + 3:].strip() #logger.fdebug('issue # : ' + str(issue)) #break it down to get the Size since it's available on THIS 32P feed only so far. #when it becomes available in the new feeds, this will be working, for now it just nulls out. sizestart = tmpdesc.find('Size:') justdigits = 0 if sizestart >= 0: sizeend = tmpdesc.find('Leechers:') sizestart += 5 # to get to the end of the word 'Size:' tmpsize = tmpdesc[sizestart:sizeend].strip() fdigits = re.sub("[^0123456789\.]", "", tmpsize).strip() if '.' in fdigits: decfind = fdigits.find('.') wholenum = fdigits[:decfind] decnum = fdigits[decfind + 1:] else: wholenum = fdigits decnum = 0 if 'MB' in tmpsize: wholebytes = int(wholenum) * 1048576 wholedecimal = (int(decnum) * 1048576) / 100 justdigits = wholebytes + wholedecimal else: #it's 'GB' then wholebytes = (int(wholenum) * 1024) * 1048576 wholedecimal = ((int(decnum) * 1024) * 1048576) / 100 justdigits = wholebytes + wholedecimal #this is not currently working for 32p #Get the # of seeders. #seedstart = tmpdesc.find('Seeders:') #seedend = tmpdesc.find('Added:') #seedstart +=8 # to get to the end of the word 'Seeders:' #tmpseed = tmpdesc[seedstart:seedend].strip() #seeddigits = re.sub("[^0123456789\.]", "", tmpseed).strip() seeddigits = 0 if int(mylar.MINSEEDS) >= int(seeddigits): link = feedme.entries[i].link linkst = link.find('&id') linken = link.find('&', linkst + 1) if linken == -1: linken = len(link) newlink = re.sub('&id=', '', link[linkst:linken]).strip() feeddata.append({ 'site': picksite, 'title': series.lstrip() + ' ' + vol + ' #' + issue, 'volume': vol, # not stored by mylar yet. 'issue': issue, # not stored by mylar yet. 'link': newlink, #just the id for the torrent 'pubdate': feedme.entries[i].updated, 'size': justdigits }) i += 1 if feedtype is None: logger.info('[' + picksite + '] there were ' + str(i) + ' results..') else: logger.info('[' + picksite + '] there were ' + str(i) + ' results' + feedtype) totalcount += i lp += 1
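# Hedged sketch of the size parsing in the 32P branch above: turn a 'Size: 12.34 MB'
# style fragment into bytes, mirroring the whole/decimal split (the /100 step assumes
# two decimal digits, as the original does; anything that is not MB is treated as GB).
import re

def size_to_bytes(tmpsize):
    digits = re.sub(r"[^0123456789.]", "", tmpsize).strip()
    whole, _, dec = digits.partition('.')
    unit = 1048576 if 'MB' in tmpsize else 1024 * 1048576
    return int(whole or 0) * unit + (int(dec or 0) * unit) // 100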