def parse_one(url, videos, ptime):
    # Append the link of every feed item published after ptime to videos.
    feed = feedparse(url)
    for item in feed['items']:
        timef = item['published_parsed']
        dt = datetime.fromtimestamp(mktime(timef))
        if dt > ptime:
            videos.append(item['link'])
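# A minimal usage sketch (not from the source): parse_one assumes
# feedparser's parse is bound as feedparse and that mktime and datetime
# are in scope; it mutates the `videos` list in place. The feed URL and
# the one-day cutoff below are made up for illustration.
from datetime import datetime, timedelta
from time import mktime
from feedparser import parse as feedparse

videos = []
cutoff = datetime.now() - timedelta(days=1)  # only keep items newer than this
parse_one("https://example.com/videos.rss", videos, cutoff)
print(videos)  # links published after the cutoff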
def rss_get(update, context):
    try:
        args = update.message.text.split(" ")
        title = args[1]
        count = int(args[2])
        feed_url = rss_dict.get(title)
        if feed_url is not None and count > 0:
            try:
                msg = sendMessage(f"Getting the last <b>{count}</b> item(s) from {title}",
                                  context.bot, update)
                rss_d = feedparse(feed_url[0])
                item_info = ""
                for item_num in range(count):
                    try:
                        link = rss_d.entries[item_num]['links'][1]['href']
                    except IndexError:
                        link = rss_d.entries[item_num]['link']
                    item_info += f"<b>Name: </b><code>{rss_d.entries[item_num]['title']}</code>\n"
                    item_info += f"<b>Link: </b><code>{link}</code>\n\n"
                editMessage(item_info, msg)
            except IndexError as e:
                LOGGER.error(str(e))
                editMessage("Parse depth exceeded. Try again with a lower value.", msg)
            except Exception as e:
                LOGGER.error(str(e))
                editMessage(str(e), msg)
        else:
            sendMessage("Enter a valid title/value.", context.bot, update)
    except (IndexError, ValueError):
        sendMessage(f"Use this format to fetch:\n/{BotCommands.RssGetCommand} Title value",
                    context.bot, update)
def rss_monitor(context):
    with rss_dict_lock:
        if len(rss_dict) == 0:
            rss_job.enabled = False
            return
        rss_saver = rss_dict
    for name, data in rss_saver.items():
        try:
            rss_d = feedparse(data[0])
            last_link = rss_d.entries[0]['link']
            last_title = rss_d.entries[0]['title']
            if data[1] == last_link or data[2] == last_title:
                continue
            feed_count = 0
            while True:
                try:
                    if data[1] == rss_d.entries[feed_count]['link'] or \
                            data[2] == rss_d.entries[feed_count]['title']:
                        break
                except IndexError:
                    LOGGER.info(f"Reached max index {feed_count} for feed: {name}. "
                                "Consider lowering RSS_DELAY so torrents are not missed.")
                    break
                parse = True
                for flist in data[3]:  # each flist is a group of 'or' alternatives
                    if not any(x in str(rss_d.entries[feed_count]['title']).lower()
                               for x in flist):
                        parse = False
                        feed_count += 1
                        break
                if not parse:
                    continue
                try:
                    url = rss_d.entries[feed_count]['links'][1]['href']
                except IndexError:
                    url = rss_d.entries[feed_count]['link']
                if RSS_COMMAND is not None:
                    feed_msg = f"{RSS_COMMAND} {url}"
                else:
                    feed_msg = f"<b>Name: </b><code>{rss_d.entries[feed_count]['title']}</code>\n\n"
                    feed_msg += f"<b>Link: </b><code>{url}</code>"
                sendRss(feed_msg, context.bot)
                feed_count += 1
                sleep(5)
            DbManger().rss_update(name, str(last_link), str(last_title))
            with rss_dict_lock:
                rss_dict[name] = [data[0], str(last_link), str(last_title), data[3]]
            LOGGER.info(f"Feed Name: {name}")
            LOGGER.info(f"Last item: {last_link}")
        except Exception as e:
            LOGGER.error(f"{e} Feed Name: {name} - Feed Link: {data[0]}")
            continue
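# A plausible wiring for rss_monitor (an assumption, not from the source):
# it reads like a python-telegram-bot v13 job callback, and the rss_job.enabled
# toggle above matches a Job returned by JobQueue.run_repeating. RSS_DELAY is
# the poll interval in seconds; `updater` is the bot's Updater instance.
rss_job = updater.job_queue.run_repeating(
    rss_monitor, interval=RSS_DELAY, first=20, name="RSS"
)
rss_job.enabled = bool(rss_dict)  # only poll while at least one feed is subscribed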
def parse_feed():
    all_feed_entries = []
    for feed_url in FEED_URLS:
        parser_response = feedparse(feed_url)
        feed_entries = parse_feed_entries(parser_response)
        if feed_entries:
            all_feed_entries.extend(feed_entries)
    return all_feed_entries
def item_completed(self, results, item, _):
    log.msg("item_completed", level=log.INFO)
    item.commentFeed = feedparse(second(first(results)).body)
    log.msg(str(item.commentFeed), level=log.INFO)
    # okResults = imap(second, ifilter(
    #     lambda (ok, _): ok and _.status == 200,
    #     results))
    # try:
    #     item.commentFeed = feedparse(okResults.next().body)
    # except StopIteration:
    #     pass
    return item
def __init__(self, rss, logger=lambda _: None):
    """Instantiates a content extractor for a given RSS feed.

    @type rss: string
    @param rss: the rss/atom feed
    @type logger: function of string => Unit
    @param logger: the logger
    """
    self.rssEntries = feedparse(rss).entries
    self.rssLinks = tuple(imap(lambda _: _.link, self.rssEntries))
    self.logger = logger
    self.urlZipPages = list()
    self.xPaths = None
    self.needsRefresh = True
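# Hypothetical usage (ContentExtractor as the class name is an assumption;
# the snippet only calls itself "a content extractor", and its use of
# itertools.imap marks it as Python 2). feedparser.parse accepts either a
# URL or the raw feed text, so both work as the rss argument here.
import sys
extractor = ContentExtractor("https://example.com/feed.xml",
                             logger=lambda msg: sys.stdout.write(msg + "\n"))
print extractor.rssLinks  # tuple of entry links from the feed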
def get_timeline_events(self, req, start, stop, filters):
    events = []
    for url in filters:
        feed = feedparse(url)  # TODO: cache the fetched feed
        for entry in feed.entries:
            entry_time = mktime(entry.updated_parsed)
            if start <= entry_time <= stop:
                try:
                    entry_author = entry.author
                except AttributeError:
                    entry_author = None
                event = ('feeditem', entry.link, entry.title, entry_time,
                         entry_author, Markup(entry.summary))
                events.append(event)
    return events
def __call__(self, url, *args):
    self.url = url
    if len(args) == 0:
        self.name = None
        size = -1
    else:
        try:
            size = int(args[0])
            self.name = u' '.join(args[1:])
        except ValueError:
            self.name = u' '.join(args)
            size = -1
    result = feedparse(url)
    # We don't want to use generators here because the result is
    # pickled by the multiprocessing module.
    entries = [self._filter_entry(e) for e in result['entries']]
    if size != -1:
        return entries[:size]
    return entries
def Parse(url, size=10):
    """Returns entries of the feed."""
    result = feedparse(url)
    return islice(imap(_filter_entry, result['entries']), size)
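# A sketch of the pieces Parse assumes (Python 2, given itertools.imap).
# The _filter_entry below is hypothetical -- the source never shows its body;
# it is assumed to reduce a raw feedparser entry to the fields callers need.
from itertools import islice, imap
from feedparser import parse as feedparse

def _filter_entry(entry):
    # assumed shape: keep only title and link
    return {'title': entry.get('title'), 'link': entry.get('link')}

for item in Parse("https://example.com/feed.xml", size=5):
    print item['title'], item['link']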
def rss_sub(update, context):
    try:
        args = update.message.text.split(" ", 3)
        title = str(args[1])
        feed_link = str(args[2])
        f_lists = []
        try:
            filters = str(args[3]).lower()
            if filters.startswith('f: '):
                filters = filters.split('f: ', 1)[1]
                filters_list = filters.split('|')
                for x in filters_list:
                    y = x.split(' or ')
                    f_lists.append(y)
            else:
                filters = None
        except IndexError:
            filters = None
        exists = rss_dict.get(title)
        if exists is not None:
            LOGGER.error("This title is already subscribed! Choose another title!")
            return sendMessage("This title is already subscribed! Choose another title!",
                               context.bot, update)
        try:
            rss_d = feedparse(feed_link)
            sub_msg = "<b>Subscribed!</b>"
            sub_msg += f"\n\n<b>Title: </b><code>{title}</code>\n<b>Feed Url: </b>{feed_link}"
            sub_msg += f"\n\n<b>Latest record for </b>{rss_d.feed.title}:"
            sub_msg += f"\n\n<b>Name: </b><code>{rss_d.entries[0]['title']}</code>"
            try:
                link = rss_d.entries[0]['links'][1]['href']
            except IndexError:
                link = rss_d.entries[0]['link']
            sub_msg += f"\n\n<b>Link: </b><code>{link}</code>"
            sub_msg += f"\n\n<b>Filters: </b><code>{filters}</code>"
            last_link = str(rss_d.entries[0]['link'])
            last_title = str(rss_d.entries[0]['title'])
            DbManger().rss_add(title, feed_link, last_link, last_title, filters)
            with rss_dict_lock:
                if len(rss_dict) == 0:
                    rss_job.enabled = True
                rss_dict[title] = [feed_link, last_link, last_title, f_lists]
            sendMessage(sub_msg, context.bot, update)
            LOGGER.info(f"Rss Feed Added: {title} - {feed_link} - {filters}")
        except (IndexError, AttributeError) as e:
            LOGGER.error(str(e))
            msg = "The link doesn't seem to be an RSS feed or it's region-blocked!"
            sendMessage(msg, context.bot, update)
        except Exception as e:
            LOGGER.error(str(e))
            sendMessage(str(e), context.bot, update)
    except IndexError:
        msg = f"Use this format to add a feed url:\n/{BotCommands.RssSubCommand} Title https://www.rss-url.com"
        msg += " f: 1080 or 720 or 144p|mkv or mp4|hevc (optional)\n\nThis filter will only parse links whose titles"
        msg += " contain `(1080 or 720 or 144p) and (mkv or mp4) and hevc`. You can add whatever you want.\n\n"
        msg += "Another example: f: 1080 or 720p|.web. or .webrip.|hevc or x264 .. This will parse titles that contain"
        msg += " ( 1080 or 720p) and (.web. or .webrip.) and (hevc or x264). A space is added before and after 1080"
        msg += " to avoid wrong matching: if a title contains the number `10805695`, a bare `1080` without"
        msg += " surrounding spaces would match it."
        msg += "\n\nFilter Notes:\n\n1. | means and.\n\n2. Add `or` between similar keys; you can add it"
        msg += " between qualities or between extensions, so don't add a filter like f: 1080|mp4 or 720|web"
        msg += " because this will parse 1080 and (mp4 or 720) and web ... not (1080 and mp4) or (720 and web)."
        msg += "\n\n3. You can add `or` and `|` as much as you want."
        msg += "\n\n4. Look at the title: if it has a static special character before or after the qualities,"
        msg += " extensions, or anything else, use it in the filter to avoid wrong matches."
        sendMessage(msg, context.bot, update)
def parse(url, size=10):
    """Return entries of the feed."""
    result = feedparse(url)
    return islice(imap(_filter_entry, result['entries']), size)
def parseFeed(feed, user):
    # Fetch the html index page. Assumes urlopen from urllib2; the original
    # used plain open(), which cannot fetch a URL.
    index = urlopen(feed.url).read()
    soupd = soup(index)  # parse the html with BeautifulSoup
    # find the feed link in the page head: rss first, then rdf, then atom
    rsslink = soupd.findAll("link", type="application/rss+xml")
    if not rsslink:
        rsslink = soupd.findAll("link", type="application/rdf+xml")
    if not rsslink:
        rsslink = soupd.findAll("link", type="application/atom+xml")
    rsslink = str(rsslink)
    # strip the link out of the html tag (href may be lower- or uppercase)
    link = re.search('href=".*?"', rsslink) or re.search('HREF=".*?"', rsslink)
    if link is None:
        print "error, line 94 engine.py: rss link not found!"
        return feed
    link = link.group().split('=', 1)[1].strip('"')
    feed.rsslink = link  # we will need to store this, for updates
    # now get the real thing, the xml content, from the link we have
    xml = urlopen(link).read()
    # time to parse it with feedparser
    k = feedparse(xml)
    feed.title = k.feed.title
    feed.put()
    # now parse the posts one-by-one
    for entry in k.entries:
        post = Post(url=entry.link, owner=user)
        if entry.has_key("author"):
            post.author = entry.author + " sings "
        else:
            post.author = "Anonymous"
        if entry.has_key("category"):
            post.category = " on " + entry.category
        else:
            post.category = ""
        if entry.has_key("date_parsed"):
            post.date = parseDate(entry.date_parsed)
        if entry.has_key("summary"):
            post.summary = entry.summary
        else:
            post.summary = "No private dancing, cowboy..."
        if entry.has_key("title") and len(entry.title) > 0:
            post.title = entry.title
        else:
            post.title = "Untitled"
        post.feed = feed.key()
        post.put()
    return feed
#!/usr/bin/python
from feedparser import parse as feedparse
from dateparser import parse as dateparse

url = "https://www.archlinuxjp.org/feeds/news.xml"
date_fmt = "%Y/%m/%d"


def get_last_reboot():
    import subprocess
    log = subprocess.getoutput("last reboot --time-format iso")
    newest_boot = log.split("\n")[0].split()[4]
    reboot_time = dateparse(newest_boot)
    if reboot_time.year == 1970:
        # a 1970 date means the first line's field wasn't a real timestamp;
        # fall back to the timestamp on the next line
        newest_halt = log.split("\n")[1].split()[5]
        reboot_time = dateparse(newest_halt)
    return reboot_time


last_reboot = get_last_reboot()
response = feedparse(url)
newer_entries = [f"{dateparse(entry.updated).strftime(date_fmt)} {entry.title}"
                 for entry in response.entries
                 if dateparse(entry.updated) > last_reboot]
if newer_entries:
    print(response.feed.title)
    print(*newer_entries, sep="\n")