def test_backlink_extraction(self):
    links = util.extract_links(None)
    self.assertEquals(links, [])

    text = "[[foo]], [[foo|bar]]"
    links = util.extract_links(text)
    self.assertEquals(links, ["foo"])

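# For context, a minimal sketch of the kind of wiki-link extractor this test
# assumes. The regex and the dedupe-while-preserving-order behaviour are
# illustrative assumptions, not gaewiki's actual util.extract_links.
import re

def extract_wiki_links(text):
    """Return unique [[page]] / [[page|label]] targets in order of appearance."""
    if not text:
        return []
    seen, links = set(), []
    for target in re.findall(r'\[\[([^\]|]+)(?:\|[^\]]*)?\]\]', text):
        target = target.strip()
        if target not in seen:
            seen.add(target)
            links.append(target)
    return links
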
def put(self):
    """Adds the gaewiki:parent: labels transparently."""
    if self.body is not None:
        options = util.parse_page(self.body)
        self.redirect = options.get('redirect')
        self.pread = options.get('public') == 'yes' and options.get('private') != 'yes'
        self.labels = options.get('labels', [])
        if 'date' in options:
            try:
                self.created = datetime.datetime.strptime(options['date'], '%Y-%m-%d %H:%M:%S')
            except ValueError:
                pass
        if 'name' in options and options['name'] != self.title:
            if self.get_by_title(options['name'], create_if_none=False) is not None:
                raise ValueError('A page named "%s" already exists.' % options['name'])
            self.title = options['name']
        self.__update_geopt()
        self.links = util.extract_links(self.body)
        self.add_implicit_labels()
    db.Model.put(self)
    settings.check_and_flush(self)

def handle_feed(feed, source):
    """Handles a Superfeedr JSON feed.

    Creates :class:`models.BlogPost` entities and adds propagate-blogpost tasks
    for new items.

    http://documentation.superfeedr.com/schema.html#json
    http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

    Args:
      feed: unicode string, Superfeedr JSON feed
      source: Blogger, Tumblr, or WordPress
    """
    logging.info('Source: %s %s', source.label(), source.key.string_id())
    logging.info('Raw feed: %s', feed)

    if source.status != 'enabled':
        logging.info('Dropping because source is %s', source.status)
        return
    elif 'webmention' not in source.features:
        logging.info("Dropping because source doesn't have webmention feature")
        return

    for item in json.loads(feed).get('items', []):
        url = item.get('permalinkUrl') or item.get('id')
        if not url:
            logging.error('Dropping feed item without permalinkUrl or id!')
            continue

        # Extract links from content, discarding self links.
        #
        # Don't use get_webmention_target[s]() here because they follow
        # redirects and fetch link contents, and this handler should be small
        # and fast and try to return a response to Superfeedr successfully.
        #
        # TODO: extract_links currently has a bug that makes it drop trailing
        # slashes. ugh. fix that.
        content = item.get('content') or item.get('summary', '')
        links = [util.clean_url(util.unwrap_t_umblr_com(l))
                 for l in util.extract_links(content)
                 if util.domain_from_link(l) not in source.domains]

        logging.info('Found links: %s', links)
        if len(url) > _MAX_KEYPART_BYTES:
            logging.warning('Blog post URL is too long (over 500 chars)! Giving up.')
            bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES], source=source.key,
                                 feed_item=item, failed=links)
        else:
            bp = models.BlogPost(id=url, source=source.key, feed_item=item,
                                 unsent=links)

        bp.get_or_save()

def handle_feed(feed, source):
    """Handles a Superfeedr JSON feed.

    Creates :class:`models.BlogPost` entities and adds propagate-blogpost tasks
    for new items.

    http://documentation.superfeedr.com/schema.html#json
    http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

    Args:
      feed: unicode string, Superfeedr JSON feed
      source: Blogger, Tumblr, or WordPress
    """
    logging.info('Source: %s %s', source.label(), source.key.string_id())
    logging.info('Raw feed: %s', feed)

    if source.status != 'enabled':
        logging.info('Dropping because source is %s', source.status)
        return
    elif 'webmention' not in source.features:
        logging.info("Dropping because source doesn't have webmention feature")
        return

    for item in json.loads(feed).get('items', []):
        url = item.get('permalinkUrl') or item.get('id')
        if not url:
            logging.error('Dropping feed item without permalinkUrl or id!')
            continue

        # Extract links from content, discarding self links.
        #
        # Don't use get_webmention_target[s]() here because they follow
        # redirects and fetch link contents, and this handler should be small
        # and fast and try to return a response to Superfeedr successfully.
        #
        # TODO: extract_links currently has a bug that makes it drop trailing
        # slashes. ugh. fix that.
        content = item.get('content') or item.get('summary', '')
        links = [util.clean_url(util.unwrap_t_umblr_com(l))
                 for l in util.extract_links(content)
                 if util.domain_from_link(l) not in source.domains]

        unique = []
        for link in util.dedupe_urls(links):
            if len(link) <= _MAX_STRING_LENGTH:
                unique.append(link)
            else:
                logging.info('Giving up on link over %s chars! %s',
                             _MAX_STRING_LENGTH, link)

        logging.info('Found links: %s', unique)
        if len(url) > _MAX_KEYPART_BYTES:
            logging.warning('Blog post URL is too long (over 500 chars)! Giving up.')
            bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES], source=source.key,
                                 feed_item=item, failed=unique)
        else:
            bp = models.BlogPost(id=url, source=source.key, feed_item=item,
                                 unsent=unique)

        bp.get_or_save()

async def on_message(self, message: Message):
    if message.author == self.user:
        return

    reply = partial(message.reply, mention_author=False)

    for match in extract_links(message.content):
        cprint(match[0], blue)
        try:
            info = extract_info(match[0])
        except NoVideoException:
            cprint("Tweet has no video", yellow)
            continue
        except Exception as ex:  # pylint: disable=broad-except
            cprint(f"Extraction error: {ex}", red)
            await reply("Failed to download video")
            continue

        if "url" not in info:
            cprint("No url in info dict", yellow)
            continue
        url = info["url"]

        try:
            buffer = await download(url, message.guild.filesize_limit)
        except FileSizeException as ex:
            if ex.filesize:
                cprint(f"Not uploading, file is too large: {ex.filesize} > {ex.limit}", red)
            else:
                cprint(f"Not uploading, file is larger than [{ex.limit}] bytes", red)
            cprint("Falling back to direct URL", yellow)
            # If the file is too large, we fall back to posting a direct URL
            await reply(url)
            continue
        except Exception as ex:  # pylint: disable=broad-except
            cprint(f"Http error: {ex}", red)
            await reply("Failed to download video")
            continue

        await reply(file=DiscordFile(fp=buffer, filename=f"{match[1]}.mp4"))
        cprint("Upload success", green)

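# extract_links here evidently yields tuples where match[0] is the full link
# and match[1] ends up as the upload filename. A hypothetical sketch of such an
# extractor for Twitter status links follows; the regex and the (url, id) tuple
# layout are assumptions, not this bot's actual implementation.
import re
from typing import List, Tuple

# Group 1 captures the full status URL and group 2 the numeric status ID, so
# re.findall() returns (url, status_id) tuples.
_TWEET_RE = re.compile(r'(https?://(?:www\.)?twitter\.com/\w+/status/(\d+))')

def extract_tweet_links(content: str) -> List[Tuple[str, str]]:
    """Return (url, status_id) tuples for every tweet link in a message."""
    return _TWEET_RE.findall(content)
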
def handle_feed(feed, source):
    """Handles a Superfeedr JSON feed.

    Creates BlogPost entities and adds propagate-blogpost tasks for new items.

    http://documentation.superfeedr.com/schema.html#json
    http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

    Args:
      feed: string, Superfeedr JSON feed
      source: Blogger, Tumblr, or WordPress
    """
    logging.info('Source: %s %s', source.label(), source.key.string_id())
    logging.info('Raw feed: %s', feed)

    if source.status != 'enabled':
        logging.warning('Dropping because source is %s', source.status)
        return
    elif 'webmention' not in source.features:
        logging.warning("Dropping because source doesn't have webmention feature")
        return

    for item in json.loads(feed).get('items', []):
        url = item.get('permalinkUrl') or item.get('id')
        if not url:
            logging.error('Dropping feed item without permalinkUrl or id!')
            continue

        source.preprocess_superfeedr_item(item)

        # Extract links from content, discarding self links.
        #
        # Don't use get_webmention_target[s]() here because they follow
        # redirects and fetch link contents, and this handler should be small
        # and fast and try to return a response to Superfeedr successfully.
        #
        # TODO: extract_links currently has a bug that makes it drop trailing
        # slashes. ugh. fix that.
        content = item.get('content') or item.get('summary', '')
        links = [l for l in util.extract_links(content)
                 if util.domain_from_link(l) not in source.domains]

        logging.info('Found links: %s', links)
        models.BlogPost(id=url, source=source.key, feed_item=item,
                        unsent=links).get_or_save()

def test_extract_links(self):
    self.assertEquals([], util.extract_links(None))
    self.assertEquals([], util.extract_links(''))
    self.assertEquals([], util.extract_links('asdf qwert'))

    for text in ('http://foo.com',
                 ' http://foo.com ',
                 ' http://foo.com \n http://foo.com ',
                 'x http://foo.com\ny',
                 'x\thttp://foo.com.',
                 'x\rhttp://foo.com! ',
                 'x http://foo.com? ',
                 '<a href="http://foo.com">',
                 "<a href='http://foo.com'>",
                 '<a href="xyz">http://foo.com</a>',
                 ):
        self.assertEquals(['http://foo.com'], util.extract_links(text),
                          'Failed on %r' % text)

    self.assertEquals(
        ['http://foo.com', 'https://www.bar.com'],
        util.extract_links('x http://foo.com y https://www.bar.com z'))

    self.assertEquals(
        ['http://foo.com', 'http://bar.com'],
        util.extract_links(
            'asdf http://foo.com qwert <a class="x" href="http://bar.com" >xyz</a> www.baz.com'))

    # trailing slash
    # TODO: make this work
    # self.assertEquals(['http://foo.com/'],
    #                   util.extract_links('x http://foo.com/'))

    # query
    self.assertEquals(['http://foo.com/bar?baz=baj'],
                      util.extract_links('http://foo.com/bar?baz=baj y'))

    # preserve order
    self.assertEquals(['http://%s' % c for c in ('a', 'b', 'c', 'd')],
                      util.extract_links('http://a http://b http://c http://d'))

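# For reference, a simplified regex-based extractor in the spirit of what these
# tests exercise. The pattern and the trailing-punctuation handling are
# illustrative assumptions, not bridgy's actual util.extract_links, which
# covers many more edge cases.
import re

_URL_RE = re.compile(r'https?://[^\s\'"<>]+')

def extract_links_sketch(text):
    """Return http(s) URLs found in text or HTML, deduped, in order of appearance."""
    if not text:
        return []
    seen, links = set(), []
    for url in _URL_RE.findall(text):
        url = url.rstrip('.!?,;:')  # drop trailing sentence punctuation
        if url not in seen:
            seen.add(url)
            links.append(url)
    return links
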
def put(self):
    """Adds the gaewiki:parent: labels transparently."""
    if self.body is not None:
        options = util.parse_page(self.body)
        self.redirect = options.get('redirect')
        self.pread = options.get('public') == 'yes' and options.get('private') != 'yes'
        self.labels = options.get('labels', [])
        if 'date' in options:
            try:
                self.created = datetime.datetime.strptime(options['date'], '%Y-%m-%d %H:%M:%S')
            except ValueError:
                pass
        if 'name' in options and options['name'] != self.title:
            if self.get_by_title(options['name'], create_if_none=False) is not None:
                raise ValueError('A page named "%s" already exists.' % options['name'])
            self.title = options['name']
        self.__update_geopt()
        self.links = util.extract_links(self.body)
        self.add_implicit_labels()
    db.Model.put(self)
    settings.check_and_flush(self)

def visit(self, conn, crawl_id, source_id, visit_url):
    '''
    Not abstract. Visits a URL during a crawl.
    Inserts all relevant information into the database for a single visit.
    Inserts article information if the URL matches the article regex.
    '''
    visit_url_id = visit_url['id']
    visit_url_string = visit_url['url']
    base_url_string = self.base_url_string

    html = util.download_html(visit_url_string)
    found_links = util.extract_links(html, base_url_string)

    visit_id = queries.insert_visit(conn, crawl_id, visit_url_id)
    new_url_ids = queries.insert_urls(conn, found_links)
    queries.insert_links(conn, visit_id, new_url_ids)

    if self.is_article(visit_url_string):
        article = util.extract_article(html, visit_url_string)
        article_title = article.title
        article_text = article.text
        article_date = self.extract_date_from_url(visit_url_string)
        queries.insert_article(conn, visit_url_id, article_title, article_text,
                               article_date, source_id)

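# This crawler's util.extract_links takes raw HTML plus a base URL and returns
# the outgoing links. A minimal sketch under that assumption, using
# BeautifulSoup (an assumed dependency); the real helper may filter or
# normalize URLs differently.
from urllib.parse import urljoin

from bs4 import BeautifulSoup

def extract_page_links(html, base_url):
    """Return absolute URLs for every <a href> on the page, deduped in order."""
    soup = BeautifulSoup(html, 'html.parser')
    seen, links = set(), []
    for anchor in soup.find_all('a', href=True):
        url = urljoin(base_url, anchor['href'])
        if url not in seen:
            seen.add(url)
            links.append(url)
    return links
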
""" Driver to call the get_batch method in scraper """ scraper.get_batch(pages, page_hook=parse_page, ret_hook=(ret_hook, [output, pages[-1]]), verbose=True) if __name__ == '__main__': import sys scraper = WebScraper(BASE_URI) try: output = open(sys.argv[1], 'w') except IndexError: print("Usage: {} <file>".format(sys.argv[0])) exit(1) output.write('{') latest_update = "" # todo: something with this base = scraper.get_html() latest_update = get_latest_update(base) pages = extract_links(base, 'td', 'data') do_scrape(pages, output) output.write('}') output.close()
def handle_feed(feed, source):
    """Handles a Superfeedr JSON feed.

    Creates :class:`models.BlogPost` entities and adds propagate-blogpost tasks
    for new items.

    http://documentation.superfeedr.com/schema.html#json
    http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

    Args:
      feed: dict, parsed Superfeedr JSON feed
      source: Blogger, Tumblr, or WordPress
    """
    logger.info(f'Source: {source.label()} {source.key_id()}')
    logger.info(f'Raw feed: {feed}')

    if not feed:
        return

    if source.status != 'enabled':
        logger.info(f'Dropping because source is {source.status}')
        return
    elif 'webmention' not in source.features:
        logger.info("Dropping because source doesn't have webmention feature")
        return

    for item in feed.get('items', []):
        url = item.get('permalinkUrl') or item.get('id')
        if not url:
            logger.error('Dropping feed item without permalinkUrl or id!')
            continue

        # Extract links from content, discarding self links.
        #
        # Don't use get_webmention_target[s]() here because they follow
        # redirects and fetch link contents, and this handler should be small
        # and fast and try to return a response to Superfeedr successfully.
        #
        # TODO: extract_links currently has a bug that makes it drop trailing
        # slashes. ugh. fix that.
        content = item.get('content') or item.get('summary', '')
        links = [util.clean_url(util.unwrap_t_umblr_com(url))
                 for url in util.extract_links(content)
                 if util.domain_from_link(url) not in source.domains]

        unique = []
        for link in util.dedupe_urls(links):
            if len(link) <= _MAX_STRING_LENGTH:
                unique.append(link)
            else:
                logger.info(f'Giving up on link over {_MAX_STRING_LENGTH} chars! {link}')
            if len(unique) >= MAX_BLOGPOST_LINKS:
                logger.info(f'Stopping at {MAX_BLOGPOST_LINKS} links! Skipping the rest.')
                break

        logger.info(f'Found links: {unique}')
        if len(url) > _MAX_KEYPART_BYTES:
            logger.warning('Blog post URL is too long (over 500 chars)! Giving up.')
            bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES], source=source.key,
                                 feed_item=item, failed=unique)
        else:
            bp = models.BlogPost(id=url, source=source.key, feed_item=item,
                                 unsent=unique)

        bp.get_or_save()