Example #1
    def test_backlink_extraction(self):
        links = util.extract_links(None)
        self.assertEquals(links, [])

        text = "[[foo]], [[foo|bar]]"
        links = util.extract_links(text)
        self.assertEquals(links, ["foo"])
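
A minimal sketch of what this test implies about the gaewiki flavour of util.extract_links(): it pulls the target page name out of [[name]] and [[name|label]] markup, dedupes while preserving order, and tolerates None. The name extract_wiki_links_sketch and the regex are illustrative assumptions, not the project's actual implementation.

import re

def extract_wiki_links_sketch(text):
    # Hypothetical stand-in for the util.extract_links() exercised above.
    if not text:
        return []
    names = []
    # [[name]] or [[name|label]] -- keep only the name before the pipe.
    for target in re.findall(r'\[\[([^\]|]+)(?:\|[^\]]*)?\]\]', text):
        if target not in names:
            names.append(target)
    return names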
Example #2
    def put(self):
        """Adds the gaewiki:parent: labels transparently."""
        if self.body is not None:
            options = util.parse_page(self.body)
            self.redirect = options.get('redirect')
            self.pread = (options.get('public') == 'yes'
                          and options.get('private') != 'yes')
            self.labels = options.get('labels', [])
            if 'date' in options:
                try:
                    self.created = datetime.datetime.strptime(
                        options['date'], '%Y-%m-%d %H:%M:%S')
                except ValueError:
                    pass
            if 'name' in options and options['name'] != self.title:
                if self.get_by_title(options['name'],
                                     create_if_none=False) is not None:
                    raise ValueError('A page named "%s" already exists.' %
                                     options['name'])
                self.title = options['name']
            self.__update_geopt()

        self.links = util.extract_links(self.body)
        self.add_implicit_labels()
        db.Model.put(self)
        settings.check_and_flush(self)
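
For orientation, these are the option keys the put() above consumes from util.parse_page(self.body). The values are illustrative only; the on-page syntax that produces them is gaewiki's own and is not shown here.

# Illustrative only: keys that put() reads from the parsed page options.
options = {
    'redirect': 'OtherPage',         # copied to self.redirect
    'public': 'yes',                 # with private != 'yes', marks the page publicly readable
    'private': 'no',
    'labels': ['news', 'drafts'],    # stored as self.labels
    'date': '2011-01-01 12:00:00',   # parsed with '%Y-%m-%d %H:%M:%S' into self.created
    'name': 'New page title',        # renames the page if no page with that title exists
}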
Example #3
def handle_feed(feed, source):
    """Handles a Superfeedr JSON feed.

  Creates :class:`models.BlogPost` entities and adds propagate-blogpost tasks
  for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: unicode string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
    logging.info('Source: %s %s', source.label(), source.key.string_id())
    logging.info('Raw feed: %s', feed)

    if source.status != 'enabled':
        logging.info('Dropping because source is %s', source.status)
        return
    elif 'webmention' not in source.features:
        logging.info("Dropping because source doesn't have webmention feature")
        return

    for item in json.loads(feed).get('items', []):
        url = item.get('permalinkUrl') or item.get('id')
        if not url:
            logging.error('Dropping feed item without permalinkUrl or id!')
            continue

        # extract links from content, discarding self links.
        #
        # i don't use get_webmention_target[s]() here because they follow redirects
        # and fetch link contents, and this handler should be small and fast and try
        # to return a response to superfeedr successfully.
        #
        # TODO: extract_links currently has a bug that makes it drop trailing
        # slashes. ugh. fix that.
        content = item.get('content') or item.get('summary', '')
        links = [
            util.clean_url(util.unwrap_t_umblr_com(l))
            for l in util.extract_links(content)
            if util.domain_from_link(l) not in source.domains
        ]

        logging.info('Found links: %s', links)
        if len(url) > _MAX_KEYPART_BYTES:
            logging.warning(
                'Blog post URL is too long (over 500 chars)! Giving up.')
            bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES],
                                 source=source.key,
                                 feed_item=item,
                                 failed=links)
        else:
            bp = models.BlogPost(id=url,
                                 source=source.key,
                                 feed_item=item,
                                 unsent=links)

        bp.get_or_save()
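
A hypothetical minimal payload that the loop above would accept: one item with a permalink and some HTML content whose outbound links get extracted. The URLs are placeholders, and source stands for an already-created Blogger/Tumblr/WordPress entity.

import json

# The handler expects a JSON object with an 'items' list; each item needs a
# 'permalinkUrl' (or 'id') and optionally 'content' / 'summary' HTML.
feed = json.dumps({
    'items': [{
        'permalinkUrl': 'http://blog.example/post/123',
        'content': 'Nice post! <a href="http://other.example/reply">my reply</a>',
    }],
})
handle_feed(feed, source)  # source: an enabled source entity with the 'webmention' feature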
Example #4
def handle_feed(feed, source):
  """Handles a Superfeedr JSON feed.

  Creates :class:`models.BlogPost` entities and adds propagate-blogpost tasks
  for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: unicode string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
  logging.info('Source: %s %s', source.label(), source.key.string_id())
  logging.info('Raw feed: %s', feed)

  if source.status != 'enabled':
    logging.info('Dropping because source is %s', source.status)
    return
  elif 'webmention' not in source.features:
    logging.info("Dropping because source doesn't have webmention feature")
    return

  for item in json.loads(feed).get('items', []):
    url = item.get('permalinkUrl') or item.get('id')
    if not url:
      logging.error('Dropping feed item without permalinkUrl or id!')
      continue

    # extract links from content, discarding self links.
    #
    # i don't use get_webmention_target[s]() here because they follow redirects
    # and fetch link contents, and this handler should be small and fast and try
    # to return a response to superfeedr successfully.
    #
    # TODO: extract_links currently has a bug that makes it drop trailing
    # slashes. ugh. fix that.
    content = item.get('content') or item.get('summary', '')
    links = [util.clean_url(util.unwrap_t_umblr_com(l))
             for l in util.extract_links(content)
             if util.domain_from_link(l) not in source.domains]

    unique = []
    for link in util.dedupe_urls(links):
      if len(link) <= _MAX_STRING_LENGTH:
        unique.append(link)
      else:
        logging.info('Giving up on link over %s chars! %s', _MAX_STRING_LENGTH, link)

    logging.info('Found links: %s', unique)
    if len(url) > _MAX_KEYPART_BYTES:
      logging.warning('Blog post URL is too long (over 500 chars)! Giving up.')
      bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES], source=source.key,
                           feed_item=item, failed=unique)
    else:
      bp = models.BlogPost(id=url, source=source.key, feed_item=item, unsent=unique)

    bp.get_or_save()
Example #5
    async def on_message(self, message: Message):
        if message.author == self.user:
            return

        reply = partial(message.reply, mention_author=False)

        for match in extract_links(message.content):
            cprint(match[0], blue)

            try:
                info = extract_info(match[0])
            except NoVideoException:
                cprint("Tweet has no video", yellow)
                continue
            except Exception as ex:  # pylint: disable=broad-except
                cprint(f"Extraction error: {ex}", red)
                await reply("Failed to download video")
                continue

            if "url" not in info:
                cprint("No url in info dict", yellow)
                continue

            url = info["url"]

            try:
                buffer = await download(url, message.guild.filesize_limit)
            except FileSizeException as ex:
                if ex.filesize:
                    cprint(
                        f"Not uploading, file is too large: {ex.filesize} > {ex.limit}",
                        red)
                else:
                    cprint(
                        f"Not uploading, file is larger than [{ex.limit}] bytes",
                        red)

                cprint("Falling back to direct URL", yellow)

                # If the file is too large, we fall back to posting a direct URL
                await reply(url)

                continue

            except Exception as ex:  # pylint: disable=broad-except
                cprint(f"Http error: {ex}", red)
                await reply("Failed to download video")
                continue

            await reply(file=DiscordFile(fp=buffer, filename=f"{match[1]}.mp4"))

            cprint("Upload success", green)
Example #6
def handle_feed(feed, source):
  """Handles a Superfeedr JSON feed.

  Creates BlogPost entities and adds propagate-blogpost tasks for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
  logging.info('Source: %s %s', source.label(), source.key.string_id())
  logging.info('Raw feed: %s', feed)

  if source.status != 'enabled':
    logging.warning('Dropping because source is %s', source.status)
    return
  elif 'webmention' not in source.features:
    logging.warning("Dropping because source doesn't have webmention feature")
    return

  for item in json.loads(feed).get('items', []):
    url = item.get('permalinkUrl') or item.get('id')
    if not url:
      logging.error('Dropping feed item without permalinkUrl or id!')
      continue

    source.preprocess_superfeedr_item(item)
    # extract links from content, discarding self links.
    #
    # i don't use get_webmention_target[s]() here because they follow redirects
    # and fetch link contents, and this handler should be small and fast and try
    # to return a response to superfeedr successfully.
    #
    # TODO: extract_links currently has a bug that makes it drop trailing
    # slashes. ugh. fix that.
    content = item.get('content') or item.get('summary', '')
    links = [l for l in util.extract_links(content)
             if util.domain_from_link(l) not in source.domains]

    logging.info('Found links: %s', links)
    models.BlogPost(id=url,
                    source=source.key,
                    feed_item=item,
                    unsent=links,
                    ).get_or_save()
Example #7
  def test_extract_links(self):
    self.assertEquals([], util.extract_links(None))
    self.assertEquals([], util.extract_links(''))
    self.assertEquals([], util.extract_links('asdf qwert'))

    for text in ('http://foo.com',
                 '  http://foo.com  ',
                 '  http://foo.com \n http://foo.com  ',
                 'x http://foo.com\ny',
                 'x\thttp://foo.com.',
                 'x\rhttp://foo.com! ',
                 'x http://foo.com? ',
                 '<a href="http://foo.com">',
                 "<a href='http://foo.com'>",
                 '<a href="xyz">http://foo.com</a>',
                 ):
      self.assertEquals(['http://foo.com'], util.extract_links(text),
                        'Failed on %r' % text)

    self.assertEquals(
      ['http://foo.com', 'https://www.bar.com'],
      util.extract_links('x http://foo.com y https://www.bar.com z'))
    self.assertEquals(
      ['http://foo.com', 'http://bar.com'],
      util.extract_links('asdf http://foo.com qwert <a class="x" href="http://bar.com" >xyz</a> www.baz.com'))

    # trailing slash
    # TODO: make this work
    # self.assertEquals(['http://foo.com/'],
    #                   util.extract_links('x http://foo.com/'))

    # query
    self.assertEquals(['http://foo.com/bar?baz=baj'],
                      util.extract_links('http://foo.com/bar?baz=baj y'))

    # preserve order
    self.assertEquals(['http://%s' % c for c in ('a', 'b', 'c', 'd')],
                      util.extract_links('http://a http://b http://c http://d'))
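
The behaviour pinned down by this test can be approximated with a short regex-based sketch. extract_links_sketch below is written from the assertions above, not from the library's actual implementation, and it reproduces the trailing-slash quirk noted in the TODO.

import re

def extract_links_sketch(text):
    if not text:
        return []
    found = []
    # http(s) URLs in plain text or inside href="..." / href='...' attributes.
    for url in re.findall(r'https?://[^\s"\'<>]+', text):
        url = url.rstrip('.,!?;:/')  # strip trailing punctuation (and, per the TODO, a trailing slash)
        if url and url not in found:  # dedupe while preserving first-seen order
            found.append(url)
    return found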
Example #8
    def put(self):
        """Adds the gaewiki:parent: labels transparently."""
        if self.body is not None:
            options = util.parse_page(self.body)
            self.redirect = options.get('redirect')
            self.pread = options.get('public') == 'yes' and options.get('private') != 'yes'
            self.labels = options.get('labels', [])
            if 'date' in options:
                try:
                    self.created = datetime.datetime.strptime(options['date'], '%Y-%m-%d %H:%M:%S')
                except ValueError:
                    pass
            if 'name' in options and options['name'] != self.title:
                if self.get_by_title(options['name'], create_if_none=False) is not None:
                    raise ValueError('A page named "%s" already exists.' % options['name'])
                self.title = options['name']
            self.__update_geopt()

        self.links = util.extract_links(self.body)
        self.add_implicit_labels()
        db.Model.put(self)
        settings.check_and_flush(self)
Example #9
    def visit(self, conn, crawl_id, source_id, visit_url):
        '''
        Not abstract. Visits a url during a crawl.
        Inserts all relevant information to the database for a single visit.
        Inserts article information if the url matches the article regex.
        '''
        visit_url_id = visit_url['id']
        visit_url_string = visit_url['url']

        base_url_string = self.base_url_string
        html = util.download_html(visit_url_string)
        found_links = util.extract_links(html, base_url_string)
        visit_id = queries.insert_visit(conn, crawl_id, visit_url_id)
        new_url_ids = queries.insert_urls(conn, found_links)
        queries.insert_links(conn, visit_id, new_url_ids)

        if self.is_article(visit_url_string):
            article = util.extract_article(html, visit_url_string)
            article_title = article.title
            article_text = article.text
            article_date = self.extract_date_from_url(visit_url_string)
            queries.insert_article(conn, visit_url_id, article_title, article_text, article_date, source_id)
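
Here extract_links() takes the fetched HTML plus the crawl's base URL, so it presumably resolves relative hrefs against that base. A hypothetical sketch of that two-argument form (the regex and behaviour are assumptions, not the crawler's real code):

import re
from urllib.parse import urljoin

def extract_links_sketch(html, base_url):
    # Pull href values out of anchor tags and resolve them against base_url.
    hrefs = re.findall(r'<a\s[^>]*href=["\']([^"\']+)["\']', html or '', re.IGNORECASE)
    return [urljoin(base_url, h) for h in hrefs]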
Example #10
    """ Driver to call the get_batch method in scraper """
    scraper.get_batch(pages,
                      page_hook=parse_page,
                      ret_hook=(ret_hook, [output, pages[-1]]),
                      verbose=True)


if __name__ == '__main__':
    import sys
    scraper = WebScraper(BASE_URI)

    try:
        output = open(sys.argv[1], 'w')
    except IndexError:
        print("Usage: {} <file>".format(sys.argv[0]))
        exit(1)

    output.write('{')

    latest_update = ""  # todo: something with this
    base = scraper.get_html()

    latest_update = get_latest_update(base)
    pages = extract_links(base, 'td', 'data')

    do_scrape(pages, output)

    output.write('}')

    output.close()
Example #11
def handle_feed(feed, source):
    """Handles a Superfeedr JSON feed.

  Creates :class:`models.BlogPost` entities and adds propagate-blogpost tasks
  for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: unicode string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
    logger.info(f'Source: {source.label()} {source.key_id()}')
    logger.info(f'Raw feed: {feed}')

    if not feed:
        return

    if source.status != 'enabled':
        logger.info(f'Dropping because source is {source.status}')
        return
    elif 'webmention' not in source.features:
        logger.info("Dropping because source doesn't have webmention feature")
        return

    for item in feed.get('items', []):
        url = item.get('permalinkUrl') or item.get('id')
        if not url:
            logger.error('Dropping feed item without permalinkUrl or id!')
            continue

        # extract links from content, discarding self links.
        #
        # i don't use get_webmention_target[s]() here because they follow redirects
        # and fetch link contents, and this handler should be small and fast and try
        # to return a response to superfeedr successfully.
        #
        # TODO: extract_links currently has a bug that makes it drop trailing
        # slashes. ugh. fix that.
        content = item.get('content') or item.get('summary', '')
        links = [
            util.clean_url(util.unwrap_t_umblr_com(url))
            for url in util.extract_links(content)
            if util.domain_from_link(url) not in source.domains
        ]

        unique = []
        for link in util.dedupe_urls(links):
            if len(link) <= _MAX_STRING_LENGTH:
                unique.append(link)
            else:
                logger.info(
                    f'Giving up on link over {_MAX_STRING_LENGTH} chars! {link}'
                )
            if len(unique) >= MAX_BLOGPOST_LINKS:
                logger.info('Stopping at 10 links! Skipping the rest.')
                break

        logger.info(f'Found links: {unique}')
        if len(url) > _MAX_KEYPART_BYTES:
            logger.warning(
                'Blog post URL is too long (over 500 chars)! Giving up.')
            bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES],
                                 source=source.key,
                                 feed_item=item,
                                 failed=unique)
        else:
            bp = models.BlogPost(id=url,
                                 source=source.key,
                                 feed_item=item,
                                 unsent=unique)

        bp.get_or_save()