Exemplo n.º 1
0
    def add_stories_from_feed(self, db: DatabaseHandler, download: dict,
                              content: str) -> List[int]:
        """
        Store the fetched page of a 'web_page' feed as one new story.

        A 'web_page' feed is a plain web page (downloaded once a week) rather than a list of items, so
        every fetch produces exactly one story, and the download row is then pointed at that story.

        Arguments:
        db - database handle used to look up the feed and to update the download
        download - download row; 'feeds_id', 'url' and 'downloads_id' are read from it
        content - fetched page HTML, used only to derive the story title

        Returns a single-element list holding the ID of the added story.

        Raises McCrawlerFetcherSoftError if add_story() returns a falsy value.
        """
        download_row = decode_object_from_bytes_if_needed(download)
        page_html = decode_object_from_bytes_if_needed(content)

        feeds_id = download_row['feeds_id']
        feed_row = db.find_by_id(table='feeds', object_id=feeds_id)

        # Title comes from the page itself, suffixed with the current SQL timestamp in brackets
        page_title = html_title(html=page_html, fallback='(no title)')
        stamped_title = ''.join((page_title, '[', sql_now(), ']'))

        # GUID is "<unix seconds>:<url>", capped at 1024 characters
        story_guid = f"{int(time.time())}:{download_row['url']}"[:1024]

        story_fields = {
            'url': download_row['url'],
            'guid': story_guid,
            'media_id': feed_row['media_id'],
            'publish_date': sql_now(),
            'title': stamped_title,
        }

        added_story = add_story(db=db, story=story_fields, feeds_id=feeds_id)
        if not added_story:
            raise McCrawlerFetcherSoftError(f"Failed to add story {story_fields}")

        # Attach the download to the story and mark it as a content download
        attach_download_sql = """
            UPDATE downloads
            SET stories_id = %(stories_id)s,
                type = 'content'
            WHERE downloads_id = %(downloads_id)s
        """
        attach_download_params = {
            'stories_id': added_story['stories_id'],
            'downloads_id': download_row['downloads_id'],
        }
        db.query(attach_download_sql, attach_download_params)

        # The fetched web page is itself the only story produced by this feed
        return [added_story['stories_id']]
Exemplo n.º 2
0
def generate_story(db: DatabaseHandler,
                   url: str,
                   content: str,
                   title: str = None,
                   publish_date: str = None,
                   fallback_date: Optional[str] = None) -> dict:
    """Create a story for the given URL, deriving its metadata from the URL and the content.

    The medium and spider feed are looked up from the URL; the title and the publish date are guessed
    from the content unless the caller supplies them. The story is tagged with the "spidered" tag.

    Only a story that add_story() flags as 'is_new' gets a download created, its content stored and
    extraction run; otherwise (e.g. an already existing story was returned) those steps are skipped.

    Arguments:
    db - db handle
    url - story url; must not be empty, truncated to MAX_URL_LENGTH
    content - story content
    title - story title; parsed from the content (falling back to the url) when None
    publish_date - story publish date; guessed from the url and content when None
    fallback_date - passed to assign_date_guess_tag() for use when the date guesser finds no date

    Returns the story as returned by add_story().

    Raises McTMStoriesException if url is an empty string.
    """
    if len(url) == 0:
        raise McTMStoriesException("url must not be an empty string")

    log.debug(f"Generating story from URL {url}...")

    url = url[:MAX_URL_LENGTH]

    log.debug(f"Guessing medium for URL {url}...")
    guessed_medium = guess_medium(db, url)
    log.debug(f"Done guessing medium for URL {url}: {guessed_medium}")

    log.debug(f"Getting spider feed for medium {guessed_medium}...")
    spider_feed = get_spider_feed(db, guessed_medium)
    log.debug(f"Done getting spider feed for medium {guessed_medium}: {spider_feed}")

    log.debug("Getting spidered tag...")
    spidered_tag = get_spidered_tag(db)
    log.debug(f"Done getting spidered tag: {spidered_tag}")

    if title is None:
        log.debug("Parsing HTML title...")
        title = html_title(content, url, MAX_TITLE_LENGTH)
        log.debug(f"Done parsing HTML title: {title}")

    story_fields = {
        'url': url,
        'guid': url,
        'media_id': guessed_medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    story_fields.update({
        text_field: re2.sub('\x00', '', story_fields[text_field])
        for text_field in ('url', 'guid', 'title')
    })

    must_guess_date = publish_date is None
    date_guess = None
    if not must_guess_date:
        story_fields['publish_date'] = publish_date
    else:
        log.debug(f"Guessing date for URL {url}...")
        date_guess = guess_date(url, content)
        log.debug(f"Done guessing date for URL {url}: {date_guess}")

        if date_guess.found:
            story_fields['publish_date'] = date_guess.date
        else:
            story_fields['publish_date'] = None

    log.debug(f"Adding story {story_fields}...")
    story = add_story(db, story_fields, spider_feed['feeds_id'])
    log.debug(f"Done adding story {story}")

    # Tag the story as spidered unless that mapping is already present
    tag_story_sql = """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
            where not exists (
                select 1
                from stories_tags_map
                where stories_id = %(a)s
                  and tags_id = %(b)s
            )
        """
    tag_story_params = {
        'a': story['stories_id'],
        'b': spidered_tag['tags_id']
    }
    db.query(tag_story_sql, tag_story_params)

    if must_guess_date:
        log.debug("Assigning date guess tag...")
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    summary_values = (story['title'], story['url'], story['publish_date'],
                      story['stories_id'])
    log.debug("add story: %s; %s; %s; %d" % summary_values)

    if not story.get('is_new', False):
        log.debug("Story is not new, skipping download storage and extraction")

    else:
        log.debug("Story is new, creating download...")
        new_download = create_download_for_new_story(db, story, spider_feed)

        log.debug("Storing story content...")
        store_and_verify_content(db, new_download, content)

        log.debug("Extracting story...")
        _extract_story(db, story)
        log.debug("Done extracting story")

    log.debug(f"Done generating story from URL {url}")

    return story
Exemplo n.º 3
0
def generate_story(db: DatabaseHandler,
                   url: str,
                   content: str,
                   title: str = None,
                   publish_date: str = None,
                   fallback_date: Optional[str] = None) -> dict:
    """Create a story for the given URL, deriving its metadata from the URL and the content.

    The medium and spider feed are looked up from the URL; the title and the publish date are guessed
    from the content unless the caller supplies them. The story is tagged with the "spidered" tag.

    Only a story that add_story() flags as 'is_new' gets a download created, its content stored and
    extraction run; otherwise (e.g. an already existing story was returned) those steps are skipped.

    Arguments:
    db - db handle
    url - story url; must not be empty, truncated to MAX_URL_LENGTH
    content - story content
    title - story title; parsed from the content (falling back to the url) when None
    publish_date - story publish date; guessed from the url and content when None
    fallback_date - date used if the date guesser fails to find a date; if this is None as well,
        the current local time in ISO format is used

    Returns the story as returned by add_story().

    Raises McTMStoriesException if url is an empty string.
    """
    if len(url) == 0:
        raise McTMStoriesException("url must not be an empty string")

    url = url[:MAX_URL_LENGTH]

    guessed_medium = guess_medium(db, url)
    spider_feed = get_spider_feed(db, guessed_medium)
    spidered_tag = get_spidered_tag(db)

    if title is None:
        title = html_title(content, url, MAX_TITLE_LENGTH)

    story_fields = {
        'url': url,
        'guid': url,
        'media_id': guessed_medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    story_fields.update({
        text_field: re2.sub('\x00', '', story_fields[text_field])
        for text_field in ('url', 'guid', 'title')
    })

    must_guess_date = publish_date is None
    date_guess = None
    if not must_guess_date:
        resolved_date = publish_date
    else:
        date_guess = guess_date(url, content)
        if date_guess.found:
            resolved_date = date_guess.date
        else:
            resolved_date = fallback_date

        # Neither a guessed date nor a fallback: stamp the story with the current time
        if resolved_date is None:
            resolved_date = datetime.datetime.now().isoformat()
    story_fields['publish_date'] = resolved_date

    story = add_story(db, story_fields, spider_feed['feeds_id'])

    # Tag the story as spidered unless that mapping is already present
    tag_story_sql = """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
            where not exists (
                select 1
                from stories_tags_map
                where stories_id = %(a)s
                  and tags_id = %(b)s
            )
        """
    tag_story_params = {
        'a': story['stories_id'],
        'b': spidered_tag['tags_id']
    }
    db.query(tag_story_sql, tag_story_params)

    if must_guess_date:
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    summary_values = (story['title'], story['url'], story['publish_date'],
                      story['stories_id'])
    log.debug("add story: %s; %s; %s; %d" % summary_values)

    # Content is stored and extracted only for stories flagged as newly added
    if story.get('is_new', False):
        new_download = create_download_for_new_story(db, story, spider_feed)
        store_content(db, new_download, content)
        _extract_story(story)

    return story
Exemplo n.º 4
0
def test_html_title() -> None:
    """Check html_title() on meta tags, the <title> tag, a real page, fallback, trimming and stripping."""

    # Each case is (positional arguments for html_title(), expected title)
    cases_before_fixture = (
        # various meta tags
        (('Foo<meta name="og:title" content="OG Title" />', 'fb'), 'OG Title'),
        (('Foo<meta name="hdl" content="HDL" />', 'fb'), 'HDL'),
        (('Foo<meta name="twitter:title" content="Twitter Title" />', 'fb'), 'Twitter Title'),
        (('Foo<meta name="dc.title" content="DC Title" />', 'fb'), 'DC Title'),
        (('Foo<meta name="dcterms.title" content="DC Terms Title" />', 'fb'), 'DC Terms Title'),
        (('Foo<meta name="title" content="Title" />', 'fb'), 'Title'),

        # single quotes
        (("Foo<meta name='og:title' content='OG Title' />", 'fb'), 'OG Title'),

        # name -> property
        (('Foo<meta property="hdl" content="HDL" />', 'fb'), 'HDL'),

        # title tag
        (('<title>Title Tag</title>', 'fb'), 'Title Tag'),
    )
    for call_args, expected_title in cases_before_fixture:
        assert html_title(*call_args) == expected_title

    # more complex: a full HTML page read from the test data directory
    root_path = mediawords.util.paths.mc_root_path()
    fixture_path = root_path + '/mediacloud/test-data/html/strip.html'
    with open(fixture_path, mode='r', encoding='utf8') as fixture_file:
        fixture_html = fixture_file.read()

    expected_fixture_title = 'Global Voices - FBI Investigation Helps Uncover Latest Bribery Scandal In Greece'
    assert html_title(fixture_html, 'fb') == expected_fixture_title

    cases_after_fixture = (
        # fallback
        (('There is no title here', 'Fallback Title'), 'Fallback Title'),

        # trim length
        (('<title>Foo Bar</title>', 'fb', 3), 'Foo'),

        # strip
        (('<title><b>Stripped</b> Title</title>', 'fb'), 'Stripped Title'),
    )
    for call_args, expected_title in cases_after_fixture:
        assert html_title(*call_args) == expected_title