Example #1
def _create_child_download_for_story(db: DatabaseHandler, story: dict, parent_download: dict) -> None:
    """Create a pending download for the story's URL."""
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    download = {
        'feeds_id': parent_download['feeds_id'],
        'stories_id': story['stories_id'],
        'parent': parent_download['downloads_id'],
        'url': story['url'],
        'host': get_url_host(story['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'pending',
        'priority': parent_download['priority'],
        'extracted': False,
    }

    content_delay = db.query("""
        SELECT content_delay
        FROM media
        WHERE media_id = %(media_id)s
    """, {'media_id': story['media_id']}).flat()[0]
    if content_delay:
        # Delay the download of content by this many hours. This is useful for sources that are likely to significantly
        # change content in the hours after it is first published.
        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
        download_at_timestamp = now + (content_delay * 60 * 60)
        download['download_time'] = get_sql_date_from_epoch(download_at_timestamp)

    db.create(table='downloads', insert_hash=download)
Example #2
    async def fetch_store_transcript(self, stories_id: int) -> None:
        """Fetch the story's transcript from GCS and store its text as a raw download for later extraction."""

        log.info(f"Fetching and storing transcript for story {stories_id}...")

        with tempfile.TemporaryDirectory(
                prefix='fetch_store_transcript') as temp_dir:
            transcript_json_path = os.path.join(temp_dir, 'transcript.json')

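            # Fetch the raw transcript JSON from Google Cloud Storage into a temporary file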
            gcs = GCSStore(bucket_config=self.config.transcripts())
            gcs.download_object(object_id=str(stories_id),
                                local_file_path=transcript_json_path)

            with open(transcript_json_path, 'r') as f:
                transcript_json = f.read()

        transcript = Transcript.from_dict(decode_json(transcript_json))

        db = connect_to_db_or_raise()

        story = db.find_by_id(table='stories', object_id=stories_id)

        feed = db.query(
            """
            SELECT *
            FROM feeds
            WHERE feeds_id = (
                SELECT feeds_id
                FROM feeds_stories_map
                WHERE stories_id = %(stories_id)s
            )
        """, {
                'stories_id': stories_id,
            }).hash()

        # Just like create_download_for_new_story(), this creates a new download, except that it first checks whether
        # such a download already exists
        download = db.find_or_create(
            table='downloads',
            insert_hash={
                'feeds_id': feed['feeds_id'],
                'stories_id': story['stories_id'],
                'url': story['url'],
                'host': get_url_host(story['url']),
                'type': 'content',
                'sequence': 1,
                'state': 'success',
                'path': 'content:pending',
                'priority': 1,
                'extracted': 'f'
            },
        )

        text = transcript.download_text_from_transcript()

        # Store as a raw download and then let "extract-and-vector" app "extract" the stored text later
        store_content(db=db, download=download, content=text)

        log.info(
            f"Done fetching and storing transcript for story {stories_id}")
Example #3
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack. Stores the content in the download
    store. Uses the story->{ content } field if present or otherwise generates the content using _get_test_content()."""

    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    if 'content' in story:
        content = story['content']
    else:
        content = _get_test_content()

    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={'full_text_rss': False},
        )

    host = get_url_host(feed['url'])

    download = db.create(
        table='downloads',
        insert_hash={
            'feeds_id': feed['feeds_id'],
            'url': story['url'],
            'host': host,
            'type': 'content',
            'sequence': 1,
            'state': 'fetching',
            'priority': 1,
            'extracted': False,
            'stories_id': story['stories_id'],
        }
    )

    download = store_content(db=db, download=download, content=content)

    story['download'] = download
    story['content'] = content

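    # Run the regular extraction pipeline so that a download_texts row gets created for the new download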
    extract_and_process_story(db=db, story=story)

    story['download_text'] = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']}).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
Example #4
def create_download_for_story(db: DatabaseHandler, feed: dict,
                              story: dict) -> dict:
    """Create and return a 'content' download row for the story."""
    feed = decode_object_from_bytes_if_needed(feed)
    story = decode_object_from_bytes_if_needed(story)

    host = get_url_host(url=feed['url'])

    return db.create(table='downloads',
                     insert_hash={
                         'feeds_id': feed['feeds_id'],
                         'url': story['url'],
                         'host': host,
                         'type': 'content',
                         'sequence': 1,
                         'state': 'success',
                         'priority': 1,
                         'extracted': False,
                         'path': 'postgresql:foo',
                         'stories_id': story['stories_id'],
                     })
Example #5
def create_download_for_new_story(db: DatabaseHandler, story: dict,
                                  feed: dict) -> dict:
    """Create and return download object in database for the new story."""

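    # The download is created in the 'success' state with a placeholder path of 'content:pending' because the story's
    # content gets stored directly (as with store_content() in fetch_store_transcript() above) rather than fetched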
    download = {
        'feeds_id': feed['feeds_id'],
        'stories_id': story['stories_id'],
        'url': story['url'],
        'host': get_url_host(story['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'success',
        'path': 'content:pending',
        'priority': 1,
        'extracted': 'f'
    }

    download = db.create('downloads', download)

    return download
Example #6
def create_download_for_feed(db: DatabaseHandler, feed: dict) -> dict:
    """Create and return a pending 'feed' download for the feed."""
    feed = decode_object_from_bytes_if_needed(feed)

    priority = 0
    if 'last_attempted_download_time' not in feed:
        priority = 10

    host = get_url_host(url=feed['url'])

    return db.create(table='downloads',
                     insert_hash={
                         'feeds_id': int(feed['feeds_id']),
                         'url': feed['url'],
                         'host': host,
                         'type': 'feed',
                         'sequence': 1,
                         'state': 'pending',
                         'priority': priority,
                         'download_time': 'NOW()',
                         'extracted': False,
                     })
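
# A minimal usage sketch (illustrative, not from the original source). Assumes the standard
# mediawords.db.connect_to_db() helper and at least one row in the "feeds" table.
from mediawords.db import connect_to_db

db = connect_to_db()
feed = db.query("SELECT * FROM feeds LIMIT 1").hash()

download = create_download_for_feed(db=db, feed=feed)

# Feed downloads start out in the 'pending' state
assert download['state'] == 'pending'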
Example #7
    def __get_url_domain(url_: str) -> str:
        """Return the URL's domain: the last two host labels, three for two-letter country TLDs, or the whole URL for
        hosting platforms such as Blogspot and WordPress."""

        if not is_http_url(url_):
            return url_

        host = get_url_host(url_)

        name_parts = host.split('.')

        n = len(name_parts) - 1

        # for country domains, use last three parts of name
        if re.search(pattern=r"\...$", string=host):
            domain = '.'.join([name_parts[n - 2], name_parts[n - 1], name_parts[n]])

        elif re.search(pattern=r"(localhost|blogspot\.com|wordpress\.com)", string=host):
            domain = url_

        else:
            domain = '.'.join([name_parts[n - 1], name_parts[n]])

        return domain.lower()
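
# A self-contained sketch of the label arithmetic above (sample hosts and expected values are illustrative, not taken
# from the original source): hosts ending in a two-letter country TLD keep their last three labels, others keep two.
import re

for sample_host, expected_domain in [
    ('www.nytimes.com', 'nytimes.com'),
    ('www.bbc.co.uk', 'bbc.co.uk'),
]:
    parts = sample_host.split('.')
    n = len(parts) - 1
    if re.search(r"\...$", sample_host):
        domain = '.'.join([parts[n - 2], parts[n - 1], parts[n]])
    else:
        domain = '.'.join([parts[n - 1], parts[n]])
    assert domain == expected_domain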
Example #8
def add_content_to_test_story(db: DatabaseHandler, story: dict,
                              feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack. Stores the content in the download
    store. Uses the story->{ content } field if present or otherwise generates the content using _get_test_content()."""

    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    content_language_code = None
    if 'content' in story:
        content = story['content']
        content_language_code = language_code_for_text(content)
    else:
        content = _get_test_content()

    # If the language code could not be determined (e.g. for the generated Latin test content), default to English
    if not content_language_code:
        content_language_code = 'en'

    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={
                'full_text_rss': False,
                'language': content_language_code,
            },
        )

    host = get_url_host(feed['url'])

    download = db.create(table='downloads',
                         insert_hash={
                             'feeds_id': feed['feeds_id'],
                             'url': story['url'],
                             'host': host,
                             'type': 'content',
                             'sequence': 1,
                             'state': 'fetching',
                             'priority': 1,
                             'extracted': True,
                             'stories_id': story['stories_id'],
                         })

    download = store_content(db=db, download=download, content=content)

    extracted_content = html_strip(content)

    story['download'] = download
    story['content'] = extracted_content

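    # Simulate the extractor's output by writing the stripped HTML straight into download_texts instead of running the
    # full extraction pipeline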
    db.query(
        """
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s))
    """, {
            'downloads_id': download['downloads_id'],
            'download_text': extracted_content,
        })

    lang = LanguageFactory.language_for_code(content_language_code)
    assert lang, f"Language is None for code {content_language_code}"

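    # Split the extracted text into sentences and store them in story_sentences, as the extract-and-vector pipeline
    # normally would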
    sentences = lang.split_text_to_sentences(extracted_content)
    sentence_number = 1
    for sentence in sentences:
        db.insert(table='story_sentences',
                  insert_hash={
                      'sentence': sentence,
                      'language': language_code_for_text(sentence) or 'en',
                      'sentence_number': sentence_number,
                      'stories_id': story['stories_id'],
                      'media_id': story['media_id'],
                      'publish_date': story['publish_date'],
                  })
        sentence_number += 1

    mark_as_processed(db=db, stories_id=story['stories_id'])

    story['download_text'] = db.query(
        """
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {
            'downloads_id': download['downloads_id']
        }).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
Example #9
def test_get_url_host():
    with pytest.raises(mc_url.McGetURLHostException):
        # noinspection PyTypeChecker
        mc_url.get_url_host(None)
    assert mc_url.get_url_host('http://www.nytimes.com/') == 'www.nytimes.com'
    assert mc_url.get_url_host('http://*****:*****@WHITEHOUSE.GOV/michelle.html') == 'whitehouse.gov'