def _create_child_download_for_story(db: DatabaseHandler, story: dict, parent_download: dict) -> None:
    """Create a pending download for the story's URL."""
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    download = {
        'feeds_id': parent_download['feeds_id'],
        'stories_id': story['stories_id'],
        'parent': parent_download['downloads_id'],
        'url': story['url'],
        'host': get_url_host(story['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'pending',
        'priority': parent_download['priority'],
        'extracted': False,
    }

    content_delay = db.query("""
        SELECT content_delay
        FROM media
        WHERE media_id = %(media_id)s
    """, {'media_id': story['media_id']}).flat()[0]

    if content_delay:
        # Delay download of content by this many hours. This is useful for sources that are likely to significantly
        # change content in the hours after it is first published.
        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
        download_at_timestamp = now + (content_delay * 60 * 60)
        download['download_time'] = get_sql_date_from_epoch(download_at_timestamp)

    db.create(table='downloads', insert_hash=download)
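# A quick worked example of the content_delay arithmetic above -- a minimal sketch, assuming a
# hypothetical media source with content_delay = 6 (hours). The stdlib strftime() call below stands
# in for get_sql_date_from_epoch(), which is assumed to render an epoch as a SQL timestamp.
def _example_content_delay_schedule() -> str:
    content_delay = 6  # hours (hypothetical value, for illustration only)
    now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
    download_at_timestamp = now + (content_delay * 60 * 60)  # 6 * 3600 = 21,600 seconds from now

    # Stand-in for get_sql_date_from_epoch(download_at_timestamp):
    return datetime.datetime.fromtimestamp(
        download_at_timestamp, datetime.timezone.utc
    ).strftime('%Y-%m-%d %H:%M:%S')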
async def fetch_store_transcript(self, stories_id: int) -> None:
    log.info(f"Fetching and storing transcript for story {stories_id}...")

    with tempfile.TemporaryDirectory(prefix='fetch_store_transcript') as temp_dir:
        transcript_json_path = os.path.join(temp_dir, 'transcript.json')

        gcs = GCSStore(bucket_config=self.config.transcripts())
        gcs.download_object(object_id=str(stories_id), local_file_path=transcript_json_path)

        with open(transcript_json_path, 'r') as f:
            transcript_json = f.read()

    transcript = Transcript.from_dict(decode_json(transcript_json))

    db = connect_to_db_or_raise()

    story = db.find_by_id(table='stories', object_id=stories_id)

    feed = db.query("""
        SELECT *
        FROM feeds
        WHERE feeds_id = (
            SELECT feeds_id
            FROM feeds_stories_map
            WHERE stories_id = %(stories_id)s
        )
    """, {
        'stories_id': stories_id,
    }).hash()

    # Just like create_download_for_new_story(), this creates a new download, except that it first tests whether
    # such a download already exists
    download = db.find_or_create(
        table='downloads',
        insert_hash={
            'feeds_id': feed['feeds_id'],
            'stories_id': story['stories_id'],
            'url': story['url'],
            'host': get_url_host(story['url']),
            'type': 'content',
            'sequence': 1,
            'state': 'success',
            'path': 'content:pending',
            'priority': 1,
            'extracted': 'f',
        },
    )

    text = transcript.download_text_from_transcript()

    # Store as a raw download and then let the "extract-and-vector" app "extract" the stored text later
    store_content(db=db, download=download, content=text)

    log.info(f"Done fetching and storing transcript for story {stories_id}")
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack.

    Stores the content in the download store. Uses the story->{ content } field if present or otherwise generates
    the content using _get_test_content()."""
    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    if 'content' in story:
        content = story['content']
    else:
        content = _get_test_content()

    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={'full_text_rss': False},
        )

    host = get_url_host(feed['url'])

    download = db.create(
        table='downloads',
        insert_hash={
            'feeds_id': feed['feeds_id'],
            'url': story['url'],
            'host': host,
            'type': 'content',
            'sequence': 1,
            'state': 'fetching',
            'priority': 1,
            'extracted': False,
            'stories_id': story['stories_id'],
        }
    )

    download = store_content(db=db, download=download, content=content)

    story['download'] = download
    story['content'] = content

    extract_and_process_story(db=db, story=story)

    story['download_text'] = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']}).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
def create_download_for_story(db: DatabaseHandler, feed: dict, story: dict) -> dict:
    feed = decode_object_from_bytes_if_needed(feed)
    story = decode_object_from_bytes_if_needed(story)

    host = get_url_host(url=feed['url'])

    return db.create(table='downloads', insert_hash={
        'feeds_id': feed['feeds_id'],
        'url': story['url'],
        'host': host,
        'type': 'content',
        'sequence': 1,
        'state': 'success',
        'priority': 1,
        'extracted': False,
        'path': 'postgresql:foo',
        'stories_id': story['stories_id'],
    })
def create_download_for_new_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Create and return download object in database for the new story."""
    download = {
        'feeds_id': feed['feeds_id'],
        'stories_id': story['stories_id'],
        'url': story['url'],
        'host': get_url_host(story['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'success',
        'path': 'content:pending',
        'priority': 1,
        'extracted': 'f',
    }

    download = db.create('downloads', download)

    return download
def create_download_for_feed(db: DatabaseHandler, feed: dict) -> dict:
    feed = decode_object_from_bytes_if_needed(feed)

    priority = 0
    if 'last_attempted_download_time' not in feed:
        priority = 10

    host = get_url_host(url=feed['url'])

    return db.create(table='downloads', insert_hash={
        'feeds_id': int(feed['feeds_id']),
        'url': feed['url'],
        'host': host,
        'type': 'feed',
        'sequence': 1,
        'state': 'pending',
        'priority': priority,
        'download_time': 'NOW()',
        'extracted': False,
    })
def __get_url_domain(url_: str) -> str:
    if not is_http_url(url_):
        return url_

    host = get_url_host(url_)

    name_parts = host.split('.')
    n = len(name_parts) - 1

    # For country domains, use the last three parts of the name
    if re.search(pattern=r"\...$", string=host):
        domain = '.'.join([name_parts[n - 2], name_parts[n - 1], name_parts[n]])
    elif re.search(pattern=r"(localhost|blogspot\.com|wordpress\.com)", string=host):
        domain = url_
    else:
        domain = '.'.join([name_parts[n - 1], name_parts[n]])

    return domain.lower()
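# A quick sanity check of the branches in __get_url_domain() above -- a minimal sketch; the sample
# URLs are hypothetical and chosen only to exercise each branch.
def _example_url_domain_branches() -> None:
    # Two-letter country TLD: keep the last three name parts
    assert __get_url_domain('http://www.bbc.co.uk/news') == 'bbc.co.uk'

    # Blogging platforms: the full URL is treated as the domain
    assert __get_url_domain('http://foo.blogspot.com/post') == 'http://foo.blogspot.com/post'

    # Default: registered name plus TLD
    assert __get_url_domain('http://www.nytimes.com/') == 'nytimes.com'

    # Non-HTTP input passes through unchanged
    assert __get_url_domain('not a url') == 'not a url'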
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack.

    Stores the content in the download store. Uses the story->{ content } field if present or otherwise generates
    the content using _get_test_content()."""
    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    content_language_code = None
    if 'content' in story:
        content = story['content']
        content_language_code = language_code_for_text(content)
    else:
        content = _get_test_content()

    # If language code was undetermined, or if we're using Latin test content
    if not content_language_code:
        content_language_code = 'en'

    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={
                'full_text_rss': False,
                'language': content_language_code,
            },
        )

    host = get_url_host(feed['url'])

    download = db.create(table='downloads', insert_hash={
        'feeds_id': feed['feeds_id'],
        'url': story['url'],
        'host': host,
        'type': 'content',
        'sequence': 1,
        'state': 'fetching',
        'priority': 1,
        'extracted': True,
        'stories_id': story['stories_id'],
    })

    download = store_content(db=db, download=download, content=content)

    extracted_content = html_strip(content)

    story['download'] = download
    story['content'] = extracted_content

    db.query("""
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s))
    """, {
        'downloads_id': download['downloads_id'],
        'download_text': extracted_content,
    })

    lang = LanguageFactory.language_for_code(content_language_code)
    assert lang, f"Language is None for code {content_language_code}"

    sentences = lang.split_text_to_sentences(extracted_content)

    sentence_number = 1
    for sentence in sentences:
        db.insert(table='story_sentences', insert_hash={
            'sentence': sentence,
            'language': language_code_for_text(sentence) or 'en',
            'sentence_number': sentence_number,
            'stories_id': story['stories_id'],
            'media_id': story['media_id'],
            'publish_date': story['publish_date'],
        })
        sentence_number += 1

    mark_as_processed(db=db, stories_id=story['stories_id'])

    story['download_text'] = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']}).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
def test_get_url_host():
    with pytest.raises(mc_url.McGetURLHostException):
        # noinspection PyTypeChecker
        mc_url.get_url_host(None)

    assert mc_url.get_url_host('http://www.nytimes.com/') == 'www.nytimes.com'
    assert mc_url.get_url_host('http://*****:*****@WHITEHOUSE.GOV/michelle.html') == 'whitehouse.gov'