def test_guess_date():
    """Test guess_date() on invalid input, a dateable page, and an undateable page.

    Bug fix: the original put all three invalid calls inside a single
    ``with pytest.raises(...)`` block. Inside such a block only the first
    statement that raises ever runs — the remaining calls were dead code and
    their error handling was never actually verified. Each raising call now
    gets its own ``with`` block.
    """
    with pytest.raises(McGuessDateException):
        # noinspection PyTypeChecker
        guess_date(url=None, html=None)

    with pytest.raises(McGuessDateException):
        # noinspection PyTypeChecker
        guess_date(url="https://www.nytimes.com/2017/10/some_news.html", html=None)

    with pytest.raises(McGuessDateException):
        # noinspection PyTypeChecker
        guess_date(url=None, html="Something")

    # Found
    result = guess_date(
        url="https://www.nytimes.com/2017/10/some_news.html",
        html="""
        <html><head>
        <meta property="article:published" itemprop="datePublished" content="2017-10-13T04:56:54-04:00" />
        </head></html>
        """)
    assert result.found is True
    assert result.guess_method.startswith('Extracted from')
    assert result.timestamp == 1507885014
    assert result.date == '2017-10-13T08:56:54'

    # Not found (undateable, even though the date is there in <meta />)
    result = guess_date(
        url="https://en.wikipedia.org/wiki/Progressive_tax",
        html="""
        <html><head>
        <meta property="article:published" itemprop="datePublished" content="2017-10-13T04:56:54-04:00" />
        </head></html>
        """)
    assert result.found is False
    assert result.guess_method is None
    assert result.timestamp is None
    assert result.date is None
def benchmark_date_guessing():
    """Benchmark Python date guessing code.

    Reads every ``*.txt`` file in the directory given as the first CLI
    argument, runs guess_date() on its contents with a dummy URL, and prints
    the file size and the guessed date.

    Exits with a usage message when no directory argument is supplied.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: %s <directory of html files>" % sys.argv[0])

    directory = sys.argv[1]
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith(".txt"):
            # Use a context manager so the handle is closed deterministically
            # (the original opened the file and never closed it).
            with open(os.path.join(directory, filename)) as fh:
                content = fh.read()

            print(filename + ": " + str(len(content)))

            date_guess = guess_date(
                url='http://dont.know.the.date/some/path.html',
                html=content)
            print(date_guess.date)
def generate_story(
        db: DatabaseHandler,
        url: str,
        content: str,
        fallback_date: typing.Optional[datetime.datetime] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from
    the url and content.

    Arguments:
    db - db handle
    url - story url
    content - story content
    fallback_date - fallback to this date if the date guesser fails to find a date

    Returns the newly created story dict.

    Raises McTMStoriesException on an empty url or a failed insert;
    McTMStoriesDuplicateException when the story url already exists.
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    url = url[0:_MAX_URL_LENGTH]

    medium = mediawords.tm.media.guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = mediawords.tm.media.get_spidered_tag(db)
    title = mediawords.util.parse_html.html_title(content, url, _MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    date_guess = guess_date(url, content)
    story['publish_date'] = date_guess.date if date_guess.found else fallback_date
    if story['publish_date'] is None:
        story['publish_date'] = datetime.datetime.now().isoformat()

    try:
        story = db.create('stories', story)
    except mediawords.db.exceptions.handler.McUniqueConstraintException as ex:
        # Chain the cause (`from ex`) so the original constraint failure is
        # preserved in the traceback — the bare re-raise discarded it.
        raise McTMStoriesDuplicateException(
            "Attempt to insert duplicate story url %s" % url) from ex
    except Exception as ex:
        raise McTMStoriesException(
            "Error adding story: %s" % traceback.format_exc()) from ex

    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': spidered_tag['tags_id']})

    assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" %
              (story['title'], story['url'], story['publish_date'], story['stories_id']))

    db.create('feeds_stories_map', {
        'stories_id': story['stories_id'],
        'feeds_id': feed['feeds_id']
    })

    download = create_download_for_new_story(db, story, feed)
    mediawords.dbi.downloads.store_content(db, download, content)

    _extract_story(db, story)

    return story
def generate_story(
        db: DatabaseHandler,
        url: str,
        content: str,
        title: str = None,
        publish_date: datetime.datetime = None,
        fallback_date: typing.Optional[datetime.datetime] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from
    the url and content.

    If inserting the story results in a unique constraint error based on
    media_id and url, return the existing story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    fallback_date - fallback to this date if the date guesser fails to find a date
    """
    # Guard clause: an empty url cannot produce a story.
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    url = url[0:mediawords.dbi.stories.stories.MAX_URL_LENGTH]

    story_medium = mediawords.tm.media.guess_medium(db, url)
    spider_feed = get_spider_feed(db, story_medium)
    spidered_tag = mediawords.tm.media.get_spidered_tag(db)

    if title is None:
        title = mediawords.util.parse_html.html_title(
            content, url, mediawords.dbi.stories.stories.MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': story_medium['media_id'],
        'title': title,
        'description': '',
    }

    # postgres refuses to insert text values with the null character
    story.update({key: re2.sub('\x00', '', story[key]) for key in ('url', 'guid', 'title')})

    if publish_date is not None:
        story['publish_date'] = publish_date
    else:
        # No explicit date given: guess one, fall back to fallback_date, and
        # as a last resort stamp the story with "now".
        date_guess = guess_date(url, content)
        story['publish_date'] = date_guess.date if date_guess.found else fallback_date
        if story['publish_date'] is None:
            story['publish_date'] = datetime.datetime.now().isoformat()

    story = mediawords.dbi.stories.stories.add_story(db, story, spider_feed['feeds_id'])

    # Insert the spidered tag only if this (story, tag) pair is not mapped yet.
    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
                where not exists (
                    select 1
                    from stories_tags_map
                    where stories_id = %(a)s
                      and tags_id = %(b)s
                )
        """,
        {'a': story['stories_id'], 'b': spidered_tag['tags_id']})

    if publish_date is None:
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" %
              (story['title'], story['url'], story['publish_date'], story['stories_id']))

    # Only brand-new stories get a download, stored content and an extraction.
    if story.get('is_new', False):
        download = create_download_for_new_story(db, story, spider_feed)
        mediawords.dbi.downloads.store_content(db, download, content)
        _extract_story(db, story)

    return story
def generate_story(
        db: DatabaseHandler,
        url: str,
        content: str,
        title: str = None,
        publish_date: datetime.datetime = None,
        fallback_date: typing.Optional[datetime.datetime] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from
    the url and content.

    If inserting the story results in a unique constraint error based on
    media_id and url, return the existing story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    fallback_date - fallback to this date if the date guesser fails to find a date

    Raises McTMStoriesException on an empty url or a failed insert.
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    url = url[0:mediawords.dbi.stories.stories.MAX_URL_LENGTH]

    medium = mediawords.tm.media.guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = mediawords.tm.media.get_spidered_tag(db)

    if title is None:
        title = mediawords.util.parse_html.html_title(
            content, url, mediawords.dbi.stories.stories.MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    if publish_date is None:
        date_guess = guess_date(url, content)
        story['publish_date'] = date_guess.date if date_guess.found else fallback_date
        if story['publish_date'] is None:
            story['publish_date'] = datetime.datetime.now().isoformat()
    else:
        story['publish_date'] = publish_date

    try:
        story = db.create('stories', story)
    except mediawords.db.exceptions.handler.McUniqueConstraintException:
        # Duplicate url for this medium: return the already-existing story.
        # NOTE(review): get_story_match() looks like it may return None when
        # no match is found, which would violate the declared `-> dict`
        # return — confirm against its implementation.
        return mediawords.tm.stories.get_story_match(db=db, url=story['url'])
    except Exception as ex:
        # Chain the cause (`from ex`) so the original failure is preserved in
        # the traceback — the bare re-raise discarded it.
        raise McTMStoriesException(
            "Error adding story: %s" % traceback.format_exc()) from ex

    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': spidered_tag['tags_id']})

    if publish_date is None:
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" %
              (story['title'], story['url'], story['publish_date'], story['stories_id']))

    db.create('feeds_stories_map', {
        'stories_id': story['stories_id'],
        'feeds_id': feed['feeds_id']
    })

    download = create_download_for_new_story(db, story, feed)
    mediawords.dbi.downloads.store_content(db, download, content)

    _extract_story(db, story)

    return story