def test_batched_titles_generates_title_id_lookup(self,
                                                  mocked_split_batches,
                                                  mocked_prepare_title,
                                                  mocked_articles):
    """Article ids whose prepared titles collide are grouped under that title.

    Six articles are served in a single batch; `prepare_title` is mocked to
    return three distinct titles across them, and the batcher is then
    expected to expose the ids for each title via item access.
    """
    article_ids = [1, 2, 3, 4, 5, 6]
    # one prepared title per article, in the order prepare_title is called
    prepared_titles = ('clean title A', 'clean title B', 'clean title B',
                       'clean title C', 'clean title A', 'clean title B')
    expected_lookup = {
        'clean title A': [1, 5],
        'clean title B': [2, 3, 6],
        'clean title C': [4],
    }

    # split_batches is mocked to hand back a single batch holding every id
    mocked_split_batches.return_value = iter([list(article_ids)])

    # each call to .all() returns the next element of this list
    query_results = [
        mocked_articles([{'id': article_id, 'title': 'dummy_title'}
                         for article_id in range(1, 7)])
    ]
    session = mock.Mock()
    session.query().filter().all.side_effect = query_results
    mocked_prepare_title.side_effect = prepared_titles

    batcher = BatchedTitles(article_ids, batch_size=3, session=session)
    list(batcher)  # consume the batcher before inspecting the lookup

    for title, expected_ids in expected_lookup.items():
        assert expected_ids == batcher[title]
def test_batched_titles_returns_all_prepared_titles(self,
                                                    mocked_split_batches,
                                                    mocked_prepare_title,
                                                    mocked_articles):
    """Iterating the batcher over two batches yields every prepared title."""
    id_batches = [[1, 2, 3], [4, 5, 6]]
    suffix_by_id = dict(zip(range(1, 7), 'ABCDEF'))
    expected_titles = [f'prepared title {suffix}' for suffix in 'ABCDEF']

    # an iterator stands in for the generator returned by split_batches
    mocked_split_batches.return_value = iter([list(batch) for batch in id_batches])

    # one query result per batch; .all() returns them in order
    query_results = [
        mocked_articles([{'id': article_id, 'title': f'title {suffix_by_id[article_id]}'}
                         for article_id in batch])
        for batch in id_batches
    ]
    session = mock.Mock()
    session.query().filter().all.side_effect = query_results
    mocked_prepare_title.side_effect = tuple(expected_titles)

    batcher = BatchedTitles([1, 2, 3, 4, 5, 6], batch_size=3, session=session)

    # sorted so the assertion does not depend on iteration order
    result = sorted(batcher)
    assert result == expected_titles
def test_batched_titles_calls_split_batches_correctly(self,
                                                      mocked_split_batches,
                                                      mocked_prepare_title,
                                                      mocked_articles):
    """split_batches receives the article ids and batch size, exactly once."""
    article_ids = [1, 2, 3, 4]
    batch_size = 2
    expected_calls = [mock.call([1, 2, 3, 4], 2)]

    # what the mocked collaborators return is irrelevant to this test;
    # they only need to let the iteration run to completion
    mocked_split_batches.return_value = iter([[1, 2, 3, 4, 5, 6]])
    session = mock.Mock()
    session.query().filter().all.return_value = mocked_articles(
        [{'id': 1, 'title': 'dummy_title'}])
    mocked_prepare_title.return_value = 'clean title A'

    batcher = BatchedTitles(article_ids, batch_size=batch_size, session=session)
    list(batcher)

    assert mocked_split_batches.mock_calls == expected_calls
def run(self):
    """Look up articles without fields of study in MAG and queue the updates.

    Steps, as performed below:
      1. collect the ids of all articles that have no fields of study
      2. query the MAG API by title, one expression at a time
      3. dedupe the returned entities and rename their fields
      4. copy each entity onto every article id that shares its title
      5. make sure every referenced field of study exists in the database
      6. pass the article data to a BatchWriter, write any remainder and
         touch the task output

    When `self.test` is set the 'dev' database is used and processing stops
    after 2 batches.
    """
    pp = pprint.PrettyPrinter(indent=4, width=100)

    mag_config = misctools.get_config(self.mag_config_path, 'mag')
    mag_subscription_key = mag_config['subscription_key']

    # database setup
    database = 'dev' if self.test else 'production'
    logging.warning(f"Using {database} database")
    self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
    Base.metadata.create_all(self.engine)

    with db_session(self.engine) as session:
        # attributes requested from MAG for each paper
        paper_fields = [
            "Id", "Ti", "F.FId", "CC", "AA.AuN", "AA.AuId", "AA.AfN",
            "AA.AfId", "AA.S"
        ]

        # MAG attribute code -> key used in the article data
        author_mapping = {
            'AuN': 'author_name',
            'AuId': 'author_id',
            'AfN': 'author_affiliation',
            'AfId': 'author_affiliation_id',
            'S': 'author_order'
        }
        field_mapping = {
            'Id': 'mag_id',
            'Ti': 'title',
            'F': 'fields_of_study',
            'AA': 'mag_authors',
            'CC': 'citation_count',
            'logprob': 'mag_match_prob'
        }

        logging.info(
            "Querying database for articles without fields of study")
        arxiv_ids_to_process = {
            a.id
            for a in (session.query(Article).filter(
                ~Article.fields_of_study.any()).all())
        }
        total_arxiv_ids_to_process = len(arxiv_ids_to_process)
        logging.info(f"{total_arxiv_ids_to_process} articles to process")

        all_articles_to_update = BatchWriter(self.insert_batch_size,
                                             update_existing_articles,
                                             self.engine)

        batched_titles = BatchedTitles(arxiv_ids_to_process, 10000, session)

        for count, expr in enumerate(build_expr(batched_titles, 'Ti'), 1):
            logging.debug(pp.pformat(expr))
            # the number of comma-separated terms is used as the title count
            # NOTE(review): presumably one term per title - confirm
            expr_length = len(expr.split(','))
            logging.info(f"Querying MAG for {expr_length} titles")
            total_arxiv_ids_to_process -= expr_length
            batch_data = query_mag_api(expr, paper_fields,
                                       mag_subscription_key)
            logging.debug(pp.pformat(batch_data))

            returned_entities = batch_data['entities']
            logging.info(
                f"{len(returned_entities)} entities returned from MAG (potentially including duplicates)"
            )

            # dedupe response keeping the entity with the highest logprob
            deduped_mag_ids = dedupe_entities(returned_entities)
            logging.info(
                f"{len(deduped_mag_ids)} entities after deduplication")

            missing_articles = expr_length - len(deduped_mag_ids)
            if missing_articles != 0:
                logging.info(f"{missing_articles} titles not found in MAG")

            batch_article_data = []
            for row in returned_entities:
                # exclude duplicate titles
                if row['Id'] not in deduped_mag_ids:
                    continue

                # renaming and reformatting; codes absent from the
                # response are simply skipped
                for code, description in field_mapping.items():
                    try:
                        row[description] = row.pop(code)
                    except KeyError:
                        pass

                for author in row.get('mag_authors', []):
                    for code, description in author_mapping.items():
                        try:
                            author[description] = author.pop(code)
                        except KeyError:
                            pass

                if row.get('citation_count', None) is not None:
                    row['citation_count_updated'] = date.today()

                # reformat fos_ids out of dictionaries
                try:
                    row['fields_of_study'] = {
                        f['FId']
                        for f in row.pop('fields_of_study')
                    }
                except KeyError:
                    row['fields_of_study'] = []

                # get list of ids which share the same title
                try:
                    matching_articles = batched_titles[row['title']]
                except KeyError:
                    logging.warning(
                        f"Returned title not found in original data: {row['title']}"
                    )
                    continue

                # drop unnecessary fields; pop with a default so an entity
                # lacking a field we discard anyway cannot abort the run
                for unwanted_field in ('prob', 'title'):
                    row.pop(unwanted_field, None)

                # add each matching article for this title to the batch
                for article_id in matching_articles:
                    batch_article_data.append({**row, 'id': article_id})

            # check fields of study are in database; only ids attached to
            # articles that made it into this batch are considered
            batch_field_of_study_ids = {
                fos_id
                for article in batch_article_data
                for fos_id in article['fields_of_study']
            }
            logging.debug('Checking fields of study exist in db')
            found_fos_ids = {
                fos.id
                for fos in (session.query(FieldOfStudy).filter(
                    FieldOfStudy.id.in_(batch_field_of_study_ids)).all())
            }

            missing_fos_ids = batch_field_of_study_ids - found_fos_ids
            if missing_fos_ids:
                # query mag for details if not found
                update_field_of_study_ids(mag_subscription_key, session,
                                          missing_fos_ids)

            # add this batch to the queue
            all_articles_to_update.extend(batch_article_data)

            logging.info(
                f"Batch {count} done. {total_arxiv_ids_to_process} articles left to process"
            )
            if self.test and count == 2:
                logging.warning("Exiting after 2 batches in test mode")
                break

        # pick up any left over in the batch
        if all_articles_to_update:
            all_articles_to_update.write()

    # mark as done
    logging.warning("Task complete")
    self.output().touch()