def test_multinational_flag_is_set_correctly(self):
    """The multinational flag is False for one institute and True for several."""
    matching_score = np.float64(1.21111)
    article = 2

    # a single institute id must not be flagged as multinational
    single = create_article_institute_links(article, ['a'], matching_score)
    assert single[0]['is_multinational'] is False

    # more than one institute id must be flagged as multinational
    several = create_article_institute_links(article, ['a', 'b', 'c'],
                                             matching_score)
    assert several[0]['is_multinational'] is True
def test_multiple_results_are_returned_for_multinationals(self):
    """Three institute ids produce three links."""
    matching_score = np.float64(1.31111)
    institute_ids = ['a', 'b', 'c']

    result = create_article_institute_links(2, institute_ids, matching_score)

    assert len(result) == 3
def test_data_is_returned_in_correct_format(self):
    """A single institute yields one dict carrying all four expected keys."""
    expected = [
        {
            'article_id': 1,
            'institute_id': 'a',
            'is_multinational': False,
            'matching_score': 1.11111,
        },
    ]

    result = create_article_institute_links(1, ['a'], np.float64(1.11111))

    assert result == expected
def test_create_article_institute_links_converts_score_to_float(self):
    """A numpy float score is stored as a builtin float in the link."""
    numpy_score = np.float64(1.11111)

    result = create_article_institute_links(1, ['a'], numpy_score)
    stored_score = result[0]['matching_score']

    # exact type check on purpose: np.float64 subclasses float, so an
    # isinstance check would pass even if no conversion had happened
    assert type(stored_score) is float
def run(self):
    """Link articles to GRID institutes using their MAG author data.

    Articles are selected when they have `mag_authors`, no institute links
    and `institute_match_attempted` is False. For every author of such an
    article the first strategy that succeeds is used:

    1. the author's `affiliation_grid_id`, when it is a known institute id
       (score 1);
    2. an exact lookup of `author_affiliation` in the institute name
       lookup (score 1);
    3. a fuzzy match of `author_affiliation` against the institute names
       (score as returned by the fuzzer).

    Institute ids already linked to the article by a previous author are
    not linked again. Links are queued on a batch writer and every
    processed article is queued to be marked with
    `institute_match_attempted=True`. In test mode the 'dev' database is
    used and processing stops after 50 articles.
    """
    # database setup
    database = 'dev' if self.test else 'production'
    logging.info(f"Using {database} database")
    self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
    Base.metadata.create_all(self.engine)

    article_institute_batcher = BatchWriter(self.insert_batch_size,
                                            add_article_institutes,
                                            self.engine)
    match_attempted_batcher = BatchWriter(self.insert_batch_size,
                                          update_existing_articles,
                                          self.engine)

    fuzzer = ComboFuzzer([fuzz.token_sort_ratio, fuzz.partial_ratio],
                         store_history=True)

    # extract lookup of GRID institute names to ids - seems to be OK to hold in memory
    institute_name_id_lookup = grid_name_lookup(self.engine)

    with db_session(self.engine) as session:
        # used to check GRID ids from MAG are valid (they are not all...)
        all_grid_ids = {i.id for i in session.query(Institute.id).all()}
        logging.info(f"{len(all_grid_ids)} institutes in GRID")

        article_query = (session
                         .query(Article.id, Article.mag_authors)
                         .filter(Article.institute_match_attempted.is_(False)
                                 & ~Article.institutes.any()
                                 & Article.mag_authors.isnot(None)))
        total = article_query.count()
        logging.info(
            f"Total articles with authors and no institutes links: {total}"
        )

        logging.debug("Starting the matching process")
        # only column values are selected, so the rows stay usable once the
        # session has been closed
        articles = article_query.all()

    for count, article in enumerate(articles, start=1):
        article_institute_links = []
        for author in article.mag_authors:
            # prevent duplicates when a mixture of institute aliases are used in the same article
            existing_article_institute_ids = {
                link['institute_id'] for link in article_institute_links
            }

            # extract and validate grid_id
            try:
                extracted_grid_id = author['affiliation_grid_id']
            except KeyError:
                pass
            else:
                # check grid id is valid
                if (extracted_grid_id in all_grid_ids
                        and extracted_grid_id not in existing_article_institute_ids):
                    links = create_article_institute_links(
                        article_id=article.id,
                        institute_ids=[extracted_grid_id],
                        score=1)
                    article_institute_links.extend(links)
                    logging.debug(f"Used grid_id: {extracted_grid_id}")
                    continue
                # an unknown or already linked grid id falls through to
                # the affiliation name matching below

            # extract author affiliation
            try:
                affiliation = author['author_affiliation']
            except KeyError:
                # no grid id or affiliation for this author
                logging.debug(f"No affiliation found in: {author}")
                continue

            # look for an exact match on affiliation name
            try:
                institute_ids = institute_name_id_lookup[affiliation]
            except KeyError:
                pass
            else:
                institute_ids = set(institute_ids) - existing_article_institute_ids
                links = create_article_institute_links(
                    article_id=article.id,
                    institute_ids=institute_ids,
                    score=1)
                article_institute_links.extend(links)
                logging.debug(f"Found an exact match for: {affiliation}")
                continue

            # fuzzy matching
            try:
                match, score = fuzzer.fuzzy_match_one(
                    affiliation, institute_name_id_lookup.keys())
            except KeyError:
                # failed fuzzy match
                logging.debug(f"Failed fuzzy match: {affiliation}")
            else:
                institute_ids = institute_name_id_lookup[match]
                institute_ids = set(institute_ids) - existing_article_institute_ids
                links = create_article_institute_links(
                    article_id=article.id,
                    institute_ids=institute_ids,
                    score=score)
                article_institute_links.extend(links)
                logging.debug(
                    f"Found a fuzzy match: {affiliation} {score} {match}"
                )

        # add links for this article to the batch queue
        article_institute_batcher.extend(article_institute_links)
        # mark that matching has been attempted for this article
        match_attempted_batcher.append(
            dict(id=article.id, institute_match_attempted=True))

        if not count % 100:
            logging.info(
                f"{count} processed articles from {total} : {(count / total) * 100:.1f}%"
            )

        if self.test and count == 50:
            logging.warning("Exiting after 50 articles in test mode")
            logging.debug(article_institute_batcher)
            break

    # pick up any left over in the batches
    if article_institute_batcher:
        article_institute_batcher.write()
    if match_attempted_batcher:
        match_attempted_batcher.write()

    logging.info("All articles processed")
    logging.info(
        f"Total successful fuzzy matches for institute names: {len(fuzzer.successful_fuzzy_matches)}"
    )
    # the colon belongs to the message text, not to the format spec of the
    # replacement field, so this line reads like the one above
    logging.info(
        f"Total failed fuzzy matches for institute names: {len(fuzzer.failed_fuzzy_matches)}"
    )

    # mark as done
    logging.info("Task complete")
    self.output().touch()