def replace_dict_with_text(json_path):
    """Replace each paper's metadata dict with just its full text, in place on disk."""
    json_file = igem.get_json(json_path)
    for doi, paper_dict in json_file.items():
        # Overwriting values for existing keys is safe while iterating.
        json_file[doi] = paper_dict['originalText']
    # Save back to the same path it was loaded from.
    igem.save_json(json_path, json_file)
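# The `igem` helpers used throughout are assumed to be thin wrappers around the
# standard json module. A minimal sketch of that assumption (not necessarily
# the project's actual implementation):
#
#   import json
#
#   def get_json(path):
#       with open(path) as fp:
#           return json.load(fp)
#
#   def save_json(path, data):
#       with open(path, 'w') as fp:
#           json.dump(data, fp)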
def loop(subset, outfile):
    """Fetch the full text for every PMID in `subset`, resuming from `outfile` if it exists."""
    if os.path.isfile(outfile):
        fulltext_dict = igem.get_json(outfile)
    else:
        fulltext_dict = {}

    # Hyperparameter: maximum number of calls (-1 disables the limit).
    max_calls = -1

    # Stats
    calls = 0
    fails = 0
    not_oa = 0  # never incremented in this loop; kept for the stats below
    successes_or_found = 0
    queries = 0
    lenn = len(subset)

    try:
        # Loop through the quantify_dataset output JSON.
        for pmid, metadata in subset.items():
            # Don't go over the call limit (-1 means unlimited).
            if calls == max_calls:
                print(f"[{num_run}] Query limit reached.", end='\r')
                break
            calls += 1

            # Temporary resume hack from an earlier run; leave commented out.
            # if calls < 6936:
            #     continue

            # Skip papers that were successfully fetched on a previous run.
            if pmid in fulltext_dict:
                successes_or_found += 1
                print(f"[{num_run}] ## Call {calls} found.", end=' #########\r')
                continue

            # THE FETCH
            fullpaper = get_paper(pmid, metadata)
            if fullpaper:
                fulltext_dict[pmid] = fullpaper
                successes_or_found += 1
                queries += 1
                print(
                    f"[{num_run}] Call {calls} success. "
                    f"{round(calls / lenn * 100, 2)}% done. "
                    f"{round(successes_or_found / (calls - not_oa) * 100, 2)}% successful.",
                    end=' #########\r')
            else:
                fails += 1
                print(
                    f"[{num_run}] Call {calls} failed. "
                    f"{round(calls / lenn * 100, 2)}% done. "
                    f"{round(successes_or_found / (calls - not_oa) * 100, 2)}% successful.",
                    end=' #########\r')
    except KeyboardInterrupt:
        pass

    # Save to file.
    igem.save_json(outfile, fulltext_dict)

    # Print stats.
    print("")
    print("###### STATS ######")
    print(f"Total calls: {calls}")
    print(f"Total number of queries: {queries}")
    print(f"Total number of open-access papers: {calls - not_oa}")
    print(f"Number of non-open-access papers skipped: {not_oa}")
    print(f"Number of fetch failures: {fails}")
    print(f"Papers in storage: {len(fulltext_dict)}")
    if calls - not_oa > 0:
        print(f"% of success: {successes_or_found / (calls - not_oa) * 100}%")
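# `get_paper` is defined elsewhere in the project; `loop` only assumes that it
# takes a PMID plus its metadata and returns the paper's full text (truthy) on
# success and a falsy value on failure. A hypothetical stub, useful only for
# exercising `loop` without network access (not the real fetcher):
#
#   def get_paper(pmid, metadata):
#       return None  # always takes the failure branch of `loop`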
f"[{num_run}] Call {calls} success. {round(calls / lenn * 100, 2)}% done. {round(successes_or_found / (calls-not_oa) * 100, 2)}% successful.", end=' #########\r') else: fails += 1 print( f"[{num_run}] Call {calls} failed. {round(calls / lenn * 100, 2)}% done. {round(successes_or_found / (calls-not_oa) * 100, 2)}% successful.", end=' #########\r') # if calls % 25 == 0: # print(f'[{num_run}] ############# {successes_or_found} successes, {lenn - calls} left, {round(successes_or_found / (calls-not_oa) * 100, 2)}% successful.', end=' ####\r') except KeyboardInterrupt: pass # save to file igem.save_json(outfile, fulltext_dict) # Print Stats print("") print("###### STATS ######") print(f"Total calls: {calls}") print(f"Total number of queries: {queries}") print(f"Total number of Elsevier papers: {calls - not_oa}") print(f"Number of Non-Elsevier papers skipped: {not_oa}") print(f"Number of fetch failures: {fails}") print(f"Papers in storage: {len(fulltext_dict)}") print(f"% of success: {successes_or_found / (calls-not_oa) * 100}%") ''' RUN ''' loop(igem.get_json(in_file), out_file)
# Set up: take the input path to load the JSON from, and prep a CSV to write to.
input_path = sys.argv[1]
# out = sys.argv[2]  # uncomment (and use `out` in the format call below) to control the output file name
out_name = input_path.split("/")[-1].split(".")[0]
csv_file = 'output_ner/{}_{}.csv'.format("sentence_annotations", out_name)

if not os.path.exists('output_ner'):
    os.makedirs('output_ner')

# Read in the JSON, failing loudly on a bad path.
if os.path.exists(input_path):
    text_files = igem.get_json(input_path)
else:
    raise FileNotFoundError(f"supply correct input file: {input_path}")

# If there's a cache, reopen it; keys are chemical names, values are SMILES strings.
smiles_cache = {}
cache_name = 'smiles_cache.json'  # IF RUNNING IN PARALLEL: change to f"smiles_cache_{out_name}.json"
if os.path.exists(cache_name):
    smiles_cache = igem.get_json(cache_name)

# Initiate the tagger.
cpt = ChemCrfPosTagger()

# Tracking counts, for monitoring runs.
count = 0
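# A minimal sketch of the caching pattern the setup above enables, assuming a
# hypothetical `resolve_smiles(name)` lookup (e.g. a PubChem or OPSIN query);
# the real resolution code lives further down this script:
#
#   def cached_smiles(name):
#       if name not in smiles_cache:
#           smiles_cache[name] = resolve_smiles(name)  # hypothetical resolver
#           igem.save_json(cache_name, smiles_cache)   # persist for reruns
#       return smiles_cache[name]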