Example #1
def replace_dict_with_text(json_path):
    json_file = igem.get_json(json_path)
    # print(json_path)
    # print(json_file.items())
    for doi, paper_dict in json_file.items():
        originalText = paper_dict['originalText']
        json_file.update({doi: originalText})

    igem.save_json(json_path, json_file)
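Example #1 leans on two project-local helpers, igem.get_json and igem.save_json. Judging from how they are called across these examples, they appear to be thin wrappers around json.load and json.dump; a minimal sketch under that assumption (not the project's actual code):

import json

def get_json(path):
    """Load a JSON file and return its contents."""
    with open(path, 'r', encoding='utf-8') as fp:
        return json.load(fp)

def save_json(path, data):
    """Write `data` to `path` as JSON."""
    with open(path, 'w', encoding='utf-8') as fp:
        json.dump(data, fp)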
Example #2
def loop(subset, outfile):
    if os.path.isfile(outfile):
        fulltext_dict = igem.get_json(outfile)
    else:
        fulltext_dict = {}

    # hyperparameter: maximum number of fetch calls (-1 means unlimited)
    max_calls = -1

    # Stats
    calls = 0
    fails = 0
    not_oa = 0
    successes_or_found = 0
    queries = 0

    lenn = len(subset)

    try:
        # Looping through quantify_dataset output json.
        for pmid, metadata in subset.items():

            # stop once max_calls is reached (-1 means unlimited, so the limit never triggers)
            if calls == max_calls:
                print(f"[{num_run}] Query limit reached.", end='\r')
                break
            else:
                calls += 1

            # temporary resume point: skip calls already handled in a previous run
            if calls < 6936:
                continue

            # checks if paper has been successfully fetched before
            if pmid in fulltext_dict:
                successes_or_found += 1
                print(f"[{num_run}] ## Call {calls} found.",
                      end=' #########\r')
                continue

            # THE FETCH
            fullpaper = get_paper(pmid, metadata)

            if fullpaper:
                fulltext_dict[pmid] = fullpaper
                successes_or_found += 1
                queries += 1
                print(
                    f"[{num_run}] Call {calls} success. {round(calls / lenn * 100, 2)}% done. {round(successes_or_found / (calls-not_oa) * 100, 2)}% successful.",
                    end=' #########\r')
            else:
                fails += 1
                print(
                    f"[{num_run}] Call {calls} failed. {round(calls / lenn * 100, 2)}% done. {round(successes_or_found / (calls-not_oa) * 100, 2)}% successful.",
                    end=' #########\r')

            # if calls % 25 == 0:
            #     print(f'[{num_run}] ############# {successes_or_found} successes, {lenn - calls} left, {round(successes_or_found / (calls-not_oa) * 100, 2)}% successful.', end=' ####\r')
    except KeyboardInterrupt:
        pass

    # save to file
    igem.save_json(outfile, fulltext_dict)

    # Print Stats
    print("")
    print("###### STATS ######")
    print(f"Total calls: {calls}")
    print(f"Total number of queries: {queries}")
    print(f"Total number of Elsevier papers: {calls - not_oa}")
    print(f"Number of Non-Elsevier papers skipped: {not_oa}")
    print(f"Number of fetch failures: {fails}")
    print(f"Papers in storage: {len(fulltext_dict)}")
    print(f"% of success: {successes_or_found / (calls-not_oa) * 100}%")
Example #3
                )

            if calls % 25 == 0:
                print(
                    f'[{num_run}] ############# {successes_or_found} successes, {lenn - calls} left.'
                )

    except KeyboardInterrupt:
        pass

    # save to file
    with open(json_file, 'w') as fp:
        json.dump(elsevier_fulltexts, fp)
        # vary: alter frequency of file save

    # Print Stats
    print("")
    print("###### STATS ######")
    print(f"Total calls: {calls}")
    print(f"Total number of queries: {queries}")
    print(f"Total number of Elsevier papers: {calls - not_elsevier}")
    print(f"Number of Non-Elsevier papers skipped: {not_elsevier}")
    print(f"Number of fetch failures: {fails}")
    print(f"Papers in storage: {len(elsevier_fulltexts)}")
    print(f"% of success: {successes_or_found / (calls-not_elsevier) * 100}%")


''' RUN '''
loop_elsevier(igem.get_json(in_file), out_file)
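Both loop variants call a get_paper(pmid, metadata) helper that is not shown in these excerpts; the surrounding code only relies on it returning the paper's full text on success and a falsy value on failure. A hypothetical stub illustrating that contract:

def get_paper(pmid, metadata):
    """Hypothetical stub: return the paper's full text, or None on failure.

    The real helper performs the publisher API request; the loops above only
    depend on this truthy-on-success / falsy-on-failure contract.
    """
    try:
        # ... the actual fetch would go here ...
        return None  # stub behaviour: pretend the fetch failed
    except Exception:
        return None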
Example #4
                    f"[{num_run}] Call {calls} success. {round(calls / lenn * 100, 2)}% done. {round(successes_or_found / (calls-not_oa) * 100, 2)}% successful.",
                    end=' #########\r')
            else:
                fails += 1
                print(
                    f"[{num_run}] Call {calls} failed. {round(calls / lenn * 100, 2)}% done. {round(successes_or_found / (calls-not_oa) * 100, 2)}% successful.",
                    end=' #########\r')

            # if calls % 25 == 0:
            #     print(f'[{num_run}] ############# {successes_or_found} successes, {lenn - calls} left, {round(successes_or_found / (calls-not_oa) * 100, 2)}% successful.', end=' ####\r')
    except KeyboardInterrupt:
        pass

    # save to file
    igem.save_json(outfile, fulltext_dict)

    # Print Stats
    print("")
    print("###### STATS ######")
    print(f"Total calls: {calls}")
    print(f"Total number of queries: {queries}")
    print(f"Total number of Elsevier papers: {calls - not_oa}")
    print(f"Number of Non-Elsevier papers skipped: {not_oa}")
    print(f"Number of fetch failures: {fails}")
    print(f"Papers in storage: {len(fulltext_dict)}")
    print(f"% of success: {successes_or_found / (calls-not_oa) * 100}%")


''' RUN '''
loop(igem.get_json(in_file), out_file)
Example #5
# Assumed imports for this excerpt (not shown in the original snippet)
import os
import sys

import igem  # project-local JSON helpers used throughout these examples
from chemdataextractor.nlp.pos import ChemCrfPosTagger  # assumed source of the tagger

# Set up with input path to load in JSON and prep a CSV to write to
input_path = sys.argv[1]
# out = sys.argv[2]
out_name = input_path.split("/")[-1].split(".")[0]

csv_file = 'output_ner/{}_{}.csv'.format(
    "sentence_annotations", out_name
)  # to rename the output file, uncomment `out = sys.argv[2]` above and pass `out` into format()

if not os.path.exists('output_ner'):
    os.makedirs('output_ner')

# read in json properly
if os.path.exists(input_path):
    text_files = igem.get_json(input_path)
else:
    raise FileNotFoundError(f"Input file not found: {input_path}")

# if there's a cache, reopen it:
smiles_cache = {}  # keys are names, SMILES are values
cache_name = 'smiles_cache.json'  # IF RUNNING IN PARALLEL: change to f"smiles_cache_{out_name}.json"

if os.path.exists(cache_name):
    smiles_cache = igem.get_json(cache_name)

# initialize the chemistry-aware CRF part-of-speech tagger
cpt = ChemCrfPosTagger()

# tracking counts, for monitoring runs
count = 0
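The script above loads a name-to-SMILES cache and a ChemCrfPosTagger before processing sentences. A hedged sketch of how such a cache is typically consulted and persisted during the run; resolve_smiles is a hypothetical stand-in for the real lookup, which is not part of the original snippet:

def resolve_smiles(name):
    """Hypothetical resolver stub; the real name-to-SMILES lookup is not shown here."""
    return None

def lookup_smiles(name):
    """Return a SMILES string for a chemical name, memoized in smiles_cache."""
    if name in smiles_cache:
        return smiles_cache[name]
    smiles = resolve_smiles(name)
    smiles_cache[name] = smiles
    # persist the cache the same way the other examples persist their dicts
    igem.save_json(cache_name, smiles_cache)
    return smiles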