def replace_dict_with_text(json_path):
    """Collapse each {doi: paper_dict} entry down to just its original text."""
    json_file = igem.get_json(json_path)
    for doi, paper_dict in json_file.items():
        originalText = paper_dict['originalText']
        json_file.update({doi: originalText})
    # Write the flattened mapping back to the file it was read from.
    igem.save_json(json_path, json_file)
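# Hypothetical sketch of the JSON shape replace_dict_with_text expects, based
# on how it reads the 'originalText' field (the DOI and text are illustrative):
#
#   {"10.1234/example": {"originalText": "Full paper text ...", "title": "..."}}
#
# After running, each DOI maps directly to its original text string.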
def loop(subset, outfile):
    # Resume from an existing output file if there is one.
    if os.path.isfile(outfile):
        fulltext_dict = igem.get_json(outfile)
    else:
        fulltext_dict = {}

    # Hyperparameter: maximum number of calls (-1 for no limit).
    max_calls = -1

    # Stats
    calls = 0
    fails = 0
    not_oa = 0
    successes_or_found = 0
    queries = 0
    lenn = len(subset)

    try:
        # Loop through the quantify_dataset output JSON.
        for pmid, metadata in subset.items():
            # Don't go over max_calls (-1 means unlimited).
            if calls == max_calls or (queries == -1):
                print(f"[{num_run}] Query limit reached.", end='\r')
                break
            else:
                calls += 1

            # tmp: resume hack, skips entries handled in a previous run
            if calls < 6936:
                continue

            # Skip papers that have already been fetched successfully.
            if pmid in fulltext_dict:
                successes_or_found += 1
                print(f"[{num_run}] ## Call {calls} found.", end=' #########\r')
                continue

            # THE FETCH
            fullpaper = get_paper(pmid, metadata)
            if fullpaper:
                fulltext_dict[pmid] = fullpaper
                successes_or_found += 1
                queries += 1
                print(
                    f"[{num_run}] Call {calls} success. "
                    f"{round(calls / lenn * 100, 2)}% done. "
                    f"{round(successes_or_found / (calls - not_oa) * 100, 2)}% successful.",
                    end=' #########\r')
            else:
                fails += 1
                print(
                    f"[{num_run}] Call {calls} failed. "
                    f"{round(calls / lenn * 100, 2)}% done. "
                    f"{round(successes_or_found / (calls - not_oa) * 100, 2)}% successful.",
                    end=' #########\r')

            # if calls % 25 == 0:
            #     print(f'[{num_run}] ############# {successes_or_found} successes, '
            #           f'{lenn - calls} left, '
            #           f'{round(successes_or_found / (calls - not_oa) * 100, 2)}% successful.',
            #           end=' ####\r')
    except KeyboardInterrupt:
        pass

    # Save to file.
    igem.save_json(outfile, fulltext_dict)

    # Print stats.
    print("")
    print("###### STATS ######")
    print(f"Total calls: {calls}")
    print(f"Total number of queries: {queries}")
    print(f"Total number of Elsevier papers: {calls - not_oa}")
    print(f"Number of non-Elsevier papers skipped: {not_oa}")
    print(f"Number of fetch failures: {fails}")
    print(f"Papers in storage: {len(fulltext_dict)}")
    print(f"% of success: {successes_or_found / (calls - not_oa) * 100}%")
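# Hedged usage sketch (the paths below are hypothetical): `subset` is a
# {pmid: metadata} dict, e.g. the quantify_dataset output mentioned above, and
# fetched papers accumulate in `outfile` so interrupted runs can resume.
#
#   subset = igem.get_json("data/quantify_dataset_output.json")
#   loop(subset, "data/fulltexts.json")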
def make_you_smile(sent, c):
    global successful_spans
    global out_name
    global smiles_cache

    smiles = None  # :(

    # Clean up the chemical string.
    c = c.rstrip()
    c = c.replace(" ", "%20")
    c = c.replace("\u03b1", "alpha")
    c = c.replace("\u03b2", "beta")
    c = c.replace("\u03b3", "gamma")
    c = c.replace("\u2032", "")
    for dumb_quote in [
            "\u0060", "\u00B4", "\u2018", "\u2019", "\u201C", "\u201D",
            "\u00f7", "\u2423", "\u2d19", "\ufb02"
    ]:  # quotes and the divide sign
        c = c.replace(dumb_quote, "")
    for dumb_dash in [
            "\u007E", "\u2010", "\u2011", "\u2012", "\u2013", "\u2014", "\u2015"
    ]:
        c = c.replace(dumb_dash, "\u002d")
    print(c)

    if c.lower() in smiles_cache:
        smiles = smiles_cache[c.lower()]
    else:
        url_nih = 'http://cactus.nci.nih.gov/chemical/structure/' + c + '/smiles'
        try:
            print(url_nih)
            req = urlopen(url_nih, timeout=3)
        except urllib.error.HTTPError:
            print("No entity returned.")
        except socket.timeout:
            print("Taking too long, likely an invalid entity.")
            print(c)
        except UnicodeEncodeError as e:
            print("Unicode Encode Error: " + str(e))
        except KeyboardInterrupt:
            igem.save_json(cache_name, smiles_cache)
            print()
            print("Restart from this position: " + str(pointer))
            raise
        except Exception:
            print("uh oh, some connection error :(")
            with open("output_ner/connection_errors_{}.txt".format(out_name), "a") as fh:
                fh.write("URLLIB CONNECTION ERROR: " + str(sent) + "\n")
        else:
            if req.getcode() == 200:
                print("It worked!")
                smiles = req.read().decode('utf8')
                smiles_cache[c.lower()] = smiles
            else:
                # Fall back to PubChemPy if the NIH resolver doesn't work.
                try:
                    molecule = pcp.get_compounds(c, 'name')
                    # get_compounds can return several IDs for the same compound
                    # name, so take the first match.
                    smiles = molecule[0].canonical_smiles
                    print("PubChemPy worked!")
                    smiles_cache[c.lower()] = smiles
                except Exception:
                    print(req.getcode())
                    raise

    if smiles is None:
        smiles_cache[c.lower()] = None
        return (None, None)
    else:
        return c, smiles  # :)
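# Hedged usage sketch (the sentence, chemical name, and returned SMILES below
# are illustrative, not actual resolver output):
#
#   name, smiles = make_you_smile("Aspirin inhibits COX-1.", "aspirin")
#   # -> ("aspirin", "CC(=O)Oc1ccccc1C(=O)O") when the NIH resolver (or the
#   #    PubChemPy fallback) finds the compound, (None, None) otherwise.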
def annotate(doi_pmid, text):
    global count
    global t0

    t1 = time.time()
    # Every 10 papers: log progress and checkpoint the SMILES cache.
    if count % 10 == 0:
        with open("{}.log".format(out_name), "a") as f:
            f.write("\n")
            f.write("{} out of {} completed\n".format(count, len(text_files.keys())))
            f.write("elapsed time: " + str(time.time() - start_time) + "\n")
        igem.save_json(cache_name, smiles_cache)
        print()
        print("{} out of {} completed".format(count, len(text_files.keys())))
        print(t1 - t0)
    t0 = t1

    try:
        # Sentence segmentation via ChemDataExtractor; indices are character-based.
        sentences = [
            p.sentences for p in Document.from_string(text.encode())
            if hasattr(p, 'sentences')
        ]
    except Exception:
        sentences = [[]]

    sentence_found = []
    starts = []
    ends = []
    indices = []
    tagged = []
    chemicals_found = []
    bio_entities = []
    bio_entities_with_pos = []
    names_found = []
    smiles_found = []
    names_and_smiles = []

    sentences = sentences[0]  # weird nesting from CDE, do not change

    tot = time.time()
    times = 0
    span_total = 0
    successful_spans = 0

    for i in range(len(sentences)):  # TODO: change this to all sentences
        s = sentences[i]
        t_s_0 = time.time()

        # Part-of-speech tagging (used later for NLP).
        try:
            pos = s.pos_tagged_tokens
        except Exception:
            pos = cpt.tag(s.split())

        spans = s.cems  # chemical entity mentions; also reused for enzyme finding
        span_names = [c.text for c in spans]

        # Enzymes in the sentence (using regex): attempt to recover full enzyme
        # names by walking backwards over preceding chemical tokens.
        enzyme_names = []
        enzyme_names_locs = []
        for i_w in range(len(pos)):
            word = pos[i_w][0]
            for m in re.finditer(r'[a-zA-Z]+ase\b', word):
                enzyme = m.group(0)
                i_l = i_w
                while i_l > 0:
                    prev_word = pos[i_l][0]
                    prev_pos = pos[i_l][1]
                    if prev_word in span_names:
                        enzyme = prev_word + " " + enzyme
                    elif prev_pos not in ":;{}|,./<>?!":
                        break
                    i_l -= 1
                enzyme_names.append(enzyme)
                enzyme_names_locs.append((enzyme, i_l, i_w))

        spans_sent = []
        smiles_sent = []
        names_sent = []
        names_smiles_sent = []

        for r in range(len(spans)):
            span = spans[r]
            c = span.text

            # Try to get SMILES for the entire string; if that fails, get_smiles
            # handles the case where c is a conglomerate of chemicals separated
            # by spaces.
            name_smiles_tuples = get_smiles(s, c)
            print(name_smiles_tuples)
            print()

            # Ignore the chemical if nothing was resolved.
            if not name_smiles_tuples or (len(name_smiles_tuples) == 1
                                          and not name_smiles_tuples[0][0]):
                continue
            successful_spans += len(name_smiles_tuples)

            for name, smiles in name_smiles_tuples:
                if name:
                    span_dict = {
                        "text": name,
                        "start": span.start,
                        "end": span.end,
                        "smiles": smiles
                    }
                    # Index through the POS tokens to find the chemical entity.
                    p = 0
                    while p < len(pos):
                        token = pos[p][0]
                        if token == span.text:
                            span_dict["pos"] = pos[p][1]
                            break
                        p += 1
                    spans_sent.append(span_dict)
                    names_sent.append(name)
                    smiles_sent.append(smiles)
                    names_smiles_sent.append((name, smiles))

        # After the span loop, append this sentence's entries to the paper-level lists.
        sentence_found.append(s.text)
        chemicals_found.append(spans_sent)
        # Comma-space separator; note that IUPAC names themselves contain commas.
        names_found.append(", ".join(names_sent))
        smiles_found.append(", ".join(smiles_sent))
        names_and_smiles.append(names_smiles_sent)
        starts.append(s.start)
        ends.append(s.end)
        indices.append(i)
        bio_entities.append(", ".join(enzyme_names))
        bio_entities_with_pos.append(enzyme_names_locs)
        tagged.append(pos)

        if len(spans) > 0:
            times += time.time() - t_s_0
            span_total += len(spans)
            # print(time.time() - t_s_0)
    print()
    print("Average time per span (one identified chemical entity): " +
          str(times / (span_total + 0.01)))
    t_an = time.time()
    print("Time for all sentences in text: " + str(t_an - tot))
    print("Successfully classified span percent in paper: " +
          str(successful_spans / (span_total + 0.01)))

    # Put all the lists into a dictionary and coerce it to a dataframe.
    annotations = {
        "sentence": sentence_found,
        "start": starts,
        "end": ends,
        "indices": indices,
        "sentence_pos": tagged,
        "enzymes": bio_entities,
        "enzyme_locations": bio_entities_with_pos,
        "chemical_entities_full": chemicals_found,
        "chemical_names": names_found,
        "chemical_smiles": smiles_found,
        "name_smile_tuples": names_and_smiles
    }
    annots_csv = pd.DataFrame(annotations)
    annots_csv["lit_id"] = doi_pmid

    # Reorder the dataframe columns.
    annots_csv = annots_csv[[
        "lit_id", "indices", "start", "end", "sentence", "sentence_pos",
        "enzymes", "enzyme_locations", "chemical_entities_full",
        "chemical_names", "chemical_smiles", "name_smile_tuples"
    ]]

    # Append the dataframe to csv_file, writing the header only if the file
    # doesn't exist yet.
    if os.path.isfile(csv_file):
        annots_csv.to_csv(csv_file, mode='a', header=False, index=False)
    else:
        annots_csv.to_csv(csv_file, index=False)
# Order the items by PMID so runs are deterministic and resumable.
sorted_dictionary_by_pmid = collections.OrderedDict(sorted(text_files.items()))

try:
    start = 0  # CHANGE IF RESUMING!
    iter_dict = list(sorted_dictionary_by_pmid.items())
    for pointer in range(start, len(iter_dict)):
        entry = iter_dict[pointer]
        doi_pmid = entry[0]
        text = entry[1]
        try:
            if text and isinstance(text, str):
                annotate(doi_pmid, text)
                count += 1
        except Exception as e:
            print()
            print(e)
            print("here's a paper error!: " + doi_pmid)
            # raise  # uncomment to stop on the first paper error instead of logging it
            with open("output_ner/connection_errors_{}.txt".format(out_name), "a") as fh:
                fh.write("General paper error: " + str(doi_pmid) + "\n")
            continue
except KeyboardInterrupt:
    igem.save_json(cache_name, smiles_cache)
    print("Restart from this position: " + str(pointer))