encoding="utf-8") counter = {} for line in file.iter(): drug = line[3] drug = drug.split(";")[0] if drug: if drug not in counter: counter[drug] = 1 else: counter[drug] += 1 print(counter) print(len(counter)) s = sorted(counter.items(), key=lambda x: x[1])[::-1] print(s) save_file = FM("../PUBMED_DATA/drug_counter", extension=".txt") fp = save_file.get_filepointer() for i in s: fp.write(("\t".join([i[0], str(i[1])]) + '\n')) fp.close() # with open("../PUBMED_DATA/drug_counter.latest.txt", 'w', encoding = "utf8") as fp: # for i in s: # fp.write(("\t".join([i[0],str(i[1])])+'\n'))
compacted_fda_dict[new_key]["drug"].lower().split(";"), fda_dict[key]["drug"].split(";")) compacted_fda_dict[new_key]["industry"] = setify_lists( compacted_fda_dict[new_key]["industry"].lower().split(";"), fda_dict[key]["industry"].split(";")) compacted_fda_dict[new_key]["date"] = setify_lists( compacted_fda_dict[new_key]["date"].split(";"), fda_dict[key]["date"].split(";")) compacted_fda_dict[new_key]["indication"] = setify_lists( compacted_fda_dict[new_key]["indication"].lower().split(";"), fda_dict[key]["indication"].split(";")) # fda["drug"], fda["COMMON_DRUG_BANK_ALIAS"], fda["industry"], fda["date"], fda["min_date"] pool_file = FM("../FDA/FDA_by_drugbank2606", extension=".txt", format="tsv", olddata_dir="../OLD_DATA/FDA") pool_data = [] pool_data.append( "FDA_APPROUVED_DRUG COMMON_DRUG_BANK_ALIAS INDUSTRY INDICATION DATES_OF_APPROVAL FIRST_DATE_OF_APPROVAL" ) def try_parsing_date(text): for fmt in ("%B %d %Y", "%b. %d %Y", "%b %Y", "%B %Y", "%B %d%Y", "%B %Y", "%B%d %Y", "%B%d%Y"): try: return datetime.strptime(text, fmt) except ValueError:
"""
Cleaning of the FDA database.

Strips extra trailing/leading spaces and keeps only the first five columns,
then re-saves the database through the project File_Maker helper.
"""
from utils import File_Reader as FR
from utils import File_Maker as FM

# Pattern handed to File_Reader: leading and/or trailing runs of spaces.
strip_pattern = "^[ ]+|[ ]+$"

reader = FR("../FDA/FDA_DRUG_DATABASE_cured.txt",
            encoding="utf-16",
            sep="\t",
            strip_chars_pattern=strip_pattern)

# Keep only the first five columns of every record.
cleaned_rows = [record[:5] for record in reader.iter()]
print(cleaned_rows)

writer = FM("../FDA/FDA_DRUG_DATABASE_cured_cleaned",
            data_stream=cleaned_rows,
            extension=".txt")
writer.save()
# NOTE(review): chunk begins mid-conditional — the `if` matching the `else`
# below, and the enclosing loop, are outside this view; indentation is
# reconstructed.
                match[pmid][2].append(names)
                match[pmid] = (year, title, match[pmid][2], description)
            else:
                match[pmid] = (year, title, match[pmid][2], description)
            # tf.step()

# Count PMIDs for which nothing at all was matched.
missing = 0
for k, v in match.items():
    if v == ("", "", [], ""):
        missing += 1

pubmedNdrugs = FM("../PUBMED_DATA/pubmed2606_with_drugs",
                  extension=".txt",
                  olddata_dir="../OLD_DATA/PUBMED_DATA/")
with pubmedNdrugs.get_filepointer() as fp:
    for k, v in match.items():
        if v[2]:
            res = ""
            # Flatten each alias group into a single ";"-joined string.
            for i in range(len(v[2])):
                v[2][i] = ";".join(v[2][i])
            # Drop entries that are substrings of a longer hyphenated entry,
            # unless the short form itself occurs more than once in the title.
            topop = []
            for i in v[2]:
                for j in v[2]:
                    # " " not in j.replace(i, "") and not " "+i+" " in v[1]
                    if i in j and len(i) < len(j) and i not in topop and "-" in j.replace(i, "") and not v[1].count(i) > 1:
                        topop.append(i)
            for to in topop:
                v[2].pop(v[2].index(to))
# NOTE(review): chunk ends here — the rest of the `with` body is outside this
# view.
# NOTE(review): chunk begins mid-block — the loop/conditional enclosing these
# two statements (and the meaning of `index`) is outside this view.
            choices = [new_drugs[c] for c in index]
            print("conflict")

print(len(new_dict))
# Register every drug not yet in the pivot with an empty value for each
# header column.
for new in new_drugs:
    if not new in new_dict:
        new_dict[new] = {}
        for i in range(len(header)):
            new_dict[new][header[i]] = ""
print(len(new_dict))

# Every entry names itself in the DRUG_NAMES column.
for new in new_drugs:
    new_dict[new]["DRUG_NAMES"] = new

pool_file = FM("../DRUG_LISTS/drug_pivot_2606",
               extension=".txt",
               format="tsv",
               olddata_dir="../OLD_DATA/PUBMED_DATA")
pool_data = []
pool_data.append(header)
# NOTE(review): dict_values views are appended as rows — assumes File_Maker
# serializes any iterable; verify against the FM implementation.
for drug, values in sorted(new_dict.items()):
    pool_data.append(values.values())
pool_file.set_datastream(pool_data)
pool_file.save()
pool_file.close()
pool.append(item) # #Build final result from pool while pool: elems = pool.pop() res = [] while elems: searching = elems.pop() if searching not in res: res.append(searching) for p in pool: if searching in p: for i in p: elems.append(i) indices = [i for i, x in enumerate(pool) if x == p] pool = [x for i, x in enumerate(pool) if i not in indices] final_ref.append(sorted(res)) csv = [] for i in final_ref: csv.append(";".join(i)) csv = list(set(csv)) csv.sort() full_drug_list = FM("../DRUG_LISTS/full_drug_list", extension=".txt") with full_drug_list.get_filepointer() as fp: for drug in csv: fp.write(drug + "\n") fp.close()
# Parse xml file.
print("parsing xml")
tree = ET.parse('../DRUGBANK/drugbank_db_schema.xml')
root = tree.getroot()

# One "<tabs><tag>" line per element, collected depth-first.
res = []


def tree_builder(node, depth):
    """Walk *node* depth-first, recording each child's tag indented by depth tabs."""
    for element in node:
        print(element.tag)
        res.append(depth * '\t' + element.tag)
        tree_builder(element, depth + 1)


tree_builder(root, 0)

# Persist the indented tag outline via the project File_Maker helper.
tree_tagging = FM("../DRUGBANK/tree_tagging", extension=".txt")
with tree_tagging.get_filepointer() as fp:
    fp.writelines(line + '\n' for line in res)
fp.close()
# NOTE(review): chunk begins mid-loop — the loop over entries that defines
# `entry`, `app`, `dates`, `sponsors` and `min_index` is outside this view;
# indentation is reconstructed.
    if min_index != -1:
        # Earliest submission found: record its date and sponsor.
        entry["FIRST_FDA_SubmissionStatusDate"] = dates[min_index]
        entry["FIRST_FDA_SponsorName"] = sponsors[min_index]
    else:
        entry["FIRST_FDA_SubmissionStatusDate"] = ""
        entry["FIRST_FDA_SponsorName"] = ""
    # De-duplicate each ";"-separated field (note: set() loses the original
    # ordering of the values).
    entry[app + "SponsorName"] = ";".join(list(set(entry[app + "SponsorName"].split(";"))))
    entry[app + "ActiveIngredient"] = ";".join(list(set(entry[app + "ActiveIngredient"].split(";"))))
    entry[app + "SubmissionStatusDate"] = ";".join(list(set(entry[app + "SubmissionStatusDate"].split(";"))))
    entry[app + "SubmissionStatus"] = ";".join(list(set(entry[app + "SubmissionStatus"].split(";"))))

pool_file = FM("../FDA/FDA2018_by_drugbank2606",
               extension=".txt",
               format="tsv",
               olddata_dir="../OLD_DATA/FDA")
pool_data = []
pool_data.append([head for head in header])
for entry in drugs_dict.values():
    pool_data.append([str(v) for v in entry.values()])
pool_file.set_datastream(pool_data)
pool_file.save()
pool_file.close()
"""Ad-hoc smoke test for the File_Maker utility: create a file and write to it."""
from utils import File_Maker as FM

test_file = FM("GRAPH/test78324264.txt", replace_old=True, version_control=False)

# Exercise the metadata accessors.
print(test_file.get_filename())
print(test_file.get_extension())
print(test_file.get_savedir())
print(test_file.original_dir)

# Exercise the file pointer with a few arbitrary payloads, then close it.
fp = test_file.get_filepointer()
for payload in ("adzaf", "sth", "rthter", "rthet"):
    fp.write(payload)
fp.close()
"%m/%Y")) if line_min[0] is not current_min[0]: pmids = [line_min[0], current_min[0]] dates = [line_min[1], current_min[1]] drugbank_dict[db_alias][ "OLDEST_DATE_OF_PUBLICATION"] = datetime.strftime( dates[dates.index(min(dates))], "%m/%Y") drugbank_dict[db_alias]["OLDEST_PMID"] = pmids[dates.index( min(dates))] print(line[0]) print(line_min) print(drugbank_dict[db_alias]["OLDEST_DATE_OF_PUBLICATION"]) print(drugbank_dict[db_alias]) print("") # print(drugbank_dict[db_alias]) pool_file = FM("../PUBMED_DATA/drugbank2606", extension=".txt", format="tsv", olddata_dir="../OLD_DATA/PUBMED_DATA") pool_data = [] pool_data.append(keys) for drug in drugbank_dict.values(): pool_data.append(drug.values()) print(len([v for v in drugbank_dict.values()])) pool_file.set_datastream(pool_data) pool_file.save() pool_file.close()
# Map PMID -> "month year" publication date string (columns 4 and 3 of the
# pubmed file).
pubmed_dict = {}
for line in pubmed_file.iter():
    pubmed_dict[line[1]] = " ".join([line[4], line[3]])

# For every drug with associated PMIDs, find the earliest publication.
for drug in drug_dict.values():
    if drug["ASSOCIATED_PMID"]:
        pmids = drug["ASSOCIATED_PMID"].split(";")
        date_list = []
        for pmid in pmids:
            # NOTE(review): raises KeyError if a PMID is missing from the
            # pubmed file — presumably guaranteed by the upstream step.
            date_list.append(pubmed_dict[pmid])
        for i in range(len(date_list)):
            date_list[i] = datetime.strptime(date_list[i], "%m %Y")
        min_index = date_list.index(min(date_list))
        drug["OLDEST_PMID"] = pmids[min_index]
        drug["OLDEST_DATE_OF_PUBLICATION"] = datetime.strftime(
            date_list[min_index], "%m/%Y")

# Save stuff
pool_file = FM("../PUBMED_DATA/drugs2606minedalias_with_found_identifiers",
               extension=".txt",
               format="tsv",
               olddata_dir="../OLD_DATA/PUBMED_DATA")
# NOTE(review): `pool_data` is never initialized in this chunk — every sibling
# script does `pool_data = []` (plus a header append) right here. Unless it is
# defined earlier in the file, this raises NameError; confirm and fix.
for drug in drug_dict.values():
    pool_data.append(drug.values())
pool_file.set_datastream(pool_data)
pool_file.save()
pool_file.close()
# NOTE(review): chunk begins mid-loop — the loops defining `d`, `index`,
# `todel` and `maxdel` are outside this view; indentation is reconstructed.
        d.extend(drugs[index].split(";"))
    drugs.append(";".join(sorted(list(set(d)))))

# Remove superseded entries (condition on `maxdel` reconstructed as written;
# its intent cannot be confirmed from this view).
for d in todel:
    if d in drugs and drugs.index(d) < maxdel - len(todel):
        drugs.pop(drugs.index(d))

# Normalize each entry to a sorted ";"-joined alias list, then de-duplicate.
for d in range(len(drugs)):
    drugs[d] = ";".join(sorted(drugs[d].split(";")))
drugs = list(set(drugs))
drugs.sort()

pool_file = FM("../DRUG_LISTS/drug_list_2606_curated_cleaned",
               extension=".txt",
               format="tsv",
               olddata_dir="../OLD_DATA/DRUG_LISTS",
               encoding="utf-8")
pool_file.set_datastream(drugs)
pool_file.save()
pool_file.close()
# annotations = annotation_file.readlines()
# header = annotations.pop(0)
# drug_dict = {}
# for line in annotations:
#     drug_dict[line[0]] = {}
#     for i in range(len(header)):
#         val = ""
# NOTE(review): chunk begins mid-loop — the loop/conditional that sets
# `found`, `pmid`, `year`, `title`, `names` and `description` is outside this
# view; indentation is reconstructed.
            found = True
    if found:
        match[pmid] = (year, title, match[pmid][2] + names, description)
    # tf.step()

print(match[28980060])  # spot-check one known PMID

# Count PMIDs for which nothing at all was matched.
missing = 0
for k, v in match.items():
    if v == ("", "", [], ""):
        missing += 1
print(missing)

# Write one tab-separated row per PMID; column 4 is the first drug group (or
# empty when no drug was matched).
pubmedNdrugs = FM("../PUBMED_DATA/pubmedNdrugs", extension=".txt")
with pubmedNdrugs.get_filepointer() as fp:
    for k, v in match.items():
        if v[2]:
            fp.write("\t".join([str(k), v[0], v[1], v[2][0]]) + "\n")
        else:
            fp.write("\t".join([str(k), v[0], v[1], ""]) + "\n")
    fp.close()
# with open("../PUBMED_DATA/pubmedNdrugs.latest.txt", "w", encoding = "utf-8") as fp:
#     for k,v in match.items():
#         if v[2]:
#             fp.write("\t".join([str(k),v[0], v[1],v[2][0]])+"\n")
#         else:
#             fp.write("\t".join([str(k),v[0], v[1],""])+"\n")

noDrug = FM("../PUBMED_DATA/pubmed_data_2606_noDRUG", extension=".txt")