# --- Load ontology label table into a dict keyed by the "index" column. ---
# NOTE(review): `final_folder`, `pd`, `fu`, `mfio`, and `det_dset` are defined
# earlier in the notebook/file — not visible in this chunk.
ontoLabels = pd.read_csv(final_folder + "util_files/ontology_labels.tsv", sep="\t").set_index("index")
ontoLabels = ontoLabels.to_dict(orient="index")  # {index: {col: value}}
print("Ontology Labels", len(ontoLabels))

# --- Load old-dataset name mappings; flatten each row to its '0' column. ---
old_dset_mappings = pd.read_csv(final_folder + "util_files/old_dataset_mappings.tsv", sep="\t").set_index("index")
old_dset_mappings = old_dset_mappings.to_dict(orient="index")
for k in old_dset_mappings:
    # Each row dict is collapsed to the single value stored under key '0'.
    old_dset_mappings[k] = old_dset_mappings[k]['0']
# Empty string maps to itself so lookups on blank names don't KeyError.
old_dset_mappings[''] = ''
print("Old Dataset Mappings", len(old_dset_mappings))

# --- Aggregate class occurrences across all class-list files. ---
class_list_folder = final_folder + "class_lists/"
# Predicate keeps only files whose name contains "classlist" (returns False
# to *exclude*; presumably get_reqd_fileset treats False as "keep" — the
# same inverted-predicate convention appears elsewhere in this file).
class_set_files = fu.get_reqd_fileset(
    class_list_folder, lambda x: False if "classlist" in x.lower() else True)
class_sets = {}   # class name -> {"files": [...], "instance_count": [...]}
ccount = 0        # total (non-unique) class mentions across all files
for k in class_set_files:
    a = mfio.load_matrix(class_list_folder + k)  # {class_name: instance_count}
    for m in a:
        if not m in class_sets:
            class_sets[m] = {"files": [], "instance_count": []}
        class_sets[m]["files"].append(k)
        class_sets[m]["instance_count"].append(a[m])
        # NOTE(review): source formatting was flattened; ccount most plausibly
        # increments per class mention (matches "Total Class Count") — confirm.
        ccount += 1
print("Total Class Count", ccount)
print("Unique Class Count", len(class_sets))
# NOTE(review): source is truncated here — the det_dset(...) call and the rest
# of this loop body are cut off mid-expression in the original.
for k in class_sets:
    a, b = det_dset(
# --- Load per-term IDF weights into vocab_dict. ---
# NOTE(review): this fragment reads like the body of a load_idfs() function
# whose `def` line was lost in flattening; `idf_file`, `vocab_dict`, and
# UNMAPPED_IDF_CONST come from earlier in the file — confirm against the
# original notebook. Uses `with` so the file handle is closed even on error
# (original open()/close() leaked the handle if parsing raised).
with open(idf_file) as idfs:
    idf_lines = idfs.readlines()
# First line is a header; each subsequent line is "<term> <idf_value>".
for line in idf_lines[1:]:
    idf_parts = line.strip().split()
    term = str(idf_parts[0])
    vocab_dict[term]["idf"] = float(idf_parts[1])
# Out-of-vocabulary terms fall back to a fixed IDF constant.
vocab_dict["<unk>"]["idf"] = UNMAPPED_IDF_CONST

# --- Build embedding matrices for every ontology file. ---
word_vecs = load_word_vectors(vector_file)
load_vocab(vocab_file)
load_idfs(idf_file)
# print() call for consistency with the rest of the file (was a py2 print
# statement, which is a SyntaxError under Python 3).
print(len(vocab_dict), word_vecs.shape)

fu = FileUtils()
onto_folder = "bioontologies/"
vec_folder = "onto_vectors_skospref/"
unmapped_folder = "unmapped/"
mfio = MatrixIO()
# Predicate excludes the ".json" sidecar files (False presumably == keep;
# same inverted convention used for class lists elsewhere in this file).
all_onto_files = fu.get_reqd_fileset(onto_folder, lambda x: False if ".json" in x else True)
for k in all_onto_files:
    print("starting " + k)
    # generate_onto_vectors returns (embedding matrix, unmapped-term dict).
    onto_embeddings, all_unmapped = generate_onto_vectors(onto_folder + k)
    np.save(vec_folder + k, onto_embeddings)
    mfio.save_matrix(all_unmapped, unmapped_folder + k + ".dict")

# In[211]:

#onto_file = "meddra.ttl.json"