示例#1
0
# Ontology label table: rows are keyed by the TSV's "index" column and each
# row becomes a {column: value} dict.
ontoLabels = (
    pd.read_csv(final_folder + "util_files/ontology_labels.tsv", sep="\t")
    .set_index("index")
    .to_dict(orient="index")
)
print("Ontology Labels", len(ontoLabels))

# Old dataset mappings: same layout as above, but only the value of the
# column named '0' is kept for each key.
old_dset_mappings = {
    key: row['0']
    for key, row in (
        pd.read_csv(final_folder + "util_files/old_dataset_mappings.tsv",
                    sep="\t")
        .set_index("index")
        .to_dict(orient="index")
        .items()
    )
}
# The empty key maps to itself so that a lookup of '' succeeds.
old_dset_mappings[''] = ''
print("Old Dataset Mappings", len(old_dset_mappings))

class_list_folder = final_folder + "class_lists/"
# The predicate is True for names that do NOT contain "classlist"
# (case-insensitive).
# NOTE(review): how get_reqd_fileset uses the predicate is not visible here.
class_set_files = fu.get_reqd_fileset(
    class_list_folder, lambda x: "classlist" not in x.lower())

# class_sets maps each class to the files it was found in and, in parallel,
# the value stored for it in each of those files.
# ccount counts every (file, class) pair, so a class seen in two files is
# counted twice.
class_sets = {}
ccount = 0
for list_file in class_set_files:
    per_file = mfio.load_matrix(class_list_folder + list_file)
    for cls in per_file:
        record = class_sets.setdefault(
            cls, {"files": [], "instance_count": []})
        record["files"].append(list_file)
        record["instance_count"].append(per_file[cls])
        ccount += 1

print("Total Class Count", ccount)
print("Unique Class Count", len(class_sets))
for k in class_sets:
    a, b = det_dset(
    idfs = open(idf_file)
    idf_lines = idfs.readlines()
    idfs.close()
    for k in range(len(idf_lines)):
        if k == 0: continue
        idf_parts = idf_lines[k].strip().split()
        term = str(idf_parts[0])
        vocab_dict[term]["idf"] = float(idf_parts[1])
    vocab_dict["<unk>"]["idf"] = UNMAPPED_IDF_CONST


word_vecs = load_word_vectors(vector_file)
load_vocab(vocab_file)
load_idfs(idf_file)
print len(vocab_dict), word_vecs.shape

fu = FileUtils()
onto_folder = "bioontologies/"
vec_folder = "onto_vectors_skospref/"
unmapped_folder = "unmapped/"
mfio = MatrixIO()
all_onto_files = fu.get_reqd_fileset(onto_folder, lambda x: False
                                     if ".json" in x else True)
for k in all_onto_files:
    print "starting " + k
    onto_embeddings, all_unmapped = generate_onto_vectors(onto_folder + k)
    np.save(vec_folder + k, onto_embeddings)
    mfio.save_matrix(all_unmapped, unmapped_folder + k + ".dict")
# In[211]:
#onto_file = "meddra.ttl.json"