Example #1
def load_things():
    # Build the RetroCycleGAN and restore its final trained weights.
    rcgan = RetroCycleGAN(save_folder="test", batch_size=32,
                          generator_lr=0.0001, discriminator_lr=0.001)
    rcgan.load_weights(preface="final", folder=rcgan_folder)
    print("Loading ft")
    # Load the fastText model by requesting a dummy embedding.
    generate_fastext_embedding("cat", ft_dir=fasttext_folder)
    print("Ready")
    return rcgan
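A minimal usage sketch for the loader above. The snippet leaves rcgan_folder and fasttext_folder as free variables; the values below are assumptions taken from paths used in the later examples, not something this snippet guarantees.

# Assumed locations of the trained RetroGAN weights and the fastText model.
rcgan_folder = "trained_models/retrogans/ft_full_alldata_feb11"
fasttext_folder = "../fasttext_model/cc.en.300.bin"

rcgan = load_things()  # RetroCycleGAN with its "final" weights restored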
Example #2
def create_sentence_embedding(c1, retrofitted_embeddings, rcgan, newlist):
    # Build the embedding of a multi-word concept as the average of its
    # per-word embeddings.
    s = c1.split(" ")
    concept_vecs = []
    if len(s) > 1:
        for word in s:
            try:
                # Use the retrofitted embedding when the word is in the vocabulary.
                concept_vecs.append(retrofitted_embeddings.loc[word])
            except KeyError:
                print("Creating emb for", word)
                # Reuse an embedding generated earlier in this run, if any.
                added = False
                for tup in newlist:
                    if word == tup[0]:
                        concept_vecs.append(tup[1])
                        added = True
                        break
                if not added:
                    # Otherwise retrofit the word's fastText vector with the generator.
                    concept_vecs.append(
                        pd.Series(
                            rcgan.g_AB.predict(
                                np.array(generate_fastext_embedding(word)).reshape(1, 300)
                            ).reshape(300)))
    concept_vecs = np.array(concept_vecs)
    avg = np.mean(concept_vecs, axis=0)
    return (c1, pd.Series(avg))
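A hedged usage sketch for this four-argument variant: it assumes retrofitted_embeddings is a pandas DataFrame of 300-dimensional vectors indexed by concept, rcgan is the loaded generator, and the last argument collects (concept, embedding) tuples generated earlier in the run. The phrase is illustrative.

generated_so_far = []  # (word, pd.Series) pairs produced earlier; empty here
concept, emb = create_sentence_embedding("black cat", retrofitted_embeddings,
                                         rcgan, generated_so_far)
# concept is the original string, emb a 300-dimensional pd.Series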
Example #3
def get_embedding(param):
    # Return the embedding for a word or phrase, caching results in word_dict.
    global rcgan
    if param in word_dict:
        return word_dict[param]
    s = param.split(" ")
    if len(s) > 1:
        # Multi-word concept: average the per-word embeddings.
        a = create_sentence_embedding(param)
        word_dict[param] = a
        return word_dict[param]
    else:
        # Single word: retrofit its fastText vector with the generator.
        a = rcgan.g_AB.predict(
            np.array(generate_fastext_embedding(param)).reshape(1, 300)).reshape(300)
        word_dict[param] = a
        return word_dict[param]
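Usage sketch for get_embedding: it relies on a module-level word_dict cache (and the global rcgan), so the cache has to exist before the first call; the lookups below are illustrative.

word_dict = {}                    # cache assumed by get_embedding
v1 = get_embedding("cat")         # single word: retrofitted fastText vector
v2 = get_embedding("black cat")   # multi-word: averaged per-word embedding
v3 = get_embedding("cat")         # served from word_dict, no model call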
Example #4
def create_sentence_embedding(c1):
    # Average the per-word embeddings of a (possibly multi-word) concept.
    s = c1.split(" ")
    concept_vecs = []
    for word in s:
        try:
            concept_vecs.append(retrofitted_embeddings.loc[word])
        except KeyError:
            # print("Creating emb for", word)
            if word in word_dict:
                # Reuse a previously generated embedding from the cache.
                concept_vecs.append(word_dict[word])
            else:
                # Retrofit the word's fastText vector with the generator.
                concept_vecs.append(
                    pd.Series(
                        rcgan.g_AB.predict(
                            np.array(generate_fastext_embedding(word)).reshape(1, 300)
                        ).reshape(300)))
    concept_vecs = np.array(concept_vecs)
    avg = np.mean(concept_vecs, axis=0)
    return pd.Series(avg)
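Compared with Example #2, this variant reads retrofitted_embeddings, rcgan, and the word_dict cache from module globals instead of taking them as parameters, and it returns only the embedding rather than a (concept, embedding) tuple. A one-line illustrative call, assuming those globals are set up:

emb = create_sentence_embedding("black cat")  # 300-dimensional pd.Series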
Example #5
def create_data2(use_cache=True):
    global retrofitted_embeddings
    # Load the trained RetroCycleGAN used to retrofit out-of-vocabulary embeddings.
    rcgan = RetroCycleGAN(save_folder="test",
                          batch_size=32,
                          generator_lr=0.0001,
                          discriminator_lr=0.001)
    rcgan.load_weights(preface="final",
                       folder="trained_models/retrogans/ft_full_alldata_feb11")

    if os.path.exists("tmp/valid_rels.hd5") and use_cache:
        print("Using cache")
        a = pd.read_hdf("tmp/valid_rels.hd5", "mat")
        b = pd.read_hdf("tmp/updated_embeddings.hd5", "mat")
        b.drop_duplicates(inplace=True)
        return a, b
    assertionspath = "train600k.txt"
    valid_relations = []
    new = []
    with open(assertionspath) as assertionsfile:
        assertions = csv.reader(assertionsfile, delimiter="\t")
        row_num = 0
        skipped = 0
        for assertion_row in tqdm(assertions):
            # if row_num>100: break
            row_num += 1
            if row_num % 100000 == 0: print(row_num)
            try:
                rel = assertion_row[0]
                if "/r/" not in rel: rel = "/r/" + rel
                weight = float(assertion_row[3])
                c1 = assertion_row[1]
                c2 = assertion_row[2]

                # At least one of the concepts, or the relation itself, is
                # missing from the current vocabulary / relation set.
                if c1 not in retrofitted_embeddings.index or \
                        c2 not in retrofitted_embeddings.index or \
                        rel not in relations:
                    if rel not in relations:
                        print("Skipping relation", rel)
                        skipped += 1
                        continue
                    if len(c1.split(" ")) > 1:
                        # Multi-word concept: build its embedding as the average
                        # of the per-word embeddings.
                        a = create_sentence_embedding(c1,
                                                      retrofitted_embeddings,
                                                      rcgan, new)
                        new.append(a)
                    elif c1 not in retrofitted_embeddings.index:
                        print("Not in index still and less than 1 c1:", c1,
                              len(c1.split(" ")))
                        a = rcgan.g_AB.predict(
                            np.array(generate_fastext_embedding(c1)).reshape(
                                1, 300)).reshape(300)
                        new.append((c1, pd.Series(a)))

                    if len(c2.split(" ")) > 1:
                        # print("Not in index c2:",c2,len(c2.split(" ")))
                        a = create_sentence_embedding(c2,
                                                      retrofitted_embeddings,
                                                      rcgan, new)
                        new.append(a)
                    elif c2 not in retrofitted_embeddings.index:
                        print("Not in index still and less than 1 c2:", c2,
                              len(c2.split(" ")))
                        a = rcgan.g_AB.predict(
                            np.array(generate_fastext_embedding(c2)).reshape(
                                1, 300)).reshape(300)
                        new.append((c2, pd.Series(a)))

                valid_relations.append([rel, c1, c2, weight])
            except Exception as e:
                print("An error occurred in", row_num, assertion_row)
                print(e)
            if len(valid_relations) % 10000 == 0:
                print(len(valid_relations), skipped)
        print(skipped)
    # Append the newly generated (concept, embedding) pairs to the vocabulary.
    new_index = list(retrofitted_embeddings.index)
    new_vals = list(retrofitted_embeddings.values)
    for concept, vec in new:
        new_index.append(concept)
        new_vals.append(vec)
    print("Updating embeddings")
    retrofitted_embeddings = pd.DataFrame(data=new_vals, index=new_index)
    print("Dropping dupes")
    retrofitted_embeddings.drop_duplicates(inplace=True)
    print("SAVING TO FILE")
    retrofitted_embeddings.to_hdf("tmp/updated_embeddings.hd5", "mat")
    print("Generating the training data")
    af = pd.DataFrame(data=valid_relations, index=range(len(valid_relations)))
    print("Training data:")
    print(af)
    af.to_hdf("tmp/valid_rels.hd5", "mat")
    return af, retrofitted_embeddings
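A usage sketch for create_data2. It assumes the module global retrofitted_embeddings (the retrofitted vocabulary as a DataFrame) and the relations list are already initialized, and that train600k.txt sits in the working directory next to a tmp/ folder for the caches.

# Sketch only: build (relation, concept1, concept2, weight) rows plus an
# updated embedding table, caching both under tmp/ for later runs.
valid_rels, updated_embeddings = create_data2(use_cache=True)
print(valid_rels.shape, updated_embeddings.shape)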
Example #6
                          compile=False)
    retrogan.compile(optimizer=Adam(), loss=['mae'])
    retrogan.load_weights(trained_model_path)
    # Load our vocabulary
    target_voc = pd.read_hdf(target_file_loc, 'mat')

    triples = []
    # beef up our vocab with missing entries
    clean_file_contents = profession_words
    in_dataset = tools.check_index_in_dataset(clean_file_contents, target_voc)
    for i, val in enumerate(in_dataset):
        if not val:
            missing_text = clean_file_contents[i]
            print(missing_text)
            # print("Missing:",missing_text)
            we = tools.generate_fastext_embedding(missing_text, ft_dir="../fasttext_model/cc.en.300.bin")
            # print("We:",we)
            if missing_text in names:
                print("Name")
                index = "/c/en/" + missing_text
            else:
                print("Not name")
                index = tools.standardized_concept_uri("en", missing_text)
            # print(index)
            rwe = tools.get_retrofitted_embedding(we, retrogan)
            # print("Retrofitted_embedding",rwe)
            df = pd.DataFrame(data=[rwe], index=[index])
            target_voc = target_voc.append(df)
            print(target_voc.shape)

    target_voc.to_hdf(output_file_loc, 'mat')