Exemplo n.º 1
0
def create_fse_model(sentences):
    """Build and train an fse SIF sentence-embedding model.

    Parameters
    ----------
    sentences : list of dict
        Each entry must carry a "sentence" key holding a tokenized sentence.

    Returns
    -------
    tuple
        (trained SIF model, the IndexedList it was trained on).
    """
    raw_sentences = [entry["sentence"] for entry in sentences]
    print("SIF create indexes for embeddings")
    word_vectors = load_fasttext_model()
    sif_model = SIF(word_vectors)
    indexed = IndexedList(raw_sentences)
    sif_model.train(indexed)
    return sif_model, indexed
Exemplo n.º 2
0
def sentence_similarity(text1, text2, similarity_threshold=0.50):
    """Return True when the fse Average similarity of *text1* and *text2*
    reaches *similarity_threshold*, else False.

    Both texts are tokenized via ``modify`` before a small throwaway
    FastText model is trained on them.
    """
    sentences = [modify(text1), modify(text2)]
    ft = FastText(sentences, min_count=1, size=12, workers=4)

    model = Average(ft)
    model.train(IndexedList(sentences), update=True, report_delay=10, queue_factor=4)
    sim = model.sv.similarity(0, 1)

    # Direct comparison replaces the verbose if/else-return-True/False;
    # bool() keeps the return a plain Python bool (similarity may be a
    # numpy scalar).
    return bool(sim >= similarity_threshold)
Exemplo n.º 3
0
def sentence_similarity(los):
    """Return the fse Average similarity of each adjacent pair of
    sentences in *los* (list of raw sentences; tokenized via ``modify``).
    """
    tokenized = [modify(sentence) for sentence in los]
    ft = FastText(tokenized, min_count=1, size=12, workers=4)
    model = Average(ft)
    model.train(IndexedList(tokenized),
                update=True,
                report_delay=10,
                queue_factor=4)

    # Similarity between each consecutive pair (i, i+1).
    return [model.sv.similarity(idx, idx + 1) for idx in range(len(los) - 1)]
Exemplo n.º 4
0
	def sentence_similarity(self, text1, text2, similarity_threshold=0.35):
		"""Return the fse Average similarity between *text1* and *text2*.

		NOTE: *similarity_threshold* is currently unused — the raw
		similarity score is returned so callers can apply their own
		cut-off; the parameter is kept for interface compatibility.
		"""
		sentences = [self.modify(text1), self.modify(text2)]
		ft = FastText(sentences, min_count=1, size=12, workers=4)

		model = Average(ft)

		try:
			model.train(IndexedList(sentences), update=True, report_delay=10, queue_factor=4)
		except ZeroDivisionError:
			# Deliberate best-effort: training can divide by zero on
			# degenerate input; fall through and score with whatever
			# vectors the model holds. (Dropped the unused `as z` binding
			# and the dead commented-out threshold branch.)
			pass

		return model.sv.similarity(0, 1)
Exemplo n.º 5
0
    def sentence_similarity(self, los, percent=0.6):
        """Return pairwise similarities of consecutive sentences in *los*.

        When the mean similarity exceeds 0.9 (everything looks
        near-identical), every score is damped by *percent*.
        """
        tokenized = [self.modify(item) for item in los]
        ft = FastText(tokenized, min_count=1, size=12, workers=4)

        model = Average(ft)
        model.train(IndexedList(tokenized),
                    update=True,
                    report_delay=10,
                    queue_factor=4)

        res_similar = [model.sv.similarity(idx, idx + 1)
                       for idx in range(len(los) - 1)]

        # Damp uniformly high scores: each becomes score - percent * score.
        if np.mean(res_similar) > 0.9:
            res_similar = [score - percent * score for score in res_similar]

        return res_similar
Exemplo n.º 6
0
 def forward(self, batch):
     """Infer fse sentence vectors for *batch* and return a float32 tensor.

     *batch* must be a list of tokenized sentences (each itself a list
     of tokens).
     """
     # isinstance is the idiomatic type check; the original
     # `type(batch[0]) == list` would also reject list subclasses.
     assert isinstance(batch[0], list)
     indexed = IndexedList(batch)
     return torch.tensor(self.model.infer(indexed), dtype=torch.float32)
Exemplo n.º 7
0
            continue
        # text = ""
        # tp_text = tuple(text.join(row["text"]))
        # text = text.join(row["text"])
        text = row["text"].split()
        sentences.append(text)
        count += 1 
        if count == 20:
            break

# tp_sentences = tuple(sentences)

from fse.models import SIF
from fse import IndexedList
# Train SIF sentence embeddings over the tokenized `sentences` collected
# above; `wvecs` are the pre-loaded word vectors (defined earlier in the file).
model = SIF(wvecs)
sents = IndexedList(sentences)
model.train(sents)

# f = open("sent_embed.csv", "w")
import numpy as np

# NOTE(review): leftover debugging code — exit() fires on the very first
# vector component, so the process terminates before anything is appended
# to `array` or saved below.
array = []
for i in range(len(model.sv)):
    for n in model.sv[i]:
        tmp = n
        print(round(tmp, 7))
        exit()
    array.append(model.sv[i])

# NOTE(review): `f` is undefined here — the open() above is commented out,
# so this savetxt would raise NameError. Re-enable the open() or pass a
# file path string instead.
np.savetxt(f, array, delimiter=",")
# [f.write(i) for i in model.sv]
        n = int(len(short_input_text) / 50)
        short_input_text = " ".join([short_input_text[50 * x:50 * (x + 1)] + "-" + "<br>" for x in range(n)])
    if len(short_bot_text) > 50:
        n = int(len(short_bot_text) / 50)
        short_bot_text = " ".join([short_bot_text[50 * x:50 * (x + 1)] + "-" + "<br>" for x in range(n)])
    short_entity = str(value['entities']) if "scontent.xx.fbcdn.net" not in str(value['entities']) else "url"
    short_actions = str(value['action_1']) if "scontent.xx.fbcdn.net" not in str(value['action_1']) else "url"

    short_input_texts.append(short_input_text)
    short_bot_texts.append(short_bot_text)
    short_entities.append(short_entity)

# Train an fse Average model on the collected `sentences` using a small
# throwaway FastText model (10-dim vectors; min_count=1 keeps every token).
ft = FastText(sentences, min_count=1, size=10)

model = Average(ft)
model.train(IndexedList(sentences))

vectors_list = model.sv.vectors.tolist()  # 10 dimensions vectors
# Project the sentence vectors down to 2-D with t-SNE for plotting/clustering.
# tsne = TSNE(n_components=3)
tsne = TSNE(n_components=2)
tsne_vectors = tsne.fit_transform(vectors_list)

# Disabled k-sweep for choosing a KMeans cluster count via silhouette/inertia.
# scores = []
# for k in range(2,20):
#     x = k
#     kmeans = KMeans(n_clusters=x, random_state=0)
#     kmeans = kmeans.fit(tsne_vectors)
#     labels = kmeans.labels_
#     score = silhouette_score(tsne_vectors, labels)
#     inertia = kmeans.inertia_
#     scores.append((k, score,inertia))
Exemplo n.º 9
0
 def encode_batch(self, texts):
     """Tokenize each text in *texts* and return the inferred fse
     sentence vectors for the whole batch.
     """
     indexed = IndexedList([self._tokenize(text) for text in texts])
     return self.model.infer(indexed)
Exemplo n.º 10
0
 def encode(self, text):
     """Return the fse sentence vector for a single *text*, squeezed to 1-D."""
     tokens = self._tokenize(text)
     vectors = self.model.infer(IndexedList([tokens]))
     return vectors.squeeze()
Exemplo n.º 11
0
 def calculate_embeddings(self, list):
     """Infer and return fse embeddings for *list* (tokenized sentences).

     NOTE(review): the parameter name shadows the builtin ``list``; it is
     kept unchanged for interface compatibility with keyword callers.
     """
     from fse import IndexedList
     indexed = IndexedList(list)
     return self.model.infer(indexed)
Exemplo n.º 12
0
 def fit(self, list):
     """Train the underlying fse model on *list* (tokenized sentences).

     NOTE(review): the parameter name shadows the builtin ``list``; it is
     kept unchanged for interface compatibility with keyword callers.
     """
     from fse import IndexedList
     self.model.train(IndexedList(list))
                     chunks=(2048, None),
                     dtype="f4")
else:
    z_embs = z[emb_path]

# encode & save
# Two encoder paths: transformer ("bert") models embed batch-by-batch
# straight into the pre-allocated array `z_embs`; fse models first need a
# full training pass over the corpus before inference.
if "bert" in model_name:
    for i, batch in enumerate(tqdm(loader)):
        # encode
        embs = batch_encode(batch, tokenizer, model).cpu().numpy()
        # save — batches are written at fixed offsets of args.batch_size,
        # so only the final batch may be shorter.
        start = i * args.batch_size
        end = start + embs.shape[0]
        z_embs[start:end] = embs[:]
elif "fse" in model_name:
    sent_model = SIF(model, workers=8, lang_freq="en")
    # train — TextBlob tokenization per batch; IndexedList carries the
    # per-sentence indices fse expects.
    for i, batch in enumerate(loader):
        sentences = IndexedList([TextBlob(s).tokens for s in batch])
        sent_model.train(sentences)
    sent_model.save(fpath.parent / "fse.model")
    # infer — second pass over the same loader with the trained model.
    for i, batch in enumerate(loader):
        sentences = IndexedList([TextBlob(s).tokens for s in batch])
        # encode
        embs = batch_encode(sentences, sent_model)
        # save
        start = i * args.batch_size
        end = start + embs.shape[0]
        z_embs[start:end] = embs[:]
Exemplo n.º 14
0
# Read each file in `filelist` as one "sentence" (its full token stream)
# and train both an Average and a SIF fse model on the corpus.
sentences = list()
for fileitem in filelist:
    print("Reading " + fileitem + "...")
    filepath = os.path.join(dirpath, fileitem)
    with open(filepath + ".txt") as f:
        tokens = list()
        for line_tokens in map(lambda x: x.split(), f.read().split("\n")):
            tokens.extend(line_tokens)
        # BUG FIX: append the accumulated token list for the whole file;
        # the original appended the loop variable, keeping only the
        # tokens of the file's last line.
        sentences.append(tokens)

    print("Read " + fileitem)
wvmod = gensim.downloader.load("word2vec-google-news-300")

avg = Average(wvmod)
# Reuse the already-downloaded vectors instead of fetching the (large)
# model a second time.
avg.wvmod = wvmod
# BUG FIX: the original called a bare `train(...)` (NameError); train the
# Average model itself.
avg.train(IndexedList(sentences))
sif = SIF(wvmod)
sif.train(IndexedList(sentences))

# Build the full pairwise document-similarity matrix over all files,
# using the SIF model's scores.
simMat = [[0 for a in filelist] for b in filelist]
for a in range(len(filelist)):
    for b in range(len(filelist)):
        sim1 = avg.sv.similarity(a, b)  # Average-model score (currently unused)
        sim2 = sif.sv.similarity(a, b)
        simMat[a][b] = sim2
        # simMat[a][b] = scaled_sim(sim1, sim2)

for i in range(len(filelist)):
    print('  '.join(["     "] +
                    [str(a).center(7, ' ') for a in range(len(filelist))]))
    print(str(i).center(4, " "), end="  ")