Example #1
from gensim.models import FastText
from fse import IndexedList
from fse.models import Average


def sentence_similarity(text1, text2, similarity_threshold=0.50):
    # modify() is the project's own preprocessing (tokenization), defined elsewhere.
    sentences = [modify(text1), modify(text2)]
    # gensim 3.x API: the `size` parameter was renamed to `vector_size` in gensim 4.
    ft = FastText(sentences, min_count=1, size=12, workers=4)

    model = Average(ft)
    model.train(IndexedList(sentences), update=True, report_delay=10, queue_factor=4)
    sim = model.sv.similarity(0, 1)

    return sim >= similarity_threshold
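All of the examples on this page are built around the same fse pipeline (FastText word vectors, Average sentence vectors, then model.sv.similarity), as in the fse README. To run Example #1 standalone, a hedged sketch with a hypothetical stand-in for the project's modify() preprocessor:

# Hypothetical stand-in; the project's real modify() is not shown in the excerpt.
def modify(text):
    return text.lower().split()

# Prints True iff the averaged-FastText cosine similarity is >= 0.50.
print(sentence_similarity("the cat sat on the mat",
                          "a cat was sitting on the mat"))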
Example #2
def sentence_similarity(los):
    # los: a list of raw sentence strings; modify() tokenizes each one.
    sentences = [modify(i) for i in los]
    ft = FastText(sentences, min_count=1, size=12, workers=4)
    model = Average(ft)
    model.train(IndexedList(sentences),
                update=True,
                report_delay=10,
                queue_factor=4)

    # Cosine similarity of each adjacent pair of sentences.
    res_similar = []
    for i in range(len(los) - 1):
        res_similar.append(model.sv.similarity(i, i + 1))

    return res_similar
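The list variant returns one score per adjacent pair rather than a boolean. For example, with the same hypothetical modify() as above:

los = ["the cat sat on the mat",
       "a cat was sitting on the mat",
       "stock prices fell sharply today"]

res = sentence_similarity(los)  # res[0]: sentences 0 vs 1, res[1]: 1 vs 2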
Example #3
    def sentence_similarity(self, text1, text2, similarity_threshold=0.35):
        sentences = [self.modify(text1), self.modify(text2)]
        ft = FastText(sentences, min_count=1, size=12, workers=4)

        model = Average(ft)

        try:
            model.train(IndexedList(sentences), update=True, report_delay=10, queue_factor=4)
        except ZeroDivisionError:
            # Ignore a ZeroDivisionError occasionally raised during training
            # and fall through to the similarity lookup.
            pass

        sim = model.sv.similarity(0, 1)

        # Unlike Example #1, this variant returns the raw score; the
        # similarity_threshold parameter is kept but no longer applied.
        return sim
Example #4
    def sentence_similarity(self, los, percent=0.6):
        # Assumes `import numpy as np` at module level.
        sentences = [self.modify(i) for i in los]
        ft = FastText(sentences, min_count=1, size=12, workers=4)

        model = Average(ft)
        model.train(IndexedList(sentences),
                    update=True,
                    report_delay=10,
                    queue_factor=4)

        # Cosine similarity of each adjacent pair of sentences.
        res_similar = []
        for i in range(len(los) - 1):
            res_similar.append(model.sv.similarity(i, i + 1))

        # If every pair looks near-identical, damp all scores:
        # each score s becomes s * (1 - percent).
        if np.mean(res_similar) > 0.9:
            for i in range(len(res_similar)):
                res_similar[i] -= percent * res_similar[i]

        return res_similar
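The damping branch is easy to misread: each score is reduced by `percent` of itself, i.e. multiplied by (1 - percent). A worked toy case with the default percent=0.6:

res_similar = [0.95, 0.92]   # mean 0.935 > 0.9, so damping triggers
res_similar = [s - 0.6 * s for s in res_similar]
print(res_similar)           # approximately [0.38, 0.368], i.e. [0.95 * 0.4, 0.92 * 0.4]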
Example #5
    if len(short_input_text) > 50:
        # Hard-wrap long strings into 50-character chunks for HTML display;
        # any remainder past the last full chunk is dropped.
        n = len(short_input_text) // 50
        short_input_text = " ".join([short_input_text[50 * x:50 * (x + 1)] + "-" + "<br>" for x in range(n)])
    if len(short_bot_text) > 50:
        n = len(short_bot_text) // 50
        short_bot_text = " ".join([short_bot_text[50 * x:50 * (x + 1)] + "-" + "<br>" for x in range(n)])
    # Mask raw Facebook CDN URLs so they do not clutter the output.
    short_entity = str(value['entities']) if "scontent.xx.fbcdn.net" not in str(value['entities']) else "url"
    short_actions = str(value['action_1']) if "scontent.xx.fbcdn.net" not in str(value['action_1']) else "url"

    short_input_texts.append(short_input_text)
    short_bot_texts.append(short_bot_text)
    short_entities.append(short_entity)
from sklearn.manifold import TSNE

ft = FastText(sentences, min_count=1, size=10)

model = Average(ft)
model.train(IndexedList(sentences))

vectors_list = model.sv.vectors.tolist()  # 10-dimensional sentence vectors
# tsne = TSNE(n_components=3)
tsne = TSNE(n_components=2)  # project to 2-D for plotting/clustering
tsne_vectors = tsne.fit_transform(vectors_list)

# scores = []
# for k in range(2,20):
#     x = k
#     kmeans = KMeans(n_clusters=x, random_state=0)
#     kmeans = kmeans.fit(tsne_vectors)
#     labels = kmeans.labels_
#     score = silhouette_score(tsne_vectors, labels)
#     inertia = kmeans.inertia_
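The commented-out block above sketches a model-selection loop for KMeans; a runnable version under the same assumptions (cluster the 2-D t-SNE vectors, score k = 2..19 by silhouette), in case it is useful:

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

scores = []
for k in range(2, 20):
    kmeans = KMeans(n_clusters=k, random_state=0).fit(tsne_vectors)
    labels = kmeans.labels_
    scores.append((k, silhouette_score(tsne_vectors, labels), kmeans.inertia_))

best_k = max(scores, key=lambda t: t[1])[0]  # k with the highest silhouette score
print(best_k)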
Example #6
dirpath = "./data"

sentences = list()
for fileitem in filelist:
    print("Reading " + fileitem + "...")
    filepath = os.path.join(dirpath, fileitem)
    with open(filepath + ".txt") as f:
        temps = list()
        for a in map(lambda x: x.split(), f.read().split("\n")):
            temps.extend(a)
        sentences.append(a)

    print("Read " + fileitem)
import gensim.downloader

wvmod = gensim.downloader.load("word2vec-google-news-300")

avg = Average(wvmod)
avg.train(IndexedList(sentences))
sif = SIF(wvmod)
sif.train(IndexedList(sentences))

# Pairwise document-similarity matrix.
simMat = [[0 for a in filelist] for b in filelist]
for a in range(len(filelist)):
    for b in range(len(filelist)):
        sim1 = avg.sv.similarity(a, b)
        sim2 = sif.sv.similarity(a, b)
        simMat[a][b] = sim2
        # simMat[a][b] = scaled_sim(sim1, sim2)

# Print the matrix row by row (the tail of this statement was truncated in
# the source; the row formatting below is a plausible reconstruction).
for i in range(len(filelist)):
    print('  '.join(["     "] + ["%.3f" % simMat[i][j] for j in range(len(filelist))]))
Example #7
from nltk.tokenize import word_tokenize


def prep_sentence(sentence):
    # Keep tokens that pass the not_punc filter, normalized via prep_token.
    tokens = []
    for token in word_tokenize(sentence):
        if not_punc.match(token):
            tokens.extend(prep_token(token))
    return tokens
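prep_sentence relies on two helpers that are not part of the excerpt; hypothetical stand-ins so the snippet runs on its own (the project's real versions may differ):

import re

# Hypothetical stand-ins, not from the original code.
not_punc = re.compile(r".*[A-Za-z0-9].*")  # token must contain a letter or digit

def prep_token(token):
    # Minimal normalization; returns a list because prep_sentence extends with it.
    return [token.lower()]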


import sys
from gensim.models import KeyedVectors
from gensim.models.fasttext import FastTextKeyedVectors
from fse import CSplitIndexedList
from fse.models import Average, SIF, uSIF

# sent_a / sent_b: lists of raw strings defined earlier in the script.
sentences = CSplitIndexedList(sent_a, sent_b, custom_split=prep_sentence)

sentences[0]  # notebook-style inspection of the first (tokens, index) pair
models, results = {}, {}
word2vec = KeyedVectors.load("C:/Users/Kamil/Downloads/word2vec_300_3_polish.bin")

models[f"CBOW-W2V"] = Average(word2vec, lang_freq="pl")
models[f"SIF-W2V"] = SIF(word2vec, components=10)
models[f"uSIF-W2V"] = uSIF(word2vec, length=11)

# glove_300_3_polish2.txt is presumably the output of
# gensim.scripts.glove2word2vec; load it in word2vec text format.
glove = KeyedVectors.load_word2vec_format("C:/Users/Kamil/Downloads/glove_300_3_polish2.txt")
models["CBOW-Glove"] = Average(glove, lang_freq="pl")
print(f"After memmap {sys.getsizeof(glove.vectors)}")
models["SIF-Glove"] = SIF(glove, components=15)
models["uSIF-Glove"] = uSIF(glove, length=11)

ft = FastTextKeyedVectors.load("D:/fasttext_300_3_polish.bin")
models["CBOW-FT"] = Average(ft, lang_freq="pl")
models["SIF-FT"] = SIF(ft, components=10)
models["uSIF-FT"] = uSIF(ft, length=11)