Exemplo n.º 1
0
 def testWord2vec(self):
     """Exercise the word-level word2vec model: nearest neighbors, odd-one-out, and similarity scores."""
     base_dir = os.path.dirname(__file__)
     model_path = os.path.join(base_dir, "./model/word2vec_word.word2vec.model")
     model = Word2Vector.get_word_model(model_path)
     # Single-word query uses the default neighbor count (10).
     neighbors = model.most_similar([u"香辣蟹"])
     self.assertEqual(neighbors[0][0], u"椒盐")
     self.assertEqual(neighbors[1][0], u"排档")
     self.assertEqual(len(neighbors), 10)
     # Multi-word query with an explicit top-n.
     neighbors = model.most_similar([u"香辣蟹", u"啤酒"], topn=5)
     self.assertEqual(neighbors[0][0], u"椒盐")
     self.assertEqual(neighbors[1][0], u"菜品")
     self.assertEqual(len(neighbors), 5)
     # Odd-one-out detection over a small word set.
     outlier = model.doesnt_match([u"香辣蟹", u"啤酒", u"椒盐", u"地铁站"])
     self.assertEqual(outlier, u"地铁站")
     # Pairwise and set-to-set cosine similarity, pinned to narrow ranges
     # so small model regressions are caught.
     cos = model.similarity(u"椒盐", u"香辣蟹")
     self.assertTrue(0.996437 > cos > 0.996436)
     res = model.n_similarity([u"椒盐", u"香辣蟹"], [u"地铁站", u"出去"])
     self.assertTrue(0.980 < res < 0.981)
Exemplo n.º 2
0
# Fix: import defaultdict from the public `collections` package, not the
# private CPython accelerator module `_collections` (an implementation
# detail that is not guaranteed to exist on other interpreters).
# Grouped per PEP 8: standard library first, then third-party.
import logging
import os
from collections import defaultdict

from pipe import select, sort, as_list
from pynlpini import Word2Vector

# Timestamped log format so the progress of the tagging run is traceable.
logging.basicConfig(
    format=
    '%(asctime)s - %(filename)s:%(lineno)s - %(levelname)s - %(message)s',
    level=logging.INFO)

# Input/output CSV locations, relative to this script's working directory.
labelled_word_path = "../../../data/app/word_correlate_tagger/labelled_word.csv"
tag_list_path = "../../../data/app/word_correlate_tagger/tag.csv"
unlabelled_word_path = "../../../data/app/word_correlate_tagger/unlabelled_word.csv"
result_path = "../../../data/app/word_correlate_tagger/tag_result.csv"
# Phrase-level word2vec model; `vocabs` is the set of embeddable phrases,
# used below to filter out tags the model cannot represent.
model = Word2Vector.get_phrase_model()
vocabs = set(model.vocab.keys())
# Maps a known tag (ktag) to the set of user tags (utags) labelled with it.
ktag_to_utags = defaultdict(set)

# Seed the ktag -> {utag} mapping from manually labelled pairs, keeping
# only utags that the phrase model can actually embed.
if os.path.exists(labelled_word_path):
    with open(labelled_word_path) as labelled_tag_file:
        logging.info("Processing " + labelled_word_path)
        for line in labelled_tag_file:
            # Python 2 file iteration yields bytes; decode to unicode
            # before comparing against the model vocabulary.
            # Fix: split the line once instead of three times per row.
            fields = line.strip().decode("utf-8").split("\t")
            # Expect exactly "utag<TAB>ktag"; skip malformed rows.
            if len(fields) == 2:
                utag, ktag = fields
                if utag in vocabs:
                    ktag_to_utags[ktag].add(utag)
if os.path.exists(tag_list_path):
    with open(tag_list_path) as tag_list_file:
Exemplo n.º 3
0
def phrase2vec(txt, topn):
    """Return the top-n neighbors of the space-separated phrases in *txt* as a JSON string."""
    global phrase2vector
    # Lazily load the (expensive) phrase model on first use.
    if phrase2vector is None:
        phrase2vector = Word2Vector.get_phrase_model()
    tokens = txt.split()
    neighbors = phrase2vector.most_similar(tokens, topn=topn)
    return json.dumps(neighbors, ensure_ascii=False)
Exemplo n.º 4
0
def word2vec(txt, topn):
    """Return the top-n neighbors of the space-separated words in *txt* as a JSON string."""
    global word2vector
    # Lazily load the (expensive) word model on first use.
    if word2vector is None:
        word2vector = Word2Vector.get_word_model()
    tokens = txt.split()
    neighbors = word2vector.most_similar(tokens, topn=topn)
    return json.dumps(neighbors, ensure_ascii=False)
Exemplo n.º 5
0
# Fix: import defaultdict from the public `collections` package, not the
# private CPython accelerator module `_collections` (an implementation
# detail that is not guaranteed to exist on other interpreters).
# Grouped per PEP 8: standard library first, then third-party.
import logging
import os
from collections import defaultdict

from pipe import select, sort, as_list
from pynlpini import Word2Vector

# Timestamped log format so the progress of the tagging run is traceable.
logging.basicConfig(format='%(asctime)s - %(filename)s:%(lineno)s - %(levelname)s - %(message)s', level=logging.INFO)

# Input/output CSV locations, relative to this script's working directory.
labelled_word_path = "../../../data/app/word_correlate_tagger/labelled_word.csv"
tag_list_path = "../../../data/app/word_correlate_tagger/tag.csv"
unlabelled_word_path = "../../../data/app/word_correlate_tagger/unlabelled_word.csv"
result_path = "../../../data/app/word_correlate_tagger/tag_result.csv"
# Phrase-level word2vec model; `vocabs` is the set of embeddable phrases,
# used below to filter out tags the model cannot represent.
model = Word2Vector.get_phrase_model()
vocabs = set(model.vocab.keys())
# Maps a known tag (ktag) to the set of user tags (utags) labelled with it.
ktag_to_utags = defaultdict(set)

# Seed the ktag -> {utag} mapping from manually labelled pairs, keeping
# only utags that the phrase model can actually embed.
if os.path.exists(labelled_word_path):
    with open(labelled_word_path) as labelled_tag_file:
        logging.info("Processing " + labelled_word_path)
        for line in labelled_tag_file:
            # Python 2 file iteration yields bytes; decode to unicode
            # before comparing against the model vocabulary.
            # Fix: split the line once instead of three times per row.
            fields = line.strip().decode("utf-8").split("\t")
            # Expect exactly "utag<TAB>ktag"; skip malformed rows.
            if len(fields) == 2:
                utag, ktag = fields
                if utag in vocabs:
                    ktag_to_utags[ktag].add(utag)
if os.path.exists(tag_list_path):
    with open(tag_list_path) as tag_list_file:
        logging.info("Processing " + tag_list_path)
        for key in tag_list_file:
            key = key.strip().decode("utf-8")
Exemplo n.º 6
0
 def testPhrase2vec(self):
     """Exercise the phrase-level word2vec model bundled with the tests."""
     # The model file lives in a `model/` directory next to this test module.
     base_dir = os.path.dirname(__file__)
     phrase2vector = Word2Vector.get_phrase_model(
         os.path.join(base_dir, "./model/word2vec_phrase.word2vec.model"))
     # Multi-phrase query; pin a known neighbor at a fixed rank to catch
     # model regressions.
     res = phrase2vector.most_similar([u"历史悠久", u"法国"])
     self.assertEqual(res[2][0], "halohalo")