Example #1
import os
from collections import Counter

import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm
from wmd import WMD


def main(args):
    # location of input-output
    data_dir = args.input
    train_loc = os.path.join(data_dir, "train.csv")
    test_loc = os.path.join(data_dir, "test.csv")
    train = pd.read_csv(train_loc)
    test = pd.read_csv(test_loc)
    nlp = spacy.load("en_core_web_lg")

    def extract_bow(data, text_col, id_col, uniq_tokens=None):
        documents = {}
        sent = {}
        if uniq_tokens is None:
            uniq_tokens = {}
        for i, line in tqdm(data.iterrows(), total=data.shape[0]):
            # TODO: remove after debugging
            sent[line[id_col]] = line[text_col]
            if i == 1000:
                # TODO: remove after experiments
                break

            text = nlp(line[text_col])
            tokens = [t for t in text if t.is_alpha and not t.is_stop]
            orths = {t.text: t.orth for t in tokens}
            words = Counter(t.text for t in tokens if t.text in nlp.vocab)
            sorted_words = sorted(words)
            documents[line[id_col]] = (line[id_col], [
                orths[t] for t in sorted_words
            ], np.array([words[t] for t in sorted_words], dtype=np.float32))
        return documents, uniq_tokens, sent

    tid1_nlp, uniq_tokens, tid1_sent = extract_bow(train,
                                                   text_col="title1_en",
                                                   id_col="tid1")
    tid2_nlp, uniq_tokens, tid2_sent = extract_bow(train,
                                                   text_col="title2_en",
                                                   id_col="tid2",
                                                   uniq_tokens=uniq_tokens)

    class SpacyEmbeddings(object):
        def __getitem__(self, item):
            return nlp.vocab[item].vector

    from wmd import TailVocabularyOptimizer

    tid1_calc = WMD(SpacyEmbeddings(),
                    tid1_nlp,
                    vocabulary_min=10,
                    vocabulary_optimizer=TailVocabularyOptimizer(1.))
    tid2_calc = WMD(SpacyEmbeddings(), tid2_nlp, vocabulary_min=3)
    def word_mover_distance(word_embedding_dict_source,
                            word_embedding_dict_target):
        """ Calculate euclidean distance between two dictionaries of arrays.
        """
        try:
            source = np.array(word_embedding_dict_source, dtype=np.float32)
            target = np.array(word_embedding_dict_target, np.float32)
            embeddings = np.concatenate((source, target))

            source_len = source.shape[0]
            target_len = target.shape[0]

            source_words = np.array([i for i in range(source_len)],
                                    dtype=np.int32)
            target_words = np.array(
                [source_len + i for i in range(target_len)], dtype=np.int32)

            source_weights = np.array([1 for i in range(source_len)],
                                      dtype=np.int32)
            target_weights = np.array([1 for i in range(target_len)],
                                      dtype=np.int32)

            nbow = {
                "source": ("source", source_words, source_weights),
                "target": ("target", target_words, target_weights)
            }
            calc = WMD(embeddings, nbow, vocabulary_min=2)

            return calc.nearest_neighbors("source", 1)[0][1]

        except (ValueError, IndexError):
            return 0
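# A minimal usage sketch for the helper above -- not part of the original
# script; it assumes word_mover_distance is accessible (e.g. lifted out of
# main) and that the en_core_web_lg model is installed.
import spacy

demo_nlp = spacy.load("en_core_web_lg")

def sentence_vectors(sentence):
    # one vector per alphabetic, non-stopword token
    return [t.vector for t in demo_nlp(sentence) if t.is_alpha and not t.is_stop]

src_vecs = sentence_vectors("Police arrest man over fake news story")
tgt_vecs = sentence_vectors("Officers detain suspect over a fabricated report")
print(word_mover_distance(src_vecs, tgt_vecs))  # prints 0 if either side is empty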
Example #3
def calc_smd(input_f, output_f="", WORD_REP='elmo', METRIC='sms'):
    if WORD_REP == "elmo":
        MODEL = ElmoEmbedder()
    with open(input_f, 'r') as inF:
        inLines = inF.readlines()
    #print("Found", len(inLines), "documents")
    token_doc_list, text_doc_list = tokenize_texts(inLines, WORD_REP, tokenize=True)
    count = 0
    results_list = []
    for doc_id in range(len(token_doc_list)):
        doc = token_doc_list[doc_id]
        text = text_doc_list[doc_id]
        # transform doc to ID list, both words and/or sentences. get ID dict that maps to emb
        [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text, WORD_REP, MODEL, METRIC)
        # get D values
        [ref_id_list, hyp_id_list], [ref_d, hyp_d] = get_weights([ref_ids, hyp_ids], METRIC)
        # format doc as expected: {id: (id, ref_id_list, ref_d)}
        doc_dict = {"0": ("ref", ref_id_list, ref_d), "1": ("hyp", hyp_id_list, hyp_d)}
        calc = WMD(rep_map, doc_dict, vocabulary_min=1)
        try:
            dist = calc.nearest_neighbors(str(0), k=1, early_stop=1)[0][1]  # how far is hyp from ref?
        except Exception as e:
            print(e)
            continue  # otherwise `dist` would be unbound below
        sim = math.exp(-dist)  # switch to similarity
        results_list.append(sim)
        if doc_id == int((len(token_doc_list) / 10.) * count):
            print(str(count * 10) + "% done with calculations")
            count += 1
    if output_f != "":
        print_score(inLines, output_f, results_list)
    else:
        print("Results: ", np.mean(results_list))

    return 'Done!'
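# The ref/hyp layout above recurs in most of the examples below: with only two
# entries in the document dict, the single nearest neighbour of "0" is "1", so
# [0][1] is the ref-hyp distance. A self-contained toy illustration (made-up data):
import numpy as np
from wmd import WMD

toy_embeddings = np.array([[1.0, 0.0], [0.0, 1.0], [0.7, 0.7]], dtype=np.float32)
toy_docs = {
    "0": ("ref", [0, 2], np.array([1.0, 1.0], dtype=np.float32)),
    "1": ("hyp", [1, 2], np.array([1.0, 1.0], dtype=np.float32)),
}
toy_calc = WMD(toy_embeddings, toy_docs, vocabulary_min=1)
print(toy_calc.nearest_neighbors("0", k=1, early_stop=1))  # [("1", <distance>)]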
Example #4
    def compute_q(self, f_df, q_df, return_f_nbow=False):
        logger.info('Computing question wmds')
        f_nbow = {
            row.Index: self.nbowify(row.Index, row.original)
            for row in f_df.itertuples()
        }
        nb_facts = len(f_nbow)
        q_nbow = {
            row.Index + nb_facts: self.nbowify(row.Index + nb_facts,
                                               row.original)
            for row in q_df.itertuples()
        }

        merged_fnbow = copy.copy(f_nbow)
        merged_fnbow.update(q_nbow)
        q_calc = WMD(SpacyEmbeddings(self.nlp),
                     merged_fnbow,
                     vocabulary_min=1,
                     verbosity=logging.WARNING)
        q_calc.cache_centroids()
        q_closest = pd.Series(
            np.array([
                i for i, _ in q_calc.nearest_neighbors(
                    idx, k=self.config.nearest_k_visible) if i < nb_facts
            ]) for idx in tqdm(q_nbow.keys(), desc='Question wmd...'))
        return (q_closest, f_nbow) if return_f_nbow else q_closest
Example #5
    def get_similar_bugs(self, query):

        # keep `query` as the original bug dict so that query["id"] below still works
        query_terms = self.text_preprocess(self.get_text(query))
        words = [
            word for word in set(chain(query_terms, *self.corpus))
            if word in self.w2vmodel.wv
        ]
        indices, words = zip(*sorted(((index, word) for (
            index, _), word in zip(self.dictionary.doc2bow(words), words))))
        query_tfidf = dict(self.tfidf[self.dictionary.doc2bow(query_terms)])
        query_tfidf = [(new_index, query_tfidf[dict_index])
                       for new_index, dict_index in enumerate(indices)
                       if dict_index in query_tfidf]
        documents = [
            dict(self.tfidf[self.dictionary.doc2bow(document)])
            for document in self.corpus
        ]
        documents = [[(new_index, document[dict_index])
                      for new_index, dict_index in enumerate(indices)
                      if dict_index in document] for document in documents]
        embeddings = np.array([self.w2vmodel.wv[word] for word in words],
                              dtype=np.float32)
        nbow = dict(((index, list(chain([None], zip(*document))))
                     for index, document in enumerate(documents)
                     if document != []))
        nbow["query"] = tuple([None] + list(zip(*query)))
        distances = WMD(embeddings, nbow,
                        vocabulary_min=1).nearest_neighbors("query")

        return [
            self.bug_ids[distance[0]] for distance in distances
            if self.bug_ids[distance[0]] != query["id"]
        ]
Example #6
    def get_distance(self, query1, query2):
        query1 = self.text_preprocess(self.get_text(query1))
        query2 = self.text_preprocess(self.get_text(query2))

        words = [
            word for word in set(chain(query1, query2, *self.corpus))
            if word in self.w2vmodel.wv
        ]
        indices, words = zip(*sorted(((index, word) for (
            index, _), word in zip(self.dictionary.doc2bow(words), words))))
        query1 = dict(self.tfidf[self.dictionary.doc2bow(query1)])
        query2 = dict(self.tfidf[self.dictionary.doc2bow(query2)])

        query1 = [(new_index, query1[dict_index])
                  for new_index, dict_index in enumerate(indices)
                  if dict_index in query1]
        query2 = [(new_index, query2[dict_index])
                  for new_index, dict_index in enumerate(indices)
                  if dict_index in query2]
        embeddings = np.array([self.w2vmodel.wv[word] for word in words],
                              dtype=np.float32)
        nbow = {}
        nbow["query1"] = tuple([None] + list(zip(*query1)))
        nbow["query2"] = tuple([None] + list(zip(*query2)))
        distances = WMD(embeddings, nbow,
                        vocabulary_min=1).nearest_neighbors("query1")

        return distances[0][1]
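# The tuple([None] + list(zip(*query))) construction above just reshapes a list
# of (word index, tf-idf weight) pairs into the (name, word ids, weights) triple
# that WMD expects, with None standing in for the display name. For example:
pairs = [(0, 0.4), (3, 0.9), (7, 0.2)]
print(tuple([None] + list(zip(*pairs))))
# -> (None, (0, 3, 7), (0.4, 0.9, 0.2))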
def calculate_similarity(candidate, next_id, emb):
    s = time.time()
    can_doc = calculator.nlp(candidate[essay_field])
    similarities = []
    next_id, emb, can_id_list, can_weights = calculator.get_embeddings_ids_weights(
        can_doc, next_id, emb, method)
    nbow = {"hypothesis": ("hypothesis", can_id_list, can_weights)}

    for id, item in processed_refs.items():
        ref_weights = item["weights"]
        ref_id_list = item["id_list"]
        nbow[id] = (id, ref_id_list, ref_weights)

    calc = WMD(emb, nbow, vocabulary_min=1)
    # print("NBOW")
    # print(nbow)
    distances = calc.nearest_neighbors("hypothesis",
                                       k=len(processed_refs),
                                       early_stop=1)

    for id, dist in distances:
        similarity = np.exp(-dist)
        similarities.append({
            "candidate_id": candidate[id_field],
            "reference_id": id,
            "similarity": similarity,
            "dist": dist,
            "score": candidate[score_field]
        })
    print("Time taken for candidate " + str(candidate[id_field]) + " is " +
          str(time.time() - s))

    return similarities
 def get_similarity_dist(self, candidate, reference, method):
     emb, nbow = self.get_emb_nbow(candidate, reference, method)
     # print("emb:", emb.keys())
     # print("nbow:", nbow)
     calc = WMD(emb, nbow, vocabulary_min=1)
     dist = calc.nearest_neighbors("reference", k=1, early_stop=1)
     # print("Dist:", dist)
     dist = dist[0][1]
     similarity = np.exp(-dist)
     return similarity, dist
Example #9
 def __init__(self,
              id2vec=None,
              df=None,
              nbow=None,
              verbosity=logging.DEBUG,
              wmd_cache_centroids=True,
              wmd_kwargs=None,
              gcs_bucket=None,
              repo2nbow_kwargs=None,
              initialize_environment=True):
     if initialize_environment:
         initialize()
     self._log = logging.getLogger("similar_repos")
     self._log.setLevel(verbosity)
     if gcs_bucket:
         backend = create_backend(args="bucket=" + gcs_bucket)
     else:
         backend = create_backend()
     if id2vec is None:
         self._id2vec = Id2Vec(log_level=verbosity, backend=backend)
     else:
         assert isinstance(id2vec, Id2Vec)
         self._id2vec = id2vec
     self._log.info("Loaded id2vec model: %s", self._id2vec)
     if df is None:
         if df is not False:
             self._df = DocumentFrequencies(log_level=verbosity,
                                            backend=backend)
         else:
             self._df = None
             self._log.warning("Disabled document frequencies - you will "
                               "not be able to query custom repositories.")
     else:
         assert isinstance(df, DocumentFrequencies)
         self._df = df
     self._log.info("Loaded document frequencies: %s", self._df)
     if nbow is None:
         self._nbow = NBOW(log_level=verbosity, backend=backend)
     else:
         assert isinstance(nbow, NBOW)
         self._nbow = nbow
     self._log.info("Loaded nBOW model: %s", self._nbow)
     self._repo2nbow = Repo2nBOW(self._id2vec,
                                 self._df,
                                 log_level=verbosity,
                                 **(repo2nbow_kwargs or {}))
     self._log.info("Creating the WMD engine...")
     self._wmd = WMD(self._id2vec.embeddings,
                     self._nbow,
                     verbosity=verbosity,
                     **(wmd_kwargs or {}))
     if wmd_cache_centroids:
         self._wmd.cache_centroids()
Example #10
def get_sim(doc, text, wordrep, model, metric):
    [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text, wordrep, model, metric)
    [ref_id_list, hyp_id_list], [ref_d, hyp_d] = get_weights([ref_ids, hyp_ids], metric)
    # format doc as expected: {id: (id, ref_id_list, ref_d)}
    doc_dict = {"0": ("ref", ref_id_list, ref_d), "1": ("hyp", hyp_id_list, hyp_d)}
    calc = WMD(rep_map, doc_dict, vocabulary_min=1)
    try:
        dist = calc.nearest_neighbors(str(0), k=1, early_stop=1)[0][1]  # how far is hyp from ref?
    except Exception:
        return 0.0
    sim = math.exp(-dist)  # switch to similarity
    return sim
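# The exp(-dist) mapping above turns a distance of 0 into a similarity of 1 and
# pushes larger distances toward 0; a quick numeric check:
import math
for d in (0.0, 0.5, 2.0):
    print(d, round(math.exp(-d), 3))  # 1.0, 0.607, 0.135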
Example #11
def calc_smd(inLines, model):
    global nlp
    nlp = model
    #print("Found", len(inLines), "documents")
    # TODO: rewrite this
    token_doc_list, text_doc_list = tokenize_texts(inLines)
    results_list = []
    for doc_id in range(len(token_doc_list)):
        doc = token_doc_list[doc_id]
        # TODO: rewrite this
        text = text_doc_list[doc_id]
        # transform doc to ID list, both words and/or sentences. get ID dict that maps to emb
        # TODO: rewrite this
        try:
            [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text)
        except ValueError:
            print(inLines[doc_id])
            print('ValueError: max() arg is an empty sequence; get_embeddings')
            continue
        # get D values
        [ref_id_list, hyp_id_list], [ref_d,
                                     hyp_d] = get_weights([ref_ids, hyp_ids])
        # format doc as expected: {id: (id, ref_id_list, ref_d)}
        doc_dict = {
            "0": ("ref", ref_id_list, ref_d),
            "1": ("hyp", hyp_id_list, hyp_d)
        }
        calc = WMD(rep_map, doc_dict, vocabulary_min=1)
        try:
            dist = calc.nearest_neighbors(
                str(0), k=1, early_stop=1)[0][1]  # how far is hyp from ref?
            sim = math.exp(-dist)  # switch to similarity
        except IndexError:
            print(
                'dist = calc.nearest_neighbors(str(0), k=1, early_stop=1)[0][1]'
            )
            print('IndexError: list index out of range')
            print(inLines[doc_id])
            continue
        except UnboundLocalError:
            print('dist could not be calculated')
            print(inLines[doc_id])
            continue
        except ValueError:
            print('Too little vocabulary')
            print(inLines[doc_id])
            continue

        results_list.append((inLines[doc_id], sim))

    return score_list(results_list)
Example #12
 def __init__(self, id2vec=None, df=None, nbow=None, prune_df_threshold=1,
              verbosity=logging.DEBUG, wmd_cache_centroids=True, wmd_kwargs=None,
              gcs_bucket=None, repo2nbow_kwargs=None, initialize_environment=True):
     if initialize_environment:
         initialize()
     self._log = logging.getLogger("similar_repos")
     self._log.setLevel(verbosity)
     if gcs_bucket:
         backend = create_backend(args="bucket=" + gcs_bucket)
     else:
         backend = create_backend()
     if id2vec is None:
         self._id2vec = Id2Vec(log_level=verbosity).load(backend=backend)
     else:
         assert isinstance(id2vec, Id2Vec)
         self._id2vec = id2vec
     self._log.info("Loaded id2vec model: %s", self._id2vec)
     if df is None:
         if df is not False:
             self._df = DocumentFrequencies(log_level=verbosity).load(backend=backend)
         else:
             self._df = None
             self._log.warning("Disabled document frequencies - you will "
                               "not be able to query custom repositories.")
     else:
         assert isinstance(df, DocumentFrequencies)
         self._df = df
     if self._df is not None:
         self._df = self._df.prune(prune_df_threshold)
     self._log.info("Loaded document frequencies: %s", self._df)
     if nbow is None:
         self._nbow = NBOW(log_level=verbosity).load(backend=backend)
     else:
         assert isinstance(nbow, NBOW)
         self._nbow = nbow
     self._log.info("Loaded nBOW model: %s", self._nbow)
     self._repo2nbow = Repo2nBOW(
         self._id2vec, self._df, log_level=verbosity, **(repo2nbow_kwargs or {}))
     assert self._nbow.dep("id2vec")["uuid"] == self._id2vec.meta["uuid"]
     if len(self._id2vec) != self._nbow.matrix.shape[1]:
         raise ValueError("Models do not match: id2vec has %s tokens while nbow has %s" %
                          (len(self._id2vec), self._nbow.matrix.shape[1]))
     self._log.info("Creating the WMD engine...")
     self._wmd = WMD(self._id2vec.embeddings, self._nbow,
                     verbosity=verbosity, **(wmd_kwargs or {}))
     if wmd_cache_centroids:
         self._wmd.cache_centroids()
Example #13
 def __init__(self,
              id2vec=None,
              df=None,
              nbow=None,
              prune_df_threshold=1,
              wmd_cache_centroids=True,
              wmd_kwargs: Dict[str, Any] = None,
              languages: Tuple[List, bool] = (None, False),
              engine_kwargs: Dict[str, Any] = None):
     backend = create_backend()
     if id2vec is None:
         self._id2vec = Id2Vec().load(backend=backend)
     else:
         assert isinstance(id2vec, Id2Vec)
         self._id2vec = id2vec
     self._log.info("Loaded id2vec model: %s", self._id2vec)
     if df is None:
         if df is not False:
             self._df = DocumentFrequencies().load(backend=backend)
         else:
             self._df = None
             self._log.warning("Disabled document frequencies - you will "
                               "not be able to query custom repositories.")
     else:
         assert isinstance(df, DocumentFrequencies)
         self._df = df
     if self._df is not None:
         self._df = self._df.prune(prune_df_threshold)
     self._log.info("Loaded document frequencies: %s", self._df)
     if nbow is None:
         self._bow = BOW().load(backend=backend)
     else:
         assert isinstance(nbow, BOW)
         self._bow = nbow
     self._log.info("Loaded BOW model: %s", self._bow)
     assert self._bow.get_dep("id2vec")["uuid"] == self._id2vec.meta["uuid"]
     if len(self._id2vec) != self._bow.matrix.shape[1]:
         raise ValueError(
             "Models do not match: id2vec has %s tokens while nbow has %s" %
             (len(self._id2vec), self._bow.matrix.shape[1]))
     self._log.info("Creating the WMD engine...")
     self._wmd = WMD(self._id2vec.embeddings, self._bow, **(wmd_kwargs
                                                            or {}))
     if wmd_cache_centroids:
         self._wmd.cache_centroids()
     self._languages = languages
     self._engine_kwargs = engine_kwargs
Example #14
    def compute_f(self, f_df, f_nbow=None):
        logger.info('Computing fact wmds')
        f_nbow = {
            row.Index: self.nbowify(row.Index, row.original)
            for row in f_df.itertuples()
        } if f_nbow is None else f_nbow

        f_calc = WMD(SpacyEmbeddings(self.nlp),
                     f_nbow,
                     vocabulary_min=1,
                     verbosity=logging.WARNING)
        f_calc.cache_centroids()
        f_closest = pd.Series(
            np.array([
                i for i, _ in f_calc.nearest_neighbors(
                    idx, k=self.config.nearest_k_visible)
            ]) for idx in tqdm(f_nbow.keys(), desc='Fact wmd...'))
        return f_closest
 def fit_wme_model(self, d_max=6, r=1024):
     self._r = r
     possible_words = list(self.word_mapping)
     nbow = {}
     for i in range(r):
         d = random.sample(range(1, d_max + 1), 1)[0]
         random_doc = random.sample(possible_words, d)
         doc_embeddings = [self.word_mapping[word] for word in random_doc]
         document, idf_ids = zip(*[(word.glove_id, word.idf_id)
                                   for word in doc_embeddings])
         words = np.array(document, dtype=np.uint32)
         idf_weights = np.array(
             [self.tf_idf_model.idf_[idf_id] for idf_id in idf_ids],
             dtype=np.float32)
         weights = idf_weights
         doc_id = '#' + str(i + 1)
         nbow[doc_id] = (doc_id, words, weights)
     self.wmd = WMD(embeddings=self.glove_model.word_vectors.astype(
         np.float32),
                    nbow=nbow,
                    vocabulary_min=1)
Example #16
def calc_smd(opts, output_f=""):
    with open(opts.input_file, 'r') as inF:
        inLines = inF.readlines()
    print("Found", len(inLines), "documents")
    token_doc_list, text_doc_list = tokenize_texts(inLines)
    count = 0
    results_list = []
    for doc_id in range(len(token_doc_list)):
        doc = token_doc_list[doc_id]
        text = text_doc_list[doc_id]
        # transform doc to ID list, both words and/or sentences. get ID dict that maps to emb
        [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text)
        # get D values
        [ref_id_list, hyp_id_list], [ref_d,
                                     hyp_d] = get_weights([ref_ids, hyp_ids])
        # format doc as expected: {id: (id, ref_id_list, ref_d)}
        doc_dict = {
            "0": ("ref", ref_id_list, ref_d),
            "1": ("hyp", hyp_id_list, hyp_d)
        }
        calc = WMD(rep_map, doc_dict, vocabulary_min=1)
        try:
            dist = calc.nearest_neighbors(
                str(0), k=1, early_stop=1)[0][1]  # how far is hyp from ref?
        except Exception:
            print(doc, text)
            continue  # otherwise `dist` would be unbound below
        sim = math.exp(-dist)  # switch to similarity
        results_list.append(sim)
        if doc_id == int((len(token_doc_list) / 10.) * count):
            print(str(count * 10) + "% done with calculations")
            count += 1
    # added by wchen to compute correlation scores with human annotated scores
    with open(opts.score_file, 'r') as hscoreF:
        hscoreLines = hscoreF.readlines()
    compute_corrs(opts, results_list, hscoreLines)
Example #17
def calc_smd(ref, hyp, model):
    global nlp
    nlp = model
    doc, text = tokenize_texts([ref, hyp])
    count = 0
    results_list = []
    # transform doc to ID list, both words and/or sentences. get ID dict that maps to emb
    [ref_ids, hyp_ids], rep_map = get_embeddings(doc, text)
    # get D values
    [ref_id_list, hyp_id_list], [ref_d,
                                 hyp_d] = get_weights([ref_ids, hyp_ids])
    # format doc as expected: {id: (id, ref_id_list, ref_d)}
    doc_dict = {
        "0": ("ref", ref_id_list, ref_d),
        "1": ("hyp", hyp_id_list, hyp_d)
    }
    calc = WMD(rep_map, doc_dict, vocabulary_min=1)
    try:
        dist = calc.nearest_neighbors(
            str(0), k=1, early_stop=1)[0][1]  # how far is hyp from ref?
    except Exception:
        print(doc, text)
        return 0.0  # no distance could be computed
    sim = math.exp(-dist)  # switch to similarity
    return sim
Example #18
import sys
from collections import Counter

import numpy
import requests
import spacy
from wmd import WMD

# Any English spaCy model that ships word vectors works here; the exact model
# name is not shown in the original snippet, so this one is an assumption.
nlp = spacy.load("en_core_web_lg")

# List of page names we will fetch from Wikipedia and query for similarity
titles = sys.argv[1:] or ["Germany", "Spain", "Google"]

documents = {}
for title in titles:
    print("fetching", title)
    pages = requests.get(
        "https://en.wikipedia.org/w/api.php?action=query&format=json&titles=%s"
        "&prop=extracts&explaintext" % title).json()["query"]["pages"]
    print("parsing", title)
    text = nlp(next(iter(pages.values()))["extract"])
    tokens = [t for t in text if t.is_alpha and not t.is_stop]
    words = Counter(t.text for t in tokens)
    orths = {t.text: t.orth for t in tokens}
    sorted_words = sorted(words)
    documents[title] = (title, [orths[t] for t in sorted_words],
                        numpy.array([words[t] for t in sorted_words],
                                    dtype=numpy.float32))


# Hook in WMD
class SpacyEmbeddings(object):
    def __getitem__(self, item):
        return nlp.vocab[item].vector

calc = WMD(SpacyEmbeddings(), documents)
print("calculating")
# Germany shall be closer to Spain than to Google
for title, relevance in calc.nearest_neighbors(titles[0]):
    print("%24s\t%s" % (title, relevance))
Example #19
# pip3 install wmd
# https://github.com/src-d/wmd-relax
# Paper: http://www.cs.cornell.edu/~kilian/papers/wmd_metric.pdf

import time
import numpy
from wmd import WMD
import pickle
embeddings = numpy.array([[0.1, 1], [1, 0.1], [0.8, 0.7]], dtype=numpy.float32)
nbow = {  # each value is (label, word-id list, weight array)
    "first": ("#1", [0, 1, 2], numpy.array([1.5, 0.3, 0.5],
                                           dtype=numpy.float32)),
    "你好": ("#3", [1, 2], numpy.array([1.3, 0.5], dtype=numpy.float32)),
    "second": ("#2", [0, 1], numpy.array([0.75, 0.15], dtype=numpy.float32))
}
calc = WMD(embeddings, nbow, vocabulary_min=2)
origin = "first"
print(calc.nearest_neighbors(origin))

model_file = '/home/gswewf/yhb/model/wx_vector_char.pkl'

with open(model_file, "rb") as f:
    w2v_model = pickle.load(f, encoding='iso-8859-1')  # loading this takes about 60.8 MiB of memory

words_list = []
w_emb = []
for word, emb in w2v_model.items():
    words_list.append(word)
    w_emb.append(emb)
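
# A plausible continuation (assumed, not shown in the original snippet): pack the
# loaded vectors into the float32 matrix and word-to-row mapping that WMD expects.
word2index = {word: i for i, word in enumerate(words_list)}
w_embeddings = numpy.array(w_emb, dtype=numpy.float32)
# documents would then be keyed as (doc_id, [word2index[w] for w in doc_words], weights)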

from jieba.analyse.tfidf import TFIDF
    text = nlp(doctext)
    tokens = [t for t in text if t.is_alpha and not t.is_stop]

    words = Counter(t.text for t in tokens)
    orths = {t.text: t.orth for t in tokens}
    sorted_words = sorted(words)
    documents[title] = (title, [orths[t] for t in sorted_words],
                        numpy.array([words[t] for t in sorted_words],
                                    dtype=numpy.float32))


# Hook in WMD
class SpacyEmbeddings(object):
    def __getitem__(self, item):
        return nlp.vocab[item].vector


embeddings = SpacyEmbeddings()

vocabulary_min = 10
calc = WMD(embeddings, documents, vocabulary_min=vocabulary_min)

print("calculating")
# Germany shall be closer to Spain than to Google

neighbors_of_germany = calc.nearest_neighbors(titles[0])

for title, relevance in neighbors_of_germany:
    print("%24s\t%s" % (title, relevance))