Example #1
def compute_both_neighbours(chosen, path_composed_emb, path_observed_emb):
    """
        Returns the neighbours of the composed/observed representations of the chosen words 
        in an observed space.
    """
    original_nearest_neighbours = {}
    composed_nearest_neighbours = {}

    observed_space = Word2VecKeyedVectors.load_word2vec_format(
        path_observed_emb, binary=False)
    observed_space.vectors = normalize(observed_space.vectors,
                                       norm="l2",
                                       axis=1)

    composed_space = Word2VecKeyedVectors.load_word2vec_format(
        path_composed_emb, binary=False)
    composed_space.vectors = normalize(composed_space.vectors,
                                       norm="l2",
                                       axis=1)

    chosen_words = set([tup[0] for tup in chosen])

    composed_words = composed_space.wv.vocab
    observed_words = observed_space.wv.vocab

    for word, rank in chosen:
        original_vec = observed_space.get_vector(word)
        composed_vec = composed_space.get_vector(word)

        original_composed_cosine = np.dot(original_vec, composed_vec)
        # topn=False makes similar_by_vector return the full array of similarity
        # scores (one per vocabulary word, in index order) instead of a
        # (word, score) list, so it can be indexed by word position below.
        sims = observed_space.similar_by_vector(vector=original_vec,
                                                topn=False)
        neighbours = [(observed_space.index2word[widx], sims[widx])
                      for widx in range(len(sims))]
        neighbours.append(("%s\_c" % word, original_composed_cosine))
        sorted_neighbours = sorted(neighbours,
                                   key=lambda tup: tup[1],
                                   reverse=True)
        print("neighbours of the original representation of %s" % (word))
        c_idx = [
            idx for idx, tup in enumerate(sorted_neighbours)
            if tup[0] == "%s_c" % word
        ]
        print(word, original_composed_cosine, c_idx)

        original_nearest_neighbours[word] = sorted_neighbours[:11]
        print(original_nearest_neighbours[word])
        comp_index = c_idx[0]
        if comp_index >= 5:
            composed_nearest_neighbours["%s_c" % word] = \
                sorted_neighbours[comp_index - 5:comp_index + 6]
        else:
            composed_nearest_neighbours["%s_c" % word] = \
                sorted_neighbours[:11 - comp_index]
        print("neighbours of the composed representation of %s" % (word))
        print(composed_nearest_neighbours["%s_c" % word])

    return original_nearest_neighbours, composed_nearest_neighbours
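A minimal usage sketch for the function above; the file paths and the (word, rank) list are hypothetical, and both embedding files are assumed to be in word2vec text format:

chosen = [("apple_tree", 1), ("tea_cup", 2)]
orig_nn, comp_nn = compute_both_neighbours(
    chosen, "composed_vectors.txt", "observed_vectors.txt")
print(orig_nn["apple_tree"])
print(comp_nn["apple_tree_c"])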
Example #2
def read_gensim_model(file_name):
    extension = Path(file_name).suffix
    if extension == '.txt':
        model = Word2VecKeyedVectors.load_word2vec_format(file_name,
                                                          binary=False)
    elif extension == '.bin' or extension == '.w2v':
        model = Word2VecKeyedVectors.load_word2vec_format(file_name,
                                                          binary=True)
    else:
        raise Exception("unknown extension for embeddings file")

    return model
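A short usage sketch; the file names are made up and only illustrate how the extension selects the loader:

model_txt = read_gensim_model("embeddings/vectors.txt")  # text word2vec format
model_bin = read_gensim_model("embeddings/vectors.w2v")  # binary word2vec format
print(model_txt.most_similar("king", topn=5))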
Example #3
def get_latent(args):
	print("Loading embedding model...")
	model_name = 'WORD2VEC_' + args.target_dataset + '.model'
	embedding_model = Word2VecKeyedVectors.load(os.path.join(CONFIG.EMBEDDING_PATH, model_name))
	print("Loading embedding model completed")
	
	full_data = []
	df_data = pd.read_csv(os.path.join(CONFIG.DATASET_PATH, args.target_dataset, 'posts.csv'), header=None, encoding='utf-8-sig')

	short_code_list = []
	row_list = []
	csv_name = 'text_word2vec_' + args.target_dataset + '.csv'
	pbar = tqdm(total=df_data.shape[0])

	for index, row in df_data.iterrows():
		pbar.update(1)
		short_code = row.iloc[0]
		short_code_list.append(short_code)
		text_data = row.iloc[1]
		#full_data.append([text_data, short_code])
		vector_list = []
		for word in text_data.split():
			vector_list.append(embedding_model.get_vector(word))
		vector = np.mean(vector_list, axis=0)
		row_list.append(vector)
		del text_data
	pbar.close()

	result_df = pd.DataFrame(data=row_list, index=short_code_list, columns=[i for i in range(300)])
	result_df.index.name = "short_code"
	result_df.sort_index(inplace=True)
	result_df.to_csv(os.path.join(CONFIG.CSV_PATH, csv_name), encoding='utf-8-sig')
	print("Finish!!!")
Example #4
def emb_to_gensim(e):
    '''
	Convert embedding to gensim format

	Parameters
	----------
	e: Embedding | instance of class Embedding

	Returns
	-------
	gpairs: Word2VecKeyedVectors | Embedding in Gensim format
	'''
    rank = np.shape(e.vectors)[1]
    gpairs = GensimPairs(rank)
    gpairs.add([word for word in e.words], [np.array(v) for v in e.vectors])
    return gpairs
Example #5
def test_doc2vec():
    """
    测试doc2vec的效果
    :return: 输出结果
    """
    documents = get_documents(cache=True, jieba=True)
    #加载模型, training继续训练模型
    model = train_doc2vec(documents, training=True, epoch=200)
    #用于打印
    documents = get_documents(cache=True, jieba=False)
    # 过滤出给的关键字fintags不在字典中的词语 ,所以这个词语没有词向量,无法计算相似度
    filter_tags = [tag for tag in finTags if tag in model.wv]
    if finTags != filter_tags:
        print('给定的fintags这写关键字不在doc2vec生成的字典中, 请更改关键字或者扩充训练文档, 使得训练文档包含这个关键字',
              set(finTags) - set(filter_tags))
    tagsvec = model.wv[filter_tags]
    keywords = []
    for idx, doc in enumerate(documents):
        docvec = model.docvecs[idx]
        # Compute the similarity between every tag and this document
        tagssim = Word2VecKeyedVectors.cosine_similarities(docvec, tagsvec)
        maxsim = max(tagssim)
        keyword = filter_tags[list(tagssim).index(maxsim)]
        print(f"Closest keyword according to doc2vec: {keyword}, "
              f"similarity: {maxsim}, document: {doc}")
        keywords.append(keyword)
    print(keywords)
    return keywords
Example #6
def load_word2vec_binary(file):
    """
    Load word2vec embeddings in binary format, as produced by the original C tool.
    :param file: a binary file.
    :return: KeyedVectors
    """
    return Word2VecKeyedVectors.load_word2vec_format(file, binary=True)
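For illustration, a hedged call with a hypothetical file name:

kv = load_word2vec_binary("GoogleNews-vectors-negative300.bin")
print(kv.similarity("coffee", "tea"))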
Example #7
 def test_persistence_word2vec_format(self):
     """Test storing/loading the model in word2vec format."""
     tmpf = get_tmpfile('gensim_fasttext_w2v_format.tst')
     model = FT_gensim(sentences, min_count=1, size=10)
     model.wv.save_word2vec_format(tmpf, binary=True)
     loaded_model_kv = Word2VecKeyedVectors.load_word2vec_format(tmpf, binary=True)
     self.assertEqual(len(model.wv.vocab), len(loaded_model_kv.vocab))
     self.assertTrue(np.allclose(model['human'], loaded_model_kv['human']))
Example #9
def load_embedding(embedding_dir):
    from gensim.models.keyedvectors import Word2VecKeyedVectors
    token_file = os.path.join(embedding_dir, "token_list.npy")
    token_list = np.load(token_file)
    vector_file = os.path.join(embedding_dir, "vector_list.npy")
    vector_list = np.load(vector_file)
    model = Word2VecKeyedVectors(vector_list.shape[1])
    model.add(token_list, vector_list)
    return model.wv
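A sketch of how the two expected .npy files might be produced and then consumed; the directory layout and the toy tokens/vectors are assumptions, not part of the original code:

import os
import numpy as np

os.makedirs("embedding_dir", exist_ok=True)
np.save(os.path.join("embedding_dir", "token_list.npy"),
        np.array(["cat", "dog", "tree"]))
np.save(os.path.join("embedding_dir", "vector_list.npy"),
        np.random.rand(3, 100).astype(np.float32))

kv = load_embedding("embedding_dir")
print(kv.most_similar("cat", topn=2))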
Example #10
 def _create_keyed_vectors(self) -> KeyedVectors:
     kv = Word2VecKeyedVectors(vector_size=self.vector_dimension)
     if gensim.__version__[0] >= '4':
         kv.key_to_index = self._data().word2idx
     else:
         kv.vocab = _WordEmbedVocabAdapter(self._data())
     kv.vectors = self.matrix
     kv.index2entity = list(self._data().words)
     return kv
Example #11
def compute_neighbours(chosen, path_composed_emb, path_observed_emb,
                       no_neighbours):
    """
        Returns the neighbours of words from a composed space in an observed space.
    """
    nearest_neighbours = {}

    observed_space = Word2VecKeyedVectors.load_word2vec_format(
        path_observed_emb, binary=False)
    observed_space.vectors = normalize(observed_space.vectors,
                                       norm="l2",
                                       axis=1)

    composed_space = Word2VecKeyedVectors.load_word2vec_format(
        path_composed_emb, binary=False)
    composed_space.vectors = normalize(composed_space.vectors,
                                       norm="l2",
                                       axis=1)

    chosen_words = set([tup[0] for tup in chosen])

    composed_words = composed_space.wv.vocab
    observed_words = observed_space.wv.vocab

    for word, rank in chosen:
        original_vec = observed_space.get_vector(word)
        composed_vec = composed_space.get_vector(word)

        original_composed_cosine = np.dot(original_vec, composed_vec)
        neighbours = observed_space.similar_by_vector(vector=original_vec,
                                                      topn=no_neighbours)
        neighbours.append(("%s\_c" % word, original_composed_cosine))
        sorted_neighbours = sorted(neighbours,
                                   key=lambda tup: tup[1],
                                   reverse=True)
        c_idx = [
            idx for idx, tup in enumerate(sorted_neighbours)
            if tup[0] == "%s_c" % word
        ]
        print(word, original_composed_cosine, c_idx)

        nearest_neighbours[word] = sorted_neighbours

    return nearest_neighbours
Example #12
    def load_term_embeddings(
            term_ids: Set[int], emb_path: str,
            idx_to_term: Dict[int, str]) -> Dict[int, List[float]]:
        """Get all embeddings for the given terms from the given file.

        Args:
            term_ids: The ids of the input terms.
            emb_path: The path to the given embedding file.
            idx_to_term: Maps term_id to term.
        Return:
            A dictionary of the form: {term_id: embedding}
        """
        pck = False
        if emb_path.endswith('.pickle'):
            pck = True
            print('  SPECIAL CASE: load embeddings from pickle...')
            with open(emb_path, 'rb') as f:
                emb_dict = pickle.load(f)
                # *** This code would be used if local mean would be
                # computed.
                # print('  Calculating average embeddings...')
                # model = {}
                # for term_id in emb_dict:
                #     embs = []
                #     for doc_id in emb_dict[term_id]:
                #         embs.extend(emb_dict[term_id][doc_id])
                #     model[term_id] = np.mean(embs, axis=0)
                # ***
                model = {tid: emb for tid, emb in emb_dict.items()}
        else:
            logging.getLogger("gensim.models").setLevel(logging.WARNING)
            logging.getLogger("gensim.scripts.glove2word2vec").setLevel(
                logging.WARNING)
            logging.getLogger("gensim").setLevel(logging.WARNING)
            print('Load embeddings from:')
            print(emb_path)
            try:
                model = KeyedVectors.load(emb_path)
            except Exception:
                model = Word2VecKeyedVectors.load_word2vec_format(emb_path,
                                                                  binary=True)
        term_id_to_emb = {}
        global_embs_ids = []
        for term_id in term_ids:
            try:
                if pck:
                    term_id_to_emb[term_id] = model[term_id]
                else:
                    term_id_to_emb[term_id] = model.wv[str(term_id)]
            except KeyError:
                global_embs_ids.append((term_id, idx_to_term[term_id]))
                # term_id_to_emb[term_id] = term_ids_to_embs_global[term_id]
        if global_embs_ids:
            print('WARNING: No embeddings found for:', global_embs_ids)
            print('WARNING: {} terms excluded.'.format(len(global_embs_ids)))
        return term_id_to_emb
Example #13
def save_gensim_model(words, word_reprs, output_file, binary=True):
    """Save word representations in w2v format. Word order is not preserved"""
    vocab = dict()
    for word in words:
        vocab[word] = Vocab(index=len(vocab))

    model = Word2VecKeyedVectors(word_reprs.shape[1])
    model.vocab = vocab
    model.vectors = word_reprs
    model.save_word2vec_format(fname=output_file, binary=binary)
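A hedged usage sketch for save_gensim_model; the word list and the random matrix are toy data:

import numpy as np

words = ["red", "green", "blue"]
word_reprs = np.random.rand(len(words), 50).astype(np.float32)
save_gensim_model(words, word_reprs, "colors.w2v", binary=True)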
Example #14
    def __init__(self, filename: str):
        self.model = Word2VecKeyedVectors.load_word2vec_format(filename)

        # Collect ranked list of words in vocab
        words = self.model.index2word

        w_rank = {}
        for i, word in enumerate(words):
            w_rank[word] = i
        self.words = w_rank
Example #15
def eval_on_file(path_composed_emb, path_observed_emb, save_path):
    raw_observed_space = Word2VecKeyedVectors.load_word2vec_format(
        path_observed_emb, binary=False)

    targets = read_targets(path_composed_emb)
    raw_composed_space = Word2VecKeyedVectors.load_word2vec_format(
        path_composed_emb, binary=False)

    q1, q2, q3, ranks = evaluateRank(targets, raw_composed_space,
                                     raw_observed_space, 1000)
    print("Q1: " + str(q1) + ", Q2: " + str(q2) + ", Q3: " + str(q3))

    if save_path:
        printDictToFile(ranks, save_path + '_rankedCompounds.txt')

        sortedRanks = sorted(ranks.values())
        printListToFile(sortedRanks, save_path + '_ranks.txt')
        logResult(q1, q2, q3, save_path + '_quartiles.txt')

    return q1, q2, q3, ranks
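A hedged invocation sketch; the paths are placeholders, and read_targets, evaluateRank, printDictToFile, printListToFile and logResult are assumed to be defined in the surrounding module:

q1, q2, q3, ranks = eval_on_file("composed_vectors.txt",
                                 "observed_vectors.txt",
                                 "results/run1")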
Example #16
def load_model(model_name, epoch):
    from gensim.models import KeyedVectors
    from gensim.models.keyedvectors import FastTextKeyedVectors, Word2VecKeyedVectors
    from gensim.models.fasttext import load_facebook_vectors, load_facebook_model
    from gensim.models.wrappers import FastText

    if epoch != '50+10':
        # if epoch choice is epoch 10 or 50 (no continued training with CSPC problem texts)
        if model_name.lower() == 'word2vec':
            return Word2VecKeyedVectors.load(f"trained_models/word2vec/idwiki.epoch-{epoch}.dim-300.kv")
        elif model_name.lower() == 'glove':
            return KeyedVectors.load_word2vec_format(
                f"trained_models/glove/converted.idwiki.epoch-{epoch}.dim-300.model.txt")
        elif model_name.lower() == 'fasttext':
            model = FastText.load_fasttext_format(
                f"trained_models/fasttext/idwiki.epoch-{epoch}.dim-300.bin")
            return model.wv
    else:
        # if epoch choice is 50+10, i.e. the 50 epoch word2vec model that's trained further with CSPC problem texts
        return Word2VecKeyedVectors.load(f"trained_models/word2vec/idwiki-cspc.epoch-50.dim-300.kv")

    return None
Example #17
def convert_legacy_to_keyvec(legacy_w2v):
    # dict views are not indexable in Python 3; grab any value to get the dimension
    dim = len(next(iter(legacy_w2v.values())))
    vectors = Word2VecKeyedVectors(dim)

    ws = []
    vs = []

    for word, vect in legacy_w2v.items():
        ws.append(word)
        vs.append(vect)
        assert(len(vect) == dim)
    vectors.add(ws, vs, replace=True)
    return vectors
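A toy usage sketch for the converter above; the vectors are made-up data:

import numpy as np

legacy = {"apple": np.ones(4), "pear": np.zeros(4)}
kv = convert_legacy_to_keyvec(legacy)
print(kv.get_vector("apple"))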
Example #18
def load_word_vectors(key_vecs_file, weights_file):
    """
    loads w2v keyvecs and lexicon into memory
    :param key_vecs_file: path to keyvecs file
    :param weights_file: path to lexicon w2v file
    :return: keyvecs, lexicon
    """
    logger.info("loading word2vec model...")

    wv = Word2VecKeyedVectors.load(key_vecs_file)
    weights = np.load(weights_file)

    return wv, weights
Example #19
    def load_word2vec_file(path):
        """Load from a word2vec file.

        Parameters:
            path (Path): The path to the word2vec file.

        Returns:
            WordEmbedding: The resulting word embedding.
        """
        with redirect_stderr(open(os.devnull, 'w')):
            gensim_obj = Word2VecKeyedVectors.load_word2vec_format(str(path))
        return WordEmbedding(
            gensim_obj=gensim_obj,
            source=path,
        )
Example #20
def distance_of_10_pairs_of_words(X: np.ndarray, word2vec_model: Word2VecKeyedVectors):
    speaker_offset = int(X.shape[0] / 2)
    total_samples = 10
    speaker_1_random = X[np.random.choice(speaker_offset, total_samples, replace=False)]
    speaker_2_random = X[speaker_offset + np.random.choice(speaker_offset, total_samples, replace=False)]

    for speaker_1, speaker_2 in zip(speaker_1_random, speaker_2_random):
        speaker_1 = [w for w in speaker_1.split() if w in word2vec_model]
        speaker_2 = [w for w in speaker_2.split() if w in word2vec_model]

        distance = word2vec_model.n_similarity(speaker_1, speaker_2)
        print("The distance between:")
        print(" ".join(speaker_1))
        print(" ".join(speaker_2))
        print("Distance: ", distance)
Example #21
 def __init__(self,
              threshold=0.5,
              word2vecpath="model/word_embedding/embedding.wv",
              datapath="news_ch_2_seg/7.json"):
     self.threshold = threshold
     self.stopword2tag = {'m', 'p', 'x', 'c', 'uj', 'd', 'f', 'r', 'ul'}
     self.stopword2tag.add('a')
     self.word2vec = Word2VecKeyedVectors.load(word2vecpath)
     with open(datapath, 'r') as load_f:
         self.data = json.load(load_f)
     self.content, self.title, self.label = [], [], []
     self.Xtrain = None
     self.init()
     self.de_stopword()
     self.vectorize()
Example #22
def __get_similarity_words__(embeddings: Word2VecKeyedVectors, words: list,
                             other_words: list) -> float:
    if len(words) == 0 or len(other_words) == 0:
        return 0

    summed_avgs = 0
    for w in words:
        dist = []
        for o_w in other_words:
            sim = embeddings.similarity(w, o_w)
            dist.append(sim)
        avg = sum(dist) / len(other_words)
        summed_avgs += avg

    return summed_avgs / len(words)
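The helper returns, for each word in words, its average similarity to all of other_words, and then averages those values over words. A hedged usage sketch (the embedding file is hypothetical and must contain all four words):

kv = Word2VecKeyedVectors.load_word2vec_format("vectors.txt", binary=False)
score = __get_similarity_words__(kv, ["dog", "cat"], ["puppy", "kitten"])
print(score)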
Example #23
def load_legacy_w2v_as_keyvecs(w2v_file, dim=50):
    vectors = None
    with open(w2v_file, 'r') as f:
        vectors = Word2VecKeyedVectors(dim)

        ws = []
        vs = []

        for line in f:
            vect = line.strip().rsplit()
            word = vect[0]
            vect = np.array([float(x) for x in vect[1:]])
            if(dim == len(vect)):
                ws.append(word)
                vs.append(vect)
        vectors.add(ws, vs, replace=True)
    return vectors
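For reference, a sketch of the plain-text layout this loader expects (one token followed by dim floats per line); the file name and sample values are illustrative:

# vectors_50d.txt (illustrative contents):
#   the 0.418 0.24968 -0.41242 ... (50 values per line)
#   cat 0.45281 -0.50108 -0.53714 ... (50 values per line)
kv = load_legacy_w2v_as_keyvecs("vectors_50d.txt", dim=50)
print(kv.get_vector("cat"))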
Example #24
def model_to_csv(target_model):
	model_name = 'WORD2VEC_' + target_model + '.model'
	model = Word2VecKeyedVectors.load(os.path.join(CONFIG.EMBEDDING_PATH,model_name))
	vocab = list(model.vocab)
	vocab_list = [x for x in vocab]
	print("vocab length: ", len(vocab_list))

	# f_csv = open(DF_PATH+'Word2VecBlog300_5_min10_mecab.csv', 'w', encoding='utf-8-sig', newline='')
	print("started to write csv")
	csv_name = target_model + '.csv'
	f_csv = open(os.path.join(CONFIG.CSV_PATH, csv_name), 'w', encoding='utf-8-sig', newline='')
	wr = csv.writer(f_csv)

	for voca in vocab_list:
		wr.writerow([voca]+model[voca].tolist())

	f_csv.close()
	print("completed to write csv")
Example #25
    def __init__(self, dimensions=None, gensim_obj=None, source=None):
        """Initialize a word embedding.

        At least one of dimensions and gensim_obj must be provided. If both are
        used, dimensions is ignored.

        Parameters:
            dimensions (int): The number of dimensions of the embedding.
            gensim_obj (gensim.Word2VecKeyedVectors):
                A gensim word embedding or related model.
            source (Path): The path of the source file.

        Raises:
            ValueError:
                If neither dimensions nor gensim_obj is provided.
                If dimensions is not a positive integer.
                If the word vectors in the gensim_obj cannot be determined.
        """
        if dimensions is None and gensim_obj is None:
            raise ValueError(
                'one of dimensions or gensim_obj must be provided')
        if gensim_obj is None:
            if not isinstance(dimensions, int) or dimensions <= 0:
                raise ValueError('dimensions must be a positive integer')
            self.keyed_vectors = Word2VecKeyedVectors(dimensions)
        elif isinstance(gensim_obj, WordEmbeddingsKeyedVectors):
            if not hasattr(gensim_obj, 'save_word2vec_format'):
                raise ValueError(
                    f'gensim_obj {type(gensim_obj)} does not have attribute "save_word2vec_format"'
                )
            self.keyed_vectors = gensim_obj
        elif isinstance(gensim_obj, BaseWordEmbeddingsModel):
            if not hasattr(gensim_obj, 'wv'):
                raise ValueError(
                    f'gensim_obj {type(gensim_obj)} does not have attribute "wv"'
                )
            self.keyed_vectors = gensim_obj.wv
        else:
            raise ValueError(
                f'unable to determine word vectors in gensim object {gensim_obj}'
            )
        self.source = source
        # forcefully normalize the vectors
        self.keyed_vectors.vectors = normalize(self.keyed_vectors.vectors)
Example #26
def main():
    parser = argparse.ArgumentParser(
        description="Time how long it takes gensim to read an embedding file.")
    parser.add_argument("embedding",
                        help="The path ot the embeddings file to read")
    parser.add_argument("--format", required=True, choices=("binary", "text"))
    args = parser.parse_args()

    binary = True if args.format == "binary" else False

    tic = time.time()
    _ = Word2VecKeyedVectors.load_word2vec_format(args.embedding,
                                                  binary=binary)
    toc = time.time()

    print(
        json.dumps({
            "file": args.embedding,
            "format": args.format,
            "time": toc - tic
        }))
Example #27
def parse_program(program: str,
                  parser: Parser = None,
                  code2vec: Word2VecKeyedVectors = None) -> nx.DiGraph:
    if parser is None:
        parser: Parser = get_parser()

    tree = parser.parse(bytes(program, "utf8"))

    # Create an empty directed graph
    g: nx.DiGraph = nx.DiGraph()

    queue: Queue = Queue()
    queue.put(tree.root_node)

    while not queue.empty():
        # Build the directed graph in breadth-first order
        node = queue.get()

        if not hasattr(node, 'children'):
            continue

        # Connect the parent node to each of its children (add root-child edges)
        for child in node.children:
            g.add_edge(TreeSitterNode(node, program),
                       TreeSitterNode(child, program))

            queue.put(child)

    # Embeddings are added to each node
    # Use the code2vec embeddings to initialise the representation of each node in the graph
    if code2vec is not None:
        zeros = np.zeros(code2vec.vector_size)
        for node in g.nodes:
            name = node.name.lower()
            if name in code2vec:
                g.add_node(node, data=code2vec.get_vector(name))
            else:
                g.add_node(node, data=zeros)

    return g
Example #28
 def __init__(self, model: str = "glove", aggregation: str = "average"):
     """ Load pre-trained embeddings, either locally if model is a local file path
     or a Word2VecKeyedVector object, or downloaded from the gensim API if a string
     is provided.
     """
     if aggregation not in {"average", "sum", "minmax"}:
         raise ValueError(
             f"Unknown embeddings aggregation mode: {aggregation}, the available "
             "ones are: average, sum, or minmax.")
     if isinstance(model, str):
         model = model.lower()
         if model in DEFAULT_PRETRAINED_EMBEDDINGS.keys():
             model_gensim_name = DEFAULT_PRETRAINED_EMBEDDINGS[model]
             self.model = api.load(model_gensim_name)
         elif model in api.info()["models"].keys():
             self.model = api.load(model)  # pragma: no cover
         elif os.path.exists(model):
             logger.info("Loading local model")
             self.model = Word2VecKeyedVectors.load(model)
             if not isinstance(self.model, Word2VecKeyedVectors):
                 raise TypeError(
                     "The input model should be a Word2VecKeyedVectors object but "
                     f"it is a {type(self.model)} object.")
         else:
             raise KeyError(
                 f"Unknown pre-trained model name: {model}. Available models are"
                 + ", ".join(api.info()["models"].keys()))
         logger.info("Loaded model keyed vectors: " + model)
     elif isinstance(model, Word2VecKeyedVectors):
         self.model = model
         logger.info("Loaded model keyed vectors.")
     else:
         raise TypeError(
             "Input pre-trained model should be a string or a gensim "
             "Word2VecKeyedVectors object")
     self.aggregation = aggregation
     self.embedding_dimension = self.model.vector_size
     if self.aggregation == "minmax":
         self.embedding_dimension *= 2
Example #29
def retrain():
    with app.app_context():
        temp = Projects.query.with_entities(Projects.title).all()
        titles = [i[0] for i in temp]
        temp = Projects.query.with_entities(Projects.abstract).all()
        abstracts = [i[0] for i in temp]

        msrcsv = 'MetaData/' + 'MSRTrainData.csv'
        leecsv = 'MetaData/' + 'LeeDocSimTrain.csv'
        tit_df = pd.read_csv(msrcsv, error_bad_lines=False)
        abs_df = pd.read_csv(leecsv, error_bad_lines=False)
        word_model = Word2VecKeyedVectors.load("MetaData/" + WORD_VEC_MODEL)
        new_words_list = []
        for index, row in tit_df.iterrows():
            for i in [row['Sentence1'], row['Sentence2']]:
                new_words_list.append(preprocess_string(remove_stopwords(i)))

        for index, row in abs_df.iterrows():
            for i in [row['Document1'], row['Document2']]:
                new_words_list.append(preprocess_string(remove_stopwords(i)))

        for i in titles:
            new_words_list.append(preprocess_string(remove_stopwords(i)))
        for i in abstracts:
            new_words_list.append(preprocess_string(remove_stopwords(i)))

        new_model = Word2Vec(new_words_list,
                             size=DIMENSIONS,
                             window=5,
                             min_count=1,
                             workers=4)
        word_vecs = []
        words = []
        for lis in new_words_list:
            for word in lis:
                words.append(word)
                word_vecs.append(new_model.wv[word])
        word_model.add(words, word_vecs, replace=False)
        word_model.save("MetaData/" + WORD_VEC_MODEL)
Example #30
    def train(path_corpus: str, fname: str, path_out_dir: str,
              term_ids: Set[int], doc_ids: Set[int]) -> str:
        """'Train ELMo embeddings. This means averaging for context.

        ******
        IMPORTANT: At the moment no averaging is done! So the input
        embeddings are just returned as output embeddings!
        ******

        Args:
            path_corpus: The path to the text file used for training.
            fname: The filename for the embedding file.
            path_out_dir: The path to the output directory.
            term_ids: The set of current term-ids.
            doc_ids: The set of doc-ids making up the current subcorpus.
        Return:
            The path to the embedding file.
        """
        raw_path = 'embeddings/{}.vec'.format(fname)
        path_out = os.path.join(path_out_dir, raw_path)

        # *** tmp lines ***
        tmp_path_in = os.path.join(path_out_dir,
                                   'embeddings/embs_token_ELMo_avg.pickle')
        averaged_embs = pickle.load(open(tmp_path_in, 'rb'))
        averaged_embs = {str(k): v for k, v in averaged_embs.items()}
        # *** tmp lines ***
        key = list(averaged_embs.keys())[0]
        vector_size = len(averaged_embs[key])
        m = Word2VecKeyedVectors(vector_size=vector_size)
        m.vocab = averaged_embs
        m.vectors = np.array(list(averaged_embs.values()))
        my_save_word2vec_format(binary=True,
                                fname=path_out,
                                total_vec=len(averaged_embs),
                                vocab=m.vocab,
                                vectors=m.vectors)
        return path_out
Example #31
    def learn(self, nx_g, mapping):
        g = RiWalkGraph.RiGraph(nx_g, self.args)

        walk_time, bfs_time, ri_time, walks_writing_time = g.process_random_walks(
        )

        print('walk_time', walk_time / self.args.workers, flush=True)
        print('bfs_time', bfs_time / self.args.workers, flush=True)
        print('ri_time', ri_time / self.args.workers, flush=True)
        print('walks_writing_time',
              walks_writing_time / self.args.workers,
              flush=True)

        wv = self.learn_embeddings()

        original_wv = Word2VecKeyedVectors(self.args.dimensions)
        original_nodes = list(mapping.keys())
        original_vecs = [
            wv.word_vec(str(mapping[node])) for node in original_nodes
        ]
        original_wv.add(entities=list(map(str, original_nodes)),
                        weights=original_vecs)
        return original_wv