Example No. 1
def get_latent(args):
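	# Average the word2vec vectors of each post's tokens into one 300-dim
	# embedding per post, then write the result to a CSV indexed by short_code.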
	print("Loading embedding model...")
	model_name = 'WORD2VEC_' + args.target_dataset + '.model'
	embedding_model = Word2VecKeyedVectors.load(os.path.join(CONFIG.EMBEDDING_PATH, model_name))
	print("Loading embedding model completed")
	
	full_data = []
	df_data = pd.read_csv(os.path.join(CONFIG.DATASET_PATH, args.target_dataset, 'posts.csv'), header=None, encoding='utf-8-sig')

	short_code_list = []
	row_list = []
	csv_name = 'text_word2vec_' + args.target_dataset + '.csv'
	pbar = tqdm(total=df_data.shape[0])

	for index, row in df_data.iterrows():
		pbar.update(1)
		short_code = row.iloc[0]
		short_code_list.append(short_code)
		text_data = row.iloc[1]
		#full_data.append([text_data, short_code])
		vector_list = []
		for word in text_data.split():
			vector_list.append(embedding_model.get_vector(word))
		vector = np.mean(vector_list, axis=0)
		row_list.append(vector)
		del text_data
	pbar.close()

	result_df = pd.DataFrame(data=row_list, index=short_code_list, columns=[i for i in range(300)])
	result_df.index.name = "short_code"
	result_df.sort_index(inplace=True)
	result_df.to_csv(os.path.join(CONFIG.CSV_PATH, csv_name), encoding='utf-8-sig')
	print("Finish!!!")
Example No. 2
def load_model(model_name, epoch):
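    # Pick the right gensim loader for the requested embedding type; every
    # branch returns a KeyedVectors-style object, and unknown model names
    # fall through to return None.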
    from gensim.models import KeyedVectors
    from gensim.models.keyedvectors import FastTextKeyedVectors, Word2VecKeyedVectors
    from gensim.models.fasttext import load_facebook_vectors, load_facebook_model
    from gensim.models.wrappers import FastText

    if epoch != '50+10':
        # if epoch choice is epoch 10 or 50 (no continued training with CSPC problem texts)
        if model_name.lower() == 'word2vec':
            return Word2VecKeyedVectors.load(f"trained_models/word2vec/idwiki.epoch-{epoch}.dim-300.kv")
        elif model_name.lower() == 'glove':
            return KeyedVectors.load_word2vec_format(
                f"trained_models/glove/converted.idwiki.epoch-{epoch}.dim-300.model.txt")
        elif model_name.lower() == 'fasttext':
            model = FastText.load_fasttext_format(
                f"trained_models/fasttext/idwiki.epoch-{epoch}.dim-300.bin")
            return model.wv
    else:
        # if epoch choice is 50+10, i.e. the 50 epoch word2vec model that's trained further with CSPC problem texts
        return Word2VecKeyedVectors.load(f"trained_models/word2vec/idwiki-cspc.epoch-50.dim-300.kv")

    return None
Example No. 3
def load_word_vectors(key_vecs_file, weights_file):
    """
    loads w2v keyvecs and lexicon into memory
    :param key_vecs_file: path to keyvecs file
    :param weights_file: path to lexicon w2v file
    :return: keyvecs, lexicon
    """
    logger.info("loading word2vec model...")

    wv = Word2VecKeyedVectors.load(key_vecs_file)
    weights = np.load(weights_file)

    return wv, weights
Example No. 4
 def __init__(self,
              threshold=0.5,
              word2vecpath="model/word_embedding/embedding.wv",
              datapath="news_ch_2_seg/7.json"):
     self.threshold = threshold
     self.stopword2tag = {'m', 'p', 'x', 'c', 'uj', 'd', 'f', 'r', 'ul'}
     self.stopword2tag.add('a')
     self.word2vec = Word2VecKeyedVectors.load(word2vecpath)
     with open(datapath, 'r') as load_f:
         self.data = json.load(load_f)
     self.content, self.title, self.label = [], [], []
     self.Xtrain = None
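     # Pipeline (by method name): load the records, drop stopword-tagged
     # tokens, then vectorize the documents with the loaded word2vec model.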
     self.init()
     self.de_stopword()
     self.vectorize()
Example No. 5
def model_to_csv(target_model):
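	# Dump every vocabulary word and its embedding as one CSV row: [word, dim_0, ..., dim_n].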
	model_name = 'WORD2VEC_' + target_model + '.model'
	model = Word2VecKeyedVectors.load(os.path.join(CONFIG.EMBEDDING_PATH,model_name))
	vocab_list = list(model.vocab)
	print("vocab length: ", len(vocab_list))

	# f_csv = open(DF_PATH+'Word2VecBlog300_5_min10_mecab.csv', 'w', encoding='utf-8-sig', newline='')
	print("started to write csv")
	csv_name = target_model + '.csv'
	f_csv = open(os.path.join(CONFIG.CSV_PATH, csv_name), 'w', encoding='utf-8-sig', newline='')
	wr = csv.writer(f_csv)

	for voca in vocab_list:
		wr.writerow([voca]+model[voca].tolist())

	f_csv.close()
	print("completed to write csv")
Example No. 6
def retrain():
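    # Collect training sentences from the MSR/Lee CSVs plus the project titles
    # and abstracts in the database, train a fresh Word2Vec model on them, and
    # merge the new word vectors into the saved KeyedVectors model.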
    with app.app_context():
        temp = Projects.query.with_entities(Projects.title).all()
        titles = [i[0] for i in temp]
        temp = Projects.query.with_entities(Projects.abstract).all()
        abstracts = [i[0] for i in temp]

        msrcsv = 'MetaData/' + 'MSRTrainData.csv'
        leecsv = 'MetaData/' + 'LeeDocSimTrain.csv'
        tit_df = pd.read_csv(msrcsv, error_bad_lines=False)
        abs_df = pd.read_csv(leecsv, error_bad_lines=False)
        word_model = Word2VecKeyedVectors.load("MetaData/" + WORD_VEC_MODEL)
        new_words_list = []
        for index, row in tit_df.iterrows():
            for i in [row['Sentence1'], row['Sentence2']]:
                new_words_list.append(preprocess_string(remove_stopwords(i)))

        for index, row in abs_df.iterrows():
            for i in [row['Document1'], row['Document2']]:
                new_words_list.append(preprocess_string(remove_stopwords(i)))

        for i in titles:
            new_words_list.append(preprocess_string(remove_stopwords(i)))
        for i in abstracts:
            new_words_list.append(preprocess_string(remove_stopwords(i)))

        new_model = Word2Vec(new_words_list,
                             size=DIMENSIONS,
                             window=5,
                             min_count=1,
                             workers=4)
        word_vecs = []
        words = []
        for lis in new_words_list:
            for word in lis:
                words.append(word)
                word_vecs.append(new_model.wv[word])
        word_model.add(words, word_vecs, replace=False)
        word_model.save("MetaData/" + WORD_VEC_MODEL)
Example No. 7
 def __init__(self, model: str = "glove", aggregation: str = "average"):
     """ Load pre-trained embeddings, either locally if model is a local file path
     or a Word2VecKeyedVector object, or downloaded from the gensim API if a string
     is provided.
     """
     if aggregation not in {"average", "sum", "minmax"}:
         raise ValueError(
             f"Unknown embeddings aggregation mode: {aggregation}, the available "
             "ones are: average, sum, or minmax.")
     if isinstance(model, str):
         model = model.lower()
         if model in DEFAULT_PRETRAINED_EMBEDDINGS.keys():
             model_gensim_name = DEFAULT_PRETRAINED_EMBEDDINGS[model]
             self.model = api.load(model_gensim_name)
         elif model in api.info()["models"].keys():
             self.model = api.load(model)  # pragma: no cover
         elif os.path.exists(model):
             logger.info("Loading local model")
             self.model = Word2VecKeyedVectors.load(model)
             if not isinstance(self.model, Word2VecKeyedVectors):
                 raise TypeError(
                     "The input model should be a Word2VecKeyedVectors object but "
                     f"it is a {type(self.model)} object.")
         else:
             raise KeyError(
                 f"Unknown pre-trained model name: {model}. Available models are"
                 + ", ".join(api.info()["models"].keys()))
         logger.info("Loaded model keyed vectors: " + model)
     elif isinstance(model, Word2VecKeyedVectors):
         self.model = model
         logger.info("Loaded model keyed vectors.")
     else:
         raise TypeError(
             "Input pre-trained model should be a string or a gensim "
             "Word2VecKeyedVectors object")
     self.aggregation = aggregation
     self.embedding_dimension = self.model.vector_size
     if self.aggregation == "minmax":
         self.embedding_dimension *= 2
Example No. 8
from backend.function_for_clean import tokenize_me
from gensim.models.keyedvectors import Word2VecKeyedVectors
from database.db import *
import pymorphy2

morph = pymorphy2.MorphAnalyzer()
model = Word2VecKeyedVectors.load("../ml/russian_database")


def word_type(word: str) -> str:
    """
    Takes a word as input and returns its part of speech.
    """
    return str(morph.parse(word)[0].tag).split(",")[0]


def phrase_to_vector_to_str(phrase: str):
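    # Sum the vectors of all in-vocabulary "word_POS" tokens. If no token is in
    # the vocabulary the sum is the int 0, .reshape raises AttributeError, and
    # the function falls back to returning None.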
    try:
        return sum(model[f"{word}_{word_type(word)}"]
                   if f"{word}_{word_type(word)}" in model.vocab else 0
                   for word in phrase.split()).reshape(1, -1).tostring()
    except AttributeError:
        return


if __name__ == '__main__':
    database = DB()
    cleaning_table = CleaningTable(database.get_connection())
    clean_table = CleanTable(database.get_connection())
    data = cleaning_table.get_all()
Example No. 9
def ml(phrase: str) -> list:
    """
    Takes a string of 'clean' data separated by spaces, analyses the query, and returns
    a list of the top 5 error numbers (from most similar to least similar).
    """

    morph = MorphAnalyzer()

    def word_type(word: str) -> str:
        """
        Takes a word as input and returns its part of speech.
        """
        return str(morph.parse(word)[0].tag).split(",")[0]

    def start_timer():
        """
        Starts a timer used to check how long a certain block of code takes to run
        """
        global start
        start = time()

    def end_timer(text: str):
        """
        Stops the timer used to check how long a certain block of code takes to run
        (prints the result to the console)
        """
        global start
        this_time = time() - start
        print("-" * 44)
        print(this_time, "sec")
        print(text)
        print("-" * 44)

    print("#" * 44)
    print(f"Entered data: {phrase.split()}")
    print("#" * 44)
    model = Word2VecKeyedVectors.load("../ml/russian_database")
    db = DB()
    clean_table = CleanTable(db.get_connection())
    data = clean_table.get_all()

    minn = []
    try:
        main_vector = sum([model[f"{word}_{word_type(word)}"] if f"{word}_{word_type(word)}" in model.vocab
                           else 0 for word in phrase.split()]).reshape(1, -1)
    except AttributeError:
        main_text = set(phrase.split())
        out = []
        start_timer()
        for stroke in data:
            if len(set(stroke[2].split()) & main_text) >= len(main_text) // 3 + 1:
                out.append(stroke[1])
        shuffle(out)
        out = out[:10]
        end_timer(f"ERROR: Cannot transform all words to vector -> shuffle -> out: {out}")
        return out

    start_timer()
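    # Rank every stored error by cosine similarity to the query vector, keeping
    # only the five best matches in `minn` (sorted ascending, so minn[0] is the
    # weakest of the current top 5).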
    for stroke in data:
        try:
            this_vector = [frombuffer(stroke[3], dtype=float32)]
            this = (cosine_similarity(this_vector, main_vector), stroke[1])
            if len(minn) < 5:
                minn.append(this)
                # keep the buffer sorted ascending so minn[0] is always the weakest of the top 5
                minn.sort(key=lambda a: a[0][0])
            elif minn[0][0][0] < this[0][0]:
                minn[0] = this
                minn = sorted(minn, key=lambda a: a[0][0])
        except AttributeError:
            pass
        except ValueError:
            pass
    out = [out_data[-1] for out_data in minn[::-1]]
    end_timer(f"Word -> vec -> top5: {out}")
    return out
Example No. 10
IMAGE_FOLDER = Path('./uploadedImages/')

# the name of the model for item similarity to download
# for more models: https://github.com/RaRe-Technologies/gensim-data
simModelName = 'glove-wiki-gigaword-50'
simModel = None

# Load in the proper gensim files
modelLocation = Path('./sim_model_encoding')

if modelLocation.exists():
    # if the file exists, load that file
    # there should be two files, 'sim_model_encoding' and the same but with an
    # extension of vectors.npy
    print(' * Loading model from local')
    simModel = word2vec.load(str(modelLocation))
else:
    # get the gensim model from online and save it for future use if
    # there is no file
    print(' * Loading model remotely')
    simModel = gens_api.load(simModelName)
    simModel.save(str(modelLocation))


@items_router.route("/api/")
def hello():
    return "This is the API for the Seekr App!"


@items_router.route('/api/items', methods=['GET'])
def get_all_items():
Example No. 11
import pickle
import numpy as np
from gensim.models.keyedvectors import Word2VecKeyedVectors

model = Word2VecKeyedVectors.load('model/wv.model')
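# Reserve index 0 for the '<p>' padding token: index.pickle maps position to
# word, index_inv.pickle maps word to position, and vec.pickle stores the
# matching embedding matrix with an all-zeros row for the padding token.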

index = model.index2entity
index_inv = {}
for i,word in enumerate(index):
    index_inv[word] = i+1
index_inv['<p>'] = 0
with open('data/index.pickle','wb') as f:
    pickle.dump(['<p>']+index,f)
with open('data/index_inv.pickle','wb') as f:
    pickle.dump(index_inv,f)
z = np.zeros(model[index[0]].shape, dtype='float32')
with open('data/vec.pickle','wb') as f:
    pickle.dump(np.array([z]+[model[word] for word in index]),f)
Example No. 12
def infer_private_companies(alpha=-1, topn=top):
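    # Infer vectors for this year's private companies (building and caching the
    # KeyedVectors vocab file on the first run), then write a peer file listing,
    # for each firm, its most similar rivals with similarity above `threshold`.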
    vocab_file = working_dir + properties[
        "evaluation.private_vocab_filename_prefix"] + str(current_year)
    peer_dir = working_dir + properties["training.private_peer_dir_name"]
    if not os.path.isdir(peer_dir):
        os.mkdir(peer_dir)

    peer_filepath = peer_dir + "/" + \
                    properties["training.private_peers_filename_prefix"] + str(current_year) + ".csv"

    report_filepath = os.path.join(
        working_dir, properties["evaluation.infer_report_filename_prefix"] +
        str(current_year) + ".txt")
    if not os.path.isfile(report_filepath):
        f = open(report_filepath, "w")
        f.close()

    report = open(report_filepath, "a")
    report.write("Model filepath: " + model_filepath + "\n")
    report.write("Model load time: " + str(model_load_time) + "\n")
    report.write("Result peer file: " + peer_filepath + "\n")
    report.close()

    if not os.path.isfile(vocab_file):
        vocab = Word2VecKeyedVectors(vector_size=model.vector_size)

        f = open(
            os.path.join(properties["private_companies_list_dir"],
                         str(current_year) + ".txt"), "r")
        companies_list = f.readlines()
        f.close()

        start = time.time()
        start_index = 0
        while start_index < len(companies_list):
            create_processes(list(companies_list[start_index:start_index +
                                                 16]))
            start_index += 16

        s = time.time()
        load_vocab(vocab)
        lt = time.time() - s

        print("Vocab size: " + str(len(vocab.index2entity)))

        good_time_counts = dict(time_count)
        report = open(report_filepath, "a")
        report.write("Time taken to generate bag of words: " +
                     str(good_time_counts["total_bow_time"]) + "\n")
        report.write("Time taken to infer vectors: " +
                     str(good_time_counts["total_infer_time"]) + "\n")
        report.write("Time taken to load vectors: " + str(lt) + "\n")
        report.write("Time taken to build vocab: " + str(time.time() - start) +
                     "\n")
        s = time.time()
        vocab.save(vocab_file)
        report.write("Time taken to save vocab file: " + str(time.time() - s) +
                     "\n")
        report.close()
        print("KeyedVectors stored succesfully.")
    else:
        start = time.time()
        s = time.time()
        vocab = Word2VecKeyedVectors.load(vocab_file, mmap="r")
        report = open(report_filepath, "a")
        report.write("Time taken to load vocab: " + str(time.time() - s) +
                     "\n")
        report.close()

    companies = write_companies(vocab)

    s = time.time()
    top = int(len(companies) * topn / 100)

    count = 0
    if os.path.isfile(peer_filepath):
        report = open(report_filepath, "a")
        report.write("Peer file already present.\n")
        report.close()
        return
    result = open(peer_filepath, "w")
    result.write("focal_firmid\trival_firmid\twtnic_score\n")
    for company in companies:
        if company not in firmIds:
            continue
        focal_key = firmIds[company][0]
        sims = vocab.most_similar(positive=[company], topn=top)
        for (rival_company, score) in sims:
            if rival_company not in firmIds or company == rival_company:
                continue
            if score < threshold:
                continue
            rival_key = firmIds[rival_company][0]
            result.write(
                str(focal_key) + "\t" + str(rival_key) + "\t" + str(score) +
                "\n")
        count += 1
        if count % 100 == 0:
            print("Similarity done for: ", count)

    result.close()
    report = open(report_filepath, "a")
    report.write("Peer file generation time: " + str(time.time() - s) + "\n")
    report.write("Time taken: " + str(time.time() - start) + "\n\n")
    report.close()