def load_embeddings(self, file_path):
    # Embeddings must be in fastText format, either .bin or .vec
    print('Loading embeddings...')
    if file_path.endswith('.bin'):
        from gensim.models.wrappers import FastText
        embeddings = FastText.load_fasttext_format(file_path)
    else:
        from gensim.models import KeyedVectors
        embeddings = KeyedVectors.load_word2vec_format(file_path)
    return embeddings
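# Note: gensim.models.wrappers was deprecated in gensim 3.x and removed in 4.0.
# A minimal sketch of the same loader against the newer API, assuming
# gensim >= 4.0 (load_facebook_vectors lives in gensim.models.fasttext):
def load_embeddings_gensim4(file_path):
    print('Loading embeddings...')
    if file_path.endswith('.bin'):
        # Facebook binary format; keeps subword (OOV) information
        from gensim.models.fasttext import load_facebook_vectors
        return load_facebook_vectors(file_path)
    # plain-text .vec format; full words only
    from gensim.models import KeyedVectors
    return KeyedVectors.load_word2vec_format(file_path)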
def from_gensim(self, fname, normalizeVectors=True, fvocab=None):
    import gensim
    from gensim.models.wrappers import FastText
    print("Loading embeddings file", fname, file=sys.stderr)
    if re.search(r'bin$', fname):
        # Facebook fastText binary. For a word2vec .bin, use instead:
        # gensim.models.KeyedVectors.load_word2vec_format(
        #     fname, binary=True, encoding='utf-8',
        #     unicode_errors='ignore', fvocab=fvocab)
        model = FastText.load_fasttext_format(fname)
    elif re.search(r'vec$', fname):
        model = gensim.models.KeyedVectors.load_word2vec_format(
            fname, binary=False, fvocab=fvocab)
    else:
        model = gensim.models.KeyedVectors.load(fname)
    if hasattr(model, "wv"):
        model = model.wv
    try:
        self.vecs = model.vectors
    except AttributeError:
        self.vecs = model.syn0
    self.id2label = list(model.index2word)
    for w in model.vocab:
        self.labels[w] = {
            "id": model.vocab[w].index,
            "freq": model.vocab[w].count
        }
    if normalizeVectors:
        self.vecs = normalize(self.vecs)
    self.build()
    return self
def Import(lang, path):
    Target_lang = lang
    word_list = pd.read_csv('wordlists/religion_Translations/' + lang + '.csv')
    ########## Import Word Lists ##################
    word_list = np.array(word_list)
    occupations = []
    Islam = []
    Christianity = []
    Terrorism = []
    LenOfIslam = 18  # length of the Islam word list
    LenOfChristianity = 15  # length of the Christianity word list
    LenOfNatural = 48  # length of the neutral (Terrorism) word list
    print(word_list[:, :])
    print(len(word_list[:, 1]))
    for i in range(0, LenOfIslam):
        Islam.append(word_list[i][0])
    print(Islam)
    for i in range(0, LenOfChristianity):
        Christianity.append(word_list[i][1])
    for i in range(0, LenOfNatural):
        Terrorism.append(word_list[i][2])
    print(Terrorism)
    print('This is the translated Terrorism word list for this language')
    print("Import Loading")
    model = FastText.load_fasttext_format(
        r"D:\Data\Word Embedding\Vectors/" + path, encoding='utf-8')
    # model = KeyedVectors.load_word2vec_format(
    #     r"D:\Data\Word Embedding\Vectors/" + path, encoding='utf-8')
    return model, Islam, Christianity, Terrorism
def export_to_file(path_to_model, output_file):
    output = codecs.open(output_file, 'w+', 'utf-8')
    print("Converting to text")
    model = FastText.load_fasttext_format(path_to_model)
    vocab = model.wv.vocab
    header = False
    for mid in tqdm.tqdm(vocab):
        if not header:
            # word2vec text header: "<vocab size> <vector dimension>"
            output.write(str(len(vocab)) + " " + str(len(model[mid])) + "\n")
            header = True
        vector = [str(dimension) for dimension in model[mid]]
        # line = {"mid": mid, "vector": vector}
        # line = json.dumps(line)
        line = mid + " " + " ".join(vector)
        output.write(line + "\n")
    output.close()
    print("Done!")
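# A quick round-trip check of the export above (a sketch; 'exported.vec' is
# an illustrative stand-in for output_file). The file should load back with
# KeyedVectors because the first line written is the standard word2vec
# "<count> <dim>" header.
from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format('exported.vec', binary=False)
print(kv.vector_size, len(kv.vocab))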
def get_fasttext_model(dataset="tweet", model_type="bin"):
    w2v_rootdir = os.path.join(res_basedir, "word2vecs")
    tweets_rootdir = os.path.join(resources_rootdir, "tweet_w2v", "tweet_fasttext")
    ds_rootdir = os.path.join(resources_rootdir, "ds_aa", "fasttext_embs")
    amazon_rootdir = os.path.join(resources_rootdir, "amazon", "fasttext_embs")
    # amazon_rootdir = os.path.join(resources_rootdir, "amazon", "fasttext_embs_50_eps")
    if dataset == "tweet":
        model_path = os.path.join(tweets_rootdir, "tweet_fasttext.{}".format(model_type))
    elif dataset == "ds":
        model_path = os.path.join(ds_rootdir, "ds_fasttext.{}".format(model_type))
    elif dataset == "amazon":
        model_path = os.path.join(amazon_rootdir, "amazon_fasttext.{}".format(model_type))
    elif dataset == "wiki":
        model_path = os.path.join(w2v_rootdir, "wiki.en/wiki.en.{}".format(model_type))
    elif dataset == "simple":
        model_path = os.path.join(w2v_rootdir, "wiki.simple/wiki.simple.{}".format(model_type))
    print("fasttext model: ", model_path)
    if model_type == "bin":
        model = FastText.load_fasttext_format(model_path)
    else:
        model = KeyedVectors.load_word2vec_format(model_path, binary=False)
    return model
def __init__(self, Model, params, predict=False):
    self.Model = Model
    self.params = params
    if predict:
        print("Loading Vectors")
        self.vec_model = FastText.load_fasttext_format(
            'vectors/cc.en.300.bin/cc.en.300.bin').wv
        print("Finished Loading Vectors")
        # self.vec_model = []
        print("loading model")
        self.test_val = tf.placeholder(tf.float32,
                                       shape=(None, max_word_count, 300))
        self.test_output = self.Model.build_model(self.test_val, self.params,
                                                  predict=True)
        self.sess = tf.InteractiveSession()
        self.saver = tf.train.Saver()
        # self.sess.run(tf.global_variables_initializer())
        # self.sess.run(tf.local_variables_initializer())
        self.saver.restore(
            self.sess,
            tf.train.latest_checkpoint(f"tensorboard_{self.Model.type}"))
        print("finished Loading model")
def load_ft():
    w2v_model = FastText.load_fasttext_format('../embedding/cc.zh.300.bin')
    print("Finish Load")
    dim = len(w2v_model['好'])
    fw1 = codecs.open("../embedding/embedding_all_ftoov_%d.txt" % dim,
                      'w', encoding='utf-8')
    vocab_dict = pickle.load(open('../data/vocabulary.pkl', 'rb'))
    word_list = ['unk' for i in range(len(vocab_dict))]
    for k, v in vocab_dict.items():
        word_list[v] = k
    embedding_matrix = np.zeros((len(vocab_dict), dim))
    miss = 0
    for index, w in enumerate(word_list):
        if index % 1000 == 0:
            print(index)
        try:
            embeds = np.asarray(w2v_model[w])
        except KeyError:
            # none of the word's character n-grams was seen in training
            miss += 1
            print(w)
            embeds = np.random.uniform(-0.25, 0.25, dim)
        embedding_matrix[index] = embeds
    fw1.write(str(len(word_list)) + ' ' + str(dim) + '\n')
    for index, w in enumerate(word_list):
        fw1.write(w)
        for i in embedding_matrix[index]:
            fw1.write(' ' + str(i))
        fw1.write('\n')
    pickle.dump(vocab_dict, open('../data/vocabulary2.pkl', 'wb'))
    print(len(word_list))
    print("miss:%d" % miss)
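# Because the model above is loaded from a Facebook .bin, it keeps subword
# information: the KeyError in the loop fires only when none of a word's
# character n-grams was ever seen. A minimal helper making that fallback
# explicit (a sketch; the function name and defaults are illustrative):
import numpy as np

def vector_or_random(model, word, dim=300, scale=0.25):
    """Return the fastText vector, composed from subwords if needed;
    fall back to a random vector only for words with no known n-grams."""
    try:
        return np.asarray(model[word])
    except KeyError:
        return np.random.uniform(-scale, scale, dim)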
def main():
    print("read word embeddings")
    word2vec = FastText.load_fasttext_format("data/embeddings/wiki.vi.bin").wv
    print("read relations")
    vectors = []
    metadata = []
    with open("data/RelationNormalize.txt", "r", encoding="utf8") as f:
        for line in f:
            w = line.strip().lower()
            if w in word2vec and w not in metadata:
                metadata.append(w)
                vec = word2vec.word_vec(w)
                vec = list(map(str, vec))
                vectors.append("\t".join(vec))
    del word2vec
    print("saving data")
    with open("data/vectors.txt", "w", encoding="utf8") as f:
        f.write("\n".join(vectors))
    del vectors
    with open("data/metadata.txt", "w", encoding="utf8") as f:
        f.write("\n".join(metadata))
    del metadata
#     loss = criterion(output, true_scores)
#     loss.backward()
#     torch.nn.utils.clip_grad_norm_(model.parameters(), 1000)
#     optimizer.step()
#     return output, loss.item()
#
#
# def calculate_true_score(sentence_and_tags, true_tags):
#     tags = sentence_and_tags[1]
#     length = len(tags)
#     assert length == len(true_tags)
#     count = 0
#     for i in range(length):
#         if tags[i] == true_tags[i]:
#             count += 1
#     return count / length


if __name__ == '__main__':
    language_model = FastText.load_fasttext_format(PATH_TO_MODEL)
    x, y, tag_set = pr.load_train_test_validation_sets(PATH_TO_DATA)
    train_set = pr.WordsDataset(x, tag_set)
    test_set = pr.WordsDataset(y, tag_set)
    item = train_set[16][0]["words"]
    print(item)
    my_little_tensor = create_tensor(item, None, language_model)
    print(my_little_tensor.shape)
    print("-----------------")
    model = train_model(train_set, language_model)
import itertools
import random
import re
import csv

import nltk
from nltk.tag.stanford import StanfordNERTagger, StanfordPOSTagger
from nltk.corpus import wordnet, stopwords
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
from elasticsearch import Elasticsearch
from pymongo import MongoClient
from gensim.models.wrappers import FastText

from app.modules import filter_entities

model = FastText.load_fasttext_format(
    '/Users/sepidehmesbah/Downloads/fastText/modelFT')

client = MongoClient('localhost:4321')
db = client.pub

# english_postagger = StanfordPOSTagger(
#     '/Users/sepidehmesbah/Downloads/stanford-postagger-full-2016-10-31/models/english-bidirectional-distsim.tagger',
#     '/Users/sepidehmesbah/Downloads/stanford-postagger-full-2016-10-31/stanford-postagger.jar')
# print(english_postagger.tag('Figures 3 (a) and (b) show the precision-recall curves for the three datasets: MovieLens, NewsSmall and NewsBig.3'.split()))

dsnames = []
# corpuspath = "/Users/sepidehmesbah/SmartPub/DataProfiling/dataset-names.txt"
# with open(corpuspath, "r") as file:
#     for row in file.readlines():
#         dsnames.append(row.strip())
###############################
class_weighted = False
OUTPUT_DIR = '../Output/'
category2index = pickle.load(
    open(os.path.join(OUTPUT_DIR + 'category2index_%s.dict' % DATASET), 'rb'))
categories = [''] * len(category2index)
for cate, i in category2index.items():
    categories[i] = cate
print(categories)

# Creating the model
print("Loading the FastText Model")
# en_model = {"test": np.array([0] * 300)}
en_model = FastText.load_fasttext_format('../FastText/wiki.en/wiki.en')


class PretrainFastTextClassifier:
    def __init__(self):
        '''load data'''
        self.domains_train = pickle.load(
            open(OUTPUT_DIR + 'training_domains_%s.list' % DATASET, 'rb'))
        self.domains_train = [
            d for cat_domains in self.domains_train for d in cat_domains
        ]
        self.domains_val = pickle.load(
            open(OUTPUT_DIR + 'validation_domains_%s.list' % DATASET, 'rb'))
        self.domains_val = [
            d for cat_domains in self.domains_val for d in cat_domains
        ]
from gensim.models.wrappers import FastText

model = FastText.load_fasttext_format('../data/raw/BIN/wiki.simple')

# Give the embedding of a given word
print(model.wv['brain'])
# Test if a word is in the model
print('brain' in model.wv.vocab)
# Give the most similar words
print(model.most_similar('brain'))
# Compute similarity between two words
print(model.similarity('brain', 'synapse'))
# Compute cosine similarity between two groups of words
print(model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant']))
# Do arithmetic with words
print(model.wv.most_similar(positive=['king', 'woman'], negative=['man']))
from gensim.models.fasttext import FastText, load_facebook_vectors
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
from numpy.random import seed

seed(1)

#%% fast load
wxTrain1, wxTest1, wyTrain1, wyTest1, sentences_corpus, keywords_dictionary, labels = \
    dsurdu.Generate_Urdu_Ngrams(_ngram_range=(1, 1), _max_features=3000, words=True)

w2v_file_fast_text = r"C:\FasText\cc.ur.300.bin.gz"
w2vmodel = FastText.load_fasttext_format(w2v_file_fast_text)
print("Word 2 Vector File Loaded!")
vector = w2vmodel.wv['easy']
print("Shape of Vector:" + str(vector.shape))

#%% fast vector generation
vectors_per_document = 5
X_train_Vector = []
for kl in keywords_dictionary:
    vector_list = []
    for word in kl[0:vectors_per_document]:
        if word in w2vmodel.wv.vocab:
            vector_list.append(w2vmodel[word])
        else:
            vector_list.append(np.random.uniform(-0.1, 0.1, 300))
def __init__(self):
    # add data imports
    self.data_df = pd.read_csv("data.csv")
    self.data_path = "./complete.json.gz"
    self.data_split = data_split
    self.stop_words = [
        'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there',
        'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they',
        'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such',
        'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am',
        'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves',
        'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through',
        'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should',
        'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had',
        'she', 'all', 'when', 'at', 'any', 'before', 'them', 'same', 'and',
        'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then',
        'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'now',
        'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too',
        'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom',
        'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it',
        'further', 'was', 'here',
    ]
    print("Loading Vectors")
    self.vec_model = FastText.load_fasttext_format('vectors/cc.en.300.bin').wv
    # self.vec_model = {}
    print("Completed Loading Vectors")
    self.data_df["label"] = self.data_df["label"].astype(int)
    self.data_df["text"] = self.data_df["text"].astype(str)
    self.data_df["text"] = self.data_df["text"].str.lower()
    self.data_df["text"] = self.data_df["text"].str.strip(to_strip=".!?,")
    self.data_df["text"] = self.data_df["text"].str.split()
    self.data_df["text"] = self.data_df["text"].apply(
        lambda x: [w for w in x if w not in self.stop_words])
def calculate_similarity(source_doc, target_doc, embedding="Glove", threshold=0):
    """Calculates & returns the similarity score between the given source document & target documents."""

    def w2v_vectorize(doc):
        """Identify the vector values for each word in the given document"""
        doc = [i.lower().split() for i in doc]
        word_list = []
        for w in doc:
            w = [word for word in w if word not in stopwords.words('english')]
            word_list.append(w)
        vec_list = []
        for words in word_list:
            word_vecs = []
            for word in words:
                try:
                    vec = w2v_model[word]
                    word_vecs.append(vec)
                except KeyError:
                    # Ignore words missing from the vocabulary
                    pass
            vector = np.mean(word_vecs, axis=0)
            vec_list.append(vector)
        vectors = np.mean(vec_list, axis=0)
        return vectors

    def glove_vectorize(doc):
        """Identify the vector values for each word in the given document"""
        doc = [i.lower().split() for i in doc]
        word_list = []
        for w in doc:
            w = [word for word in w if word not in stopwords.words('english')]
            word_list.append(w)
        vec_list = []
        for words in word_list:
            word_vecs = []
            for word in words:
                try:
                    # glove_model is assumed to be loaded at module level
                    vec = glove_model[word]
                    word_vecs.append(vec)
                except KeyError:
                    pass
            vector = np.mean(word_vecs, axis=0)
            vec_list.append(vector)
        vectors = np.mean(vec_list, axis=0)
        return vectors

    def fasttext_vectorize(doc):
        """Identify the vector values for each word in the given document"""
        doc = " ".join(doc).lower()
        words = doc.split(" ")
        word_vecs = []
        for word in words:
            try:
                vec = fasttext_model[word]
                word_vecs.append(vec)
            except KeyError:
                # Ignore, if the word doesn't exist in the vocabulary
                pass
        vector = np.mean(word_vecs, axis=0)
        return vector

    def cosine_sim(vecA, vecB):
        """Find the cosine similarity between two vectors."""
        csim = np.dot(vecA, vecB) / (np.linalg.norm(vecA) * np.linalg.norm(vecB))
        if np.isnan(np.sum(csim)):
            return 0
        return csim

    if embedding == "Word2Vec":
        w2v_model = KeyedVectors.load_word2vec_format(
            'Semantic_Similarity/Word_Embedding/data/GoogleNews-vectors-negative300.bin',
            binary=True,
        )
        source_vec = w2v_vectorize(source_doc)
        target_vec = w2v_vectorize(target_doc)
        sim_score = cosine_sim(source_vec, target_vec)
        if sim_score > threshold:
            return sim_score
    elif embedding == "Glove":
        source_vec = glove_vectorize(source_doc)
        target_vec = glove_vectorize(target_doc)
        sim_score = cosine_sim(source_vec, target_vec)
        if sim_score > threshold:
            return sim_score
    elif embedding == "FastText":
        fasttext_model = FastText.load_fasttext_format(
            'Semantic_Similarity/Word_Embedding/data/cc.en.300.bin')
        source_vec = fasttext_vectorize(source_doc)
        target_vec = fasttext_vectorize(target_doc)
        sim_score = cosine_sim(source_vec, target_vec)
        if sim_score > threshold:
            return sim_score
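# A hypothetical call to calculate_similarity (documents are passed as lists
# of strings; the embedding files referenced above must be in place). Note
# the implicit design choice: None is returned whenever the score does not
# clear the threshold.
score = calculate_similarity(
    ["fastText composes word vectors from character n-grams"],
    ["subword embeddings can handle rare and misspelled words"],
    embedding="FastText",
)
print(score)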
import sys

from gensim.models.wrappers import FastText
from gensim.scripts.glove2word2vec import glove2word2vec

model = FastText.load_fasttext_format(sys.argv[1])
with open(sys.argv[2], 'r', encoding='utf-8') as fin:
    with open(sys.argv[3], 'w', encoding='utf-8') as fout:
        for line in fin:
            word = line.strip()
            if word in model:
                embs = list(model[word])
            else:
                embs = [1e-8 for _ in range(model.vector_size)]
            embs = [str(item) for item in embs]
            fout.write(' '.join([word] + embs) + '\n')

if len(sys.argv) > 4:
    # The file written above has no header line (GloVe-style);
    # glove2word2vec prepends the word2vec "<count> <dim>" header.
    glove2word2vec(sys.argv[3], sys.argv[4])
# for item in iterator:
#     title = item.find('TITLE')
#     Title_text = title.text
#     print(Title_text)
#     body = item.find('BODY')
#     body_text = body.text
#     print(body_text)

f_model_file = 'C:/Users/ffayaza/Documents/MscProEmb/WordEmberding/Model/wiki/wiki.ta.vec'
f_model_bin = 'C:/Users/ffayaza/Documents/MscProEmb/WordEmberding/Model/wiki/wiki.ta.bin'

ftext = KeyedVectors.load_word2vec_format(f_model_file)
# print(ftext.wv.vocab)
# print(ftext.wv['தலைமையில்'])

model = FastText.load_fasttext_format(f_model_bin, encoding='utf8')
# print(model.wv.vocab)
# print(model.wv['தலைமையில்'])


def avg_sentence_vector(words, model, num_features, index2word_set):
    """Average all word vectors in a given paragraph."""
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        if word in index2word_set:
            print(word)
            nwords = nwords + 1
            featureVec = np.add(featureVec, model[word])
import re

from gensim.models.wrappers import FastText
from gensim.models.phrases import Phraser
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from sqlalchemy import create_engine

# Word score and stem mapping
db_connect = create_engine('sqlite:///stem_map_20170802.db')

# Word embedding model
model = FastText.load_fasttext_format('language_model_20170802')

# Bigrams phraser
bigram = Phraser.load('bigram_phraser_20170801.model')


def get_stem_from_word(word):
    """Get stem from word"""
    p_stemmer = PorterStemmer()
    return '_'.join([p_stemmer.stem(w) for w in word.split('_')])


def get_most_similar(word, topn=10, gender=None, pos=None):
    """
    Get most similar words by gender or part of speech

    Returns a list of dictionaries with:
    - 'word': a list of tuples containing word expansions from the recommended stem,
        except:
            pass
    return tmp / len(text)


def ensText(text, model):
    text = text.split()
    tmp = []
    for i in text:
        try:
            tmp += [model.wv[i]]
        except KeyError:
            pass
    return tmp


def sim(a, b):
    return 1 - spatial.distance.cosine(a, b)


def meanSim(text, title):
    s = 0
    for v in text:
        s += sim(v, title)
    return s / (1 + len(text))


from gensim.models.wrappers import FastText

model = FastText.load_fasttext_format('/home/celvaigh/these/divers/wiki.fr/wiki.fr.bin')
fr = "wiki.fr.bin"
en = "wiki.en.bin"
# model = KeyedVectors.load_word2vec_format('/home/celvaigh/these/divers/wiki.fr/wiki.fr.bin', binary=True)


def computeCorpusSims(name, lg):
    if lg == "fr":
        model = FastText.load_fasttext_format(fr)
    else:
        model = FastText.load_fasttext_format(en)
    data = pd.read_csv(name, sep='\t')
    texts = data["text"]
    titles = data["title"]
    size = len(texts)
    sims = []
    for i in range(size):
        sims += [meanSim(ensText(texts[i], model), MWV(titles[i], model))]
    sims.sort()
    return sims
    clean_text = re.sub(r'[^\w\s]', '', clean_text)
    return clean_text


## Loading data
train = pd.read_csv(DATA_PATH + 'train-expanded_NoLing.csv')
validation = pd.read_csv(DATA_PATH + 'dev_expanded_NoLing.csv')
answer_texts_train = pd.read_csv(DATA_PATH + 'answer_texts_train_NoLing.csv')
answer_texts_validation = pd.read_csv(DATA_PATH + 'answer_texts_dev_NoLing.csv')
test = pd.read_csv(DATA_PATH + 'test_NoLabels_NoLing.csv')
answer_texts_test = pd.read_csv(DATA_PATH + 'answer_texts_test_NoLing.csv')
translations = pickle.load(
    open(EXTRA_DATA_PATH + 'arabic_english_translations.p', 'rb'))
embeddings_index = FastText.load_fasttext_format(EMBEDDING_PATH + 'cc.en.300.bin')

## Preparing data
train['pool'] = train['pool'].apply(literal_eval)
train['answer_ids'] = train['answer_ids'].apply(literal_eval)
validation['pool'] = validation['pool'].apply(literal_eval)
validation['answer_ids'] = validation['answer_ids'].apply(literal_eval)
answer_texts_test.set_index('answer_id', drop=False, inplace=True)
test['candidates'] = test['candidates'].apply(literal_eval)
test.set_index('Unnamed: 0', drop=True, inplace=True)
for idx, row in test.iterrows():
    test.set_value(idx, 'candidates',
                   sorted([int(x) for x in set(row['candidates'])]))
answer_texts_test['clean_answer'] = answer_texts_test['answer'].apply(clean_text)
import string

import gensim
import pandas as pd
from gensim.models.wrappers import FastText
from scipy.spatial.distance import euclidean
from openpyxl import load_workbook

from vacancies import vacancy_dict

model = FastText.load_fasttext_format('data.bin')
wb = load_workbook('candidates.xlsx')


def clean_str(s):
    for c in string.punctuation:
        s = s.replace(c, "")
    return s


def get_similarity_euql(model, first_sentence, second_sentence):
    similarity = 0
    first_sentence = [
        i for i in clean_str(first_sentence).split() if i in model
    ]
    second_sentence = [
        i for i in clean_str(second_sentence).split() if i in model
    ]
    for i in first_sentence:
        first_vector = model[i]
        sim_i = 0
            new_question_tokens = []
            for token in question_doc:
                if isPunctuation(token) or isStopWord(token) or \
                        isInContext(token, context_token_lemmas):
                    new_question_tokens.append(token.text)
            new_question = TreebankWordDetokenizer().detokenize(new_question_tokens)
            if new_question == "":
                paragraph['qas'].pop(i)
            else:
                paragraph['qas'][i]['question'] = new_question
                i += 1
        num_topics += 1
        print("{}/{} topics done".format(num_topics, datalen))

from gensim.models.wrappers import FastText

model = FastText.load_fasttext_format('wiki.simple')
print("Model Loaded...")
nlp = spacy.load("en_core_web_sm")


def getMostSimContextWord(token, context_tokens):
    highest_sim = float('-Inf')
    most_sim_word = None
    for ct in context_tokens:
        # if isPunctuation(ct) or isStopWord(ct):
        #     continue
        if ct.lower() in model.wv.vocab:
            curr_sim = model.similarity(token.text.lower(), ct.lower())
            if curr_sim >= highest_sim:
                highest_sim = curr_sim
                most_sim_word = ct
from gensim.models.wrappers import FastText

model = FastText.load_fasttext_format('wiki.ko.bin')
print("Finish creating model")

print(model.most_similar('전자'))
print(model.similarity('전자', '전기'))
from pymongo import MongoClient
from nltk.corpus import wordnet, stopwords
from nltk import ne_chunk, pos_tag, word_tokenize
from bson.objectid import ObjectId
from gensim.models.wrappers import FastText

from app.modules import normalized_pub_distance

model = FastText.load_fasttext_format('/data/modelFT')

dsnames = []
mtnames = []

datasetspath = '/data/dataset_names.txt'
with open(datasetspath, "r") as file:
    for row in file.readlines():
        dsnames.append(row.strip())

methodpath = '/data/method_names.txt'
with open(methodpath, "r") as file:
    for row in file.readlines():
        mtnames.append(row.strip())


def is_int_or_float(s):
    '''Return 1 for int, 2 for float, -1 for not a number.'''
    try:
        float(s)
        return 1 if s.count('.') == 0 else 2
    except ValueError:
        return -1
import fasttext
import fasttext.util
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.wrappers import FastText

ft = fasttext.load_model('fasttext_cc/cc.tr.300.bin')
print("300 bin loaded")
fasttext.util.reduce_model(ft, 100)
print("300 bin reduced")
ft.save_model('fasttext_cc/cc.tr.100.bin')
print("100 bin saved")

model = FastText.load_fasttext_format('fasttext_cc/cc.tr.100.bin')
print("100 bin read")
model.wv.save_word2vec_format('fasttext_cc/cc.tr.100.vec', binary=False)
print("100 text saved")
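# Caveat for the conversion above: the .vec text export keeps only full-word
# vectors, so fastText's subword (OOV) lookup is lost. A minimal reload
# sketch, assuming the paths written above:
from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format('fasttext_cc/cc.tr.100.vec', binary=False)
print(kv.vector_size)  # 100; OOV words now raise KeyError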
def load_fasttext(path=None):
    word2vecmodel = FastText.load_fasttext_format(path)
    return word2vecmodel
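# A hypothetical usage of the helper above (the path is illustrative):
model = load_fasttext('embeddings/cc.en.300.bin')
print(model.wv['example'][:5])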
def original_ratings():
    """Return the automatic specificity scores along with the human ratings."""
    data = loadmat('./original_data/specificity_automated.mat')
    automatic = data['specificity_automated'][0]
    data = loadmat('./original_data/specificity_scores_MEM5S.mat')
    ratings = data['scores']
    ratings = [nanmean([nanmean(row) for row in image]) for image in ratings]
    return automatic, ratings


def load_images():
    image_data = loadmat('./original_data/memorability_888_img_5_sent.mat')
    images = [[s[0] for s in group]
              for group in image_data['memorability_sentences']]
    return images


print("Loading vectors.")
model = FastText.load_fasttext_format(VECTOR_LOCATION)
print("Loaded.")

automatic, ratings = original_ratings()
images = load_images()
vectorizer, analyzer = analyze_corpus(images)

scores = []
for i, image in enumerate(images):
    if i % 10 == 0:
        print(i)
    score = image_specificity(image, vectorizer, analyzer, model)
    scores.append(score)

result = spearmanr(automatic, ratings)
"""FastTextを利用して文章を変換する""" import pickle import numpy as np from gensim.models.wrappers import FastText with open('../raw_data/remap.pkl', 'rb') as f: pickle.load(f) pickle.load(f) pickle.load(f) pickle.load(f) pickle.load(f) texts = pickle.load(f) fasttext = FastText.load_fasttext_format('../raw_data/cc.en.300.bin') def sec2vec(sentence): global fasttext # 文を単語に分ける words = sentence.split() # 存在する単語のみ利用 words = [fasttext[word] for word in words if word in fasttext] # 文のベクトルを平均で算出 if len(words) == 0: return np.zeros((300, ), dtype=np.float32) return np.mean(words, axis=0) # レビュー文をベクトル化 r = np.ndarray((len(texts), 300), dtype=np.float32)
img_embedding = 300
nr_input_lines = 5000  # used to determine steps_per_epoch (nr batches)
photos_per_batch = 5  # each batch holds photos_per_batch photos (each photo has 20-30 text samples)
lstm_cell = 128
examples_train = 0

# In[3]:

filepath = "dataset_txt/Flickr8k.lemma.token.txt"
fast_text = "fastText_eng/wiki.simple"
dir_imgs = "dataset_img"
token_start = 'sstart'
token_end = 'eend'
special_token = 'xx'

# load fastText - takes a long time
model_embeddings = FastTextWrapper.load_fasttext_format(fast_text)


# some initial info about the whole dataset
def get_initial_info_data(filepath):
    global examples_train
    token_set = set()
    with open(filepath, 'r') as f:
        for line in f:
            # print(line.split(' ')[0].split('#')[0])
            txt = " ".join(line.split(' ')[1:])
            tokens = nltk.word_tokenize(txt)
            examples_train += len(tokens) + 1
            for token in tokens:
                token_set.add(token.lower())
print('x_train shape:', X_tr.shape)
print('x_test shape:', X_te.shape)
print("================")
print(X_tr)
print("================")
print(X_te)
print("================")

from gensim.models.wrappers import FastText

print("start loading wiki.en ...")
model = FastText.load_fasttext_format('wiki.en')

nb_words = min(max_features, len(tokenizer.word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in tokenizer.word_index.items():
    if i >= nb_words:
        continue
    if word in model.wv:
        embedding_matrix[i] = model[word]
print('Null word embeddings: %d' %
      np.sum(np.sum(embedding_matrix, axis=1) == 0))

import tensorflow as tf

gpu_options = tf.GPUOptions(allow_growth=True)
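# For context, a matrix like embedding_matrix above is typically handed to a
# frozen Keras Embedding layer. A minimal sketch, assuming the Keras 2.x API
# that matches the surrounding TF1 code (nb_words, embed_size and
# embedding_matrix come from the block above):
from keras.layers import Embedding

embedding_layer = Embedding(input_dim=nb_words,
                            output_dim=embed_size,
                            weights=[embedding_matrix],
                            trainable=False)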
from preprocessing import *
from gensim.models.wrappers import FastText
import pandas as pd
import numpy as np

NUMBER_DOC = 4124

print("starting loading fasttext")
model = FastText.load_fasttext_format('cc.en.300.bin')
print("fasttext loaded")


def building_idf(path, number_doc):
    data = pd.read_csv(path, sep=" ")
    data.columns = ['frequency', 'word', 'pos', 'num_doc_occ']
    data['idf'] = data.apply(
        lambda element: np.log(number_doc / element['num_doc_occ']), axis=1)
    return data


def sim(element_1, element_2):
    return model.similarity(element_1, element_2)


def simple_similarity_calcul(sentence_1, sentence_2, data):
    # No filter
    cleared_sentence_1_2 = formatting_sentences([sentence_1, sentence_2])
    cleared_sentence_1, cleared_sentence_2 = \
        cleared_sentence_1_2[0], cleared_sentence_1_2[1]