def sisters(path):
    """Embed the 'line' column of a '|'-separated CSV with fastText.

    :param path: path to a file readable by ``pd.read_csv`` with
        ``sep='|'``, an index column, and a 'line' column of sentences.
    :return: DataFrame with one numeric column per embedding dimension
        and one row per input sentence.
    """
    embedder = sister.MeanEmbedding(lang="en")
    frame = pd.read_csv(path, sep='|', index_col=0)
    # Series of numpy vectors -> DataFrame (one column per dimension).
    vectors = frame['line'].apply(embedder)
    return pd.DataFrame.from_records(vectors)
def convert_to_vector_test(data):
    """Embed every sentence in a nested iterable of sentence groups.

    ``data`` is an iterable of sentence sequences; the returned list
    mirrors that structure with each sentence replaced by its fastText
    mean embedding.
    """
    embedder = sister.MeanEmbedding(lang="en")
    return [[embedder(sentence) for sentence in group] for group in data]
def convert_to_vector_representation2(data):
    """Embed labelled sentence groups, passing labels through unchanged.

    ``data`` is an iterable of ``(sentences, y)`` pairs; each sentence is
    replaced by its fastText mean embedding and ``y`` is kept as-is.
    """
    embedder = sister.MeanEmbedding(lang="en")
    return [
        ([embedder(sentence) for sentence in sentences], y)
        for sentences, y in data
    ]
def embed():
    """Embed parsed transcript lines and persist the joined frame.

    Loads the "0_parsed" frame via ``fm``, maps each ("parsed", "line")
    sentence to its fastText mean embedding, joins the per-dimension
    columns under a top-level "embedded" key, writes the result as
    "1_embedded_fasttext" and returns it.
    """
    print("Embedding transcripts")
    frame = fm.get_df("0_parsed")
    embedder = sister.MeanEmbedding(lang="en")
    vectors = frame["parsed"]["line"].apply(embedder)
    expanded = pd.DataFrame.from_records(vectors, index=vectors.index)
    # Wrapping in a dict makes pd.concat add the "embedded" column level.
    joined = frame.join(pd.concat({"embedded": expanded}, axis=1))
    fm.write_df(joined, "1_embedded_fasttext")
    return joined
def sister_embeddings(x, *args):
    """Embed ``x`` using the aggregating strategy named in ``args[0]``.

    Only the 'mean' strategy is supported; any other value raises
    ``KeyError`` (original author's message preserved).
    """
    import sister

    strategy = args[0]
    if strategy == 'mean':
        return sister.MeanEmbedding(lang="en")(x)
    raise KeyError("Insufficient vespine gas")
def fasttext_embed(token):
    """Return the cached fastText mean embedding for a single token.

    Lazily initialises the module-level ``embedder`` on first use and
    memoises every result in the module-level ``memoization`` dict so
    each distinct token is embedded at most once.

    :param token: token string; the empty string is embedded as 'unk'.
    :return: the embedding vector produced by the shared embedder.
    """
    global embedder
    if embedder is None:
        embedder = sister.MeanEmbedding(lang="en")
    if token == '':
        token = 'unk'  # map the empty token to a placeholder word
    try:
        # EAFP: one dict lookup on the hot (already-cached) path instead
        # of the original membership test + fetch + store + re-fetch.
        return memoization[token]
    except KeyError:
        vector = embedder(token)
        memoization[token] = vector
        return vector
def review_embedding(df):
    """Embed each movie review into a 100-dimensional vector.

    Loads a Korean fastText model from ``cc.ko.100.bin`` and returns a
    DataFrame with one embedding row per entry in ``df['리뷰']``
    (the review column), ready to be merged back onto ``df``.
    """
    embedder = sister.MeanEmbedding(lang="file", fasttextfile="cc.ko.100.bin")
    reviews = df['리뷰'].tolist()
    vectors = [embedder(review) for review in reviews]
    # One row per review, one column per embedding dimension.
    return pd.DataFrame(vectors)
def __init__(self):
    """Load the trained model, fastText embedder and prediction matcher.

    When no trained model file exists, hands off to
    ``PrepForTrain.Prep()`` to prepare training; otherwise loads the
    Keras model, the embedder and the pickled prediction matcher.
    """
    self.prediction_model = 'voiceCon_NET.hdf5'  # trained Keras model file
    self.prediction_matches = 'prediction_matches.pickle'  # pickled matcher
    self.matcher = None
    if not os.path.exists(self.prediction_model):  #if no trained model exists
        print('Trained model not found... Preparing to train new model')
        PrepForTrain.Prep()
    else:
        print('Trained model found')
        self.embedder = sister.MeanEmbedding(lang='en')
        self.NET = load_model(self.prediction_model)
        print('Looking for Prediction Matcher...')
        # NOTE(review): pickle.load on a local file — safe only if the
        # file is trusted; also assumes the matcher file exists whenever
        # the model file does — confirm with the training pipeline.
        with open(self.prediction_matches, 'rb') as P:
            self.matcher = pickle.load(P)
        print('Prediction Matcher Found!!')
def fasttext(instances, lang_code='en'):
    """Embed the left and right contexts of each instance with fastText.

    :param instances: list of triples ``([left context], target word,
        [right context])`` where each context is a sequence of token
        strings (the target word itself is ignored).
    :param lang_code: language code for the fastText model, e.g. 'en'.
    :return: a pair ``(left_feats, right_feats)`` — lists of 1-D
        300-dim arrays: the mean embedding of each context, or a zero
        vector when the context is empty.  (The original docstring
        claimed 2-D arrays; the function has always returned lists.)
    """
    embedder = sister.MeanEmbedding(lang=lang_code)

    def _context_vector(context):
        # Truthiness instead of `== []` so tuple contexts work too.
        # A fresh zero array per empty context keeps entries independent.
        if not context:
            return np.zeros(300)
        return embedder(' '.join(context))

    left_feats = []
    right_feats = []
    for left_context, _, right_context in instances:
        left_feats.append(_context_vector(left_context))
        right_feats.append(_context_vector(right_context))
    return left_feats, right_feats
def fasttexttagger(df_new):
    """Attach fastText features of each paraphrased question to ``df_new``.

    For every row, embeds the "paraphrased_question" text and writes the
    300 vector components into string-named columns '0'..'299'.  Mutates
    and returns ``df_new``.
    """
    embedder = sister.MeanEmbedding(lang="en")
    for row_index, _ in df_new.iterrows():
        question = df_new.at[row_index, "paraphrased_question"]
        for dim, component in enumerate(embedder(question)):
            df_new.at[row_index, str(dim)] = component
    # POS-tag column names plus the embedding dimensions; built for a
    # scaling step that was disabled upstream, kept for parity.
    numerical_cols = [
        '#', '$', "''", '(', ')', ',', '.', ':', 'CC', 'CD', 'DT', 'EX',
        'FW', 'IN', 'JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS',
        'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO',
        'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$',
        'WRB', '``'
    ]
    numerical_cols.extend(str(dim) for dim in range(300))
    return df_new
def __init__(self, lang, tokenizer=None):
    """Build the wrapped mean-embedding model.

    :param lang: language code forwarded to ``sister.MeanEmbedding``.
    :param tokenizer: optional tokenizer; a fresh ``SimpleTokenizer``
        is used when omitted.
    """
    chosen = SimpleTokenizer() if tokenizer is None else tokenizer
    self.embedder = sister.MeanEmbedding(lang=lang, tokenizer=chosen)
import pickle
import numpy as np
import pandas as pd
import sister
from sister.word_embedders import FasttextEmbedding
from sklearn.metrics.pairwise import cosine_similarity
import joblib

# Module-level Spanish fastText mean-embedder shared across this module.
embedder = sister.MeanEmbedding(lang="es", word_embedder = FasttextEmbedding('es'))


def remove_symbol(s):
    # Delete punctuation, a few special characters and the digits 0-2
    # from ``s`` via chained str.replace calls.
    # NOTE(review): no ``return`` statement is visible in this excerpt —
    # the function appears truncated here; confirm the full version
    # returns ``s``.
    s = s.replace(",", "")
    s = s.replace(".", "")
    s = s.replace(";", "")
    s = s.replace(":", "")
    s = s.replace("_", "")
    s = s.replace("+", "")
    s = s.replace("ª", "")
    s = s.replace("-", "")
    s = s.replace("<", "")
    s = s.replace(">", "")
    s = s.replace("!", "")
    s = s.replace("?", "")
    s = s.replace("(", "")
    s = s.replace(")", "")
    s = s.replace("[", "")
    s = s.replace("]", "")
    s = s.replace("'", "")
    s = s.replace("0", "")
    s = s.replace("1", "")
    s = s.replace("2", "")
def __init__(self):
    """Discover command categories, load their data and retrain if needed.

    Scans the category directory for new categories, persists the updated
    category list to a pickle, loads each category's CSV into
    ``self.data``, initialises the embedder when no prepared data exists,
    and retrains the model when new categories appeared or no trained
    model was found.
    """
    self.CATEGORY_FILE = 'exCats.pickle'  # pickled list of known categories
    self.CATEGORIES = None
    self.data_categories = 'category'  #categorized data path used for training
    self.prepared_data = 'voiceConData.pickle'  # cached prepared training data
    self.retrain = False
    self.trained_model = 'voiceCon_NET.hdf5'
    if not os.path.exists(self.trained_model):
        print(
            'No existing trained model found. Preparing to train new model'
        )
        self.retrain = True
    if not os.path.exists(self.CATEGORY_FILE):
        print('There are no existing categories of commands\n\
Creating Category file')
        with open(self.CATEGORY_FILE, 'wb') as Cfile:
            pickle.dump([], Cfile)  # initialize with empty list
        print('New Categroy file created')
    #after new file is created, check categroy dir and load
    print('Checking for new categories...')
    with open(self.CATEGORY_FILE, 'rb') as Cfile:
        self.CATEGORIES = pickle.load(Cfile)
    new_cats = []
    # NOTE(review): os.walk yields (dirpath, dirnames, filenames), so the
    # name ``dirs`` here is actually the *filenames* list — consistent
    # with the '.txt' handling below, but the names are misleading.
    for roots, files, dirs in os.walk(self.data_categories):
        print('Current Categories found: ', dirs)
        for category in dirs:
            if category not in self.CATEGORIES:
                new_cats.append(category)
                print('found new categroy:', category)
                self.CATEGORIES.append(category)
                print('Adding new categroy')
    if len(new_cats) == 0:
        print('No new categories found')
    else:
        self.retrain = True
        print('Remebering new Categories:...', new_cats)
        with open(self.CATEGORY_FILE, 'wb') as Cfile:
            pickle.dump(self.CATEGORIES, Cfile)
    print('Current categories in memory:', self.CATEGORIES)
    #NB for now no ability to delete categories
    #i think i will leave it like this for a while
    #now load data and prepcess
    self.data = {}
    for category in self.CATEGORIES:
        #print(category)
        # NOTE(review): str.strip('.txt') removes *characters* from the
        # set {'.','t','x'} at both ends, not the '.txt' suffix — names
        # starting/ending with those letters get over-stripped.
        label = category.strip('.txt')
        self.data[label] = pd.read_csv(self.data_categories + '/' + category)
    #print(self.data)
    if not os.path.exists(self.prepared_data):
        print('No prepared data for training found')
        print('Initializing Embedder...')
        self.embedder = sister.MeanEmbedding(lang='en')
        self.INIT()
    if self.retrain:
        print(
            'Since new categories were added or new model required, Retraining model...'
        )
        print('Loading Prepared Data')
        output = self.normalize()
        trainingData = output[0]
        labels = output[1]
        trainer.Trainer(trainingData, labels, see_history=True)
import numpy as np
import sister

# Module-level English fastText mean-embedder shared by Intent.
embedder=sister.MeanEmbedding(lang='en')
from scipy.spatial.distance import cosine


class Intent:
    # Minimum cosine similarity for a phrase to count as an intent match.
    threshold=0.7 #say

    def intent_searcher(self,testing_phrases,training_phrases):
        """Check whether any training phrase matches the test phrase.

        ``testing_phrases`` is a raw string (embedded here);
        ``training_phrases`` are pre-computed 300-dim embedding vectors.
        Returns ``(True, best_similarity)`` when the best cosine
        similarity reaches ``threshold``, else ``(False, 0)``.
        """
        iteration_count=0
        testing_phrase=embedder(testing_phrases)
        for training_phrase in training_phrases:
            training_phrase=training_phrase.reshape(300,1)
            # Cosine *similarity* = 1 - cosine distance.
            cosine_sim=1- cosine(training_phrase,testing_phrase)
            if iteration_count == 0:
                cosine_max=cosine_sim
            if (cosine_max<cosine_sim):
                cosine_max=cosine_sim
            iteration_count= iteration_count+1
        # NOTE(review): if training_phrases is empty, cosine_max is never
        # bound and the line below raises UnboundLocalError — confirm
        # callers always pass a non-empty sequence.
        if (cosine_max>=self.threshold):
            return True,cosine_max
        else:
            return False,0
import sister import pickle import os import glob import json import re import random import numpy as np sentence_embedding = sister.MeanEmbedding(lang="en") def welcome_message(): print(' _______ _______ _____ _____ ') print(' |______ |_____| | | | |') print(' ______| | | |_____ __|__ |_____|') def softmax(x): """Compute softmax values for each sets of scores in x.""" return np.exp(x) / np.sum(np.exp(x), axis=0) class Salio(): def __init__(self): self.last_sentence = "hi" self.learn = False try: self.load_pickle("save.p") except:
def setup_model(self):
    """Initialise the sentence-embedding model.

    Stores a ``sister.MeanEmbedding`` (FastText-backed) instance on
    ``self.model`` for later use.
    """
    self.model = sister.MeanEmbedding(lang="en")
Kipp Freud 12/02/2020 ''' #------------------------------------------------------------------ import sister import numpy as np from util.message import message import util.utilities as ut #------------------------------------------------------------------ EMBEDDER = sister.MeanEmbedding(lang="en") #------------------------------------------------------------------ # ----------------------------------------------------------------------------------------- # public functions # ----------------------------------------------------------------------------------------- @ut.timeit def embed(sent): """ Will return a vector embedding of the given string sentence. """ if not isinstance(sent, str): message.logError("Given sentence must be a string.", "nlp_util::embed")