def build_tsne_embeddings(tags, c2v_model=None, tsne_model=None):
    """Embed *tags* in 2-D with chars2vec followed by t-SNE.

    1. chars2vec turns each tag into a 150-dim vector -> (num_tags, 150).
    2. sklearn TSNE projects those vectors down to 2 dimensions.

    :return: a NumPy array of shape (num_tags, 2)
    """
    if not c2v_model:
        c2v_model = c2v.load_model("train_fr_150/")
    vectors = c2v_model.vectorize(tags)
    print(f"Word embedding shape: {vectors.shape}")
    if not tsne_model:
        # perplexity: values between 5 and 50 are worth experimenting with
        # n_components: project the (num_words, 150) matrix down to 2-D
        # n_iter: optimization iterations, must be >= 250
        # random_state: fixed seed for reproducible layouts
        tsne_model = TSNE(perplexity=40,
                          n_components=2,
                          init="pca",
                          n_iter=2500,
                          random_state=23)
    projected = tsne_model.fit_transform(vectors)
    print(f"T-SNE embedding shape: {projected.shape}")
    return projected
def pca(l_cluster):
    # Scatter-plot a 2-D PCA projection of the chars2vec embeddings of the songs
    # read from 'datasetParsingDEF.csv', one point per song, colored by cluster,
    # and save the figure under ./Scatter/Kmean/.
    # NOTE(review): depends on module globals — song_Strings, song_Names,
    # embedding, n_clusterString, label_color_Final, trasformLabelColor,
    # modificaLabelColor — which must exist before this is called; verify.
    with open('datasetParsingDEF.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                # first row is the CSV header
                print(f'Column names are {", ".join(row)}')
                line_count += 1
            else:
                # row[1] is the song filename; [:-4] strips its extension
                print('WORD', row[1][:-4])
                song_Strings.append(row[2])
                song_Names.append(row[1][:-4])
                line_count += 1
        print(f'Processed {line_count} lines.')
    # Load Inutition Engineering pretrained model
    # Models names: 'eng_50', 'eng_100', 'eng_150' 'eng_200', 'eng_300'
    c2v_model = chars2vec.load_model(embedding)
    # Create word embeddings
    word_embeddings = c2v_model.vectorize_words(song_Strings)
    # Project embeddings on plane using the PCA
    projection_2d = sklearn.decomposition.PCA(
        n_components=2).fit_transform(word_embeddings)
    # Draw words on plane
    f = plt.figure(figsize=(8, 6))
    plt.title("KMean - Divisione : " + n_clusterString +
              ' Cluster - Embedding : ' + embedding)
    #label_color = [LABEL_COLOR_MAP[l] for l in l_cluster]
    print(song_Names)
    # these helpers build/patch the global label_color_Final used below
    trasformLabelColor(l_cluster)
    print(label_color_Final)
    i = 0
    print(len(l_cluster))
    modificaLabelColor()
    print(label_color_Final)
    for j in range(len(projection_2d)):
        print(j)
        plt.scatter(projection_2d[j, 0],
                    projection_2d[j, 1],
                    marker=('$' + 'o' + '$'),
                    s=30,
                    label=j,
                    c=label_color_Final[j])
        i = i + 1
    plt.savefig('./Scatter/Kmean/' + embedding + '/' + n_clusterString + '.png')
def spectralClustering():
    """Spectral-cluster the dataset words plus the candidate pair.

    Reads words from './datasetFit.csv' and the extra rows from
    './datasetCouple.csv', embeds everything with chars2vec 'eng_50',
    runs SpectralClustering (9 clusters), and reports whether the last
    two items were assigned to the same cluster.
    """
    words = []
    # collect column 2 of every non-header row of the fit dataset
    with open('./datasetFit.csv') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        header = True
        for row in reader:
            if header:
                #print(f'Column names are {", ".join(row)}')
                header = False
            else:
                words.append(row[2])
    # append the couple rows (whole row objects, stringified below)
    with open('./datasetCouple.csv') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        header = True
        for row in reader:
            if header:
                #print(f'Column names are {", ".join(row)}')
                header = False
            else:
                words.append(row)
    # normalise every entry to a plain string: drop list brackets and quotes
    arrayDiStringhe = []
    for entry in words:
        if entry:
            cleaned = str(entry).replace("[", "").replace("]", "")
            cleaned = cleaned.replace("'", "")
            arrayDiStringhe.append(cleaned)
    c2v_model = chars2vec.load_model('eng_50')
    word_embeddings = c2v_model.vectorize_words(arrayDiStringhe)
    clustering = SpectralClustering(n_clusters=9,
                                    assign_labels="discretize",
                                    random_state=0).fit(word_embeddings)
    labels = clustering.labels_
    # True when the two appended words landed in the same cluster
    return bool(labels[-1] == labels[-2])
def pca(l_cluster):
    """Scatter-plot the 2-D PCA projection of chars2vec word embeddings.

    Reads (label, word) pairs from 'datasetParsingDEF.csv', embeds the words
    with the pretrained 'eng_100' chars2vec model, projects them to 2-D with
    PCA, and colors each point via the module-level `label_color` mapping.

    :param l_cluster: per-word cluster index, parallel to the CSV data rows
    """
    words = []
    etichette = []
    with open('datasetParsingDEF.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                # first row is the CSV header
                print(f'Column names are {", ".join(row)}')
            else:
                print(row[1], row[2])
                words.append(row[2])
                etichette.append(row[1])
            line_count += 1
        print(f'Processed {line_count} lines.')
    # Load Inutition Engineering pretrained model
    # Models names: 'eng_50', 'eng_100', 'eng_150' 'eng_200', 'eng_300'
    c2v_model = chars2vec.load_model('eng_100')
    # Create word embeddings
    word_embeddings = c2v_model.vectorize_words(words)
    # Project embeddings on plane using the PCA
    projection_2d = sklearn.decomposition.PCA(
        n_components=2).fit_transform(word_embeddings)
    # Draw words on plane.
    # FIX: dropped the unused `i` counter and the unused figure handle `f`.
    plt.figure(figsize=(8, 6))
    #label_color = [LABEL_COLOR_MAP[l] for l in l_cluster]
    print(label_color)
    print(len(l_cluster))
    for j in range(len(projection_2d)):
        print(j)
        plt.scatter(projection_2d[j, 0],
                    projection_2d[j, 1],
                    marker=('$' + 'o' + '$'),
                    s=30,
                    label=j,
                    c=label_color[l_cluster[j]])
    plt.show()
def getCommentEmbeddings(model_name, comments):
    """Vectorize comment strings with a pretrained chars2vec model.

    :param model_name: one of 'eng_50', 'eng_100', 'eng_150', 'eng_200', 'eng_300'
    :param comments: a flat list of strings, or a list of lists of strings
    :return: an embeddings array for a flat list, or a list of arrays
             (one per sub-list) for a nested list
    """
    if not model_name in [
            'eng_50', 'eng_100', 'eng_150', 'eng_200', 'eng_300'
    ]:
        print(
            "Error: arguments 'model' should be one of eng_50, eng_100, eng_150, eng_200, and eng_300"
        )
        exit()
    if isinstance(comments[0], str):
        c2v_model = chars2vec.load_model(model_name)
        comments = list(map(lambda x: x.strip(), comments))
        comment_embeddings = c2v_model.vectorize_words(comments)
        return comment_embeddings
    elif isinstance(comments[0][0], str):
        # FIX: load the model once — the original re-loaded it inside the loop
        # for every sub-list, which is pure wasted work.
        c2v_model = chars2vec.load_model(model_name)
        comments_embeddings = []
        for comments_ in comments:
            comments_ = list(map(lambda x: x.strip(), comments_))
            comments_embeddings.append(c2v_model.vectorize_words(comments_))
        return comments_embeddings
def __init__(self, dataset_dir, vocab_size, word_size, char_size, mode): self.vocab_size = vocab_size # 128 -> ascii number self.word_size = word_size self.char_size = char_size self.mode = mode self.c2v_model = chars2vec.load_model('eng_{}'.format(char_size)) # dir = <dir>/{train|val}/<filename>.json self.filenames = sorted([ os.path.join(dataset_dir, mode, f) for f in os.listdir(os.path.join(dataset_dir, mode)) if re.match(r'.*\.json', f) ]) self.data = [self.read_file(file) for file in self.filenames] self.document_lists = [] self.labels = []
def main():
    """Train the categorical comment classifier, then run prediction.

    Loads the chars2vec 'eng_50' model, trains on the Datasets/ text files,
    then reloads the saved Keras model and evaluates it.
    """
    max_review_length = 50
    c2v_model = chars2vec.load_model('eng_50')
    num_class = 9
    char_dic = create_dic()
    # Training model.
    # FIX: the original opened both files and never closed them — use `with`
    # so the handles are released once training is done.
    with open("Datasets/categ.txt", "r") as f_train_in, \
            open("Datasets/outs.txt", "r") as f_train_out:
        train_model(f_train_in, f_train_out, max_review_length, c2v_model,
                    num_class)
    # Predict
    embedding_model = load_model(
        "Categorical_classifier_models/Categorical_classifier_embedd.h5")
    test_model(embedding_model, char_dic, max_review_length)
    return
LR_INIT = 1e-4 VOCAB_SIZE = 128 WORD_SIZE = 250 CHAR_SIZE = 50 WARMUP_EPOCHS = 100 TRAIN_EPOCHS = 1500 NUM_CLASS = 5 GRID_SIZE = [64, 64] CLASS_NAME = [ "Don't care", "Merchant Name", "Merchant Address", "Transaction Date", "Total" ] # model config c2v_model = chars2vec.load_model('eng_{}'.format(150)) model = GridClassifier(num_class=NUM_CLASS, gird_size=GRID_SIZE) optimizer = tf.keras.optimizers.Adam(lr=LR_INIT, clipnorm=10.0) cross_entropy = SparseCategoricalCrossentropy(from_logits=True) model_ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=optimizer, net=model) model_manager = tf.train.CheckpointManager( model_ckpt, './checkpoints/grid_receipt_classifier_train.tf', max_to_keep=5) if model_manager.latest_checkpoint: # restore checkpoint model_ckpt.restore(model_manager.latest_checkpoint) print("Restored from {}".format(model_manager.latest_checkpoint)) else:
import chars2vec import sklearn.decomposition import matplotlib.pyplot as plt # Load Inutition Engineering pretrained model # Models names: 'eng_50', 'eng_100', 'eng_150' c2v_model = chars2vec.load_model('eng_50') words = [ 'Natural', 'Language', 'Understanding', 'Naturael', 'Longuge', 'Updderctundjing', 'Motural', 'Lamnguoge', 'Understaating', 'Naturrow', 'Laguage', 'Unddertandink', 'Nattural', 'Languagge', 'Umderstoneding' ] # Create word embeddings word_embeddings = c2v_model.vectorize_words(words) # Project embeddings on plane using the PCA projection_2d = sklearn.decomposition.PCA( n_components=2).fit_transform(word_embeddings) # Draw words on plane f = plt.figure(figsize=(8, 6)) for j in range(len(projection_2d)): plt.scatter(projection_2d[j, 0], projection_2d[j, 1], marker=('$' + words[j] + '$'), s=500 * len(words[j]), label=j, facecolors='green' if words[j]
def __init__(
        self,
        language_model='eng_50',
):
    """Load the pretrained chars2vec model named by *language_model*."""
    # phoc_vectors have a size n_test_samplesXn_ensmbles
    self.c2v_model = chars2vec.load_model(language_model)
import chars2vec as c2v import sklearn.decomposition import matplotlib.pyplot as plt import tensorflow as tf """ Mute tensorflow warning """ tf.logging.set_verbosity(tf.logging.ERROR) """ 2D Visualization script by using PCA on the vectorization of a list of words""" c2v_model = c2v.load_model("train_fr_150") """ Words to visualize """ words = [ 'est', 'ezt', 'zest', 'carotte', 'carote', 'carottte', 'langage', 'language', 'langqge', 'francais', 'franssais', 'francqis', 'bread', 'brad', 'breod', 'broad' ] word_embeddings = c2v_model.vectorize(words) """ Optional print of euclidean distances between vectors """ def print_distance(words): import numpy as np print("\t", end='') for word in words: print("%-10.6s" % word, end='\t') print("") for i, vec1 in enumerate(word_embeddings): print(words[i], end=' ') for vec2 in word_embeddings: print("%10.4f" % np.linalg.norm(vec1 - vec2), end='\t')
def execute_spell_suggester(input_word):
    """Return the spell corrector's suggestion for *input_word*.

    Loads the chars2vec model from the module-level `model_path` and delegates
    to spell_corrector.compute_correct_word with the module-level
    `startChar_to_words_dict` index.
    """
    embedder = chars2vec.load_model(model_path)
    return spell_corrector.compute_correct_word(input_word,
                                                startChar_to_words_dict,
                                                embedder)
# Toy training set: word pairs labelled 0 (similar spellings) / 1 (dissimilar).
X_train = [
    ('mecbanizing', 'mechanizing'),  # similar words, target is equal 0
    ('dicovery', 'dis7overy'),  # similar words, target is equal 0
    ('prot$oplasmatic', 'prtoplasmatic'),  # similar words, target is equal 0
    ('copulateng', 'lzateful'),  # not similar words, target is equal 1
    ('estry', 'evadin6'),  # not similar words, target is equal 1
    ('cirrfosis', 'afear')  # not similar words, target is equal 1
]
y_train = [0, 0, 0, 1, 1, 1]

# Characters covered by the model's vocabulary.
model_chars = [
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>',
    '?', '@', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'
]

# Create and train chars2vec model using given training data.
# NOTE(review): `dim` and `path_to_model` must be defined earlier in the file.
my_c2v_model = chars2vec.train_model(dim, X_train, y_train, model_chars)

# Save pretrained model
chars2vec.save_model(my_c2v_model, path_to_model)

words = ['list', 'of', 'words']

# Load pretrained model, create word embeddings
c2v_model = chars2vec.load_model(path_to_model)
word_embeddings = c2v_model.vectorize_words(words)

# BUG FIX: vectorize_words returns a NumPy array indexed by row position, not a
# mapping — `word_embeddings['of']` raised at runtime. Look the word up by its
# position in `words` instead.
embedding_of = word_embeddings[words.index('of')]
def pca(l_cluster):
    # Like the KMean scatter variant, but reads 'datasetParsing2DEF.csv',
    # draws an interactive plotly scatter first, then redraws with matplotlib.
    # NOTE(review): depends on module globals — song_Strings, song_Names,
    # embedding, n_clusterString, algo, label_color_Final, trasformLabelColor,
    # modificaLabelColor — which must exist before this is called; verify.
    with open('datasetParsing2DEF.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                # first row is the CSV header
                print(f'Column names are {", ".join(row)}')
                line_count += 1
            else:
                # row[1] is the song filename; [:-4] strips its extension
                print('WORD', row[1][:-4])
                song_Strings.append(row[2])
                song_Names.append(row[1][:-4])
                line_count += 1
        print(f'Processed {line_count} lines.')
    # Load Inutition Engineering pretrained model
    # Models names: 'eng_50', 'eng_100', 'eng_150' 'eng_200', 'eng_300'
    c2v_model = chars2vec.load_model(embedding)
    # Create word embeddings
    word_embeddings = c2v_model.vectorize_words(song_Strings)
    # Project embeddings on plane using the PCA
    projection_2d = sklearn.decomposition.PCA(
        n_components=2).fit_transform(word_embeddings)
    # Draw words on plane
    f = plt.figure(figsize=(8, 6))
    plt.title("KMean - Divisione : " + n_clusterString +
              ' Cluster - Embedding : ' + embedding)
    #label_color = [LABEL_COLOR_MAP[l] for l in l_cluster]
    print(song_Names)
    # these helpers build/patch the global label_color_Final used below
    trasformLabelColor(l_cluster)
    print(label_color_Final)
    i = 0
    print(len(l_cluster))
    modificaLabelColor()
    print(label_color_Final)
    print(len(projection_2d))
    # split the 2-D projection into x/y lists for plotly
    assex = []
    assey = []
    for j in range(0, len(projection_2d)):
        assex.append(projection_2d[j, 0])
        assey.append(projection_2d[j, 1])
    fig = go.Figure(data=go.Scatter(
        x=assex,
        y=assey,
        mode='markers',
        text=song_Names,
        marker=dict(
            size=16,
            color=label_color_Final,  # set color equal to a variable
            showscale=True,
        )))
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False)
    fig.update_layout(title_text=algo + ' ' + embedding + ' ' + n_clusterString,
                      plot_bgcolor='rgb(236,241,243)')
    fig.show()
    # matplotlib version of the same scatter
    for j in range(len(projection_2d)):
        print(j)
        plt.scatter(projection_2d[j, 0],
                    projection_2d[j, 1],
                    marker=('$' + 'o' + '$'),
                    s=30,
                    label=j,
                    c=label_color_Final[j])
        i = i + 1
def __init__(self, language_model = 'eng_50', max_word_len = 20):
    """Set up the chars2vec embedder and the derived output sizes."""
    # phoc_vectors have a size n_test_samplesXn_ensmbles
    self.c2v_model = chars2vec.load_model(language_model)
    # Per-word embedding dimensionality, probed by vectorizing a dummy word.
    self.len_vec = len(self.c2v_model.vectorize_words(['dump']).squeeze())
    # Flattened output length: max_word_len words x the model dimension
    # parsed from the model name (e.g. 'eng_50' -> 50).
    self.len_output = max_word_len*int(language_model.replace('eng_',''))
def generateCSV(labels, name):
    # Write cluster assignments to 'cluster.csv': one row per song with its
    # index, name (extension stripped), and cluster label.
    with open('cluster.csv', 'w') as csvfile:
        filewriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        filewriter.writerow(['N', 'Song', 'Cluster'])
        i = 0
        for l in labels:
            # name[i][:-4] strips the file extension from the song filename
            single_Name = name[i][:-4]
            filewriter.writerow([i, single_Name, l])
            i = i + 1


# Module-level setup: load the chars2vec model selected by the global
# `embedding` and read the (label, word) pairs from the dataset.
c2v_model = chars2vec.load_model(embedding)
words = []
etichette = []
with open('datasetParsingDEF.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            # first row is the CSV header
            print(f'Column names are {", ".join(row)}')
            line_count += 1
        else:
            print(row[1], row[2])
            words.append(row[2])
            etichette.append(row[1])
            # NOTE(review): this chunk looks truncated — the else branch in the
            # sibling loaders also increments line_count here; confirm against
            # the full file.
def wembed_features(df, model, tokenizer, char_embeddings=False, window=3):
    # Build per-row similarity features from sliding-window averages of word
    # embeddings (optionally concatenated with chars2vec character embeddings).
    # NOTE(review): assumes df has a 'text' column and `model` is a gensim-style
    # word-vector model exposing .vector_size and .word_vec — confirm callers.
    # Features Matrix
    features = []
    if char_embeddings == True:
        # chars2vec feature width; 'eng_100' -> 100 extra dims per token
        c2v_features_size = 100
        char2vec_model = chars2vec.load_model('eng_'+str(c2v_features_size))
    for ind, row in df.iterrows():
        # Get text
        string = row['text']
        # Tokenize
        tokens = tokenizer.tokenize(string)
        tokens = list(tokens)
        tokens = [x.lower() for x in tokens]
        # tokens = [lemmatizer.lemmzatize(tok) for tok in tokens]
        token_vectors = []
        accepted_tokens = []
        # pad the front with window//2 zero vectors
        for i in range(int(window/2)):
            vector_size = model.vector_size
            if char_embeddings == True:
                vector_size += c2v_features_size
            token_vectors.append(np.zeros(vector_size))
            accepted_tokens.append('null')
        for token in tokens:
            try:
                wembed_features = model.word_vec(token)
                wembed_features = np.reshape(
                    wembed_features, (1, wembed_features.shape[0]))
                if char_embeddings == True:
                    c2v_feature = char2vec_model.vectorize_words([token])
                    wembed_features = np.hstack((wembed_features, c2v_feature))
                token_vectors.append(wembed_features)
                accepted_tokens.append(token)
            except Exception as e:
                # OOV token: substitute a random vector and tag the token
                wembed_features_size = model.vector_size
                if char_embeddings == True:
                    wembed_features_size += c2v_features_size
                token_vectors.append(np.random.rand(wembed_features_size))
                accepted_tokens.append(token+'#rand')
        # pad the back with window//2 zero vectors
        for i in range(int(window/2)):
            vector_size = model.vector_size
            if char_embeddings == True:
                vector_size += c2v_features_size
            token_vectors.append(np.zeros(vector_size))
            accepted_tokens.append('null')
        # Window Buffer
        last = 0
        vector_size = model.vector_size
        if char_embeddings == True:
            vector_size += c2v_features_size
        window_buffer = np.zeros((window, vector_size))
        # Final Vector List
        final_vectors = []
        final_tokens = []
        for vector in token_vectors:
            if last < window:
                # Update Buffer with new vector
                window_buffer[last, :] = vector
                # If window is full
                if last == window-1:
                    new_vec = window_buffer.mean(axis=0)
                    final_vectors.append(new_vec)
                    final_tokens.append('-'.join(accepted_tokens[0:3]))
                last += 1
            else:
                # buffer full: overwrite the oldest slot (ring buffer)
                if window == 1:
                    next_pos = 0
                else:
                    next_pos = (last % window)
                window_buffer[next_pos, :] = vector
                new_vec = window_buffer.mean(axis=0)
                final_vectors.append(new_vec)
                final_tokens.append(
                    '-'.join(accepted_tokens[last+1-window:last+1]))
                last += 1
            # End of Buffer
            if last == len(token_vectors):
                break
        # Free Up Memory
        del window_buffer
        del accepted_tokens
        del token_vectors
        # If final_vectors is empty fill zeros
        if len(final_tokens) < 2:
            features.append([0 for i in range(0, 8)])
        else:
            features.append(wembed_util(final_vectors))
        # print('features: ',features)
        print('ws: ', window, ind)
    features_df = pd.DataFrame(features, columns=[
        'max_sim', 'min_sim', 'max_dsim', 'min_dsim',
        'max_wsim', 'min_wsim', 'max_wdsim', 'min_wdsim'])
    # fill NaNs produced by degenerate rows
    if len(features_df.isnull().any(1).nonzero()[0]) == 0:
        print("No Nans")
    else:
        features_df = features_df.fillna(0)
    return features_df