def produce_image_from_model(model_fn, logdir, **unused):
    dataset = input_data.read_data_sets("data/MNIST/", one_hot=False, reshape=False)
    inputs = np.concatenate(
        [x.images for x in [dataset.train, dataset.validation, dataset.test]])
    labels = np.concatenate(
        [x.labels for x in [dataset.train, dataset.validation, dataset.test]])
    images = vector_to_matrix_mnist(inputs)
    sprite_array = 1 - create_sprite_image(images)
    imsave(f'{logdir}/sprites.png', sprite_array)

    batch_size = None
    input_t = tf.placeholder(dtype=tf.float32,
                             name='input_t',
                             shape=(batch_size, *inputs.shape[1:]))
    z_t = model_fn(input_t, 2)
    tf.train.get_or_create_global_step()
    sess = tf.train.MonitoredTrainingSession(checkpoint_dir=logdir,
                                             save_checkpoint_secs=None,
                                             save_summaries_steps=None)

    print('generating points....')
    batch_size = 1000
    points = np.concatenate([
        sess.run(z_t, feed_dict={input_t: inputs[i:i + batch_size]})
        for i in tqdm(range(0, len(inputs), batch_size))
    ])
    # add another zeroed dimension to get 3 dimensions for tensorflow projector
    points = np.stack(
        (points[:, 0], points[:, 1], np.zeros(shape=len(points))), axis=1)
    save_embeddings(outputs=points, labels=labels, logdir=logdir)
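# The save_embeddings helper called above is not part of this excerpt. Below is a
# minimal sketch of what it might look like for the TensorBoard projector workflow,
# assuming the TF1-style projector API that the rest of this function uses; the
# file names and the variable name are illustrative, not taken from the original code.
import os

import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector


def save_embeddings(outputs, labels, logdir):
    # One label per row lets the projector colour points by class.
    with open(os.path.join(logdir, 'metadata.tsv'), 'w') as f:
        for label in labels:
            f.write('{}\n'.format(label))

    # Store the embedding matrix in a checkpointed variable the projector can read.
    embedding_var = tf.Variable(np.asarray(outputs), name='embedding')
    with tf.Session() as sess:
        sess.run(embedding_var.initializer)

        config = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name
        embedding.metadata_path = 'metadata.tsv'
        embedding.sprite.image_path = 'sprites.png'
        embedding.sprite.single_image_dim.extend([28, 28])
        projector.visualize_embeddings(tf.summary.FileWriter(logdir), config)

        tf.train.Saver([embedding_var]).save(
            sess, os.path.join(logdir, 'embedding.ckpt'))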
def evaluate(model, test_loader, test_embeddings, save=True, model_name=None):
    mean_pred_embeddings = predict_mean_embeddings(model, test_loader)
    if save:
        if model_name is None:
            raise ValueError('A filename should be provided.')
        save_embeddings(mean_pred_embeddings, model_name)

    euclidean_distances = []
    cos_sims = []
    nb_of_pred = 0
    for label in mean_pred_embeddings:
        if label in test_embeddings:
            y_pred = mean_pred_embeddings[label].reshape(1, -1)
            y_true = test_embeddings[label].reshape(1, -1)
            euclidean_distances.append(eucl_dist(y_true, y_pred))
            cos_sims.append(cos_sim(y_true, y_pred))
            nb_of_pred += 1

    logging.info('\nResults on the test set:')
    logging.info('Mean euclidean dist: {}'.format(np.mean(euclidean_distances)))
    logging.info('Std of euclidean dist: {}'.format(np.std(euclidean_distances)))
    logging.info('Mean cosine sim: {}'.format(np.mean(cos_sims)))
    logging.info('Std of cosine sim: {}'.format(np.std(cos_sims)))
    logging.info('Number of labels evaluated: {}'.format(nb_of_pred))
    return mean_pred_embeddings
def save(self):
    ent_embeds = self.ent_embeds.eval(session=self.session)
    nv_ent_embeds = self.name_embeds.eval(session=self.session)
    rv_ent_embeds = self.rv_ent_embeds.eval(session=self.session)
    av_ent_embeds = self.av_ent_embeds.eval(session=self.session)
    rel_embeds = self.rel_embeds.eval(session=self.session)
    att_embeds = self.attr_embeds.eval(session=self.session)
    save_embeddings(self.out_folder, self.kgs, ent_embeds, nv_ent_embeds,
                    rv_ent_embeds, av_ent_embeds, rel_embeds, att_embeds)
def train_skipgram(corpus_dir, extn, learning_rate, embedding_size,
                   num_negsample, epochs, batch_size, output_dir, valid_size):
    '''
    :param corpus_dir: folder containing WL kernel relabeled files. All files in this folder will be
        relabeled according to the WL relabeling strategy, and each line in these files has the format:
        <target> <context 1> <context 2> ...
    :param extn: extension of the WL relabeled files
    :param learning_rate: learning rate for the skipgram model (will involve a linear decay)
    :param embedding_size: number of dimensions to be used for learning subgraph representations
    :param num_negsample: number of negative samples to be used by the skipgram model
    :param epochs: number of times the dataset is traversed by the skipgram model
    :param batch_size: size of each batch for the skipgram model
    :param output_dir: folder where the embedding file will be stored
    :param valid_size: number of subgraphs to be chosen at random to validate the quality of the
        subgraph representations in every epoch
    :return: name of the file that contains the subgraph embeddings (in the word2vec format proposed
        by Mikolov et al. (2013))
    '''
    op_fname = '_'.join([os.path.basename(corpus_dir), 'dims', str(embedding_size),
                         'epochs', str(epochs), 'embeddings.txt'])
    op_fname = os.path.join(output_dir, op_fname)
    if os.path.isfile(op_fname):
        logging.info('The embedding file: {} is already present, hence NOT training skipgram model '
                     'for subgraph vectors'.format(op_fname))
        return op_fname

    logging.info("Initializing SKIPGRAM...")
    corpus = Corpus(corpus_dir, extn=extn, max_files=0)  # just load 'max_files' files from this folder
    corpus.scan_and_load_corpus()
    valid_examples = np.concatenate(
        (np.random.choice(corpus.high_freq_word_ids, valid_size, replace=False),
         np.random.choice(corpus.low_freq_word_ids, valid_size, replace=False)))

    model_skipgram = skipgram(
        doc_size=corpus._vocabsize,  # for the doc2vec skipgram model, the doc size equals the word vocabulary size
        vocabulary_size=corpus._vocabsize,  # size of input and output layers
        learning_rate=learning_rate,  # decays linearly over time
        embedding_size=embedding_size,  # hidden layer neurons
        num_negsample=num_negsample,
        num_steps=epochs,  # number of times the training set will be iterated through
        corpus=corpus,  # dataset of (target, context) tuples
        valid_dataset=valid_examples,  # validation set (a small subset of word ids)
    )

    final_embeddings, final_weights = model_skipgram.train(
        corpus=corpus,
        batch_size=batch_size,
        valid_dataset=valid_examples,
    )

    logging.info('Writing the embedding matrix to a word2vec format file')
    save_embeddings(corpus, final_embeddings, embedding_size, op_fname)
    logging.info('Completed writing the final embeddings, please check file: {}'.format(op_fname))
    return op_fname
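# Example invocation of train_skipgram; every argument value below is
# illustrative only and not taken from the original code:
#
# embedding_file = train_skipgram(
#     corpus_dir='data/wl_corpus', extn='WL2', learning_rate=0.1,
#     embedding_size=128, num_negsample=10, epochs=5, batch_size=256,
#     output_dir='embeddings', valid_size=10)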
def predict_OOV(model, char_to_idx, OOV_path, filename):
    OOVs = load_vocab(OOV_path)
    vectorizer = Vectorizer(char_to_idx)
    examples = [(vectorizer.vectorize_sequence(word), word) for word in OOVs]
    loader = DataLoader(examples, collate_fn=collate_x, use_gpu=False, batch_size=1)

    model.model.eval()
    predicted_embeddings = {}
    for x, y in loader:
        x = tensors_to_variables(x)
        embeddings = torch_to_numpy(model.model(x))
        for label, embedding in zip(y, embeddings):
            predicted_embeddings[label] = embedding

    save_embeddings(predicted_embeddings, filename)
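# Here save_embeddings receives a {label: vector} dict. A minimal sketch of such
# a helper, assuming a plain word2vec-style text output; the '.txt' suffix and
# the exact format are assumptions, not taken from the original code.
def save_embeddings(embeddings, filename):
    dim = len(next(iter(embeddings.values())))
    with open('{}.txt'.format(filename), 'w', encoding='utf-8') as f:
        # word2vec text header: vocabulary size and vector dimensionality.
        f.write('{} {}\n'.format(len(embeddings), dim))
        for label, vector in embeddings.items():
            f.write('{} {}\n'.format(label, ' '.join(str(v) for v in vector)))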
def train():
    model = get_loaded_model(force_gpu=True)
    embeddings = get_embeddings(model=model, data=get_data())
    save_embeddings(embeddings, embeddings_file)
        # Validation-phase excerpt: the block below runs once per validation batch.
        x, target, label = batch['embedding'], batch['embedding'], batch['label']
        (decoded, predict) = net(x)

        # loss
        val_dec_loss[0].append(sparse_autoencoder_error(decoded[0], target[0]))
        val_dec_loss[1].append(sparse_autoencoder_error(decoded[1], target[1]))
        val_dec_loss[2].append(sparse_autoencoder_error(decoded[2], target[2]))
        val_cla_loss.append(BCE_loss(predict, label))
        val_loss.append(sum([val_dec_loss[i][it] for i in range(3)]) * opt.alpha
                        + val_cla_loss[it])

        # metrics
        val_mse[0].append(MSE(decoded[0], target[0]))
        val_mse[1].append(MSE(decoded[1], target[1]))
        val_mse[2].append(MSE(decoded[2], target[2]))
        val_acc.append((predict > 0.5) == label.byte())
        it += 1

    # Back to training mode; report validation metrics.
    net.train()
    print("  validation: \n  val_loss: %4f, decode_0: %4f, decode_1: %4f, decode_2: %4f, classify_loss: %4f"
          % (Mean(val_loss), Mean(val_dec_loss[0]), Mean(val_dec_loss[1]),
             Mean(val_dec_loss[2]), Mean(val_cla_loss)))
    print("  val_mean_square_error: decode_0: %4f, decode_1: %4f, decode_2: %4f"
          % (Mean(val_mse[0]), Mean(val_mse[1]), Mean(val_mse[2])))
    print("  val_classify_accuracy: %.4f" % Mean(val_acc))

    # save model & embeddings
    save_model(net, opt)
    save_embeddings(net, dataset, opt)

    end_time = time.time()
    print("time: ", end_time - begin_time)
def test(experiment_name, task, gpu_num=0, pretrained='', margin=0.4, losstype='deepcca'):
    cosined = False
    embed_dim = 1024
    gpu_num = int(gpu_num)
    margin = float(margin)

    # Setup the results directory and device.
    results_dir = setup_dirs(experiment_name)
    if not os.path.exists(results_dir + 'test_results/'):
        os.makedirs(results_dir + 'test_results/')
    test_results_dir = results_dir + 'test_results/'
    device = setup_device(gpu_num)

    #### Hyperparameters ####
    # Initialize wandb
    # import wandb
    # wandb.init(project=experiment_name)
    # config = wandb.config
    with open(results_dir + 'hyperparams_test.txt', 'w') as f:
        f.write('Command used to run: python ')
        f.write(' '.join(sys.argv))
        f.write('\n')
        f.write('device in use: ' + str(device))
        f.write('\n')
        f.write('--experiment_name ' + str(experiment_name))
        f.write('\n')

    # Setup data loaders and models based on task.
    if task == 'cifar10':
        train_loader, test_loader = cifar10_loaders()
        model_A = CIFAREmbeddingNet()
        model_B = CIFAREmbeddingNet()
    elif task == 'mnist':
        train_loader, test_loader = mnist_loaders()
        model_A = MNISTEmbeddingNet()
        model_B = MNISTEmbeddingNet()
    elif task == 'uw':
        uw_data = 'bert'
        train_loader, test_loader = uw_loaders(uw_data)
        if uw_data == 'bert':
            model_A = RowNet(3072, embed_dim=1024)  # Language.
            model_B = RowNet(4096, embed_dim=1024)  # Vision.

    # Finish model setup.
    model_A.load_state_dict(torch.load(results_dir + 'train_results/model_A_state.pt'))
    model_B.load_state_dict(torch.load(results_dir + 'train_results/model_B_state.pt'))
    model_A.to(device)
    model_B.to(device)

    # Put models into evaluation mode.
    model_A.eval()
    model_B.eval()

    # For UW data: use the train data to calculate the threshold for distance.
    a_train = []
    b_train = []
    # Load saved embeddings, if present, to be faster.
    a_train = load_embeddings(test_results_dir + 'lang_embeds_train.npy')
    b_train = load_embeddings(test_results_dir + 'img_embeds_train.npy')

    # Iterate through the train data only if no cached embeddings were found.
    if a_train is None or b_train is None:
        a_train = []
        b_train = []
        print("Computing embeddings for train data to calculate threshold for distance")
        for data in train_loader:
            anchor_data = data[0].to(device)
            positive_data = data[1].to(device)
            label = data[2]
            a_train.append(model_A(anchor_data.to(device)).cpu().detach().numpy())
            b_train.append(model_B(positive_data.to(device)).cpu().detach().numpy())
        print("Finished computing embeddings for train data")
        # Save embeddings since they were not already cached.
        save_embeddings(test_results_dir + 'lang_embeds_train.npy', a_train)
        save_embeddings(test_results_dir + 'img_embeds_train.npy', b_train)

    a_train = np.concatenate(a_train, axis=0)
    b_train = np.concatenate(b_train, axis=0)

    # Test data.
    # Accumulate predictions to check the embedding visually using the test set.
    # a holds embeddings from domain A, b holds embeddings from domain B, ys their labels.
    a = []
    b = []
    ys = []
    instance_data = []

    # Load saved embeddings, if present, to be faster.
    a = load_embeddings(test_results_dir + 'lang_embeds.npy')
    b = load_embeddings(test_results_dir + 'img_embeds.npy')
    compute_test_embeddings = False
    if a is None or b is None:
        compute_test_embeddings = True
        a = []
        b = []

    # Iterate through the test data.
    print("computing embeddings for test data")
    for data in test_loader:
        language_data, vision_data, object_name, instance_name = data
        language_data = language_data.to(device)
        vision_data = vision_data.to(device)
        instance_data.extend(instance_name)
        if compute_test_embeddings:
            a.append(model_A(language_data).cpu().detach().numpy())  # Language.
            b.append(model_B(vision_data).cpu().detach().numpy())  # Vision.
        ys.extend(object_name)
    print("finished computing embeddings for test data")

    # Convert string labels to ints.
    labelencoder = LabelEncoder()
    labelencoder.fit(ys)
    ys = labelencoder.transform(ys)

    # Save embeddings if not already saved.
    save_embeddings(test_results_dir + 'lang_embeds.npy', a)
    save_embeddings(test_results_dir + 'img_embeds.npy', b)

    # Concatenate predictions.
    a = np.concatenate(a, axis=0)
    b = np.concatenate(b, axis=0)
    ab = np.concatenate((a, b), axis=0)

    ground_truth, predicted, distance = object_identification_task_classifier(
        a, b, ys, a_train, b_train, lamb_std=1, cosine=cosined)

    #### Retrieval task: given an image, find the closest word descriptions. ####
    ground_truth_word, predicted_word, distance_word = object_identification_task_classifier(
        b, a, ys, b_train, a_train, lamb_std=1, cosine=cosined)

    with open('retrieval_non_pro.csv', mode='w') as retrieval_non_pro:
        csv_file_writer = csv.writer(retrieval_non_pro, delimiter=',',
                                     quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csv_file_writer.writerow(['image', 'language', 'predicted', 'ground truth'])
        for i in range(50):
            csv_file_writer.writerow([instance_data[0], instance_data[i],
                                      predicted_word[0][i], ground_truth_word[0][i]])

    precisions = []
    recalls = []
    f1s = []
    precisions_pos = []
    recalls_pos = []
    f1s_pos = []
    # print(classification_report(oit_res[i], 1 / np.arange(1, len(oit_res[i]) + 1) > 0.01))
    for i in range(len(ground_truth)):
        p, r, f, s = precision_recall_fscore_support(ground_truth[i], predicted[i],
                                                     warn_for=(), average='micro')
        precisions.append(p)
        recalls.append(r)
        f1s.append(f)
        p, r, f, s = precision_recall_fscore_support(ground_truth[i], predicted[i],
                                                     warn_for=(), average='binary')
        precisions_pos.append(p)
        recalls_pos.append(r)
        f1s_pos.append(f)

    print('\n ')
    print(experiment_name + '_' + str(embed_dim))
    print('MRR, KNN, Corr, Mean F1, Mean F1 (pos only)')
    print('%.3g & %.3g & %.3g & %.3g & %.3g' %
          (mean_reciprocal_rank(a, b, ys, cosine=cosined),
           knn(a, b, ys, k=5, cosine=cosined),
           corr_between(a, b, cosine=cosined),
           np.mean(f1s), np.mean(f1s_pos)))

    plt.figure(figsize=(14, 7))
    for i in range(len(ground_truth)):
        fpr, tpr, thres = roc_curve(ground_truth[i], [1 - e for e in distance[i]],
                                    drop_intermediate=True)
        plt.plot(fpr, tpr, alpha=0.08, color='r')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.savefig(test_results_dir + '_' + str(embed_dim) + '_ROC.svg')

    # Pick pairs at random; plot distance in A vs distance in B. Should be correlated.
    a_dists = []
    b_dists = []
    for _ in range(3000):
        i1 = random.randrange(len(a))
        i2 = random.randrange(len(a))
        a_dists.append(euclidean(a[i1], a[i2]))
        b_dists.append(euclidean(b[i1], b[i2]))
        # a_dists.append(cosine(a[i1], a[i2]))
        # b_dists.append(cosine(b[i1], b[i2]))

    # Plot.
    plt.figure(figsize=(14, 14))
    # plt.title('Check Distance Correlation Between Domains')
    plt.xlim([0, 3])
    plt.ylim([0, 3])
    # plt.xlim([0, max(a_dists)])
    # plt.ylim([0, max(b_dists)])
    # plt.xlabel('Distance in Domain A')
    # plt.ylabel('Distance in Domain B')
    plt.xlabel('Distance in Language Domain')
    plt.ylabel('Distance in Vision Domain')
    # plt.plot(a_dists_norm[0], b_dists_norm[0], '.')
    # plt.plot(np.arange(0, 2) / 20, np.arange(0, 2) / 20, 'k-', lw=3)
    plt.plot(a_dists, b_dists, 'o', alpha=0.5)
    plt.plot(np.arange(0, 600), np.arange(0, 600), 'k--', lw=3, alpha=0.5)
    # plt.text(-0.001, -0.01, 'Corr: %.3f' % (pearsonr(a_dists, b_dists)[0]), fontsize=20)
    plt.savefig(test_results_dir + '_' + str(embed_dim) + '_CORR.svg')

    # Inspect embedding distances.
    clas = 5  # Base class.
    i_clas = [i for i in range(len(ys)) if ys[i].item() == clas]
    i_clas_2 = np.random.choice(i_clas, len(i_clas), replace=False)
    clas_ref = 4  # Comparison class.
    i_clas_ref = [i for i in range(len(ys)) if ys[i].item() == clas_ref]

    ac = np.array([a[i] for i in i_clas])
    bc = np.array([b[i] for i in i_clas])
    ac2 = np.array([a[i] for i in i_clas_2])
    bc2 = np.array([b[i] for i in i_clas_2])
    ac_ref = np.array([a[i] for i in i_clas_ref])

    aa_diff_ref = norm(ac[:min(len(ac), len(ac_ref))] - ac_ref[:min(len(ac), len(ac_ref))],
                       ord=2, axis=1)
    ab_diff = norm(ac - bc2, ord=2, axis=1)
    aa_diff = norm(ac - ac2, ord=2, axis=1)
    bb_diff = norm(bc - bc2, ord=2, axis=1)
    # aa_diff_ref = [cosine(ac[:min(len(ac), len(ac_ref))][i], ac_ref[:min(len(ac), len(ac_ref))][i])
    #                for i in range(len(ac[:min(len(ac), len(ac_ref))]))]
    # ab_diff = [cosine(ac[i], bc2[i]) for i in range(len(ac))]
    # aa_diff = [cosine(ac[i], ac2[i]) for i in range(len(ac))]
    # bb_diff = [cosine(bc[i], bc2[i]) for i in range(len(ac))]

    bins = np.linspace(0, 0.1, 100)
    plt.figure(figsize=(14, 7))
    plt.hist(ab_diff, bins, alpha=0.5, label='between embeddings')
    plt.hist(aa_diff, bins, alpha=0.5, label='within embedding A')
    plt.hist(bb_diff, bins, alpha=0.5, label='within embedding B')
    plt.hist(aa_diff_ref, bins, alpha=0.5, label='embedding A, from class ' + str(clas_ref))
    plt.title('Embedding Distances - Class: ' + str(clas))
    plt.xlabel('L2 Distance')
    plt.ylabel('Count')
    plt.legend()

    # labelencoder.classes_
    classes_to_keep = [36, 6, 9, 46, 15, 47, 50, 22, 26, 28]
    print(labelencoder.inverse_transform(classes_to_keep))
    ab_norm = [e for i, e in enumerate(ab) if ys[i % len(ys)] in classes_to_keep]
    ys_norm = [e for e in ys if e in classes_to_keep]
    color_index = {list(set(ys_norm))[i]: i for i in range(len(set(ys_norm)))}
    markers = ["o", "v", "^", "s", "*", "+", "x", "D", "h", "4"]
    marker_index = {list(set(ys_norm))[i]: markers[i] for i in range(len(set(ys_norm)))}

    embedding = umap.UMAP(n_components=2).fit_transform(ab_norm)  # metric='cosine'

    # Plot UMAP embedding of embeddings for all kept classes.
    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
    mid = len(ys_norm)

    ax1.set_title('Language UMAP')
    for e in list(set(ys_norm)):
        x1 = [embedding[:mid, 0][i] for i in range(len(ys_norm)) if ys_norm[i] == e]
        x2 = [embedding[:mid, 1][i] for i in range(len(ys_norm)) if ys_norm[i] == e]
        ax1.scatter(x1, x2,
                    marker=marker_index[int(e)],
                    alpha=0.5,
                    c=[sns.color_palette("colorblind", 10)[color_index[int(e)]]],
                    label=labelencoder.inverse_transform([int(e)])[0])
    ax1.set_xlim([min(embedding[:, 0]) - 4, max(embedding[:, 0]) + 4])
    ax1.set_ylim([min(embedding[:, 1]) - 4, max(embedding[:, 1]) + 4])
    ax1.grid(True)
    ax1.legend(loc='upper center', bbox_to_anchor=(1.1, -0.08),
               fancybox=True, shadow=True, ncol=5)

    ax2.set_title('Vision UMAP')
    for e in list(set(ys_norm)):
        x1 = [embedding[mid:, 0][i] for i in range(len(ys_norm)) if ys_norm[i] == e]
        x2 = [embedding[mid:, 1][i] for i in range(len(ys_norm)) if ys_norm[i] == e]
        ax2.scatter(x1, x2,
                    marker=marker_index[int(e)],
                    alpha=0.5,
                    c=[sns.color_palette("colorblind", 10)[color_index[int(e)]]])
    ax2.set_xlim([min(embedding[:, 0]) - 4, max(embedding[:, 0]) + 4])
    ax2.set_ylim([min(embedding[:, 1]) - 4, max(embedding[:, 1]) + 4])
    ax2.grid(True)

    plt.savefig(test_results_dir + '_' + str(embed_dim) + '_UMAP_wl.svg', bbox_inches='tight')
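# The load_embeddings / save_embeddings pair used in test() simply caches the
# per-batch embedding arrays on disk so repeated runs can skip the forward passes.
# A minimal sketch under that assumption (the helpers' real bodies are not part of
# this excerpt):
import os

import numpy as np


def save_embeddings(path, batches):
    # Collapse the list of per-batch (batch, dim) arrays into one matrix and cache it.
    np.save(path, np.concatenate(batches, axis=0))


def load_embeddings(path):
    # Return None when nothing is cached yet, so the caller recomputes embeddings.
    if not os.path.exists(path):
        return None
    # Wrap the cached matrix in a list so callers can np.concatenate it like the
    # original list of batches.
    return [np.load(path)]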
def save_char_embeddings(model, char_to_idx, filename='mimick_char_embeddings'):
    char_embeddings = {}
    for char, idx in char_to_idx.items():
        char_embeddings[char] = torch_to_numpy(
            model.model.mimick_lstm.embeddings.weight.data[idx])
    save_embeddings(char_embeddings, filename)
        # (tail of a copy call inside start_store; the beginning of the statement
        # is truncated in this excerpt)
        os.path.join(store_path, photo_name))

    products_df_final = products_df_final.sort_values(by="photo")
    products_df_final.to_csv(os.path.join("..", "utils", "products.csv"),
                             index=False)


if __name__ == "__main__":
    seed = 2018
    csvs_path = os.path.join("..", "notebooks")
    dataset_path = os.path.join("..", "photos_resized")
    store_path = os.path.join("..", "static", "images", "store")

    # remove .gitkeep placeholder files
    remove_gitkeep(store_path)
    remove_gitkeep(os.path.join("..", "static", "images", "recommend"))
    remove_gitkeep(os.path.join("..", "static", "images", "user"))

    # create the store dataframe and place the images in the store directory
    start_store(seed, csvs_path, dataset_path, store_path)

    resizing = (224, 224)
    shape_output = 1024
    model = MobileNet(input_shape=(224, 224, 3),
                      weights="imagenet",
                      include_top=False,
                      pooling="avg")

    # create the embeddings file for the store images
    save_embeddings(store_path, "embeddings.npy", model, preprocess_input,
                    shape_output, resizing)
def build(train_data, save_word_embeddings=False, save_model=False):
    """Builds and trains the sentiment analysis model.

    Args:
        train_data (df): The data the model is to be built from.
        save_word_embeddings (bool): If true, will save the embedding data.
        save_model (bool): If true, will save the model.

    Returns:
        model: The sentiment analyser model, fit to the training data.
    """
    global corpus_vocabulary

    X_train = train_data['content']
    y_train = train_data['label']
    vocab_size = 10000  # TODO automatic

    train_sequences = corpus_vocabulary.texts_to_sequences(X_train.values)
    padded_train = keras.preprocessing.sequence.pad_sequences(train_sequences,
                                                              padding='post',
                                                              maxlen=140)

    model = keras.Sequential()
    model.add(keras.layers.Embedding(vocab_size, 40))
    model.add(keras.layers.GlobalAveragePooling1D())
    model.add(keras.layers.Dense(4, activation=tf.nn.relu))
    model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
    model.summary()
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

    split = int(len(X_train) / 4)  # hold out a quarter of the comments for validation
    x_val = padded_train[:split]
    partial_x_train = padded_train[split:]
    y_val = y_train[:split]
    partial_y_train = y_train[split:]

    model.fit(partial_x_train, partial_y_train, epochs=150, batch_size=512,
              validation_data=(x_val, y_val), verbose=1)

    if save_word_embeddings:
        word_index = corpus_vocabulary.word_index
        save_embeddings(model, word_index)

    if save_model:
        import datetime as dt
        now = dt.datetime.now().__str__()
        model.save(os.getcwd() + '/saved_model_data/models/model_' + now + '.h5')
        print("Model saved.")

    return model
def execute(sentence, save_word_embeddings=False, plot_loss_acc=False):
    """Returns the sentiment of the parsed sentence.

    Args:
        sentence (str): The sentence to be analysed.
        save_word_embeddings (bool): If true, will save the embedding data.
        plot_loss_acc (bool): If true, will plot the loss and accuracy during
            the training of the data.

    Returns:
        score (float): The sentiment score of the sentence.
            1 - cyber abusive, 0 - not cyber abusive.
    """
    parsed_test = pd.DataFrame({"content": pd.Series(sentence)})

    current_directory = os.getcwd()
    train_data = pd.read_csv(current_directory + "/data/DataTurks/dump.csv")
    train_data = train_data.sample(frac=1).reset_index(drop=True)

    X_train = train_data['content'][:18000]
    X_test = parsed_test['content']
    y_train = train_data['label'][:18000]

    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(train_data['content'])
    train_sequences = tokenizer.texts_to_sequences(X_train.values)
    test_sequences = tokenizer.texts_to_sequences(X_test.values)

    vocab_size = 10000
    padded_train = keras.preprocessing.sequence.pad_sequences(train_sequences,
                                                              padding='post',
                                                              maxlen=140)
    padded_test = keras.preprocessing.sequence.pad_sequences(test_sequences,
                                                             padding='post',
                                                             maxlen=140)

    model = keras.Sequential()
    model.add(keras.layers.Embedding(vocab_size, 40))
    model.add(keras.layers.GlobalAveragePooling1D())
    model.add(keras.layers.Dense(4, activation=tf.nn.relu))
    model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
    model.summary()
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

    split = int(len(X_train) / 4)  # hold out a quarter of the comments for validation
    x_val = padded_train[:split]
    partial_x_train = padded_train[split:]
    y_val = y_train[:split]
    partial_y_train = y_train[split:]

    history = model.fit(partial_x_train, partial_y_train, epochs=120, batch_size=512,
                        validation_data=(x_val, y_val), verbose=1)

    if save_word_embeddings:
        word_index = tokenizer.word_index
        save_embeddings(model, word_index)

    if plot_loss_acc:
        history_dict = history.history
        epochs = range(1, len(history_dict['acc']) + 1)
        plot_accuracy(epochs, history_dict['acc'], history_dict['val_acc'])
        plt.clf()
        plot_loss(epochs, history_dict['loss'], history_dict['val_loss'])

    sentiment_score = model.predict(padded_test)
    return str(sentiment_score[0][0])
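# Both build() and execute() call save_embeddings(model, word_index). A minimal
# sketch of that helper, assuming the Embedding layer is the model's first layer
# and that the output is the usual projector-friendly vecs.tsv / meta.tsv pair;
# the output directory and file names are assumptions, not from the original code.
import io
import os


def save_embeddings(model, word_index, out_dir='saved_model_data/embeddings'):
    os.makedirs(out_dir, exist_ok=True)
    # Embedding layer weights: shape (vocab_size, embedding_dim).
    weights = model.layers[0].get_weights()[0]
    with io.open(os.path.join(out_dir, 'vecs.tsv'), 'w', encoding='utf-8') as vecs, \
         io.open(os.path.join(out_dir, 'meta.tsv'), 'w', encoding='utf-8') as meta:
        for word, index in word_index.items():
            if index >= weights.shape[0]:
                continue  # the tokenizer's word_index can exceed the embedding vocabulary
            meta.write(word + '\n')
            vecs.write('\t'.join(str(x) for x in weights[index]) + '\n')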