def nbow():
    ''' NBOW baseline '''
    WV_CORPUS = "origin"
    embeddings, word_indices = get_embeddings(corpus=WV_CORPUS,
                                              dim=embedding_dim)

    train_set = SemEvalDataLoader(verbose=False, ekphrasis=True).get_data(
        task="A", years=None, datasets=None, only_semEval=True)
    test_data = SemEvalDataLoader(verbose=False,
                                  ekphrasis=True).get_gold(task="A")

    X = [obs[1] for obs in train_set]
    y = [label2id[obs[0]] for obs in train_set]
    X_test = [obs[1] for obs in test_data]
    y_test = [label2id[obs[0]] for obs in test_data]

    task = 'clf'
    print("-----------------------------")
    if task == 'clf':
        print('LogisticRegression')
    else:
        print("LinearSVC")

    # Plain bag-of-words baseline
    bow = bow_model(task)
    bow.fit(X, y)
    predict = bow.predict(X_test)
    results = eval_clf(predict, y_test)
    for res, val in results.items():
        print("{}: {:.3f}".format(res, val))
    load_result_f1(predict, y_test)

    # Neural bag-of-words baseline (pre-trained word embeddings)
    nbow = nbow_model(task, embeddings, word_indices)
    nbow.fit(X, y)
    predict = nbow.predict(X_test)
    results = eval_clf(predict, y_test)
    for res, val in results.items():
        print("{}: {:.3f}".format(res, val))
    load_result_f1(predict, y_test)
    print("-----------------------------")
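# A minimal sketch (not the project's actual code) of what an
# nbow_model(task, embeddings, word_indices) factory could look like: average the
# pre-trained word vectors of each example and feed the result to the linear
# classifier named in the prints above (LogisticRegression for 'clf', else
# LinearSVC). `MeanEmbeddingVectorizer` and the assumption that each example is a
# whitespace-tokenizable string are illustrative only.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


class MeanEmbeddingVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, embeddings, word_indices):
        self.embeddings = embeddings      # vocab_size x dim matrix
        self.word_indices = word_indices  # token -> row index

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        dim = self.embeddings.shape[1]
        out = np.zeros((len(X), dim))
        for i, text in enumerate(X):
            rows = [self.embeddings[self.word_indices[w]]
                    for w in text.split() if w in self.word_indices]
            if rows:
                out[i] = np.mean(rows, axis=0)
        return out


def nbow_model_sketch(task, embeddings, word_indices):
    clf = LogisticRegression() if task == 'clf' else LinearSVC()
    return Pipeline([('nbow', MeanEmbeddingVectorizer(embeddings, word_indices)),
                     ('clf', clf)])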
def main():
    with open(CONFIG) as reader:
        config = yaml.safe_load(reader)

    gamefiles = glob(join(config['main']['games_path'], '*.ulx'))
    print('Found {} games.'.format(len(gamefiles)))
    # pprint(gamefiles)

    # Pick a game.
    gamefile = gamefiles[1]

    requested_infos = EnvInfos(
        admissible_commands=True,
        command_templates=True,
        description=True,
        entities=True,
        has_lost=True,
        has_won=True,
        inventory=True,
        max_score=True,
        objective=True,
        verbs=True,
        extras=["recipe"],
    )

    env_id = textworld.gym.register_games([gamefile], requested_infos)
    env_id = textworld.gym.make_batch(
        env_id,
        batch_size=config['main']['environment_batch_size'],
        parallel=True)
    env = gym.make(env_id)

    agent = CustomizableAgent(config, *get_embeddings(config['main']))

    play(env, agent, config['main'])
    play(env, agent, config['main'], evaluation=True)

    agent.cleanup()
    return
util.print_flag('Loading')

util.print_flag('Dataset', big=False)
corpus: ColumnCorpus = ColumnCorpus(data_folder='resources/data/',
                                    train_file='concat_PharmaCoNER.conll',
                                    dev_file=None,
                                    test_file='test_PharmaCoNER.conll',
                                    column_format={
                                        0: 'text',
                                        1: 'begin',
                                        2: 'end',
                                        3: 'ner'
                                    })

util.print_flag('Embeddings', big=False)
pooling_op = 'min'
embeddings: StackedEmbeddings = util.get_embeddings(pooling_op)

util.print_flag('Training')
tag_type = 'ner'
model = f'PharmaCoNER-PCE_{pooling_op}-BPEmb-FT-w2v'
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=corpus.make_tag_dictionary(tag_type=tag_type),
    tag_type=tag_type,
    hidden_size=256,
    rnn_layers=1,
    dropout=0.0)
print(tagger)

trainer: ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train(f'resources/models/{model}',
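# A hedged sketch of what util.get_embeddings(pooling_op) might assemble, given the
# model name 'PCE_{pooling_op}-BPEmb-FT-w2v' above: pooled contextualized (Flair)
# embeddings, byte-pair embeddings, FastText, and a domain word2vec model, stacked.
# The language choice ('es') and the custom w2v path are assumptions for
# illustration, not the project's confirmed configuration.
from flair.embeddings import (BytePairEmbeddings, PooledFlairEmbeddings,
                              StackedEmbeddings, WordEmbeddings)


def get_embeddings_sketch(pooling_op: str) -> StackedEmbeddings:
    return StackedEmbeddings([
        PooledFlairEmbeddings('spanish-forward', pooling=pooling_op),
        PooledFlairEmbeddings('spanish-backward', pooling=pooling_op),
        BytePairEmbeddings('es'),
        WordEmbeddings('es'),  # FastText
        # Hypothetical path to a domain word2vec model in gensim format.
        WordEmbeddings('resources/embeddings/domain_w2v.gensim'),
    ])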
def main(args):
    # Load word embedding matrix and char embedding matrix
    word_emb_path = os.path.join(args.data_dir, args.word_emb_file)
    word_emb_matrix, word2id = util.get_embeddings(word_emb_path,
                                                   args.word_emb_size,
                                                   97572, 'word')
    print('Got {} word embeddings'.format(len(word2id)))
    char_emb_path = os.path.join(args.data_dir, args.char_emb_file)
    char_emb_matrix, char2id = util.get_embeddings(char_emb_path,
                                                   args.char_emb_size,
                                                   94, 'char')
    print('Got {} char embeddings'.format(len(char2id)))
    bio2id = {'B': B_IN_BIO, 'I': I_IN_BIO, 'O': O_IN_BIO}

    for phase in ('train', 'dev'):
        # Read lines from downloaded files
        src_list, bio_list, tgt_list = load_data(args.data_dir, phase=phase)
        print('Read {} lines for the {} set'.format(len(src_list), phase))
        assert len(src_list) == len(bio_list) and len(bio_list) == len(tgt_list), \
            'src({}), bio({}), tgt({})'.format(len(src_list), len(bio_list), len(tgt_list))

        # Set up for mapping examples to word/char IDs
        n = len(src_list)
        max_c_len = args.max_c_len if phase == 'train' else args.max_c_len_test
        max_q_len = args.max_q_len if phase == 'train' else args.max_q_len_test
        max_w_len = args.max_w_len

        # Create empty arrays of padding
        src_ids = np.full((n, max_c_len), PAD_ID, dtype=np.int32)
        src_c_ids = np.full((n, max_c_len, max_w_len), PAD_ID, dtype=np.int32)
        bio_ids = np.full((n, max_c_len), O_IN_BIO, dtype=np.int32)
        tgt_ids = np.full((n, max_q_len), PAD_ID, dtype=np.int32)
        tgt_c_ids = np.full((n, max_q_len, max_w_len), PAD_ID, dtype=np.int32)

        # Fill arrays with IDs
        for i, (src, bio, tgt) in tqdm(enumerate(zip(src_list, bio_list, tgt_list)), total=n):
            src_words = src.split()[:max_c_len]
            src_ids[i, :len(src_words)] = [word2id.get(w, UNK_ID) for w in src_words]
            src_chars = [[c for c in s] for s in src_words]
            for j, chars in enumerate(src_chars):
                chars = chars[:max_w_len]
                src_c_ids[i, j, :len(chars)] = [char2id.get(c, UNK_ID) for c in chars]

            bio_words = bio.split()[:max_c_len]
            bio_ids[i, :len(bio_words)] = [bio2id[w] for w in bio_words]

            tgt_words = tgt.split()[:max_q_len]
            tgt_ids[i, :len(tgt_words)] = [word2id.get(w, UNK_ID) for w in tgt_words]
            tgt_chars = [[c for c in s] for s in tgt_words]
            for j, chars in enumerate(tgt_chars):
                chars = chars[:max_w_len]
                # Bug fix: the original wrote target character IDs into src_c_ids
                tgt_c_ids[i, j, :len(chars)] = [char2id.get(c, UNK_ID) for c in chars]

        # Save arrays filled with IDs
        with h5py.File(os.path.join(args.data_dir, 'data.hdf5'), 'a') as hdf5_fh:
            phase_group = hdf5_fh.create_group(phase)
            phase_group.create_dataset('src_ids', data=src_ids, chunks=True)
            phase_group.create_dataset('src_c_ids', data=src_c_ids, chunks=True)
            phase_group.create_dataset('bio_ids', data=bio_ids, chunks=True)
            phase_group.create_dataset('tgt_ids', data=tgt_ids, chunks=True)

    # Save embedding matrices
    word_emb_path = os.path.join(args.data_dir, 'word_embs.npy')
    np.save(word_emb_path, word_emb_matrix)
    char_emb_path = os.path.join(args.data_dir, 'char_embs.npy')
    np.save(char_emb_path, char_emb_matrix)
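# A hedged sketch of the util.get_embeddings(path, emb_size, vocab_size, name)
# helper the preprocessing above relies on: read a GloVe-style text file, reserve
# rows for padding and unknown tokens, and return (embedding_matrix, token2id).
# The file format and the assumption PAD_ID=0, UNK_ID=1 are illustrative, not the
# project's confirmed behavior.
import numpy as np


def get_embeddings_sketch(emb_path, emb_size, vocab_size, name):
    # `name` ('word' or 'char') is unused in this sketch.
    emb_matrix = np.zeros((vocab_size + 2, emb_size), dtype=np.float32)
    token2id = {'--PAD--': 0, '--UNK--': 1}
    with open(emb_path, 'r', encoding='utf-8') as fh:
        for line in fh:
            parts = line.rstrip().split(' ')
            token, vector = parts[0], np.asarray(parts[1:], dtype=np.float32)
            if token in token2id or vector.shape[0] != emb_size:
                continue
            idx = len(token2id)
            if idx >= emb_matrix.shape[0]:
                break
            token2id[token] = idx
            emb_matrix[idx] = vector
    return emb_matrix, token2id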
def reply_to_message(**payload):
    data = payload['data']
    if 'user' not in data or ut.is_bot(data['user'], BOT_ID):
        return

    webclient = payload['web_client']
    user = data['user']
    channel = data['channel']
    message = data['text']

    if ut.is_public(channel) and not ut.is_bot_tagged(message, BOT_ID_REGEX):
        return

    # Log every incoming message to a dated file
    with open(str(pd.datetime.now().date()) + '.txt', 'a') as logFile:
        logFile.write(str(pd.datetime.now()) + ' \t user : ' + str(user) +
                      ' Text: ' + str(message) + '\n')

    message = ut.get_clean_message(message, BOT_ID_REGEX)

    # if ut.is_single_word(message):
    #     if message in single_word_set:
    #         reply = ' '.join(single_word_set[message])
    #         ut.send_reply(user, webclient, channel, reply)
    #         return

    if len(message) < 2:
        ut.send_reply(user, webclient, channel,
                      " Hi! Please ask me detailed questions ")
        return

    # Embed the message and retrieve the closest known questions
    message_embedding = ut.get_embeddings(model, [message])
    top_index, top_scores = ut.get_top_replies(embedding_matrix, message_embedding)
    top_replies = [answers[ind] for ind in top_index]
    top_q = [questions[ind] for ind in top_index]

    if top_replies[0] == '-1':
        ut.send_reply(user, webclient, channel, SYLLABUS)
        return

    if top_scores[0] < 0.5:
        with open('Unanswered.csv', 'a') as f:
            f.write(str(message) + ',\n')
        ut.send_reply(user, webclient, channel,
                      'Sorry, I didn\'t get that!\nPlease elaborate your question')
    elif 0.5 < top_scores[0] < 0.6:
        with open('Unanswered.csv', 'a') as f:
            f.write(str(message) + ',\n')
        if len(top_q[0]) < 9:
            ut.send_reply(user, webclient, channel, top_replies[0])
        else:
            reply = ('\n*I found these matching queries. Please see if one answers '
                     'your question, else try elaborating your question.* \n')
            for index in range(len(top_q)):
                reply = (reply + '\n' + str(index + 1) + '. ' + top_q[index] +
                         ' \n ' + top_replies[index] + ' \n')
            reply = (reply + '\n' +
                     '*If your query is still unanswered, please reach out to your college faculty.*')
            ut.send_reply(user, webclient, channel, reply)
    else:
        ut.send_reply(user, webclient, channel, top_replies[0])
# Import the dataset
dataset = pd.read_csv('./QA.csv', header=None, encoding="utf-8")

# Split dataset into questions and answers
questions = dataset.iloc[:, 0].values
answers = dataset.iloc[:, 1].values

# Open Unanswered questions CSV for logging
# UQ = open('Unanswered.csv', 'w')

print('############ trying to load model ############')
# Load Model
model = tf.saved_model.load('use/', tags=None)
print('############ model is now loaded ############')

# Get Embedding Matrix
embedding_matrix = ut.get_embeddings(model, questions)

# Setup Slack Client API
rtm_client = RTMClient(token=SLACK_TOKEN, connect_method='rtm.start')
print('############ Starting RTM Client ############')
# rtm_client.start()
print('## Started ###')

# Start the client if it didn't start implicitly.
try:
    print('############ Inside Try ############')
    rtm_client.start()
    print('############ Exiting Try with No Exceptions ############')
except:
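# A hedged sketch of the ut.get_embeddings / ut.get_top_replies helpers this chatbot
# relies on, assuming the SavedModel in 'use/' is the Universal Sentence Encoder
# (callable on a batch of strings) and that similarity is plain cosine similarity.
# Function names and the top-k size are illustrative, not the project's confirmed
# implementation.
import numpy as np
import tensorflow as tf


def get_embeddings_sketch(model, texts):
    # USE returns a (len(texts), 512) float tensor; convert to numpy for indexing.
    return model(tf.constant(list(texts))).numpy()


def get_top_replies_sketch(embedding_matrix, message_embedding, k=3):
    # Cosine similarity between the message and every stored question.
    q_norm = embedding_matrix / np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
    m_norm = message_embedding / np.linalg.norm(message_embedding, axis=1, keepdims=True)
    scores = q_norm @ m_norm[0]
    top_index = np.argsort(-scores)[:k]
    return top_index, scores[top_index]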
file_list = []
allowed_extensions = ['.jpg']
for file in os.listdir(args['image']):
    valid_file = False
    for extension in allowed_extensions:
        if file.endswith(extension):
            valid_file = True
            break
    if valid_file:
        file_list.append(args['image'] + '/' + file)

print("detecting faces")
images, img_with_faces = util.load_and_align_data(file_list)

print("getting embeddings")
embeddings = util.get_embeddings(args['model'], images)

print('finding distinct groups')
faces = util.get_face_labels(embeddings,
                             max_clusters=int(args['max_clusters']),
                             opt_cluster_threshold=int(args['threshold']))

if args['output'] is not None:
    i = 0
    for img in img_with_faces:
        person_name = 'person' + str(faces[i])
        cls_folder = args['output'] + '/' + person_name
        if not os.path.isdir(cls_folder):
            os.makedirs(cls_folder)
        new_path = cls_folder + '/' + str(i) + '.jpg'
        shutil.move(img, new_path)
        i += 1
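# A hedged sketch of what a util.get_face_labels(embeddings, max_clusters,
# opt_cluster_threshold) helper could do: try several cluster counts on the face
# embeddings and keep the assignment with the best silhouette score. KMeans and
# silhouette scoring are assumptions for illustration; the original helper may pick
# the cluster count differently (e.g. with a distance threshold).
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def get_face_labels_sketch(embeddings, max_clusters=10, opt_cluster_threshold=2):
    embeddings = np.asarray(embeddings)
    best_labels, best_score = np.zeros(len(embeddings), dtype=int), -1.0
    for k in range(max(2, opt_cluster_threshold), max_clusters + 1):
        if k >= len(embeddings):
            break
        labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(embeddings)
        score = silhouette_score(embeddings, labels)
        if score > best_score:
            best_labels, best_score = labels, score
    return best_labels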
def __init__(self, dataset, train, trans, device, params=None):
    self.dataset = dataset
    self.train = train
    self.basic = trans.basic
    self.augment = trans.augmentation
    self.params = params
    self.concatenated = (
        type(dataset) == torch.utils.data.dataset.ConcatDataset)

    if self.concatenated:
        if params["dset"] == "CIFAR10":
            self.data = torch.cat([
                torch.from_numpy(self.dataset.datasets[i].data)
                for i in range(len(self.dataset.datasets))
            ], dim=0)
            self.targets = torch.cat([
                torch.tensor(self.dataset.datasets[i].targets)
                for i in range(len(self.dataset.datasets))
            ], dim=0)
        elif params["dset"] == "MNIST" or params["dset"] == "FASHIONMNIST":
            self.data = torch.cat([
                self.dataset.datasets[i].data
                for i in range(len(self.dataset.datasets))
            ], dim=0)
            self.targets = torch.cat([
                self.dataset.datasets[i].targets
                for i in range(len(self.dataset.datasets))
            ], dim=0)
    else:
        self.targets = self.dataset.targets
        self.data = self.dataset.data

    if self.train:
        # Creates three lists of indices that will be called later.
        # The first index will be a specific image, the second will be the same
        # image (augmented), and the third will be a random image (most likely
        # different if the dataset is balanced).
        self.original_indices = np.arange(self.data.shape[0])
        regular_dataloader = torch.utils.data.DataLoader(
            SingleDataset(dataset, augment=False, trans=trans, params=params),
            batch_size=params["batch_size"],
            shuffle=False)  # Do not shuffle

        print("Running data through previous network...")
        regular_embeddings, augmented_embeddings = get_embeddings(
            regular_dataloader, device, params)

        random_indices = np.copy(self.original_indices)
        np.random.shuffle(random_indices)
        augmented_distances, random_distances = [], []

        # if params["show_plots"]:
        #     # plt.title("Augmented and Random Distances (boundary is {:.3f})".format(boundary))
        #     # plt.axvline(x=rand_min, color='r')
        #     # plt.axvline(x=rand_max, color='r')
        #     plt.hist(random_distances.cpu(), bins=200, label="Random")
        #     # plt.hist(combined.cpu(), bins=bins, label="Combined")
        #     # plt.hist(augmented_distances.cpu(), bins=bins, label="Augmented")
        #     plt.legend()
        #     plt.show()

        print("RTM index is {0} (number of pairs is {1})".format(
            params["rtm_index"], params["num_pairs"]))

        if params["rtm_index"] is None:  # Do not use RTM
            # if params["curr_epoch"] == 0:
            indices_mask = np.arange(len(self.original_indices))
            self.original_indices = self.original_indices[indices_mask]
            self.similar_indices = np.copy(self.original_indices)
            self.different_indices = random_indices[indices_mask]
        else:
            random_distances, random_indices = [], []
            indices_matrix = np.random.choice(
                self.original_indices,
                (params["num_pairs"], len(self.original_indices)))
            for i in range(params["num_pairs"]):
                shuffled_indices = np.copy(self.original_indices)
                np.random.shuffle(shuffled_indices)
                # distances.append(F.cosine_similarity(embeddings, embeddings[different_indices], 1))
                random_distances.append(
                    torch.norm(regular_embeddings[self.original_indices] -
                               regular_embeddings[indices_matrix[i, :]],
                               p=2, dim=1))
                random_indices.append(shuffled_indices)
            # random_indices = np.vstack(random_indices)
            random_distances = torch.stack(random_distances).cpu()
            # Gets the rtm_index-th nearest neighbor
            different_selection = np.argpartition(
                random_distances, params["rtm_index"],
                axis=0)[params["rtm_index"], :].numpy()
            different_indices = indices_matrix[
                different_selection,
                np.arange(self.original_indices.shape[0])]
            self.different_indices = different_indices
            self.similar_indices = np.copy(self.original_indices)

        print("Dataset is size:", len(self.original_indices))
    else:
        # Creates three lists of indices that will be called later.
        # The first index will be a specific label, the second will be the same label,
        # and the third will be a different label.
        self.labels_set = set(self.targets.numpy())
        x_original_indices, x_similar_indices, x_different_indices = [], [], []
        for label in self.labels_set:
            original_indices = np.arange(len(
                self.targets))[np.where(self.targets == label)[0]]
            similar_indices = np.copy(original_indices)
            np.random.shuffle(similar_indices)
            different_indices = np.random.choice(
                np.arange(len(
                    self.targets))[np.where(self.targets != label)[0]],
                len(original_indices))
            x_original_indices.append(torch.from_numpy(original_indices))
            x_similar_indices.append(torch.from_numpy(similar_indices))
            x_different_indices.append(torch.from_numpy(different_indices))
        self.original_indices = torch.cat(x_original_indices, dim=0)
        self.similar_indices = torch.cat(x_similar_indices, dim=0)
        self.different_indices = torch.cat(x_different_indices, dim=0)
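# A hedged sketch of the get_embeddings(dataloader, device, params) call made in the
# training branch of the __init__ above: run every batch through a previously trained
# encoder and collect embeddings for both the plain and the augmented view of each
# image. The encoder loading via params["prev_model_path"] and the
# (plain, augmented, target) batch layout are assumptions introduced for illustration.
import torch


def get_embeddings_sketch(dataloader, device, params):
    encoder = torch.load(params["prev_model_path"], map_location=device)
    encoder.eval()
    regular, augmented = [], []
    with torch.no_grad():
        for plain_batch, augmented_batch, _ in dataloader:
            regular.append(encoder(plain_batch.to(device)))
            augmented.append(encoder(augmented_batch.to(device)))
    return torch.cat(regular, dim=0), torch.cat(augmented, dim=0)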
def train():
    # Load data
    en, it = get_embeddings()  # Vocab x Embedding_dimension

    # Create data-loaders
    g_data_loader = torch.utils.data.DataLoader(CustomDataSet(en),
                                                batch_size=mini_batch_size,
                                                shuffle=True)
    d_data_loader = torch.utils.data.DataLoader(CustomDataSet(it),
                                                batch_size=mini_batch_size,
                                                shuffle=True)

    # Create models
    g = Generator(input_size=g_input_size,
                  hidden_size=g_hidden_size,
                  output_size=g_output_size)
    d = Discriminator(input_size=d_input_size,
                      hidden_size=d_hidden_size,
                      output_size=d_output_size)

    # Define loss function and optimizers
    loss_fn = torch.nn.BCELoss()
    d_optimizer = optim.Adam(d.parameters(), lr=d_learning_rate, betas=optim_betas)
    g_optimizer = optim.Adam(g.parameters(), lr=g_learning_rate, betas=optim_betas)

    if torch.cuda.is_available():
        # Move the network and the optimizer to the GPU
        g = g.cuda()
        d = d.cuda()
        loss_fn = loss_fn.cuda()

    for epoch in range(num_epochs):
        d_losses = []
        g_losses = []
        start_time = timer()
        g_iter = iter(g_data_loader)

        mini_batch = 1
        for d_real_data in d_data_loader:
            # Inspired from https://github.com/devnag/pytorch-generative-adversarial-networks/blob/master/gan_pytorch.py
            for d_index in range(d_steps):
                # 1. Train D on real+fake
                d.zero_grad()  # Reset the gradients

                # 1A: Train D on real
                d_real_data = to_variable(d_real_data)  # Could add some noise to the real data later
                d_real_decision = d(d_real_data)
                d_real_error = loss_fn(d_real_decision,
                                       to_variable(torch.ones(mini_batch_size, 1)))  # ones = true
                d_real_error.backward()  # compute/store gradients, but don't change params
                d_losses.append(d_real_error.data.cpu().numpy())

                # 1B: Train D on fake
                d_gen_input = to_variable(next(g_iter))
                d_fake_data = g(d_gen_input).detach()  # detach to avoid training G on these labels
                d_fake_decision = d(d_fake_data)  # Add noise later
                d_fake_error = loss_fn(d_fake_decision,
                                       to_variable(torch.zeros(mini_batch_size, 1)))  # zeros = fake
                d_fake_error.backward()
                d_losses.append(d_fake_error.data.cpu().numpy())
                d_optimizer.step()  # Only optimizes D's parameters; changes based on stored gradients from backward()

            sys.stdout.write("[%d/%d] :: Discriminator Loss: %f \r" %
                             (mini_batch, len(en) // mini_batch_size,
                              np.asscalar(np.mean(d_losses))))
            sys.stdout.flush()
            mini_batch += 1

        mini_batch = 1
        for gen_input in g_data_loader:
            for g_index in range(g_steps):
                # 2. Train G on D's response (but DO NOT train D on these labels)
                g.zero_grad()
                gen_input = to_variable(gen_input)
                g_fake_data = g(gen_input)
                g_fake_decision = d(g_fake_data)  # Add noise later
                g_error = loss_fn(g_fake_decision,
                                  to_variable(torch.ones(mini_batch_size, 1)))  # we want to fool, so pretend it's all genuine
                g_losses.append(g_error.data.cpu().numpy())
                g_error.backward()
                g_optimizer.step()  # Only optimizes G's parameters

            sys.stdout.write("[%d/%d] :: Generator Loss: %f \r" %
                             (mini_batch, len(en) // mini_batch_size,
                              np.asscalar(np.mean(g_losses))))
            sys.stdout.flush()
            mini_batch += 1

        print("Epoch {} : Discriminator Loss: {:.5f}, Generator Loss: {:.5f}, Time elapsed {:.2f} mins"
              .format(epoch, np.asscalar(np.mean(d_losses)),
                      np.asscalar(np.mean(g_losses)),
                      (timer() - start_time) / 60))

    return g
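# A hedged sketch of the Generator / Discriminator modules the training loop above
# expects: the generator maps source-language (en) embeddings into the target (it)
# embedding space, and the discriminator outputs a probability (BCELoss is used), so
# it ends in a sigmoid. Layer shapes follow the constructor arguments used above; the
# activation choices are assumptions, not the project's confirmed architecture.
import torch.nn as nn


class Generator(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_size, output_size),
        )

    def forward(self, x):
        return self.net(x)


class Discriminator(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_size, output_size),
            nn.Sigmoid(),  # BCELoss in train() expects probabilities in [0, 1]
        )

    def forward(self, x):
        return self.net(x)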