def main(argv=None):
    args = commandLineParser.parse_args()
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/get_bert_dists.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')

    # Get the document of all training (unshuffled) responses and convert each word to an ID.
    # Initialise an np array sized to the largest possible word ID (word_freqs), with every
    # element set to 0. Loop through the list of response word IDs and increment the
    # corresponding position in word_freqs for each ID encountered. Then loop through
    # word_freqs: if an element is > 0, set it to 1/element; if it is 0, set it to 1.5
    # (the value used by many real IDF implementations for unseen words). Save word_freqs
    # as a numpy array that can be loaded by step_train_simGrid.py, converted to a tf tensor,
    # and indexed with tf.gather for a list of word IDs.
    response_path = '/home/alta/relevance/vr311/data_vatsal/BULATS/content_words/responses.txt'
    wlist_path = '/home/alta/relevance/vr311/data_vatsal/BULATS/content_words/tfrecords_train/input.wlist.index'
    data, _ = text_to_array(response_path, wlist_path, strip_start_end=False)

    # Note: input.wlist.index starts word IDs at 0, but all word IDs in 'data' start at 1
    # (i.e. they correspond to the line number in input.wlist.index), so a word ID of 0 is impossible.
    word_freqs = np.zeros(62415 + 1)
    print('GOT HERE')
    print(data[:2])
    for w_id in np.nditer(data):
        word_freqs[w_id] += 1
    print('WOW, HERE NOW')
    for w in np.nditer(word_freqs, op_flags=['readwrite']):
        if w > 0:
            w[...] = 1 / w
        else:
            w[...] = 1.5
    print('PHEW')

    # Word ID of 0 is impossible, so flag it.
    word_freqs[0] = -1
    np.savetxt('/home/alta/relevance/vr311/data_vatsal/BULATS/content_words/idf.txt', word_freqs)

    # For each of the ten smallest inverse frequencies, print the word IDs whose weight is at
    # most that value (i.e. the most frequent words, plus the flagged ID 0).
    sort_word_freqs = np.sort(word_freqs)
    top_ten = sort_word_freqs[:10]
    for val in np.nditer(top_ten):
        print(np.where(val >= word_freqs))
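# The comment block above says step_train_simGrid.py loads idf.txt, converts it to a tf
# tensor, and applies tf.gather over a list of word IDs. Below is a minimal sketch of that
# downstream usage, assuming the numpy/TensorFlow-1.x imports already used in these scripts.
# The function name, placeholder, and example batch are illustrative assumptions; only the
# idf.txt path and the gather pattern come from the code above.
def _example_gather_idf_weights():
    idf_path = '/home/alta/relevance/vr311/data_vatsal/BULATS/content_words/idf.txt'
    word_freqs = np.loadtxt(idf_path, dtype=np.float32)        # shape [62415 + 1]

    idf = tf.constant(word_freqs, dtype=tf.float32)
    word_ids = tf.placeholder(tf.int32, shape=[None, None])    # padded batch of response word IDs
    idf_weights = tf.gather(idf, word_ids)                      # per-token inverse frequency weights

    with tf.Session() as sess:
        batch = np.array([[5, 17, 3], [8, 2, 9]], dtype=np.int32)
        print(sess.run(idf_weights, feed_dict={word_ids: batch}))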
def main(argv=None):
    args = commandLineParser.parse_args()
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/get_bert_dists.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')

    resp_path = '/home/alta/relevance/vr311/data_vatsal/BULATS/responses.txt'
    wlist_path = '/home/alta/relevance/vr311/data_vatsal/input.wlist.index'
    resps, resp_lens = text_to_array(resp_path, wlist_path)
    # print(resps.shape)

    save_path = '/home/alta/relevance/vr311/models_min_data/baseline/ATM'
    path = os.path.join(save_path, 'sorted_resps.txt')
    np.savetxt(path, resps)
    path = os.path.join(save_path, 'sorted_resp_lens.txt')
    np.savetxt(path, resp_lens)
def main(argv=None):
    args = commandLineParser.parse_args()
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/step_compute_prompt_embeddings.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')

    prompts, prompt_lens = text_to_array(args.prompt_path, input_index=args.wlist_path)

    # Initialize and run the model
    atm = AttentionTopicModel(network_architecture=None,
                              load_path=args.load_path,
                              debug_mode=args.debug,
                              epoch=args.epoch)
    atm.get_prompt_embeddings(prompts, prompt_lens, os.path.join(args.load_path, 'model'))
def main(argv=None):
    args = commandLineParser.parse_args()
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/step_compute_prompt_embeddings.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')

    if args.strip_start_end:
        print("Stripping the first and last word (should correspond to <s> and </s> marks) from "
              "the input prompts. Should only be used with legacy dataset formatting")
    prompts, prompt_lens = text_to_array(args.prompt_path, args.wlist_path,
                                         strip_start_end=args.strip_start_end)

    # Initialize and run the model
    atm = AttentionTopicModel(network_architecture=None,
                              load_path=args.load_path,
                              debug_mode=args.debug,
                              epoch=args.epoch)
    atm.get_prompt_embeddings(prompts, prompt_lens, args.save_path)
def main(argv=None):
    args = commandLineParser.parse_args()
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/step_train_attention_grader.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    topics, topic_lens = text_to_array(args.topic_path, input_index=args.wlist_path)

    atm = HierarchicialAttentionTopicModel(network_architecture=None,
                                           seed=args.seed,
                                           name=args.name,
                                           save_path='./',
                                           load_path=args.load_path,
                                           debug_mode=args.debug,
                                           epoch=args.epoch)
    atm.fit(train_data=args.train_data,
            valid_data=args.valid_data,
            load_path=args.init,
            topics=topics,
            topic_lens=topic_lens,
            unigram_path=args.topic_count_path,
            train_size=args.train_size,
            learning_rate=args.learning_rate,
            lr_decay=args.lr_decay,
            dropout=args.dropout,
            attention_dropout=args.attention_dropout,
            distortion=args.distortion,
            batch_size=args.batch_size,
            optimizer=tf.train.AdamOptimizer,
            optimizer_params={},
            n_epochs=args.n_epochs,
            n_samples=args.n_samples,
            epoch=0)
    atm.save()
def main(argv=None):
    args = commandLineParser.parse_args()
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/step_train_attention_grader.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    train_size = get_train_size_from_meta(args.meta_data_path)
    topics, topic_lens = text_to_array(args.topic_path, args.wlist_path, strip_start_end=False)

    # Augmented prompt files: ten back-translations followed by nine EDA augmentations.
    # The first file maps to the aug_topics/aug_topic_lens keyword arguments of fit(),
    # the second to aug_topics2/aug_topic_lens2, and so on up to aug_topics19.
    aug_paths = [
        '/home/alta/relevance/vr311/data_vatsal/BULATS/translate/ar1.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/translate/de1.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/translate/fr1.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/translate/greek1.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/translate/hebrew1.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/translate/hi1.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/translate/ja1.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/translate/ko1.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/translate/ru1.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/translate/af1.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug2.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug3.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug4.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug5.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug6.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug7.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug8.txt',
        '/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug9.txt',
    ]
    aug_kwargs = {}
    for i, aug_path in enumerate(aug_paths):
        suffix = '' if i == 0 else str(i + 1)
        aug_topics, aug_topic_lens = text_to_array(aug_path, args.wlist_path, strip_start_end=False)
        aug_kwargs['aug_topics' + suffix] = aug_topics
        aug_kwargs['aug_topic_lens' + suffix] = aug_topic_lens

    # if args.strip_start_end:
    #     print("Stripping the first and last word (should correspond to <s> and </s> marks) from "
    #           "the input prompts. Should only be used with legacy dataset formatting")

    bert_dists = np.loadtxt('/home/alta/relevance/vr311/models_min_data/baseline/ATM/bert_dists.txt',
                            dtype=np.float32)
    sbert_weights = np.loadtxt('/home/alta/relevance/vr311/models_min_data/baseline/ATM/sbert_weights.txt',
                               dtype=np.float32)
    arr_unigrams = np.loadtxt('/home/alta/relevance/vr311/models_min_data/baseline/ATM/arr_unigrams.txt',
                              dtype=np.float32)

    atm = HierarchicialAttentionTopicModel(network_architecture=None,
                                           seed=args.seed,
                                           name=args.name,
                                           save_path='./',
                                           load_path=args.load_path,
                                           debug_mode=args.debug,
                                           epoch=args.epoch)
    atm.fit(train_data=args.train_data,
            valid_data=args.valid_data,
            load_path=args.init,
            topics=topics,
            topic_lens=topic_lens,
            bert_dists=bert_dists,
            bert_weights=sbert_weights,
            arr_unigrams=arr_unigrams,
            unigram_path=args.topic_count_path,
            train_size=train_size,
            learning_rate=args.learning_rate,
            lr_decay=args.lr_decay,
            dropout=args.dropout,
            distortion=args.distortion,
            batch_size=args.batch_size,
            optimizer=tf.train.AdamOptimizer,
            optimizer_params={},
            n_epochs=args.n_epochs,
            n_samples=args.n_samples,
            epoch=0,
            **aug_kwargs)
    atm.save()
def main(args):
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/step_train_attention_grader.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    tfrecords_dir_in_domain = os.path.join(args.data_dir_in_domain, 'tfrecords')
    topic_path_in_domain = os.path.join(tfrecords_dir_in_domain, args.topic_file)
    wlist_path = os.path.join(tfrecords_dir_in_domain, args.wlist_file)
    topic_count_path_in_domain = os.path.join(tfrecords_dir_in_domain, args.topic_count_file)
    train_data_in_domain = os.path.join(tfrecords_dir_in_domain, args.train_file)
    valid_data = os.path.join(tfrecords_dir_in_domain, args.valid_file)
    dataset_meta_path_in_domain = os.path.join(tfrecords_dir_in_domain, args.meta_file)

    train_size = get_train_size_from_meta(dataset_meta_path_in_domain)
    topics_in_domain, topic_lens_in_domain = text_to_array(topic_path_in_domain, wlist_path,
                                                           strip_start_end=args.strip_start_end)

    # Out-of-domain training data is only required when not using the conflictive training cost.
    if args.which_training_cost != 'conflictive':
        assert args.data_dir_out_domain is not None
        tfrecords_dir_out_domain = os.path.join(args.data_dir_out_domain, 'tfrecords')
        train_data_out_domain = os.path.join(tfrecords_dir_out_domain, args.train_file)

    if args.strip_start_end:
        print("Stripping the first and last word (should correspond to <s> and </s> marks) "
              "from the input prompts. Should only be used with legacy dataset formatting")

    atm = ATMPriorNetwork(network_architecture=None,
                          seed=args.seed,
                          name=args.name,
                          save_path='./',
                          load_path=args.load_path,
                          debug_mode=args.debug,
                          epoch=args.epoch)

    if args.which_training_cost == 'conflictive':
        atm.fit(train_data=train_data_in_domain,
                valid_data=valid_data,
                load_path=args.init,
                topics=topics_in_domain,
                topic_lens=topic_lens_in_domain,
                unigram_path=topic_count_path_in_domain,
                train_size=train_size,
                learning_rate=args.learning_rate,
                lr_decay=args.lr_decay,
                dropout=args.dropout,
                distortion=args.distortion,
                presample_batch_size=args.batch_size,
                optimizer=tf.train.AdamOptimizer,
                optimizer_params={},
                n_epochs=args.n_epochs,
                epoch=0,
                which_trn_cost=args.which_training_cost,
                loss_regularisation_weight=args.conflictive_weight)
    else:
        atm.fit_with_ood(train_data_in_domain=train_data_in_domain,
                         train_data_out_domain=train_data_out_domain,
                         valid_data=valid_data,
                         load_path=args.init,
                         topics_in_domain=topics_in_domain,
                         topic_lens_in_domain=topic_lens_in_domain,
                         unigram_path_in_domain=topic_count_path_in_domain,
                         train_size=train_size,
                         learning_rate=args.learning_rate,
                         lr_decay=args.lr_decay,
                         dropout=args.dropout,
                         distortion=args.distortion,
                         presample_batch_size=args.batch_size,
                         optimizer=tf.train.AdamOptimizer,
                         optimizer_params={},
                         n_epochs=args.n_epochs,
                         epoch=0,
                         which_trn_cost=args.which_training_cost,
                         out_of_domain_weight=args.out_of_domain_weight)
    atm.save()
def main(args):
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/step_train_attention_grader.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    if args.strip_start_end:
        print("Stripping the first and last word (should correspond to <s> and </s> marks) "
              "from the input prompts. Should only be used with legacy dataset formatting")

    # Whether to train a prior network or a standard ATM
    if args.train_prior_network:
        atm_class = ATMPriorNetworkStudent
    else:
        atm_class = AttentionTopicModelStudent

    for epoch in range(0, args.n_epochs):
        atm_student = atm_class(network_architecture=None,
                                seed=args.seed,
                                name=args.name,
                                save_path='./',
                                load_path=args.load_path,
                                debug_mode=args.debug,
                                epoch=args.load_epoch,
                                num_teachers=args.num_teachers)

        # Get the paths to all the relevant files for this epoch
        if args.reuse_epoch_dataset:
            epoch_tfrecords_dir = os.path.join(args.teacher_data_dir, 'tfrecords')
        else:
            if args.loop_epochs:
                epoch_data_num = epoch % args.loop_epochs
            else:
                epoch_data_num = epoch
            epoch_tfrecords_dir = os.path.join(args.teacher_data_dir,
                                               'epoch' + str(epoch_data_num + 1), 'tfrecords')

        topic_path = os.path.join(epoch_tfrecords_dir, args.topic_file)
        wlist_path = os.path.join(epoch_tfrecords_dir, args.wlist_file)
        topic_count_path = os.path.join(epoch_tfrecords_dir, args.topic_count_file)
        train_data = os.path.join(epoch_tfrecords_dir, args.train_file)
        valid_data = os.path.join(epoch_tfrecords_dir, args.valid_file)
        dataset_meta_path = os.path.join(epoch_tfrecords_dir, args.meta_file)

        train_size = get_train_size_from_meta(dataset_meta_path)
        topics, topic_lens = text_to_array(topic_path, wlist_path,
                                           strip_start_end=args.strip_start_end)

        if epoch == 0:
            init = args.init
        else:
            init = None

        atm_student.fit_student(train_data=train_data,
                                valid_data=valid_data,
                                load_path=init,
                                topics=topics,
                                topic_lens=topic_lens,
                                unigram_path=topic_count_path,
                                train_size=train_size,
                                learning_rate=args.learning_rate,
                                lr_decay=args.lr_decay,
                                dropout=args.dropout,
                                distortion=args.distortion,
                                batch_size=args.batch_size,
                                optimizer=tf.train.AdamOptimizer,
                                optimizer_params={},
                                n_epochs=1,
                                epoch=epoch,
                                use_teacher_stat=(not args.match_samples))
        atm_student.save()

        # Reset the graph so that the model can be reloaded for the next epoch
        # (not the nicest way to do it, I know)
        tf.reset_default_graph()
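# The loop above relies on tf.reset_default_graph() to discard each epoch's model, which the
# original comment already flags as not ideal. A minimal alternative sketch is below, assuming
# (this is an assumption, not confirmed by the code) that atm_class builds its ops and session
# against the current default graph. The function name and elided body are illustrative only.
def _train_students_with_scoped_graphs(args, atm_class):
    for epoch in range(0, args.n_epochs):
        # Each epoch gets its own graph; it is released when the context exits,
        # so no global reset is needed between epochs.
        with tf.Graph().as_default():
            atm_student = atm_class(network_architecture=None, seed=args.seed, name=args.name,
                                    save_path='./', load_path=args.load_path,
                                    debug_mode=args.debug, epoch=args.load_epoch,
                                    num_teachers=args.num_teachers)
            # ... same per-epoch data setup and fit_student() call as in main() above ...
            atm_student.save()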