def create_corpora(corpus_fn):
    unigram_corpus = read_corpus(open(corpus_fn), skip=["#"])
    normalize_corpus(unigram_corpus)
    morpheme_corpus = read_corpus(open(corpus_fn), "#")
    normalize_corpus(morpheme_corpus)
    return unigram_corpus, morpheme_corpus
def create_wfsa(options):
    # open output file or write to stdout
    output = open(options.output, "w") if options.output else sys.stdout

    # read initial transitions if given
    it = options.initial_transitions
    initial_transitions = Automaton.read_transitions(it) if it else {}

    # create a uniform automaton with a given number of states per letter,
    # optionally predefining some transitions
    if options.emitfile:
        numbers_per_letters = read_dict(open(options.emitfile))
        automaton = Automaton.create_uniform_automaton(
            numbers_per_letters, initial_transitions=initial_transitions)
        if options.smooth:
            automaton.smooth()
        automaton.dump(output)
        return

    if options.numstate:
        input_ = sys.stdin
        corpus = read_corpus(input_, options.separator)
        alphabet = get_alphabet(corpus)
        numbers_per_letters = dict(
            (letter, options.numstate) for letter in alphabet)
        if options.num_epsilons:
            numbers_per_letters["EPSILON"] = options.num_epsilons
        automaton = Automaton.create_uniform_automaton(
            numbers_per_letters, initial_transitions)
        if options.smooth:
            automaton.smooth()
        automaton.dump(output)
        return

    if options.init_from_corpus:
        if len(initial_transitions) > 0:
            raise Exception(
                "Using initial transitions (-I option) when creating an "
                "automaton from a corpus is not implemented")
        input_ = open(options.init_from_corpus)
        corpus = read_corpus(input_, options.separator)
        corpus = normalize_corpus(corpus)
        automaton = Automaton.create_from_corpus(corpus)
        if options.smooth:
            automaton.smooth()
        automaton.dump(output)
        return

    # fallback
    logging.error("Options are incomplete, something is missing to create "
                  "an Automaton")
    sys.exit(-1)
def main():
    quantizer = AbstractQuantizer.read(open(sys.argv[1]))
    corp = read_corpus(open(sys.argv[2]), separator="#")
    normalize_corpus(corp)
    probs = corp.values()
    dist = compute_entropy(probs, quantizer)
    print dist
def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    parser = argparse.ArgumentParser(description='Train LDA model')
    parser.add_argument('--train', help='training corpus', required=True)
    parser.add_argument('--topics', help='number of topics', type=int, required=True)
    parser.add_argument('--iter', help='number of iterations', type=int, required=True)
    parser.add_argument('--pyp', help='use PYP priors', action='store_true')
    args = parser.parse_args()

    vocabulary = Vocabulary()

    logging.info('Reading training corpus')
    with open(args.train) as train:
        training_corpus = read_corpus(train, vocabulary)

    if args.pyp:
        logging.info('Using a PYP prior')
        doc_process = lambda: PYP(theta_doc, d_doc, Uniform(args.topics))
        topic_process = lambda: PYP(theta_topic, d_topic, Uniform(len(vocabulary)))
    else:
        logging.info('Using a Dirichlet prior')
        doc_process = lambda: DirichletMultinomial(args.topics, theta_doc)
        topic_process = lambda: DirichletMultinomial(len(vocabulary), theta_topic)

    model = TopicModel(args.topics, len(training_corpus), doc_process, topic_process)

    logging.info('Training model with %d topics', args.topics)
    run_sampler(model, training_corpus, args.iter)
def similarity(sent, topN=10):
    corpus_lines = read_corpus(ner_result_path)
    texts = [line.split("\t")[0].split(' ') for line in corpus_lines]
    keywords = one_ner_tag(sent)

    dictionary = corpora.Dictionary(texts)
    num_features = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus)
    new_vec = dictionary.doc2bow(keywords)

    # similarity computation
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features)
    # index = similarities.Similarity('-Similarity-index', corpus, num_features)
    # print('\nsparse TF-IDF vectors of the corpus:')
    # for i in tfidf[corpus]:
    #     print(i)
    # print('\nsparse TF-IDF vector of the keywords:')
    # print(tfidf[new_vec])
    sims = index[tfidf[new_vec]]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])

    print("\nSimilarity computation")
    print('Words: {}\nText: {}\n'.format(keywords, sent))
    for k, v in sims[:topN]:
        i = int(k)
        print('Similarity: {}\nWords: {}\nText: {}'.format(
            v,
            corpus_lines[i].split("\t")[0].split(' '),
            corpus_lines[i].split("\t")[1]))
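# A minimal, self-contained sketch of the same gensim TF-IDF similarity
# pipeline used in similarity() above, run on toy data. The toy documents
# and query below are illustrative only, not part of the project.
from gensim import corpora, models, similarities

toy_texts = [["car", "crash", "highway"], ["house", "fire", "night"]]
toy_dictionary = corpora.Dictionary(toy_texts)
toy_bow = [toy_dictionary.doc2bow(t) for t in toy_texts]
toy_tfidf = models.TfidfModel(toy_bow)
toy_index = similarities.SparseMatrixSimilarity(
    toy_tfidf[toy_bow], num_features=len(toy_dictionary.token2id))
query = toy_dictionary.doc2bow(["car", "fire"])
# one cosine similarity score per toy document, highest first
print(sorted(enumerate(toy_index[toy_tfidf[query]]), key=lambda item: -item[1]))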
def main():
    corpus = read_corpus(open(sys.argv[1]), separator="#")
    normalize_corpus(corpus)
    wfsa = create_word_wfsa(corpus)
    wfsa.finalize()
    if len(sys.argv) == 4:
        wfsa.quantizer = LogLinQuantizer(int(sys.argv[2]), int(sys.argv[3]))
        wfsa.round_and_normalize()
    wfsa.dump(sys.stdout)
def main(args):
    model = load_model(args)
    print "loaded " + args.model
    raw_corpus = corpus.read_corpus(args.corpus)
    list_words, vocab_map, embeddings, padding_id = corpus.load_embeddings(
        corpus.load_embedding_iterator(args.embeddings))
    print("loaded embeddings")
    ids_corpus = corpus.map_corpus(vocab_map, raw_corpus)
    evaluation(args, padding_id, ids_corpus, vocab_map, embeddings, model)
def main():
    # read automaton
    wfsa = Automaton.create_from_dump(open(sys.argv[1]))

    # read corpus
    corpus = read_corpus(open(sys.argv[2]), separator=sys.argv[3],
                         skip=[sys.argv[4]])
    normalize_corpus(corpus)

    # call distance_from_corpus
    distances = {}
    dist = wfsa.distance_from_corpus(corpus, Automaton.kullback,
                                     distances=distances)

    # print out result
    for k, v in distances.iteritems():
        print k, v
def main():
    automaton = Automaton.create_from_dump(open(sys.argv[1]))
    corpus = read_corpus(open(sys.argv[2]))
    normalize_corpus(corpus)
    entropy = float(sys.argv[3])
    string_bits = "u"
    if len(sys.argv) > 4:
        string_bits = sys.argv[4]
    q = LogLinQuantizer(10, -20)
    automaton.quantizer = q
    encoder = Encoder(entropy, string_bits)
    print encoder.encode(automaton, corpus)
def main():
    corpus = read_corpus(sys.stdin, separator="#")
    n_corpus = normalize_corpus(corpus)
    file_name = sys.argv[1]
    fsa_type = sys.argv[2]
    if fsa_type == 'plain':
        fsa_creator = create_three_state_fsa
    elif fsa_type == 'hogy':
        fsa_creator = create_hogy_fsa
    elif fsa_type == 'o':
        fsa_creator = create_o_fsa
    elif fsa_type == 'new':
        fsa_creator = lambda corpus: create_new_three_state_fsa(
            corpus, ["hogy", ("vala", "ki")], "m")
    else:
        logging.critical('unknown fsa type: {0}'.format(fsa_type))
        sys.exit(-1)
    create_wfsa(fsa_creator, file_name, n_corpus)
def main(options):
    if not options.automaton_file:
        raise Exception("Automaton option (-a) is mandatory")
    automaton = Automaton.create_from_dump(open(options.automaton_file))
    if options.quantizer:
        automaton.quantizer = AbstractQuantizer.read(open(options.quantizer))
        automaton.round_and_normalize()

    input_ = sys.stdin
    if options.corpus:
        input_ = open(options.corpus)
    corpus = read_corpus(input_, options.separator)
    corpus = normalize_corpus(corpus)

    learner = Learner.create_from_options(automaton, corpus, options)
    learner.main()

    output = sys.stdout
    if options.output:
        output = open(options.output, "w")
    learner.automaton.dump(output)
def sequence_encoding(sequence, str_to_idx):
    """
    Transform a list of strings into a tensor of integers to be processed
    by the pytorch model
    :param sequence: list of strings
    :param str_to_idx: dictionary that maps a string to a unique integer
    :return: pytorch tensor (vector) of long values
    """
    sequence_of_indexes = [str_to_idx[element] for element in sequence]
    return torch.tensor(sequence_of_indexes, dtype=torch.long)


# READ CORPUS, PREPARE DATA:
liste_X_train, liste_Y_train, liste_X_test, liste_Y_test, liste_X_dev, liste_Y_dev = \
    read_corpus("sequoia-7.0/sequoia.deep.conll", 0.8, 0.2, 0)
liste_X_whole_corpus, liste_Y_whole_corpus, _, _, _, _ = \
    read_corpus("sequoia-7.0/sequoia.deep.conll", 1, 0, 0)
print("len(liste_X_train): ", len(liste_X_train))
print("len(liste_X_test): ", len(liste_X_test), " len(liste_X_dev): ", len(liste_X_dev))
print("first element (x,y) train: x = ", liste_X_train[1], ", y = ", liste_Y_train[1])

"""
import json
with open("pos_data_sequoia.txt", 'w', encoding="utf-8") as pos_data_sequoia_file:
    sequoia_pos_json = json.dumps({
        "train_data": {"X": liste_X_train, "Y": liste_Y_train},
        "test_data": {"X": liste_X_test, "Y": liste_Y_test},
        "dev_data": {"X": liste_X_dev, "Y": liste_Y_dev}})
    pos_data_sequoia_file.write(sequoia_pos_json)
"""

# Create dictionaries
tag_to_idx = create_dict_str_to_idx(liste_Y_whole_corpus)
word_to_idx = create_dict_str_to_idx(liste_X_whole_corpus)
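# A minimal usage sketch of sequence_encoding() defined above. The toy
# vocabulary is illustrative only, not part of the Sequoia data.
import torch

toy_word_to_idx = {"le": 0, "chat": 1, "dort": 2}
encoded = sequence_encoding(["le", "chat", "dort"], toy_word_to_idx)
print(encoded)        # tensor([0, 1, 2])
print(encoded.dtype)  # torch.int64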
def gensim_tags():
    ckpt_file = tf.train.latest_checkpoint(model_path)
    # print(ckpt_file)
    paths['model_path'] = ckpt_file
    model = BiLSTM_CRF(args, embeddings, tag2label, word2id, paths, config=config)
    model.build_graph()
    saver = tf.train.Saver()
    classifier = fasttext.load_model(classifier_model_path)

    with tf.Session(config=config) as sess:
        print('============= ner_tags =============')
        saver.restore(sess, ckpt_file)
        ner_result_fb = open(ner_result_path, 'a+')
        corpus_lines = read_corpus(ner_corpus_path)
        for ner_line in corpus_lines:
            ner_line = ner_line.strip()
            ner_line = re.sub(r'\s+', '', ner_line)
            ner_line = re.sub(r'日(凌晨|早晨|上午|中午|下午|晚上|深夜)+', '日', ner_line)
            line_data = list(ner_line.replace(' ', '', 10).strip())
            line_data = [(line_data, ['O'] * len(line_data))]
            tags = model.get_ner_tag(sess, line_data)
            Location, Time, Means, Thing = get_entity(tags, ner_line)
            print('Location: {}\nTime: {}\nMeans: {}\nThing: {}'.format(
                Location, Time, Means, Thing))

            words = []
            if len(Time) > 0:
                time_info = get_date_info(Time[0])
                print(time_info)
                if time_info is None and len(Time) > 1:
                    time_info = get_date_info(Time[1])
                    print(time_info)
                words += list(time_info)
            else:
                print('NoTime {}'.format(ner_line))
                words.append('NoTime')

            if len(Location) > 0:
                location_info = getlnglat(Location[0])
                print(location_info)
                words.append(location_info)
            else:
                print('NoLocation {}'.format(ner_line))
                words.append('NoLocation')

            if len(Means) > 0:
                words += Means
            else:
                print('NoMeans {}'.format(ner_line))
                words.append('NoMeans')

            if len(Thing) > 0:
                category = classifier.predict(Thing, k=1)[0][0][0].replace(
                    '__label__', '')
                print(category)
                words.append(category)
            else:
                print('NoClass {}'.format(ner_line))
                words.append('NoClass')

            print(words)
            ner_result_fb.write(' '.join(words) + "\t" + ner_line + "\n")
        ner_result_fb.close()
logging.basicConfig(
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
    filename='./logs/post.log',
    filemode='w',
)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

logging.info('Reading training corpus')
vocabulary = Vocabulary()
with open(args["train_file"]) as train:
    corpus = read_corpus(train, vocabulary)

# tag_models = [PYPLM(args["tag_order"], initial_base=Uniform(args["n_tags"]))
#               for _ in range(args["n_particles"])]
# word_models = [PYPLM(args["word_order"], initial_base=Uniform(len(vocabulary)))
#                for _ in range(args["n_particles"])]
tag_models = [PYPLM(args["tag_order"], initial_base=Uniform(args["n_tags"]))]
word_models = [PYPLM(args["word_order"], initial_base=Uniform(len(vocabulary)))]

logging.info('Training model of order %d', args["tag_order"])
ll_list, ppl_list = run_sentence_sampler(corpus, word_models, tag_models,
                                         n_tags=args["n_tags"],
import config
import utils
import corpus
import machine_learning

if __name__ == '__main__':
    """ Entry point for app """
    # create log files and write headers
    utils.write_resultlog_headers()
    # read and preprocess corpus
    corpus = corpus.read_corpus(config.corpus_path)
    # run main program
    if config.use_all_variants == False:
        machine_learning.run(corpus)
    else:
        # gather all possible feature combinations
        f_combinations = utils.get_feature_combos()
        count = 1
        # run main program for all combinations
        for combo in f_combinations:
            config.feature_selection = combo
            print("\nRunning configuration {} of {}".format(count, len(f_combinations)))
import ne_chunker
import corpus
from nltk import pos_tag, word_tokenize
from nltk.chunk import conlltags2tree, tree2conlltags

# The path to the corpus used, here a large dataset: the Groningen Meaning Bank
corpus_root = 'gmb-2.2.0'
mode = '--core'

data = corpus.read_corpus(corpus_root, mode)
training_samples = data[:int(len(data) * 0.9)]
test_samples = data[int(len(data) * 0.9):]

print "#training samples = %s" % len(training_samples)  # training samples = 55809
print "#test samples = %s" % len(test_samples)  # test samples = 6201

chunker = ne_chunker.NamedEntityChunker(training_samples[:55809])
# text = "Cristiano Ronaldo is a decent footballer both in Real Madrid, Spain and Manchester United, United Kingdom. He is truly a masterpiece."
text = "Geraldi Dzakwan wakes up at 7 am every morning."
print chunker.parse(pos_tag(word_tokenize(text)))

score = chunker.evaluate([
    conlltags2tree([(w, t, iob) for (w, t), iob in iobs])
    for iobs in test_samples[:500]
])

# Debugging
print score.accuracy()  # 0.931132334092
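# A minimal illustration (toy sentence, not from the GMB data) of the
# (word, POS, IOB) triples that conlltags2tree() converts into an nltk Tree,
# i.e. the format chunker.evaluate() receives above:
from nltk.chunk import conlltags2tree

toy_iob = [("The", "DT", "B-NP"), ("cat", "NN", "I-NP"), ("sleeps", "VBZ", "O")]
print(conlltags2tree(toy_iob))
# Tree('S', [Tree('NP', [('The', 'DT'), ('cat', 'NN')]), ('sleeps', 'VBZ')])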
def main():
    automaton = Automaton.create_from_dump(open(sys.argv[1]))
    corpus = read_corpus(open(sys.argv[2]), "#")
    dc = DistanceCache(automaton, corpus)
    dc.build_paths()
def main(args):
    time1 = datetime.now()
    raw_corpus = corpus.read_corpus(args.corpus)
    list_words, vocab_map, embeddings, padding_id = corpus.load_embeddings(
        corpus.load_embedding_iterator(args.embeddings))
    print("loaded embeddings")
    ids_corpus = corpus.map_corpus(vocab_map, raw_corpus)
    annotations = corpus.read_annotations(args.train)
    print("got annotations")
    training_batches = corpus.create_batches(ids_corpus, annotations,
                                             args.batch_size, padding_id)
    print("got batches")
    time2 = datetime.now()
    print "time to preprocess: " + str(time2 - time1)

    if args.model == 'cnn':
        args.margin = 0.2

    if args.load_model:
        if args.model == 'lstm':
            print("loading " + args.load_model)
            lstm = nn.LSTM(input_size=args.embedding_size,
                           hidden_size=args.hidden_size)
            lstm.load_state_dict(torch.load(args.load_model))
            optimizer = Adam(lstm.parameters())
            if args.cuda:
                lstm.cuda()
        else:
            print("loading " + args.load_model)
            cnn = nn.Conv1d(in_channels=args.embedding_size,
                            out_channels=args.hidden_size,
                            kernel_size=3,
                            padding=1)
            cnn.load_state_dict(torch.load(args.load_model))
            optimizer = Adam(cnn.parameters())
            if args.cuda:
                cnn.cuda()
    else:
        if args.model == 'lstm':
            print "training lstm"
            lstm = nn.LSTM(input_size=args.embedding_size,
                           hidden_size=args.hidden_size)
            optimizer = Adam(lstm.parameters())
            if args.cuda:
                lstm.cuda()
        else:
            print "training cnn"
            cnn = nn.Conv1d(in_channels=args.embedding_size,
                            out_channels=args.hidden_size,
                            kernel_size=3,
                            padding=1)
            optimizer = Adam(cnn.parameters())
            if args.cuda:
                cnn.cuda()

    if args.save_model:
        if args.model == 'lstm':
            lstm_model_nums = []
            for d in os.listdir("lstm_models"):
                if "lstm_model" in d:
                    num = int(d[len("lstm_models") - 1:])
                    lstm_model_nums.append(num)
            if len(lstm_model_nums) > 0:
                new_model_num = max(lstm_model_nums) + 1
            else:
                new_model_num = 0
            print("creating new model " + "lstm_models/lstm_model" + str(new_model_num))
            os.makedirs("lstm_models/lstm_model" + str(new_model_num))
        else:
            cnn_model_nums = []
            for d in os.listdir("cnn_models"):
                if "cnn_model" in d:
                    num = int(d[len("cnn_models") - 1:])
                    cnn_model_nums.append(num)
            if len(cnn_model_nums) > 0:
                new_model_num = max(cnn_model_nums) + 1
            else:
                new_model_num = 0
            print("creating new model " + "cnn_models/cnn_model" + str(new_model_num))
            os.makedirs("cnn_models/cnn_model" + str(new_model_num))

    # lstm tutorial: http://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
    # lstm documentation: http://pytorch.org/docs/master/nn.html?highlight=nn%20lstm#torch.nn.LSTM
    count = 1
    hidden_states = []
    total_loss = 0.0
    time_begin = datetime.now()
    for epoch in range(10):
        print "epoch = " + str(epoch)
        for batch in training_batches:
            optimizer.zero_grad()
            if count % 10 == 0:
                print(count)
                print "average loss: " + str((total_loss / float(count)))
                print("time for 10 batches: " + str(datetime.now() - time_begin))
                time_begin = datetime.now()

            titles, bodies, triples = batch
            title_length, title_num_questions = titles.shape
            body_length, body_num_questions = bodies.shape
            title_embeddings, body_embeddings = corpus.get_embeddings(
                titles, bodies, vocab_map, embeddings)

            # title
            if args.model == 'lstm':
                if args.cuda:
                    title_inputs = [autograd.Variable(
                        torch.FloatTensor(title_embeddings).cuda())]
                    title_inputs = torch.cat(title_inputs).view(
                        title_length, title_num_questions, -1)
                    # title_inputs = torch.cat(title_inputs).view(title_num_questions, title_length, -1)
                    title_hidden = (
                        autograd.Variable(torch.zeros(
                            1, title_num_questions, args.hidden_size).cuda()),
                        autograd.Variable(torch.zeros(
                            (1, title_num_questions, args.hidden_size)).cuda()))
                else:
                    title_inputs = [autograd.Variable(
                        torch.FloatTensor(title_embeddings))]
                    title_inputs = torch.cat(title_inputs).view(
                        title_length, title_num_questions, -1)
                    title_hidden = (
                        autograd.Variable(torch.zeros(
                            1, title_num_questions, args.hidden_size)),
                        autograd.Variable(torch.zeros(
                            (1, title_num_questions, args.hidden_size))))
            else:
                if args.cuda:
                    title_inputs = [autograd.Variable(
                        torch.FloatTensor(title_embeddings).cuda())]
                else:
                    title_inputs = [autograd.Variable(
                        torch.FloatTensor(title_embeddings))]
                title_inputs = torch.cat(title_inputs).transpose(0, 1).transpose(1, 2)

            if args.model == 'lstm':
                title_out, title_hidden = lstm(title_inputs, title_hidden)
            else:
                title_out = cnn(title_inputs)
                title_out = F.tanh(title_out)
                title_out = title_out.transpose(1, 2).transpose(0, 1)

            # average all words of each question from title_out
            # title_out: (max sequence length) x (batch size) x (hidden size)
            average_title_out = average_questions(title_out, titles, padding_id)

            # body
            if args.model == 'lstm':
                if args.cuda:
                    body_inputs = [autograd.Variable(
                        torch.FloatTensor(body_embeddings).cuda())]
                    body_inputs = torch.cat(body_inputs).view(
                        body_length, body_num_questions, -1)
                    # body_inputs = torch.cat(body_inputs).view(body_num_questions, body_length, -1)
                    body_hidden = (
                        autograd.Variable(torch.zeros(
                            1, body_num_questions, args.hidden_size).cuda()),
                        autograd.Variable(torch.zeros(
                            (1, body_num_questions, args.hidden_size)).cuda()))
                else:
                    body_inputs = [autograd.Variable(
                        torch.FloatTensor(body_embeddings))]
                    body_inputs = torch.cat(body_inputs).view(
                        body_length, body_num_questions, -1)
                    body_hidden = (
                        autograd.Variable(torch.zeros(
                            1, body_num_questions, args.hidden_size)),
                        autograd.Variable(torch.zeros(
                            (1, body_num_questions, args.hidden_size))))
            else:
                if args.cuda:
                    body_inputs = [autograd.Variable(
                        torch.FloatTensor(body_embeddings).cuda())]
                else:
                    body_inputs = [autograd.Variable(
                        torch.FloatTensor(body_embeddings))]
                body_inputs = torch.cat(body_inputs).transpose(0, 1).transpose(1, 2)

            if args.model == 'lstm':
                body_out, body_hidden = lstm(body_inputs, body_hidden)
            else:
                body_out = cnn(body_inputs)
                body_out = F.tanh(body_out)
                body_out = body_out.transpose(1, 2).transpose(0, 1)

            average_body_out = average_questions(body_out, bodies, padding_id)
            count += 1

            # average body and title
            # representations of the questions as found by the LSTM
            hidden = (average_title_out + average_body_out) * 0.5

            if args.cuda:
                triples_vectors = hidden[torch.LongTensor(triples.ravel()).cuda()]
            else:
                triples_vectors = hidden[torch.LongTensor(triples.ravel())]
            triples_vectors = triples_vectors.view(
                triples.shape[0], triples.shape[1], args.hidden_size)

            query = triples_vectors[:, 0, :].unsqueeze(1)
            examples = triples_vectors[:, 1:, :]

            cos_similarity = F.cosine_similarity(query, examples, dim=2)
            if args.cuda:
                targets = autograd.Variable(
                    torch.zeros(triples.shape[0]).type(torch.LongTensor).cuda())
            else:
                targets = autograd.Variable(
                    torch.zeros(triples.shape[0]).type(torch.LongTensor))

            # outputs a Variable
            # By default, the losses are averaged over observations for each minibatch
            if args.cuda:
                loss = F.multi_margin_loss(cos_similarity, targets,
                                           margin=args.margin).cuda()
            else:
                loss = F.multi_margin_loss(cos_similarity, targets,
                                           margin=args.margin)
            total_loss += loss.cpu().data.numpy()[0]

            loss.backward()
            optimizer.step()

        result_headers = ['Epoch', 'MAP', 'MRR', 'P@1', 'P@5']
        with open(os.path.join(sys.path[0], args.results_file), 'a') as evaluate_file:
            writer = csv.writer(evaluate_file, dialect='excel')
            writer.writerow(result_headers)

        if args.model == 'lstm':
            evaluation(args, padding_id, ids_corpus, vocab_map, embeddings,
                       lstm, epoch)
        else:
            evaluation(args, padding_id, ids_corpus, vocab_map, embeddings,
                       cnn, epoch)

        if args.save_model:
            # saving the model
            if args.model == 'lstm':
                print "Saving lstm model epoch " + str(epoch) + \
                    " to lstm_model" + str(new_model_num)
                torch.save(lstm.state_dict(),
                           "lstm_models/lstm_model" + str(new_model_num) +
                           '/' + "epoch" + str(epoch))
            else:
                print "Saving cnn model epoch " + str(epoch) + \
                    " to cnn_model" + str(new_model_num)
                torch.save(cnn.state_dict(),
                           "cnn_models/cnn_model" + str(new_model_num) +
                           '/' + "epoch" + str(epoch))
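# A small, self-contained sketch of the ranking step used inside the training
# loop above: cosine similarity of each query against its candidate set, with
# a multi-margin loss that pushes the positive (index 0) above the negatives.
# It uses toy random tensors and the modern PyTorch API (no autograd.Variable),
# unlike the legacy code above; shapes and the margin value are illustrative.
import torch
import torch.nn.functional as F

batch, n_candidates, hidden = 2, 4, 8
query = torch.randn(batch, 1, hidden)                  # question representation
examples = torch.randn(batch, n_candidates, hidden)    # 1 positive + negatives
scores = F.cosine_similarity(query, examples, dim=2)   # (batch, n_candidates)
targets = torch.zeros(batch, dtype=torch.long)         # positive is at index 0
loss = F.multi_margin_loss(scores, targets, margin=0.2)
print(loss.item())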
def main(args): """This file performs domain transfer using an adversarial discriminative network. Example usage: python adversarial_domain.py --ubuntu_path ../askubuntu --android_path ../Android --embeddings ../glove.pruned.txt.gz""" ubuntu_corpus = os.path.join(args.ubuntu_path, 'text_tokenized.txt.gz') android_corpus = os.path.join(args.android_path, 'corpus.tsv.gz') ubuntu_raw_corpus = corpus.read_corpus(ubuntu_corpus) android_raw_corpus = corpus.read_corpus(android_corpus) list_words, vocab_map, embeddings, padding_id = corpus.load_embeddings( corpus.load_embedding_iterator(args.embeddings)) print "loaded embeddings" ubuntu_ids_corpus = corpus.map_corpus(vocab_map, ubuntu_raw_corpus) android_ids_corpus = corpus.map_corpus(vocab_map, android_raw_corpus) ubuntu_train = os.path.join(args.ubuntu_path, 'train_random.txt') ubuntu_train_annotations = corpus.read_annotations(ubuntu_train) print len(ubuntu_train_annotations) ubuntu_training_batches = corpus.create_batches(ubuntu_ids_corpus, ubuntu_train_annotations, args.batch_size, padding_id) print "got ubuntu batches" if args.load_model: if args.model == 'lstm': print("loading " + args.load_model) lstm = nn.LSTM(input_size=300, hidden_size=args.hidden_size) lstm.load_state_dict(torch.load(args.load_model)) optimizer = Adam(lstm.parameters()) if args.cuda: lstm.cuda() else: print("loading " + args.load_model) cnn = nn.Conv1d(in_channels=300, out_channels=args.hidden_size, kernel_size=3, padding=1) cnn.load_state_dict(torch.load(args.load_model)) optimizer = Adam(cnn.parameters()) if args.cuda: cnn.cuda() else: if args.model == 'lstm': print "training lstm" lstm = nn.LSTM(input_size=300, hidden_size=args.hidden_size) optimizer = Adam(lstm.parameters()) if args.cuda: lstm.cuda() else: print "training cnn" cnn = nn.Conv1d(in_channels=300, out_channels=args.hidden_size, kernel_size=3, padding=1) optimizer = Adam(cnn.parameters()) if args.cuda: cnn.cuda() feed_forward = FeedForward(args) if args.cuda: feed_forward.cuda() feed_forward_optimizer = Adam(feed_forward.parameters(), lr=-0.001) android_dev_pos_path = os.path.join(args.android_path, 'dev.pos.txt') android_dev_neg_path = os.path.join(args.android_path, 'dev.neg.txt') android_dev_annotations = android_pairs_to_annotations( android_dev_pos_path, android_dev_neg_path) count = 1 hidden_states = [] total_encoder_loss = 0.0 total_domain_loss = 0.0 total_loss = 0.0 time_begin = datetime.now() time_begin_epoch = datetime.now() for epoch in range(20): print "epoch = " + str(epoch) for batch in ubuntu_training_batches: titles, bodies, triples = batch optimizer.zero_grad() if count % 10 == 0: print(count) print "average encoder loss: " + str( (total_encoder_loss / float(count))) print "average domain loss: " + str( (total_domain_loss / float(count))) print "average loss: " + str((total_loss / float(count))) print("time for 10 batches: " + str(datetime.now() - time_begin)) time_begin = datetime.now() count += 1 ubuntu_batch = corpus.domain_classifier_batch( ubuntu_ids_corpus, ubuntu_train_annotations, padding_id) ubuntu_titles, ubuntu_bodies, _ = ubuntu_batch android_batch = corpus.domain_classifier_batch( android_ids_corpus, android_dev_annotations, padding_id) android_titles, android_bodies, _ = android_batch # print "shapes" # print ubuntu_titles.shape # print android_titles.shape if args.model == 'lstm': model = lstm else: model = cnn hidden_ubuntu = vectorize_question(args, batch, model, vocab_map, embeddings, padding_id) hidden_ubuntu_domain = vectorize_question(args, ubuntu_batch, model, 
vocab_map, embeddings, padding_id) hidden_android_domain = vectorize_question(args, android_batch, model, vocab_map, embeddings, padding_id) hidden_combined = torch.cat( (hidden_ubuntu_domain, hidden_android_domain)) input_size = int(hidden_combined.size()[0]) output = feed_forward.forward(hidden_combined) domain_labels = [1] * int(hidden_ubuntu_domain.size( )[0]) + [0] * int(hidden_android_domain.size()[0]) if args.cuda: domain_labels = autograd.Variable( torch.LongTensor(domain_labels).cuda()) else: domain_labels = autograd.Variable( torch.LongTensor(domain_labels)) if args.cuda: triples_vectors = hidden_ubuntu[torch.LongTensor( triples.ravel()).cuda()] else: triples_vectors = hidden_ubuntu[torch.LongTensor( triples.ravel())] triples_vectors = triples_vectors.view(triples.shape[0], triples.shape[1], args.hidden_size) query = triples_vectors[:, 0, :].unsqueeze(1) examples = triples_vectors[:, 1:, :] cos_similarity = F.cosine_similarity(query, examples, dim=2) if args.cuda: targets = autograd.Variable( torch.zeros(triples.shape[0]).type( torch.LongTensor).cuda()) else: targets = autograd.Variable( torch.zeros(triples.shape[0]).type(torch.LongTensor)) if args.cuda: encoder_loss = F.multi_margin_loss(cos_similarity, targets, margin=args.margin).cuda() else: encoder_loss = F.multi_margin_loss(cos_similarity, targets, margin=args.margin) total_encoder_loss += encoder_loss.cpu().data.numpy()[0] # if args.cuda: # domain_loss_func = nn.CrossEntropyLoss().cuda() # else: # domain_loss_func = nn.CrossEntropyLoss() # domain_classifier_loss = domain_loss_func(output, domain_labels) if args.cuda: domain_classifier_loss = F.cross_entropy( output, domain_labels).cuda() else: domain_classifier_loss = F.cross_entropy(output, domain_labels) total_domain_loss += domain_classifier_loss.cpu().data.numpy()[0] combined_loss = encoder_loss - args.lam * domain_classifier_loss total_loss += combined_loss.cpu().data.numpy()[0] combined_loss.backward() optimizer.step() feed_forward_optimizer.step() print "time for one epoch: " + str(datetime.now() - time_begin_epoch) time_begin_epoch = datetime.now() evaluation(args, padding_id, android_ids_corpus, model, vocab_map, embeddings)