def get_data(keys=[], name="cross_domain"):
    # NOTE: the original left both vocab assignments blank; get_vocab(), which is used
    # elsewhere in this module, is substituted here as a placeholder.
    if name == "cross_domain":
        vocab = get_vocab()
    elif name == "mix_domain":
        vocab = get_vocab()
    else:
        print("Invalid dataset name")

    trustworthy_reviews = get_reviews(review_type="trustworthy", keys=keys)
    # the original also passed review_type="trustworthy" here, which looks like a copy-paste bug
    untrustworthy_reviews = get_reviews(review_type="untrustworthy", keys=keys)
    reviews = trustworthy_reviews + untrustworthy_reviews
    data = data_helpers.build_input_data(reviews, vocab)

    # generate labels
    labels_trustworthy = [[1, 0] for _ in range(len(trustworthy_reviews))]
    labels_untrustworthy = [[0, 1] for _ in range(len(untrustworthy_reviews))]
    labels = np.array(labels_trustworthy + labels_untrustworthy)
    # data_helpers.build_vocab(TRUSTWORTHY_REVIEWS + UNTRUSTWORTHY_REVIEWS, vocab_size=30001)

    print("data len: ", data.shape[0])
    print("labels len: ", labels.shape[0])
    return data, labels
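# The data_helpers module itself is not part of this listing. The snippets below rely on three
# behaviours, sketched here purely as an assumption (names, defaults, and the "_sketch" suffix are
# illustrative, not the actual library code): pad_sentences pads/truncates token lists to a fixed
# length, build_vocab maps words to integer ids by frequency, and build_input_data turns token
# lists into index arrays.
import itertools
from collections import Counter

import numpy as np


def pad_sentences_sketch(sentences, sequence_length=None, padding_word="<PAD/>"):
    # sentences: list of token lists; returns lists padded/truncated to sequence_length
    if sequence_length is None:
        sequence_length = max(len(s) for s in sentences)
    return [(s + [padding_word] * (sequence_length - len(s)))[:sequence_length] for s in sentences]


def build_vocab_sketch(sentences):
    # the most frequent word gets the smallest index; vocabulary_inv maps index -> word
    word_counts = Counter(itertools.chain(*sentences))
    vocabulary_inv = [w for w, _ in word_counts.most_common()]
    vocabulary = {w: i for i, w in enumerate(vocabulary_inv)}
    return vocabulary, vocabulary_inv


def build_input_data_sketch(sentences, vocabulary):
    # unknown words fall back to index 0 in this sketch
    return np.array([[vocabulary.get(w, 0) for w in s] for s in sentences])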
def prepare_sentences(sentences, vocabulary, max_length):
    print(sentences)
    sentences_processed = process_sentences(sentences)
    sentences_padded, _ = pad_sentences(sentences_processed, sequence_length=max_length)
    x, _ = build_input_data(sentences_padded, 0, vocabulary)
    return x
def preencode(df):
    sentences = make_text_matrix(df)
    s = [x.split() for x in sentences['text'].values]
    l = sentences['target'].values
    sentences_padded = pad_sentences(s)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, l, vocabulary)
    return x, y, vocabulary, vocabulary_inv
def preprocess(model):
    import sys

    dict_sentences = {}
    reverse_dict = {}
    match_dictionary = {}
    pair_list = []
    i = 0
    k = 0
    maxlen = 0
    # this reads in one line at a time from stdin
    for line in sys.stdin:
        i += 1
        tokens = line.split("\t")
        sent1 = tokens[0]
        sent2 = tokens[1]
        if clean_sent_cond(sent1) or clean_sent_cond(sent2):
            continue
        else:
            k += 1
            if sent1 not in dict_sentences:
                dict_sentences[sent1] = len(dict_sentences) + 1
            if sent2 not in dict_sentences:
                dict_sentences[sent2] = len(dict_sentences) + 1
            index_1 = dict_sentences[sent1]
            index_2 = dict_sentences[sent2]
            if index_1 not in match_dictionary:
                match_dictionary[index_1] = []
            if index_2 not in match_dictionary:
                match_dictionary[index_2] = []
            match_dictionary[index_1].append(index_2)
            match_dictionary[index_2].append(index_1)
            pair_list.append((index_1, index_2))
        if i % 10000 == 0:
            print(str(k) + "/" + str(i))
        if k == 500000:
            break

    i = 0
    for entry in dict_sentences:
        # wrap filter() in list() so the filtered tokens are materialized (filter is lazy in Python 3)
        simple_sent1 = list(filter(lambda x: len(x) > 1, data_helpers.clean_str(entry).split(" ")))
        sent1 = data_helpers.build_input_data(
            data_helpers.pad_sentences([simple_sent1], 40, padding_word="<PAD/>"),
            model.vocab)
        reverse_dict[dict_sentences[entry]] = sent1
        if i % 10000 == 0:
            print(i)
        i += 1

    random.shuffle(pair_list)
    pickle.dump(reverse_dict, open("sentences_small_x", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
    print("writing sentences " + str(len(reverse_dict)))
    pickle.dump(match_dictionary, open("pairs_index_small_x", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
    print("writing map " + str(len(match_dictionary)))
    pickle.dump(pair_list, open("pairs_list_small_x", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
    print("pairs " + str(len(pair_list)))
def __init__(self, positive_file=real_T_file_, negative_file=real_U_file_, fold=FOLD, is_test_data=False):
    # change real_U_file to fake_U_file
    super(Data, self).__init__()
    self.fold = fold

    ###### BEGIN #####
    # load data
    (trustworthy_reviews_for_training, trustworthy_reviews_for_testing,
     untrustworthy_reviews_for_training, untrustworthy_reviews_for_testing) = load_data(fold=fold)
    train = trustworthy_reviews_for_training + untrustworthy_reviews_for_training
    test = trustworthy_reviews_for_testing + untrustworthy_reviews_for_testing

    # generate labels
    train_labels1 = [[1, 0] for _ in range(len(trustworthy_reviews_for_training))]
    train_labels0 = [[0, 1] for _ in range(len(untrustworthy_reviews_for_training))]
    test_labels1 = [[1, 0] for _ in range(len(trustworthy_reviews_for_testing))]
    test_labels0 = [[0, 1] for _ in range(len(untrustworthy_reviews_for_testing))]
    train_labels = np.array(train_labels1 + train_labels0)
    test_labels = np.array(test_labels1 + test_labels0)

    # convert word2idx
    vocabulary, vocabulary_inv = data_helpers.build_vocab(train + test, vocab_size=30001)
    train = torch.as_tensor(data_helpers.build_input_data(train, vocabulary))
    test = torch.as_tensor(data_helpers.build_input_data(test, vocabulary))

    # the original asserted on (always truthy) tuples; compare the sizes instead
    assert train.shape[0] == train_labels.shape[0]
    assert test.shape[0] == test_labels.shape[0]

    if not is_test_data:
        self.data = train
        self.labels = train_labels
    else:
        self.data = test
        self.labels = test_labels
def preprocess(self, line):
    if line is None:
        return [[0]]
    if self.which_data == 'amazon':
        line = html.unescape(line)
        line = ' '.join(tkn(multi_occur_regex.sub('', line))).lower()
        sentences = [tdh.clean_str(line).split(" ")]
        return tdh.build_input_data(sentences, self.vocabulary)
    elif self.which_data == 'eurlex':
        pass
    elif self.which_data == 'tweets':
        line = ' '.join(tweet_tokenizer.tokenize(line))
        sentences = [tdh.clean_str(line).split(" ")]
        return tdh.build_input_data(sentences, self.vocabulary)
    return [[0]]
def __init__(self, keys=[]):
    # change real_U_file to fake_U_file
    super(ReviewDataset, self).__init__()
    trustworthy_reviews = get_reviews(review_type="trustworthy", keys=keys)
    # the original also passed review_type="trustworthy" here, which looks like a copy-paste bug
    untrustworthy_reviews = get_reviews(review_type="untrustworthy", keys=keys)
    reviews = trustworthy_reviews + untrustworthy_reviews

    # convert word2idx; the vocabulary must exist before build_input_data is called
    # (the original fetched it only after the conversion)
    vocabulary = get_vocab()
    self.data = torch.as_tensor(data_helpers.build_input_data(reviews, vocabulary))

    # generate labels
    labels_trustworthy = [[1, 0] for _ in range(len(trustworthy_reviews))]
    labels_untrustworthy = [[0, 1] for _ in range(len(untrustworthy_reviews))]
    self.labels = np.array(labels_trustworthy + labels_untrustworthy)

    print("data len: ", self.data.shape[0])
    print("labels len: ", self.labels.shape[0])
def __init__(self, train_keys=[], test_keys=[]):
    # change real_U_file to fake_U_file
    super(MixedDomainDataset, self).__init__()
    trustworthy_reviews, untrustworthy_reviews = load_data()
    reviews = trustworthy_reviews + untrustworthy_reviews

    # generate labels
    labels_trustworthy = [[1, 0] for _ in range(len(trustworthy_reviews))]
    labels_untrustworthy = [[0, 1] for _ in range(len(untrustworthy_reviews))]
    self.labels = np.array(labels_trustworthy + labels_untrustworthy)

    # convert word2idx
    vocabulary, vocabulary_inv = data_helpers.build_vocab(
        trustworthy_reviews + untrustworthy_reviews, vocab_size=30001)
    self.data = torch.as_tensor(data_helpers.build_input_data(reviews, vocabulary))

    print("data len: ", self.data.shape[0])
    print("labels len: ", self.labels.shape[0])
# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
x_eval = data_helpers.load_test_data(FLAGS.test_data_file)

# Pad sentences
sentences_padded_all, max_length = data_helpers.pad_sentences(x_text + x_eval)
sentences_padded, max_length = data_helpers.pad_sentences(x_text, max_length)

# Build vocabulary
vocabulary, vocabulary_inv = data_helpers.build_vocab(sentences_padded_all)
x, y = data_helpers.build_input_data(sentences_padded, y, vocabulary)

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

print("Vocabulary Size: {:d}".format(len(vocabulary)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

# Training
test_tweet_list = [{
    'text': ' '.join(row[4]),
    'catgy': row[5]
} for row in test_semeval.itertuples()]
val_tweet_list = [{
    'text': ' '.join(row[4]),
    'catgy': row[5]
} for row in val_semeval.itertuples()]

trn_sents, Y_trn = tdh.load_data_and_labels(train_tweet_list, num_labels)
tst_sents, Y_tst = tdh.load_data_and_labels(test_tweet_list, num_labels)
val_sents, Y_val = tdh.load_data_and_labels(val_tweet_list, num_labels)

embedding_file = '/home/cse/phd/csz178057/scratch/squad/data/glove.6B.300d.txt'
vocabs = tdh.get_vocabs_embeddings(trn_sents, embedding_file, num_features=300)
labels_inv = list(labels_map.emoji_desc)
vocabs['labels_inv'] = labels_inv

print('\n'.join([
    '{},{}'.format(x[0], x[1])
    for x in vocabs['word_counts'].most_common(None)
]), file=open(word_count_txt_file, 'w'))

X_trn = tdh.build_input_data(trn_sents, vocabs['vocabulary'])
X_tst = tdh.build_input_data(tst_sents, vocabs['vocabulary'])
X_val = tdh.build_input_data(val_sents, vocabs['vocabulary'])

pickle.dump({'x': X_trn, 'y': Y_trn}, open(train_output_file, 'wb'))
pickle.dump({'x': X_tst, 'y': Y_tst}, open(test_output_file, 'wb'))
pickle.dump({'x': X_val, 'y': Y_val}, open(val_output_file, 'wb'))
pickle.dump(vocabs, open(vocab_output_file, 'wb'))
def main(argv):
    parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='')
    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)
    ctx_len = int(params['context_length'])

    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading training instances ... ')
    transcript_contexts = []
    for call in trainset:
        for i, (log_utter, translations, label_utter) in enumerate(call):
            transcript = data_helpers.tokenize_and_lower(log_utter['transcript'])
            transcript_contexts += [transcript]
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            # train_utters += [(transcript_contexts[max(0, i+1-ctx_len):i+1], log_utter['speaker'], sa_label_list, log_utter['utter_index'])]
            train_utters += [(transcript, log_utter['speaker'], sa_label_list, log_utter['utter_index'])]
    sys.stderr.write('Done\n')

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading testing instances ... ')
    transcript_contexts = []
    for call in testset:
        for i, (log_utter, translations, label_utter) in enumerate(call):
            try:
                translation = data_helpers.tokenize_and_lower(translations['translated'][0]['hyp'])
            except:
                translation = ''
            transcript_contexts += [translation]
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            # test_utters += [(transcript_contexts[max(0, i+1-ctx_len):i+1], log_utter['speaker'], sa_label_list, log_utter['utter_index'])]
            test_utters += [(translation, log_utter['speaker'], sa_label_list, log_utter['utter_index'])]

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])
    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary
    utters = [utter[0].split(' ') for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    utters = [utter[0].split(' ') for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # make windowed input data as context
    train_inputs = data_helpers.build_windowed_input(train_inputs, ctx_len)
    test_inputs = data_helpers.build_windowed_input(test_inputs, ctx_len)

    # build labels
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)
    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # split speakers into two sets
    tourist_train_indices = [i for i, utter in enumerate(train_utters) if utter[1].lower() == 'tourist']
    guide_train_indices = [i for i, utter in enumerate(train_utters) if utter[1].lower() == 'guide']
    tourist_test_indices = [i for i, utter in enumerate(test_utters) if utter[1].lower() == 'tourist']
    guide_test_indices = [i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide']

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]
    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]
    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]
    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_sequence_task(embedding_matrix, vocabulary, label_binarizer,
                          tourist_train_inputs, tourist_train_labels,
                          tourist_test_inputs, tourist_test_labels)
    run_slu_sequence_task(embedding_matrix, vocabulary, label_binarizer,
                          guide_train_inputs, guide_train_labels,
                          guide_test_inputs, guide_test_labels)
    print("")
def main(argv):
    parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='')
    parser.add_argument('--roletype', dest='roletype', action='store', choices=['guide', 'tourist'], required=True, help='speaker')
    args = parser.parse_args()

    threshold_predictor = None

    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                continue
            transcript = data_helpers.tokenize_and_lower(log_utter['transcript'])
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            train_utters += [(transcript, log_utter['speaker'], sa_label_list)]
    sys.stderr.write('Done\n')

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        for (log_utter, translations, label_utter) in call:
            if log_utter['speaker'].lower() != args.roletype:
                continue
            try:
                translation = data_helpers.tokenize_and_lower(translations['translated'][0]['hyp'])
            except:
                translation = ''
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))
            test_utters += [(translation, log_utter['speaker'], sa_label_list)]

    pprint(train_utters[:2])
    pprint(test_utters[:2])

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)
    num_epochs = int(params['num_epochs'])
    validation_split = float(params['validation_split'])
    batch_size = int(params['batch_size'])
    multilabel = params['multilabel'] == "true"

    # build vocabulary
    sents = [utter[0].split(' ') for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_sents = data_helpers.pad_sentences(sents, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_sents)
    print("vocabulary size: %d" % len(vocabulary))
    # params['max_sent_len'] = max_sent_len

    # build inputs
    train_inputs = data_helpers.build_input_data(pad_sents, vocabulary)
    test_sents = [utter[0].split(' ') for utter in test_utters]
    test_pad_sents = data_helpers.pad_sentences(test_sents, max_sent_len)
    test_inputs = data_helpers.build_input_data(test_pad_sents, vocabulary)

    # build labels
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)
    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # split and shuffle data
    indices = np.arange(train_inputs.shape[0])
    np.random.shuffle(indices)
    train_inputs = train_inputs[indices]
    train_labels = train_labels[indices]
    num_validation = int(validation_split * train_inputs.shape[0])
    # x_train = train_inputs[:-num_validation]
    # y_train = train_labels[:-num_validation]
    # x_val = train_inputs[-num_validation:]
    # y_val = train_labels[-num_validation:]
    x_train = train_inputs
    y_train = train_labels
    x_test = test_inputs
    y_test = test_labels

    # construct a pytorch data_loader
    x_train = torch.from_numpy(x_train).long()
    y_train = torch.from_numpy(y_train).float()
    dataset_tensor = data_utils.TensorDataset(x_train, y_train)
    train_loader = data_utils.DataLoader(dataset_tensor, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=False)

    x_test = torch.from_numpy(x_test).long()
    y_test = torch.from_numpy(y_test).long()
    dataset_tensor = data_utils.TensorDataset(x_test, y_test)
    test_loader = data_utils.DataLoader(dataset_tensor, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=False)

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    # load model
    model = SluConvNet(params, embedding_matrix, len(vocabulary), y_train.shape[1])
    if torch.cuda.is_available():
        model = model.cuda()

    learning_rate = float(params['learning_rate'])
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.MultiLabelSoftMarginLoss()
    # loss_fn = nn.BCEWithLogitsLoss()

    for epoch in range(num_epochs):
        model.train()  # set the model to training mode (apply dropout etc)
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = autograd.Variable(inputs), autograd.Variable(labels)
            if torch.cuda.is_available():
                inputs, labels = inputs.cuda(), labels.cuda()

            preds = model(inputs)
            if torch.cuda.is_available():
                preds = preds.cuda()

            loss = loss_fn(preds, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % 100 == 0:
                print("current loss: %.4f" % loss)

        model.eval()  # set the model to evaluation mode
        # if threshold_predictor is None:
        threshold_predictor = train_threshold(model, train_loader, y_train.numpy())
        # count_predictor = train_count(model, train_loader, y_train.numpy())
        true_acts, pred_acts, metrics = evaluate(model, label_binarizer, test_loader, y_test, multilabel, threshold_predictor)
        # true_acts, pred_acts, metrics = evaluate_count(model, label_binarizer, test_loader, y_test, multilabel, count_predictor)
        print("Precision: %.4f\tRecall: %.4f\tF1-score: %.4f\n" % (metrics[0], metrics[1], metrics[2]))

    # end of training
    true_acts, pred_acts, metrics = evaluate(model, label_binarizer, test_loader, y_test, multilabel)
    print("Precision: %.4f\tRecall: %.4f\tF1-score: %.4f\n" % (metrics[0], metrics[1], metrics[2]))

    with open(("pred_result_%s.txt" % args.roletype), "w") as f:
        for pred_act, true_act in zip(pred_acts, true_acts):
            f.write("pred: %s\ntrue: %s\n\n" % (', '.join(pred_act), ', '.join(true_act)))
def main(argv):
    parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='')
    parser.add_argument('--devset', dest='devset', action='store', metavar='DEVSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='')
    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True)
    devset = dataset_walker.dataset_walker(args.devset, dataroot=args.dataroot, labels=True, translations=True)
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True)
    train_utters, dev_utters, test_utters = data_helpers.load_dstc5_dataset(trainset, devset, testset)
    train_utters += dev_utters

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])
    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary
    utters = [[char for char in utter[0]] for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    utters = [[char for char in utter[0]] for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # build labels
    sa_train_labels = [utter[2] for utter in train_utters]
    sa_test_labels = [utter[2] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)
    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)

    # split speakers into two sets
    tourist_train_indices = [i for i, utter in enumerate(train_utters) if utter[1].lower() == 'tourist']
    guide_train_indices = [i for i, utter in enumerate(train_utters) if utter[1].lower() == 'guide']
    tourist_test_indices = [i for i, utter in enumerate(test_utters) if utter[1].lower() == 'tourist']
    guide_test_indices = [i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide']

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]
    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]
    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]
    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 tourist_train_inputs, tourist_train_labels,
                 tourist_test_inputs, tourist_test_labels)
    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 guide_train_inputs, guide_train_labels,
                 guide_test_inputs, guide_test_labels)
    print("")
def main(argv):
    parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='')
    parser.add_argument('--devset', dest='devset', action='store', metavar='DEVSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='')
    args = parser.parse_args()

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True)
    devset = dataset_walker.dataset_walker(args.devset, dataroot=args.dataroot, labels=True, translations=True)
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True)
    train_utters, dev_utters, test_utters = data_helpers.load_dstc5_dataset_multitask(trainset, devset, testset)
    train_utters += dev_utters

    context_case = 1
    # TODO: build the previous-labels context here, one of:
    # 1) the previous N speech acts (regardless of speaker)
    # 2) all speech acts of the other speaker's utterances in the previous turn (n of them)
    if context_case == 1:
        pass
    else:
        pass

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])
    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # build vocabulary
    utters = [[char for char in utter[0]] for utter in train_utters]
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    utters = [[char for char in utter[0]] for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)

    # build labels
    train_labels_category = [utter[3] for utter in train_utters]
    test_labels_category = [utter[3] for utter in test_utters]
    train_labels_attr = [utter[4] for utter in train_utters]
    test_labels_attr = [utter[4] for utter in test_utters]
    train_labels_sa = [utter[5] for utter in train_utters]
    test_labels_sa = [utter[5] for utter in test_utters]

    label_binarizer_category = preprocessing.MultiLabelBinarizer()
    label_binarizer_category.fit(train_labels_category + test_labels_category)
    label_binarizer_attr = preprocessing.MultiLabelBinarizer()
    label_binarizer_attr.fit(train_labels_attr + test_labels_attr)
    label_binarizer_sa = preprocessing.MultiLabelBinarizer()
    label_binarizer_sa.fit(train_labels_sa + test_labels_sa)

    train_labels_category = label_binarizer_category.transform(train_labels_category)
    test_labels_category = label_binarizer_category.transform(test_labels_category)
    train_labels_attr = label_binarizer_attr.transform(train_labels_attr)
    test_labels_attr = label_binarizer_attr.transform(test_labels_attr)
    train_labels_sa = label_binarizer_sa.transform(train_labels_sa)
    test_labels_sa = label_binarizer_sa.transform(test_labels_sa)

    # split speakers into two sets
    tourist_train_indices = [i for i, utter in enumerate(train_utters) if utter[1].lower() == 'tourist']
    guide_train_indices = [i for i, utter in enumerate(train_utters) if utter[1].lower() == 'guide']
    tourist_test_indices = [i for i, utter in enumerate(test_utters) if utter[1].lower() == 'tourist']
    guide_test_indices = [i for i, utter in enumerate(test_utters) if utter[1].lower() == 'guide']

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)
    # np.random.shuffle(tourist_test_indices)
    # np.random.shuffle(guide_test_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_labels_category = train_labels_category[tourist_train_indices]
    tourist_train_labels_attr = train_labels_attr[tourist_train_indices]
    tourist_train_labels_sa = train_labels_sa[tourist_train_indices]
    tourist_train_labels = (tourist_train_labels_category, tourist_train_labels_attr, tourist_train_labels_sa)

    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_labels_category = train_labels_category[guide_train_indices]
    guide_train_labels_attr = train_labels_attr[guide_train_indices]
    guide_train_labels_sa = train_labels_sa[guide_train_indices]
    guide_train_labels = (guide_train_labels_category, guide_train_labels_attr, guide_train_labels_sa)

    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_labels_category = test_labels_category[tourist_test_indices]
    tourist_test_labels_attr = test_labels_attr[tourist_test_indices]
    tourist_test_labels_sa = test_labels_sa[tourist_test_indices]
    tourist_test_labels = (tourist_test_labels_category, tourist_test_labels_attr, tourist_test_labels_sa)

    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_labels_category = test_labels_category[guide_test_indices]
    guide_test_labels_attr = test_labels_attr[guide_test_indices]
    guide_test_labels_sa = test_labels_sa[guide_test_indices]
    guide_test_labels = (guide_test_labels_category, guide_test_labels_attr, guide_test_labels_sa)

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_task(embedding_matrix, vocabulary, label_binarizer_sa,
                 tourist_train_inputs, tourist_train_labels,
                 tourist_test_inputs, tourist_test_labels)
    run_slu_task(embedding_matrix, vocabulary, label_binarizer_sa,
                 guide_train_inputs, guide_train_labels,
                 guide_test_inputs, guide_test_labels)
def main():
    global cuda
    cuda = torch.cuda.is_available()
    if cuda:
        train_sequence.cuda = cuda
        sequence_tagger.cuda = cuda
        utils.cuda = cuda
        train_sequence_crafted.cuda = cuda

    if args.crafted:
        this_train_sequence = train_sequence_crafted
    else:
        this_train_sequence = train_sequence

    utils.log('start reading ner file ')
    (token_list, tag_list, raw_token_list) = utils.prepare_data(args.input, True)
    vocabs = pickle.load(open(args.vocab_path, 'rb'))
    y = list(map(lambda x: np.array(list(map(lambda y: vocabs['y_dict'][y], x))), tag_list))
    x = tdh.build_input_data(token_list, vocabs['vocabulary'])

    # extract crafted features
    train_data = utils.get_data_with_pos_tag(raw_token_list, tag_list)
    features = utils.extract_features(train_data, vocabs['uptl'], vocabs['treatment_suffix'],
                                      vocabs['disease_suffix'], vocabs['dis'])
    ds_data = {'x': x, 'y': y, 'z': features}
    ds = sequence_dataset.sequence_dataset('.', 'test', ds_data,
                                           word_counts=vocabs['word_counts'],
                                           vocabulary_inv=vocabs['vocabulary_inv'],
                                           crafted_features=args.crafted)
    val_loader = DataLoader(ds,
                            batch_sampler=data_samplers.BatchSampler(
                                list(map(lambda x: min(999999, len(x[0])), ds)), 256, shuffle=False),
                            num_workers=4)
    vocab_size = ds.vocab_size
    embedding_init = vocabs['embedding_init']
    embedding_init = embedding_init[:vocab_size]

    if args.model == 'bilstm':
        if args.crafted:
            model = sequence_tagger.BilstmSequenceTaggerCraftedFeatures(
                len(vocabs['y_dict']), vocab_size,
                embedding_size=embedding_init.shape[1],
                hidden_size=args.hidden_size,
                intermediate_size=args.intermediate_size,
                embedding_init=embedding_init,
                crafted_features_size=args.num_crafted)
            criterion = nn.CrossEntropyLoss()
            if cuda:
                criterion.cuda()
            my_loss_fn = lambda x, y, z, m: utils.std_loss_fn_crafted(x, y, z, m, criterion)
        else:
            model = sequence_tagger.BilstmSequenceTagger(
                len(vocabs['y_dict']), vocab_size,
                embedding_size=embedding_init.shape[1],
                hidden_size=args.hidden_size,
                intermediate_size=args.intermediate_size,
                embedding_init=embedding_init)
            criterion = nn.CrossEntropyLoss()
            if cuda:
                criterion.cuda()
            my_loss_fn = lambda x, y, m: utils.std_loss_fn(x, y, m, criterion)
    else:
        model = sequence_tagger.BilstmCRFSequenceTagger(
            len(vocabs['y_dict']), vocab_size,
            embedding_size=embedding_init.shape[1],
            hidden_size=args.hidden_size,
            intermediate_size=args.intermediate_size,
            embedding_init=embedding_init)
        my_loss_fn = utils.lstm_crf_neg_log_likelihood_loss1

    checkpoint = torch.load(args.checkpoint)
    model.load_state_dict(checkpoint['model'])
    rec, i, all_pred = this_train_sequence.compute_sequence(-1, model, my_loss_fn, val_loader,
                                                            None, 'eval', None, None, [],
                                                            return_preds=True)
    utils.write_output(all_pred, raw_token_list, vocabs['y_dict_inv'], args.output)
    model_name, num_features)

#### add titles to existing pickle files
test_title_file = '../data/amazon13k/test_titles.txt'
train_title_file = '../data/amazon13k/train_titles.txt'
vocabs = pickle.load(open(vocab_output_file, 'rb'))
train_data = pickle.load(open(train_output_file, 'rb'))
test_data = pickle.load(open(test_output_file, 'rb'))

train_titles = [x[:-1] for x in open(train_title_file, 'r').readlines()]
train_titles_sents = [tdh.clean_str(x).split() for x in train_titles]
test_titles = [x[:-1] for x in open(test_title_file, 'r').readlines()]
test_titles_sents = [tdh.clean_str(x).split() for x in test_titles]

X_trn_titles = tdh.build_input_data(train_titles_sents, vocabs['vocabulary'])
X_tst_titles = tdh.build_input_data(test_titles_sents, vocabs['vocabulary'])
train_data['titles'] = X_trn_titles
test_data['titles'] = X_tst_titles
pickle.dump(train_data, open(train_output_file, 'wb'))
pickle.dump(test_data, open(test_output_file, 'wb'))

#### add labels inv to existing vocabs
vocabs = pickle.load(open(vocab_output_file, 'rb'))
labels_inv = [x[:-1] for x in open(labels_path, 'r', errors='ignore').readlines()]
vocabs['labels_inv'] = labels_inv
pickle.dump(vocabs, open(vocab_output_file, 'wb'))
X_train = list()
X_test = list()
for index in train:
    X_train.append(sentences_padded[index])
for index in test:
    X_test.append(sentences_padded[index])
y_train = y_class[train]
y_test = y_class[test]

# building vocabulary on train set
print('building vocabulary on train set')
vocabulary, vocabulary_inv = build_vocab(X_train)

# Maps sentences to vectors based on vocabulary
print('Mapping sentences to vectors based on vocabulary')
X_train, y_train = build_input_data(X_train, y_train, vocabulary)
# print(X_train.shape)
X_test, y_test = build_input_data(X_test, y_test, vocabulary)
# all x and y for predicting
x, y_class = build_input_data(sentences_padded, y_class, vocabulary)
# print(X_test.shape)

vocabulary_size = len(vocabulary_inv)

# building embedding matrix using GloVe word embeddings
print('building embedding matrix using GloVe word embeddings')
embedding_matrix = create_embedding_matrix('./dataset/myGloVe200d.txt', vocabulary, embedding_dim)  # this returns a tensor

print("Creating Model...")
inputs = Input(shape=(sequence_length,), dtype='int32')
embedding = Embedding(input_dim=vocabulary_size,
                      output_dim=embedding_dim,
                      weights=[embedding_matrix],
                      input_length=sequence_length)(inputs)
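# create_embedding_matrix is not shown in this snippet. The sketch below is an illustrative
# stand-in only (note the "_sketch" suffix): it reads whitespace-separated GloVe vectors and fills
# one row per word index in `vocabulary`, leaving out-of-vocabulary rows at a small random
# initialisation. It assumes the padding token is already part of `vocabulary`, so the matrix shape
# matches the Embedding layer's input_dim above.
import numpy as np


def create_embedding_matrix_sketch(glove_path, vocabulary, embedding_dim):
    # vocabulary: dict mapping word -> integer row index
    matrix = np.random.uniform(-0.25, 0.25, (len(vocabulary), embedding_dim)).astype('float32')
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word, values = parts[0], parts[1:]
            if word in vocabulary and len(values) == embedding_dim:
                matrix[vocabulary[word]] = np.asarray(values, dtype='float32')
    return matrix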
def main(argv):
    parser = argparse.ArgumentParser(description='CNN baseline for DSTC5 SAP Task')
    parser.add_argument('--trainset', dest='trainset', action='store', metavar='TRAINSET', required=True, help='')
    parser.add_argument('--testset', dest='testset', action='store', metavar='TESTSET', required=True, help='')
    parser.add_argument('--dataroot', dest='dataroot', action='store', required=True, metavar='PATH', help='')
    args = parser.parse_args()

    train_utters = []
    trainset = dataset_walker.dataset_walker(args.trainset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading training instances ... ')
    for call in trainset:
        context_utters = []
        context_utter_str = '<PAD/>'
        context_labels = []
        context_label = ['INI_OPENING']
        last_speaker = None
        for (log_utter, translations, label_utter) in call:
            transcript = data_helpers.tokenize_and_lower(log_utter['transcript'])
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))

            if last_speaker is not None and log_utter['speaker'] != last_speaker:
                if len(context_utters) > 0:
                    context_utter_str = ' <pause> '.join(context_utters)
                    context_label = context_labels[-1]
                else:
                    context_utter_str = '<PAD/>'
                    context_label = ['INI_OPENING']
                context_utters = []
                context_labels = []
                last_speaker = None

            if last_speaker is None or log_utter['speaker'] == last_speaker:
                context_utters += [transcript]  # cumulate context utters
                context_labels += [sa_label_list]

            last_speaker = log_utter['speaker']

            train_utters += [(transcript, context_utter_str, log_utter['speaker'], sa_label_list,
                              log_utter['utter_index'], context_label)]
            # train_utters += [(transcript, context_utter_str, log_utter['speaker'], sa_label_list, log_utter['utter_index'], sa_label_list)]
    sys.stderr.write('Done\n')

    test_utters = []
    testset = dataset_walker.dataset_walker(args.testset, dataroot=args.dataroot, labels=True, translations=True)
    sys.stderr.write('Loading testing instances ... ')
    for call in testset:
        context_utters = []
        context_utter_str = '<PAD/>'
        context_labels = []
        context_label = ['INI_OPENING']
        last_speaker = None
        for (log_utter, translations, label_utter) in call:
            try:
                translation = data_helpers.tokenize_and_lower(translations['translated'][0]['hyp'])
            except:
                translation = ''
            speech_act = label_utter['speech_act']
            sa_label_list = []
            for sa in speech_act:
                sa_label_list += ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list = sorted(set(sa_label_list))

            if last_speaker is not None and log_utter['speaker'] != last_speaker:
                if len(context_utters) > 0:
                    context_utter_str = ' <pause> '.join(context_utters)
                    context_label = context_labels[-1]
                else:
                    context_utter_str = ''
                    context_label = ['INI_OPENING']
                context_utters = []
                context_labels = []
                last_speaker = None

            if last_speaker is None or log_utter['speaker'] == last_speaker:
                context_utters += [translation]  # cumulate context utters
                context_labels += [sa_label_list]

            last_speaker = log_utter['speaker']

            test_utters += [(translation, context_utter_str, log_utter['speaker'], sa_label_list,
                             log_utter['utter_index'], context_label)]
            # test_utters += [(translation, context_utter_str, log_utter['speaker'], sa_label_list, log_utter['utter_index'], sa_label_list)]

    # pprint(train_utters[:2])
    # pprint(test_utters[:2])
    # dump_corpus(train_utters, "dstc5_train.txt")
    # dump_corpus(test_utters, "dstc5_test.txt")

    # load parameters
    params = data_helpers.load_params("parameters/cnn.txt")
    pprint(params)

    # build vocabulary
    utters = [utter[0].split(' ') for utter in train_utters]
    ctx_utters = [utter[1].split(' ') for utter in train_utters]
    print("max context utter length: %d " % max([len(ctx_utter) for ctx_utter in ctx_utters]))
    max_sent_len = int(params['max_sent_len'])
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    pad_ctx_utters = data_helpers.pad_sentences(ctx_utters, max_sent_len)
    vocabulary, inv_vocabulary = data_helpers.build_vocab(pad_ctx_utters)
    print("vocabulary size: %d" % len(vocabulary))

    # build input
    train_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    train_ctx_inputs = data_helpers.build_input_data(pad_ctx_utters, vocabulary)
    utters = [utter[0].split(' ') for utter in test_utters]
    ctx_utters = [utter[1].split(' ') for utter in test_utters]
    pad_utters = data_helpers.pad_sentences(utters, max_sent_len)
    pad_ctx_utters = data_helpers.pad_sentences(ctx_utters, max_sent_len)
    test_inputs = data_helpers.build_input_data(pad_utters, vocabulary)
    test_ctx_inputs = data_helpers.build_input_data(pad_ctx_utters, vocabulary)

    # build labels
    sa_train_labels = [utter[3] for utter in train_utters]
    sa_test_labels = [utter[3] for utter in test_utters]
    sa_train_ctx_labels = [utter[5] for utter in train_utters]
    sa_test_ctx_labels = [utter[5] for utter in test_utters]
    label_binarizer = preprocessing.MultiLabelBinarizer()
    label_binarizer.fit(sa_train_labels + sa_test_labels)
    train_labels = label_binarizer.transform(sa_train_labels)
    test_labels = label_binarizer.transform(sa_test_labels)
    train_ctx_labels = label_binarizer.transform(sa_train_ctx_labels)
    test_ctx_labels = label_binarizer.transform(sa_test_ctx_labels)

    # split speakers into two sets
    tourist_train_indices = [i for i, utter in enumerate(train_utters) if utter[2].lower() == 'tourist']
    guide_train_indices = [i for i, utter in enumerate(train_utters) if utter[2].lower() == 'guide']
    tourist_test_indices = [i for i, utter in enumerate(test_utters) if utter[2].lower() == 'tourist']
    guide_test_indices = [i for i, utter in enumerate(test_utters) if utter[2].lower() == 'guide']

    np.random.shuffle(tourist_train_indices)
    np.random.shuffle(guide_train_indices)

    tourist_train_inputs = train_inputs[tourist_train_indices]
    tourist_train_ctx_inputs = train_ctx_inputs[tourist_train_indices]
    tourist_train_labels = train_labels[tourist_train_indices]
    tourist_train_ctx_labels = train_ctx_labels[tourist_train_indices]
    guide_train_inputs = train_inputs[guide_train_indices]
    guide_train_ctx_inputs = train_ctx_inputs[guide_train_indices]
    guide_train_labels = train_labels[guide_train_indices]
    guide_train_ctx_labels = train_ctx_labels[guide_train_indices]
    tourist_test_inputs = test_inputs[tourist_test_indices]
    tourist_test_ctx_inputs = test_ctx_inputs[tourist_test_indices]
    tourist_test_labels = test_labels[tourist_test_indices]
    tourist_test_ctx_labels = test_ctx_labels[tourist_test_indices]
    guide_test_inputs = test_inputs[guide_test_indices]
    guide_test_ctx_inputs = test_ctx_inputs[guide_test_indices]
    guide_test_labels = test_labels[guide_test_indices]
    guide_test_ctx_labels = test_ctx_labels[guide_test_indices]

    # load pre-trained word embeddings
    embedding_dim = int(params['embedding_dim'])
    embedding_matrix = data_helpers.load_embedding(vocabulary, embedding_dim=embedding_dim, embedding=params['embedding'])

    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 tourist_train_inputs, tourist_train_ctx_inputs, tourist_train_labels, tourist_train_ctx_labels,
                 tourist_test_inputs, tourist_test_ctx_inputs, tourist_test_labels, tourist_test_ctx_labels)
    run_slu_task(embedding_matrix, vocabulary, label_binarizer,
                 guide_train_inputs, guide_train_ctx_inputs, guide_train_labels, guide_train_ctx_labels,
                 guide_test_inputs, guide_test_ctx_inputs, guide_test_labels, guide_test_ctx_labels)
    print("")
# load word file
x_text_train_arg1_word, x_text_train_arg2_word, y_train = data_helpers.load_word_pkl('train_word_file')
x_text_dev_arg1_word, x_text_dev_arg2_word, y_dev = data_helpers.load_word_pkl('dev_word_file')

pos_vocab, pos_embd = data_helpers.build_pos_vocab_embd('fourway_data_wu_NOTUNK/pos_list.pkl')
word_vocab, word_embd = data_helpers.build_word_vocab_embd('fourway_data_wu_NOTUNK/word_list.pkl')
# file_pkl = open("./pos_vocab_embd_word_vocab_embd.pkl", "wb")
# pickle.dump([pos_vocab, pos_embd, word_vocab, word_embd], file_pkl)
# file_pkl.close()

x_train_arg1_pos = data_helpers.build_input_data(x_text_train_arg1_pos, pos_vocab, FLAGS.max_document_length)
x_train_arg2_pos = data_helpers.build_input_data(x_text_train_arg2_pos, pos_vocab, FLAGS.max_document_length)
x_dev_arg1_pos = data_helpers.build_input_data(x_text_dev_arg1_pos, pos_vocab, FLAGS.max_document_length)
x_dev_arg2_pos = data_helpers.build_input_data(x_text_dev_arg2_pos, pos_vocab, FLAGS.max_document_length)

x_train_arg1_word = data_helpers.build_input_data(x_text_train_arg1_word, word_vocab, FLAGS.max_document_length)
x_train_arg2_word = data_helpers.build_input_data(x_text_train_arg2_word, word_vocab, FLAGS.max_document_length)
x_dev_arg1_word = data_helpers.build_input_data(x_text_dev_arg1_word, word_vocab, FLAGS.max_document_length)
x_dev_arg2_word = data_helpers.build_input_data(x_text_dev_arg2_word, word_vocab, FLAGS.max_document_length)

with open("./vocab_embd.txt", "w", encoding="utf-8") as write_object:
    for v in word_vocab:
        write_object.write(str(v) + "\n")

with open("./x_train_arg1.txt", "w") as write_object:
import os
import pickle

import numpy as np

import data_helpers as tdh
import utils

train_file = '../data/ner.txt'
train_output_file = '../data/train1.pkl'
vocab_output_file = '../data/vocab1.pkl'
word_count_txt_file = '../data/vocab_freq.txt'
num_features = 300
model_name = os.path.join(
    '/home/yatin/phd/nlp/project/xmlcnn/theano_code/word2vec_models/',
    'glove.6B.%dd.txt' % (num_features))
# model_name = os.path.join('/home/cse/phd/csz178057/scratch/squad/data', 'glove.6B.%dd.txt' % (num_features))
ner_file = train_file

(token_list, tag_list, raw_token_list) = utils.prepare_data(ner_file, False)
tag_dict = {'D': 0, 'T': 1, 'O': 2}
tag_list1 = list(map(lambda x: np.array(list(map(lambda y: tag_dict[y], x))), tag_list))

vocabs = tdh.get_vocabs_embeddings(token_list, model_name, num_features)
vocabs['y_dict'] = tag_dict
vocabs['y_dict_inv'] = dict([(tag_dict[k], k) for k in tag_dict])

x_trn = tdh.build_input_data(token_list, vocabs['vocabulary'])
pickle.dump({'x': x_trn, 'y': tag_list1}, open(train_output_file, 'wb'))
pickle.dump(vocabs, open(vocab_output_file, 'wb'))

# word_counts was undefined in the original; vocabs['word_counts'] is how the same
# counter is accessed elsewhere in this codebase
print('\n'.join(
    ['{},{}'.format(x[0], x[1]) for x in vocabs['word_counts'].most_common(None)]),
    file=open(word_count_txt_file, 'w'))
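# get_vocabs_embeddings is another data_helpers routine that never appears in this listing. Judging
# only by how its result is indexed above and in the tagging script earlier ('vocabulary',
# 'vocabulary_inv', 'word_counts', 'embedding_init'), an illustrative stand-in could look like the
# sketch below; the real helper presumably also fills embedding_init from the embedding file
# (e.g. the way the create_embedding_matrix sketch earlier does), which is omitted here.
import itertools
from collections import Counter

import numpy as np


def get_vocabs_embeddings_sketch(sentences, embedding_file, num_features=300):
    # sentences: list of token lists
    word_counts = Counter(itertools.chain(*sentences))
    vocabulary_inv = [w for w, _ in word_counts.most_common()]
    vocabulary = {w: i for i, w in enumerate(vocabulary_inv)}
    # placeholder random init; the real helper would overwrite rows with vectors
    # read from embedding_file
    embedding_init = np.random.uniform(-0.25, 0.25, (len(vocabulary), num_features)).astype('float32')
    return {
        'vocabulary': vocabulary,
        'vocabulary_inv': vocabulary_inv,
        'word_counts': word_counts,
        'embedding_init': embedding_init,
    }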
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

x_raw = data_helpers.load_test_data(
    '/Users/Winnerineast/Documents/haodaifu/NewData/tobetrained.csv')

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocabulary, vocabulary_inv, max_length = data_helpers.restore_vocabulary(vocab_path)
sentences_padded, tmp_length = data_helpers.pad_sentences(x_raw, max_length)
x_test, y_test = data_helpers.build_input_data(sentences_padded, None, vocabulary)

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))