Example #1
File: dataset.py  Project: yycsu/qa_match
    def __init__(self, args):
        super().__init__()

        train_file = args.train_file
        vocab_file = args.vocab_file

        train_sens = data_utils.load_sentences(train_file, skip_invalid=True)
        word2id, id2word, label2id, id2label = data_utils.load_vocab(
            train_sens, vocab_file)

        data_utils.gen_ids(train_sens, word2id, label2id, 100)
        train_full_tensors = data_utils.make_full_tensors(train_sens)

        raw_x = train_full_tensors[0]
        x_length = train_full_tensors[1]
        x_labels = train_full_tensors[2]

        raw_f = lambda t: id2label[t]
        x_labels_true = np.array(list(map(raw_f, x_labels)))

        n_train = int(len(raw_x) * 1)  # keep every sample in the train split; the test split below is empty
        self.train_x, self.test_x = raw_x[:n_train], raw_x[n_train:]
        self.train_length_x, self.test_length_x = x_length[:n_train], x_length[
            n_train:]
        self.train_y, self.test_y = x_labels[:n_train], x_labels[n_train:]
        self.gt_label = x_labels_true
        self.raw_q = ["".join(i.raw_tokens) for i in train_sens]
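A minimal usage sketch (not part of the project source) of the constructor above: it only reads `args.train_file` and `args.vocab_file`, so a plain namespace is enough. The class name `ExpDataset` is taken from Example #3 below, and the file paths here are placeholders.

from argparse import Namespace
import dataset  # the dataset.py module shown above

args = Namespace(train_file="./data/train.txt", vocab_file="./vocab")  # placeholder paths
exp_data = dataset.ExpDataset(args)
# the constructor exposes the train/test splits plus the raw question strings
print(len(exp_data.train_x), len(exp_data.raw_q))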
Example #2
def train():
    # load data
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])
                )
            )
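For reference, a tiny synthetic illustration (not from the project) of what the `itertools.chain.from_iterable` expression above produces: each sentence is assumed to be a list of [token, tag] pairs, and the call flattens all test sentences into a single list of token strings.

import itertools

# synthetic sentences in the assumed [token, tag] format
test_sentences = [[["北", "B-LOC"], ["京", "E-LOC"]], [["很", "O"], ["大", "O"]]]
chars = list(itertools.chain.from_iterable([w[0] for w in s] for s in test_sentences))
# chars == ["北", "京", "很", "大"]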
Example #3
File: train.py  Project: yycsu/qa_match
    parser.add_argument("--init_checkpoint", type=str, default='')
    parser.add_argument("--train_file", type=str, default="")
    parser.add_argument("--batch_size", type=int, default=32)
    # path to a customized cluster-centers file; pass either 'external_cluster_center' or 'n_clusters'
    parser.add_argument("--external_cluster_center", type=str, default="")
    # number of clusters (init with kmeans)
    parser.add_argument("--n_clusters", type=int, default=20)
    parser.add_argument("--epochs", type=int, default=50)
    parser.add_argument("--warmup_steps", type=int, default=1000)
    parser.add_argument("--learning_rate", type=float, default=0.01)
    # DEC model q distribution param, alpha=1 in paper
    parser.add_argument("--alpha", type=int, default=1)
    parser.add_argument("--layer_num", type=int, default=1)
    parser.add_argument("--token_num", type=int, default=7820)
    parser.add_argument("--lstm_dim", type=int, default=500)
    parser.add_argument("--embedding_dim", type=int, default=1000)
    parser.add_argument("--vocab_file", type=str, default="./vocab")
    parser.add_argument("--model_save_dir", type=str, default="./saved_model")
    args = parser.parse_args()

    word2id, id2word = data_utils.load_vocab_file(args.vocab_file)
    trainset_size = len(
        data_utils.load_sentences(args.train_file, skip_invalid=True))
    other_arg_dict = {}
    other_arg_dict['token_num'] = len(word2id)
    other_arg_dict['trainset_size'] = trainset_size

    exp_data = dataset.ExpDataset(args)
    dec_model = dec_model.DEC(args, other_arg_dict)
    train(exp_data, dec_model, args)
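The comment on --external_cluster_center says to pass either that path or --n_clusters. A hypothetical sketch (not the project's code; the file format is assumed to be one center per row) of how that choice might be resolved before training:

import numpy as np

if args.external_cluster_center:
    init_centers = np.loadtxt(args.external_cluster_center)  # assumed format: one center per row
    n_clusters = init_centers.shape[0]
else:
    init_centers = None  # let the DEC model initialize centers with k-means
    n_clusters = args.n_clusters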
Example #4
def train():
    # load data sets
    train_sentences=load_sentences(FLAGS.train_file,FLAGS.zeros)
    dev_sentences=load_sentences(FLAGS.dev_file,FLAGS.zeros)
    test_sentences=load_sentences(FLAGS.test_file,FLAGS.zeros)

    # convert to the chosen tagging scheme (IOB/IOBES)
    train_sentences=update_tag_scheme(train_sentences,FLAGS.tag_schema)
    dev_sentences=update_tag_scheme(dev_sentences,FLAGS.tag_schema)
    test_sentences=update_tag_scheme(test_sentences,FLAGS.tag_schema)

    # create maps if they do not exist
    if not os.path.exists(FLAGS.map_file):
        if FLAGS.pre_emb:
            char_to_id,_=char_mapping(train_sentences)
            char_to_id,id_to_char=augment_with_pretrained(char_to_id,'wiki_100.utf8')
        else:
            char_to_id, id_to_char=char_mapping(train_sentences)
        tag_to_id, id_to_tag=tag_mapping(train_sentences)
        with open(FLAGS.map_file,'wb') as f:
            cPickle.dump([char_to_id,id_to_char,tag_to_id,id_to_tag],f,cPickle.HIGHEST_PROTOCOL)
    else:
        with open(FLAGS.map_file,'rb') as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag=cPickle.load(f)

    # prepare data, get a collection of list containing index
    train_data=prepare_dataset(train_sentences,char_to_id,tag_to_id,True)
    dev_data=prepare_dataset(dev_sentences,char_to_id,tag_to_id,True)
    test_data=prepare_dataset(test_sentences,char_to_id,tag_to_id,True)
    print "%i %i %i sentences in train / dev / test." % (len(train_data),len(dev_data),len(test_data))

    if not FLAGS.pre_emb:
        pre_emb=None
    else:
        pre_emb=load_word2vec(FLAGS.pre_emb_file,char_to_id,FLAGS.char_dim)
        print "init embedding shape: (%d,%d)" %(pre_emb.shape[0],pre_emb.shape[1])

    train_manager=BatchManager(train_data,FLAGS.batch_size,True)
    dev_manager=BatchManager(dev_data,FLAGS.batch_size,False)
    test_manager=BatchManager(test_data,FLAGS.batch_size,False)

    config=BasicModelConfig(FLAGS,len(char_to_id),len(tag_to_id),4)
    tfConfig = tf.ConfigProto()
    tfConfig.gpu_options.per_process_gpu_memory_fraction = FLAGS.memory_usage
    with tf.Session(config=tfConfig) as sess:
        print "Train started!"
        model=BasicModel(config,pre_emb)
        saver=tf.train.Saver()

        # tensorboard
        if not os.path.exists(FLAGS.summaries_dir):
            os.mkdir(FLAGS.summaries_dir)
        merged=tf.summary.merge_all()
        train_writer=tf.summary.FileWriter(os.path.join(FLAGS.summaries_dir,FLAGS.model_name,"train"),sess.graph)
        test_writer=tf.summary.FileWriter(os.path.join(FLAGS.summaries_dir,FLAGS.model_name,"test"),sess.graph)

        # load previous trained model or create a new model
        if not os.path.exists(FLAGS.checkpoints):
            os.mkdir(FLAGS.checkpoints)
        model_name=os.path.join(FLAGS.checkpoints,FLAGS.model_name)
        ckpt=tf.train.get_checkpoint_state(FLAGS.checkpoints)
        if ckpt and ckpt.model_checkpoint_path:
            print "restore from previous traied model: %s" % FLAGS.model_name
            saver.restore(sess,ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        def evaluate(sess,model,manager):
            strings=[]
            predicts=[]
            goldens=[]
            bar = ProgressBar(max_value=manager.num_batch)
            for batch in bar(manager.iter_batch()):
                batch_string,batch_predict,batch_golden=model.evaluate_step(sess,batch)
                strings.extend(batch_string)
                predicts.extend(batch_predict)
                goldens.extend(batch_golden)
            return strings,predicts,goldens

        best_eval_f1=0
        noimpro_num=0
        for i in range(FLAGS.max_epoch):
            #train
            train_loss=[]
            bar = ProgressBar(max_value=train_manager.num_batch)
            for step,batch in bar(enumerate(train_manager.iter_batch())):
                batch.append(merged)
                summary,global_step,batch_loss=model.train_step(sess,batch,FLAGS.dropout_keep)
                #add summary to tensorboard
                train_writer.add_summary(summary,global_step)
                train_loss.append(batch_loss)
            print "Epoch %d Train loss is %.4f" % (i+1,np.mean(train_loss))

            #dev
            strings,predicts,goldens=evaluate(sess,model,dev_manager)
            eval_f1=report_results(strings,predicts,goldens,id_to_char,id_to_tag,'outputs/dev')
            if eval_f1>best_eval_f1:
                best_eval_f1=eval_f1
                noimpro_num=0
                saver.save(sess,model_name)
            else:
                noimpro_num+=1
            print "Epoch %d Best eval f1:%.6f" % (i+1,best_eval_f1)

            #test
            strings,predicts,goldens=evaluate(sess,model,test_manager)
            test_f1=report_results(strings,predicts,goldens,id_to_char,id_to_tag,'outputs/test',True)
            #early_stop
            if noimpro_num>=3:
                print "Early stop! Final F1 scores on test data is :%.6f" % test_f1
                break
            print
Example #5
def train():
    # load the data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # choose the tag scheme (IOB / IOBES); IOBES is used by default
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            # {'S-LOC': 10, 'E-LOC': 3, 'B-ORG': 4, 'S-PER': 11, 'S-ORG': 12, 'O': 0,
            # 'E-ORG': 5, 'I-LOC': 6, 'I-PER': 7, 'I-ORG': 1, 'B-PER': 8, 'B-LOC': 2, 'E-PER': 9}
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # convert the sentences into indexed (numeric) data
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    # pad short sequences with 0
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # GPU settings
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data

    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    # compute the average loss every 100 steps
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #6
result_path = os.path.join("result")

#path for data
# train_file = os.path.join("data", "example.train")
# dev_file = os.path.join("data", "example.dev")
#path for data_medicine_three
# train_file = os.path.join("data", "example_medicine_three.train")
# dev_file = os.path.join("data", "example_medicine_three.dev")
#path for data_medicine_all
train_file = os.path.join("data", "example_medicine_all.train")
dev_file = os.path.join("data", "example_medicine_all.dev")
emb_file = os.path.join("data",
                        "wiki_100.utf8")  #path for pre_trained embedding

#load data and get sentences
train_sentences = load_sentences(train_file)
dev_sentences = load_sentences(dev_file)
# print(train_sentences[5], '\n', dev_sentences[5], '\n', dev_sentences[5])

# get the tag mapping dictionaries
tag_index, id_to_tag = get_tag_index(train_sentences)
print("tag_index:", tag_index, len(tag_index))
print("id_to_tag:", id_to_tag, len(id_to_tag))

# prepare data: turn the sentences into indexed sequences and build the word-to-ID mapping
train_data = prepare_data(train_sentences, seg_dim, tag_index)
# print("train_data:", "\n", train_data[0][2], "\n", train_data[1], "\n", train_data[2][2])
dev_data = prepare_data(dev_sentences, seg_dim, tag_index)

# get the word mapping dictionary
word_index = train_data[1]
def get_encoded_data(args):
    global agree_words, disagree_words, agree_indices, disagree_indices
    # %% load data
    # load sentence data
    sents, labels = load_sentences(domain=args.domain)

    # load sentiment lexicon
    lexicon = load_lexicon()
    pos_words = [word for word in lexicon if lexicon[word] == 1]
    neg_words = [word for word in lexicon if lexicon[word] == 0]
    lex_labels = [1] * len(pos_words) + [0] * len(neg_words)
    lex_word_seqs = pos_words + neg_words

    # load document data
    mdsd_domain = 'dvd' if args.domain == 'dvds' else args.domain
    doc_texts, doc_labels, _ = load_documents(domains=(mdsd_domain,))  # just one domain, ignore domain labels

    ## build vocabulary
    counter = Counter()
    word_seqs = []
    doc_word_seqs = []
    doc_word_sseqs = []
    # tokenize to words
    for sent in sents:
        word_seqs.append(my_tokenize(sent))  # [[w1, w2, ...], ...]
    for doc in doc_texts:
        doc_word_seqs.append(my_tokenize(doc))
        sent_seqs = []
        for sent in sent_tokenize(doc):
            sent_seqs.append(my_tokenize(sent))
        doc_word_sseqs.append(sent_seqs)  # [[[w11, w12, ...], [w21, w22, ...], ...], ...]
    # stat and index
    lens = []
    doc_lens = []
    doc_sentlens = []
    doc_wordlens = []
    for word_seq in word_seqs:
        counter.update(word_seq)
        lens.append(len(word_seq))
    for word in lexicon.keys():
        counter.update([word])
    for doc_word_seq in doc_word_seqs:
        # counter.update(doc_word_seq)
        doc_lens.append(len(doc_word_seq))
    for sent_seqs in doc_word_sseqs:
        doc_sentlens.append(len(sent_seqs))
        for sent_seq in sent_seqs:
            counter.update(sent_seq)
            doc_wordlens.append(len(sent_seq))
    percentage = 98
    maxlen = int(np.percentile(lens, percentage))
    doc_maxlen_sent = int(np.percentile(doc_sentlens, percentage))  # max sent per doc
    doc_maxlen_word = int(np.percentile(doc_wordlens, percentage))  # max word per sent
    doc_maxlen_word = max(maxlen, doc_maxlen_word)

    # the vocabulary
    min_freq = 3
    word2index = dict()
    idx = 2  # start from 2, 0 as <PAD>, 1 as <OOV>
    for word_count in counter.most_common():
        if word_count[1] >= min_freq or word_count[0] in lexicon:
            word2index[word_count[0]] = idx
            idx += 1
    n_words = len(word2index) + 2
    print('words:', len(word2index))

    print('[agree] words:')
    for word in agree_words:
        if word in word2index:
            agree_indices.add(word2index[word])
            print(' -', word, word2index[word])
    print('[disagree] words:')
    for word in disagree_words:
        if word in word2index:
            disagree_indices.add(word2index[word])
            print(' -', word, word2index[word])
    print('agree: {}\ndisagree: {}'.format(agree_indices, disagree_indices))

    # %% data encoding ====================================================================
    # sent data, and CV version
    seqs = []
    for words in word_seqs:
        seqs.append([word2index.get(word, 1) for word in words])
    padded_seqs_bak = pad_sequences(seqs, maxlen=doc_maxlen_word, padding='post', truncating='post')
    labels_bak = np.asarray(labels, dtype=int)
    print('sent:', padded_seqs_bak.shape, labels_bak.shape)

    # CV-fold split for sentence data
    kf = StratifiedKFold(n_splits=CV, shuffle=True)
    padded_seqs_trains = dict()
    padded_seqs_tests = dict()
    labels_trains = dict()
    labels_tests = dict()
    print('{} fold train/test splitting'.format(CV))
    for cv, (train_idx, test_idx) in enumerate(kf.split(padded_seqs_bak, labels_bak)):
        padded_seqs_trains[cv] = padded_seqs_bak[train_idx]
        padded_seqs_tests[cv] = padded_seqs_bak[test_idx]
        labels_trains[cv] = labels_bak[train_idx]
        labels_tests[cv] = labels_bak[test_idx]

    # lex data
    lex_seqs = []
    for word in lex_word_seqs:
        lex_seqs.append([word2index.get(word, 1)])
    lex_padded_seqs = pad_sequences(lex_seqs, maxlen=1, padding='post', truncating='post')
    lex_labels = np.asarray(lex_labels, dtype=int)
    print(' - lex (all):', lex_padded_seqs.shape, lex_labels.shape)

    # doc data (hierarchical), padding from word to sent
    n_samples = len(doc_word_sseqs)
    doc_padded_seqs = np.zeros(shape=(n_samples, doc_maxlen_sent, doc_maxlen_word), dtype=int)
    for i, sseq_1doc in enumerate(doc_word_sseqs):
        for j, seq_1doc in enumerate(sseq_1doc):
            if j < doc_maxlen_sent:
                for k, word in enumerate(seq_1doc):
                    if k < doc_maxlen_word:
                        doc_padded_seqs[i, j, k] = word2index.get(word, 1)
    doc_labels = np.asarray(doc_labels, dtype=int)
    print(' - doc (all):', doc_padded_seqs.shape, doc_labels.shape)

    # relation data for doc (internal sents) (agree & disagree)
    count_agree, count_disagree = 0, 0
    doc_rel_padded_seqs = np.zeros(shape=(n_samples, doc_maxlen_sent), dtype=int)
    for i in range(0, n_samples):
        for j in range(1, doc_maxlen_sent):
            if doc_padded_seqs[i, j, 0] in agree_indices:
                doc_rel_padded_seqs[i, j] = 1
                count_agree += 1
            if doc_padded_seqs[i, j, 0] in disagree_indices:
                doc_rel_padded_seqs[i, j] = -1
                count_disagree += 1
    print(' - doc sent-rel (all):', doc_rel_padded_seqs.shape)
    print(' - doc sent-rel (all): agree: {}, disagree: {}'.format(count_agree, count_disagree))

    ## sub-sample from lexicon and documents
    print('sub-sampling:')
    # doc data sub-sample
    n_samples = len(padded_seqs_trains[0]) + len(padded_seqs_tests[0])
    doc_padded_seqs, doc_rel_padded_seqs, doc_labels = balanced_subsample3(
        doc_padded_seqs, doc_rel_padded_seqs, doc_labels, subsample_num=n_samples)
    doc_padded_seqs = np.asarray(doc_padded_seqs)
    doc_labels = np.asarray(doc_labels, dtype=int)
    print(' - doc (sampled):', doc_padded_seqs.shape, doc_labels.shape)

    # lex data sub-sample
    lex_padded_seqs, lex_labels = balanced_subsample2(lex_padded_seqs, lex_labels, subsample_num=n_samples)
    lex_padded_seqs = np.asarray(lex_padded_seqs)
    lex_labels = np.asarray(lex_labels, dtype=int)
    print(' - lex (sampled):', lex_padded_seqs.shape, lex_labels.shape)
    ddata = {
        'n_samples': n_samples,
        'n_words': n_words,
        'doc_maxlen_word': doc_maxlen_word,
        'doc_maxlen_sent': doc_maxlen_sent,
        'word2index': word2index,
        'padded_seqs_trains': padded_seqs_trains,
        'labels_trains': labels_trains,
        'padded_seqs_tests': padded_seqs_tests,
        'labels_tests': labels_tests,
        'lex_padded_seqs': lex_padded_seqs,
        'lex_labels': lex_labels,
        'doc_padded_seqs': doc_padded_seqs,
        'doc_labels': doc_labels,
        'doc_rel_padded_seqs': doc_rel_padded_seqs,
    }
    return ddata
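A small usage sketch, assuming `args` and the module-level constant `CV` are defined as in the surrounding code, showing how the returned dictionary's cross-validation folds might be consumed:

ddata = get_encoded_data(args)
for cv in range(CV):
    x_train, y_train = ddata['padded_seqs_trains'][cv], ddata['labels_trains'][cv]
    x_test, y_test = ddata['padded_seqs_tests'][cv], ddata['labels_tests'][cv]
    print('fold', cv, x_train.shape, y_train.shape, x_test.shape, y_test.shape)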