Example #1
def infer_cnn(data_path,
              model_path,
              word_vocab_path,
              pos_vocab_path,
              label_vocab_path,
              word_emb_path,
              pos_emb_path,
              batch_size,
              pred_save_path=None):
    # init dict
    word_vocab, pos_vocab, label_vocab = load_vocab(
        word_vocab_path), load_vocab(pos_vocab_path), load_vocab(
            label_vocab_path)
    word_emb, pos_emb = load_pkl(word_emb_path), load_pkl(pos_emb_path)
    word_test, pos_test = test_reader(data_path, word_vocab, pos_vocab,
                                      label_vocab)
    # init model
    model = Model(config.max_len, word_emb, pos_emb, label_vocab=label_vocab)
    ckpt_path = get_ckpt_path(model_path)
    if ckpt_path:
        print("Read model parameters from %s" % ckpt_path)
        model.saver.restore(model.sess, ckpt_path)
    else:
        print("Can't find the checkpoint.going to stop")
        return
    label_pred = model.predict(word_test, pos_test, batch_size)
    save(label_pred, pred_save_path=pred_save_path)
    print("finish prediction.")
Example #2
def train(save_vocab_path='',
          train_path='',
          test_path='',
          train_seg_path='',
          test_seg_path='',
          model_save_dir='',
          vocab_max_size=5000,
          vocab_min_count=5,
          hidden_dim=512,
          batch_size=64,  # assumption: batch_size is used below but was missing from this signature
          use_cuda=False):

    train_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            avg_cost = train_model()
            optimizer = optimizer_func(hidden_dim)
            optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

    seg_data(train_path, test_path)
    train_texts = build_dataset(train_seg_path)

    if os.path.exists(save_vocab_path):
        vocab = load_vocab(save_vocab_path)
    else:
        vocab, reverse_vocab = build_vocab(train_texts, min_count=vocab_min_count)
        write_vocab(vocab, save_vocab_path)
        vocab = load_vocab(save_vocab_path)

    train_set = read_data(train_seg_path)
    train_set_ids = transform_data(train_set, vocab)
    num_encoder_tokens = len(train_set_ids)
    max_input_texts_len = max([len(text) for text in train_texts])
    print('num of samples:', len(train_texts))
    print('num of unique input tokens:', num_encoder_tokens)
    print('max sequence length for inputs:', max_input_texts_len)
    # save_word_dict(vocab2id, save_vocab_path)

    train_reader = data_generator(train_set_ids)

    train_data = paddle.batch(paddle.reader.shuffle(train_reader, buf_size=10000), batch_size=batch_size)

    feeder = fluid.DataFeeder(feed_list=['question_word', 'dialogue_word', 'report_word', 'report_next_word'],
                              place=place,
                              program=train_prog)

    exe.run(startup_prog)

    EPOCH_NUM = 20
    for pass_id in six.moves.xrange(EPOCH_NUM):
        batch_id = 0
        for data in train_data():
            cost = exe.run(train_prog, feed=feeder.feed(data), fetch_list=[avg_cost])[0]
            print('pass_id: %d, batch_id: %d, loss: %f' % (pass_id, batch_id, cost))
            batch_id += 1
        fluid.io.save_params(exe, model_save_dir, main_program=train_prog)
Example #3
def main(config, eval_folder):

    # load the vocab files

    text_words_vocab = load_vocab(config.text_words_path)
    text_chars_vocab = load_vocab(config.text_chars_path)
    inv_text_vocab = {v: k for k, v in text_words_vocab.items()}

    # get the processing function
    processing_word = get_processing_word(text_words_vocab,
                                          text_chars_vocab,
                                          lowercase=True,
                                          chars=True)

    # load features
    word_features = get_trimmed_features(config.word_embeddings_trimmed_path)

    examples = read_examples(eval_folder, processing_word)

    # build WImpModel

    model = WImpModel(config, word_features, None, text_words_vocab["$UNK$"],
                      inv_text_vocab, None)
    model.build_graph()

    words, word_feats, speech_interval_feats = [], [], []
    for sent_key in examples.keys():
        words_, word_feats_, speech_feats_ = zip(*examples[sent_key])
        word_feats_ = list(zip(*word_feats_))

        word_feats.append(word_feats_)
        speech_interval_feats.append(speech_feats_)
        words.append(words_)

    speech_interval_feats_pad_, speech_lengths = pad_sequences(
        speech_interval_feats,
        pad_tok=[0] * config.speech_features_dim,
        nlevels=2)
    speech_feats = speech_interval_feats_pad_[:, :, :, config.speech_lexical_features_dim:]
    speech_lexical_feats = speech_interval_feats_pad_[:, :, 0, :config.speech_lexical_features_dim]

    feed, sequence_lengths = model.get_feed_dict(words=word_feats, dropout=1.0)
    feed[model.speech_features] = speech_feats
    feed[model.speech_lexical_features] = speech_lexical_feats
    feed[model.speech_lengths] = speech_lengths

    predictions = model.test(feed)

    print("\n")
    print("WORD IMPORTANCE PREDICTION OUTPUT")
    print("=================================")
    for sent_id in range(len(words)):
        scores = predictions[0][:sequence_lengths[sent_id]]
        tokens = words[sent_id]
        result = ["%s (%f)" % (w, s) for w, s in zip(tokens, scores)]
        print("--> " + " ".join(result) + "\n")
Example #4
def train(train_data, val_data, fold_idx=None):
    train_dataset = MyDataset(train_data)
    val_dataset = MyDataset(val_data)

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size)
    val_loader = DataLoader(val_dataset, batch_size=config.batch_size)
    from models.hmm import HMM
    word2id, id2word = load_vocab()
    model = HMM(len(config.label2id), len(word2id))

    if fold_idx is None:
        print('start')
        model_save_path = os.path.join(config.model_path,
                                       '{}.bin'.format(model_name))
    else:
        print('start fold: {}'.format(fold_idx + 1))
        model_save_path = os.path.join(
            config.model_path, '{}_fold{}.bin'.format(model_name, fold_idx))

    word_id_list = train_dataset.x_data
    label_id_list = train_dataset.y_data
    model.train(word_id_list, label_id_list)

    y_pred_list = model.predict(train_dataset.x_data)
    train_score = get_score(train_dataset.y_data, y_pred_list)
    y_pred_list = model.predict(val_dataset.x_data)
    val_score = get_score(val_dataset.y_data, y_pred_list)
    msg = 'train score: {0:>6.2%}, val score: {1:>6.2%}'
    print(msg.format(train_score, val_score))
Example #5
    def __init__(self):
        """
        Creates output directories if they don't exist and load vocabulary
        Defines attributes that depends on the vocab.
        Look for the __init__ comments in the class attributes
        """
        # check that the reload directory exists
        if self.dir_reload is not None and not os.path.exists(self.dir_reload):
            print("Weights directory not found ({})".format(self.dir_reload))
            self.dir_reload = None

        # directory for training outputs
        if not os.path.exists(self.dir_output):
            os.makedirs(self.dir_output)

        if not os.path.exists(self.model_output):
            os.makedirs(self.model_output)

        if not os.path.exists(self.dir_plots):
            os.makedirs(self.dir_plots)

        # create the results files for answers if they don't exist
        with open(self.path_results, "a") as f:
            pass

        with open(self.path_results_final, "a") as f:
            pass

        self.vocab = load_vocab(self.path_vocab)
        self.vocab_size = len(self.vocab)
        self.attn_cell_config["num_proj"] = self.vocab_size
        self.id_PAD = self.vocab[PAD]
        self.id_END = self.vocab[END]
        self.logger = get_logger(self.path_log)
Example #6
def infer_classic(model_type='xgboost_lr',
                  model_save_path='',
                  label_vocab_path='',
                  test_data_path='',
                  pred_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word'):
    # load data content
    data_set, true_labels = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data=data_set,
                      feature_type=feature_type,
                      feature_vec_path=feature_vec_path,
                      is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)

    # predict
    pred_label_probs = model.predict_proba(data_feature)

    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}

    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [
        id_label[prob.argmax()] + col_sep + str(prob.max())
        for prob in pred_label_probs
    ]
    logger.info("save infer label and prob result to:%s" % pred_save_path)
    save_predict_result(pred_output,
                        ture_labels=None,
                        pred_save_path=pred_save_path,
                        data_set=data_set)

    # evaluate
    if true_labels:
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
        except Exception:
            print("error. no true labels")

    # analysis lr model
    if config.debug and model_type == "logistic_regression":
        feature_weight_dict = load_dict(config.lr_feature_weight_path)
        pred_labels = cal_multiclass_lr_predict(data_set, feature_weight_dict,
                                                id_label)
        print(pred_labels[:5])
Example #7
def infer_deep_model(model_type='cnn',
                     data_path='',
                     model_save_path='',
                     label_vocab_path='',
                     max_len=300,
                     batch_size=128,
                     col_sep='\t',
                     pred_save_path=None):
    from keras.models import load_model
    # load data content
    data_set, true_labels = data_reader(data_path, col_sep)
    # init feature
    # the HAN model needs a [doc, sentence] shaped feature (rank 3); other models use a [sentence] shaped feature (rank 2)
    if model_type == 'han':
        feature_type = 'doc_vectorize'
    else:
        feature_type = 'vectorize'
    feature = Feature(data_set,
                      feature_type=feature_type,
                      is_infer=True,
                      max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()

    # load model
    model = load_model(model_save_path)
    # predict; in Keras, predict_proba gives the same result as predict here
    pred_label_probs = model.predict(data_feature, batch_size=batch_size)

    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}
    pred_labels = [prob.argmax() for prob in pred_label_probs]
    pred_labels = [id_label[i] for i in pred_labels]
    pred_output = [
        id_label[prob.argmax()] + col_sep + str(prob.max())
        for prob in pred_label_probs
    ]
    logger.info("save infer label and prob result to: %s" % pred_save_path)
    save_predict_result(pred_output,
                        ture_labels=None,
                        pred_save_path=pred_save_path,
                        data_set=data_set)
    if true_labels:
        # evaluate
        assert len(pred_labels) == len(true_labels)
        for label, prob in zip(true_labels, pred_label_probs):
            logger.debug('label_true:%s\tprob_label:%s\tprob:%s' %
                         (label, id_label[prob.argmax()], prob.max()))

        print('total eval:')
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
Example #8
 def init(self):
     if not os.path.exists(self.model_dir):
         os.makedirs(self.model_dir)
     if not os.path.exists(self.summary_dir):
         os.makedirs(self.summary_dir)
     if not os.path.exists(self.logger_dir):
         os.makedirs(self.logger_dir)
     self.logger = self.get_logger(self.logger_path)
     self.vocab_words = load_vocab(self.words_file)
     self.vocab_tags = load_vocab(self.tags_file)
     if self.use_lexicon:
         self.vocab_lexicons = load_vocab(self.lexicons_file)
     self.lexicon_z_embeddings = load_z_vectors(self.lexicon_z_file)
     if self.use_chars:
         self.vocab_chars = load_vocab(self.chars_file)
     else:
         self.vocab_chars = {}
Example #9
 def __init__(self, df, mode='train'):
     self.mode = mode
     self.word2id, _ = load_vocab()
     self.x_data = []
     self.y_data = []
     for i, row in df.iterrows():
         x, y = self.row_to_tensor(row)
         self.x_data.append(x)
         self.y_data.append(y)
Example #10
    def run_evaluate(self, sess, val_set, lr_schedule=None, path_results=None):
        """
        Performs an epoch of evaluation

        Args:
            sess: (tf.Session)
            val_set: Dataset instance
            lr_schedule: (instance of Lr schedule) optional
            path_results: (string) where to write the results
        Returns:
            scores: (dict) bleu score, exact match score and perplexity
        """
        vocab = load_vocab(self.config.path_vocab)
        rev_vocab = {idx: word for word, idx in vocab.iteritems()}

        references, hypotheses = [], []
        n_words, ce_words = 0, 0 # for perplexity, sum of ce for all words + nb of words
        
        for img, formula in minibatches(val_set, self.config.batch_size):
            fd = self.get_feed_dict(img, training=False, formula=formula, dropout=1)
            ce_words_eval, n_words_eval, ids_eval = sess.run(
                    [self.ce_words, self.n_words, self.pred_test.ids], feed_dict=fd)

            if self.config.decoding == "greedy":
                ids_eval = np.expand_dims(ids_eval, axis=1)
                
            elif self.config.decoding == "beam_search":
                ids_eval = np.transpose(ids_eval, [0, 2, 1])

            n_words += n_words_eval
            ce_words += ce_words_eval
            for form, pred in zip(formula, ids_eval):
                # pred is of shape (number of hypotheses, time)
                references.append([form])
                hypotheses.append(pred)


        if path_results is None:
            path_results = self.config.path_results

        scores = evaluate(references, hypotheses, rev_vocab, 
                            path_results, self.config.id_END)

        ce_mean = ce_words / float(n_words)
        scores["perplexity"] = np.exp(ce_mean)

        if lr_schedule is not None:
            lr_schedule.update(score=scores["perplexity"])

        return scores
Example #11
def embed2vec_with_english(spanish_embeding_file, english_embedding_file,
                           vocab_file, dim):
    """
    embedding -> numpy
    """

    vocab = load_vocab(vocab_file)

    print('vocab size is {}'.format(len(vocab)))

    weight_matrix = norm_weight(len(vocab), dim)

    weight_matrix[0] = 0.0

    words_found = 0
    unfind_words = []
    find_words = []

    spanish_embedding_vec = load_embedding(spanish_embeding_file)
    english_embedding_vec = load_embedding(english_embedding_file)

    for index, word in enumerate(vocab):

        if word in spanish_embedding_vec:
            weight_matrix[index] = spanish_embedding_vec[word]
            words_found += 1
            find_words.append(word)
        elif word in english_embedding_vec:
            weight_matrix[index] = english_embedding_vec[word]
            find_words.append(word)
        else:
            unfind_words.append(word)

    print('Found embeddings for {} words; {} words were not found.'.format(
        words_found, len(vocab) - words_found))
    np.savez(config.all_vocab_embedding_file, weights=weight_matrix)

    with open('{}unfind_word.txt'.format(config.multi_task_path),
              'wt',
              encoding='utf-8') as f:
        for line in unfind_words:
            f.write(line + '\n')

    print('Done')
Example #12
def build_pos_embedding(path,
                        overwrite=False,
                        pos_vocab_path=None,
                        pos_vocab_start=1,
                        pos_dim=64):
    if os.path.exists(path) and not overwrite:
        print("already has $s and use it." % path)
        return load_pkl(path)
    pos_vocab = load_vocab(pos_vocab_path)
    pos_vocab_count = len(pos_vocab) + pos_vocab_start
    pos_emb = np.random.normal(size=(
        pos_vocab_count,
        pos_dim,
    )).astype('float32')
    for i in range(pos_vocab_start):
        pos_emb[i, :] = 0.
    # save
    dump_pkl(pos_emb, path, overwrite=True)
    return pos_emb
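A hedged usage sketch of the helper above; the paths are placeholders, and the resulting pickle is what Example #1's infer_cnn later reloads through load_pkl:

# Placeholder paths; the real ones depend on the project's config module.
pos_emb = build_pos_embedding('output/pos_emb.pkl',
                              pos_vocab_path='output/pos_vocab.txt',
                              pos_vocab_start=1,
                              pos_dim=64)
print(pos_emb.shape)  # (len(pos_vocab) + pos_vocab_start, 64)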
Example #13
def test(args):
    if args.thread_restrict is True:
        cfg_proto = tf.ConfigProto(intra_op_parallelism_threads=2)
    else:
        cfg_proto = None
    with tf.Session(config=cfg_proto) as sess:
        # Loading the vocabulary files
        vocab, rev_vocab = load_vocab(args)
        args.vocab_size = len(rev_vocab)
        # Creating test model

        # Hacky way to get seq_len
        test_set = load_pickle(args, split='test')
        args.config.seq_len = test_set[0]['sentence_len']

        # Creating training model
        if args.config.elmo is True:
            elmo = hub.Module("https://tfhub.dev/google/elmo/1",
                              trainable=True)
        else:
            elmo = None

        with tf.variable_scope("model", reuse=None):
            model_test = SentimentModel(args,
                                        queue=None,
                                        mode='eval',
                                        elmo=elmo)
        # Reload model from checkpoints, if any
        steps_done = initialize_weights(sess, model_test, args, mode='test')
        logger.info("loaded %d completed steps", steps_done)

        for split in args.eval_splits.split(','):
            test_set = load_pickle(args, split=split)
            results, losses = evaluate(sess, model_test, test_set, args)
            if args.mode != 'train':
                detailed_results(args, split, test_set, rev_vocab, results)
            percent_correct = float(len(
                results['correct'])) * 100.0 / len(test_set)
            logger.info("correct predictions on %s - %.4f. Eval Losses - %.4f",
                        split, percent_correct, losses)
Example #14
def analysis(args):
    if args.thread_restrict is True:
        cfg_proto = tf.ConfigProto(intra_op_parallelism_threads=2)
    else:
        cfg_proto = None
    with tf.Session(config=cfg_proto) as sess:
        # Loading the vocabulary files
        vocab, rev_vocab = load_vocab(args)
        args.vocab_size = len(rev_vocab)
        # Creating test model

        train_set = load_pickle(args, split='train')
        args.config.seq_len = train_set[0]['sentence_len']
        args.config.eval_batch_size = 1
        # Creating training model
        if args.config.elmo is True:
            elmo = hub.Module("https://tfhub.dev/google/elmo/1", trainable=True)
        else:
            elmo = None

        with tf.variable_scope("model", reuse=None):
            model_test = SentimentModel(args, queue=None, mode='eval', elmo=elmo)

        # Reload model from checkpoints, if any
        steps_done = initialize_weights(sess, model_test, args, mode='test')
        logger.info("loaded %d completed steps", steps_done)

        logicnn.append_features(args, train_set, model_test, vocab, rev_vocab)

        dev_set = load_pickle(args, split='dev')
        logicnn.append_features(args, dev_set, model_test, vocab, rev_vocab)

        test_set = load_pickle(args, split='test')
        logicnn.append_features(args, test_set, model_test, vocab, rev_vocab)

        if args.config.elmo is True:
            elmo_embedding_analysis(sess, model_test, test_set)
        else:
            w2v_embedding_analysis(sess, model_test, test_set)
Example #15
def build_word_embedding(path,
                         overwrite=False,
                         sentence_w2v_path=None,
                         word_vocab_path=None,
                         word_vocab_start=2,
                         w2v_dim=256):
    if os.path.exists(path) and not overwrite:
        print("already has $s and use it." % path)
        return load_pkl(path)
    word_vocab = load_vocab(word_vocab_path)
    w2v_dict_full = load_pkl(sentence_w2v_path)
    word_vocab_count = len(w2v_dict_full) + word_vocab_start
    word_emb = np.zeros((word_vocab_count, w2v_dim), dtype='float32')
    for word in word_vocab:
        index = word_vocab[word]
        if word in w2v_dict_full:
            word_emb[index, :] = w2v_dict_full[word]
        else:
            random_vec = np.random.uniform(-0.25, 0.25,
                                           size=(w2v_dim, )).astype('float32')
            word_emb[index, :] = random_vec
    # save
    dump_pkl(word_emb, path, overwrite=True)
    return word_emb
Example #16
def infer_classic(model_type='xgboost_lr',
                  model_save_path='',
                  label_vocab_path='',
                  test_data_path='',
                  pred_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word'):
    # load data content
    data_set, true_labels = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set,
                      feature_type=feature_type,
                      feature_vec_path=feature_vec_path,
                      is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)

    # predict
    pred_label_probs = model.predict_proba(data_feature)

    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}

    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [
        id_label[prob.argmax()] + col_sep + str(prob.max())
        for prob in pred_label_probs
    ]
    logger.info("save infer label and prob result to:%s" % pred_save_path)
    save(pred_output,
         ture_labels=None,
         pred_save_path=pred_save_path,
         data_set=data_set)
    if 'logistic_regression' in model_save_path and config.is_debug:
        count = 0
        features = load_pkl('output/lr_features.pkl')
        for line in data_set:
            if count > 5:
                break
            count += 1
            logger.debug(line)
            words = line.split()
            for category, category_feature in features.items():
                logger.debug('*' * 43)
                logger.debug(category)
                category_score = 0
                for w in words:
                    if w in category_feature:
                        category_score += category_feature[w]
                        logger.debug("%s:%s" % (w, category_feature[w]))
                logger.debug("%s\t%f" % (category, category_score))
                logger.debug('=' * 43)
    if true_labels:
        # evaluate
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
Example #17
def get_vocabs(vocab_file):
    words = load_vocab(vocab_file)
    word_dict = {}
    for i, w in enumerate(words):
        word_dict[w] = i
    return word_dict
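In this example load_vocab apparently returns a list of tokens rather than a dict; a quick hedged usage sketch with a placeholder path:

word_dict = get_vocabs('data/vocab.txt')  # placeholder vocab file path
print('vocab size:', len(word_dict))
print('smallest id:', min(word_dict.values()))  # 0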
Example #18
def train_classic(model_type='logistic_regression',
                  data_path='',
                  model_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word',
                  min_count=1,
                  word_vocab_path='',
                  label_vocab_path='',
                  pr_figure_path=''):
    logger.info("train classic model, model_type:{}, feature_type:{}".format(model_type, feature_type))
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    word_id = load_vocab(word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    print(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    logger.info('data size:%d' % len(data_content))
    logger.info('label size:%d' % len(data_lbl))

    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.error('feature_type not supported for classic models, falling back to tfidf_word.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, word_vocab=word_vocab, is_infer=False)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    if model_type != 'xgboost_lr':
        save_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model, X_val, y_val, num_classes=num_classes, pr_figure_path=pr_figure_path)

    # analysis lr model
    if config.debug and model_type == "logistic_regression":
        feature_weight = {}
        word_dict_rev = sorted(word_id.items(), key=lambda x: x[1])
        for feature, index in word_dict_rev:
            feature_weight[feature] = list(map(float, model.coef_[:, index]))
        save_dict(feature_weight, config.lr_feature_weight_path)
Example #19
# The two lists below must have the same length

filters = [3, 5, 7]
num_filters = [128, 128, 128]
assert len(filters) == len(num_filters)

# - Training
epochs = 10
batch = 256
lr = 0.001
cuda = True

model = "cnn"   # 'cnn' or 'rnn'

# Load vocabulary and make dictionary
vocabs = load_vocab('data/imdb/imdb.vocab')
w2i = {w: i for i, w in enumerate(vocabs)}
i2w = {i: w for i, w in enumerate(vocabs)}
vocab_size = len(vocabs)

# Load Data
train_x, train_y = load_data('data/', train=True)
train_x, train_y = preprocess(train_x, train_y, w2i, maxlen)

# Build Model & Loss & Optimizer
model = RNN(embedding, rnn_hidden, num_layers, bi, output_dim, vocab_size) \
    if model == 'rnn' else CNN(filters, num_filters, maxlen, vocab_size, embedding, output_dim)

# Loss function & Optimizer
criterion = nn.BCELoss()
optim = torch.optim.Adam(model.parameters(), lr)
Example #20
def train_classic(model_type='logistic_regression',
                  data_path='',
                  model_save_path='',
                  feature_vec_path='',
                  col_sep='\t',
                  feature_type='tfidf_word',
                  min_count=1,
                  word_vocab_path='',
                  label_vocab_path='',
                  pr_figure_path=''):
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst,
                             min_count=min_count,
                             sort=True,
                             lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)

    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.info('feature_type not supported for classic models, falling back to tfidf_word.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      feature_vec_path=feature_vec_path,
                      word_vocab=word_vocab)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    if model_type != 'xgboost_lr':
        dump_pkl(model, model_save_path, overwrite=True)
    # analysis lr model
    if model_type == "logistic_regression" and config.is_debug:
        # show each category top features
        weights = model.coef_
        vectorizer = load_pkl(feature_vec_path)
        logger.debug("20 top features of each category:")
        features = dict()
        for idx, weight in enumerate(weights):
            feature_sorted = sorted(zip(vectorizer.get_feature_names(),
                                        weight),
                                    key=lambda k: k[1],
                                    reverse=True)
            logger.debug("category_" + str(idx) + ":")
            logger.debug(feature_sorted[:20])
            feature_dict = {k[0]: k[1] for k in feature_sorted}
            features[idx] = feature_dict
        dump_pkl(features, 'output/lr_features.pkl', overwrite=True)

    # evaluate
    eval(model,
         X_val,
         y_val,
         num_classes=num_classes,
         pr_figure_path=pr_figure_path)
Example #21
def train_deep_model(model_type='cnn',
                     data_path='',
                     model_save_path='',
                     word_vocab_path='',
                     label_vocab_path='',
                     min_count=1,
                     max_len=300,
                     batch_size=128,
                     nb_epoch=10,
                     embedding_dim=128,
                     hidden_dim=128,
                     col_sep='\t',
                     num_filters=512,
                     filter_sizes='3,4,5',
                     dropout=0.5):
    # data reader
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst,
                             min_count=min_count,
                             sort=True,
                             lower=True)
    write_vocab(word_vocab, word_vocab_path)

    # label
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    # category
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('Shape of label tensor: %s' % str(data_label.shape))

    # init feature
    # the HAN model needs a [doc, sentence] shaped feature (rank 3); other models use a [sentence] shaped feature (rank 2)
    if model_type == 'han':
        logger.info(
            'Hierarchical Attention Network model feature_type must be: doc_vectorize'
        )
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'
    feature = Feature(data=data_content,
                      feature_type=feature_type,
                      word_vocab=word_vocab,
                      max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len,
                               vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim,
                               num_classes=num_classes)
    elif model_type == 'cnn':
        model = cnn_model(max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          num_filters=num_filters,
                          filter_sizes=filter_sizes,
                          num_classses=num_classes,
                          dropout=dropout)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    cp = ModelCheckpoint(model_save_path,
                         monitor='val_acc',
                         verbose=1,
                         save_best_only=True)
    # fit and save model
    history = model.fit(X_train,
                        y_train,
                        batch_size=batch_size,
                        epochs=nb_epoch,
                        validation_data=(X_val, y_val),
                        callbacks=[cp])
    logger.info('save model:%s' % model_save_path)
    plt_history(history, model_name=model_type)
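A hedged invocation sketch for train_deep_model; all paths are placeholders rather than the project's real defaults:

train_deep_model(model_type='cnn',
                 data_path='data/train.txt',              # placeholder
                 model_save_path='output/cnn_model.h5',   # placeholder
                 word_vocab_path='output/word_vocab.txt',
                 label_vocab_path='output/label_vocab.txt',
                 nb_epoch=3,
                 batch_size=64)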
Example #22
    def __init__(self,
                 input_file_path,
                 seg_input_file_path='',
                 word_vocab_path='',
                 label_vocab_path='',
                 feature_vec_path='',
                 model_save_path='',
                 pred_save_path='',
                 feature_type='tf_word',
                 model_type='logistic',
                 num_classes=2,
                 col_sep='\t',
                 min_count=1,
                 lower_thres=0.5,
                 upper_thres=0.85,
                 label_ratio=0.9,
                 label_min_size=200,
                 batch_size=10,
                 warmstart_size=0.02,
                 stop_words_path='data/stop_words.txt'):
        self.input_file_path = input_file_path
        self.seg_input_file_path = seg_input_file_path if seg_input_file_path else input_file_path + "_seg"
        self.stop_words_path = stop_words_path
        self.word_vocab_path = word_vocab_path if word_vocab_path else "word_vocab.txt"
        self.label_vocab_path = label_vocab_path if label_vocab_path else "label_vocab.txt"
        self.feature_vec_path = feature_vec_path if feature_vec_path else "feature_vec.pkl"
        self.model_save_path = model_save_path if model_save_path else "model.pkl"
        self.pred_save_path = pred_save_path if pred_save_path else "predict.txt"
        self.feature_type = feature_type
        self.num_classes = num_classes
        self.col_sep = col_sep
        self.min_count = min_count
        self.lower_thres = lower_thres
        self.upper_thres = upper_thres
        self.label_ratio = label_ratio

        # 1. load segment data
        if not os.path.exists(self.seg_input_file_path):
            start_time = time()
            seg_data(self.input_file_path,
                     self.seg_input_file_path,
                     col_sep=self.col_sep,
                     stop_words_path=self.stop_words_path)
            logger.info("spend time: %s s" % (time() - start_time))
        self.seg_contents, self.data_lbl = data_reader(
            self.seg_input_file_path, self.col_sep)

        # 2. load original data
        self.content, _ = data_reader(self.input_file_path, self.col_sep)

        # 3. load feature
        word_lst = []
        for i in self.seg_contents:
            word_lst.extend(i.split())
        # word vocab
        self.word_vocab = build_vocab(word_lst,
                                      min_count=self.min_count,
                                      sort=True,
                                      lower=True)
        # save word vocab
        write_vocab(self.word_vocab, self.word_vocab_path)
        # label
        label_vocab = build_vocab(self.data_lbl)
        # save label vocab
        write_vocab(label_vocab, self.label_vocab_path)
        label_id = load_vocab(self.label_vocab_path)
        print("label_id: %s" % label_id)
        self.set_label_id(label_id)
        self.id_label = {v: k for k, v in label_id.items()}
        print('num_classes:%d' % self.num_classes)
        self.data_feature = self._get_feature(self.word_vocab)

        # 4. assemble sample DataObject
        self.samples = self._get_samples(self.data_feature)
        self.batch_num = batch_size if batch_size > 1 else batch_size * len(
            self.samples)
        self.warmstart_num = warmstart_size if warmstart_size > 1 else warmstart_size * len(
            self.samples)
        self.label_min_num = label_min_size if label_min_size > 1 else label_min_size * len(
            self.samples)

        # 5. init model
        self.model = get_model(model_type)
Example #23
class Config():

    multi_task_path = "../input/processing/multi_task_learn/"

    vocab_path = '../input/words.txt'
    vocab_char_path = '../input/char_vocab.txt'
    embed_path = './fasttext/embedding_matrix.npz'
    fasttext_file_path = './fasttext/wiki.es.vec'
    fasttext_file_english_path = './fasttext/wiki.en.vec'

    all_vocab_path = "{}{}".format(multi_task_path, 'all_vocab.txt')
    all_vocab_embedding_file = "{}{}".format(multi_task_path,
                                             'embedding_matrix.npz')
    all_char_vocab_path = "{}{}".format(multi_task_path, 'all_char_vocab.txt')

    all_char_vocab = load_char_vocab(all_char_vocab_path)

    word_vocabs = load_vocab(vocab_path)
    char_vocabs = load_char_vocab(vocab_char_path)

    PAD_WORD = '<PAD>'
    UNK_WORD = '<UNK>'

    vocab_size = len(word_vocabs)

    # character parameters
    use_char_emb = True

    char_vocab_size = len(char_vocabs)
    # char_dim = 50
    char_hidden_size = 64  # for char lstm
    max_word_length = 10
    CHAR_PAD = '@'

    # data path
    input_path = '../input/processing/'

    train_path = input_path + 'train_data.txt'
    english_train_path = input_path + 'english_train.txt'
    spanish_train_path = input_path + 'spanish_train_dedup.txt'

    valid_path = input_path + 'valid_data.txt'
    test_path = input_path + 'test_b.txt'
    model_save_path = './checkpoints/'
    num_workers = 1
    pad_index = 0
    save_model = True
    restore = False  # for restore training

    eval_every = 100

    embed_size = 300

    # hidden_size = random.randint(256, 512)
    hidden_size = 256
    lstm_size = 256
    fc_hidden_size = 512
    linear_size = fc_hidden_size
    num_classes = 2

    max_lengths = random.choice([20, 25])
    batch_size = 64

    lr = random.choice([1e-3])
    # lr = 1e-3
    start_lr = lr
    lr_decay_step = 5
    epochs = 60

    dropout = 0.5
    rnn_dropout = 0.2

    clip = 5.0

    rnn_layers = 3

    batch_not_imporved_throld = 20

    five_fold_path = '../input/processing/5fold/'
    five_fold_save_path = './result/'

    # CNN character parameters
    import random
    user_char_emb = True
    char_dim = 50
    char_kernel_sizes = random.choices([1, 2, 3, 4, 5], k=3)
    char_kernel_nums = random.choices([64, 64, 128, 128], k=3)
    char_output_dim = 128

    embed_dropout = random.uniform(0.1, 0.3)

    pretrained_emb = np.load(embed_path)['weights']

    # for multi task learning
    # multi_task = random.choice([False, True])
    multi_task = True
    multi_task_vocabs = load_vocab(all_vocab_path)
    multi_task_pretraind_emb = np.load(all_vocab_embedding_file)['weights']

    if multi_task:
        char_vocabs = multi_task_vocabs
        pretrained_emb = multi_task_pretraind_emb
        vocab_size = len(multi_task_vocabs)
        char_vocab_size = len(all_char_vocab)
        print('vocab size is ', vocab_size)

    onehot = False

    # early stop

    # ESIM model
    # num_units = 300 + char_output_dim + pos_embedding_size
    num_units = 300
    char_num_units = num_units + char_output_dim
    project_input = True  # whether to project input embeddings to a different dimensionality
    distance_biases = random.randint(
        15, 30
    )  # number of different distances with biases used in the intra-attention model
    max_sentence_length = max_lengths

    # StackBiLSTMMaxount(SSE)
    h_size = [512, 1024, 2048]
    mlp_d = 1600

    # Decoposable Attention model

    # BIMPM
    num_perspective = 50
    word_dim = embed_size
    word_vocab_size = vocab_size
    max_word_len = max_word_length

    # extra features
    use_extra = False
    extra_path = "../input/processing/with_extra_features/"
    train_data_extra = extra_path + 'train_data.csv'
    valid_data_extra = extra_path + 'valid_data.csv'
    test_data_extra = extra_path + 'test.csv'
    if use_extra:
        extra_feature_nums = len(
            pd.read_csv(train_data_extra, sep='\t').columns) - 3
    else:
        extra_feature_nums = 0

    word_max_lengths = max_word_length
    CHAR_PAD_INDEX = 0
    CHAR_PAD = ' '

    # simaese lstm
    residual = True
    num_layers = 2
    wdrop = 0.25
    dropouti = 0.25

    # MPCNN
    filter_widths = [1, 2, 3, np.inf]
    hidden_layer_units = 512
    n_holistic_filters = 300
    n_per_dim_filters = 32

    msg = ('parameters: hidden_size: {}, max_lengths: {}, batch_size: {}, lr: {}, '
           'dropout: {}, multi_task: {}'.format(hidden_size, max_lengths, batch_size,
                                                lr, dropout, multi_task))
    print(msg)
Example #24
attn_type = 'general'  # dot, general, concat
attn_dim = 128  # when concat

# - Training
epochs = 200
batch = 128
lr = 0.001
cuda = torch.cuda.is_available()

# - Attention visualization
show_attn = False
show_ex_num = 123

# Load Data and Build dictionaries
src_train_sent, tar_train_sent = load_data('data/', train=True, small=True)
src_dict, src_cand = load_vocab(src_train_sent)
tar_dict, tar_cand = load_vocab(tar_train_sent)
src_vocab_size = len(src_dict)
tar_vocab_size = len(tar_dict)

src_train, tar_train = preprocess(src_train_sent, tar_train_sent, src_dict,
                                  tar_dict, maxlen)

# Build Seq2Seq Model & Loss & Optimizer
model = Seq2seq(embedding_dim, rnn_hidden, num_layers, src_vocab_size,
                tar_vocab_size, bi, attention, attn_type, attn_dim)

criterion = nn.NLLLoss(ignore_index=3)
optim = torch.optim.Adam(model.parameters(), lr)

if cuda:
    model = model.cuda()  # assumption: move the model to GPU when CUDA is available
Example #25
def train_deep_model(model_type='cnn',
                     data_path='',
                     model_save_path='',
                     word_vocab_path='',
                     label_vocab_path='',
                     min_count=1,
                     max_len=300,
                     batch_size=128,
                     nb_epoch=10,
                     embedding_dim=128,
                     hidden_dim=128,
                     col_sep='\t',
                     num_filters=2,
                     filter_sizes='3,4,5',
                     dropout=0.5):
    # data reader
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split(" "))

    # word vocab
    word_vocab = build_vocab(word_lst,
                             min_count=min_count,
                             sort=True,
                             lower=True)
    write_vocab(word_vocab, word_vocab_path)

    # label
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    # category
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('Shape of label tensor: %s' % str(data_label.shape))

    # init feature
    # the HAN model needs a [doc, sentence] shaped feature (rank 3); other models use a [sentence] shaped feature (rank 2)
    if model_type == 'han':
        logger.info(
            'Hierarchical Attention Network model feature_type must be: doc_vectorize'
        )
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'

    word_dic = {}
    count = 1
    for word in word_vocab:
        word_dic[word] = count
        count += 1
    data_filter = []
    for line in data_content:
        line_filter = " ".join(
            list(filter(lambda x: x in word_dic, line.split(" "))))
        data_filter.append(line_filter)
    feature = Feature(data=data_filter,
                      feature_type=feature_type,
                      word_vocab=word_vocab,
                      max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(data_feature,
                                                      data_label,
                                                      test_size=0.1,
                                                      random_state=0)
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len,
                               vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim,
                               num_classes=num_classes)
    elif model_type == 'cnn':
        model = load_model(model_save_path)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    # loss, accuracy = model.evaluate(X_val, y_val)
    # print(loss, accuracy)
    pre_label = model.predict(X_val, batch_size=32, verbose=0, steps=None)
    print(y_val)
    print(type(y_val))
    with open("./output/result", "w") as f:
        for i in range(len(y_val)):
            f.write("%s\t%f\n" % (y_val[i][2], pre_label[i][2]))