Example #1
def test():
    """
    Background test method.
    """
    config = Config()
    device = torch.device("cpu")
    # if config.use_cuda:
    #   device = torch.cuda.set_device(config.gpu[0])
    print('loading corpus')
    vocab_mask = load_vocab(config.vocab)
    label_dic = load_vocab(config.label_file)
    model = TransXL(tag_vocab=dict([val, key]
                                   for key, val in label_dic.items()),
                    bert_config=config.bert_path)
    model = load_train_model(model)
    #   model.crf.use_cuda = False
    model.to(device)
    model.eval()

    while True:
        line = input("input sentence, please:")
        mems = None
        feature = get_text_line_feature(line, vocab_mask, max_length=512)
        input_id = torch.LongTensor(feature.input_id).unsqueeze(0)
        ids, mems = model.predict(input_id, mems)["pred"]
        ids = ids.squeeze(0).numpy().tolist()
        pre_tags = id2tag(label_dic, ids)
        if config.label_mode == "BIOSE":
            result = decode_tags_io(line, pre_tags[1:-1])
        else:
            result = decode_tags_bio(line, pre_tags[1:-1])
        print(result)
Example #2
    def __init__(self, txt_path, in_vocab_path, out_vocab_path):
        """Read txt file, input vocab and output vocab (punc vocab)."""
        self.txt_seqs = open(txt_path, encoding='utf8').readlines()
        self.num_seqs = len(self.txt_seqs)
        self.word2id = utils.load_vocab(in_vocab_path,
                                        extra_word_list=["<UNK>", "<END>"])
        self.punc2id = utils.load_vocab(out_vocab_path, extra_word_list=[" "])
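A note on the helper itself: the `load_vocab` used in these snippets is project-specific and not shown here. A minimal sketch that matches the call signature above (one token per line, with an optional `extra_word_list` appended at the end) could look like the following; the file format and return type are assumptions, not the original implementation.

def load_vocab(vocab_path, extra_word_list=None):
    """Sketch only: map each token (one per line) to its index, then append extras."""
    extra_word_list = extra_word_list or []
    word2id = {}
    with open(vocab_path, encoding='utf8') as f:
        for line in f:
            token = line.strip()
            if token and token not in word2id:
                word2id[token] = len(word2id)
    for token in extra_word_list:  # e.g. ["<UNK>", "<END>"], assumed to go at the end
        if token not in word2id:
            word2id[token] = len(word2id)
    return word2id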
Example #3
def test():
    """Test Model in test file"""
    # load config
    config = Config()
    print('settings:\n', config)
    # load corpus
    print('loading corpus')
    vocab = load_vocab(config.vocab)
    label_dic = load_vocab(config.tri_cls_label_file)
    # load train and dev and test dataset
    test_data = read_corpus_tri_cls(config.tri_cls_test_file,
                                    max_length=config.max_length,
                                    vocab=vocab)
    test_ids = torch.LongTensor([temp[0] for temp in test_data])
    test_masks = torch.LongTensor([temp[1] for temp in test_data])
    test_types = torch.LongTensor([temp[2] for temp in test_data])
    test_tags = torch.LongTensor([temp[3] for temp in test_data])
    test_dataset = TensorDataset(test_ids, test_masks, test_types, test_tags)
    test_loader = DataLoader(test_dataset,
                             shuffle=False,
                             batch_size=config.batch_size)
    # init model
    model = BertQA(config.bert_path, 2)
    model = load_model(model, name=config.load_tri_cls_path)
    if config.use_cuda and torch.cuda.is_available():
        model.cuda()
    # test model
    evaluate(model, test_loader, 0, config)
Example #4
    def __init__(self, txt_path, in_vocab_path, out_vocab_path):
        """Read txt file, input vocab and output vocab (punc vocab)."""
        self.txt_seqs = open(txt_path, encoding='utf8', errors='ignore').readlines()
        self.word2id = utils.load_vocab(in_vocab_path,
                                        extra_word_list=["<UNK>", "<END>"])
        self.punc2id = utils.load_vocab(out_vocab_path,
                                        extra_word_list=[" "])
        self.class2punc = {k: v for (v, k) in self.punc2id.items()}
Example #5
def train(**kwargs):
    config = Config()
    config.update(**kwargs)
    print('current config:\n', config)
    if config.use_cuda:
        torch.cuda.set_device(config.gpu)
    print('loading corpus')
    vocab = load_vocab(config.vocab)
    label_dic = load_vocab(config.label_file, user_define=USER_DEFINE)
    tagset_size = len(label_dic)
    train_data = read_corpus(config.train_file, max_length=config.max_length, label_dic=label_dic, vocab=vocab, user_define=USER_DEFINE)
    dev_data = read_corpus(config.dev_file, max_length=config.max_length, label_dic=label_dic, vocab=vocab, user_define=USER_DEFINE)

    train_ids = torch.LongTensor([temp.input_id for temp in train_data])
    train_masks = torch.LongTensor([temp.input_mask for temp in train_data])
    train_tags = torch.LongTensor([temp.label_id for temp in train_data])

    train_dataset = TensorDataset(train_ids, train_masks, train_tags)
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=config.batch_size)

    dev_ids = torch.LongTensor([temp.input_id for temp in dev_data])
    dev_masks = torch.LongTensor([temp.input_mask for temp in dev_data])
    dev_tags = torch.LongTensor([temp.label_id for temp in dev_data])

    dev_dataset = TensorDataset(dev_ids, dev_masks, dev_tags)
    dev_loader = DataLoader(dev_dataset, shuffle=True, batch_size=config.batch_size)
    model = BERT_LSTM_CRF(config.bert_path, tagset_size, config.bert_embedding, config.rnn_hidden, config.rnn_layer, dropout_ratio=config.dropout_ratio,
                          dropout1=config.dropout1, use_cuda=config.use_cuda)
    if config.load_model:
        assert config.load_path is not None
        model = load_model(model, name=config.load_path)
    if config.use_cuda:
        model.to(DEVICE)
    model.train()
    optimizer = getattr(optim, config.optim)
    optimizer = optimizer(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)
    eval_loss = 10000
    for epoch in range(config.base_epoch):
        step = 0
        for i, batch in enumerate(tqdm(train_loader)):
            step += 1
            model.zero_grad()
            inputs, masks, tags = batch
            inputs, masks, tags = Variable(inputs), Variable(masks), Variable(tags)
            if config.use_cuda:
                inputs, masks, tags = inputs.to(DEVICE), masks.to(DEVICE), tags.to(DEVICE)
            feats = model(inputs, masks)
            loss = model.loss(feats, masks, tags)
            loss.backward()
            optimizer.step()
            if step % 50 == 0:
                print('step: {} |  epoch: {}|  loss: {}'.format(step, epoch, loss.item()))
        loss_temp = dev(model, dev_loader, epoch, config)
        if loss_temp < eval_loss:
            eval_loss = loss_temp
            save_model(model, epoch)
Example #6
    def load_data(self):
        vocab_event = load_vocab(self.dir_data + 'vocab_event.txt', hasPad=False)
        self.vocab_event = dict({'O': 0})
        for key in vocab_event:
            if key[2:] not in self.vocab_event and key != 'O':
                self.vocab_event.update({key[2:]: len(self.vocab_event)})
        # 34 classes includes event type + None type
        self.vocab_ner = load_vocab(self.dir_data + 'vocab_ner_tail.txt')
        self.num_class_events = len(self.vocab_event)
        self.num_class_entities = len(self.vocab_ner)
Example #7
    def __init__(self, txt_path, in_vocab_path, out_vocab_path, sort=True):
        """Read txt file, input vocab and output vocab (punc vocab)."""
        self.txt_seqs = open(txt_path, encoding='utf8', errors='ignore').readlines()
        self.word2id = utils.load_vocab(in_vocab_path,
                                        extra_word_list=["<UNK>", "<END>"])
        self.punc2id = utils.load_vocab(out_vocab_path,
                                        extra_word_list=[" "])
        if sort:
            # Also need to sort in collate_fn because the sentence length
            # will change after self.preprocess()
            self.txt_seqs.sort(key=lambda x: len(x.split()), reverse=True)
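The comment about re-sorting in `collate_fn` refers to the common practice of keeping each batch sorted by sequence length (for example, to pack padded RNN inputs). A minimal sketch of such a collate function, under the assumption that each dataset item is a pair of 1-D LongTensors (inputs, labels), might be:

import torch
from torch.nn.utils.rnn import pad_sequence

def sort_and_pad_collate(batch):
    # Sort by descending input length, then pad both fields to the longest item.
    batch = sorted(batch, key=lambda item: len(item[0]), reverse=True)
    inputs = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    lengths = torch.tensor([len(seq) for seq in inputs])
    inputs = pad_sequence(inputs, batch_first=True)
    labels = pad_sequence(labels, batch_first=True)
    return inputs, labels, lengths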
Example #8
def inspect_dataset(args):
    """
    Count the number of samples that have tokens that are not in the common vocab subset
    """
    model_vocab = utils.load_vocab(args.model_vocab_file)
    print('Model vocab size:', len(model_vocab))
    common_vocab = utils.load_vocab(args.common_vocab_file)
    print('Common vocab size:', len(common_vocab))

    for file in os.listdir(args.data_dir):
        filename = os.fsdecode(file)
        rel_name = filename.replace('.jsonl', '')
        if filename.endswith('.jsonl'):
            # TODO: Following line is broken..
            facts = utils.load_TREx_data(args,
                                         os.path.join(args.data_dir, filename))
            num_common = 0  # Number of samples in the common vocab subset which is a subset of model vocab
            num_model = 0  # Number of samples in model vocab but not in common vocab
            num_neither = 0  # Number of samples in neither model nor common vocab
            for fact in tqdm(facts):
                sub, obj = fact
                # First check if object is in common vocab
                if obj in common_vocab:
                    num_common += 1
                else:
                    # If not in common vocab, could be in model vocab
                    if obj in model_vocab:
                        num_model += 1
                    else:
                        # Not in common or model vocab
                        num_neither += 1
            assert len(facts) == num_common + num_model + num_neither
            print(
                '{} -> num facts: {}, num common: {}, num model: {}, num neither: {}'
                .format(rel_name, len(facts), num_common, num_model,
                        num_neither))

            # Plot distribution of gold objects
            obj_set = Counter([obj for sub, obj in facts])
            top_obj_set = obj_set.most_common(10)
            print(top_obj_set)
            print()
            gold_objs = pd.DataFrame(top_obj_set, columns=['obj', 'freq'])

            fig, ax = plt.subplots()
            gold_objs.sort_values(by='freq').plot.barh(x='obj',
                                                       y='freq',
                                                       ax=ax)
            plt.savefig(os.path.join(args.out_dir, rel_name + '.png'),
                        bbox_inches='tight')
            plt.close()
Example #9
def create_duo_word_clouds(infold, outfold, sub, city1, city2, stopwords):
    vocab1 = utils.load_vocab('vocabs/{}/{}.vocab'.format(sub, city1), 3)
    vocab2 = utils.load_vocab('vocabs/{}/{}.vocab'.format(sub, city2), 3)

    thres1, thres2 = utils.get_threshold(sub, city1, city2)
    results1, results2 = utils.compare_vocabs(vocab1, vocab2, city1, city2,
                                              thres1, thres2)
    text1 = create_text_wc(results1)
    text2 = create_text_wc(results2)

    # frequencies1 = utils.filter_stopwords(results1, stopwords, filter_unprintable=True)
    # frequencies2 = utils.filter_stopwords(results2, stopwords, filter_unprintable=True)
    create_duo_word_clouds_helper(outfold, sub, text1, city1, text2, city2,
                                  stopwords)
Example #10
def get_baseline_model(args):
    vocab = utils.load_vocab(args.vocab_json)
    if args.baseline_start_from is not None:
        model, kwargs = utils.load_model(args.baseline_start_from)
    elif args.model_type == 'LSTM':
        kwargs = {
            'vocab': vocab,
            'rnn_wordvec_dim': args.rnn_wordvec_dim,
            'rnn_dim': args.rnn_hidden_dim,
            'rnn_num_layers': args.rnn_num_layers,
            'rnn_dropout': args.rnn_dropout,
            'fc_dims': parse_int_list(args.classifier_fc_dims),
            'fc_use_batchnorm': args.classifier_batchnorm == 1,
            'fc_dropout': args.classifier_dropout,
        }
        model = LstmModel(**kwargs)
    elif args.model_type == 'CNN+LSTM':
        kwargs = {
            'vocab': vocab,
            'rnn_wordvec_dim': args.rnn_wordvec_dim,
            'rnn_dim': args.rnn_hidden_dim,
            'rnn_num_layers': args.rnn_num_layers,
            'rnn_dropout': args.rnn_dropout,
            'cnn_feat_dim': parse_int_list(args.feature_dim),
            'cnn_num_res_blocks': args.cnn_num_res_blocks,
            'cnn_res_block_dim': args.cnn_res_block_dim,
            'cnn_proj_dim': args.cnn_proj_dim,
            'cnn_pooling': args.cnn_pooling,
            'fc_dims': parse_int_list(args.classifier_fc_dims),
            'fc_use_batchnorm': args.classifier_batchnorm == 1,
            'fc_dropout': args.classifier_dropout,
        }
        model = CnnLstmModel(**kwargs)
    elif args.model_type == 'CNN+LSTM+SA':
        kwargs = {
            'vocab': vocab,
            'rnn_wordvec_dim': args.rnn_wordvec_dim,
            'rnn_dim': args.rnn_hidden_dim,
            'rnn_num_layers': args.rnn_num_layers,
            'rnn_dropout': args.rnn_dropout,
            'cnn_feat_dim': parse_int_list(args.feature_dim),
            'stacked_attn_dim': args.stacked_attn_dim,
            'num_stacked_attn': args.num_stacked_attn,
            'fc_dims': parse_int_list(args.classifier_fc_dims),
            'fc_use_batchnorm': args.classifier_batchnorm == 1,
            'fc_dropout': args.classifier_dropout,
        }
        model = CnnLstmSaModel(**kwargs)
    if model.rnn.token_to_idx != vocab['question_token_to_idx']:
        # Make sure new vocab is superset of old
        for k, v in model.rnn.token_to_idx.items():
            assert k in vocab['question_token_to_idx']
            assert vocab['question_token_to_idx'][k] == v
        for token, idx in vocab['question_token_to_idx'].items():
            model.rnn.token_to_idx[token] = idx
        kwargs['vocab'] = vocab
        model.rnn.expand_vocab(vocab['question_token_to_idx'])
    model.cuda()
    model.train()
    return model, kwargs
Example #11
def main():
    vocab_words = load_vocab(constants.ALL_WORDS)
    train = Dataset(constants.RAW_DATA + 'train', vocab_words)
    validation = Dataset(constants.RAW_DATA + 'dev', vocab_words)
    test = Dataset(constants.RAW_DATA + 'test', vocab_words)

    # get pre-trained embeddings
    embeddings = get_trimmed_w2v_vectors(constants.TRIMMED_FASTTEXT_W2V)

    model = LstmCnnModel(
        model_name=constants.MODEL_NAMES.format('sud', constants.JOB_IDENTITY),
        embeddings=embeddings,
        batch_size=constants.BATCH_SIZE,
        constants=constants,
    )

    # train, evaluate and interact
    model.build()
    model.load_data(train=train, validation=validation)
    model.run_train(epochs=constants.EPOCHS,
                    early_stopping=constants.EARLY_STOPPING,
                    patience=constants.PATIENCE)

    y_pred = model.predict(test)
    preds = []
    labels = []
    for pred, label in zip(y_pred, test.labels):
        labels.extend(label)
        preds.extend(pred[:len(label)])

    p, r, f1, _ = precision_recall_fscore_support(labels,
                                                  preds,
                                                  average='binary')
    print('Result:\tP={:.2f}%\tR={:.2f}%\tF1={:.2f}%'.format(
        p * 100, r * 100, f1 * 100))
Example #12
    def __init__(self, path_to_map, path_to_vocab, path_to_index2sense):
        """
        Important Note: During processing new words are added to on_senses and vocabulary,
        therefore they need to be exported again.
        :param path_to_map: ontonotes sense to wordnet sense mappings
        :param path_to_vocab: index2word mapping [.csv]
        :param path_to_index2sense: index2sense mapping [.pickle]
        """
        # sense mappings
        self.on2wn = load_sense_mappings_pickle(path_to_map)
        self.index2sense = load_sense_mappings_pickle(path_to_index2sense)
        self.sense2index = dict()
        for key, value in self.index2sense.items():
            self.sense2index[value.replace("-", ".")] = key

        self.n_onsenses = len(self.sense2index)
        # get rid of that, no longer needed, save memory
        # del self.index2sense

        # Vocab
        self.word2index = load_vocab(path_to_vocab)
        self.n_words = len(self.word2index)
        # test
        print("Sense")
        print(self.sense2index["open.v.2"])
        print(self.on2wn["elaborate.v.1"])
        print(self.on2wn["elaborate.v.1"][0])
        print("Vocab:")
        print(self.word2index["elaborate"])
Example #13
async def main(args):
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                              do_lower_case=False)

    # Load common vocab subset
    common_vocab = utils.load_vocab(args.common_vocab_file)

    # Go through the TREx test set and save every sub-obj pair/fact in a dictionary
    trex_set = set()
    with open(args.trex_file, 'r') as f_in:
        lines = f_in.readlines()
        for line in tqdm(lines):
            line = json.loads(line)
            trex_set.add((line['sub_uri'], line['obj_uri']))

    # Get relation ID, i.e. P108
    filename = os.path.basename(os.path.normpath(args.in_file))
    rel_id = filename.split('.')[0]

    queries = []
    with open(args.in_file, 'r') as f_in:
        queries = f_in.readlines()

    with open(args.out_file, 'a+') as f_out:
        await map_async(
            lambda q: get_fact(q, args, tokenizer, trex_set, common_vocab,
                               f_out), queries, args.count, args.max_tasks,
            args.sleep_time)
Example #14
def main(args):
    # path to save checkpoint
    if not os.path.isdir(args.checkpoint_path):
        os.mkdir(args.checkpoint_path)
    args.checkpoint_path += '/checkpoint.pt'

    vocab = utils.load_vocab(args.vocab_json)
    train_loader_kwargs = {
        'question_h5': args.train_question_h5,
        'feature_h5': args.train_features_h5,
        'vocab': vocab,
        'batch_size': args.batch_size,
        'shuffle': args.shuffle_train_data == 1,
        'max_samples': args.num_train_samples,
        'num_workers': args.loader_num_workers,
    }
    val_loader_kwargs = {
        'question_h5': args.val_question_h5,
        'feature_h5': args.val_features_h5,
        'vocab': vocab,
        'batch_size': args.batch_size,
        'max_samples': args.num_val_samples,
        'num_workers': args.loader_num_workers,
    }

    with ClevrDataLoader(**train_loader_kwargs) as train_loader, \
       ClevrDataLoader(**val_loader_kwargs) as val_loader:
        train_loop(args, train_loader, val_loader)
Example #15
    def __init__(self,
                 args,
                 train_dataset=None,
                 dev_dataset=None,
                 test_dataset=None):
        self.args = args
        self.train_dataset = train_dataset
        self.dev_dataset = dev_dataset
        self.test_dataset = test_dataset

        self.label_lst = get_labels(args)
        self.num_labels = len(self.label_lst)

        # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
        self.pad_token_label_id = args.ignore_index

        self.word_vocab, self.char_vocab, _, _ = load_vocab(args)
        self.pretrained_word_matrix = None
        if not args.no_w2v:
            self.pretrained_word_matrix = load_word_matrix(
                args, self.word_vocab)

        self.model = BiLSTM_CNN_CRF(args, self.pretrained_word_matrix)

        # GPU or CPU
        self.device = "cuda" if torch.cuda.is_available(
        ) and not args.no_cuda else "cpu"
        self.model.to(self.device)

        self.test_texts = None
        if args.write_pred:
            self.test_texts = get_test_texts(args)
            # Empty the original prediction files
            if os.path.exists(args.pred_dir):
                shutil.rmtree(args.pred_dir)
Example #16
def test(args):
    test_path = args['--test-src']
    model_path = args['--model-path']
    batch_size = int(args['--batch-size'])
    total_examples = 0
    total_correct = 0
    vocab_path = args['--vocab-src']
    softmax = torch.nn.Softmax(dim=1)

    if args['--data'] == 'quora':
        test_data = utils.read_data(test_path, 'quora')
        vocab_data = utils.load_vocab(vocab_path)
        network = Model(args, vocab_data, 2)
        network.model = torch.load(model_path)

    if args['--cuda'] == str(1):
        network.model = network.model.cuda()
        softmax = softmax.cuda()

    network.model.eval()
    for labels, p1, p2, idx in utils.batch_iter(test_data, batch_size):
        total_examples += len(labels)
        print(total_examples)
        pred, _ = network.forward(labels, p1, p2)
        pred = softmax(pred)
        _, pred = pred.max(dim=1)
        label = network.get_label(labels)
        total_correct += (pred == label).sum().float()
    final_acc = total_correct / total_examples
    print('Accuracy of the model is %.2f' % (final_acc), file=sys.stderr)
Example #17
def main(args):
    eval_name = str(os.path.basename(args.data).split('.')[0])
    config = tf.estimator.RunConfig(model_dir=args.model_dir)
    hparams = utils.create_hparams(args)

    vocab_list = utils.load_vocab(args.vocab)
    binf2phone_np = None
    binf2phone = None
    if hparams.decoder.binary_outputs:
        binf2phone = utils.load_binf2phone(args.binf_map, vocab_list)
        binf2phone_np = binf2phone.values

    def model_fn(features, labels, mode, config, params):
        return las_model_fn(features,
                            labels,
                            mode,
                            config,
                            params,
                            binf2phone=binf2phone_np,
                            run_name=eval_name)

    model = tf.estimator.Estimator(model_fn=model_fn,
                                   config=config,
                                   params=hparams)

    tf.logging.info('Evaluating on {}'.format(eval_name))
    model.evaluate(lambda: input_fn(args.data,
                                    args.vocab,
                                    args.norm,
                                    num_channels=args.num_channels,
                                    batch_size=args.batch_size,
                                    binf2phone=None),
                   name=eval_name)
Example #18
def _create_pretrained_emb_from_txt(vocab_file,
                                    embed_file,
                                    num_trainable_tokens=3,
                                    dtype=tf.float32,
                                    scope=None):
    """Load pretrain embeding from embed file, and return an embedding matrix.
    Args:
        embed_file: embed file path.
        num_trainable_tokens: Make the first n tokens in the vocab file as trainable
            variables. Default is 3, which is "<unk>", "<s>" and "</s>".
    """
    _, vocab, _ = load_vocab(vocab_file)
    trainable_tokens = vocab[:num_trainable_tokens]
    print("Using pretrained embedding: %s." % embed_file)
    print("  with trainable tokens:")
    emb_dict, emb_size = _load_embed_txt(embed_file)
    for token in trainable_tokens:
        print("    %s" % token)
    for token in vocab:
        if token not in emb_dict:
            emb_dict[token] = [0.0] * emb_size
    emb_mat = np.array([emb_dict[token] for token in vocab],
                       dtype=dtype.as_numpy_dtype())
    emb_mat = tf.constant(emb_mat)
    emb_mat_const = tf.slice(emb_mat, [num_trainable_tokens, 0], [-1, -1])
    with tf.variable_scope(scope or "pretrain_embeddings", dtype=dtype):
        emb_mat_var = tf.get_variable("emb_mat_var",
                                      [num_trainable_tokens, emb_size])  # TODO
    return tf.concat([emb_mat_var, emb_mat_const], 0)
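A hypothetical call site (TF1-style; the file paths and placeholder name below are assumptions, not from the original project), just to show that the returned matrix feeds directly into an embedding lookup:

import tensorflow as tf

source_ids = tf.placeholder(tf.int32, shape=[None, None], name="source_ids")
emb_matrix = _create_pretrained_emb_from_txt("vocab.txt", "glove.6B.300d.txt")
source_emb = tf.nn.embedding_lookup(emb_matrix, source_ids)  # [batch, time, emb_size]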
Example #19
def main():
    parser = argparse.ArgumentParser(description="Running Simlex test")
    parser.add_argument(
        "--vocab_file_pattern",
        type=str,
        default=None,
        help="vocab path file or file pattern in case of multiple files",
        required=True)
    parser.add_argument(
        "--vector_file_pattern",
        type=str,
        default=None,
        help="vector path file or file pattern in case of multiple files",
        required=True)
    parser.add_argument("--output_file",
                        type=str,
                        default=None,
                        help="file to write output to",
                        required=True)

    args = parser.parse_args()
    vocab_files = glob.glob(str(args.vocab_file_pattern))
    vector_files = glob.glob(str(args.vector_file_pattern))

    with open(os.path.join(ROOT_DIR, f'simlex/{args.output_file}'), 'w') as f:
        for voc, vec in zip(vocab_files, vector_files):
            file_name = os.path.splitext(os.path.basename(voc))[0][4:]
            vocab = load_vocab(voc)
            vectors = load_vectors(vec)
            simlex_score = eval_simlex(simlex_pairs, vocab, vectors)
            f.write('{}: {}'.format(file_name, simlex_score))
            f.write('\n')
        f.close()
Example #20
def main():
    opt.load_full = not opt.debug
    word2id, id2word = utils.load_vocab()
    glove_emb = utils.load_glove_emb(word2id)

    word_emb = nn.Embedding.from_pretrained( \
        torch.tensor(glove_emb, dtype=torch.float))

    model = HierarchicalSeq2seq(word_emb=word_emb,
                                word_emb_dim=300,
                                word_vocab_size=len(id2word)).cuda()

    if opt.mode == "train":

        optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, \
                                         model.parameters()),
                                  lr=opt.learning_rate,
                                  initial_accumulator_value=opt.init_accum)

        train_data = utils.load_train_data(demo=opt.demo)

        run_training(model, train_data, optimizer, word2id, id2word, opt)

    elif opt.mode == "inference":

        test_data = utils.load_test_data(setup=opt.setup, demo=opt.demo)
        run_inference(model, test_data, word2id, id2word, opt)
Example #21
    def __init__(self, data_dir, vocab_path, random_seed=None):
        self.data_dir = data_dir
        self.vocab = load_vocab(vocab_path)
        # self.num_examples = {"train": -1, "dev": -1, "infer": -1}
        self.num_examples = {"train": -1, "dev": -1, "infer": -1}
	
        np.random.seed(random_seed)
Example #22
def main(args):
    print('loading trained model from %s' %
          (args.load_path + '/checkpoint.pt'))
    model, kwargs = utils.load_model(args.load_path + '/checkpoint.pt')
    model.cuda()
    model.eval()

    vocab = utils.load_vocab(args.vocab_json)
    test_loader_kwargs = {
        'question_h5': args.test_question_h5,
        'feature_h5': args.test_features_h5,
        'vocab': vocab,
        'batch_size': args.batch_size,
        'max_samples': None,
        'num_workers': args.loader_num_workers,
    }

    print('loading test data')
    with ClevrDataLoader(**test_loader_kwargs) as test_loader:
        print('%d samples in the test set' % len(test_loader.dataset))

        print('checking test accuracy...')
        acc = check_accuracy(args, model, test_loader)
    print('test accuracy = %.4f' % acc)

    with open(args.load_path + '/checkpoint.pt.json') as f:
        info = json.load(f)

    with open(args.load_path + '/result.txt', 'w') as res:
        res.write('test accuracy: %4f\n' % acc)
        res.write('best val accuracy: %4f\n' % info['best_val_acc'])
        res.write('arguments: \n')
        for k, v in vars(args).items():
            res.write(str(k) + ': ' + str(v) + '\n')
Example #23
def main(args):
    config = tf.estimator.RunConfig(model_dir=args.model_dir)
    hparams = utils.create_hparams(args)

    hparams.decoder.set_hparam('beam_width', args.beam_width)

    vocab_list = utils.load_vocab(args.vocab)
    vocab_list_orig = vocab_list
    binf2phone_np = None
    binf2phone = None
    mapping = None
    if hparams.decoder.binary_outputs:
        if args.mapping is not None:
            vocab_list, mapping = utils.get_mapping(args.mapping, args.vocab)
            hparams.del_hparam('mapping')
            hparams.add_hparam('mapping', mapping)

        binf2phone = utils.load_binf2phone(args.binf_map, vocab_list)
        binf2phone_np = binf2phone.values

    def model_fn(features, labels, mode, config, params):
        return las_model_fn(features,
                            labels,
                            mode,
                            config,
                            params,
                            binf2phone=binf2phone_np)

    model = tf.estimator.Estimator(model_fn=model_fn,
                                   config=config,
                                   params=hparams)

    phone_pred_key = 'sample_ids_phones_binf' if args.use_phones_from_binf else 'sample_ids'
    predict_keys = [phone_pred_key, 'embedding', 'alignment']
    if args.use_phones_from_binf:
        predict_keys.append('logits_binf')
        predict_keys.append('alignment_binf')

    audio, _ = librosa.load(args.waveform, sr=SAMPLE_RATE, mono=True)
    features = [calculate_acoustic_features(args, audio)]

    predictions = model.predict(
        input_fn=lambda: input_fn(features,
                                  args.vocab,
                                  args.norm,
                                  num_channels=features[0].shape[-1],
                                  batch_size=args.batch_size),
        predict_keys=predict_keys)
    predictions = list(predictions)
    for p in predictions:
        beams = p[phone_pred_key].T
        if len(beams.shape) > 1:
            i = beams[0]
        else:
            i = beams
        i = i.tolist() + [utils.EOS_ID]
        i = i[:i.index(utils.EOS_ID)]
        text = to_text(vocab_list, i)
        text = text.split(args.delimiter)
        print(text)
Example #24
def main(args):

    vocab_list = np.array(utils.load_vocab(args.vocab))

    vocab_size = len(vocab_list)

    config = tf.estimator.RunConfig(model_dir=args.model_dir)
    hparams = utils.create_hparams(
        args, vocab_size, utils.SOS_ID, utils.EOS_ID)

    hparams.decoder.set_hparam('beam_width', args.beam_width)

    model = tf.estimator.Estimator(
        model_fn=las_model_fn,
        config=config,
        params=hparams)

    predictions = model.predict(
        input_fn=lambda: input_fn(
            args.data, args.vocab, num_channels=args.num_channels, batch_size=args.batch_size, num_epochs=1),
        predict_keys='sample_ids')

    if args.beam_width > 0:
        predictions = [vocab_list[y['sample_ids'][:, 0]].tolist() + [utils.EOS]
                       for y in predictions]
    else:
        predictions = [vocab_list[y['sample_ids']].tolist() + [utils.EOS]
                       for y in predictions]

    predictions = [' '.join(y[:y.index(utils.EOS)]) for y in predictions]

    with open(args.save, 'w') as f:
        f.write('\n'.join(predictions))
Example #25
def word_sents_to_lword_id_AND_casing_id_sents(word_sents):
    vocab = load_vocab()

    lword_id_sents = []
    casing_id_sents = []

    for word_sent in word_sents:
        lword_id_sent = []
        casing_id_sent = []

        for word in word_sent:
            lword = word.lower()
            lword_id = vocab[lword] if lword in vocab else 1
            lword_id_sent.append(lword_id)

            casing_id = casing_to_id[word_to_casing(word)]
            casing_id_sent.append(casing_id)

        lword_id_sents.append(lword_id_sent)
        casing_id_sents.append(casing_id_sent)

    # Pad sentences
    max_sent_len = len(max(word_sents, key=len))
    lword_id_sents = pad_sequences(lword_id_sents, maxlen=max_sent_len)
    casing_id_sents = pad_sequences(casing_id_sents, maxlen=max_sent_len)

    return lword_id_sents, casing_id_sents
Example #26
    def __init__(self, config, embeddings, ntags, nchars=None):

        self.config = config
        self.nchars = nchars
        self.ntags = ntags
        self.embeddings = embeddings
        self.logger = config.logger  # now instantiated in config
        self.vocab_words = load_vocab(self.config.words_filename)
        self.vocab_labels = load_vocab(self.config.labels_filename)
        self.idx_to_word = {
            idx: word
            for word, idx in self.vocab_words.items()
        }
        self.idx_to_tag = {
            idx: word
            for word, idx in self.vocab_labels.items()
        }
Example #27
    def __init__(self, filepath):
        vlist, vdict = load_vocab()
        with open(filepath) as f:
            data = f.read().splitlines()
        data = [sent.split(' ') for sent in data]
        self.data = [[vdict[x] for x in sent] for sent in data]
        self.lengths = torch.tensor([len(x) + 1 for x in self.data])
        self.size = len(self.lengths)
Example #28
def main():
    timer = Timer()
    timer.start('Load word2vec models...')
    vocab = load_vocab(config.VOCAB_DATA)
    embeddings = get_trimmed_w2v_vectors(config.W2V_DATA)
    timer.stop()

    timer.start('Load data...')
    train = process_data(opt.train, vocab)
    if opt.val is not None:
        if opt.val != '1vs9':
            validation = process_data(opt.val, vocab)
        else:
            validation, train = train.one_vs_nine()
    else:
        validation = None

    if opt.test is not None:
        test = process_data(opt.test, vocab)
    else:
        test = None
    timer.stop()

    timer.start('Build model...')
    model = CnnModel(embeddings=embeddings)
    model.build()
    timer.stop()

    timer.start('Train model...')
    epochs = opt.e
    batch_size = opt.b
    early_stopping = True if opt.p != 0 else False
    patience = opt.p
    pre_train = opt.pre if opt.pre != '' else None
    model_name = opt.name

    model.train(
        model_name,
        train=train,
        validation=validation,
        epochs=epochs,
        batch_size=batch_size,
        early_stopping=early_stopping,
        patience=patience,
        cont=pre_train,
    )
    timer.stop()

    if test is not None:
        timer.start('Test model...')
        preds = model.predict(test, model_name)
        labels = test.labels

        p, r, f1, _ = precision_recall_fscore_support(labels,
                                                      preds,
                                                      average='binary')
        print('Testing result:P=\t{}\tR={}\tF1={}'.format(p, r, f1))
        timer.stop()
Example #29
    def __init__(self, log_dir, cfg):

        self.path = log_dir
        self.cfg = cfg

        if cfg.TRAIN.FLAG:
            self.model_dir = os.path.join(self.path, 'Model')
            self.log_dir = os.path.join(self.path, 'Log')
            mkdir_p(self.model_dir)
            mkdir_p(self.log_dir)
            self.writer = SummaryWriter(log_dir=self.log_dir)
            sys.stdout = Logger(logfile=os.path.join(self.path, "logfile.log"))

        self.data_dir = cfg.DATASET.DATA_DIR
        self.max_epochs = cfg.TRAIN.MAX_EPOCHS
        self.snapshot_interval = cfg.TRAIN.SNAPSHOT_INTERVAL

        s_gpus = cfg.GPU_ID.split(',')
        self.gpus = [int(ix) for ix in s_gpus]
        self.num_gpus = len(self.gpus)

        self.batch_size = cfg.TRAIN.BATCH_SIZE
        self.lr = cfg.TRAIN.LEARNING_RATE

        torch.cuda.set_device(self.gpus[0])
        cudnn.benchmark = True

        # load dataset
        self.dataset = ClevrDataset(data_dir=self.data_dir, split="train")
        self.dataloader = DataLoader(dataset=self.dataset,
                                     batch_size=cfg.TRAIN.BATCH_SIZE,
                                     shuffle=True,
                                     num_workers=cfg.WORKERS,
                                     drop_last=True,
                                     collate_fn=collate_fn)

        self.dataset_val = ClevrDataset(data_dir=self.data_dir, split="val")
        self.dataloader_val = DataLoader(dataset=self.dataset_val,
                                         batch_size=200,
                                         drop_last=True,
                                         shuffle=False,
                                         num_workers=cfg.WORKERS,
                                         collate_fn=collate_fn)

        # load model
        self.vocab = load_vocab(cfg)
        self.model, self.model_ema = mac.load_MAC(cfg, self.vocab)
        self.weight_moving_average(alpha=0)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        self.previous_best_acc = 0.0
        self.previous_best_epoch = 0

        self.total_epoch_loss = 0
        self.prior_epoch_loss = 10

        self.print_info()
        self.loss_fn = torch.nn.CrossEntropyLoss().cuda()
Example #30
def main(args):
    if args.randomize_checkpoint_path == 1:
        name, ext = os.path.splitext(args.checkpoint_path)
        num = random.randint(1, 1000000)
        args.checkpoint_path = '%s_%06d%s' % (name, num, ext)

    '''load the vocabulary'''
    vocab = utils.load_vocab(args.vocab_json)

    if args.use_local_copies == 1:
        shutil.copy(args.train_question_h5, '/tmp/train_questions.h5')
        shutil.copy(args.train_features_h5, '/tmp/train_features.h5')
        shutil.copy(args.val_question_h5, '/tmp/val_questions.h5')
        shutil.copy(args.val_features_h5, '/tmp/val_features.h5')
        args.train_question_h5 = '/tmp/train_questions.h5'
        args.train_features_h5 = '/tmp/train_features.h5'
        args.val_question_h5 = '/tmp/val_questions.h5'
        args.val_features_h5 = '/tmp/val_features.h5'

    question_families = None
    if args.family_split_file is not None:
        with open(args.family_split_file, 'r') as f:
            question_families = json.load(f)

    ''' ** Dataloaders creation **
    Create two ClevrDataloaders for training and validation'''
    trainer_loader_kwargs = {
        'question_h5': args.train_question_h5,
        'feature_h5': args.train_features_h5,
        'vocab': vocab,
        'shuffle': args.shuffle_train_data == 1,
        'question_families': question_families,
        'max_samples': args.num_train_samples,
        'num_workers': args.loader_num_workers,
        'images_path': args.train_images
    }

    val_loader_kwargs = {
        'question_h5': args.val_question_h5,
        'feature_h5': args.val_features_h5,
        'vocab': vocab,
        'question_families': question_families,
        'max_samples': args.num_val_samples,
        'num_workers': args.loader_num_workers,
        'images_path': args.val_images
    }

    train_loader = ClevrDataLoader(**trainer_loader_kwargs)
    val_loader = ClevrDataLoader(**val_loader_kwargs)
    ''' ** Dataloaders created ** '''

    train_loop(args, vocab, train_loader, val_loader)

    if args.use_local_copies == 1 and args.cleanup_local_copies == 1:
        os.remove('/tmp/train_questions.h5')
        os.remove('/tmp/train_features.h5')
        os.remove('/tmp/val_questions.h5')
        os.remove('/tmp/val_features.h5')
Example #31
def generate(start_word, length):
    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab_file",
                        type=str,
                        default="data/vocab.pkl",
                        help="Vocabulary dictionary")
    parser.add_argument("--vocab_size",
                        type=int,
                        default=2854,
                        help="Vocabulary size")
    parser.add_argument("--embedding_dim",
                        type=int,
                        default=256,
                        help="Dimensionality of the words embedding")
    parser.add_argument("--rnn_size",
                        type=int,
                        default=128,
                        help="Hidden units of rnn layer ")
    parser.add_argument("--num_layers",
                        type=int,
                        default=2,
                        help="Number of rnn layer")
    parser.add_argument("--batch_size",
                        type=int,
                        default=1,
                        help="Minibatch size")
    args, _ = parser.parse_known_args()

    vocab_dict = utils.load_vocab(args.vocab_file)
    index2word = dict(zip(vocab_dict.values(), vocab_dict.keys()))

    text = [start_word]
    text_data = utils.transform(text, vocab_dict)

    checkpoint_dir = os.path.abspath(
        os.path.join(os.path.curdir, "checkpoints"))
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session()
        with sess.as_default():
            rnn = RNNLM(vocab_size=args.vocab_size,
                        embedding_dim=args.embedding_dim,
                        rnn_size=args.rnn_size,
                        num_layers=args.num_layers,
                        batch_size=args.batch_size,
                        training=False)
            saver = tf.train.Saver()
            saver.restore(sess, checkpoint_file)

            for _ in range(length):
                data = np.array([text_data])
                predictions = sess.run(rnn.prediction,
                                       feed_dict={rnn.input_data: data})
                text_data.append(predictions[-1])

    content = [index2word[index] for index in text_data]
    return "".join(content)
Example #32
def filter_by_freq(filename, freq=100):
    vocab = load_vocab('../gen_data/vocab.1b')
    fout = open(filename+'.f'+str(freq),'w')
    with open(filename) as fin:
        for line in fin:
            arr = line.strip().split('\t')
            if len(arr) != 2:
                logging.info("error line: %s"%('\t'.join(arr)))
                continue
            if arr[0] not in vocab or arr[1] not in vocab:
                logging.info("not in vocab : %s"%('\t'.join(arr)))
                continue
            if vocab[arr[0]] < freq and vocab[arr[1]] < freq:
                logging.info("low freq: %s"%('\t'.join(arr)))
            else:
                fout.write(line)
    fout.close()
Example #33
def filter_easy_freebase_by_vocab(filename, vocabfile):
	logging.info('BEGIN: cleaning freebasefile: %s'%filename)
	vocab = load_vocab(vocabfile)
	fout = open(filename+'.freq','w')
	with open(filename) as fin:
		for line in fin:
			arr = line.strip().split('\t')
			head = arr[0].lower()
			rel  = arr[1].lower()
			tail = arr[2].lower()
			if head not in vocab or tail not in vocab:
				continue
			if len(head) < 2 or len(tail) < 2:
				continue 
			if vocab[head] < 50 or vocab[tail] < 50:
				continue
			if rel == 'is-a' or '/' in rel or '(' in rel:
				continue
			rel = rel.replace(' ','_')
			fout.write('%s\t%s\t%s\n'%(head,rel,tail))
	fout.close()
	logging.info('END')
Example #34
from model import Model
from config import Config
from utils import build_data, load_vocab, get_processing_word, Dataset, \
    clear_data_path, get_trimmed_glove_vectors, write_clear_data_pd
import sys
with open(sys.argv[1], "r") as f:
    pipeline = '\t'.join(f.readlines())

config = Config(pipeline)
# load vocabs
vocab_words = load_vocab(config.words_filename)
vocab_labels = load_vocab(config.labels_filename)
vocab_chars = load_vocab(config.chars_filename)

# get processing functions
processing_word = get_processing_word(
    vocab_words, vocab_chars, lowercase=True, chars=config.chars)
processing_label = get_processing_word(
    vocab_labels, lowercase=False, label_vocab=True)

# get pre-trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

test_filepath, _ = write_clear_data_pd(
    config.test_filename, config.DEFAULT, domain=config.domain)
test = Dataset(test_filepath, processing_word, processing_label,
               config.max_iter)

# build model
model = Model(
    config, embeddings, ntags=len(vocab_labels), nchars=len(vocab_chars))
Example #35
if __name__=="__main__":


    image_paths = {}
    root_path = "/srv/data/datasets/mscoco/images/"

    for split in 'train val'.split():
        image_ids_path = "datasets/vqa/"+split+"/img_ids.txt"
        image_ids = set([int(x.strip()) for x in open(image_ids_path).readlines()])
        print(split,len(image_ids))
        for x in image_ids:
            name = 'COCO_'+split+'2014_'+format(x, '012')+'.jpg'
            path = join(root_path,split+"2014",name)
            image_paths[x] = path

    q_i2w, q_w2i = load_vocab('datasets/vqa/train/questions.vocab')
    a_i2w, a_w2i = load_vocab('datasets/vqa/train/answers.vocab')

    train_set = Dataset('datasets/vqa/train/dataset.h5',image_paths)
    max_mc = train_set.multiple_choice.shape[-1]
    max_q = train_set.max_q
    val_set = Dataset('datasets/vqa/val/dataset.h5',image_paths,max_q=max_q,max_mc=max_mc)
    Nq = len(q_i2w)
    Na = len(a_i2w)
    
    tf.reset_default_graph()
    # Read the model
    with open("tensorflow-vgg16/vgg16.tfmodel",
              mode='rb') as f:
        fileContent = f.read()
    graph_def = tf.GraphDef()
Example #36
    d1 = 512
    demb = 50
    model_name = "marginloss_wholeimage"
    
    image_paths = {}
    root_path = "/srv/data/datasets/mscoco/images/"

    for split in 'train val'.split():
        image_ids_path = "datasets/vqa/"+split+"/img_ids.txt"
        image_ids = set([int(x.strip()) for x in open(image_ids_path).readlines()])
        print(split,len(image_ids))
        for x in image_ids:
            name = 'COCO_'+split+'2014_'+format(x, '012')+'.jpg'
            path = join(root_path,split+"2014",name)
            image_paths[x] = path  
    i2w, w2i = load_vocab('datasets/vqa/vocabulary.txt')

    train_set = Dataset('datasets/vqa/train/dataset_cleaner.h5',
                        image_paths)
    max_mc = train_set.multiple_choice.shape[-1]
    max_q = train_set.max_q
    val_set = Dataset('datasets/vqa/val/dataset_cleaner.h5',
                      image_paths,max_q=max_q,max_mc=max_mc)

    Nvocab = len(i2w)

    tf.reset_default_graph()
    # Read the model
    with open("tensorflow-vgg16/vgg16.tfmodel",
              mode='rb') as f:
        fileContent = f.read()
Example #37
def train_lstm():

    optimizer=adam  # only adam is supported for now.
    options = locals().copy()
    with open(prm.outpath, "a") as fout:
        fout.write("parameters:" + str(options) + str(prm.__dict__))

    print "loading dictionary..."
    vocab = utils.load_vocab(prm.vocab_path, prm.n_words)
    options['vocab'] = vocab

    options['vocabinv'] = {}
    for k,v in vocab.items():
        options['vocabinv'][v] = k

    print 'Loading data...'
    options['wiki'] = wiki.Wiki(prm.pages_path)
    options['wikiemb'] = wiki_emb.WikiEmb(prm.pages_emb_path)
    qpp = qp.QP(prm.qp_path)
    q_train, q_valid, q_test = qpp.get_queries()
    a_train, a_valid, a_test = qpp.get_paths()

    print 'Building model'
    # This creates the initial parameters as np ndarrays.
    # Dict name (string) -> np ndarray
    params, exclude_params = init_params()

    if prm.reload_model:
        load_params(prm.reload_model, params)

    if prm.wordemb_path:
        print 'loading pre-trained weights for word embeddings'
        params = load_wemb(params, vocab)
        options['W'] = params['W']

    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams have different copies of the weights.
    tparams = init_tparams(params)

    mean = theano.shared(np.zeros((prm.dim_proj,)).astype(config.floatX)) # avg of the training set
    std = theano.shared(np.zeros((prm.dim_proj,)).astype(config.floatX)) # std of the training set
    t_samples = theano.shared(np.zeros((1,)).astype(config.floatX)) # total number of samples so far
    stats_vars = {'mean': mean, 'std': std, 't_samples': t_samples}
    
    if prm.supervised:
        baseline_vars = {}
    else:
        R_mean = theano.shared(0.71*np.ones((1,)), name='R_mean')
        R_std = theano.shared(np.ones((1,)), name='R_std')
        baseline_vars = {'R_mean': R_mean, 'R_std': R_std}


    is_train, sup, max_hops, k_beam, tq, tq_m, troot_pages, tacts_p, f_pred, cost, \
            scan_updates, baseline_updates, stats_updates, consider_constant, \
            opt_out = \
            build_model(tparams, baseline_vars, stats_vars, options)
            
            
    if prm.decay_c > 0.:
        decay_c = theano.shared(np_floatX(prm.decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay
    
    #get only parameters that are not in the exclude_params list
    tparams_ = OrderedDict([(kk, vv) for kk, vv in tparams.iteritems() if kk not in exclude_params])

    grads = tensor.grad(cost, wrt=itemlist(tparams_), consider_constant=consider_constant)

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams_, grads, tq, tq_m, troot_pages, tacts_p, cost, scan_updates, baseline_updates, \
                                       stats_updates, opt_out=[opt_out['R'], opt_out['page_idx'], opt_out['best_answer'], opt_out['best_page_idx']])

    print 'Optimization'

    if prm.train_size == -1:
        train_size = len(q_train)
    else:
        train_size = prm.train_size

    if prm.valid_size == -1:
        valid_size = len(q_valid)
    else:
        valid_size = prm.valid_size

    if prm.test_size == -1:
        test_size = len(q_test)
    else:
        test_size = prm.test_size

    with open(prm.outpath, "a") as fout:
        fout.write("\n%d train examples" % len(q_train)) 
    with open(prm.outpath, "a") as fout:
        fout.write("\n%d valid examples" % len(q_valid)) 
    with open(prm.outpath, "a") as fout:
        fout.write("\n%d test examples" % len(q_test))

    history_errs = []
    best_p = None

    if prm.validFreq == -1:
        validFreq = len(q_train) / prm.batch_size_train
    else:
        validFreq = prm.validFreq

    if prm.saveFreq == -1:
        saveFreq = len(q_train) / prm.batch_size_train
    else:
        saveFreq = prm.saveFreq

    uidx = 0  # the number of update done
    estop = False  # early stop
    start_time = time.time()

    try:
        for eidx in xrange(prm.max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = get_minibatches_idx(len(q_train), prm.batch_size_train, shuffle=True)

            for _, train_index in kf:
                st = time.time()

                uidx += 1
                is_train.set_value(1.)
                max_hops.set_value(prm.max_hops_train) # select training dataset
                k_beam.set_value(1) # Training does not use beam search
                
                # Select the random examples for this minibatch
                queries = [q_train[t].lower() for t in train_index]
                actions = [a_train[t] for t in train_index]
                
                if prm.supervised == 1:
                    sup_ = True
                elif prm.supervised > 1:
                    if uidx % (int(uidx / prm.supervised) + 1) == 0:
                        sup_ = True
                    else: 
                        sup_ = False
                else:
                    sup_ = False
                    
                if sup_:
                    sup.set_value(1.) # select supervised mode
                    # Get correct actions (supervision signal)
                    acts_p =  get_acts(actions, prm.max_hops_train, k_beam=1)
                else:
                    sup.set_value(0.) # select non-supervised mode
                    acts_p = -np.ones((prm.max_hops_train+1, len(queries)), dtype=np.float32)

                root_pages = get_root_pages(actions)
                
                # Get the BoW for the queries
                q_bow, q_m = utils.BOW2(queries, vocab, prm.max_words_query*prm.n_consec)
                n_samples += len(queries)
                cost, R, pagesidx, best_answer, best_page_idx = f_grad_shared(q_bow, q_m, root_pages, acts_p)
                f_update(prm.lrate) 
                if np.isnan(cost) or np.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.

                if np.mod(uidx, prm.dispFreq) == 0:
                    with open(prm.outpath, "a") as fout:
                        fout.write("\n\nQuery: " + queries[-1].replace("\n"," "))
                        fout.write('\nBest Answer: ' + utils.idx2text(best_answer[-1], options['vocabinv']))
                        fout.write('\nBest page: ' + options['wiki'].get_article_title(best_page_idx[-1]))

                        for i, pageidx in enumerate(pagesidx[:,-1]):
                            fout.write('\niteration: ' +str(i) + " page idx " + str(pageidx) + ' title: ' + options['wiki'].get_article_title(pageidx))
                       
                        fout.write('\nEpoch '+ str(eidx) + ' Update '+ str(uidx) + ' Cost ' + str(cost) + \
                                   ' Reward Mean ' + str(R.mean()) + ' Reward Max ' + str(R.max()) +  \
                                   ' Reward Min ' + str(R.min()))

                        fout.write("\nTime per Minibatch Update: " + str(time.time() - st))
                       

                if prm.saveto and np.mod(uidx, saveFreq) == 0:
                    print 'Saving...',

                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(prm.saveto, history_errs=history_errs, **params)
                    pkl.dump(options, open('%s.pkl' % prm.saveto, 'wb'), -1)
                    print 'Done'

                if np.mod(uidx, validFreq) == 0:

                    kf_train = get_minibatches_idx(len(q_train), prm.batch_size_pred, shuffle=True, max_samples=train_size)
                    kf_valid = get_minibatches_idx(len(q_valid), prm.batch_size_pred, shuffle=True, max_samples=valid_size)
                    kf_test = get_minibatches_idx(len(q_test), prm.batch_size_pred, shuffle=True, max_samples=test_size)

                    is_train.set_value(0.)
                    sup.set_value(0.) # supervised mode off
                    max_hops.set_value(prm.max_hops_pred)
                    k_beam.set_value(prm.k)

                    with open(prm.outpath, 'a') as fout:
                        fout.write('\n\nComputing Error Training Set')
                    train_err, train_R, train_accp = pred_error(f_pred, q_train, a_train, options, kf_train)

                    with open(prm.outpath, 'a') as fout:
                        fout.write('\n\nComputing Error Validation Set')
                    valid_err, valid_R, valid_accp = pred_error(f_pred, q_valid, a_valid, options, kf_valid)

                    with open(prm.outpath, 'a') as fout:
                        fout.write('\n\nComputing Error Test Set')
                    test_err, test_R, test_accp = pred_error(f_pred, q_test, a_test, options, kf_test)

                    history_errs.append([valid_err[-1], test_err[-1]])

                    if (uidx == 0 or
                        valid_err[-1] <= np.array(history_errs)[:,0].min()):

                        best_p = unzip(tparams)
                        bad_counter = 0

                    with open(prm.outpath, "a") as fout:
                        fout.write('\n[{per hop}, Avg] Train err ' + str(train_err) + '  Valid err ' + str(valid_err) + '  Test err ' + str(test_err))
                        fout.write('\n[{per hop}, Avg] Train R ' + str(train_R) + '  Valid R ' + str(valid_R) + '  Test R ' + str(test_R))
                        fout.write('\nAccuracy Page Actions   Train ' + str(train_accp) + '  Valid ' + str(valid_accp) + '  Test ' + str(test_accp))

                    if (len(history_errs) > prm.patience and
                        valid_err[-1] >= np.array(history_errs)[:-prm.patience,
                                                               0].min()):
                        bad_counter += 1
                        if bad_counter > prm.patience:
                            print 'Early Stop!'
                            estop = True
                            break

            with open(prm.outpath, "a") as fout:
                fout.write('\nSeen %d samples' % n_samples)

            if estop:
                break

    except KeyboardInterrupt:
        print "Training interupted"

    end_time = time.time()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    is_train.set_value(0.)
    sup.set_value(0.) # supervised mode off
    max_hops.set_value(prm.max_hops_pred)
    k_beam.set_value(prm.k)

    kf_train_sorted = get_minibatches_idx(len(q_train), prm.batch_size_train)

    train_err, train_R, train_accp = pred_error(f_pred, q_train, a_train, options, kf_train_sorted)
    valid_err, valid_R, valid_accp = pred_error(f_pred, q_valid, a_valid, options, kf_valid)
    test_err, test_R, test_accp = pred_error(f_pred, q_test, a_test, options, kf_test)

    with open(prm.outpath, "a") as fout:
        fout.write('\n[{per hop}, Avg] Train err ' + str(train_err) + '  Valid err ' + str(valid_err) + '  Test err ' + str(test_err))
        fout.write('\n[{per hop}, Avg] Train R ' + str(train_R) + '  Valid R ' + str(valid_R) + '  Test R ' + str(test_R))
        fout.write('\nAccuracy Page Actions   Train ' + str(train_accp) + '  Valid ' + str(valid_accp) + '  Test ' + str(test_accp))

    if prm.saveto:
        np.savez(prm.saveto, train_err=train_err,
                    valid_err=valid_err, test_err=test_err,
                    history_errs=history_errs, **best_p)
    with open(prm.outpath, "a") as fout:
        fout.write('\nThe code ran for %d epochs, with %f sec/epoch' % ((eidx + 1), (end_time - start_time) / (1. * (eidx + 1))))
    with open(prm.outpath, "a") as fout:
        fout.write('\nTraining took %.1fs' % (end_time - start_time))
    return train_err, valid_err, test_err
Exemplo n.º 38
0
'''
Compute the Inverse Document Frequency (IDF) of Wikipedia
articles using the vocabulary defined in <vocab_path>.
'''

import cPickle as pkl
import numpy as np
import random
import utils
from collections import OrderedDict
from nltk.tokenize import wordpunct_tokenize
import re
import parameters as prm

print 'loading vocabulary'
vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

textbegin = False
title = ''
text = ''
n = 0
f = open(prm.dump_path, "rb")

print 'creating IDF'
m = 0 # number of documents
df = {}  # word-document frequency

while True:
    line = f.readline()

    if (line == ''):
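
The excerpt above is cut off at the end-of-dump check. For reference, a minimal self-contained sketch of the same document-frequency/IDF idea, assuming `docs` is simply an iterable of raw text strings (the Wikipedia-dump parsing done by the original script is omitted here, and the exact IDF weighting it uses may differ):

import math
from collections import defaultdict

from nltk.tokenize import wordpunct_tokenize


def compute_idf(docs, vocab):
    """Return word -> IDF for the vocabulary words seen in `docs`."""
    m = 0                  # number of documents
    df = defaultdict(int)  # word -> number of documents containing the word
    for doc in docs:
        m += 1
        for w in set(wordpunct_tokenize(doc.lower())):
            if w in vocab:
                df[w] += 1
    idf = {}
    for w, n in df.items():
        idf[w] = math.log(float(m) / (1.0 + n))  # smoothed IDF
    return idf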
Exemplo n.º 39
0
Arquivo: run.py Projeto: jxwuyi/WebNav
def train_lstm():

    optimizer = adam  # only adam is supported for now.
    options = locals().copy()
    with open(prm.outpath, "a") as fout:
        fout.write("parameters:" + str(options) + str(prm.__dict__))

    print "loading dictionary..."
    vocab = utils.load_vocab(prm.vocab_path, prm.n_words)
    options['vocab'] = vocab

    options['vocabinv'] = {}
    for k,v in vocab.items():
        options['vocabinv'][v] = k

    print 'Loading data...'
    options['wiki'] = wiki.Wiki(prm.pages_path)
    options['wikiemb'] = wiki_emb.WikiEmb(prm.pages_emb_path)

    #load Q&A Wiki dataset
    qpp = qp.QP(prm.qp_path)
    q_train, q_valid, q_test = qpp.get_queries()
    a_train, a_valid, a_test = qpp.get_paths()

    print 'Building model'
    # This creates the initial parameters as np ndarrays.
    # Dict name (string) -> np ndarray
    params, exclude_params = init_params()

    if prm.wordemb_path:
        print 'loading pre-trained weights for word embeddings'
        params = load_wemb(params, vocab)
        options['W'] = params['W']

    if prm.reload_model:
        load_params(prm.reload_model, params)

    params_next = OrderedDict()
    if prm.learning.lower() == 'q_learning' and prm.update_freq > 0:
        # copy params to params_next
        for kk, kv in params.items():
            params_next[kk] = kv.copy()

    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)

    if prm.update_freq > 0:
        tparams_next = init_tparams(params_next)
    else:
        tparams_next = None
  
    if prm.learning.lower() == 'reinforce':
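        # REINFORCE baseline: shared running estimates of the reward mean and
        # std, presumably used inside build_model() to center/scale the reward
        # and reduce the variance of the policy gradient; the 0.71 init looks
        # like a task-specific prior for the expected reward.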
        R_mean = theano.shared(0.71*np.ones((1,)), name='R_mean')
        R_std = theano.shared(np.ones((1,)), name='R_std')
        baseline_vars = {'R_mean': R_mean, 'R_std': R_std}
    else:
        baseline_vars = {}

    iin, out, updates, is_train, sup, max_hops, k_beam, mixer, f_pred, consider_constant \
            = build_model(tparams, tparams_next, baseline_vars, options)

    # get only parameters that are not in the exclude_params list
    tparams_ = OrderedDict([(kk, vv) for kk, vv in tparams.iteritems() if kk not in exclude_params])

    grads = tensor.grad(out[0], wrt=itemlist(tparams_), consider_constant=consider_constant)

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams_, grads, iin, out, updates)

    print 'Optimization'

    if prm.train_size == -1:
        train_size = len(q_train)
    else:
        train_size = prm.train_size

    if prm.valid_size == -1:
        valid_size = len(q_valid)
    else:
        valid_size = prm.valid_size

    if prm.test_size == -1:
        test_size = len(q_test)
    else:
        test_size = prm.test_size

    with open(prm.outpath, "a") as fout:
        fout.write("\n%d train examples" % len(q_train)) 
    with open(prm.outpath, "a") as fout:
        fout.write("\n%d valid examples" % len(q_valid)) 
    with open(prm.outpath, "a") as fout:
        fout.write("\n%d test examples" % len(q_test))

    history_errs = []
    best_p = None

    if prm.validFreq == -1:
        validFreq = len(q_train) / prm.batch_size_train
    else:
        validFreq = prm.validFreq

    if prm.saveFreq == -1:
        saveFreq = len(q_train) / prm.batch_size_train
    else:
        saveFreq = prm.saveFreq

    uidx = 0  # the number of updates done
    estop = False  # early stop
    start_time = time.time()
    
    experience = deque(maxlen=prm.replay_mem_size) # experience replay memory as circular buffer.
    experience_r = deque(maxlen=prm.replay_mem_size) # reward of each entry in the replay memory.

    try:
        for eidx in xrange(prm.max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = get_minibatches_idx(len(q_train), prm.batch_size_train, shuffle=True)

            for _, train_index in kf:
                st = time.time()

                uidx += 1
                is_train.set_value(1.)
                max_hops.set_value(prm.max_hops_train)  # use the training-time number of hops
                k_beam.set_value(1) # Training does not use beam search
                
                # Select the random examples for this minibatch
                queries = [q_train[t].lower() for t in train_index]
                actions = [a_train[t] for t in train_index]
                
                if prm.learning.lower() == 'supervised':
                    sup.set_value(1.) # select supervised mode
                else:
                    sup.set_value(0.)

                # Get correct actions (supervision signal)
                acts_p =  get_acts(actions, prm.max_hops_train, k_beam=1)

                # MIXER
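                # `mixer` is presumably the number of initial hops that still
                # receive the supervised signal. Under REINFORCE it shrinks by
                # one hop every `prm.mixer` updates (e.g. with max_hops_train=8
                # and prm.mixer=1000, the last hop switches to RL at update
                # 1000, the next at 2000, ...); supervised training keeps all
                # hops supervised, and the other modes use no supervised prefix.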
                if prm.mixer > 0 and prm.learning.lower() == 'reinforce':
                    mixer.set_value(max(0, prm.max_hops_train - uidx // prm.mixer))
                else:
                    if prm.learning.lower() == 'supervised':
                        mixer.set_value(prm.max_hops_train+1)
                    else:
                        mixer.set_value(0)

                root_pages = get_root_pages(actions)                
                
                # Get the BoW for the queries.
                q_i, q_m = utils.text2idx2(queries, vocab, prm.max_words_query*prm.n_consec)
                n_samples += len(queries)
                
                if uidx > 1 and prm.learning.lower() == 'q_learning':
                    # Randomly select experiences and convert them to numpy arrays.
                    idxs = np.random.choice(np.arange(len(experience)), size=len(queries))
                    rvs = []
                    for j in range(len(experience[idxs[0]])):
                        rv = []
                        for idx in idxs:
                            rv.append(experience[idx][j])

                        rvs.append(np.asarray(rv))
                else:
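                    # No experiences sampled yet (or not Q-learning): feed
                    # zero-filled placeholders with the expected shapes so
                    # f_grad_shared still receives all of its replay inputs.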
                    rvs = [np.zeros((len(queries),prm.max_words_query*prm.n_consec),dtype=np.float32), # rs_q
                           np.zeros((len(queries),prm.max_words_query*prm.n_consec),dtype=np.float32), # rs_q_m
                           np.zeros((len(queries),prm.max_hops_train+1),dtype=np.int32), # rl_idx
                           np.zeros((len(queries),prm.max_hops_train+1),dtype=np.float32), # rt
                           np.zeros((len(queries),prm.max_hops_train+1),dtype=np.float32) # rr
                          ]

                cost, R, l_idx, pages_idx, best_page_idx, best_answer, mask, dist \
                        = f_grad_shared(q_i, q_m, root_pages, acts_p, uidx, *rvs)
                f_update(prm.lrate)

                if prm.learning.lower() == 'q_learning': 
                    # update weights of the next_q_val network.
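                    # (DQN-style target network: tparams_next is a frozen copy
                    # of tparams, refreshed every `update_freq` updates and at
                    # `replay_start`, presumably to stabilize the bootstrapped
                    # Q-learning targets computed in build_model.)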
                    if (prm.update_freq > 0 and uidx % prm.update_freq == 0) or (uidx == prm.replay_start):
                        for tk, tv in tparams.items():
                            if tk in tparams_next:
                                tparams_next[tk].set_value(tv.get_value().copy())

                # Only update memory after freeze_mem or before replay_start.
                if (uidx < prm.replay_start or uidx > prm.freeze_mem) and prm.learning.lower() == 'q_learning':
                    # Update Replay Memory.
                    t = np.zeros((len(queries), prm.max_hops_train+1))
                    rR = np.zeros((len(queries), prm.max_hops_train+1))

                    for i in range(len(queries)):
                        j = np.minimum(mask[i].sum(), prm.max_hops_train)
                        # If the agent chooses to stop or the episode ends,
                        # the reward will be the reward obtained with the chosen document.
                        rR[i,j] = R[i]
                        t[i,j] = 1.
                        
                        add = True
                        if prm.selective_mem >= 0 and uidx > 1:
                            # Selective memory: keep the percentage of memories
                            # with reward=1 approximately equal to <selective_mem>.
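                            # The xor below drops reward-0 entries while the
                            # fraction of reward-1 memories (pr) is still below
                            # the target, and drops reward-1 entries once the
                            # target has been reached.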
                            pr = float(np.asarray(experience_r).sum()) / max(1., float(len(experience_r)))
                            if (pr < prm.selective_mem) ^ (rR[i,j] == 1.): # xor
                                add = False

                        if add:
                            experience.append([q_i[i], q_m[i], l_idx[i], t[i], rR[i]])
                            experience_r.append(rR[i])

                if np.isnan(cost) or np.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.
    
                #if uidx % 100 == 0:
                #    vis_att(pages_idx[:,-1], queries[-1], alpha[:,-1,:], uidx, options)

                if np.mod(uidx, prm.dispFreq) == 0:
                    with open(prm.outpath, "a") as fout:
                        fout.write("\n\nQuery: " + queries[-1].replace("\n"," "))
                        fout.write('\nBest Answer: ' + utils.idx2text(best_answer[-1], options['vocabinv']))
                        fout.write('\nBest page: ' + options['wiki'].get_article_title(best_page_idx[-1]))

                        for i, page_idx in enumerate(pages_idx[:,-1]):
                            fout.write('\niteration: ' +str(i) + " page idx " + str(page_idx) + ' title: ' + options['wiki'].get_article_title(page_idx))
                       
                        fout.write('\nEpoch '+ str(eidx) + ' Update '+ str(uidx) + ' Cost ' + str(cost) + \
                                   ' Reward Mean ' + str(R.mean()) + ' Reward Max ' + str(R.max()) + \
                                   ' Reward Min ' + str(R.min()) + \
                                   ' Q-Value Max (avg per sample) ' + str(dist.max(2).mean()) + \
                                   ' Q-Value Mean ' + str(dist.mean()))
                        #fout.write("\nCost Supervised: " + str(cost_sup))
                        #fout.write("\nCost RL: " + str(cost_RL))

                        fout.write("\nTime per Minibatch Update: " + str(time.time() - st))
                       

                if prm.saveto and np.mod(uidx, saveFreq) == 0:
                    print 'Saving...',

                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(prm.saveto, history_errs=history_errs, **params)
                    pkl.dump(options, open('%s.pkl' % prm.saveto, 'wb'), -1)
                    print 'Done'

                if np.mod(uidx, validFreq) == 0 or uidx == 1:
                    if prm.visited_pages_path:
                        shuffle = False
                    else:
                        shuffle = True
                    kf_train = get_minibatches_idx(len(q_train), prm.batch_size_pred, shuffle=shuffle, max_samples=train_size)
                    kf_valid = get_minibatches_idx(len(q_valid), prm.batch_size_pred, shuffle=shuffle, max_samples=valid_size)
                    kf_test = get_minibatches_idx(len(q_test), prm.batch_size_pred, shuffle=shuffle, max_samples=test_size)

                    is_train.set_value(0.)
                    sup.set_value(0.) # supervised mode off
                    mixer.set_value(0) # no supervision
                    max_hops.set_value(prm.max_hops_pred)
                    k_beam.set_value(prm.k)

                    with open(prm.outpath, 'a') as fout:
                        fout.write('\n\nComputing Error Training Set')
                    train_err, train_R, train_accp, visited_pages_train = pred_error(f_pred, q_train, a_train, options, kf_train)

                    with open(prm.outpath, 'a') as fout:
                        fout.write('\n\nComputing Error Validation Set')
                    valid_err, valid_R, valid_accp, visited_pages_valid = pred_error(f_pred, q_valid, a_valid, options, kf_valid)

                    with open(prm.outpath, 'a') as fout:
                        fout.write('\n\nComputing Error Test Set')
                    test_err, test_R, test_accp, visited_pages_test = pred_error(f_pred, q_test, a_test, options, kf_test)

                    if prm.visited_pages_path:
                        pkl.dump([visited_pages_train, visited_pages_valid, visited_pages_test], open(prm.visited_pages_path, 'wb'))

                    history_errs.append([valid_err[-1], test_err[-1]])

                    if (uidx == 0 or
                        valid_err[-1] <= np.array(history_errs)[:,0].min()):

                        best_p = unzip(tparams)
                        bad_counter = 0

                    with open(prm.outpath, "a") as fout:
                        fout.write('\n[{per hop}, Avg] Train err ' + str(train_err) + '  Valid err ' + str(valid_err) + '  Test err ' + str(test_err))
                        fout.write('\n[{per hop}, Avg] Train R ' + str(train_R) + '  Valid R ' + str(valid_R) + '  Test R ' + str(test_R))
                        fout.write('\nAccuracy Page Actions   Train ' + str(train_accp) + '  Valid ' + str(valid_accp) + '  Test ' + str(test_accp))

                    if (len(history_errs) > prm.patience and
                        valid_err[-1] >= np.array(history_errs)[:-prm.patience,
                                                               0].min()):
                        bad_counter += 1
                        if bad_counter > prm.patience:
                            print 'Early Stop!'
                            estop = True
                            break

            with open(prm.outpath, "a") as fout:
                fout.write('\nSeen %d samples' % n_samples)

            if estop:
                break

    except KeyboardInterrupt:
        print "Training interupted"

    end_time = time.time()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    is_train.set_value(0.)
    sup.set_value(0.) # supervised mode off
    mixer.set_value(0) # no supervision
    max_hops.set_value(prm.max_hops_pred)
    k_beam.set_value(prm.k)

    kf_train_sorted = get_minibatches_idx(len(q_train), prm.batch_size_train)

    train_err, train_R, train_accp, visited_pages_train = pred_error(f_pred, q_train, a_train, options, kf_train_sorted)
    valid_err, valid_R, valid_accp, visited_pages_valid = pred_error(f_pred, q_valid, a_valid, options, kf_valid)
    test_err, test_R, test_accp, visited_pages_test = pred_error(f_pred, q_test, a_test, options, kf_test)

    with open(prm.outpath, "a") as fout:
        fout.write('\n[{per hop}, Avg] Train err ' + str(train_err) + '  Valid err ' + str(valid_err) + '  Test err ' + str(test_err))
        fout.write('\n[{per hop}, Avg] Train R ' + str(train_R) + '  Valid R ' + str(valid_R) + '  Test R ' + str(test_R))
        fout.write('\nAccuracy Page Actions   Train ' + str(train_accp) + '  Valid ' + str(valid_accp) + '  Test ' + str(test_accp))

    if prm.saveto:
        np.savez(prm.saveto, train_err=train_err,
                    valid_err=valid_err, test_err=test_err,
                    history_errs=history_errs, **best_p)
    with open(prm.outpath, "a") as fout:
        fout.write('\nThe code ran for %d epochs, with %f sec/epoch' % ((eidx + 1), (end_time - start_time) / (1. * (eidx + 1))))
    with open(prm.outpath, "a") as fout:
        fout.write('\nTraining took %.1fs' % (end_time - start_time))
    return train_err, valid_err, test_err
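
A minimal way to invoke this example (hypothetical entry point; it assumes the surrounding run.py module-level imports and the settings defined in parameters.py):

if __name__ == '__main__':
    # Run training and keep the returned per-hop/average error lists.
    train_err, valid_err, test_err = train_lstm()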