Example #1
def main(args):
    # with open(args.output_dir / 'config.json') as f:
    #     config = json.load(f)

    # loading datasets from jsonl files
    # with open(config['train']) as f:
    #     train = [json.loads(line) for line in f]
    with open(args.valid_data_path) as f:
        valid = [json.loads(line) for line in f]
    # with open(config['test']) as f:
    #     test = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = ([sample['text'] for sample in valid])

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    with open('embedding2.pkl', 'rb') as f:
        embedding = pickle.load(f)

    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, valid),
                           'valid_seq2seq.pkl', tokenizer.pad_token_id)
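Most examples on this page rely on a project-specific Tokenizer exposing collect_words, set_vocab and pad_token_id, but that class is not shown anywhere in the listing. Purely as an illustration of the interface the scripts assume (a hypothetical stand-in, not the implementation used in Example #1), a minimal sketch might look like this:

class ToyTokenizer:
    """Hypothetical stand-in for the project-specific Tokenizer used in these
    examples; only the collect_words / set_vocab / pad_token_id interface that
    the scripts rely on is sketched here."""

    def __init__(self, lower=True, pad_token='<pad>', unk_token='<unk>'):
        self.lower = lower
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.set_vocab([pad_token, unk_token])

    def _split(self, text):
        return (text.lower() if self.lower else text).split()

    def collect_words(self, documents):
        # Unique words across all documents, e.g. to filter a pretrained embedding.
        words = set()
        for doc in documents:
            words.update(self._split(doc))
        return sorted(words)

    def set_vocab(self, vocab):
        # Adopt an external vocabulary (typically the rows of an Embedding object).
        self.vocab = list(vocab)
        self.word2idx = {w: i for i, w in enumerate(self.vocab)}

    @property
    def pad_token_id(self):
        return self.word2idx.get(self.pad_token, 0)

    def encode(self, text):
        unk = self.word2idx.get(self.unk_token, 0)
        return [self.word2idx.get(w, unk) for w in self._split(text)]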
Example #2
    def __init__(self,
                 topics_path,
                 min_depth=0,
                 max_depth=None,
                 metadata=True,
                 lemmatization=True,
                 use_stop=True,
                 pattern=None,
                 exclude_pattern=None,
                 **kwargs):
        super(TrecTopics, self).__init__(topics_path,
                                         dictionary={},
                                         metadata=metadata,
                                         min_depth=min_depth,
                                         max_depth=max_depth,
                                         pattern=pattern,
                                         exclude_pattern=exclude_pattern,
                                         lines_are_documents=True,
                                         **kwargs)

        self.topics = {}
        self.topics_vecs = None
        self.topic_row_maps = {}
        self.oov = {}

        self.tokenizer = Tokenizer(minimum_len=TOKEN_MIN_LEN,
                                   maximum_len=TOKEN_MAX_LEN,
                                   lowercase=True,
                                   output_lemma=lemmatization,
                                   use_stopwords=use_stop,
                                   extra_stopwords=EXTRA_STOPWORDS)
Example #3
def main(args):
    train_df = pd.read_pickle(args.train_data)
    valid_df = pd.read_pickle(args.valid_data)
    tokenizer = Tokenizer()
    tokenizer.fit_word(train_df.repl_words.tolist())

    train_sentences_idx = sentence_preprocessing(train_df, tokenizer)
    valid_sentences_idx = sentence_preprocessing(valid_df, tokenizer)

    bi_lm_model = BiLM(args.word_emb_size, args.lstm_unit_size,
                       len(tokenizer.vocab_word))

    if torch.cuda.device_count() > 1:
        print("Use", torch.cuda.device_count(), "GPUs.")
        bi_lm_model = torch.nn.DataParallel(bi_lm_model)
    elif torch.cuda.device_count() == 1:
        print("Use single GPU.")
    else:
        print("Use CPU.")
    bi_lm_model.to(device)

    bi_lm_model = train(bi_lm_model, train_sentences_idx, valid_sentences_idx,
                        args.epochs, args.batch_size, args.early_stopping)

    torch.save(bi_lm_model.state_dict(), args.output)
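The GPU/CPU dispatch in Example #3 (DataParallel when several GPUs are visible, otherwise a single GPU or the CPU) is a reusable pattern. A self-contained helper along the same lines could look like the sketch below; note that the `device` used by the original snippet is assumed to be defined elsewhere in its module.

import torch

def to_best_device(model):
    """Wrap the model in DataParallel when several GPUs are visible,
    otherwise fall back to a single GPU or the CPU."""
    n_gpus = torch.cuda.device_count()
    if n_gpus > 1:
        print(f"Using {n_gpus} GPUs.")
        model = torch.nn.DataParallel(model)
        device = torch.device('cuda')
    elif n_gpus == 1:
        print("Using a single GPU.")
        device = torch.device('cuda')
    else:
        print("Using CPU.")
        device = torch.device('cpu')
    return model.to(device), device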
Example #4
def inference_random():
    # Load the validation set and check the model's predictions
    model = ClassificationModel(len(cfg.char2idx))
    model = load_custom_model(model, cfg.save_model_path).to(cfg.device)

    tokenizer = Tokenizer(cfg.char2idx)
    error = 0
    with open(cfg.test_data_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        pairs = line.split('\t')
        label, text = pairs[0], pairs[1]
        input_index, _ = tokenizer.encode(text, max_length=cfg.max_seq_len)
        inputs = torch.tensor(input_index).unsqueeze(0)
        inputs_mask = (inputs > 0).to(torch.float32)
        with torch.no_grad():
            scores = model(inputs, inputs_mask)
            prediction = scores.argmax(-1).item()
        if prediction != int(label):
            print(scores[:, int(label)].item())
            print(label)
            print(text)
            print('-' * 50)
            error += 1
    print(error)
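The attention mask in the inference function above is built simply by comparing token ids against the pad id 0. In isolation, and with a made-up input, the trick is just:

import torch

input_ids = torch.tensor([[5, 9, 3, 0, 0]])        # 0 is assumed to be the pad id
attention_mask = (input_ids > 0).to(torch.float32)
print(attention_mask)                              # tensor([[1., 1., 1., 0., 0.]])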
Example #5
def main(args):

    # loading datasets from jsonl files
    with open(args.input_data_path) as f:
        valid = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = ([sample['text'] for sample in valid])

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    """
    embedding = Embedding("./glove.6B.300d.txt", words=words)
    with open('./embedding.pkl', 'wb') as f:
        pickle.dump(embedding, f)
    """
    with open('./embedding.pkl', 'rb') as file:
        embedding = pickle.load(file)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, valid), 'data.pkl',
                           tokenizer.pad_token_id)
Example #6
def preProcess():
    print('Preprocess Reuters Corpus')
    start_time = time.time()
    docs = 0
    bad = 0
    tokenizer = Tokenizer()

    if not os.path.isdir(Paths.base):
        os.makedirs(Paths.base)

    with open(Paths.text_index, 'w') as fileid_out:
        with codecs.open(Paths.texts_clean, 'w', 'utf-8-sig') as out:
            with codecs.open(Paths.reuter_test, 'w', 'utf-8-sig') as test:
                for f in reuters.fileids():
                    contents = reuters.open(f).read()
                    try:
                        tokens = tokenizer.tokenize(contents)
                        docs += 1
                        if docs % 1000 == 0:
                            print("Normalised %d documents" % docs)

                        out.write(' '.join(tokens) + "\n")
                        # if f.startswith("train"):
                        #
                        # else:
                        #     test.write(' '.join(tokens) + "\n")
                        fileid_out.write(f + "\n")

                    except UnicodeDecodeError:
                        bad += 1
    print("Normalised %d documents" % docs)
    print("Skipped %d bad documents" % bad)
    print('Finished building train file ' + Paths.texts_clean)
    end_time = time.time()
    print('(Time to preprocess Reuters Corpus: %s)' % (end_time - start_time))
Example #7
def get_dataloaders(args):
    model_prefix = '{}_{}'.format(args.model_type, args.train_id)

    log_path = args.LOG_DIR + model_prefix + '/'
    checkpoint_path = args.CHK_DIR + model_prefix + '/'
    result_path = args.RESULT_DIR + model_prefix + '/'
    cp_file = checkpoint_path + "best_model.pth.tar"
    init_epoch = 0

    if not os.path.exists(log_path):
        os.makedirs(log_path)
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)

    ## set up the logger
    set_logger(os.path.join(log_path, 'train.log'))

    ## save argparse parameters
    with open(log_path + 'args.yaml', 'w') as f:
        for k, v in args.__dict__.items():
            f.write('{}: {}\n'.format(k, v))

    logging.info('Training model: {}'.format(model_prefix))

    ## set up vocab txt
    # create txt here
    print('running setup')
    setup(args, clear=True)
    print(args.__dict__)

    # indicate src and tgt language
    if args.source_language == 'en':
        src, tgt = 'en', 'zh'
    else:
        src, tgt = 'zh', 'en'

    maps = {'en': args.TRAIN_VOCAB_EN, 'zh': args.TRAIN_VOCAB_ZH}
    vocab_src = read_vocab(maps[src])
    tok_src = Tokenizer(language=src,
                        vocab=vocab_src,
                        encoding_length=args.MAX_INPUT_LENGTH)
    vocab_tgt = read_vocab(maps[tgt])
    tok_tgt = Tokenizer(language=tgt,
                        vocab=vocab_tgt,
                        encoding_length=args.MAX_INPUT_LENGTH)
    logging.info('Vocab size src/tgt:{}/{}'.format(len(vocab_src),
                                                   len(vocab_tgt)))

    ## Setup the training, validation, and testing dataloaders
    train_loader, val_loader, test_loader = create_split_loaders(
        args.DATA_DIR, (tok_src, tok_tgt),
        args.batch_size,
        args.MAX_VID_LENGTH, (src, tgt),
        num_workers=4,
        pin_memory=True)
    logging.info('train/val/test size: {}/{}/{}'.format(
        len(train_loader), len(val_loader), len(test_loader)))

    return (train_loader, val_loader, test_loader, tok_src, tok_tgt,
            len(vocab_src), len(vocab_tgt))
Example #8
def main(args):
    
    with open(args.test_input) as f:
        test = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = (
        [sample['text'] for sample in test]
    )

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    with open(args.embedding_file, 'rb') as f:
        embedding = pickle.load(f)

    tokenizer.set_vocab(embedding.vocab)

  
    logging.info('Creating test dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, test),
        args.test_output,
        tokenizer.pad_token_id
    )
Example #9
def main(path):
    with open(path) as f:
        test = [json.loads(line) for line in f]

    with open("./datasets/seq_tag/embedding.pkl", "rb") as f:
        embedding = pickle.load(f)

    tokenizer = Tokenizer(embedding.vocab, lower=True)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq_tag_dataset(process_seq_tag_samples(tokenizer, test),
                           './datasets/seq_tag/test.pkl')
Example #10
def main(args):
    with open(args.output_dir / 'config.json') as f:
        config = json.load(f)

    # loading datasets from jsonl files
    with open(config['train']) as f:
        train = [json.loads(line) for line in f]
    with open(config['valid']) as f:
        valid = [json.loads(line) for line in f]
    with open(config['test']) as f:
        test = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = (
        [sample['text'] for sample in train]
        + [sample['summary'] for sample in train]
        + [sample['text'] for sample in valid]
        + [sample['text'] for sample in test]
    )

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=config['lower_case'])
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    embedding = Embedding(config['embedding'], words=words)
    with open(args.output_dir / 'embedding.pkl', 'wb') as f:
        pickle.dump(embedding, f)

    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating train dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, train),
        args.output_dir / 'train.pkl', config,
        tokenizer.pad_token_id
    )
    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, valid),
        args.output_dir / 'valid.pkl', config,
        tokenizer.pad_token_id
    )
    logging.info('Creating test dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, test),
        args.output_dir / 'test.pkl', config,
        tokenizer.pad_token_id
    )
Example #11
def main(args):
    # Read test file
    with open(args.input_dataname) as f:
        test = [json.loads(line) for line in f]
    # Read embedding
    with open(str(args.output_dir) + '/embedding_tag.pkl', 'rb') as f:
        embedding = pickle.load(f)

    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq_tag_dataset(process_seq_tag_samples(tokenizer, test),
                           args.output_dir / 'test_tag.pkl',
                           tokenizer.pad_token_id)
Example #12
def tokenize_sentence(train, test, w2v_model, max_len=6):
    data = pd.concat([train["sentence"],  test["sentence"]]).values
    tok = Tokenizer(max_features=15000, max_len=max_len)
    tokens = tok.fit_transform(data)
    #n = len(train)
    #train_tokens = tokens[:n]
    #test_tokens = tokens[n:]
    vocab_len = tok.vocabulary_size()
    idx_to_word = {v:k for k, v in tok.vocab_idx.items()}
    embedding_matrix = np.zeros((vocab_len+1, W2V_CONFIG["vector_size"]))
    for i in range(vocab_len):
        if i == 0:
            continue
        embedding_matrix[i] = w2v_model[idx_to_word[i]]
    return tok, embedding_matrix
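Example #12 indexes the word2vec model directly, which raises a KeyError when a tokenizer word is missing from the pretrained vectors, and its loop ends at vocab_len - 1 even though the matrix has vocab_len + 1 rows, possibly leaving the last row unused. A hedged variant that guards against out-of-vocabulary words (the function name and the zero-vector fallback are my assumptions, and the membership test assumes a gensim KeyedVectors-style model):

import numpy as np

def build_embedding_matrix(idx_to_word, w2v_model, vector_size, vocab_len):
    # Rows for words missing from the word2vec model stay zero instead of raising.
    matrix = np.zeros((vocab_len + 1, vector_size))
    for i in range(1, vocab_len + 1):
        word = idx_to_word.get(i)
        if word is not None and word in w2v_model:
            matrix[i] = w2v_model[word]
    return matrix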
Example #13
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        batch_size=BATCH_SIZE):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)

    enc_hidden_size = hidden_size//2 if args.bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    encoder = try_cuda(EncoderLSTM(
        len(vocab), word_embedding_size, enc_hidden_size, vocab_pad_idx,
        dropout_ratio, bidirectional=args.bidirectional, glove=glove))
    decoder = try_cuda(AttnDecoderLSTM(
        action_embedding_size, hidden_size, dropout_ratio,
        feature_size=feature_size))
    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok),
                eval.Evaluation([split]))
        for split in test_splits}

    return train_env, test_envs, encoder, decoder
Example #14
def test_submission(path_type, max_episode_len, history, MAX_INPUT_LENGTH, feedback_method, n_iters, model_prefix, blind):
    ''' Train on combined training and validation sets, and generate test submission. '''
  
    setup()

    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(TRAINVAL_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=MAX_INPUT_LENGTH)
    train_env = R2RBatch(features, batch_size=batch_size, splits=['train', 'val_seen', 'val_unseen'], tokenizer=tok,
                         path_type=path_type, history=history, blind=blind)
    
    # Build models and train
    enc_hidden_size = hidden_size//2 if bidirectional else hidden_size
    encoder = EncoderLSTM(len(vocab), word_embedding_size, enc_hidden_size, padding_idx, 
                  dropout_ratio, bidirectional=bidirectional).cuda()
    decoder = AttnDecoderLSTM(action_embedding_size, hidden_size, dropout_ratio).cuda()

    train(train_env, encoder, decoder, n_iters, path_type, history, feedback_method, max_episode_len, MAX_INPUT_LENGTH, model_prefix)

    # Generate test submission
    test_env = R2RBatch(features, batch_size=batch_size, splits=['test'], tokenizer=tok,
                        path_type=path_type, history=history, blind=blind)
    agent = Seq2SeqAgent(test_env, "", encoder, decoder, max_episode_len)
    agent.results_path = '%s%s_%s_iter_%d.json' % (RESULT_DIR, model_prefix, 'test', 5000)
    agent.test(use_dropout=False, feedback='argmax')
    agent.write_results()
Example #15
def train_val():
    ''' Train on the training set, and validate on seen and unseen splits. '''

    setup()
    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(TRAIN_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=MAX_INPUT_LENGTH)
    train_env = R2RBatch(features,
                         batch_size=batch_size,
                         splits=['train'],
                         tokenizer=tok)

    # Create validation environments
    val_envs = {
        split: (R2RBatch(features,
                         batch_size=batch_size,
                         splits=[split],
                         tokenizer=tok), Evaluation([split]))
        for split in ['val_seen', 'val_unseen']
    }

    # Build models and train
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    encoder = EncoderLSTM(len(vocab),
                          word_embedding_size,
                          enc_hidden_size,
                          padding_idx,
                          dropout_ratio,
                          bidirectional=bidirectional).cuda()
    decoder = AttnDecoderLSTM(Seq2SeqAgent.n_inputs(),
                              Seq2SeqAgent.n_outputs(), action_embedding_size,
                              hidden_size, dropout_ratio).cuda()
    train(train_env, encoder, decoder, n_iters, val_envs=val_envs)
Example #16
def exact_adaptor(parser):
    """
    exact matching
    """
    tokenizer = Tokenizer()

    def method(_, queries):
        nearests = list()
        for query in tqdm(queries):
            protocol = query[0]
            toked_protocol = tokenizer.tokenize(protocol[0])
            catprotocol = ' '.join(toked_protocol)
            toked_candidates = [q[0] for q in query[1:]]
            if not len(toked_protocol):
                nearests.append(None)
                continue

            max_matched = 0
            max_idx = None
            for idx, toked_can in enumerate(toked_candidates):
                joined_can = ' '.join(toked_can)
                if joined_can in catprotocol and len(toked_can) > max_matched:
                    max_idx = idx
                    max_matched = len(toked_can)
            nearests.append(max_idx)

        return nearests

    return method
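The core of Example #16 is a longest-exact-substring match over whitespace-joined tokens. Stripped of the project-specific Tokenizer and tqdm, that matching step alone reduces to the following (the function name is mine):

def longest_exact_match(protocol_tokens, candidate_token_lists):
    # Index of the candidate whose joined tokens occur verbatim in the joined
    # protocol and that has the most tokens, or None if nothing matches.
    cat_protocol = ' '.join(protocol_tokens)
    best_idx, best_len = None, 0
    for idx, cand in enumerate(candidate_token_lists):
        if ' '.join(cand) in cat_protocol and len(cand) > best_len:
            best_idx, best_len = idx, len(cand)
    return best_idx

# longest_exact_match(['a', 'b', 'c'], [['b'], ['b', 'c']]) returns 1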
Example #17
def create_train_data(data_dir, config):
    from utils import Tokenizer, get_logger
    logger = get_logger('log', './log/log.txt')
    t = Tokenizer(logger)
    model = Data.pre_process_data(data_dir, t, config, logger)
    model.create_tf_record_file(model.sample_file)
    return model
Example #18
def finetune():
    setup()
    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(TRAIN_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=args.maxInput)

    if args.fast_train:
        feat_dict = read_img_features(features_fast)
    else:
        feat_dict = read_img_features(features)

    candidate_dict = utils.read_candidates(CANDIDATE_FEATURES)
    featurized_scans = set(
        [key.split("_")[0] for key in list(feat_dict.keys())])

    train_env = R2RBatch(feat_dict,
                         candidate_dict,
                         batch_size=args.batchSize,
                         splits=['train'],
                         tokenizer=tok)
    print("The finetune data_size is : %d\n" % train_env.size())
    val_envs = {
        split:
        (R2RBatch(feat_dict,
                  candidate_dict,
                  batch_size=args.batchSize,
                  splits=[split],
                  tokenizer=tok), Evaluation([split], featurized_scans, tok))
        for split in ['train', 'val_seen', 'val_unseen']
    }

    train(train_env, tok, args.iters, val_envs=val_envs)
Example #19
def make_env_and_models(args, train_vocab_path, train_splits, test_splits,
                        test_instruction_limit=None):
    setup()
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=batch_size,
                         splits=train_splits, tokenizer=tok)

    enc_hidden_size = hidden_size//2 if bidirectional else hidden_size
    glove = np.load(glove_path)
    feature_size = FEATURE_SIZE
    encoder = try_cuda(SpeakerEncoderLSTM(
        action_embedding_size, feature_size, enc_hidden_size, dropout_ratio,
        bidirectional=bidirectional))
    decoder = try_cuda(SpeakerDecoderLSTM(
        len(vocab), word_embedding_size, hidden_size, dropout_ratio,
        glove=glove))

    test_envs = {
        split: (R2RBatch(image_features_list, batch_size=batch_size,
                         splits=[split], tokenizer=tok,
                         instruction_limit=test_instruction_limit),
                eval_speaker.SpeakerEvaluation(
                    [split], instructions_per_path=test_instruction_limit))
        for split in test_splits}

    return train_env, test_envs, encoder, decoder
Example #20
def train():
    # Load the data
    char2idx, keep_tokens = load_chinese_base_vocab(cfg.vocab_path)
    tokenizer = Tokenizer(char2idx)
    # train_data = glob(cfg.train_data_path + '*')[16 * 1000 * 35:16 * 1000 * 40]
    train_data = glob(cfg.train_data_path + '*')[8 * 5000 * 5:8 * 5000 * 10]
    train_dataset = CustomDataset(train_data, tokenizer, cfg.max_seq_len)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=cfg.batch_size,
                                  collate_fn=padding,
                                  shuffle=True,
                                  num_workers=4,
                                  pin_memory=True)

    # # debug
    # train_data = glob(cfg.test_data_path + '*')[:8 * 5000 * 5]
    # train_dataset = CustomDataset(train_data, tokenizer, cfg.max_seq_len)
    # train_dataloader = DataLoader(train_dataset, batch_size=cfg.batch_size, collate_fn=padding)
    # # debug
    # Load the model
    model = CustomUnilmModel(len(char2idx))
    # model = load_pretrained_bert(model, cfg.pretrained_model_path, keep_tokens=keep_tokens).to(cfg.device)
    model = load_custom_model(model, cfg.save_model_path).to(cfg.device)

    loss_function = nn.CrossEntropyLoss(ignore_index=0).to(cfg.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learn_rate)
    # Training loop
    iteration, train_loss = 0, 0
    model.train()
    for inputs, token_type, targets in tqdm(train_dataloader,
                                            position=0,
                                            leave=True):
        attention_mask = unilm_mask(inputs, token_type).to(cfg.device)
        inputs, token_type, targets = inputs.to(cfg.device), token_type.to(
            cfg.device), targets.to(cfg.device)
        prediction = model(inputs, token_type, attention_mask)
        loss = loss_function(
            prediction[:, :-1, :].reshape(-1, prediction.shape[-1]),
            targets.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        iteration += 1

        if iteration % cfg.print_loss_steps == 0:
            eval_loss = evaluate(model, tokenizer, loss_function)
            print('')
            print('train_loss:{}'.format(train_loss / cfg.print_loss_steps))
            print('evalu_loss:{}'.format(eval_loss))
            test_string(s1, tokenizer, model)
            test_string(s2, tokenizer, model)
            model.train()
            train_loss = 0

        if iteration % cfg.save_model_steps == 0:
            torch.save(model.state_dict(), cfg.save_model_path)
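The loss in Example #20 drops the final time step of the predictions before flattening, which lines each position up with the next token. A tiny self-contained illustration of that alignment, with made-up shapes rather than the ones produced by CustomDataset:

import torch
import torch.nn as nn

batch, seq_len, vocab = 2, 5, 11
logits = torch.randn(batch, seq_len, vocab)               # predictions for positions 0..seq_len-1
targets = torch.randint(1, vocab, (batch, seq_len - 1))   # next-token labels, one step shorter

loss_fn = nn.CrossEntropyLoss(ignore_index=0)             # index 0 treated as padding, as above
loss = loss_fn(logits[:, :-1, :].reshape(-1, vocab), targets.reshape(-1))
print(loss.item())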
Example #21
def eval(args):
    batch_size = 32
    train_on_gpu = torch.cuda.is_available()

    enc = RNNEncoder(300, args.embedding_file)
    dec = RNNDecoder(300, args.embedding_file)

    device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

    model = Seq2Seq(enc, dec, device).to(device)
    ckpt = torch.load(args.model_path)
    model.load_state_dict(ckpt['state_dict'])

    model.eval()

    with open(args.embedding_file, 'rb') as f:
        embedding_matrix = pickle.load(f)
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding_matrix.vocab)
    with open(args.test_data_path, 'rb') as f:
        eval_data = pickle.load(f)
    eval_loader = DataLoader(eval_data,
                             batch_size=batch_size,
                             num_workers=0,
                             shuffle=False,
                             collate_fn=eval_data.collate_fn)

    output_file = open(args.output_path, 'w')
    val_losses = []
    prediction = {}
    for batch in tqdm(eval_loader):
        pred = model(batch, 0)
        pred = torch.argmax(pred, dim=2)
        # batch, seq_len

        for i in range(len(pred)):
            prediction[batch['id'][i]] = tokenizer.decode(
                pred[i]).split('</s>')[0].split(' ', 1)[1]
    pred_output = [
        json.dumps({
            'id': key,
            'predict': value
        })
        for key, value in sorted(prediction.items(), key=lambda item: item[0])
    ]
    output_file.write('\n'.join(pred_output))
    output_file.write('\n')
    output_file.close()
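The post-processing in Example #21 keeps only the text before the first '</s>' and then drops the leading token (typically a start-of-sequence marker). On a plain string the same surgery looks like this:

decoded = '<s> the quick brown fox </s> pad pad'
summary = decoded.split('</s>')[0].split(' ', 1)[1]
print(summary)   # 'the quick brown fox '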
Example #22
    def handle(self, *args, **options):
        tok_tag = self._parse_arg('tokenizer', 'jieba', options)
        sample = self._parse_arg('sample', 1, options)
        w2v = self._parse_arg('w2v', 0, options) > 0
        jtag = self._parse_arg('jtag', 0, options) > 0
        print(tok_tag, sample, w2v, jtag)

        print('Loading doc2vec model...')
        d2v_model = Doc2Vec.load(D2V_PATH)
        print('Model loaded.')

        w2v_model = None
        if w2v:
            print('Loading word2vec model...')
            w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
                W2V_PATH, binary=True, unicode_errors='ignore')
            print('Model loaded.')
        jiebatag_weight = {}
        if jtag:
            jtagweight = JiebaTagWeight.objects.all()
            for jt in jtagweight:
                jiebatag_weight[jt.name] = {
                    'weight': jt.weight,
                    'punish': jt.punish_factor
                }

        evaluator = Evaluator()

        for _ in range(sample):
            evaluator.draw()
            raw_push = evaluator.get_predict_push(
                tokenizer=tok_tag,
                w2v_model=w2v_model,
                jiebatag_weight=jiebatag_weight)

            topic = evaluator.get_topic_field('tokenized')
            topic_words = Tokenizer(tok_tag).cut(topic, pos=False)
            predict_words_ls = [
                Tokenizer(tok_tag).cut(push, pos=False) for push in raw_push
                if 'http' not in push
            ]
            print(topic_words)
            print(len(predict_words_ls))

            score = doc2vec_ndcg(topic_words, predict_words_ls, d2v_model)
            print(score)
Example #23
def make_more_train_env(args, train_vocab_path, train_splits):
    setup(args.seed)
    image_features_list = ImageFeatures.from_args(args)
    vocab = read_vocab(train_vocab_path)
    tok = Tokenizer(vocab=vocab)
    train_env = R2RBatch(image_features_list, batch_size=args.batch_size,
                         splits=train_splits, tokenizer=tok)
    return train_env
Example #24
def train_all(eval_type, seed, max_episode_len, max_input_length, feedback,
              n_iters, prefix, blind, debug, train_vocab, trainval_vocab,
              batch_size, action_embedding_size, target_embedding_size,
              bidirectional, dropout_ratio, weight_decay, feature_size,
              hidden_size, word_embedding_size, lr, result_dir, snapshot_dir,
              plot_dir, train_splits, test_splits):
    ''' Train on the training set, and validate on the test split. '''

    setup(seed, train_vocab, trainval_vocab)
    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(train_vocab if eval_type == 'val' else trainval_vocab)
    tok = Tokenizer(vocab=vocab, encoding_length=max_input_length)
    train_env = R2RBatch(batch_size=batch_size,
                         splits=train_splits,
                         tokenizer=tok,
                         seed=seed,
                         blind=blind)

    # Create validation environments
    val_envs = {
        split: (R2RBatch(batch_size=batch_size,
                         splits=[split],
                         tokenizer=tok,
                         seed=seed,
                         blind=blind), Evaluation([split], seed=seed))
        for split in test_splits
    }

    # Build models and train
    enc_hidden_size = hidden_size // 2 if bidirectional else hidden_size
    encoder = EncoderLSTM(len(vocab),
                          word_embedding_size,
                          enc_hidden_size,
                          padding_idx,
                          dropout_ratio,
                          bidirectional=bidirectional).cuda()
    decoder = AttnDecoderLSTM(Seq2SeqAgent.n_inputs(),
                              Seq2SeqAgent.n_outputs(), action_embedding_size,
                              hidden_size, dropout_ratio, feature_size).cuda()

    train(eval_type,
          train_env,
          encoder,
          decoder,
          n_iters,
          seed,
          feedback,
          max_episode_len,
          max_input_length,
          prefix,
          blind,
          lr,
          weight_decay,
          result_dir,
          snapshot_dir,
          plot_dir,
          val_envs=val_envs,
          debug=debug)
Example #25
def train_val_augment():
    """
    Train the listener with the augmented data
    """
    setup()

    # Create a batch training environment that will also preprocess text
    vocab = read_vocab(TRAIN_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=args.maxInput)

    # Load the env img features
    feat_dict = read_img_features(features)
    featurized_scans = set(
        [key.split("_")[0] for key in list(feat_dict.keys())])

    # Load the augmentation data
    aug_path = args.aug

    # Create the training environment
    aug_env = R2RBatch(feat_dict,
                       batch_size=args.batchSize,
                       splits=[aug_path],
                       tokenizer=tok,
                       name='aug')

    # import sys
    # sys.exit()
    train_env = R2RBatch(feat_dict,
                         batch_size=args.batchSize,
                         splits=['train'],
                         tokenizer=tok)

    # Printing out the statistics of the dataset
    stats = train_env.get_statistics()
    print("The training data_size is : %d" % train_env.size())
    print("The average instruction length of the dataset is %0.4f." %
          (stats['length']))
    print("The average action length of the dataset is %0.4f." %
          (stats['path']))
    stats = aug_env.get_statistics()
    print("The augmentation data size is %d" % aug_env.size())
    print("The average instruction length of the dataset is %0.4f." %
          (stats['length']))
    print("The average action length of the dataset is %0.4f." %
          (stats['path']))

    # Setup the validation data
    val_envs = {
        split:
        (R2RBatch(feat_dict,
                  batch_size=args.batchSize,
                  splits=[split],
                  tokenizer=tok), Evaluation([split], featurized_scans, tok))
        for split in ['train', 'val_seen', 'val_unseen']
    }

    # Start training
    train(train_env, tok, args.iters, val_envs=val_envs, aug_env=aug_env)
Example #26
def main(args):

    with open(args.output_dir / 'config.json', 'r') as f:
        config = json.load(f)

    with open(args.input_data) as f:
        test = [json.loads(line) for line in f]

    with open(os.path.join(args.output_dir, "embedding.pkl"), 'rb') as f:
        embedding = pickle.load(f)

    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, test),
                           args.output_dir / 'test_seq.pkl', config,
                           tokenizer.pad_token_id)
Example #27
    def __init__(self, data_base_dir, label_path, max_aspect_ratio,
                 max_encoder_l_h, max_encoder_l_w, max_decoder_l,
                 max_vocab_size, initial_id2voc, initial_voc2id):

        # folder with processed images
        self.data_base_dir = data_base_dir
        # .lst file with formulas
        self.label_path = label_path
        self.max_width = 10000
        self.max_aspect_ratio = max_aspect_ratio
        self.max_encoder_l_h = max_encoder_l_h
        self.max_encoder_l_w = max_encoder_l_w
        self.max_decoder_l = max_decoder_l
        self.min_aspect_ratio = 0.5
        self.vocab_size = max_vocab_size
        self.tokenizer = Tokenizer(initial_id2voc, initial_voc2id)
        # buffer to save groups of batches with same width and height
        self.buffer = defaultdict(lambda: defaultdict(list))
Example #28
def train_val(test_only=False):
    ''' Train on the training set, and validate on seen and unseen splits. '''
    setup()
    vocab = read_vocab(TRAIN_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=args.maxInput)

    feat_dict = read_img_features(features, test_only=test_only)

    if test_only:
        featurized_scans = None
        val_env_names = ['val_train_seen']
    else:
        featurized_scans = set(
            [key.split("_")[0] for key in list(feat_dict.keys())])
        val_env_names = ['val_train_seen', 'val_seen', 'val_unseen']

    if not args.test_obj:
        print('Loading compact pano-caffe object features ... (~3 seconds)')
        import pickle as pkl
        with open('img_features/objects/pano_object_class.pkl', 'rb') as f_pc:
            pano_caffe = pkl.load(f_pc)
    else:
        pano_caffe = None

    train_env = R2RBatch(feat_dict,
                         pano_caffe,
                         batch_size=args.batchSize,
                         splits=['train'],
                         tokenizer=tok)
    from collections import OrderedDict

    if args.submit:
        val_env_names.append('test')

    val_envs = OrderedDict(((split, (R2RBatch(feat_dict,
                                              pano_caffe,
                                              batch_size=args.batchSize,
                                              splits=[split],
                                              tokenizer=tok),
                                     Evaluation([split], featurized_scans,
                                                tok)))
                            for split in val_env_names))

    if args.train == 'listener':
        train(train_env, tok, args.iters, val_envs=val_envs)
    elif args.train == 'validlistener':
        if args.beam:
            beam_valid(train_env, tok, val_envs=val_envs)
        else:
            valid(train_env, tok, val_envs=val_envs)
    elif args.train == 'speaker':
        train_speaker(train_env, tok, args.iters, val_envs=val_envs)
    elif args.train == 'validspeaker':
        valid_speaker(tok, val_envs)
    else:
        assert False
Example #29
    def __init__(self,
                 f_abs,
                 n_best=1,
                 min_length=1,
                 max_length=50,
                 beam_size=4,
                 bert_model='bert-base-uncased'):
        self.n_best = n_best
        self.min_length = min_length
        self.max_length = max_length
        self.beam_size = beam_size
        self.abs_model = self.load_abs_model(f_abs)
        self.eval()
        logger.info(f'Loading BERT Tokenizer [{bert_model}]...')
        self.tokenizerB = BertTokenizer.from_pretrained(bert_model)
        self.spt_ids_B, self.spt_ids_C, self.eos_mapping = get_special_tokens()
        logger.info('Loading custom Tokenizer for using WBMET embeddings')
        self.tokenizerC = Tokenizer(self.abs_model.args.vocab_size)
        self.tokenizerC.from_pretrained(self.abs_model.args.file_dec_emb)
Example #30
    def __init__(self, input, dictionary=None, merge_title=True, spacy_tokenizer=True,
                 lines_are_documents=True, min_depth=0, max_depth=None, **kwargs):
        """

        Parameters
        ----------
        input : str
            Path to input file/folder.
        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            If a dictionary is provided, it will not be updated with the given corpus on initialization.
            If None - new dictionary will be built for the given corpus.
            If `input` is None, the dictionary will remain uninitialized.
        metadata : bool, optional
            If True - yield metadata with each document.
        merge_title : bool, optional
            If True - merge the document's title into the body text, if a title exists.
        min_depth : int, optional
            Minimum depth in directory tree at which to begin searching for files.
        max_depth : int, optional
            Max depth in directory tree at which files will no longer be considered.
            If None - not limited.
        pattern : str, optional
            Regex to use for file name inclusion, all those files *not* matching this pattern will be ignored.
        exclude_pattern : str, optional
            Regex to use for file name exclusion, all files matching this pattern will be ignored.
        lines_are_documents : bool, optional
            If True - each line is considered a document, otherwise - each file is one document.
        kwargs: keyword arguments passed through to the `TextCorpus` constructor.
            See :meth:`gensim.corpora.textcorpus.TextCorpus.__init__` docstring for more details on these.

        """
        super(TrecCorpus, self).__init__(input=input,
                                         dictionary=dictionary,
                                         metadata=True,
                                         lines_are_documents=lines_are_documents,
                                         min_depth=min_depth, max_depth=max_depth, **kwargs)
        self.merge_title = merge_title
        self.line_feeder = None
        self.spacy_tokenizer = spacy_tokenizer

        # if self.spacy_tokenizer:
        self.tokenizer = Tokenizer(minimum_len=TOKEN_MIN_LEN, maximum_len=TOKEN_MAX_LEN, lowercase=True,
                                   output_lemma=True, use_stopwords=True)
Example #31
def main(argv):
    with open(CONFIG, 'r') as f:
        config = json.load(f)

    # loading datasets from jsonl files
    testName = argv[1]
    with open(testName, 'r') as f:
        test = [json.loads(line) for line in f]

    tokenizer = Tokenizer(lower=config['lower_case'])

    logging.info('Loading embedding...')
    with open(ENBEDDINT_NAME, 'rb') as f:
        embedding = pickle.load(f)

    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, test), 'testSeq2Seq.pkl',
                           config, tokenizer.pad_token_id)