def get_wholeword_label_str(input_ids, config=None, tokenizer=None):
    """
    Get the whole-word label string from input_ids.

    Args:
        input_ids: Tensor(batch_size, seq_length), indices of the input text
        config: GPT2Config, config of the GPT2 model; if not provided, a MockConfig is created from the shape of input_ids, optional
        tokenizer: GPT2Tokenizer; if not provided, it is created with the default settings in utils.tokenization, optional

    Returns:
        label_str: [str], the last word of each sample, used as the LAMBADA label
    """
    if tokenizer is None:
        tokenizer = Tokenizer()
    if config is None:
        config = MockConfig()
        config.batch_size = input_ids.shape[0]
        config.seq_length = input_ids.shape[1]
        config.vocab_size = tokenizer.vocab_size

    # lastword_range is a list of tuples like [..., (start_position_i, end_position_i), ...]
    lastword_range = get_lastword_range(input_ids, config, tokenizer=tokenizer)

    # drop the leading token of every sequence (each one starts with <BOS>), so the
    # positions in lastword_range line up with the remaining ids
    ids = input_ids[:, 1:].asnumpy()

    label_ids = [id_[index[0]:index[1]].tolist() for index, id_ in zip(lastword_range, ids)]
    
    # use GPT2Tokenizer to decode
    label_str = [ tokenizer.decode(label_id) for label_id in label_ids ]

    return label_str
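A minimal usage sketch (not from the original source): it assumes the project's Tokenizer exposes encode() and a bos_token_id attribute, and that MindSpore is the tensor backend implied by asnumpy() above; the sentence is hypothetical.

import numpy as np
from mindspore import Tensor

tokenizer = Tokenizer()                                    # default GPT2-style tokenizer from utils.tokenization
text = "her name , of course , was mary"                   # hypothetical LAMBADA-style sample
ids = [tokenizer.bos_token_id] + tokenizer.encode(text)    # bos_token_id is an assumed attribute; prepend <BOS> as expected
input_ids = Tensor(np.array([ids], dtype=np.int32))        # shape (1, seq_length)

label_str = get_wholeword_label_str(input_ids, tokenizer=tokenizer)
print(label_str)                                           # expected: a one-element list holding the last word, e.g. [' mary']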
Example #2
def main(args):
    print(args)

    # Load tokenizer
    if args.tokenizer == 'sentencepiece':
        tokenizer = PretrainedTokenizer(pretrained_model=args.pretrained_model,
                                        vocab_file=args.vocab_file)
    else:
        tokenizer = TOKENIZER_CLASSES[args.tokenizer]()
        tokenizer = Tokenizer(tokenizer=tokenizer, vocab_file=args.vocab_file)

    # Build DataLoader
    train_dataset = create_examples(args, tokenizer, mode='train')
    test_dataset = create_examples(args, tokenizer, mode='test')
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             shuffle=True)

    # Build Trainer
    trainer = Trainer(args, train_loader, test_loader, tokenizer)

    # Train & Validate
    for epoch in range(1, args.epochs + 1):
        trainer.train(epoch)
        trainer.validate(epoch)
        trainer.save(epoch, args.output_model_prefix)
def get_lastword_range(input_ids, config=None, tokenizer=None):
    """
    Get the range of the tokenized last word in input_ids.

    Args:
        input_ids: Tensor(batch_size, seq_length)
        config: GPT2Config, config of the GPT2 model; if not provided, a MockConfig is created from the shape of input_ids, optional
        tokenizer: GPT2Tokenizer; if not provided, it is created with the default settings in utils.tokenization, optional

    Returns:
        lastword_range: list(tuple), start and end position of the last word of each text in the string list, used to select
        the tokenized last-word indices from the logits: lastword_logits --> logits[batch_index, start:end, ::]
    """
    if tokenizer is None:
        tokenizer = Tokenizer()
    if config is None:
        config = MockConfig()
        config.batch_size = input_ids.shape[0]
        config.seq_length = input_ids.shape[1]

    string_list = extract_string_from_tensor(input_ids, mode='single', tokenizer=tokenizer, config=config)
    # prefix, _ = split_by_last_word(string_list)
    prefix = split_by_last_word(string_list)

    lastword_range = _get_lastword_range(prefix, string_list, tokenizer)

    return lastword_range
Example #4
def main(args):
    print(args)

    # Load tokenizer
    if args.tokenizer == 'sentencepiece':
        tokenizer = PretrainedTokenizer(pretrained_model=args.pretrained_model, vocab_file=args.vocab_file)
    else:
        tokenizer = TOKENIZER_CLASSES[args.tokenizer]()
        tokenizer = Tokenizer(tokenizer=tokenizer, vocab_file=args.vocab_file)

    # Load model
    device = 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu'
    model = torch.load(args.model).to(device)
    model.eval()

    # Make input
    text = 'I have to admit, I got so emotional all throughout the movie. ' \
           'And some parts brought me to tears. The cast was phenomenal and I think every superhero got to have their spotlight.'
    tokens = tokenizer.tokenize(text)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    padding_length = args.max_seq_len - len(input_ids)
    input_ids = input_ids + ([tokenizer.pad_token_id] * padding_length)
    input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)

    print('--------------------------------------------------------')
    print('tokens: {}'.format(tokens))
    print('input_ids: {}'.format(input_ids))
    print('|input_ids|: {}'.format(input_ids.size(1)))
    print('--------------------------------------------------------')

    # Inference
    output, attention_weights = model(input_ids)
    print('class: {}'.format(output.argmax(dim=1)))
def _get_lastword_range(prefix, stringlist, tokenizer=None):
    """
    Get the range of the tokenized last word in label_ids.

    Args:
        prefix: list(str), list of texts with the last word removed (the "prefix" of each text)
        stringlist: list(str), list of full texts, same as in split_by_last_word
        tokenizer: GPT2Tokenizer; if not provided, it is created with the default settings in utils.tokenization, optional

    Returns:
        lastword_range: list(tuple), start and end position of the last word of each text in stringlist, used to select
        the tokenized last-word indices from the logits: lastword_logits --> logits[batch_index, start:end, ::]
    """
    if tokenizer is None:
        tokenizer = Tokenizer()
        print('[WARNING] parameter: tokenizer is missing in utils.lambada_utils._get_lastword_range, using Tokenizer() as default tokenizer')
    
    prefix_ids_len = [len(tokenizer.encode(prefix_str)) for prefix_str in prefix]
    full_ids_len = [len(tokenizer.encode(full_str)) for full_str in stringlist]

    # initial estimate: (length of the tokenized prefix, length of the tokenized full text)
    lastword_range_ = [(prefix_length, full_length) for prefix_length, full_length in zip(prefix_ids_len, full_ids_len)]
    lastword_range = []
    for i in range(len(lastword_range_)):
        # The prefix length only approximates where the last word starts, because BPE can merge
        # tokens differently at the prefix / last-word boundary. Scan the full encoding backwards
        # for the last prefix token and start the range right after it.
        full_ids = tokenizer.encode(stringlist[i])
        last_prefix_id = tokenizer.encode(prefix[i])[-1]
        range_left = prefix_ids_len[i]
        for j in range(len(full_ids) - 2, 0, -1):
            if full_ids[j] == last_prefix_id:
                range_left = j + 1
                break

        lastword_range.append((range_left, lastword_range_[i][1]))
    
    return lastword_range
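A hedged worked example (not from the original source) showing how _get_lastword_range maps a prefix/full-text pair to the token span of the last word; the sentence is hypothetical and the default Tokenizer from utils.tokenization is assumed.

tokenizer = Tokenizer()
full = ["she finally whispered his name"]            # hypothetical full text
prefix = ["she finally whispered his"]               # same text with the last word removed

start, end = _get_lastword_range(prefix, full, tokenizer)[0]
full_ids = tokenizer.encode(full[0])
print(tokenizer.decode(full_ids[start:end]))         # expected to print the tokenized last word, e.g. " name"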
Example #6
def main(config):
    print(config)

    list_of_tokens = []
    if config.is_tokenized:
        # read tokens
        with open(config.corpus, 'r', encoding='utf8') as reader:
            for li, line in enumerate(reader):
                list_of_tokens += line.strip().split()
    else:
        # select tokenizer
        if config.tokenizer == 'mecab':
            from konlpy.tag import Mecab
            tokenizer = Tokenizer(tokenization_fn=Mecab().morphs)

        # tokenization & read tokens
        with open(config.corpus, 'r', encoding='utf8') as reader:
            for li, line in enumerate(reader):
                list_of_tokens += tokenizer.tokenize(line.strip())

    # build vocabulary
    vocab = Vocab(list_of_tokens=list_of_tokens,
                  unk_token=config.unk_token,
                  pad_token=config.pad_token,
                  bos_token=config.bos_token,
                  eos_token=config.eos_token,
                  min_freq=config.min_freq,
                  lower=config.lower)
    vocab.build()
    print('Vocabulary size: ', len(vocab))

    # save vocabulary
    with open(config.vocab, 'wb') as writer:
        pickle.dump(vocab, writer)
    print('Vocabulary saved to', config.vocab)
Example #7
    def lemmatize(self, stop_words=None):
        tokenizer = Tokenizer(stop_words=stop_words)
        lemmatizer = Lemmatizer(stop_words=stop_words)

        self.lemmatized_queries = dict()
        for q_id in self.queries.dict.keys():
            q = self.queries.get(q_id)

            tok_q = tokenizer.fit_transform(q)
            lem_q = lemmatizer.fit_transform(tok_q)
            self.lemmatized_queries[int(q_id)] = lem_q
def load_queries(queries_filename):
    file = open(queries_filename, 'r')
    queries = {}

    vocab = Vocab()
    tokenizer = Tokenizer()
    lemmatizer = Lemmatizer()

    for l in file.readlines():
        l = l.replace('\n', '')
        l_arr = l.split('\t')
        q = Query()

        q.id = int(l_arr[0])
        q_text = l_arr[1]
        q_syn_text = ''
        if len(l_arr) > 2:
            q_syn_text = l_arr[2]

        q.text = q_text + ' ' + q_syn_text

        q.tokens = lemmatizer.fit_transform(tokenizer.fit_transform(q_text))
        q.synonim_tokens = lemmatizer.fit_transform(tokenizer.fit_transform(q_syn_text))
        queries[q.id] = q

    file.close()

    # create vocab
    for q_id in queries.keys():
        q = queries[q_id]

        tokens = q.tokens + q.synonim_tokens

        vocab.add_phrase(tuple(q.tokens))

        for tkn in tokens:
            vocab.add1(tkn)

        grams, inv_grams, gap_grams = get_ngrams(tokens, 2, inverted=True, with_gap=True)
        for g in grams + inv_grams + gap_grams:
            vocab.add2(g)

    return queries, vocab
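The parsing above implies a tab-separated query file: one query per line with an id, the query text, and an optional third column of synonyms. A hedged sketch (hypothetical file name and contents, assuming Query, Vocab, Tokenizer, Lemmatizer, and get_ngrams are importable from the same module):

sample = ("1\thow to reset password\tforgot password recover account\n"
          "2\topening hours\n")
with open("queries_sample.tsv", "w") as f:
    f.write(sample)

queries, vocab = load_queries("queries_sample.tsv")
print(queries[1].tokens, queries[1].synonim_tokens)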
Example #9
def main(args):
    logger.info(f"Args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    spm_path = os.path.join('spm', args.spm, "spm.model")
    args.sample = parse_sample_options(args.sample)
    logger.info(f"Loading tokenizer from {spm_path}")
    tokenizer = Tokenizer(spm_path)
    args.ntoken = ntoken = len(tokenizer)
    logger.info(f"  Vocabulary size: {ntoken}")

    logger.info("Reading dataset")
    data = {}
    for x in ['train', 'valid', 'test']:
        data[x] = read_data(os.path.join(args.data_dir, f"{x}.query.txt"),
                            min_len=args.min_len)
        logger.info(f"  Number of {x:>5s} data: {len(data[x]):8d}")

    logger.info("Preparing model and optimizer")
    config = LMConfig(ntoken, args.ninp, args.nhid, args.nlayers,
                      args.dropouti, args.dropoutr, args.dropouth,
                      args.dropouto)
    model = LanguageModel(config).to(device)
    params = get_params(model)
    logger.info(
        f"  Number of model parameters: {sum(p.numel() for p in params)}")
    optimizer = torch.optim.Adam(params)

    if args.resume:
        logger.info(f"Loading model from {args.resume}")
        model_load(args.resume, model, optimizer)
        model = model.to(device)

    if n_gpu > 1:
        logger.info(f"Making model as data parallel")
        model = torch.nn.DataParallel(model, dim=1)

    train(model, optimizer, tokenizer, data['train'], data['valid'], args)

    test(model, tokenizer, data['test'], args)
Example #10
def main(args):
    logger.info(f"Args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    spm_path = os.path.join('spm', args.spm, "spm.model")
    logger.info(f"Loading tokenizer from {spm_path}")
    tokenizer = Tokenizer(spm_path)
    args.ntoken = ntoken = len(tokenizer)
    args.branching_factor = min([args.branching_factor, args.ntoken])
    logger.info(f"  Vocab size: {ntoken}")

    n_queries_str = f"{f'only {args.n_queries} samples' if args.n_queries else 'all'} queries from"
    logger.info(f"Reading a dataset ({n_queries_str} test.query.txt)")
    seen_set = set(
        read_data(os.path.join(args.data_dir, "train.query.txt"),
                  min_len=args.min_len))
    test_data = read_data(os.path.join(args.data_dir, "test.query.txt"),
                          min_len=args.min_len)
    if args.n_queries:
        random.seed(args.seed)
        test_data = random.sample(test_data, args.n_queries)
    n_seen_test_data = len([x for x in test_data if x in seen_set])
    n_unseen_test_data = len(test_data) - n_seen_test_data
    logger.info(
        f"  Number of test data: {len(test_data):8d} (seen {n_seen_test_data}, unseen {n_unseen_test_data})"
    )

    logger.info(f"Loading model from {args.model_dir}")
    model = model_load(args.model_dir)
    model = model.to(device)

    logger.info('Generation starts!')
    with torch.no_grad():
        generate(model,
                 tokenizer,
                 test_data,
                 args,
                 seen_set=seen_set,
                 calc_mrl=args.calc_mrl)
Example #11
    def __init__(self,
                 data_file,
                 meta_info_file,
                 vocab_file,
                 max_seq_length,
                 max_label_num=10,
                 **kwargs):
        super(MetaIntentDataset, self).__init__(data_file, **kwargs)
        self.tokenizer = Tokenizer(backend="bert", vocab_file=vocab_file)
        self.max_seq_length = max_seq_length
        self.max_label_num = max_label_num

        with io.open(meta_info_file) as f:
            meta_info_json = eval(json.load(f))['data']

        self.task_to_idx = dict()
        self.task_to_label_mapping = dict()
        self.task_to_label_features = dict()
        self.label_to_memory_id = {"PAD": 0}

        for task_label_info in meta_info_json:
            labels = task_label_info["labelMap"]

            # labels contained in this task
            label_map = {label: idx for idx, label in enumerate(labels)}

            # task_key: the task name
            task_key = task_label_info["taskKey"]

            self.task_to_idx[task_key] = len(self.task_to_idx)
            self.task_to_label_mapping[task_key] = label_map

            for label in labels:
                # Note: different tasks may reuse the same label name; within one dataset,
                # an identical label name is assumed to express the same meaning.
                if label not in self.label_to_memory_id:
                    self.label_to_memory_id[label] = len(
                        self.label_to_memory_id)
                                       max_para_len=512,
                                       max_char_len=32,
                                       total_vocab=total_vocab)
    dev_examples = example_wordpiece(dev_examples,
                                     max_ques_len=64,
                                     max_para_len=512,
                                     max_char_len=32,
                                     total_vocab=total_vocab)
    # test_examples = example_wordpiece(test_examples, max_ques_len=64, max_para_len=512, max_char_len=32,
    #                                   total_vocab=total_vocab)

    # word_counter = search_words(train_examples + dev_examples + test_examples)
    word_counter = search_words(train_examples + dev_examples)
    word_counter = filter_words(word_counter, min_count=18)
    tokenization = Tokenizer(
        vocab_file='/home/liwei/data/Tencent_AILab_ChineseEmbedding.txt',
        word_counter=word_counter)
    if not os.path.exists('dataset/preprocessed_data/'):
        os.makedirs('dataset/preprocessed_data/')
    with open('dataset/preprocessed_data/vocab.json', 'w') as w:
        json.dump(tokenization.vocab, w, indent=4)
    np.save('dataset/preprocessed_data/embedding_mat.npy',
            tokenization.embedding)

    examples_to_features(train_examples,
                         type='train',
                         is_training=True,
                         max_para_len=512,
                         tokenization=tokenization)
    examples_to_features(dev_examples,
                         type='dev',
Example #13
    # Load vocabulary
    with open(config.vocab, 'rb') as reader:
        vocab = pickle.load(reader)

    # Select tokenizer
    config.tokenizer = config.tokenizer.lower()
    if config.tokenizer == TOKENIZER[0]:
        from nltk.tokenize import word_tokenize
        tokenization_fn = word_tokenize
    elif config.tokenizer == TOKENIZER[1]:
        from konlpy.tag import Mecab
        tokenization_fn = Mecab().morphs

    tokenizer = Tokenizer(tokenization_fn=tokenization_fn,
                          vocab=vocab,
                          is_sentence=config.is_sentence,
                          max_seq_length=config.max_seq_length)

    # Build dataloader
    train_dataset = Corpus(corpus_path=config.train_corpus,
                           tokenizer=tokenizer,
                           cuda=config.cuda)
    valid_dataset = Corpus(corpus_path=config.valid_corpus,
                           tokenizer=tokenizer,
                           cuda=config.cuda)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=config.batch_size,
                              shuffle=config.shuffle)
    valid_loader = DataLoader(dataset=valid_dataset,
                              batch_size=config.batch_size,
                              shuffle=config.shuffle)
Example #14
import yaml
from yaml import Loader
from tokenization import Tokenizer

tokenizer = Tokenizer(False)


def load_scenarios():
    with open('../config/scenarios.yml', 'r') as f:
        data = yaml.load(f.read(), Loader=Loader)
    scenarios = []
    for scenario in data.values():
        scenario['triggers'] = [tokenizer.transform(trig)
                                for trig in scenario['trigger'].split('|')]
        del scenario['trigger']
        scenario['responses'] = scenario['response'].split('|')
        del scenario['response']
        scenarios.append(scenario)
    return scenarios
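A hedged illustration (not from the original source) of the scenarios.yml layout that load_scenarios() expects: a mapping of scenario names to entries with '|'-separated 'trigger' and 'response' strings. The scenario names and texts below are hypothetical.

example_yaml = """
greeting:
  trigger: hello|hi|good morning
  response: Hello!|Hi there!
goodbye:
  trigger: bye|see you later
  response: Goodbye!
"""
data = yaml.load(example_yaml, Loader=Loader)
for name, scenario in data.items():
    print(name, scenario['trigger'].split('|'), scenario['response'].split('|'))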
Example #15
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--train_folder", default=None, type=str, help="QA folder for training. E.g., train")
    parser.add_argument("--dev_folder", default=None, type=str, help="QA folder for dev. E.g., dev")
    parser.add_argument("--test_folder", default=None, type=str, help="QA folder for test. E.g., test")
    parser.add_argument("--vocab_file", default=None, type=str, help="Vocab txt for vocabulary")
    parser.add_argument("--KB_file", default=None, type=str, help="KB json for question answering")
    parser.add_argument("--M2N_file", default=None, type=str, help="mid2name json for question answering")
    parser.add_argument("--QUERY_file", default=None, type=str, help="query json for recording searched queries")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written")

    # Other parameters
    parser.add_argument("--load_model", default=None, type=str, help="The pre-trained model to load")
    parser.add_argument("--save_model", default='BaseSave', type=str, help="The name that the models save as")
    parser.add_argument("--config", default='config/base_config.json', help="The config of base model")
    parser.add_argument("--num_train_epochs", default=20, type=int, help="The epoches of training")
    parser.add_argument("--do_train", default=1, type=int, help="Whether to run training")
    parser.add_argument("--do_eval", default=1, type=int, help= "Whether to run eval")
    parser.add_argument("--train_batch_size", default=1, type=int, help="Total batch size for training")
    parser.add_argument("--eval_batch_size", default=1, type=int, help="Total batch size for eval")
    parser.add_argument("--learning_rate", default=5e-6, type=float, help="Total number of training epoches to perform")
    parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for.")
    parser.add_argument("--seed", default=123, type=int, help="random seeed for initialization")
    parser.add_argument("--gpu_id", default=1, type=int, help="id of gpu")
    parser.add_argument("--top_k", default=1, type=int, help="retrieve top k relation path during prediction")
    parser.add_argument("--max_hop_num", default=1, type=int, help="maximum hop number")
    parser.add_argument("--do_policy_gradient", default=1, type=int, help="Whether to train with policy gradient. 1: use policy gradient; 2: use maximum likelihood with beam")
    args = parser.parse_args()

    if torch.cuda.is_available():
        logger.info("cuda {} is available".format(args.gpu_id))
        device = torch.device("cuda", args.gpu_id) #
        n_gpu = 1
    else:
        device = None
        logger.info("cuda is unavailable")

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)
    load_model_file = args.load_model+".bin" if args.load_model else None
    save_model_file = os.path.join(args.output_dir, args.save_model+".bin") if args.save_model else os.path.join(args.output_dir, "base_model.bin")
    save_eval_cp_file = os.path.join(args.output_dir, args.save_model+"_predcp.txt")
    save_eval_file = os.path.join(args.output_dir, args.save_model+".txt")
    save_kb_cache = os.path.join(os.path.dirname(args.KB_file), "kb_cache.json")
    save_m2n_cache = os.path.join(os.path.dirname(args.M2N_file), "m2n_cache.json")
    save_query_cache = os.path.join(os.path.dirname(args.QUERY_file), "query_cache.json")

    tokenizer = Tokenizer(args.vocab_file)
    KB = {} if args.do_eval == 2 else convert_json_to_load(Load_KB_Files(args.KB_file)) if args.KB_file else None
    M2N = {} if args.do_eval == 2 else Load_KB_Files(args.M2N_file)
    QUERY = set() if args.do_eval == 2 else set(Load_KB_Files(args.QUERY_file))

    config = ModelConfig.from_json_file(args.config)
    policy = Policy(config, tokenizer.vocab, device)
    if load_model_file and os.path.exists(load_model_file):
        model_dic = torch.load(load_model_file, map_location='cpu')
        policy.load_state_dict(model_dic, strict=True)
        print("successfully load pre-trained model ...")
    elif config.method in ['Bert']:
        model_dic = torch.load('config/pytorch_model.bin', map_location='cpu')
        model_dic = {re.sub('bert', 'ranker', k): v for k, v in model_dic.items()}
        model_dic['ranker.embeddings.token_type_embeddings.weight'] = torch.cat([model_dic['ranker.embeddings.token_type_embeddings.weight'], model_dic['ranker.embeddings.token_type_embeddings.weight'][1:]], 0)
        if config.method in ['Bert_tmp']: model_dic.update({re.sub('encoder', 'KBencoder', k): v for k, v in model_dic.items() if re.search('encoder', k)})
        policy.load_state_dict(model_dic, strict=False)
        print("successfully load Bert model ...")
    else:
        print("successfully initialize model ...")
    #print(policy.ranker.decoder.weight.data); exit()
    if args.gpu_id:
        policy.to(device)

    global_step, max_eval_reward, t_total = 0, -0.1, 0
    if args.do_eval:
        dev_instances = create_instances(input_file=args.dev_folder,
                                          tokenizer=tokenizer)
        test_instances = create_instances(input_file=args.test_folder,
                                          tokenizer=tokenizer)
        logger.info("***** Loading evaluation *****")
        logger.info("   Num dev examples = %d", len(dev_instances))
        logger.info("   Num test examples = %d", len(test_instances))
        logger.info("   Batch size = %s", args.eval_batch_size)
    if args.do_train:
        train_instances = create_instances(input_file=args.train_folder,
                                           tokenizer=tokenizer)
        logger.info("***** Loading training ******")
        logger.info("    Num examples = %d" , len(train_instances))
        logger.info("    Batch size = %s", args.train_batch_size)
        t_total = len(train_instances)*args.num_train_epochs

    # Prepare optimizer
    # param_optimizer = list(policy.named_parameters())
    # param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # optimizer_grouped_parameters = [
    #     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    #     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    #     ]
    # optimizer = BertAdam(optimizer_grouped_parameters,
    #                      lr=args.learning_rate,
    #                      warmup=args.warmup_proportion,
    #                      t_total=t_total)
    param_optimizer = list(policy.parameters())
    optimizer = optim.Adam(param_optimizer, lr=args.learning_rate)

    args.num_train_epochs = 1 if not args.do_train else args.num_train_epochs
    for epoch in trange(int(args.num_train_epochs), desc="Epoch"):

        tr_loss, tr_LM_loss, tr_reward, tr_reward_boundary, hop1_tr_reward, nb_tr_examples, nb_tr_steps, query_num = 0., 0., 0., 0., 0, 0, 0, 0.
        if args.do_train:
            policy.train()
            if args.do_eval == 2: train_instances = train_instances[:1]
            random.shuffle(train_instances)

            for step, batch in enumerate(train_instances[:5000]):
                #print(step)
                done, skip_forward = False, False
                time, _total_losses = 0, 0

                while time < args.max_hop_num:
                    # Retrieve graphs based on the current graph
                    cp, ts, tn, ty_n, su_n, ye_n, an_n, hn, RAs, mcl, qr_n, done = retrieve_KB(batch, KB, QUERY, M2N, tokenizer, config.method, time = time, is_train=True, save_model=args.save_model)
                    query_num += qr_n

                    if len(cp) == 0: skip_forward = True; break # When there is no candidate paths for the question, skip
                    ready_batch = select_field(batch.question, cp, ts, tn, ty_n, su_n, ye_n, an_n, hn, RAs, mcl, is_train=True, method=config.method, save_model=args.save_model)
                    if args.gpu_id: ready_batch = tuple(t.to(device) for t in ready_batch)

                    # Step through environment using chosen action
                    _logits, _losses = policy(ready_batch, None)
                    _total_losses += _losses if _losses else 0
                    logits = _logits.cpu().data.numpy() if args.gpu_id else _logits.data.numpy()
                    adjust_F1s = torch.tensor(batch.current_F1s, dtype=torch.float).view(1, -1)
                    F1s = torch.tensor(batch.F1s, dtype=torch.float).view(1, -1)
                    if args.gpu_id: _adjust_F1s, _F1s = adjust_F1s.to(device), F1s.to(device)
                    if torch.isnan(_logits).any() or (_logits.size()!= _adjust_F1s.size()): skip_forward = True; break # When there is a bug, skip
                    _action, _adjust_loss = select_action(policy, _logits, adjust_F1s = _adjust_F1s, F1s = _F1s,
                                                          is_train=True, is_reinforce=args.do_policy_gradient, epoch=epoch) #True
                    if args.do_policy_gradient ==2: loss= update_policy_immediately(_adjust_loss, optimizer)
                    action = _action.cpu().data.numpy() if args.gpu_id else _action.data.numpy()
                    eval_metric = 'GraphAcc' if (time==0 and tokenizer.dataset in ['CWQ']) else 'AnsAcc' if (tokenizer.dataset in ['FBQ']) else 'F1Text' if (tokenizer.dataset in ['CQ']) else 'F1'
                    reward, _, done, _, _ = generate_F1(logits, action, batch, time = time, is_train=True, eval_metric=eval_metric, M2N=M2N)
                    if time== 0 and tokenizer.dataset in ['CWQ']: hop1_tr_reward += np.mean(reward)
                    update_train_instance(batch, action)

                    # Save reward
                    policy.reward_episode.append(reward)
                    if done: break # When the best path in the previous iteration is same as the best path in current iteration
                    time += 1
                #if np.max(batch.orig_F1s) > reward: print(np.max(batch.orig_F1s)); print(reward); exit()
                # Used to determine when the environment is solved.
                if not skip_forward:
                    if args.do_policy_gradient != 2:
                        lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                        loss = update_policy(_adjust_loss, policy, optimizer, batch, device = device, LM_loss = _total_losses, is_reinforce=args.do_policy_gradient)

                    tr_loss += loss
                    if _total_losses: tr_LM_loss += _total_losses.item()
                    tr_reward_boundary += np.max(batch.orig_F1s)
                    tr_reward += np.mean(reward)
                    nb_tr_examples += 1
                    nb_tr_steps += 1
                    global_step += 1
                policy.reset()
                batch.reset()

                if (step + 1) % 5000 == 0:
                    print('trained %s instances ...' %step)
                    # model_to_save = policy.module if hasattr(policy, 'module') else policy
                    # torch.save(model_to_save.state_dict(), save_model_file)
                    # Save_KB_Files(convert_json_to_save(KB), save_kb_cache)
                    # Save_KB_Files(M2N, save_m2n_cache)
                    # Save_KB_Files(list(QUERY), save_query_cache)

        if args.do_eval:
            policy.eval()
            eval_reward, nb_eval_steps, nb_eval_examples = 0, 0, 0
            if args.do_eval == 2: dev_instances = dev_instances[:1]

            for eval_step, batch in enumerate(dev_instances):
                done, skip_forward, pred_cp = False, False, ''
                time = 0
                #print(eval_step)
                while time < args.max_hop_num:
                    time1 = mytime.time()
                    cp, ts, tn, ty_n, su_n, ye_n, an_n, hn, RAs, mcl, qr_n, done = retrieve_KB(batch, KB, QUERY, M2N, tokenizer, config.method, time = time)
                    query_num += qr_n

                    if len(cp) == 0: skip_forward = True; break
                    ready_batch = select_field(batch.question, cp, ts, tn, ty_n, su_n, ye_n, an_n, hn, RAs, mcl, method=config.method)
                    if args.gpu_id: ready_batch = tuple(t.to(device) for t in ready_batch)

                    # Step through environment using chosen action
                    with torch.no_grad():
                        _logits, _ = policy(ready_batch, None)
                    logits = _logits.cpu().data.numpy() if args.gpu_id else _logits.data.numpy()

                    _action, _ = select_action(policy, _logits, is_train=False, k=args.top_k)
                    action = _action.cpu().data.numpy() if args.gpu_id else _action.data.numpy()
                    eval_metric = 'AnsAcc' if (tokenizer.dataset in ['FBQ']) else 'F1Text' if (tokenizer.dataset in ['CQ']) else 'F1'
                    reward, pred_cp, done, _, _ = generate_F1(logits, action, batch, time = time, is_train = False, eval_metric=eval_metric, M2N=M2N)
                    update_train_instance(batch, action)

                    if done: break
                    time += 1

                if not skip_forward:
                    eval_reward += np.mean(reward)
                    nb_eval_examples += 1
                    nb_eval_steps += 1
                batch.reset()
                #print(logits); exit()
            result = {'training loss': tr_loss/np.max([nb_tr_examples, 1.e-10]),
                      'training reward': tr_reward/np.max([nb_tr_examples, 1.e-10]),
                      'dev reward': eval_reward/np.max([nb_eval_examples, 1.e-10])}
            if tokenizer.dataset in ['CWQ', 'WBQ']: result['train reward boundary'] = tr_reward_boundary/np.max([nb_tr_examples, 1.e-10])
            if tokenizer.dataset in ['CWQ']: result['training hop1 acc'] = hop1_tr_reward/np.max([nb_tr_examples, 1.e-10])
            if 'LM' in config.method: result['training LM loss'] = tr_LM_loss/np.max([nb_tr_examples, 1.e-10])
            eval_reward = eval_reward/np.max([nb_eval_examples, 1.e-10])

            if eval_reward >= max_eval_reward:
                max_eval_reward = eval_reward
                if args.do_eval == 2: test_instances = test_instances[:1]
                eval_reward, nb_eval_steps, nb_eval_examples, eval_pred_cps, eval_pred_top_ans, eval_reward_boundary = 0, 0, 0, [], [], 0

                for eval_step, batch in enumerate(test_instances): #[328:329]
                    done, skip_forward, pred_cp = False, False, ''
                    time, reward, top_pred_ans = 0, [0], defaultdict(int)
                    #print(eval_step)
                    while time < args.max_hop_num:
                        time1 = mytime.time()
                        cp, ts, tn, ty_n, su_n, ye_n, an_n, hn, RAs, mcl, qr_n, done = retrieve_KB(batch, KB, QUERY, M2N, tokenizer, config.method, time = time)
                        query_num += qr_n

                        if len(cp) == 0:
                            skip_forward = True
                            break
                        ready_batch = select_field(batch.question, cp, ts, tn, ty_n, su_n, ye_n, an_n, hn, RAs, mcl, method=config.method)
                        if args.gpu_id: ready_batch = tuple(t.to(device) for t in ready_batch)

                        # Step through environment using chosen action
                        with torch.no_grad():
                            _logits, _ = policy(ready_batch, None)
                            _logits = F.softmax(_logits, 1)
                        logits = _logits.cpu().data.numpy() if args.gpu_id else _logits.data.numpy()
                        adjust_F1s = torch.tensor(batch.current_F1s, dtype=torch.float).view(1, -1)
                        if args.gpu_id: _adjust_F1s = adjust_F1s.to(device)

                        _action, _ = select_action(policy, _logits, is_train=False, k=args.top_k, time = time) # adjust_F1s = _adjust_F1s,  if time < 2 else None
                        action = _action.cpu().data.numpy() if args.gpu_id else _action.data.numpy()
                        eval_metric = 'AnsAcc' if (tokenizer.dataset in ['FBQ']) else 'F1Text' if (tokenizer.dataset in ['CQ']) else 'Hits1' if (tokenizer.dataset in ['CWQ']) else 'F1'
                        reward, pred_cp, done, pred_ans, top_pred_ans = generate_F1(logits, action, batch, time = time, is_train = False, eval_metric=eval_metric, M2N=M2N, top_pred_ans=top_pred_ans)
                        update_train_instance(batch, action)
                        if done: break
                        time += 1
                    #if len(pred_cp.split(' ')) < 2: print(eval_step); exit()
                    eval_pred_cps += [re.sub('\n', '', '%s\t%s\t%s\t%s' %(eval_step+1, pred_cp, reward, '\t'.join(pred_ans)))]
                    eval_pred_top_ans += [top_pred_ans]
                    #print(top_pred_ans)

                    if not skip_forward:
                        #if np.max(batch.orig_F1s) > np.mean(reward): print(batch.orig_F1s); print(reward); print(eval_step); exit()
                        eval_reward += np.mean(reward)
                        eval_reward_boundary += np.max(batch.orig_F1s)
                        nb_eval_examples += 1
                        nb_eval_steps += 1
                    batch.reset()

                result['test reward'] = eval_reward/np.max([nb_eval_examples, 1.e-10])
                result['query times'] = '%s (save model) %s' %(query_num, mask_weight)
                if args.do_eval == 2: print(result); exit()
                if tokenizer.dataset in ['CWQ', 'WBQ', 'CQ']: result['test reward boundary'] = eval_reward_boundary/np.max([nb_eval_examples, 1.e-10])
                g = open(save_eval_cp_file, "w")
                g.write('\n'.join(eval_pred_cps))
                g.close()
                if eval_pred_top_ans:
                    g = open(re.sub('.txt$', '.json', save_eval_cp_file), "w")
                    for top_pred_ans in eval_pred_top_ans:
                        json.dump(top_pred_ans, g)
                        g.write('\n')
                    g.close()

                if args.do_train:
                    '''save the model and some kb cache'''
                    model_to_save = policy.module if hasattr(policy, 'module') else policy
                    torch.save(model_to_save.state_dict(), save_model_file)
                    Save_KB_Files(convert_json_to_save(KB), save_kb_cache)
                    Save_KB_Files(M2N, save_m2n_cache)
                    Save_KB_Files(list(QUERY), save_query_cache)

            with open(save_eval_file, "a") as writer:
                logger.info("***** Eval results (%s)*****" %epoch)
                writer.write("***** Eval results (%s)*****\n" %epoch)
                for key in sorted(result.keys()):
                    logger.info(" %s=%s", key, str(result[key]))
                    writer.write("%s=%s \n" %(key, str(result[key])))
def train():
    tf.logging.set_verbosity(tf.logging.INFO)
    do_train=True
    do_eval=True
    do_test=True
    max_seq_length=50
    batch_size=256
    epochs=200
    warmup_proportion=0.1
    log_steps=500
    model_save_dir='./save'
    train_data_dir='./train_data'
    raw_data_dir='./data'
    tokenizer=Tokenizer()
    create_vocab(raw_data_dir,train_data_dir,tokenizer)
    train_examples = get_train_examples(raw_data_dir,'train')
    #print(len(train_examples))
    train_file = os.path.join(train_data_dir, "train.tf_record")
    file_based_convert_examples_to_features(train_examples, train_data_dir, max_seq_length, tokenizer, train_file)
    config=basic_model_classify.basic_config()
    config.vocab_size=len(tokenizer.vocab)
    config.max_length=max_seq_length
    config.n_tags=len(tokenizer.tags)
    config.batch_size=batch_size
    config.test=False
    num_train_steps=int(len(train_examples)/batch_size*epochs)
    num_warmup_steps=int(num_train_steps*warmup_proportion)
    #_trainining_hooks=_log()
    #_trainining_hooks=None
    model_fn=model_fn_builder(config=config, num_labels=config.n_tags, learning_rate=config.learning_rate,
                     num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps)
    session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_config.gpu_options.allow_growth = True
    run_config=tf.estimator.RunConfig(model_dir=model_save_dir,log_step_count_steps=log_steps,session_config=session_config)
    estimater=tf.estimator.Estimator(model_fn=model_fn,model_dir=model_save_dir,params={'batch_size':config.batch_size},config=run_config)
    if do_train:
        tf.logging.info("train examples length:{}".format(len(train_examples)))
        tf.logging.info("train total steps:{}".format(num_train_steps))
        input_fn=file_based_input_fn_builder(train_file,config.max_length,True,True)
        estimater.train(input_fn,steps=num_train_steps)
    if do_eval:
        eval_examples=get_train_examples(raw_data_dir,'test')
        tf.logging.info("eval examples length:{}".format(len(eval_examples)))
        eval_file = os.path.join(train_data_dir, "test.tf_record")
        file_based_convert_examples_to_features(eval_examples, train_data_dir, max_seq_length, tokenizer, eval_file)
        input_fn=file_based_input_fn_builder(eval_file,config.max_length,False,False)
        num_eval_steps=int(len(eval_examples)/batch_size)
        tf.logging.info("eval total steps:{}".format(num_eval_steps))
        result=estimater.evaluate(input_fn,steps=num_eval_steps)
        output_eval_file = os.path.join('./', "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    if do_test:
        test_examples=get_train_examples(raw_data_dir,'test')
        tf.logging.info("test examples length:{}".format(len(test_examples)))
        test_file = os.path.join(train_data_dir, "test.tf_record")
        file_based_convert_examples_to_features(test_examples, train_data_dir, max_seq_length, tokenizer, test_file)
        input_fn=file_based_input_fn_builder(test_file,config.max_length,False,False)
        num_test_steps=int(len(test_examples)/batch_size)
        result=estimater.predict(input_fn)
        result=[i for i in result]
        y_true = [i['real_label'] for i in result]
        y_pred = [i['pre_label'] for i in result]
        with open("test_tmp.txt", 'w') as f:
            res = classification_report(y_true, y_pred)
            print(res)
            f.write(res)
Example #17
            bleu_cmd += ["-lc"]
        bleu_cmd += [reference_file.name]
        try:
            bleu_out = subprocess.check_output(bleu_cmd,
                                               stdin=read_pred,
                                               stderr=subprocess.STDOUT)
            bleu_out = bleu_out.decode("utf-8")
            bleu_score = re.search(r"BLEU = (.+?),", bleu_out).group(1)
            bleu_score = float(bleu_score)
        except subprocess.CalledProcessError as error:
            if error.output is not None:
                print("multi-bleu.perl script returned non-zero exit code")
                print(error.output)
                bleu_score = np.float32(0.0)

    # Close temp files
    hypothesis_file.close()
    reference_file.close()
    return bleu_score


if __name__ == "__main__":
    from tokenization import Tokenizer
    tokenizer = Tokenizer(
        vocab_file='./src/utils/pretrain-data/gpt2-vocab.json',
        merge_file='./src/utils/pretrain-data/gpt2-merges.txt')
    b = BLEU(tokenizer)
    b.update(['I am his fathers.', 'You are here.'],
             ['I am his father.', 'I am here.'])
    print(b.bleu, type(b.bleu))
def create_tfrecords(params, write_remainder=True, write_every_n_files=1, resume_from_checkpoint=True, display_pbar=True):
    # iterates through files in input_dir, splitting into <args.chunk_size> chunks and saving a tfrecords file every <args.files_per> chunks.
    files, args, process_no = params
    if args.wwm:
        print("WWM Masking ON")
        enc = WWMTokenizer(args.seq_len)
    else:
        print("No WWM Masking")
        enc = Tokenizer()

    # init metadata
    discarded_files = 0
    files_processed = 0
    tfrecord_count = 0
    pbar = tqdm(desc=f"Writing TFRecord Files to {args.output_dir}. Parsed 0 input files. files_written ", disable= not display_pbar)
    checkpoint_path = f"{args.output_dir}/processed_files.txt"

    input_ids_to_prepend = []
    labels_to_prepend = []

    input_ids_list_array = []
    labels_list_array = []

    files_processed_list = []

    for f in files:
        # Read in most updated list of processed files & skip if already processed
        resume_files_processed = read_checkpoint(checkpoint_path, resume_from_checkpoint)
        if f in resume_files_processed:
            continue
        for input_ids_list, labels_list in archive_to_tokens(f, enc, args): # input_ids_list is a whole file chunked in lists of seq_len
            files_processed += 1

            # if the last chunk < chunk size, but > minimum_size, take it and append it to the beginning of the next file
            n_tokens = len(input_ids_list[-1])
            if n_tokens < args.seq_len:
                input_ids_last = input_ids_list.pop(-1)
                labels_last = labels_list.pop(-1)
                if n_tokens >= args.minimum_size:
                    input_ids_to_prepend.extend(input_ids_last)
                    labels_to_prepend.extend(labels_last)
                else:
                    discarded_files += 1

            if len(input_ids_to_prepend) >= args.seq_len:
                # if length of data_to_prepend becomes greater than chunk size, add concatted files to tokenized files
                input_ids_list_array.append(input_ids_to_prepend[:args.seq_len])
                input_ids_to_prepend = input_ids_to_prepend[args.seq_len:]

                labels_list_array.append(labels_to_prepend[:args.seq_len])
                labels_to_prepend = labels_to_prepend[args.seq_len:]

            # add tokenized files > chunk size to main array
            input_ids_list_array.extend(input_ids_list)
            labels_list_array.extend(labels_list)

            if len(labels_list_array) >= args.files_per * write_every_n_files: # write every n files
                _tfrecord_count, input_ids_remainder, labels_remainder = write_files(input_ids_list_array, labels_list_array, files_per=args.files_per, output_dir=args.output_dir, out_name=args.name, start_no = tfrecord_count, process_no=process_no)
                pbar.update(_tfrecord_count - tfrecord_count) # update progress bar
                pbar.set_description(f"Writing TFRecord Files to {args.output_dir}. Parsed {files_processed} input files. files_written ")
                tfrecord_count = _tfrecord_count
                input_ids_list_array = input_ids_remainder if input_ids_remainder is not None else [] # add remaining files to next chunk
                labels_list_array = labels_remainder if labels_remainder is not None else []
                with open(f"{checkpoint_path}", "a") as myfile:
                    for x in files_processed_list:
                        myfile.write(f"{x}, ")
                    files_processed_list = []

        # Save the file names to skip next time if not doing all in one go
        files_processed_list.append(f)

    if len(labels_list_array) >= args.files_per: # also write at end
        _tfrecord_count, input_ids_remainder, labels_remainder = write_files(input_ids_list_array, labels_list_array, files_per=args.files_per, output_dir=args.output_dir, out_name=args.name, start_no=tfrecord_count, process_no=process_no)
        pbar.update(_tfrecord_count - tfrecord_count)
        pbar.set_description(f"Writing TFRecord Files to {args.output_dir}. Parsed {files_processed} input files. files_written ")
        tfrecord_count = _tfrecord_count
        with open(f"{checkpoint_path}", "a") as myfile:
            for x in files_processed_list:
                myfile.write(f"{x}, ")
            files_processed_list = []
    else:
        input_ids_remainder = input_ids_list_array # add remaining to remainder
        labels_remainder = labels_list_array 

    if write_remainder:
        # write out the remaining files even if there's less than files_per
        write_files(input_ids_list_array, labels_list_array, files_per=args.files_per, output_dir=args.output_dir, out_name=args.name, start_no=tfrecord_count, write_remainder=True)

    successful_files = files_processed - discarded_files
    return {"discarded": discarded_files, "processed": files_processed, "successful": successful_files}
Example #19
from tokenization import Tokenizer
from flask import Flask, request
import json
import numpy as np
from loadScenarios import load_scenarios
from comparaison import compare_tokens

app = Flask(__name__)

tokenizer = Tokenizer(using_stopwords=False)

scenarios = load_scenarios()

threshold = 0.9


@app.route('/', methods=['POST', 'GET'])
def api():
    args = dict(request.form)
    message = args['content']
    message = tokenizer.transform(message)
    print(message)
    response = "I don't understand ..."
    if len(message) == 0:
        return response
    max_similarity = 0
    for scenario in scenarios:
        similarity = compare_tokens(message, scenario['triggers'])
        print(similarity, scenario['responses'])
        if similarity > max_similarity and similarity > threshold:
            response = scenario['responses'][np.random.randint(
Example #20
        if not os.path.exists(os.path.join(args["data_path"], "temp")):
            os.makedirs(os.path.join(args["data_path"], "temp"))
        np.save(train_pre_path.format("data"), train_data, allow_pickle=True)
        np.save(val_pre_path.format("data"), val_data, allow_pickle=True)
        np.save(test_pre_path.format("data"), test_data, allow_pickle=True)
        np.save(train_pre_path.format("target"),
                train_target,
                allow_pickle=True)
        np.save(val_pre_path.format("target"), val_target, allow_pickle=True)
        np.save(test_pre_path.format("target"), test_target, allow_pickle=True)

    if run_tokenization:
        ## do tokenization
        print("Tokenize")
        tokenizer = Tokenizer(args=tokenizer_model,
                              fasttextFile=args["fasttext_file"],
                              doLower=args["doLower"])
        train_data = tokenizer.fit_transform(train_data)
        val_data = tokenizer.transform(val_data)
        test_data = tokenizer.transform(test_data)

        ## save the preprocessed data
        if not os.path.exists(os.path.join(args["data_path"], "temp")):
            os.makedirs(os.path.join(args["data_path"], "temp"))
        if sparse.issparse(train_data):
            sparse.save_npz(train_tok_path.format("data"), train_data)
        else:
            np.save(train_tok_path.format("data"), train_data)
        np.save(train_tok_path.format("target"), train_target)
        if sparse.issparse(val_data):
            sparse.save_npz(val_tok_path.format("data"), val_data)
Example #21
# coding=utf-8

import os
import re
import numpy as np
import random
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from tokenization import Tokenizer

tokenizer = Tokenizer()

input_file = "trec06p/label/index"
data_dir = "trec06p/data"
num_labels = 2
M = 2
alpha = 0.01
seed = 888

random.seed(seed)

cand_nums = [21, 24, 21, 6]
features_num = len(cand_nums)
feature_weights = [3, 0, 5, 0]
train_set_rate = 0.05

from_features = [
    '[UNK]', 'hotmail', 'lingo', 'gmail', 'yahoo', 'aol', '0451', 'iname',
    'singnet', 'www.loveinfashion', 'o-himesama', 'aries.livedoor', 'oh-oku',
    'msn', 'paypal', 'tc.fluke', 'ey', 'specialdevices', 'buta-gori',
    'plan9.bell-labs', 'halcyon'
import os
import json

import responder

from tokenization import Tokenizer


env = os.environ
DEBUG = env['DEBUG'] in ['1', 'True', 'true']
LANG = env.get('LANG')
MECAB_ARGS = env.get('MECAB_ARGS')

api = responder.API(debug=DEBUG)
tokenizer = Tokenizer(lang=LANG, mecab_args=MECAB_ARGS)


@api.route("/")
async def tokenize(req, resp):
    body = await req.text
    texts = json.loads(body)
    docs = [tokenizer.tokenize(text) for text in texts]
    resp.media = dict(data=docs)


if __name__ == "__main__":
    api.run()
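A hedged client-side sketch (not from the original source) for the "/" route above: POST a JSON list of texts and read back {"data": [...]}. The host and port assume responder's defaults.

import requests

resp = requests.post("http://127.0.0.1:5042/", json=["hello world", "tokenize me please"])
print(resp.json()["data"])                     # one token list per input text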
Example #23
    # Save model
    torch.save(model.state_dict(), '{}_lm{}.pth'.format(config.model_type.lower(), epoch))

if __name__=='__main__':
    config = argparser()
    print(config)

    # Load vocabulary
    import pickle
    with open(config.vocab, 'rb') as reader:
        vocab = pickle.load(reader)

    # Select tokenizer
    if config.tokenizer=='mecab':
        from konlpy.tag import Mecab
        tokenizer = Tokenizer(tokenization_fn=Mecab().morphs,
                              vocab=vocab, max_seq_length=config.max_seq_len)
    
    # Build dataloader
    train_loader = DataLoader(dataset=Corpus(corpus_path=config.train_corpus,
                                             tokenizer=tokenizer,
                                             model_type=config.model_type,
                                             cuda=config.cuda),
                              batch_size=config.batch_size,
                              shuffle=config.shuffle,
                              drop_last=True)
    if config.test_corpus:
        test_loader = DataLoader(dataset=Corpus(corpus_path=config.test_corpus,
                                                tokenizer=tokenizer,
                                                model_type=config.model_type,
                                                cuda=config.cuda),
                                 batch_size=config.batch_size,
Example #24
            os.makedirs(os.path.join(args["data_path"], "temp"))
        train_df.to_pickle(train_pre_path)
        val_df.to_pickle(val_pre_path)
        test_df.to_pickle(test_pre_path)
    else:
        train_df = pd.read_pickle(train_pre_path)
        val_df = pd.read_pickle(val_pre_path)
        test_df = pd.read_pickle(test_pre_path)
        ## get data and train columns
        data_column = list(set(train_df.columns) - set(args["targets"]))[0]

    if run_tokenization:
        ## do tokenization
        print("Tokenize")
        tokenizer = Tokenizer(tokenizeStr=tokenizer_model[0],
                              fasttextFile=args["fasttext_file"],
                              doLower=args["doLower"])
        train_df[data_column] = tokenizer.fit_transform(train_df[data_column])
        val_df[data_column] = tokenizer.transform(val_df[data_column])
        test_df[data_column] = tokenizer.transform(test_df[data_column])

        ## save the preprocessed data
        if not os.path.exists(os.path.join(args["data_path"], "temp")):
            os.makedirs(os.path.join(args["data_path"], "temp"))
        train_df.to_pickle(train_tok_path)
        val_df.to_pickle(val_tok_path)
        test_df.to_pickle(test_tok_path)

    else:
        train_df = pd.read_pickle(train_tok_path)
        val_df = pd.read_pickle(val_tok_path)
Example #25
    preprocessor = Preprocessor(
        doLower=args["doLower"],
        doLemmatization=args["doLemmatization"],
        removeStopWords=args["removeStopWords"],
        doSpellingCorrection=args["doSpellingCorrection"],
        removeNewLine=args["removeNewLine"],
        removePunctuation=args["removePunctuation"],
        removeHtmlTags=args["removeHtmlTags"],
        minTextLength=args["minTextLength"])
    predict_df["processed"] = preprocessor.fit_transform(
        predict_df["text_german"])
    predict_df = predict_df.dropna(subset=["processed"], axis=0)

    print("Tokenize")
    tokenizer = Tokenizer(tokenizeStr=preperation_technique,
                          ngram=preperation_ngram,
                          fasttextFile=args["fasttext_file"],
                          doLower=args["doLower"])
    predict_df["processed"] = tokenizer.fit_transform(predict_df["processed"])

    ## for testing purposes
    #train_df = train_df.sample(100)
    #val_df = val_df.sample(20)
    #test_df = test_df.sample(20)

    ## apply the model
    labels = [
        "price_pos", "price_neg", "quality_pos", "quality_neg",
        "restaurant_pos", "restaurant_neg", "food_pos", "food_neg",
        "drinks_pos", "drinks_neg", "ambience_pos", "ambience_neg",
        "service_pos", "service_neg"
    ]
Example #26
def main():
    parser = argparse.ArgumentParser()

    # model structure
    parser.add_argument('--rnncell', type=str, default='LSTM')
    parser.add_argument('--emsize', type=int, default=200)
    parser.add_argument('--nhid', type=int, default=600)
    parser.add_argument('--outsize', type=int, default=400)
    parser.add_argument('--nlayers', type=int, default=2)
    parser.add_argument('--bidirec', action='store_true')
    parser.add_argument('--autoenc', action='store_true')
    parser.add_argument('--forget-bias', type=float, default=False)
    parser.add_argument('--decoder-bias', action='store_true')

    # data
    parser.add_argument('--corpus', type=str, default='guten')
    parser.add_argument('--min-len', type=int, default=10)
    parser.add_argument('--max-len', type=int, default=80)

    # vocabulary
    parser.add_argument('--vocab', type=str, default=None)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--min-cnt', type=int, default=6)

    # training
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--seed', type=int, default=3333)
    parser.add_argument('--batch-size', type=int, default=20)
    parser.add_argument('--eval-batch-size', type=int, default=10)

    # optimizer
    parser.add_argument('--optim', type=str, default='SGD')
    parser.add_argument('--lr', type=float, default=.5)
    parser.add_argument('--clip', type=float, default=5.0)
    parser.add_argument('--decay-after', type=int, default=5)
    parser.add_argument('--decay-rate', type=float, default=0.5)
    parser.add_argument('--decay-period', type=int, default=1)
    parser.add_argument('--epochs', type=int, default=10)

    # save and log
    parser.add_argument('--save-dir', type=str, default='train/noname')
    parser.add_argument('--log-interval', type=int, default=10000)
    parser.add_argument('--eval-interval', type=int, default=10000)
    parser.add_argument('--save-all', action='store_false')
    parser.add_argument('--save-period', type=int, default=1)

    args = parser.parse_args()
    logger.debug("Running {}".format(__file__))

    if not os.path.exists(args.save_dir):
        logger.debug("Creating directory at {}".format(args.save_dir))
        os.makedirs(args.save_dir)

    args_path = os.path.join(args.save_dir, 'args.json')
    with open(args_path, 'w') as f:
        logger.debug("Saving arguments at {}".format(args_path))
        json.dump(vars(args), f, indent=2)

    log_path = os.path.join(args.save_dir, 'log.txt')
    file_handler = logging.FileHandler(log_path, mode='w')
    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Set the random seed manually for reproducibility.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Use pre built vocabulary if it exists
    if args.vocab and os.path.exists(args.vocab):
        vocab = load_vocab(args.vocab)
        update = False
    else:
        vocab = Vocabulary()
        update = True
    tokenizer = Tokenizer(vocab, args.lower)

    tr_txts = get_txts(args.corpus, 'train')
    va_txts = get_txts(args.corpus, 'valid')

    tr_input = LineInput(tr_txts, tokenizer, update, args.min_len,
                         args.max_len)
    va_input = LineInput(va_txts, tokenizer, update, args.min_len,
                         args.max_len)
    va_batches = va_input.batchify(args.eval_batch_size, False)

    if update:
        vocab.build_from_counter(args.min_cnt)
        logger.debug("Built vocab of size {}".format(len(vocab)))

    # Build the model
    model = WordRNN(len(vocab), len(vocab), args.rnncell, args.emsize,
                    args.outsize, args.nhid, args.nlayers, args.bidirec,
                    args.autoenc, args.decoder_bias, args.forget_bias,
                    args.dropout)
    logger.debug(model)
    model.to(device)

    learnables = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = getattr(optim, args.optim)(learnables, lr=args.lr)

    save_vocab(vocab, os.path.join(args.save_dir, 'vocab.txt'))
    model_path = os.path.join(args.save_dir, 'model.pt')

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        # Loop over epochs.
        best_val_loss = None

        logger.info('-' * 79)
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            tr_batches = tr_input.batchify(args.batch_size, True)
            train(model, tr_batches, learnables, optimizer, device, args)

            val_loss = evaluate(model, va_batches, device)
            logger.info('-' * 79)
            logger.info('| end of epoch {:2d} | time: {:5.2f}s '
                        '| valid loss {:5.2f} | valid ppl {:8.2f} |'.format(
                            epoch, (time.time() - epoch_start_time), val_loss,
                            math.exp(val_loss)))
            logger.info('-' * 79)

            updated_best = not best_val_loss or val_loss < best_val_loss
            if epoch >= args.decay_after > 0:
                if (epoch - args.decay_after) % args.decay_period == 0:
                    for group in optimizer.param_groups:
                        group['lr'] *= args.decay_rate

            if (epoch % args.save_period == 0) and (updated_best
                                                    or args.save_all):
                if args.save_all:
                    model_path = os.path.join(args.save_dir,
                                              'ep{}.pt'.format(epoch))
                torch.save(model.state_dict(), model_path)

                if updated_best:
                    best_val_loss = val_loss

        logger.debug("Completed training and saved to {}".format(
            args.save_dir))
    except KeyboardInterrupt:
        logger.debug('-' * 79)
        logger.debug("Exiting from training early")