def table_tokenizer(table, uncase=True):
    textTable = []
    # maxLen = 0
    for text in tqdm(table, file=sys.stdout):
        text = text_cleaner(text, uncase=uncase)
        textTable.append(text)
    return textTable
def filter_entailment(args):
    with open(args.output_file, 'w') as outfile, open(args.input_file,
                                                      'r') as infile:
        print("Writing to {} from {}".format(args.output_file,
                                             args.input_file))
        line_tqdm = tqdm(infile, dynamic_ncols=True)

        filtered_entailment = {}
        for line in line_tqdm:
            qa_json = json.loads(line)
            # Filter by entailment score
            if qa_json['score'] > args.min_entail_score:
                filtered_entailment.setdefault(qa_json["id"],
                                               []).append(qa_json)

        num_q = 0  # count of questions written to the output file
        for qid, qa_list in filtered_entailment.items():
            print('\r q', num_q, end="")
            # Filter number of entailment texts
            if len(qa_list) <= args.max_entail_docs:
                continue
            qa_list = sorted(qa_list, key=lambda qa: qa['score'], reverse=True)
            qa_list = qa_list[:args.max_entail_docs]

            # Write to outfile
            output_dict = create_output_dict(qa_list)
            outfile.write(json.dumps(output_dict) + '\n')
            num_q += 1
        print()
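filter_entailment only reads four attributes from args, so it can be exercised without the full command line. A minimal sketch follows; the paths and thresholds are placeholders, not values from the original script:

import argparse

args = argparse.Namespace(
    input_file='entailment_scores.jsonl',   # placeholder path
    output_file='filtered_scores.jsonl',    # placeholder path
    min_entail_score=0.5,                   # keep hits scoring above this
    max_entail_docs=10)                     # cap on hits kept per question id
filter_entailment(args)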
Example #3
def main():
    '''
    Runs this script.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'output_folder',
        type=str,
        help='Path to the root folder where data will be stored.')
    parser.add_argument('--num',
                        dest='num_cars',
                        type=int,
                        required=True,
                        help='Amount of cars to crawl.')
    parser.add_argument(
        '--from-cheapest',
        action='store_true',
        required=True,
        help=
        'If set, crawls the cars starting from the cheapest one, otherwise from the most expensive one.'
    )
    parser.add_argument('--save-every',
                        type=int,
                        default=100,
                        help='Interval between two security saves.')
    parser.add_argument(
        '--max-pages',
        type=int,
        default=10000,
        help='Sets the maximum number of pages to crawl. Avoids infinite loops.'
    )
    args = parser.parse_args([
        '/home/tom/second_hand_cars_data',
        '--num',
        '1000',
        '--from-cheapest',
        '--save-every',
        '200',
        '--max-pages',
        '1000',
    ])

    img_folder = os.path.join(args.output_folder, 'imgs')
    sort_value = from_most_expensive_code
    if args.from_cheapest:
        # Both branches originally assigned the same code, making --from-cheapest a
        # no-op; from_cheapest_code is assumed to be the matching sort constant.
        sort_value = from_cheapest_code
    with CrawlerStatus(status_folder=args.output_folder) as status:
        for page_id in tqdm(range(args.max_pages)):
            page_url = base_url.format(page_id, sort_value)
            try:
                crawl_page(page_url, status, img_folder, args.save_every)
            except Exception:
                # Could not complete to crawl this page, go to next one.
                logging.error('Unable to crawl page: {}'.format(page_url))
                continue
            if status.size >= args.num_cars:
                # Collected enough cars: quit crawling.
                break
Example #4
def text_cleaner(table):
    textTable = []
    maxLen = 0
    for text in tqdm(table, file=sys.stdout):
        text = get_words(text)
        textTable.append(text)
        if len(text) > maxLen:
            maxLen = len(text)
    return textTable, maxLen
def add_retrieved_text(qa_file, output_file):
    with open(output_file, 'w') as output_handle, open(qa_file, 'r') as qa_handle:
        print("Writing to {} from {}".format(output_file, qa_file))
        line_tqdm = tqdm(qa_handle, dynamic_ncols=True)
        for line in line_tqdm:
            json_line = json.loads(line)
            num_hits = 0
            for output_dict in add_hits_to_qajson(json_line):
                output_handle.write(json.dumps(output_dict) + "\n")
                num_hits += 1
            line_tqdm.set_postfix(hits=num_hits)
Example #6
def csv_processing(path, test=False):
    texts_1 = []
    texts_2 = []
    labels = []
    test_ids = []
    with codecs.open(path, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=',')
        header = next(reader)
        if not test:
            for values in tqdm(reader):
                texts_1.append(text_to_wordlist(values[3]))
                texts_2.append(text_to_wordlist(values[4]))
                labels.append(int(values[5]))
            return texts_1, texts_2, labels
        else:
            for values in tqdm(reader):
                texts_1.append(text_to_wordlist(values[1]))
                texts_2.append(text_to_wordlist(values[2]))
                test_ids.append(values[0])
            return texts_1, texts_2, test_ids
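csv_processing appears to assume the Quora question-pairs column layout (question1/question2/is_duplicate at indices 3/4/5 for train, test_id/question1/question2 at 0/1/2 for test). A minimal call, with placeholder file names, looks like:

train_q1, train_q2, train_labels = csv_processing('train.csv')        # placeholder path
test_q1, test_q2, test_ids = csv_processing('test.csv', test=True)    # placeholder path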
Example #7
def add_retrieved_text(qa_file, output_file, query_mode='common'):
    print("Query mode is {}".format(query_mode))
    with open(output_file, 'w') as output_handle, open(qa_file,
                                                       'r') as qa_handle:
        print("Writing to {} from {}".format(output_file, qa_file))
        line_tqdm = tqdm(qa_handle, dynamic_ncols=True)  # read json
        for line in line_tqdm:
            json_line = json.loads(line)

            qa_json = add_hits_to_qajson(json_line, query_mode)
            output_handle.write(json.dumps(qa_json) + "\n")
Example #8
    def reformulate_query(self, qa_file, output_file):
        with open(output_file, 'w') as reform_qa, open(qa_file,
                                                       'r') as origin_qa:
            print("Writing to {} from {}".format(output_file, qa_file))
            line_tqdm = tqdm(origin_qa, dynamic_ncols=True)
            for line in line_tqdm:
                json_line = json.loads(line)
                num_reform = 0
                for output_dict in self.reform_query_to_qajson(json_line):
                    reform_qa.write(json.dumps(output_dict) + "\n")
                    num_reform += 1
                line_tqdm.set_postfix(hits=num_reform)
Example #9
def load_emb(vocab):
    print("Reading pre-trained embeddings")
    embeddings = np.random.normal(0, 0.01, (len(vocab['w2i']), 300))
    with open("/home/data/glove/glove.840B.300d.txt", "r") as embed_in:
        line_tqdm = tqdm(embed_in, dynamic_ncols=True)
        for idx, line in enumerate(line_tqdm):
            row = line.split()
            if len(row) != 301: continue
            if row[0] in vocab['w2i']:
                embeddings[vocab['w2i'][row[0]], :] = np.asarray(
                    [float(v) for v in row[1:]])
    embeddings[vocab['w2i']['<pad>']] = np.zeros((1, 300))

    return embeddings
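load_emb only needs the 'w2i' mapping (which must contain '<pad>') and keeps the random initialization for any word missing from GloVe. A toy call, assuming the hard-coded GloVe path above is available, might look like:

vocab = {'w2i': {'<pad>': 0, 'the': 1, 'cat': 2}}
embeddings = load_emb(vocab)
print(embeddings.shape)  # (3, 300); row 0 is the all-zero padding vector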
Example #10
def generate_pkl_nis(pred_dets_hdf5, best_binary_file, out_dir, file_name):
    pred_dets = h5py.File(pred_dets_hdf5, 'r')
    binary_file = h5py.File(best_binary_file, 'r')
    print(pred_dets_hdf5)
    print(best_binary_file)

    assert len(pred_dets.keys()) == 4539

    print(len(binary_file))

    hoi_list = io.load_json_object("data/vcoco/annotations/hoi_list_234.json")
    hoi_dict = {int(hoi["id"]) - 1: hoi for hoi in hoi_list}

    result_list = []
    for global_id in tqdm(pred_dets.keys()):
        image_id = int(global_id.split("_")[1])

        start_end_ids = pred_dets[global_id]['start_end_ids']
        assert len(start_end_ids) == 234

        for hoi_id in range(234):
            start_id, end_id = pred_dets[global_id]['start_end_ids'][int(
                hoi_id)]
            if start_id == end_id:
                continue

            for j in range(start_id, end_id):
                hoi_dets = pred_dets[global_id]['human_obj_boxes_scores'][j]
                inter_score = binary_file[global_id]["binary_score_data"][j]

                final_score = hoi_dets[8] * inter_score * hoi_dets[9]
                person_boxes = hoi_dets[:4].tolist()

                per_image_dict = {}
                per_image_dict["image_id"] = image_id
                per_image_dict["person_box"] = person_boxes

                action = hoi_dict[hoi_id]["verb"]
                role = hoi_dict[hoi_id]["role"]

                per_image_dict[action + "_" + role] = [
                    hoi_dets[4], hoi_dets[5], hoi_dets[6], hoi_dets[7],
                    final_score
                ]

                result_list.append(per_image_dict)

    io.dump_pickle_object(result_list, os.path.join(out_dir,
                                                    file_name + ".pkl"))
Example #11
def w2vEmbdReader(embd_path, reVocab, embd_dim):
    logger.info('  getting pre-trained embedding from file... ')
    logger.info('  embedding length: %i  dim: %i  ' % (len(reVocab), embd_dim))
    embd_matrix = np.zeros((len(reVocab), embd_dim))
    with open(embd_path, 'r', encoding='utf8') as fhd:
        idx = 1  # leave row 0 all zeros for the padding token
        for line in tqdm(fhd, total=len(reVocab)):
            elem = line.strip().split(' ')
            assert len(elem) == embd_dim + 1, \
                'Incorrect Embedding Dimension, expect %d but got %d' % (
                    embd_dim, len(elem) - 1)
            w2vec = np.asarray(elem[1:], dtype='float32')
            embd_matrix[idx] = w2vec
            idx += 1
    return embd_matrix
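Row 0 of the returned matrix is left all-zero as the padding vector, so the embedding file is expected to hold one vector per remaining vocabulary entry, in vocabulary order. A call might look like this (the file name and vocabulary list are placeholders):

embd_matrix = w2vEmbdReader('vectors.300d.txt', reVocab=vocab_list, embd_dim=300)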
Example #12
def test(model,
         test_para,
         test_relation,
         test_label,
         loss_function=None,
         pred_flag=False):
    total_loss = []
    model.eval()
    pred = []

    N = len(test_para)
    test_size = 16
    line_tqdm = tqdm(range(N // test_size + 1), dynamic_ncols=True)
    for i in line_tqdm:
        para_test = test_para[i * test_size:min((i + 1) * test_size, N)]
        relation_test = test_relation[i * test_size:min((i + 1) *
                                                        test_size, N)]
        label_test = test_label[i * test_size:min((i + 1) * test_size, N)]
        score = model(para_test, relation_test)
        target = Variable(torch.LongTensor(label_test))
        if model.use_cuda:
            target = target.cuda()

        if loss_function is not None:
            loss = loss_function(score, target)
            total_loss.extend(loss.data.cpu().numpy().tolist())

        pred.extend(torch.argmax(score, dim=-1).cpu().tolist())

    acc, precision, recall, f1 = getScores(pred, test_label)

    if loss_function is not None:
        print("\t\tLoss: {:0.5f}".format(sum(total_loss) / len(total_loss)))
    print("\t\tAccuracy: {:0.5f}".format(acc))
    print("\t\tPrecision: {:0.5f}".format(precision))
    print("\t\tRecall: {:0.5f}".format(recall))
    print("\t\tF1: {:0.5f}".format(f1))

    if loss_function is not None:
        out = (acc, precision, recall, f1, sum(total_loss) / len(total_loss))
    else:
        out = (acc, precision, recall, f1)

    if pred_flag:
        out = (pred, ) + out

    return out
Example #13
    def download(self, url, output_path):
        # Streaming, so we can iterate over the response.
        r = requests.get(url, stream=True)

        # Total size in bytes.
        total_size = int(r.headers.get('content-length', 0))
        block_size = 1024
        wrote = 0
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, 'wb') as f:
            for data in tqdm(r.iter_content(block_size),
                             total=math.ceil(total_size / block_size),
                             unit='KB',
                             unit_scale=True):
                wrote = wrote + len(data)
                f.write(data)
        if total_size != 0 and wrote != total_size:
            raise ConnectionError("ERROR, something went wrong")
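A usage sketch, assuming an instance of the surrounding class named downloader; the URL and destination path are placeholders. The final size check raises ConnectionError when the written byte count disagrees with the Content-Length header:

downloader.download('https://example.com/archive.zip', '/tmp/data/archive.zip')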
Example #14
def tokenizeIt(table, clean=False, addHead=None):
    tokenizedTable = []
    maxLen = 0
    for text in tqdm(table, file=sys.stdout):
        if clean:
            # 			text = stripTagsAndUris(text)
            text = word_tokenize(get_words(text))
            if addHead is not None:
                text = [addHead] + text
            tokenizedTable.append(text)
            if len(text) > maxLen:
                maxLen = len(text)
        else:
            text = str(text).split(' ')
            if addHead is not None:
                text = [addHead] + text
            tokenizedTable.append(text)
            if len(text) > maxLen:
                maxLen = len(text)
    return tokenizedTable, maxLen
Example #15
def add_retrieved_text(args):
    es_search = EsSearch(es_client="node008",
                         max_hits_retrieved=args.num_retrieve * 2,
                         min_hit_length=5,
                         max_hit_length=100,
                         max_hits_per_choice=args.num_retrieve)

    with open(args.output_file,
              'w') as output_handle, open(args.input_file, 'r') as qa_handle:
        print("Writing to {} from {}".format(args.output_file,
                                             args.input_file))
        line_tqdm = tqdm(qa_handle, dynamic_ncols=True)
        for line in line_tqdm:
            json_line = json.loads(line)
            num_hits = 0
            for output_dict in add_hits_to_qajson(es_search, json_line,
                                                  args.num_retrieve):
                output_handle.write(json.dumps(output_dict) + "\n")
                num_hits += 1
            line_tqdm.set_postfix(hits=num_hits)
Example #16
def run_exp(configs, n_jobs=1):
    datasets = get_datasets(configs['data'])
    exp_results = []
    for dataset, data in datasets.items():
        for encoder_config in tqdm(configs['encoder'],
                                   'Process: {}'.format(dataset)):
            transformer_name, transformer = init_transformer(
                encoder_config, data)
            exps = make_exp(data, transformer, configs)

            exp_results.append({
                'dataset': dataset,
                'transformer': transformer_name,
                'metrics': Parallel(n_jobs=n_jobs, prefer='threads')(
                    delayed(run_one_exp)(exp) for exp in exps),
            })
    return exp_results
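run_exp reads only two top-level keys from configs: 'data' is handed to get_datasets and 'encoder' must be an iterable of encoder configurations (the whole dict is also forwarded to make_exp). The entries below are placeholders, not the real schema:

configs = {
    'data': {'name': 'my_dataset'},                     # whatever get_datasets expects
    'encoder': [{'type': 'tfidf'}, {'type': 'bert'}],   # one entry per transformer
}
results = run_exp(configs, n_jobs=4)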
Example #17
                                   pin_memory=True,
                                   drop_last=False,
                                   listen='*:%d' % (args.port + 3),
                                   timeout=600)
    testloader2 = RemoteDataLoader(augmented_dataset_t,
                                   batch_size=1,
                                   shuffle=False,
                                   pin_memory=True,
                                   drop_last=False,
                                   listen='*:%d' % (args.port + 4),
                                   timeout=600)

    logger.info(f'---- use data cache @ {cache_prefix} ---- ')
    trainloader = CachedDataLoader(trainloader,
                                   tag=f'{cache_prefix}_tr',
                                   cache_max=MAX_CACHE_EPOCH)
    # testloader1 = CachedDataLoader(testloader1, tag=f'{cache_prefix}_ts1', cache_max=CachedDataLoader.NO_SEPARATE_EPOCH)
    # validloader = CachedDataLoader(validloader, tag=f'{cache_prefix}_ts2', cache_max=CachedDataLoader.NO_SEPARATE_EPOCH)
    # testloadera = CachedDataLoader(testloadera, tag=f'{cache_prefix}_tsa', cache_max=CachedDataLoader.NO_SEPARATE_EPOCH)
    # testloader2 = CachedDataLoader(testloader2, tag=f'{cache_prefix}_ts3', cache_max=CachedDataLoader.NO_SEPARATE_EPOCH)

    trainloader = tqdm(trainloader)

    for epoch in range(args.epoch):
        cnt = 0
        start_t = time.time()
        for _ in tqdm(trainloader, desc="%04d" % epoch):
            cnt += 1
            if cnt > 38400:
                break
Example #18
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return a list of words
    return (text)


if load_train_test_pkl == '':
    texts_1 = []
    texts_2 = []
    labels = []
    with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=',')
        header = next(reader)
        for values in tqdm(reader):
            texts_1.append(text_to_wordlist(values[3]))
            texts_2.append(text_to_wordlist(values[4]))
            labels.append(int(values[5]))
    print('Found %s texts in train.csv' % len(texts_1))

    test_texts_1 = []
    test_texts_2 = []
    test_ids = []
    with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=',')
        header = next(reader)
        for values in tqdm(reader):
            test_texts_1.append(text_to_wordlist(values[1]))
            test_texts_2.append(text_to_wordlist(values[2]))
            test_ids.append(values[0])
Example #19
def main(args):

    print("Load Data")
    print(args.train_data, args.dev_data)
    files = {'train': args.train_data, 'dev': args.dev_data}

    model_name = 'gpt2'
    tokenizer = GPT2Tokenizer.from_pretrained(model_name, pad_token='<PAD>')

    # add tokens for precondition generation
    tokenizer.add_tokens([
        '<sep>', '<event>', '</event>', '<pre>', '</pre>', '<eos>', '[BLANK]'
    ])
    encdec = GPT2LMHeadModel.from_pretrained(model_name)
    encdec.resize_token_embeddings(len(tokenizer))

    # dataset load
    dataset = load_data(files, max_len=args.max_sequence_length, eos='<eos>')

    if args.load_model is not None:
        model = torch.load(args.load_model)
    else:
        model = Model(tokenizer, encdec)
        if model.use_cuda:
            model.cuda()

    data_input, gen_seed, target, target_weights = prepare(dataset, tokenizer)

    # Set a path for saving model
    save_model_path = os.path.join(args.save_model_path, args.experiment)
    if not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    # Optimizer
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, eps=1e-8)

    n_params = sum([np.prod(p.size()) for p in model.parameters()])
    print("#parameters: {}".format(n_params))

    N = len(data_input['train'])
    print(N // args.batch_size)
    best_dev_loss = 9999
    for epoch in range(1, args.epochs + 1):
        print("Epoch {}:".format(epoch))
        start_time = time.time()
        batch_idxs = np.random.permutation(N // args.batch_size + 1)
        line_tqdm = tqdm(batch_idxs, dynamic_ncols=True)
        total_loss = []
        model.train()

        for batch_idx in line_tqdm:
            enc_input = data_input['train'][batch_idx * args.batch_size:min(
                (batch_idx + 1) * args.batch_size, N)]
            tmp = gen_seed['train'][batch_idx *
                                    args.batch_size:min((batch_idx + 1) *
                                                        args.batch_size, N)]
            event_lens = [len(s) for s in tmp]

            if len(enc_input) == 0:
                continue

            model.zero_grad()

            loss = model(enc_input, copy.deepcopy(enc_input), event_lens)

            total_loss.append(loss.data.cpu().numpy().tolist())
            loss.backward()
            optimizer.step()
            gc.collect()
            torch.cuda.empty_cache()

        end_time = time.time()
        print("Time elapsed: {:.3f}".format(end_time - start_time))
        print("Loss: {}".format(sum(total_loss) / len(total_loss)))

        model.eval()
        with torch.no_grad():
            for set_info in ['train', 'dev']:
                NN = len(data_input[set_info])
                total_loss = []
                for idx in range(NN // args.batch_size):
                    enc_input = data_input[set_info][idx * args.batch_size:min(
                        (idx + 1) * args.batch_size, NN)]
                    tmp = gen_seed[set_info][idx * args.batch_size:min(
                        (idx + 1) * args.batch_size, NN)]
                    event_lens = [len(s) for s in tmp]

                    if len(enc_input) == 0:
                        continue

                    loss = model(enc_input, copy.deepcopy(enc_input),
                                 event_lens)

                    total_loss.append(loss.data.cpu().numpy().tolist())

                loss = sum(total_loss) / len(total_loss)
                print("Test on {} set:".format(set_info))
                print("\tLoss: {}".format(loss))
                if set_info == 'dev':
                    if best_dev_loss > loss:
                        best_dev_loss = loss
                        torch.save(model,
                                   os.path.join(save_model_path, "DevBest.pt"))

            for d, t in zip(gen_seed['dev'][:10], target['dev'][:10]):
                sent = model.generate(d)
                print("Target Event: ", tokenizer.decode(d))
                print("Generated Precondition: ", sent)
                print("Reference: ", tokenizer.decode(t))

    return
Example #20
def create_feature_map(img_map: dict, model: FeatureExtractor):
    feature_map = {}
    for img_index, img_path in tqdm(list(img_map.items()), desc="Extracting features.."):
        img = Image.open(img_path)
        feature_map[img_index] = model.extract_features(img)
    return feature_map
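A usage sketch for create_feature_map: img_map maps an arbitrary key to an image path, and extractor is assumed to be any FeatureExtractor instance exposing extract_features(); the paths are placeholders.

img_map = {0: 'images/cat.jpg', 1: 'images/dog.jpg'}
features = create_feature_map(img_map, extractor)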
        else:
            ts.insert(corrupt_idx, lambda img: PIL.Image.fromarray(corrupt(np.array(img), corrupt_level, None, int(corrupt_type))))

    transform_test = transforms.Compose(ts)

    testset = ImageNet(root='/data/public/rw/datasets/imagenet-pytorch', split='val', transform=transform_test)
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for _ in range(1):
        sss = sss.split(list(range(len(testset))), testset.targets)
    train_idx, valid_idx = next(sss)
    testset = Subset(testset, valid_idx)

    testloader = torch.utils.data.DataLoader(testset, batch_size=args.test_batch, shuffle=False, num_workers=32, pin_memory=True, drop_last=False)

    metric = Accumulator()
    dl_test = tqdm(testloader)
    data_id = 0
    tta_rule_cnt = [0] * tta_num
    for data, label in dl_test:
        data = data.view(-1, data.shape[-3], data.shape[-2], data.shape[-1])
        data = data.cuda()

        with torch.no_grad():
            preds = model_target(data)
            preds = torch.softmax(preds, dim=1)

        preds = preds.view(len(label), -1, preds.shape[-1])

        preds_merged = torch.mean(preds, dim=1)     # simple averaging
        # TODO : weighted average mean?
        # preds_merged = torch.max(preds, dim=1)[0]       # simple maximum peak
Example #22
def main():
    parser = argparse.ArgumentParser(
        description='XGB with Handcrafted Features')
    parser.add_argument('--save',
                        type=str,
                        default='XGB_leaky',
                        help='save_file_names')
    args = parser.parse_args()
    timestr = time.strftime("%Y%m%d-%H%M%S-")
    output_dir = '../output/' + time.strftime("%m%d")
    os.makedirs(output_dir, exist_ok=True)  # output_dir is written to later in this function

    print("Reading train features...")
    df_train = pd.read_csv(train_feature, encoding="ISO-8859-1")
    X_train_ab = df_train.iloc[:, 2:]
    # 	X_train_ab = X_train_ab.drop('euclidean_distance', axis=1)
    # 	X_train_ab = X_train_ab.drop('jaccard_distance', axis=1)

    print("Reading train material...")
    df_train = pd.read_csv(train_file)
    df_train = df_train.fillna(' ')

    print("Reading test material...")
    df_test = pd.read_csv(test_file)
    ques = pd.concat([df_train[['question1', 'question2']], \
        df_test[['question1', 'question2']]], axis=0).reset_index(drop='index')
    q_dict = defaultdict(set)
    for i in tqdm(range(ques.shape[0])):
        q_dict[ques.question1[i]].add(ques.question2[i])
        q_dict[ques.question2[i]].add(ques.question1[i])

    def q1_freq(row):
        return (len(q_dict[row['question1']]))

    def q2_freq(row):
        return (len(q_dict[row['question2']]))

    def q1_q2_intersect(row):
        return (len(
            set(q_dict[row['question1']]).intersection(
                set(q_dict[row['question2']]))))

    df_train['q1_q2_intersect'] = df_train.apply(q1_q2_intersect,
                                                 axis=1,
                                                 raw=True)
    df_train['q1_freq'] = df_train.apply(q1_freq, axis=1, raw=True)
    df_train['q2_freq'] = df_train.apply(q2_freq, axis=1, raw=True)

    df_test['q1_q2_intersect'] = df_test.apply(q1_q2_intersect,
                                               axis=1,
                                               raw=True)
    df_test['q1_freq'] = df_test.apply(q1_freq, axis=1, raw=True)
    df_test['q2_freq'] = df_test.apply(q2_freq, axis=1, raw=True)

    test_leaky = df_test.loc[:, ['q1_q2_intersect', 'q1_freq', 'q2_freq']]
    del df_test

    train_leaky = df_train.loc[:, ['q1_q2_intersect', 'q1_freq', 'q2_freq']]

    # explore
    stops = set(stopwords.words("english"))

    df_train['question1'] = df_train['question1'].map(
        lambda x: str(x).lower().split())
    df_train['question2'] = df_train['question2'].map(
        lambda x: str(x).lower().split())

    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist())

    words = [x for y in train_qs for x in y]
    counts = Counter(words)
    weights = {word: get_weight(count) for word, count in counts.items()}

    print('Building Features')
    X_train = build_features(df_train, stops, weights)
    X_train = pd.concat((X_train, X_train_ab, train_leaky), axis=1)
    y_train = df_train['is_duplicate'].values

    df_train1 = pd.read_csv(train_file)
    X_train1 = pd.concat((df_train1, X_train), axis=1)
    X_train1.to_csv(output_dir + '/' + timestr + 'train_extra_features.csv',
                    index=False)
    del df_train1, X_train1
    del df_train, X_train_ab, train_leaky
    print('Dumped train extra features to file ' + timestr +
          'train_extra_features.csv')

    X_train, X_valid, y_train, y_valid = train_test_split(X_train,
                                                          y_train,
                                                          test_size=0.1,
                                                          random_state=4242)

    #UPDownSampling
    print("Train Sampling...")
    pos_train = X_train[y_train == 1]
    neg_train = X_train[y_train == 0]
    X_train = pd.concat(
        (neg_train, pos_train.iloc[:int(0.8 * len(pos_train))], neg_train))
    y_train = np.array([0] * neg_train.shape[0] + [1] *
                       pos_train.iloc[:int(0.8 * len(pos_train))].shape[0] +
                       [0] * neg_train.shape[0])
    print(np.mean(y_train))
    del pos_train, neg_train

    print("Valid Sampling...")
    pos_valid = X_valid[y_valid == 1]
    neg_valid = X_valid[y_valid == 0]
    X_valid = pd.concat(
        (neg_valid, pos_valid.iloc[:int(0.8 * len(pos_valid))], neg_valid))
    y_valid = np.array([0] * neg_valid.shape[0] + [1] *
                       pos_valid.iloc[:int(0.8 * len(pos_valid))].shape[0] +
                       [0] * neg_valid.shape[0])
    print(np.mean(y_valid))
    del pos_valid, neg_valid

    params = {}
    params['objective'] = 'binary:logistic'
    params['eval_metric'] = 'logloss'
    params['eta'] = 0.02
    params['max_depth'] = 7
    params['subsample'] = 0.6
    params['base_score'] = 0.2
    # params['scale_pos_weight'] = 0.2

    print("DMatrix...")
    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_valid, label=y_valid)

    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    print("XGBoost training...")
    bst = xgb.train(params,
                    d_train,
                    2500,
                    watchlist,
                    early_stopping_rounds=50,
                    verbose_eval=50)
    print(log_loss(y_valid, bst.predict(d_valid)))
    bst.save_model(output_dir + '/' + timestr + args.save + '.mdl')

    print('Building Test Features')
    df_test = pd.read_csv(test_feature, encoding="ISO-8859-1")
    x_test_ab = df_test.iloc[:, 2:]
    # 	x_test_ab = x_test_ab.drop('euclidean_distance', axis=1)
    # 	x_test_ab = x_test_ab.drop('jaccard_distance', axis=1)

    df_test = pd.read_csv(test_file)
    df_test = df_test.fillna(' ')

    df_test['question1'] = df_test['question1'].map(
        lambda x: str(x).lower().split())
    df_test['question2'] = df_test['question2'].map(
        lambda x: str(x).lower().split())

    x_test = build_features(df_test, stops, weights)
    x_test = pd.concat((x_test, x_test_ab, test_leaky), axis=1)
    del x_test_ab, test_leaky
    df_test1 = pd.read_csv(test_file)
    x_test1 = pd.concat((df_test1, x_test), axis=1)
    x_test1.to_csv(output_dir + '/' + timestr + 'test_extra_features.csv',
                   index=False)
    del df_test1, x_test1
    print('Dumped test extra features to file ' + timestr +
          'test_extra_features.csv')

    d_test = xgb.DMatrix(x_test)
    p_test = bst.predict(d_test)
    sub = pd.DataFrame()
    sub['test_id'] = df_test['test_id']
    sub['is_duplicate'] = p_test
    sub.to_csv(output_dir + '/' + timestr + args.save + '.csv', index=False)
    print('Dumped inference to file ' + timestr + args.save + '.csv')
    print('Finished.')
    return (x_start, int(x_start + size))


if __name__ == '__main__':
    args = argsProcessor()
    dir = args.dataPath
    if (not os.path.isdir(args.outputFiles)):
        os.mkdir(args.outputFiles)
    import csv

    with open(args.outputFiles + 'gt.csv', 'a') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=',',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        for folder in tqdm(os.listdir(dir)):
            a = 0
            # print (str(folder))
            if (os.path.isdir(dir + "/" + folder)):
                for file in tqdm(os.listdir(dir + "/" + folder)):
                    images_dir = dir + "/" + folder + "/" + file
                    if (os.path.isdir(images_dir)):

                        list_gt = []
                        tree = ET.parse(images_dir + "/" + file + ".gt")
                        root = tree.getroot()
                        for a in root.iter("frame"):
                            list_gt.append(a)

                        # print (list_gt)
                        for image in os.listdir(images_dir):
Example #24
'''
Created on Jun 1, 2017

@author: tonyq
'''
import pandas as pd
from tqdm import tqdm
import csv


train = pd.read_csv("../output/0604/20170604-165432-XGB_leaky.csv")
totalen = len(train.is_duplicate)
print('Total size: ', totalen)

fulllist = zip(train.test_id, train.is_duplicate)
length = len(train.is_duplicate)
del train

with open("../output/0604/20170604-165432-XGB_leaky.clean.csv", "w", encoding='utf8') as fwrt:
    writer_sub = csv.writer(fwrt)
    writer_sub.writerow(['test_id','is_duplicate'])
    for (theid, dup) in tqdm(fulllist, total=length):
        writer_sub.writerow([theid, dup])
Example #25
def train(args):

    out_dir = os.path.join(args.logdir, args.experiment)

    # setup tensorboard logging
    if args.tensorboard_logging:
        writer = SummaryWriter(out_dir)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # load data
    data = load_data('../data/peko_all.jsonl')
    del data['test']

    # load transformer tokenizer, model
    model_name = 'xlnet-base-cased'
    tokenizer = XLNetTokenizer.from_pretrained(model_name, pad_token='<PAD>')
    tokenizer.padding_side = "right"
    encoder = XLNetModel.from_pretrained(model_name)

    # apply tokenizer to data and re-align the token indices
    paragraphs = {}
    relations = {}
    labels = {}
    for set_info, raw_data in data.items():
        paragraphs[set_info], relations[set_info], labels[set_info] = prepare(
            raw_data, tokenizer)

    # model instantiation
    embedding_dim = 768
    model = Model(tokenizer, encoder, embedding_dim, 2)
    if model.use_cuda:
        model.cuda()

    # batchify
    batch_data = batchify(paragraphs['train'], relations['train'],
                          labels['train'], args.batch_size)

    weight = torch.FloatTensor([
        sum(labels['train']) / (len(labels['train']) - sum(labels['train'])),
        1.
    ])
    if model.use_cuda:
        weight = weight.cuda()

    loss_function = nn.NLLLoss(weight=weight, reduction='none')

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=1e-8)

    n_params = sum([np.prod(p.size()) for p in model.parameters()])
    print("#parameters: {}".format(n_params))

    dev_best = 0

    # train the model
    N = len(batch_data)
    for epoch in range(1, args.epochs + 1):
        print("Epoch {}:".format(epoch))
        start_time = time.time()
        total_loss = []
        batch_idxs = np.random.permutation(N)
        line_tqdm = tqdm(batch_idxs, dynamic_ncols=True)
        model.train()
        for batch_idx in line_tqdm:

            para, relation, label = batch_data[batch_idx]

            model.zero_grad()

            score = model(para, relation)
            target = torch.LongTensor(label)

            target = Variable(target)
            if model.use_cuda:
                target = target.cuda()

            loss = loss_function(score, target)
            total_loss.extend(loss.data.cpu().numpy().tolist())
            loss.mean().backward()
            optimizer.step()
            gc.collect()
            torch.cuda.empty_cache()

        end_time = time.time()
        print("Time Elapsed: {:.3f}".format(end_time - start_time))
        print("train Loss: {}".format(sum(total_loss) / len(total_loss)))

        if args.tensorboard_logging:
            writer.add_histogram("losses",
                                 np.asarray(total_loss),
                                 epoch,
                                 bins='auto')
            writer.add_scalar("TRAIN/loss",
                              sum(total_loss) / len(total_loss), epoch)
        for set_info in ['train', 'dev']:
            print("Test on {} set".format(set_info))

            with torch.no_grad():
                acc, precision, recall, f1, loss = test(
                    model, paragraphs[set_info], relations[set_info],
                    labels[set_info], loss_function)
            if args.tensorboard_logging:
                writer.add_scalar("{}/Accuracy".format(set_info.upper()), acc,
                                  epoch)
                writer.add_scalar("{}/Precision".format(set_info.upper()),
                                  precision, epoch)
                writer.add_scalar("{}/Recall".format(set_info.upper()), recall,
                                  epoch)
                writer.add_scalar("{}/F1".format(set_info.upper()), f1, epoch)
                if set_info == 'dev':
                    writer.add_scalar("{}/Loss".format(set_info.upper()), loss,
                                      epoch)

            if set_info == 'dev':
                if f1 > dev_best:
                    print("Save Model...\n")
                    torch.save(model,
                               os.path.join(out_dir, 'baseline_best_model.pt'))
                    best_acc = acc
                    best_precision = precision
                    best_recall = recall
                    dev_best = f1

    print("Best Result:")
    print("\tAccuracy: {:0.5f}".format(best_acc))
    print("\tPrecision: {:0.5f}".format(best_precision))
    print("\tRecall: {:0.5f}".format(best_recall))
    print("\tF1: {:0.5f}".format(dev_best))

    return
Example #26
def run_bert_predict(input_data, pb_path):
    label_list = get_bert_labels()
    cat_to_id = []
    id_to_cat = []
    # id_to_cat, cat_to_id = read_labels(label_path)
    # _, vocab = read_vocab(vocab_path)
    # contents, y_test_cls = get_encoded_texts_and_labels(input_data, vocab, seq_length, cat_to_id)
    global lines
    lines = []
    batch_size_test = 64

    with open(input_data, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    contents = []
    y_test_cls = []
    y_label_cls = []

    for line in lines:
        contents.append(line.split('\t')[0])
        y_label_cls.append(line.split('\t')[1])

    for item in y_label_cls:
        y_test_cls.append(label_list.index(item.strip()))

    with tf.Graph().as_default():
        graph = tf.GraphDef()
        with open(pb_path, "rb") as f:
            graph.ParseFromString(f.read())
            tf.import_graph_def(graph, name="")
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            input_ids = sess.graph.get_tensor_by_name("input_ids:0")
            input_mask = sess.graph.get_tensor_by_name("input_mask:0")
            seg_ids = sess.graph.get_tensor_by_name("segment_ids:0")
            output_tensor_name = sess.graph.get_tensor_by_name(
                "loss/Softmax:0")

            # for line in test_data:
            #     prob = sess.run(output_tensor_name,
            #                     feed_dict={input_ids: np.reshape([line.input_ids], [1, FLAGS.max_seq_length]),
            #                                input_mask: np.reshape([line.input_mask], [1, FLAGS.max_seq_length]),
            #                                seg_ids: line.seg_ids})
            #     label_id = sess.run(tf.argmax(tf.nn.softmax(prob[0], name='softmax')))
            #     label = label_list[label_id]
            #     print("BERT class_id:{}, label: {}, prob:{}".format(label_id, label, prob[0][label_id]))
            #
            # # return prob[0]

            y_pred_cls = []
            for x_batch, y_batch in tqdm(
                    batch_iter_x_y(contents, y_test_cls, batch_size_test)):
                x_batch = process_unsgetext_for_batch(x_batch)
                feed_dict = {
                    input_ids:
                    np.reshape([i.input_ids for i in x_batch[:]],
                               [batch_size_test, seq_length]),
                    input_mask:
                    np.reshape([i.input_mask for i in x_batch[:]],
                               [batch_size_test, seq_length]),
                    seg_ids:
                    np.reshape([i.seg_ids for i in x_batch[:]],
                               [batch_size_test, seq_length])
                }

                y_pred_cls.extend(
                    np.argmax(sess.run(output_tensor_name, feed_dict=feed_dict),
                              axis=1))
                print(y_pred_cls)

            print('===writing log report ======')
            log_dir = os.path.join('.', 'bert-logs')
            if not os.path.exists(log_dir):
                os.makedirs(log_dir)
            log_path = os.path.join(log_dir, 'result.log')
            f = open(log_path, 'w', encoding='utf-8')

            with open(input_data, 'r', encoding='utf-8') as f_in:
                testdata = f_in.readlines()

            for i in tqdm(range(len(y_test_cls))):
                is_success = 'pass' if (y_pred_cls[i]
                                        == y_test_cls[i]) else 'fail'
                f.write(
                    str(testdata[i].strip()) + '\t' +
                    id_to_cat[y_pred_cls[i]] + '\t' + is_success + "\n")
            f.close()

            print('=====testing=====')
            target_idx = set(list(set(y_test_cls)) + list(set(y_pred_cls)))
            # map classification indices into class names (requires id_to_cat to be
            # populated, e.g. via the commented-out read_labels call above)
            target_names = [id_to_cat[idx] for idx in target_idx]
            print(
                metrics.classification_report(y_test_cls,
                                              y_pred_cls,
                                              target_names=target_names,
                                              digits=4))
Example #27
def run_epoch(loader, model, criterion, optimizer, epoch, tag):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    end = time.time()
    if optimizer:
        current_lr = get_learning_rate(optimizer)[0]
    else:
        current_lr = None

    tqdm_disable = bool(os.environ.get('TASK_NAME', ''))  # for KakaoBrain
    loader = tqdm(loader, disable=tqdm_disable)
    loader.set_description('[%s %04d/%04d]' % (tag, epoch, args.epochs))

    for i, (input, target) in enumerate(loader):
        # measure data loading time
        data_time.update(time.time() - end)

        input, target = input.cuda(), target.cuda()

        output = model(input)
        loss = criterion(output, target)

        # measure accuracy and record loss
        losses.update(loss.item(), input.size(0))

        if len(target.size()) == 1:
            err1, err5 = accuracy(output.data, target, topk=(1, 5))
            top1.update(err1.item(), input.size(0))
            top5.update(err5.item(), input.size(0))

        if optimizer:
            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        else:
            del loss, output

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        loader.set_postfix(lr=current_lr,
                           batch_time=batch_time.avg,
                           data_time=data_time.avg,
                           loss=losses.avg,
                           top1=top1.avg,
                           top5=top5.avg)

    if tqdm_disable:
        print('[%s %03d/%03d] %s' % (tag, epoch, args.epochs,
                                     dict(lr=current_lr,
                                          batch_time=batch_time.avg,
                                          data_time=data_time.avg,
                                          loss=losses.avg,
                                          top1=top1.avg,
                                          top5=top5.avg)))

    return top1.avg, top5.avg, losses.avg
Example #28
    def train(self, load_model=False, model_path=None):
        if load_model:
            if model_path is not None:
                self.load_weights(model_path)
        ## Training utterances
        all_input_ids, all_input_len, all_label_ids = convert_examples_to_features(
            self.train_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length)

        print('all input ids size: ', all_input_ids.size())
        num_train_batches = all_input_ids.size(0)
        num_train_steps = int(
            num_train_batches / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

        logger.info("***** training *****")
        logger.info("  Num examples = %d", len(self.train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids, all_input_len, all_label_ids = all_input_ids.to(DEVICE), all_input_len.to(
            DEVICE), all_label_ids.to(DEVICE)

        train_data = TensorDataset(all_input_ids, all_input_len, all_label_ids)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        all_input_ids_dev, all_input_len_dev, all_label_ids_dev = convert_examples_to_features(
            self.dev_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length)

        logger.info("***** validation *****")
        logger.info("  Num examples = %d", len(self.dev_examples))
        logger.info("  Batch size = %d", args.dev_batch_size)

        all_input_ids_dev, all_input_len_dev, all_label_ids_dev = \
            all_input_ids_dev.to(DEVICE), all_input_len_dev.to(DEVICE), all_label_ids_dev.to(DEVICE)

        dev_data = TensorDataset(all_input_ids_dev, all_input_len_dev, all_label_ids_dev)
        dev_sampler = SequentialSampler(dev_data)
        dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.dev_batch_size)

        logger.info("Loaded data!")

        if args.fp16:
            self.sumbt_model.half()
        self.sumbt_model.to(DEVICE)

        # ## Get domain-slot-type embeddings
        # slot_token_ids, slot_len = \
        #     get_label_embedding(self.processor.target_slot, args.max_label_length, self.tokenizer, DEVICE)

        # # for slot_idx, slot_str in zip(slot_token_ids, self.processor.target_slot):
        # #     self.idx2slot[slot_idx] = slot_str

        # ## Get slot-value embeddings
        # label_token_ids, label_len = [], []
        # for slot_idx, labels in zip(slot_token_ids, self.label_list):
        #     # self.idx2value[slot_idx] = {}
        #     token_ids, lens = get_label_embedding(labels, args.max_label_length, self.tokenizer, DEVICE)
        #     label_token_ids.append(token_ids)
        #     label_len.append(lens)
        #     # for label, token_id in zip(labels, token_ids):
        #     #     self.idx2value[slot_idx][token_id] = label

        # logger.info('embeddings prepared')

        # if USE_CUDA and N_GPU > 1:
        #     self.sumbt_model.module.initialize_slot_value_lookup(label_token_ids, slot_token_ids)
        # else:
        #     self.sumbt_model.initialize_slot_value_lookup(label_token_ids, slot_token_ids)

        def get_optimizer_grouped_parameters(model):
            param_optimizer = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01,
                 'lr': args.learning_rate},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0,
                 'lr': args.learning_rate},
            ]
            return optimizer_grouped_parameters

        if not USE_CUDA or N_GPU == 1:
            optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model)
        else:
            optimizer_grouped_parameters = get_optimizer_grouped_parameters(self.sumbt_model.module)

        t_total = num_train_steps
        scheduler = None  # only assigned on the non-fp16 path below; checked before stepping

        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.fp16_loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.fp16_loss_scale)

        else:
            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_proportion*t_total, num_training_steps=t_total)
        logger.info(optimizer)

        # Training code
        ###############################################################################

        print(torch.cuda.memory_allocated())

        logger.info("Training...")

        global_step = 0
        last_update = None
        best_loss = None
        model = self.sumbt_model
        if args.do_not_use_tensorboard:
            summary_writer = None
        else:
            summary_writer = SummaryWriter("./tensorboard_summary/logs_1214/")

        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            # Train
            model.train()
            tr_loss = 0
            nb_tr_examples = 0
            nb_tr_steps = 0

            for step, batch in enumerate(tqdm(train_dataloader)):
                batch = tuple(t.to(DEVICE) for t in batch)
                input_ids, input_len, label_ids = batch
                # print(input_ids.size())

                # Forward
                if N_GPU == 1:
                    loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
                else:
                    loss, _, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)

                    # average to multi-gpus
                    loss = loss.mean()
                    acc = acc.mean()
                    acc_slot = acc_slot.mean(0)

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                # Backward
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                # tensorboard logging
                if summary_writer is not None:
                    summary_writer.add_scalar("Epoch", epoch, global_step)
                    summary_writer.add_scalar("Train/Loss", loss, global_step)
                    summary_writer.add_scalar("Train/JointAcc", acc, global_step)
                    if N_GPU == 1:
                        for i, slot in enumerate(self.processor.target_slot):
                            summary_writer.add_scalar("Train/Loss_%s" % slot.replace(' ', '_'), loss_slot[i],
                                                      global_step)
                            summary_writer.add_scalar("Train/Acc_%s" % slot.replace(' ', '_'), acc_slot[i], global_step)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with the special warm-up schedule BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
                    if summary_writer is not None:
                        summary_writer.add_scalar("Train/LearningRate", lr_this_step, global_step)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    if scheduler is not None:
                        torch.nn.utils.clip_grad_norm_(optimizer_grouped_parameters, 1.0)
                    optimizer.step()
                    if scheduler is not None:
                        scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1


            # Perform evaluation on validation dataset
            model.eval()
            dev_loss = 0
            dev_acc = 0
            dev_loss_slot, dev_acc_slot = None, None
            nb_dev_examples, nb_dev_steps = 0, 0

            for step, batch in enumerate(tqdm(dev_dataloader, desc="Validation")):
                batch = tuple(t.to(DEVICE) for t in batch)
                input_ids, input_len, label_ids = batch
                if input_ids.dim() == 2:
                    input_ids = input_ids.unsqueeze(0)
                    input_len = input_len.unsqueeze(0)
                    label_ids = label_ids.unsqueeze(0)

                with torch.no_grad():
                    if N_GPU == 1:
                        loss, loss_slot, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)
                    else:
                        loss, _, acc, acc_slot, _ = model(input_ids, input_len, label_ids, N_GPU)

                        # average to multi-gpus
                        loss = loss.mean()
                        acc = acc.mean()
                        acc_slot = acc_slot.mean(0)

                num_valid_turn = torch.sum(label_ids[:, :, 0].view(-1) > -1, 0).item()
                dev_loss += loss.item() * num_valid_turn
                dev_acc += acc.item() * num_valid_turn

                if N_GPU == 1:
                    if dev_loss_slot is None:
                        dev_loss_slot = [l * num_valid_turn for l in loss_slot]
                        dev_acc_slot = acc_slot * num_valid_turn
                    else:
                        for i, l in enumerate(loss_slot):
                            dev_loss_slot[i] = dev_loss_slot[i] + l * num_valid_turn
                        dev_acc_slot += acc_slot * num_valid_turn

                nb_dev_examples += num_valid_turn


            dev_loss = dev_loss / nb_dev_examples
            dev_acc = dev_acc / nb_dev_examples

            if N_GPU == 1:
                dev_acc_slot = dev_acc_slot / nb_dev_examples

            # tensorboard logging
            if summary_writer is not None:
                summary_writer.add_scalar("Validate/Loss", dev_loss, global_step)
                summary_writer.add_scalar("Validate/Acc", dev_acc, global_step)
                if N_GPU == 1:
                    for i, slot in enumerate(self.processor.target_slot):
                        summary_writer.add_scalar("Validate/Loss_%s" % slot.replace(' ', '_'),
                                                  dev_loss_slot[i] / nb_dev_examples, global_step)
                        summary_writer.add_scalar("Validate/Acc_%s" % slot.replace(' ', '_'), dev_acc_slot[i],
                                                  global_step)

            dev_loss = round(dev_loss, 6)

            output_model_file = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "pytorch_model.bin")

            if last_update is None or dev_loss < best_loss:
                last_update = epoch
                best_loss = dev_loss
                best_acc = dev_acc
                if not USE_CUDA or N_GPU == 1:
                    torch.save(model.state_dict(), output_model_file)
                else:
                    torch.save(model.module.state_dict(), output_model_file)

                logger.info(
                    "*** Model Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d ***" % (
                        last_update, best_loss, best_acc, global_step))
            else:
                logger.info(
                    "*** Model NOT Updated: Epoch=%d, Validation Loss=%.6f, Validation Acc=%.6f, global_step=%d  ***" % (
                        epoch, dev_loss, dev_acc, global_step))

            if last_update + args.patience <= epoch:
                break
Example #29
'''
Created on Apr 23, 2017

@author: tonyq
'''
from tqdm import tqdm
import pickle as pkl

with open('contencVocab.pkl', 'rb') as vocab_file:
    trainSet = pkl.load(vocab_file)

with open('testVocab.pkl', 'rb') as vocab_file:
    testSet = pkl.load(vocab_file)

contentSet = trainSet.union(testSet)

with open('../dsk16g/glove.840B.300d.txt', 'r', encoding='utf8') as fhd:
    with open('../dsk16g/glove.840B.quoraVocab.300d.txt', 'w',
              encoding='utf8') as fwrt:
        for line in tqdm(fhd):
            if line.strip().split(' ')[0] in contentSet:
                fwrt.write(line)
Example #30
    def test(self, mode='dev', model_path=os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "pytorch_model.bin")):
        '''Testing function of TRADE (to be added)'''
        # Evaluation
        self.load_weights(model_path)

        if mode == 'test':
            eval_examples = self.test_examples
        elif mode == 'dev':
            eval_examples = self.dev_examples

        all_input_ids, all_input_len, all_label_ids = convert_examples_to_features(
            eval_examples, self.label_list, args.max_seq_length, self.tokenizer, args.max_turn_length)
        all_input_ids, all_input_len, all_label_ids = all_input_ids.to(DEVICE), all_input_len.to(
            DEVICE), all_label_ids.to(DEVICE)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.dev_batch_size)

        eval_data = TensorDataset(all_input_ids, all_input_len, all_label_ids)

        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.dev_batch_size)

        model = self.sumbt_model
        eval_loss, eval_accuracy = 0, 0
        eval_loss_slot, eval_acc_slot = None, None
        nb_eval_steps, nb_eval_examples = 0, 0

        accuracies = {'joint7': 0, 'slot7': 0, 'joint5': 0, 'slot5': 0, 'joint_rest': 0, 'slot_rest': 0,
                      'num_turn': 0, 'num_slot7': 0, 'num_slot5': 0, 'num_slot_rest': 0}

        for input_ids, input_len, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            # if input_ids.dim() == 2:
            #     input_ids = input_ids.unsqueeze(0)
            #     input_len = input_len.unsqueeze(0)
            #     label_ids = label_ids.unsuqeeze(0)

            with torch.no_grad():
                if not USE_CUDA or N_GPU == 1:
                    loss, loss_slot, acc, acc_slot, pred_slot = model(input_ids, input_len, label_ids, 1)
                else:
                    loss, _, acc, acc_slot, pred_slot = model(input_ids, input_len, label_ids, N_GPU)
                    nbatch = label_ids.size(0)
                    nslot = pred_slot.size(3)
                    pred_slot = pred_slot.view(nbatch, -1, nslot)

            accuracies = eval_all_accs(pred_slot, label_ids, accuracies)

            nb_eval_ex = (label_ids[:, :, 0].view(-1) != -1).sum().item()
            nb_eval_examples += nb_eval_ex
            nb_eval_steps += 1

            if not USE_CUDA or N_GPU == 1:
                eval_loss += loss.item() * nb_eval_ex
                eval_accuracy += acc.item() * nb_eval_ex
                if eval_loss_slot is None:
                    eval_loss_slot = [l * nb_eval_ex for l in loss_slot]
                    eval_acc_slot = acc_slot * nb_eval_ex
                else:
                    for i, l in enumerate(loss_slot):
                        eval_loss_slot[i] = eval_loss_slot[i] + l * nb_eval_ex
                    eval_acc_slot += acc_slot * nb_eval_ex
            else:
                eval_loss += sum(loss) * nb_eval_ex
                eval_accuracy += sum(acc) * nb_eval_ex

            # exit(1)

        eval_loss = eval_loss / nb_eval_examples
        eval_accuracy = eval_accuracy / nb_eval_examples
        if not USE_CUDA or N_GPU == 1:
            eval_acc_slot = eval_acc_slot / nb_eval_examples

        loss = None

        if not USE_CUDA or N_GPU == 1:
            result = {
                # 'num': '\t'.join([str(x) for x in model.num_labels]),
                'eval_loss': eval_loss,
                'eval_accuracy': eval_accuracy,
                'loss': loss,
                'eval_loss_slot': '\t'.join([str(val / nb_eval_examples) for val in eval_loss_slot]),
                'eval_acc_slot': '\t'.join([str((val).item()) for val in eval_acc_slot]),
            }
        else:
            result = {'eval_loss': eval_loss,
                      'eval_accuracy': eval_accuracy,
                      'loss': loss
                      }

        out_file_name = 'eval_results'
        # if TARGET_SLOT == 'all':
        #     out_file_name += '_all'
        output_eval_file = os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "%s.txt" % out_file_name)

        if not USE_CUDA or N_GPU == 1:
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        out_file_name = 'eval_all_accuracies'
        with open(os.path.join(os.path.join(SUMBT_PATH, args.output_dir), "%s.txt" % out_file_name), 'w') as f:
            s = '{:^22s}:{:^22s}:{:^22s}:{:^22s}:{:^22s}:{:^22s}'.format(
                'joint acc (7 domain)', 
                'slot acc (7 domain)', 
                'joint acc (5 domain)', 
                'slot acc (5 domain)', 
                'joint restaurant', 
                'slot acc restaurant')
            f.write(s + '\n')
            print(s)
            s = '{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}:{:^22.5f}'.format(
                (accuracies['joint7'] / accuracies['num_turn']).item(),
                (accuracies['slot7'] / accuracies['num_slot7']).item(),
                (accuracies['joint5'] / accuracies['num_turn']).item(),
                (accuracies['slot5'] / accuracies['num_slot5']).item(),
                (accuracies['joint_rest'] / accuracies['num_turn']).item(),
                (accuracies['slot_rest'] / accuracies['num_slot_rest']).item()
            )
            f.write(s + '\n')
            print(s)