예제 #1
0
def Encoding_table(table_data, data_type, id):
    """Encode every cell of each table with BERT and save the embeddings.

    Each table is flattened row-by-row into a list of cell strings, the
    cells are tokenized and pushed through a DDP-wrapped BERT encoder, and
    the list of per-table hidden-state tensors is saved under
    ``{args.data}/processed_datasets/embedding_data/{data_type}_table_{id}.pt``.
    """
    args = parse_opt()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.to(device)
    model = DDP(model)

    # Flatten each table into a row-major list of its cell strings.
    flattened = [[cell for row in table for cell in row]
                 for table in table_data]

    table_out = []
    for cells in flattened:
        tokens = tokenizer(cells,
                           padding='max_length',
                           truncation=True,
                           max_length=args.table_max_len,
                           return_tensors='pt')
        tokens.to(device)
        table_out.append(model(**tokens).last_hidden_state)

    torch.save(
        table_out,
        args.data + "/processed_datasets/embedding_data/{}_table_{}.pt".format(
            data_type, id))
예제 #2
0
def Encoding_sentence(sent_data, data_type, id):
    """Encode support and statement sentences with BERT and save both.

    Fields 1 and 2 of each record in ``sent_data`` hold the support and
    statement text respectively.  Both lists are tokenized in one batch
    each, encoded with a DDP-wrapped BERT model, and the hidden states
    are saved as ``{data_type}_support_{id}.pt`` and
    ``{data_type}_statement_{id}.pt`` under the embedding_data directory.
    """
    args = parse_opt()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.to(device)
    model = DDP(model)

    # Pull the support (index 1) and statement (index 2) strings apart.
    supports = [record[1] for record in sent_data]
    statements = [record[2] for record in sent_data]

    support_tokens = tokenizer(supports,
                               padding='max_length',
                               truncation=True,
                               max_length=args.sent_max_len,
                               return_tensors='pt')
    statement_tokens = tokenizer(statements,
                                 padding='max_length',
                                 truncation=True,
                                 max_length=args.sent_max_len,
                                 return_tensors='pt')
    support_tokens.to(device)
    statement_tokens.to(device)

    support_out = model(**support_tokens).last_hidden_state
    statement_out = model(**statement_tokens).last_hidden_state

    torch.save(
        support_out, args.data +
        "/processed_datasets/embedding_data/{}_support_{}.pt".format(
            data_type, id))
    torch.save(
        statement_out, args.data +
        "/processed_datasets/embedding_data/{}_statement_{}.pt".format(
            data_type, id))
예제 #3
0
def Encoding_column(column_data, data_type, id):
    """Encode each table's column headers with BERT and save the results.

    Every entry of ``column_data`` (a list of column strings for one
    table) is tokenized and encoded separately; the resulting list of
    hidden-state tensors is saved as ``{data_type}_column_{id}.pt``
    under the embedding_data directory.
    """
    args = parse_opt()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.to(device)
    model = DDP(model)

    # Shallow copy of the per-table column lists.
    columns = list(column_data)

    column_out = []
    for column in columns:
        tokens = tokenizer(column,
                           padding='max_length',
                           truncation=True,
                           max_length=args.table_max_len,
                           return_tensors='pt')
        tokens.to(device)
        column_out.append(model(**tokens).last_hidden_state)

    torch.save(
        column_out, args.data +
        "/processed_datasets/embedding_data/{}_column_{}.pt".format(
            data_type, id))
예제 #4
0
                scores = model(sentences,
                               words_per_sentence)  # (batch_size, n_classes)

            # accuracy
            _, predictions = scores.max(dim=1)  # (n_documents)
            correct_predictions = torch.eq(predictions, labels).sum().item()
            accuracy = correct_predictions / labels.size(0)

            # keep track of metrics
            accs.update(accuracy, labels.size(0))

        # final test accuracy
        print('\n * TEST ACCURACY - %.1f percent\n' % (accs.avg * 100))


if __name__ == '__main__':
    config = parse_opt()

    # load model
    # Rebuild the trained classifier from its checkpoint; only the model
    # itself is used here, the remaining checkpoint fields are discarded.
    checkpoint_path = os.path.join(config.checkpoint_path,
                                   config.checkpoint_basename + '.pth.tar')
    model, _, _, _, _, _ = load_checkpoint(checkpoint_path, device)
    model = model.to(device)
    # Switch to inference mode (disables dropout / batch-norm updates).
    model.eval()

    # load test data
    test_loader = load_data(config, 'test')

    # Run the evaluation loop over the test split.
    test(model, config.model_name, test_loader)
    预测音频情感

    Args:
        config: 配置项
        audio_path (str): 要预测的音频路径
        model: 加载的模型
    """

    # utils.play_audio(audio_path)

    if config.feature_method == 'o':
        # 一个玄学 bug 的暂时性解决方案
        of.get_data(config, audio_path, train=False)
        test_feature = of.load_feature(config, train=False)
    elif config.feature_method == 'l':
        test_feature = lf.get_data(config, audio_path, train=False)

    result = model.predict(test_feature)
    result_prob = model.predict_proba(test_feature)
    print('Recogntion: ', config.class_labels[int(result)])
    print('Probability: ', result_prob)
    utils.radar(result_prob, config.class_labels)


if __name__ == '__main__':
    # Hard-coded sample clip (CASIA corpus, "angry" class) used for a
    # one-off prediction demo; replace with your own audio file path.
    audio_path = '/Users/zou/Renovamen/Developing/Speech-Emotion-Recognition/datasets/CASIA/angry/201-angry-liuchanhg.wav'

    config = utils.parse_opt()
    # Load the trained emotion-recognition model described by the config.
    model = models.load(config)
    predict(config, audio_path, model)
예제 #6
0
def Encoding(dataset, data_type):
    """Pre-encode an entire dataset (supports, statements, tables, columns)
    with BERT and save the embeddings in chunks.

    For every example the support/statement sentences, the flattened table
    cells, and the column headers are encoded with a DDP-wrapped BERT
    model.  The dataset is processed in ``len(split_list) - 1`` chunks;
    for chunk ``j`` a list of ``[support, statement, table, column,
    label]`` tensor groups is saved as
    ``{args.data}/processed_datasets/embedding_data/{data_type}_part0_{j}.pt``.

    Args:
        dataset: map-style dataset whose ``__getitem__`` returns a dict
            with keys ``support``, ``statement``, ``label``, ``column``
            and ``table`` (the table being a list of rows of cells).
        data_type (str): split name used in the output file names.
    """
    print(20 * "=" + "pre encoding" + 10 * '=')
    args = parse_opt()
    num_data = dataset.__len__()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.to(device)
    model = DDP(model)

    supports = []
    statements = []
    tables = []
    columns = []
    labels = []
    for i in tqdm(range(num_data)):
        data_dict = dataset.__getitem__(i)
        supports.append(data_dict['support'])
        statements.append(data_dict['statement'])
        labels.append(data_dict['label'])
        columns.append(data_dict['column'])
        table = data_dict['table']
        # Flatten the table row-by-row into one list of cell strings.
        table_cell_list = []
        for row in range(len(table)):
            for col in range(len(table[row])):
                table_cell_list.append(table[row][col])
        tables.append(table_cell_list)

    # Chunk boundaries: args.batch - 1 chunks, the last one absorbing the
    # remainder up to num_data.
    # NOTE(review): args.batch == 1 yields split_list == [num_data] and
    # therefore zero chunks — confirm callers never pass batch=1.
    split_data_num = num_data // (args.batch)
    split_list = []
    for n in range(args.batch - 1):
        split_list.append(split_data_num * n)
    split_list.append(num_data)

    for j in tqdm(range(len(split_list) - 1)):
        support_tokens = tokenizer(supports[split_list[j]:split_list[j + 1]],
                                   padding='max_length',
                                   truncation=True,
                                   max_length=args.sent_max_len,
                                   return_tensors='pt')
        statement_tokens = tokenizer(statements[split_list[j]:split_list[j +
                                                                         1]],
                                     padding='max_length',
                                     truncation=True,
                                     max_length=args.sent_max_len,
                                     return_tensors='pt')
        support_tokens.to(device)
        statement_tokens.to(device)
        support_out = model(**support_tokens).last_hidden_state
        statement_out = model(**statement_tokens).last_hidden_state

        # Columns are encoded one example at a time (variable lengths).
        column_out = []
        for column in tqdm(columns[split_list[j]:split_list[j + 1]]):
            column_tokens = tokenizer(column,
                                      padding='max_length',
                                      truncation=True,
                                      max_length=args.table_max_len,
                                      return_tensors='pt')
            column_tokens.to(device)
            out = model(**column_tokens).last_hidden_state
            column_out.append(out)

        # Tables likewise, one flattened cell list per example.
        table_out = []
        for table in tqdm(tables[split_list[j]:split_list[j + 1]]):
            table_tokens = tokenizer(table,
                                     padding='max_length',
                                     truncation=True,
                                     max_length=args.table_max_len,
                                     return_tensors='pt')
            table_tokens.to(device)
            out = model(**table_tokens).last_hidden_state
            table_out.append(out)

        data = []
        mini_num = len(support_out)
        for m in range(mini_num):
            # BUG FIX: build a fresh list for each example.  The original
            # created `min_data` once before the loop, so every entry of
            # `data` aliased the same ever-growing list instead of holding
            # one example's five tensors.
            min_data = [
                support_out[m],
                statement_out[m],
                table_out[m],
                column_out[m],
                torch.tensor(labels[m]),
            ]
            data.append(min_data)

        torch.save(
            data, args.data +
            "/processed_datasets/embedding_data/{}_part0_{}.pt".format(
                data_type, j))
예제 #7
0
    datapath = '/home/DATA/TabFact'
    sent_data, table_ids, table_data = read_dataset(datapath, data_type)
    num_data = len(sent_data)
    labels = []
    for i in range(num_data):
        labels.append(sent_data[i][-1])
    with open(
            datapath +
            '/processed_datasets/embedding_data/{}_label.json'.format(
                data_type), 'w') as f:
        json.dump(labels, f)
    f.close()


if __name__ == "__main__":
    args = parse_opt()
    '''
    write_dataset("train")
    write_dataset("test")
    write_dataset("dev")
    write_dataset("example")
    '''
    #write_embedding_dataset("example")
    '''
    for i in tqdm(range(512)):
        write_embedding_sent_dataset('train', i)
    write_embedding_sent_dataset_end('train')
    '''
    '''
    for j in tqdm(range(512)):
        write_embedding_table_dataset('train', j)