def fit(model, train_dataloader, test_dataloader, optimizer, num_train_steps, num_epochs=5):
    time_string = strftime("%a%d%b%Y-%H%M%S", gmtime())
    writer_name = os.path.join(ROOT, 'log', time_string)
    print(writer_name)
    writer = SummaryWriter(writer_name)
    n_gpu = 1
    global_step = 0
    output_model_file = os.path.join(ROOT, 'save', "finetuned_pytorch_model_3.bin")
    t_total = num_train_steps
    #     model_state_dict = torch.load(output_model_file)
    #     model = BertForSequenceClassification.from_pretrained(args['bert_model'], num_labels = num_labels, state_dict=model_state_dict)
    #     model.to(device)
    model.train()

    for i_ in trange(int(num_epochs), desc="Epoch"):
        logger.info('Starting epoch {}'.format(i_ + 1))
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            model.train()

            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args['gradient_accumulation_steps'] > 1:
                loss = loss / args['gradient_accumulation_steps']

            if args['fp16']:
                optimizer.backward(loss)
            else:
                loss.backward()

            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args['gradient_accumulation_steps'] == 0:
                #   scheduler.batch_step()
                # modify learning rate with special warm up BERT uses
                # lr_this_step = args['learning_rate'] * warmup_linear(global_step / t_total, args['warmup_proportion'])
                # for param_group in optimizer.param_groups:
                #     param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            if (step + 1) % 10 == 0:
                logger.info('Epoch {} Step {} Loss {}'.format(i_, step, tr_loss / nb_tr_steps))
                # Log against global_step so curves from different epochs do not overwrite each other
                writer.add_scalar('loss', loss.item(), global_step)
            if (step + 1) % 100 == 0:
                r = set_eval(model, test_dataloader)
                writer.add_scalar('evalloss', r['eval_loss'], global_step)
                writer.add_scalar('evalacc', r['eval_accuracy'], global_step)

        # logger.info('Eval after epoc {}'.format(i_ + 1))
        # Save a trained model
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        torch.save(model_to_save.state_dict(), output_model_file)
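# A minimal stand-alone sketch of the gradient-accumulation pattern used in fit() above.
# Everything here is hypothetical: `model`, `optimizer`, and `dataloader` are assumed to
# exist, and `model(inputs, labels)` is assumed to return a scalar loss, as with the old
# pytorch-pretrained-bert BertForSequenceClassification API used in this file.
def accumulation_sketch(model, optimizer, dataloader, accumulation_steps=4):
    model.train()
    optimizer.zero_grad()
    for step, (inputs, labels) in enumerate(dataloader):
        loss = model(inputs, labels)
        # Scale the loss so the summed gradients match one large batch
        (loss / accumulation_steps).backward()
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()       # one parameter update per `accumulation_steps` micro-batches
            optimizer.zero_grad()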
Example No. 2
def midis_to_sequences(folder, total_num=None, split_interval=1):
    """Create tensor from midi folder, using music21 and torch
    Args:
        folder: str, full or relative folder path
        total_num: int, if assigned, the first dimension of the output rensor will be
                   `min(num_of_midis, total_num)`
        split_interval: int
    Returns:
        tensor: list of torch.Tensor, [total_num, 3, sequence_length, 3]
                Different sequences will have different sequence_length
    """
    logger.info(
        "Prepare to create tensor from midis, folder {}".format(folder))

    # set up parameters
    filenames = os.listdir(folder)
    sequences = []
    # Enter main loop
    for count, filename in tqdm(enumerate(filenames), total=len(filenames)):
        if total_num is not None and count >= total_num:
            break
        filepath = os.path.join(folder, filename)
        sequences.append(midi_to_sequence(filepath, split_interval))
    logger.info("Creation completed.")
    return sequences
Example No. 3
def sequences_to_midis(sequences,
                       split_interval=1,
                       tune='E minor',
                       folder=None):
    """Create midis from Tensor, most compatitive verson
       NOW PATH IS FIXED TO outputs/
    Args:
        tensor: list of torch.Tensor, [total_num, 3, sequence_length, 3]
        split_interval: scalar, indicates the frequency of notes on reconstruction
        tune: str, 'X xxxxx', 'E minor', etc
        folder: str, folder to save midis
    Returns:
        None
    """
    assert isinstance(sequences, list), "wrong sequences class, got {}".format(
        sequences.__class__.__name__)
    logger.info("{} midis to create".format(len(sequences)))
    # create folder if None
    if folder is None:
        rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
        logger.warning(
            "folder name not assigned, will use current time {}".format(rq))
        folder = 'outputs/' + rq
    if not os.path.exists(folder):
        os.makedirs(folder)  # makedirs handles the nested 'outputs/<timestamp>' path
    # Enter main loop
    for num, sequence in tqdm(enumerate(sequences), total=len(sequences)):
        sequence_to_midi(sequence,
                         split_interval,
                         tune,
                         folder,
                         name=str(num) + '.mid')
    logger.info("Midi creation completed")
Example No. 4
def getBatches_test():
    logger.info("getBatches_test() started")
    sequences = midis_to_sequences('/home/hades/Documents/simple_data')
    batches, targets, batch_size = getBatches(sequences)
    assert isinstance(batches, list)
    assert isinstance(batches[0], torch.Tensor)
    assert batches[0].size()[0] == 3
    assert batches[0].size()[2] == batch_size
    assert batches[0].size()[3] == 42
    assert isinstance(targets, list)
    assert isinstance(targets[0], torch.Tensor)
    assert targets[0].size()[0] == 3
    assert targets[0].size()[2] == batch_size
    assert targets[0].size()[3] == 3
    tensors = sequences_to_tensors(sequences)
    batches, targets, batch_size = getBatches(tensors)
    assert isinstance(batches, list)
    assert isinstance(batches[0], torch.Tensor)
    assert batches[0].size()[0] == 3
    assert batches[0].size()[2] == batch_size
    assert batches[0].size()[3] == 42
    assert isinstance(targets, list)
    assert isinstance(targets[0], torch.Tensor)
    assert targets[0].size()[0] == 3
    assert targets[0].size()[2] == batch_size
    assert targets[0].size()[3] == 3
    logger.info("getBatches_test() passed")
Example No. 5
def jdPhone_spider(args, url, beginPage, endPage):
    if args.mode == 'update':
        global phone_num
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.5,zh;q=0.3',
            'Referer': 'https://www.jd.com/',
            'DNT': '1',
            #'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'TE': 'Trailers',
        }
        with open(args.update_file, 'rb') as f:
            update_list = pickle.load(f)
        for ID in update_list:
            new_url = 'https://item.jd.com/' + str(ID) + '.html'
            row = one_phone(args, new_url, headers)
            logger.info('succeeded in crawling %s features' % len(row))
            mysql_tool.auto_save_data(row, args.table_name)
            phone_num += 1
    else:
        for page in range(beginPage, endPage + 1):
            pn = page * 2 - 1
            page_num = page
            logger.info("crawlling No," + str(page) + "page")
            fullurl = url + "&page=" + str(pn)
            time.sleep(2)
            load_page(args, fullurl)
Example No. 6
def midis_to_sequences_test():
    logger.info("midis_to_sequences_test() started")
    sequences = midis_to_sequences('/home/hades/Documents/simple_data')
    assert isinstance(sequences, list)
    assert isinstance(sequences[0], torch.Tensor)
    assert sequences[0].size()[0] == 3
    assert sequences[0].size()[2] == 3
    logger.info("midis_to_sequences_test() passed")
Example No. 7
def save_test():
    logger.info("save_test() started")
    sequences = midis_to_sequences('/home/hades/Documents/simple_data')
    path = save(sequences, 'sequences.test')
    sequences = load(path)
    assert isinstance(sequences, list)
    assert isinstance(sequences[0], torch.Tensor)
    assert sequences[0].size()[0] == 3
    assert sequences[0].size()[2] == 3
    logger.info("save_test() passed")
    def get_train_examples(self, data_dir, size=-1):
        filename = 'train.csv'
        logger.info("LOOKING AT {}".format(os.path.join(data_dir, filename)))
        if size == -1:
            data_df = pd.read_csv(os.path.join(data_dir, filename), engine=None)
            return self._create_examples(data_df, "train")
        else:
            data_df = pd.read_csv(os.path.join(data_dir, filename))
            return self._create_examples(data_df.sample(size), "train")
Example No. 9
def load(path):
    """Useful wrapping of pickle method
    Args:
        path: full or relative path of the file to be loaded
    Returns:
        obj: any
    """
    with open(path, 'rb+') as f:
        obj = torch.load(f)
    logger.info("a {} object loaded from {}".format(obj.__class__.__name__,
                                                    path))
    return obj
Example No. 10
def save(obj, filename):
    """Useful wrapping of pickle method
    Args:
        obj: Any
        filename: str, obj will be saved to `current_path/saves/time/filename/`
    Returns:
        path: str, full path of the file saved
    """
    rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
    path = 'saves/' + rq + '/' + filename
    with open(path, 'wb+') as f:
        torch.save(obj, f)
    logger.info("a {} object saved to {}".format(obj.__class__.__name__, path))
    return path
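# A tiny round-trip sketch for save()/load() above, with hypothetical data and filename;
# it mirrors save_test() earlier in this listing but does not depend on any MIDI files.
def save_load_roundtrip_sketch():
    example = [torch.zeros(3, 8, 3)]           # stand-in for a list of sequence tensors
    path = save(example, 'sequences.example')  # written under saves/<timestamp>/
    restored = load(path)
    assert torch.equal(example[0], restored[0])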
def set_eval(model, eval_dataloader):
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args["no_cuda"] else "cpu")
    count = 0
    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            tmp_eval_loss = model(input_ids, segment_ids, input_mask,
                                  label_ids)
            logits = model(input_ids, segment_ids, input_mask)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        tmp_eval_accuracy = accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

        count += 1
        if count >= 4:  # evaluate on only a few batches to keep the periodic eval cheap
            break

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples

    #     loss = tr_loss/nb_tr_steps if tr_loss else None
    result = {
        'eval_loss': eval_loss,
        'eval_accuracy': eval_accuracy,
        'global_step': 0
    }
    #               'loss': loss}

    output_eval_file = os.path.join(args['output_dir'], "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
    return result
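# set_eval() above calls an accuracy() helper that is not shown in this listing. A plausible
# sketch, assuming `logits` is a numpy array of shape [batch, num_labels] and `label_ids`
# an integer array: it returns a *count* of correct predictions, which matches the later
# division by nb_eval_examples in set_eval().
import numpy as np

def accuracy(logits, label_ids):
    preds = np.argmax(logits, axis=1)
    return np.sum(preds == label_ids)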
Example No. 12
def load_page(args, url):
    global mysql_tool
    global phone_num
    global tmp_dict
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.5,zh;q=0.3',
        'Referer': 'https://www.jd.com/',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'TE': 'Trailers',
    }

    params = (
        ('keyword', '手机'),
        ('enc', 'utf-8'),
        ('wq', '手机'),
        ('pvid', '70b2126fcf3246ce9f32710d41799ede'),
    )

    response = requests.get(url, headers=headers, params=params)
    html = response.content
    content = etree.HTML(html)
    content_list = content.xpath(
        '//div[@class="gl-i-wrap"]/div[@class="p-img"]/a/@href')

    for i in range(1, 31):
        try:
            result = re.split(r":", content_list[i - 1])[1]
            content_list[i - 1] = result
        except Exception as e:
            continue
    for j in content_list:
        new_url = "http:" + j
        logger.info('trying to crawl phone No. %s info...' % phone_num)
        phone_num += 1
        row = one_phone(args, new_url, headers)
        print(len(row))
        if args.mode == 'debug':
            tmp_dict[UNIKEY] = row
        else:
            if phone_num == 1:
                mysql_tool.auto_create_table(row, 'jd_phone_raw')
            mysql_tool.auto_save_data(row, 'jd_phone_raw')
            phone_num += 1
def convert_single(single_text,
                   max_seq_length=None,
                   tokenizer=None) -> InputFeatures:
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-uncased', do_lower_case=args['do_lower_case'])
    if max_seq_length is None:
        max_seq_length = args['max_seq_length']

    tokens_a = tokenizer.tokenize(single_text)

    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[:(max_seq_length - 2)]

    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_ids = [0] * len(tokens)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    logger.info("*** Example ***")
    logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
    logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
    logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
    logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))

    return InputFeatures(input_ids=input_ids,
                         input_mask=input_mask,
                         segment_ids=segment_ids,
                         label_ids=0)
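# A minimal inference sketch (not part of the original code) showing how convert_single()
# can feed a one-off prediction; `model` is assumed to be the fine-tuned
# BertForSequenceClassification and `device` the torch.device set up elsewhere in this file.
def predict_single_sketch(model, text):
    feats = convert_single(text)
    input_ids = torch.tensor([feats.input_ids], dtype=torch.long).to(device)
    input_mask = torch.tensor([feats.input_mask], dtype=torch.long).to(device)
    segment_ids = torch.tensor([feats.segment_ids], dtype=torch.long).to(device)
    model.eval()
    with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask)  # no labels -> logits, as in set_eval()
    return int(logits.argmax(dim=-1).item())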
def train():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=args['do_lower_case'])
    train_examples = None
    num_train_steps = None

    processors = {
        "news_cat_label": LabelTextProcessor
    }
    processor = processors[args['task_name'].lower()](args['data_dir'])
    if args['do_train']:
        train_examples = processor.get_train_examples(args['full_data_dir'], size=args['train_size'])
        num_train_steps = int(
            len(train_examples) / args['train_batch_size'] / args['gradient_accumulation_steps'] * args[
                'num_train_epochs'])
    eval_examples = processor.get_dev_examples(args['data_dir'], size=args['val_size'])

    train_features = convert_examples_to_features(train_examples, label_list, args['max_seq_length'], tokenizer)

    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args['train_batch_size'])

    eval_features = convert_examples_to_features(
        eval_examples, label_list, args['max_seq_length'], tokenizer)
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    # Run prediction for full data
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args['eval_batch_size'])

    model = BertForSequenceClassification.from_pretrained(args['bert_model'], num_labels=len(label_list))
    model = model.to(device)
    _, optimizer = opt.get_opt(model, num_train_steps)

    fit(model, train_dataloader, eval_dataloader, optimizer, num_train_steps, args['num_train_epochs'])
    def train(self, episodes=-1):

        # Hacky...
        if episodes < 0:
            episodes = self.max_episodes

        episode = 0
        all_rewards = []

        try:
            # Set this to "while True" for genuine convergence
            for e in range(episodes):

                # Start episode
                episode_reward = 0
                self.episode_count = e
                t = 0
                state = self.env.reset()
                state = np.reshape(state, [1, self.n_features])

                while True:
                    # self.env.render()

                    # Select action
                    action = self._select_action(state)

                    # Execute transition
                    next_state, reward, done, info = self.env.step(action)
                    episode_reward += reward
                    next_state = np.reshape(next_state, [1, self.n_features])

                    # Store experience tuple in memory
                    self.memory.append(
                        (state, action, reward, next_state, done))
                    state = next_state

                    # Replay using mini batch
                    self._update_Q()

                    # Copy learned Q function into target network
                    if t % self.net_replacement_freq == 0:
                        self.Q_ = clone_model(self.Q)
                        self.Q_.set_weights(self.Q.get_weights())

                    t += 1
                    if done:
                        break

                all_rewards.append(episode_reward)
                sma = np.mean(all_rewards[-SMA_WINDOW:])
                logger.info('{},{},{},{}'.format(episode, episode_reward,
                                                 self.epsilon, sma))
                episode += 1

                # Episodic epsilon decay (default schedule)
                if not self.epsilon_special:
                    if self.epsilon > self.epsilon_min:
                        self.epsilon *= self.epsilon_decay_rate

                # Special case: stepwise epsilon decay
                else:
                    if episode < 150:
                        self.epsilon = 1.0
                    elif episode < 250:
                        self.epsilon = 0.5
                    else:
                        self.epsilon = 0.0

                # Convergence
                if sma >= 200:
                    self.solved = True
                    break

        except KeyboardInterrupt:
            logger.info('KeyboardInterrupt: halting training')
        finally:
            plot(all_rewards)
            self._save_model()
            return all_rewards
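    # A short sketch of the target-network refresh performed every net_replacement_freq
    # steps above; `self.Q` is assumed to be a compiled Keras model. clone_model copies the
    # architecture only, so set_weights is needed to copy the learned parameters.
    # (The import is keras.models.clone_model or tensorflow.keras.models.clone_model,
    # depending on the Keras version in use.)
    def _refresh_target_sketch(self):
        Q_target = clone_model(self.Q)
        Q_target.set_weights(self.Q.get_weights())
        return Q_target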
Example No. 16
    args = parser.parse_args()
    init_logger(args.log_file)
    beginPage = int(args.begin)
    endPage = int(args.end)
    url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8"
    global mysql_tool
    mysql_tool = mysql_tool(args,
                            'localhost',
                            'root',
                            '20192019_yhf',
                            3306,
                            'spiders',
                            logger=logger)
    global phone_num
    phone_num = 1
    global tmp_dict
    tmp_dict = {}
    global page_num
    page_num = args.begin
    if args.mode in ['crawl', 'sparse_table']:
        while page_num <= args.end:
            try:
                jdPhone_spider(args, url, beginPage, endPage)
            except Exception as e:
                logger.info('connection failed (%s), trying to connect again!' % e)
    else:
        jdPhone_spider(args, url, beginPage, endPage)
    if args.mode == 'debug':
        with open('result/debug.pk', 'wb') as f:
            pickle.dump(tmp_dict, f)
Example No. 17
    def train(self, env):

        # Track rewards
        all_rewards = []

        try:
            while True:
                state = env.reset()
                state = [
                    round(s, PRECISION)
                    for s in state[:CONTINUOUS_OBSERVATIONS]
                ]
                action = self._query_initial(
                    state, env.discrete_obs_space
                )  # set the state and get first action
                episode_return = 0
                steps = 0
                total_Q_update = 0

                while True:
                    new_state, reward, done, details = env.step(action)
                    new_state = [
                        round(s, PRECISION)
                        for s in new_state[:CONTINUOUS_OBSERVATIONS]
                    ]
                    # env.render()
                    episode_return += reward

                    # if steps % 10 == 0:
                    #     print([x for x in new_state])
                    #     print("step {} total_reward {:+0.2f}".format(steps, episode_return))
                    steps += 1

                    if done:
                        break

                    action, delta_Q = self._query(state, action, new_state,
                                                  reward,
                                                  env.discrete_obs_space)
                    total_Q_update += delta_Q

                all_rewards.append(episode_return)

                sma = np.mean(all_rewards[-SMA_WINDOW:])

                if self.episodes % 10 == 0:
                    if self.episodes >= SMA_WINDOW:
                        logger.info(
                            'Episode {} | Reward = {} | SMA = {}'.format(
                                self.episodes, episode_return, sma))
                    else:
                        logger.info('Episode {} | Reward = {}'.format(
                            self.episodes, episode_return))

                # Convergence
                if self.episodes > SMA_WINDOW and sma >= SOLUTION_THRESHOLD:
                    break

                self.episodes += 1
        except KeyboardInterrupt:
            logger.warning('KeyboardInterrupt - halting training')

        plot(all_rewards,
             title='Rewards per episode',
             xlab='Episode',
             ylab='Reward')
        logger.info('{}% of actions were random'.format(
            round(100. * self.random_actions / self.total_actions, 2)))
Example No. 18
def sequences_to_midis_test():
    logger.info("sequences_to_midis_test() started")
    sequences = midis_to_sequences('/home/hades/Documents/simple_data')
    sequences_to_midis(sequences)
    logger.info("sequences_to_midis_test() passed")
def convert_examples_to_features(examples, label_list, max_seq_length,
                                 tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        labels_ids = label_map[example.labels]
        if ex_index < 0:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x)
                                                    for x in input_ids]))
            logger.info("input_mask: %s" %
                        " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s" %
                        " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %s)" % (example.labels, labels_ids))

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_ids=labels_ids))
    return features
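# A small worked illustration (hypothetical tokens) of the [CLS]/[SEP]/segment_ids layout
# described in the comments inside convert_examples_to_features(), before any padding:
def segment_layout_sketch():
    tokens_a = ["is", "this", "jack", "##son", "##ville", "?"]
    tokens_b = ["no", "it", "is", "not", "."]
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
    segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
    # tokens      -> [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    # segment_ids ->   0   0   0    0    0      0     0   0    1  1  1   1  1   1
    assert len(tokens) == len(segment_ids)
    return tokens, segment_ids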