def get_bert_pair_single_features(FLAGS,
								tokenizer, 
								query, 
								candidate, 
								max_seq_length):

	tokens_a = tokenizer.tokenize(full2half(query))
	tokens_b = tokenizer.tokenize(full2half(candidate))

	tf_data_utils._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)

	def get_input(input_tokens_a, input_tokens_b):
		tokens = []
		segment_ids = []
		tokens.append("[CLS]")
		segment_ids.append(0)

		for token in input_tokens_a:
			tokens.append(token)
			segment_ids.append(0)
		tokens.append("[SEP]")
		segment_ids.append(0)

		for token in input_tokens_b:
			tokens.append(token)
			segment_ids.append(1)
		tokens.append("[SEP]")
		segment_ids.append(1)

		input_ids = tokenizer.convert_tokens_to_ids(tokens)
		input_mask = [1] * len(input_ids)

		# Zero-pad up to the sequence length.
		while len(input_ids) < max_seq_length:
			input_ids.append(0)
			input_mask.append(0)
			segment_ids.append(0)

		return [tokens, input_ids, 
				input_mask, segment_ids]

	[tokens_a_,
	input_ids_a, 
	input_mask_a, 
	segment_ids_a] = get_input(tokens_a, tokens_b)

	[tokens_b_,
	input_ids_b, 
	input_mask_b, 
	segment_ids_b] = get_input(tokens_b, tokens_a)

	feature_dict = {"input_ids_a":input_ids_a,
			"input_mask_a":input_mask_a,
			"segment_ids_a":segment_ids_a,
			"input_ids_b":input_ids_b,
			"input_mask_b":input_mask_b,
			"segment_ids_b":segment_ids_b,
			"label_ids":[0]}

	return feature_dict
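
# tf_data_utils._truncate_seq_pair is used by every example in this file but is
# not shown. A minimal sketch of the usual BERT-style contract (truncate the
# pair in place, always trimming the currently longer side, until it fits) is
# given below; it is an illustration, not this module's actual implementation.
def _truncate_seq_pair_sketch(tokens_a, tokens_b, max_length):
	"""Truncates a token-list pair in place to at most `max_length` tokens."""
	while len(tokens_a) + len(tokens_b) > max_length:
		# Trim the longer sequence so both sides keep a comparable share
		# of the token budget.
		if len(tokens_a) > len(tokens_b):
			tokens_a.pop()
		else:
			tokens_b.pop()
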
def get_training_instance(document, max_seq_length):
    if not document:
        return []

    # Account for [CLS], [SEP], [SEP]
    max_num_tokens = max_seq_length - 3
    instances = []

    tokens_a = document['title']
    tokens_b = document['comment']
    label = document['label']

    tf_data_utils._truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        if token == '[UNK]':
            continue
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)
    for token in tokens_b:
        if token == '[UNK]':
            continue
        tokens.append(token)
        segment_ids.append(1)
    tokens.append("[SEP]")
    segment_ids.append(1)

    if label == "0":
        is_random_next = 0
    else:
        is_random_next = 1

    instance = TrainingInstance(tokens=tokens,
                                segment_ids=segment_ids,
                                label_ids=is_random_next)
    instances = [instance]
    return instances
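
# Hedged usage sketch for get_training_instance: the `document` dict is assumed
# to carry pre-tokenized 'title' and 'comment' lists plus a string 'label'
# ("0" maps to label_ids 0, anything else to 1). The toy values are made up.
toy_document = {
    "title": ["new", "phone", "review"],
    "comment": ["battery", "life", "is", "great"],
    "label": "1",
}
toy_instances = get_training_instance(toy_document, max_seq_length=16)
# toy_instances[0].tokens is [CLS] + title + [SEP] + comment + [SEP], and
# toy_instances[0].label_ids is 1 because the label is not "0".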
Example #3
def convert_classifier_examples_with_rule_to_features(examples, label_dict,
                                                      max_seq_length,
                                                      tokenizer, rule_detector,
                                                      output_file):

    feature_writer = ClassifierRuleFeatureWriter(output_file,
                                                 is_training=False)

    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" %
                            (ex_index, len(examples)))

        tokens_b = None
        if example.text_b:
            try:
                tokens_b = tokenizer.tokenize(example.text_b)
            except Exception:
                print("==token b error==", example.text_b, ex_index)
                break

        if tokens_b:
            tf_data_utils._truncate_seq_pair(tokens_a, tokens_b,
                                             max_seq_length - 3)

        else:
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[0:(max_seq_length - 2)]

        rule_id_lst = rule_detector.infer(tokens_a)

        tokens = []
        segment_ids = []
        rule_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        rule_ids.append(0)
        for index, token in enumerate(tokens_a):
            tokens.append(token)
            segment_ids.append(0)
            rule_ids.append(rule_id_lst[index])
        tokens.append("[SEP]")
        segment_ids.append(0)
        rule_ids.append(0)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            rule_ids.append(0)

        try:
            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length
            assert len(rule_ids) == max_seq_length
        except Exception:
            print(len(input_ids), max_seq_length, ex_index, "length error")
            break

        if len(example.label) == 1:
            label_id = label_dict[example.label[0]]
        else:
            label_id = [0] * len(label_dict)
            for item in example.label:
                label_id[label_dict[item]] = 1
        if ex_index < 5:
            print(tokens)
            tf.logging.info("*** Example ***")
            tf.logging.info("guid: %s" % (example.guid))
            tf.logging.info(
                "tokens: %s" %
                " ".join([tokenization.printable_text(x) for x in tokens]))
            tf.logging.info("input_ids: %s" %
                            " ".join([str(x) for x in input_ids]))
            tf.logging.info("input_mask: %s" %
                            " ".join([str(x) for x in input_mask]))
            tf.logging.info("segment_ids: %s" %
                            " ".join([str(x) for x in segment_ids]))
            tf.logging.info("rule_ids: %s" %
                            " ".join([str(x) for x in rule_ids]))
            tf.logging.info("label: {} (id = {})".format(
                example.label, label_id))

        feature = extra_mask_feature_classifier.InputFeatures(
            guid=example.guid,
            input_ids=input_ids,
            input_mask=input_mask,
            segment_ids=segment_ids,
            rule_ids=rule_ids,
            label_ids=label_id)
        feature_writer.process_feature(feature)
    feature_writer.close()
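
# The rule_detector used above only needs an infer(tokens) method that returns
# one integer rule id per token, with 0 meaning "no rule fired" (that is how
# [CLS], [SEP] and padding positions are filled). A hypothetical stand-in that
# follows this assumed contract, useful for exercising the writer:
class KeywordRuleDetector(object):
    """Toy detector: maps tokens found in a keyword table to a rule id, else 0."""

    def __init__(self, keyword_to_rule_id):
        self.keyword_to_rule_id = keyword_to_rule_id

    def infer(self, tokens):
        return [self.keyword_to_rule_id.get(token, 0) for token in tokens]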
Example #4
def convert_pair_order_classifier_examples_to_features(examples, label_dict,
                                                       max_seq_length,
                                                       tokenizer, output_file):

    feature_writer = PairClassifierFeatureWriter(output_file,
                                                 is_training=False)

    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" %
                            (ex_index, len(examples)))

        tokens_b = tokenizer.tokenize(example.text_b)

        tf_data_utils._truncate_seq_pair(tokens_a, tokens_b,
                                         max_seq_length - 3)

        def get_input(input_tokens_a, input_tokens_b):
            tokens = []
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)

            for token in input_tokens_a:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)

            for token in input_tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)

            return [tokens, input_ids, input_mask, segment_ids]

        [tokens_a_, input_ids_a, input_mask_a,
         segment_ids_a] = get_input(tokens_a, tokens_b)

        [tokens_b_, input_ids_b, input_mask_b,
         segment_ids_b] = get_input(tokens_b, tokens_a)

        try:
            assert len(input_ids_a) == max_seq_length
            assert len(input_mask_a) == max_seq_length
            assert len(segment_ids_a) == max_seq_length

            assert len(input_ids_b) == max_seq_length
            assert len(input_mask_b) == max_seq_length
            assert len(segment_ids_b) == max_seq_length

        except Exception:
            print(len(input_ids_a), input_ids_a, max_seq_length, ex_index,
                  "length error")
            break

        if len(example.label) == 1:
            label_id = label_dict[example.label[0]]
        else:
            label_id = [0] * len(label_dict)
            for item in example.label:
                label_id[label_dict[item]] = 1
        if ex_index < 5:
            tf.logging.info("*** Example ***")
            tf.logging.info("guid: %s" % (example.guid))
            tf.logging.info(
                "tokens_a: %s" %
                " ".join([tokenization.printable_text(x) for x in tokens_a_]))
            tf.logging.info("input_ids_a: %s" %
                            " ".join([str(x) for x in input_ids_a]))
            tf.logging.info("input_mask_a: %s" %
                            " ".join([str(x) for x in input_mask_a]))
            tf.logging.info("segment_ids_a: %s" %
                            " ".join([str(x) for x in segment_ids_a]))

            tf.logging.info(
                "tokens_b: %s" %
                " ".join([tokenization.printable_text(x) for x in tokens_b_]))
            tf.logging.info("input_ids_b: %s" %
                            " ".join([str(x) for x in input_ids_b]))
            tf.logging.info("input_mask_b: %s" %
                            " ".join([str(x) for x in input_mask_b]))
            tf.logging.info("segment_ids_b: %s" %
                            " ".join([str(x) for x in segment_ids_b]))

            tf.logging.info("label: {} (id = {})".format(
                example.label, label_id))

        feature = pair_data_feature_classifier.InputFeatures(
            guid=example.guid,
            input_ids_a=input_ids_a,
            input_mask_a=input_mask_a,
            segment_ids_a=segment_ids_a,
            input_ids_b=input_ids_b,
            input_mask_b=input_mask_b,
            segment_ids_b=segment_ids_b,
            label_ids=label_id)
        feature_writer.process_feature(feature)
    feature_writer.close()
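
# Hedged sketch of reading the pair features back from the TFRecord file,
# assuming PairClassifierFeatureWriter serializes each field as an int64 list
# under the same name it was given above and that the task is single-label;
# the key names and dtypes are assumptions, not taken from the writer's code.
def parse_pair_record(serialized_example, max_seq_length):
    name_to_features = {
        "input_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask_a": tf.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids_a": tf.FixedLenFeature([max_seq_length], tf.int64),
        "input_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask_b": tf.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids_b": tf.FixedLenFeature([max_seq_length], tf.int64),
        "label_ids": tf.FixedLenFeature([], tf.int64),
    }
    return tf.parse_single_example(serialized_example, name_to_features)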
Example #5
def convert_multichoice_examples_to_features(examples, label_dict,
                                             max_seq_length, tokenizer,
                                             output_file):

    feature_writer = MultiChoiceFeatureWriter(output_file, is_training=False)
    for (ex_index, example) in enumerate(examples):
        question_text = tokenizer.tokenize(example.question_text)
        context_text = tokenizer.tokenize(example.doc_tokens)

        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" %
                            (ex_index, len(examples)))

        question_context = question_text + context_text
        choice_token_ids = []
        choice_segment_ids = []
        choice_mask = []
        choice_tokens = []
        for answer in example.answer_choice:
            answer_text = tokenizer.tokenize(answer)
            # Truncate a per-choice copy so that one long answer does not
            # shorten the shared question/context for the remaining choices.
            choice_context = list(question_context)
            tf_data_utils._truncate_seq_pair(choice_context, answer_text,
                                             max_seq_length - 3)

            tokens = []
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)

            for token in choice_context:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)

            for token in answer_text:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

            choice_token_ids.extend(input_ids)
            choice_segment_ids.extend(segment_ids)
            choice_mask.extend(input_mask)
            choice_tokens.extend(tokens)

        assert len(choice_token_ids) == max_seq_length * len(
            example.answer_choice)

        if ex_index < 5:
            tf.logging.info("*** Example ***")
            tf.logging.info("tokens: {}".format(choice_token_ids))
            tf.logging.info("choice: {} answer {}".format(
                example.choice, example.answer_choice))

            # tf.logging.info("*** Example ***")
            # tf.logging.info("qas_id: %s" % (example.qas_id))
            # tf.logging.info("tokens: %s" % " ".join(
            # 		[tokenization.printable_text(x) for x in tokens]))
            # tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            # tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            # tf.logging.info(
            # 		"segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            # tf.logging.info("choice: {} answer {}".format(example.choice, example.answer_choice))

        feature = data_feature_mrc.InputFeatures(
            unique_id=example.qas_id,
            input_ids=choice_token_ids,
            input_mask=choice_mask,
            segment_ids=choice_segment_ids,
            choice=example.choice)
        feature_writer.process_feature(feature)
    feature_writer.close()
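
# The multi-choice features are stored flat: every choice's ids are concatenated
# into one vector of length len(answer_choice) * max_seq_length. A model reading
# them would typically split per choice; a minimal sketch of that reshape:
def split_choices(flat_input_ids, num_choices, max_seq_length):
    # [num_choices * max_seq_length] -> [num_choices, max_seq_length]
    return tf.reshape(flat_input_ids, [num_choices, max_seq_length])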
Example #6
def create_cls_problem_generator(task_type,
									examples,
									label_dict,
									multi_task_config,
									tokenizer,
									mode):
	max_seq_length = multi_task_config[task_type]["max_length"]
	lm_augumentation = multi_task_config[task_type]["lm_augumentation"]
	for (ex_index, example) in enumerate(examples):
		tokens_a = tokenizer.tokenize(example.text_a)
		if ex_index % 10000 == 0:
			tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

		tokens_b = None
		if example.text_b:
			try:
				tokens_b = tokenizer.tokenize(example.text_b)
			except Exception:
				print("==token b error==", example.text_b, ex_index)
				break

		if tokens_b:
			tf_data_utils._truncate_seq_pair(tokens_a, tokens_b, max_seq_length-3)
		else:
			if len(tokens_a) > max_seq_length - 2:
				tokens_a = tokens_a[0:(max_seq_length - 2)]

		tokens = []
		segment_ids = []
		tokens.append("[CLS]")
		segment_ids.append(0)

		for token in tokens_a:
			tokens.append(token)
			segment_ids.append(0)
		tokens.append("[SEP]")
		segment_ids.append(0)

		if tokens_b:
			for token in tokens_b:
				tokens.append(token)
				segment_ids.append(1)
			tokens.append("[SEP]")
			segment_ids.append(1)

		if lm_augumentation and mode == 'train':
			rng = random.Random()
			(mask_lm_tokens, masked_lm_positions,
				masked_lm_labels) = create_masked_lm_predictions(
					tokens,
					multi_task_config[task_type]["masked_lm_prob"],
					multi_task_config[task_type]["max_predictions_per_seq"],
					list(tokenizer.vocab.keys()), rng)

			_, mask_lm_tokens, _ = create_mask_and_padding(
				mask_lm_tokens, copy(segment_ids), max_seq_length)
			masked_lm_weights, masked_lm_labels, masked_lm_positions = create_mask_and_padding(
				masked_lm_labels, masked_lm_positions, 
				multi_task_config[task_type]["max_predictions_per_seq"])
			mask_lm_input_ids = tokenizer.convert_tokens_to_ids(
				mask_lm_tokens)
			masked_lm_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels)

			assert len(mask_lm_tokens) == max_seq_length

		input_mask, tokens, segment_ids = create_mask_and_padding(
			tokens, segment_ids, max_seq_length)

		input_ids = tokenizer.convert_tokens_to_ids(tokens)
		if len(example.label) == 1:
			label_id = label_dict[example.label[0]]
		else:
			label_id = [0] * len(label_dict)
			for item in example.label:
				label_id[label_dict[item]] = 1

		assert len(input_ids) == max_seq_length
		assert len(input_mask) == max_seq_length
		assert len(segment_ids) == max_seq_length

		if ex_index < 5:
			tf.logging.debug("*** Example ***")
			tf.logging.debug("tokens: %s" % " ".join(
				[tokenization.printable_text(x) for x in tokens]))
			tf.logging.debug("input_ids: %s" %
							 " ".join([str(x) for x in input_ids]))
			tf.logging.debug("input_mask: %s" %
							 " ".join([str(x) for x in input_mask]))
			tf.logging.debug("segment_ids: %s" %
							 " ".join([str(x) for x in segment_ids]))
			tf.logging.debug("%s_label_ids: %s" %
							 (task_type, str(label_id)))
			tf.logging.debug("%s_label: %s" %
							 (task_type, str(example.label)))
			if lm_augumentation and mode == 'train':
				tf.logging.debug("mask lm tokens: %s" % " ".join(
					[tokenization.printable_text(x) for x in mask_lm_tokens]))
				tf.logging.debug("mask lm input_ids: %s" %
								 " ".join([str(x) for x in mask_lm_input_ids]))
				tf.logging.debug("mask lm label ids: %s" %
								 " ".join([str(x) for x in masked_lm_ids]))
				tf.logging.debug("mask lm position: %s" %
								 " ".join([str(x) for x in masked_lm_positions]))
			
		if not lm_augumentation:
			return_dict = {
				'input_ids': input_ids,
				'input_mask': input_mask,
				'segment_ids': segment_ids,
				'%s_label_ids' % task_type: label_id
			}

		else:
			if mode == 'train':
				return_dict = {
					'input_ids': mask_lm_input_ids,
					'input_mask': input_mask,
					'segment_ids': segment_ids,
					'%s_label_ids' % task_type: label_id,
					"masked_lm_positions": masked_lm_positions,
					"masked_lm_ids": masked_lm_ids,
					"masked_lm_weights": masked_lm_weights,
				}
			else:
				return_dict = {
					'input_ids': input_ids,
					'input_mask': input_mask,
					'segment_ids': segment_ids,
					'%s_label_ids' % task_type: label_id,
					"masked_lm_positions": [0] * multi_task_config[task_type]["max_predictions_per_seq"],
					"masked_lm_ids": [0] * multi_task_config[task_type]["max_predictions_per_seq"],
					"masked_lm_weights": [0] * multi_task_config[task_type]["max_predictions_per_seq"],
				}
		
		yield return_dict
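
# Hedged sketch of feeding the generator above into tf.data (TF 1.x style),
# assuming lm_augumentation is disabled and the task is single-label; with
# augmentation on, the three masked_lm_* keys would need output types as well.
def build_cls_dataset(task_type, examples, label_dict, multi_task_config,
						tokenizer, mode):
	output_types = {
		'input_ids': tf.int32,
		'input_mask': tf.int32,
		'segment_ids': tf.int32,
		'%s_label_ids' % task_type: tf.int32,
	}
	return tf.data.Dataset.from_generator(
		lambda: create_cls_problem_generator(task_type, examples, label_dict,
												multi_task_config, tokenizer, mode),
		output_types=output_types)
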
def create_instances_from_document(all_documents, document_index, vocab_words,
                                   max_seq_length, short_seq_prob,
                                   masked_lm_prob, max_predictions_per_seq,
                                   rng, num_of_documents):
    """Creates `TrainingInstance`s for a single document."""
    document = get_document(all_documents, es_api, document_index)
    if not document:
        return []

    # Account for [CLS], [SEP], [SEP]
    max_num_tokens = max_seq_length - 3
    instances = []

    # The short_seq_prob / target_seq_length sampling from the original BERT
    # pre-training code is not used here: each document yields exactly one
    # title/comment pair.

    tokens_a = document['title']
    tokens_b = document['comment']
    label = document['label']

    tf_data_utils._truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        if token == '[UNK]':
            continue
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)
    for token in tokens_b:
        if token == '[UNK]':
            continue
        tokens.append(token)
        segment_ids.append(1)
    tokens.append("[SEP]")
    segment_ids.append(1)

    if label == "0":
        is_random_next = False
    else:
        is_random_next = True

    (output_tokens, masked_lm_positions,
     masked_lm_labels) = create_masked_lm_predictions(tokens, masked_lm_prob,
                                                      max_predictions_per_seq,
                                                      vocab_words, rng)
    instance = TrainingInstance(original_tokens=tokens,
                                tokens=output_tokens,
                                segment_ids=segment_ids,
                                is_random_next=is_random_next,
                                masked_lm_positions=masked_lm_positions,
                                masked_lm_labels=masked_lm_labels)
    instances = [instance]
    return instances
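
# Hedged sketch of serializing one of the instances above into a tf.train.Example,
# assuming TrainingInstance exposes its constructor arguments as attributes. The
# feature names follow the usual BERT pre-training layout but are illustrative;
# a real writer would also pad every list to a fixed length and emit an input_mask.
def instance_to_tf_example(instance, tokenizer):
    def int64_feature(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

    input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
    masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
    features = {
        "input_ids": int64_feature(input_ids),
        "segment_ids": int64_feature(instance.segment_ids),
        "masked_lm_positions": int64_feature(instance.masked_lm_positions),
        "masked_lm_ids": int64_feature(masked_lm_ids),
        "next_sentence_labels": int64_feature([1 if instance.is_random_next else 0]),
    }
    return tf.train.Example(features=tf.train.Features(feature=features))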