Example #1
 def get_sentence_examples(self, questions):
     examples = []
     for index, data in enumerate(questions):
         guid = 'test-%d' % index
         text_a = tokenization.convert_to_unicode(str(data[0]))
         text_b = tokenization.convert_to_unicode(str(data[1]))
         label = str(0)
         examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
     return examples
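All of these snippets build `InputExample` objects for BERT-style classifiers. For reference, a minimal sketch of the container as it appears in BERT's run_classifier.py (defaults as in the common implementation):

class InputExample(object):
    """A single training/test example for sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid      # unique id used to tell examples apart
        self.text_a = text_a  # first (or only) sentence
        self.text_b = text_b  # optional second sentence for pair tasks
        self.label = label    # string label; "0" is often used as a dummy at predict time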
Example #2
def make_examples_fn(samples, set_type):
    # samples should be a list of (text, label) tuples
    examples = []
    for (i, sample) in enumerate(samples):
        guid = "%s-%s" % (set_type, i)
        text = tokenization.convert_to_unicode(sample[0])
        if set_type == "test":
            label = "0"
        else:
            label = tokenization.convert_to_unicode(sample[1])
        examples.append(InputExample(guid=guid, text_a=text, label=label))
    return examples
Example #3
def create_examples(lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        # Only the test set has a header
        if set_type == "test" and i == 0:
            continue
        guid = "%s-%s" % (set_type, i)
        if set_type == "test":
            text_a = tokenization.convert_to_unicode(line[1])
            label = "0"
        else:
            text_a = tokenization.convert_to_unicode(line[3])
            label = tokenization.convert_to_unicode(line[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
Example #4
 def _create_examples(self, lines, set_type, set_id):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s-%s-%s" % (set_type, set_id, i)
         if set_type == "test":
             text_a = tokenization.convert_to_unicode(line[0])
             label = "-1"
         else:
             text_a = tokenization.convert_to_unicode(line[0])
             label = tokenization.convert_to_unicode(line[1])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=None,
                          label=label))
     return examples
Example #5
    def create_examples(self, lines, set_type, file_base=True):
        """Creates examples for the training and dev sets. each line is label+\t+text_a+\t+text_b """
        examples = []
        for (i, line) in tqdm(enumerate(lines)):
            if file_base and i == 0:
                continue  # skip the header line when reading from a file
            guid = "%s-%s" % (set_type, i)
            text = tokenization.convert_to_unicode(line[1])
            if set_type == "test" or set_type == "pred":
                label = "0"
            else:
                label = tokenization.convert_to_unicode(line[0])
            examples.append(InputExample(guid=guid, text_a=text, label=label))
        return examples
Example #6
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "%s-%s" % (set_type, i)
         text_a = tokenization.convert_to_unicode(line[3])
         text_b = tokenization.convert_to_unicode(line[4])
         if set_type == "test":
             label = "0"
         else:
             label = tokenization.convert_to_unicode(line[0])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Example #7
def class_predict_service():

    global graph
    with graph.as_default():
        result = {}
        result['code'] = 0
        try:
            sentence = request.args['text']
            result['text'] = sentence
            start = datetime.now()
            sentence = tokenizer.tokenize(sentence)
            sentence = " ".join(sentence)
            print('your input is:{}'.format(sentence))
            example = InputExample(guid=None, text_a=sentence, text_b=None)

            feature = convert_single_example(0, example, labels,
                                             config.max_seq_length, tokenizer)

            input_ids, input_mask, segment_ids, label_ids = convert(feature)

            print(input_ids)
            feed_dict = {input_ids_p: input_ids, input_mask_p: input_mask}
            # run the session to get probabilities for the current feed_dict
            pred_probabilities_result = sess.run([probabilities], feed_dict)[0]
            #print(pred_probabilities_result[0])
            #print(pred_probabilities_result)
            label_ids = np.where(pred_probabilities_result > 0.5, 1, 0)
            pred_label_result = mlb.inverse_transform(label_ids)[0]

            print(label_ids)
            # TODO: combination strategy
            result['data'] = pred_label_result
            result["data2"] = convert_id2label(labels,
                                               pred_probabilities_result)
            print('time used: {} sec'.format(
                (datetime.now() - start).total_seconds()))
            return json.dumps(result, ensure_ascii=False)
        except Exception:
            traceback.print_exc()
            result['code'] = -1
            result['data'] = 'error'
            return json.dumps(result, ensure_ascii=False)
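The handler above is written as a bare function; a minimal wiring sketch for serving it, assuming a standard Flask app (the '/predict' route, host, and port are illustrative assumptions, not from the snippet):

from flask import Flask

app = Flask(__name__)
# Register the handler; the route path is an assumption.
app.add_url_rule('/predict', view_func=class_predict_service)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)  # host and port are assumptions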
Example #8
 def get_dev_examples(self, data_dir):
     """See base class."""
     lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "dev-%d" % (i)
         language = tokenization.convert_to_unicode(line[0])
         if language != tokenization.convert_to_unicode(self.language):
             continue
         text_a = tokenization.convert_to_unicode(line[6])
         text_b = tokenization.convert_to_unicode(line[7])
         label = tokenization.convert_to_unicode(line[1])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Example #9
 def get_train_examples(self, data_dir):
     """See base class."""
     lines = self._read_tsv(
         os.path.join(data_dir, "multinli",
                      "multinli.train.%s.tsv" % self.language))
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "train-%d" % (i)
         text_a = tokenization.convert_to_unicode(line[0])
         text_b = tokenization.convert_to_unicode(line[1])
         label = tokenization.convert_to_unicode(line[2])
         if label == tokenization.convert_to_unicode("contradictory"):
             label = tokenization.convert_to_unicode("contradiction")
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
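For completeness, the label set these XNLI/MultiNLI examples are paired with, as defined by the XNLI processor in BERT's reference run_classifier.py (shown here as a reminder):

def get_labels(self):
    """See base class."""
    return ["contradiction", "entailment", "neutral"]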
Example #10
def main():
    max_seq_len = 64
    label_list = ['0', '1']
    sentences = ("您好.麻烦您截图全屏辛苦您了.", "麻烦您截图大一点辛苦您了.最好可以全屏.")
    guid = 'test-%d' % 1
    text_a = tokenization.convert_to_unicode(str(sentences[0]))
    text_b = tokenization.convert_to_unicode(str(sentences[1]))
    label = str(0)
    predict_examples = InputExample(guid=guid,
                                    text_a=text_a,
                                    text_b=text_b,
                                    label=label)
    tokenizer = tokenization.FullTokenizer(
        vocab_file='./albert_config/vocab.txt', do_lower_case=True)
    features = convert_single_example(predict_examples, label_list,
                                      max_seq_len, tokenizer)
    export_dir = './export/1576720765'
    graph = tf.Graph()
    with graph.as_default():
        with tf.Session() as sess:
            tf.saved_model.loader.load(sess, [tag_constants.SERVING],
                                       export_dir)
            tensor_input_ids = graph.get_tensor_by_name('input_ids_1:0')
            tensor_input_mask = graph.get_tensor_by_name('input_mask_1:0')
            tensor_label_ids = graph.get_tensor_by_name('label_ids_1:0')
            tensor_segment_ids = graph.get_tensor_by_name('segment_ids_1:0')
            tensor_outputs = graph.get_tensor_by_name('loss/Softmax:0')
            result = sess.run(
                tensor_outputs,
                feed_dict={
                    tensor_input_ids:
                    np.array(features.input_ids).reshape(-1, max_seq_len),
                    tensor_input_mask:
                    np.array(features.input_mask).reshape(-1, max_seq_len),
                    tensor_label_ids:
                    np.array([features.label_id]),
                    tensor_segment_ids:
                    np.array(features.segment_ids).reshape(-1, max_seq_len),
                })
            print(*(result[0]), sep='\t')
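            # One way to read the output: map the softmax row back to a
            # human-readable label via argmax over label_list.
            pred_index = int(np.argmax(result[0]))
            print('predicted label:', label_list[pred_index])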
Example #11

datasets = get_datas(args.test_file)
output_dict = {}
model.to(device)
print('Predicting by Bert....')
predicts = []
d = defaultdict(str)
for dataset in tqdm(datasets):
    if not len(dataset):
        # output_dict[pid] = ''
        continue
    examples = []
    for i, data in enumerate(dataset):
        if i < 64:
            examples.append(InputExample(i, data[1], data[2], '0'))
    eval_features = convert_examples_to_features(examples, label_list,
                                                 args.max_seq_length,
                                                 tokenizer)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                 dtype=torch.long)
    #  print(all_input_ids.size())
    input_ids = all_input_ids.to(device)
    input_mask = all_input_mask.to(device)
    segment_ids = all_segment_ids.to(device)
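    # A hedged continuation sketch: the snippet stops before the forward pass.
    # Assuming a pytorch-pretrained-BERT style BertForSequenceClassification
    # that returns logits when called without labels:
    with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask)
    predicts.append(logits.argmax(dim=-1).cpu().tolist())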
Example #12
label_ids = [0] * 70  # dummy per-position label ids (length = max_seq_length)
label_list = ['0', '1', '2']
max_seq_length = 70
vocab_file = "../data_model/chinese_L-12_H-768_A-12/vocab.txt"
do_lower_case = True
fix_label = "2"

text = "linux改xp花了一下午时间。散热不好,cpu温度就没下过50,玩游戏能上70,比较吓人。触摸板关不掉,打字经常碰到。"
index = 0  # only one example here, so any fixed index works
guid = 'train-%d' % index  # guid is used to tell the examples apart
text_a = tokenization.convert_to_unicode(text)  # the text to classify
# label = str(line[2])  # the sentiment label that goes with the text
example = InputExample(guid=guid, text_a=text_a, text_b=None, label=fix_label)

tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)
ex_index = 0  # index of this (single) example
feature = convert_single_example(ex_index, example, label_list, max_seq_length,
                                 tokenizer)
# input_dict["input_ids"] = [[101, 8403, 3121, 8766, 5709, 749, 671, 678, 1286, 3198, 7313, 511, 3141, 4178, 679, 1962, 8024, 8476, 3946, 2428, 2218, 3766, 678, 6814, 8145, 8024, 4381, 3952, 2767, 5543, 677, 8203, 8024, 3683, 6772, 1405, 782, 511, 6239, 3043, 3352, 1068, 679, 2957, 8024, 2802, 2099, 5307, 2382, 4821, 1168, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
# input_dict["input_mask"] = [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
# input_dict["segment_ids"] = [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
# input_dict["label_ids"] = [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
input_dict["input_ids"] = [feature.input_ids]
input_dict["input_mask"] = [feature.input_mask]
input_dict["segment_ids"] = [feature.segment_ids]
input_dict["label_ids"] = [label_ids]
prob = predict_fn(input_dict)
print(prob)
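predict_fn is not defined in this snippet; with a TF 1.x SavedModel export it is commonly created through the contrib predictor API. A minimal sketch, assuming such an export exists (the path is a placeholder, not from the snippet):

from tensorflow.contrib import predictor

# Placeholder path to an exported SavedModel directory.
predict_fn = predictor.from_saved_model('../data_model/exported_model')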