Example #1
def get_entityType_pinyin(entity_type):
    entity_info_dict = {}
    entity_file = os.path.join(entity_folder, "%s.txt" % entity_type)
    with open(entity_file, "r") as fr:
        lines = fr.readlines()
    priority = 3
    if entity_type in ["song"]:
        priority -= 0.5
    print(curLine(), "get %d %s from %s, priority=%f" % (len(lines), entity_type, entity_file, priority))
    for line in lines:
        raw_entity = line.strip()
        add_pinyin(raw_entity, entity_info_dict, priority, entity_type)

    ### TODO: mined from the annotated corpus
    entity_file = os.path.join(entity_files_folder, "%s.json" % entity_type)
    with open(entity_file, "r") as fr:
        current_entity_dict = json.load(fr)
    print(curLine(), "get %d %s from %s, priority=%f" % (len(current_entity_dict), entity_type, entity_file, priority))
    for entity_before, entity_after_times in current_entity_dict.items():
        entity_after = entity_after_times[0]
        priority = 4
        if entity_type in ["song"]:
            priority -= 0.5
        add_pinyin(entity_after, entity_info_dict, priority, entity_type)
    return entity_info_dict
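add_pinyin itself is not shown in this example; Example #5 below inlines essentially the same logic, so a hypothetical sketch consistent with both (not the original implementation, and assuming pypinyin's lazy_pinyin as elsewhere in these examples) might look like:

# Hypothetical sketch of add_pinyin, modeled on the inlined logic in Example #5.
from pypinyin import lazy_pinyin

def add_pinyin(entity, entity_info_dict, priority, entity_type):
    # entity_type is accepted for interface compatibility but unused in this sketch.
    if entity not in entity_info_dict:  # new entity
        combination = "".join(lazy_pinyin(entity))
        entity_info_dict[entity] = (combination, priority)
    else:  # keep the highest priority seen so far for this entity
        combination, old_priority = entity_info_dict[entity]
        if priority > old_priority:
            entity_info_dict[entity] = (combination, priority)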
Example #2
def pinyin_similar_word_noduoyin(entity_info_dict, word):
    if word in entity_info_dict:  # the entity already exists, no correction needed
        return 1.0, word
    best_similar_word = None
    top_similar_score = 0
    try:
        all_combination = ["".join(lazy_pinyin(word))
                           ]  # get_pinyin_combination(entity=word) #
        for current_combination in all_combination:  # 当前的各种发音
            if len(current_combination) == 0:
                print(curLine(), "word:", word)
                continue
            similar_word = None
            current_distance = 10000
            for entity, (com, pri) in entity_info_dict.items():
                char_ratio = 0.0
                d = distance(com, current_combination) * (
                    1.0 - char_ratio) + distance(entity, word) * char_ratio
                if d < current_distance:
                    current_distance = d
                    similar_word = entity
                # if d<=2.5:
                #     print(curLine(),com, current_combination, distance(com, current_combination), distance(entity, word) )
                #     print(curLine(), word, entity, similar_word, "current_distance=", current_distance)

            current_similar_score = 1.0 - float(current_distance) / len(
                current_combination)
            # print(curLine(), "current_combination:%s, %f" % (current_combination, current_similar_score), similar_word, current_distance)
            if current_similar_score > top_similar_score:
                # print(curLine(), current_similar_score, top_similar_score, best_similar_word, similar_word)
                best_similar_word = similar_word
                top_similar_score = current_similar_score
    except Exception as error:
        print(curLine(), "error:", error)
    return top_similar_score, best_similar_word
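A minimal usage sketch, assuming pypinyin's lazy_pinyin and a Levenshtein distance(a, b) function are importable as in the code above, and that entity_info_dict maps each entity to a (pinyin string, priority) tuple as built by get_entityType_pinyin:

# Usage sketch (assumptions: from pypinyin import lazy_pinyin; from Levenshtein import distance).
from pypinyin import lazy_pinyin

entity_info_dict = {
    "红尘情歌": ("".join(lazy_pinyin("红尘情歌")), 3),   # ("hongchenqingge", 3)
    "凉凉": ("".join(lazy_pinyin("凉凉")), 2.5),
}
score, best = pinyin_similar_word_noduoyin(entity_info_dict, "红尘情哥")
print(score, best)  # the homophone gives edit distance 0, so a score of 1.0 and "红尘情歌"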
Example #3
def http_post(sources_batch):
    parameter = {'text_list': sources_batch}
    headers = {'Content-type': 'application/json'}
    status = -1
    output = None
    try:
        r = requests.post(url,
                          data=json.dumps(parameter),
                          headers=headers,
                          timeout=10.5)
        if r.status_code == 200:
            result = r.json()
            # print(curLine(),result)
            status = result['status']
            version = result['version']
            if status == 0:
                data = result["data"]
                output = data['output']
            else:
                print(
                    curLine(), "version:%s, status=%d, message:%s" %
                    (version, status, result['message']))
        else:
            print("%sraise wrong,status_code: " % (curLine()), r.status_code)
    except Exception as e:
        print(curLine(), Exception, ' : ', e)
        input(curLine())
    return status, output
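For reference, a sketch of the JSON contract this client assumes; the url value is a hypothetical endpoint, not part of the original example:

# Sketch of the request/response shape http_post relies on.
import json
import requests

url = "http://127.0.0.1:5000/rephrase"  # hypothetical endpoint
# request body:   {"text_list": ["帮我打开车窗", "把空调温度调高一点"]}
# success reply:  {"status": 0, "version": "...", "data": {"output": ["...", "..."]}}
# failure reply:  {"status": <nonzero>, "version": "...", "message": "..."}
status, output = http_post(["帮我打开车窗", "把空调温度调高一点"])
if status == 0:
    print(output)  # list of rewritten sentences, aligned with the input order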
Example #4
def split(corpus_list, save_folder, trainRate=0.8):
    corpusNum = len(corpus_list)
    shuffle_indices = list(np.random.permutation(range(corpusNum)))
    indexTrain = int(trainRate * corpusNum)
    # indexDev= int((trainRate + devRate) * corpusNum)
    corpusTrain = []
    for i in shuffle_indices[:indexTrain]:
        corpusTrain.append(corpus_list[i])
    save_file = os.path.join(save_folder, "train.txt")
    with open(save_file, "w") as fw:
        fw.writelines(corpusTrain)
    print(curLine(), "have save %d to %s" % (len(corpusTrain), save_file))

    corpusDev = []
    for i in shuffle_indices[indexTrain:]:  # TODO all corpus
        corpusDev.append(corpus_list[i])
    save_file = os.path.join(save_folder, "tune.txt")
    with open(save_file, "w") as fw:
        fw.writelines(corpusDev)
    print(curLine(), "have save %d to %s" % (len(corpusDev), save_file))

    save_file = os.path.join(save_folder, "test.txt")
    with open(save_file, "w") as fw:
        fw.writelines(corpusDev)
    print(curLine(), "have save %d to %s" % (len(corpusDev), save_file))
Example #5
def get_entityType_pinyin(entity_type):
    entity_info_dict = {}
    entity_file = os.path.join(entity_folder, "%s.txt" % entity_type)
    with open(entity_file, "r") as fr:
        lines = fr.readlines()
    pri = 3
    if entity_type in ["song"]:
        pri -= 0.5
    print(
        curLine(), "get %d %s from %s, pri=%f" %
        (len(lines), entity_type, entity_file, pri))
    for line in lines:
        entity = line.strip()
        for k, v in number_map.items():
            entity = entity.replace(k, v)  # str.replace returns a new string, so assign it back
        # for combination in all_combination:
        if entity not in entity_info_dict:  # a new entity
            combination = "".join(lazy_pinyin(
                entity))  # default:默认行为,不处理,原木原样返回  , errors="ignore"
            if len(combination) < 2:
                print(curLine(), "warning:", entity, "combination:",
                      combination)
            entity_info_dict[entity] = (combination, pri)
        else:
            combination, old_pri = entity_info_dict[entity]
            if pri > old_pri:
                entity_info_dict[entity] = (combination, pri)
    return entity_info_dict
Example #6
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    flags.mark_flag_as_required('input_file')
    flags.mark_flag_as_required('input_format')
    flags.mark_flag_as_required('output_file')
    flags.mark_flag_as_required('label_map_file')
    flags.mark_flag_as_required('vocab_file')
    flags.mark_flag_as_required('saved_model')

    label_map = utils.read_label_map(FLAGS.label_map_file)
    converter = tagging_converter.TaggingConverter(
        tagging_converter.get_phrase_vocabulary_from_label_map(label_map),
        FLAGS.enable_swap_tag)
    builder = bert_example.BertExampleBuilder(label_map, FLAGS.vocab_file,
                                              FLAGS.max_seq_length,
                                              FLAGS.do_lower_case, converter)
    predictor = predict_utils.LaserTaggerPredictor(
        tf.contrib.predictor.from_saved_model(FLAGS.saved_model), builder,
        label_map)
    print(colored("%s input file:%s" % (curLine(), FLAGS.input_file), "red"))
    sources_list = []
    target_list = []
    with tf.gfile.GFile(FLAGS.input_file) as f:
        for line in f:
            sources, target, lcs_rate = line.rstrip('\n').split('\t')
            sources_list.append([sources])
            target_list.append(target)
    number = len(sources_list)  # total number of samples
    predict_batch_size = min(64, number)
    batch_num = math.ceil(float(number) / predict_batch_size)

    start_time = time.time()
    num_predicted = 0
    with tf.gfile.Open(FLAGS.output_file, 'w') as writer:
        writer.write(f'source\tprediction\ttarget\n')
        for batch_id in range(batch_num):
            sources_batch = sources_list[batch_id *
                                         predict_batch_size:(batch_id + 1) *
                                         predict_batch_size]
            prediction_batch = predictor.predict_batch(
                sources_batch=sources_batch)
            assert len(prediction_batch) == len(sources_batch)
            num_predicted += len(prediction_batch)
            for id, [prediction,
                     sources] in enumerate(zip(prediction_batch,
                                               sources_batch)):
                target = target_list[batch_id * predict_batch_size + id]
                writer.write(f'{"".join(sources)}\t{prediction}\t{target}\n')
            if batch_id % 20 == 0:
                cost_time = (time.time() - start_time) / 60.0
                print(
                    "%s batch_id=%d/%d, predict %d/%d examples, cost %.2fmin."
                    % (curLine(), batch_id + 1, batch_num, num_predicted,
                       number, cost_time))
    cost_time = (time.time() - start_time) / 60.0
    logging.info(
        f'{curLine()} {num_predicted} predictions saved to:{FLAGS.output_file}, cost {cost_time} min, ave {cost_time / num_predicted} min.'
    )
Example #7
def main(corpus_folder, save_folder):
    fileList = os.listdir(corpus_folder)
    corpus_list_total = []
    for raw_file_name in fileList:
        corpus_list = process(corpus_folder, raw_file_name)
        print(curLine(), raw_file_name, len(corpus_list))
        corpus_list_total.extend(corpus_list)
    save_file = os.path.join(save_folder, "baoan_airport_from_xlsx.txt")
    with open(save_file, "w") as fw:
        fw.writelines(corpus_list_total)
    print(curLine(),
          "saved %d to %s" % (len(corpus_list_total), save_file))
Example #8
def get_slot_info(query, domain):
    useEntityTypeList = domain2entity_map[domain]
    entityTypeMap = get_all_entity(query, useEntityTypeList=useEntityTypeList)
    entity_list_all = []  # collect all entities
    for entity_type, entity_list in entityTypeMap.items():
        for entity in entity_list:
            entity_before = entity['before']
            ignore_flag = False
            if entity_type != "song" and len(
                    entity_before) < 2 and entity_before not in ["家", "妈"]:
                ignore_flag = True
            if entity_type == "song" and len(entity_before) < 2 and \
                    entity_before not in {"鱼", "云", "逃", "退", "陶", "美", "图", "默"}:
                ignore_flag = True
            if entity_before in {
                    "什么歌", "一首", "小花", "叮当", "傻逼", "给你听", "现在", "当我"
            }:
                ignore_flag = True
            if ignore_flag:
                if entity_before not in "好点没走伤":
                    print(
                        curLine(),
                        "ignore entity_type:%s, entity:%s, query:%s" %
                        (entity_type, entity_before, query))
            else:
                entity_list_all.append((entity_type, entity_before,
                                        entity['after'], entity['priority']))
    entity_list_all = sorted(entity_list_all,
                             key=lambda item: len(item[1]) * 100 + item[3],
                             reverse=True)  # sort by entity length first, then by priority
    slot_info = query
    exist_entityType_set = set()
    replace_mask = [0] * len(query)
    for entity_type, entity_before, entity_after, priority in entity_list_all:
        if entity_before not in query:
            continue
        if entity_type in exist_entityType_set:
            continue  # this entity type already has a slot; skip  # TODO
        start_location = slot_info.find(entity_before)
        if start_location > -1:
            exist_entityType_set.add(entity_type)
            if entity_after == entity_before:
                entity_info_str = "<%s>%s</%s>" % (entity_type, entity_after,
                                                   entity_type)
            else:
                entity_info_str = "<%s>%s||%s</%s>" % (
                    entity_type, entity_before, entity_after, entity_type)
            slot_info = slot_info.replace(entity_before, entity_info_str)
            query = query.replace(entity_before, "")
        else:
            print(curLine(), replace_mask, slot_info, "entity_type:",
                  entity_type, entity_before)
    return slot_info
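To illustrate the annotation format this returns, a hypothetical example (actual output depends on the lexicons behind get_all_entity and on domain2entity_map):

# Hypothetical illustration of the slot_info format produced by get_slot_info:
#   get_slot_info("播放周杰伦的红尘情歌", domain="music")
#   -> "播放<singer>周杰伦</singer>的<song>红尘情歌</song>"
# When an entity has been corrected, the original and corrected forms are joined by "||":
#   -> "播放<singer>周杰伦</singer>的<song>红尘情哥||红尘情歌</song>"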
Example #9
def process(corpus_folder, raw_file_name, save_folder):
    corpus_list = []
    for name in raw_file_name:
        raw_file = os.path.join(corpus_folder, name)
        with open(raw_file, "r") as fr:
            lines = fr.readlines()

        for i, line in enumerate(lines):
            source, target, label = line.strip().split("\t")
            if label == "0" or source == target:
                continue
            if label != "1":
                input(curLine() + line.strip())
            length = float(len(source) + len(target))

            source_length = len(source)
            if source_length > 8 and source_length < 38 and (
                    i + 1) % 2 > 0:  # apply a swap operation to 50% of the long sentences
                rand = random.uniform(0.4, 0.9)
                source_pre = source
                swag_location = int(source_length * rand)
                source = "%s%s" % (source[swag_location:],
                                   source[:swag_location])
                lcs1 = _compute_lcs(source, target)
                lcs_rate = len(lcs1) / length
                if (lcs_rate < 0.4):  # too different after the swap, revert it
                    source = source_pre
                else:
                    print(
                        curLine(), "source_pre:%s, source:%s, lcs_rate=%f" %
                        (source_pre, source, lcs_rate))

            lcs1 = _compute_lcs(source, target)
            lcs_rate = len(lcs1) / length
            if (lcs_rate < 0.2):
                continue  # change too large, skip

            # if (lcs_rate<0.4):
            #   continue # change too large, skip
            # if len(source)*1.15 < len(target):
            #   new_t = source
            #   source = target
            #   target = new_t
            #   print(curLine(), source, target, ",lcs1:",lcs1 , ",lcs_rate=", lcs_rate)
            corpus = "%s\t%s\t%f\n" % (source, target, lcs_rate)
            corpus_list.append(corpus)
        print(curLine(), len(corpus_list), "from %s" % raw_file)
    save_file = os.path.join(save_folder, "lcqmc.txt")
    with open(save_file, "w") as fw:
        fw.writelines(corpus_list)
    print(curLine(), "have save %d to %s" % (len(corpus_list), save_file))
Example #10
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    flags.mark_flag_as_required('input_file')
    flags.mark_flag_as_required('input_format')
    flags.mark_flag_as_required('output_tfrecord_train')
    flags.mark_flag_as_required('output_tfrecord_dev')
    flags.mark_flag_as_required('vocab_file')
    builder = bert_example.BertExampleBuilder({}, FLAGS.vocab_file,
                                              FLAGS.max_seq_length,
                                              FLAGS.do_lower_case)

    num_converted = 0
    num_ignored = 0
    with tf.python_io.TFRecordWriter(
            FLAGS.output_tfrecord_train) as writer_train:
        for input_file in [FLAGS.input_file]:
            print(curLine(), "input_file:", input_file)
            for i, (sources, target) in enumerate(
                    utils.yield_sources_and_targets(input_file,
                                                    FLAGS.input_format)):
                logging.log_every_n(
                    logging.INFO,
                    f'{i} examples processed, {num_converted} converted to tf.Example.',
                    10000)
                if len(sources[-1]) > FLAGS.max_seq_length:  # TODO skip samples whose question is too long
                    num_ignored += 1
                    print(
                        curLine(),
                        "ignore num_ignored=%d, question length=%d" %
                        (num_ignored, len(sources[-1])))
                    continue
                example1, _ = builder.build_bert_example(sources, target)
                example = example1.to_tf_example().SerializeToString()
                writer_train.write(example)
                num_converted += 1
    logging.info(
        f'Done. {num_converted} examples converted to tf.Example, num_ignored {num_ignored} examples.'
    )
    for output_file in [
            FLAGS.output_tfrecord_train, FLAGS.output_tfrecord_dev
    ]:
        count_fname = _write_example_count(num_converted,
                                           output_file=output_file)
        logging.info(f'Wrote:\n{output_file}\n{count_fname}')
    with open(FLAGS.label_map_file, "w") as f:
        json.dump(builder._label_map, f, ensure_ascii=False, indent=4)
    print(curLine(),
          "save %d to %s" % (len(builder._label_map), FLAGS.label_map_file))
Example #11
def main():
    argv = sys.argv
    host_name = argv[2]
    model_id = argv[3]
    print(curLine(), "argv:", argv)
    arg_groups = params.parse(argv[1], host_name, mode="test")
    args, config = arg_groups[0]
    args.output_dir = "/home/%s/Mywork/model/qa_model_dir/on_test/block1-layer1-hidden100-acc=85.31" % (
        host_name)  # TODO
    args.output_dir = "/home/%s/Mywork/model/qa_model_dir/part_chatcorpus_model/block1-layer1-hidden100-normal-acc80.57" % host_name

    args.data_dir = os.path.join(
        "/home/%s/Mywork/corpus/Chinese_QA" % host_name, args.data_dir)
    checkpoint_dir = os.path.join(args.output_dir, model_id)

    if len(argv) == 5:
        args.eval_file = argv[4]
    demoer = Demoer(args, checkpoint_dir)
    sample = {
        'text1': "请问谁有狂三这张高清的电影资源?",
        'text2': '这张高清图,谁有狂三这张高清的请问谁有狂三这张高清的电影资源?'
    }
    predictions, probabilities, inference_time = demoer.serve(dev=[sample])

    test(args, config, demoer)  # batch test
Example #12
    def build_model(self, sess):
        states = {}
        interface = Interface(self.args, self.log)
        self.log(
            f'#classes: {self.args.num_classes}; #vocab: {self.args.num_vocab}'
        )
        if self.args.seed:
            random.seed(self.args.seed)
            np.random.seed(self.args.seed)
            tf.set_random_seed(self.args.seed)

        model = Model(self.args, sess)
        sess.run(tf.global_variables_initializer())
        embeddings = interface.load_embeddings()
        model.set_embeddings(sess, embeddings)

        self.log(f'trainable params: {model.num_parameters():,d}')
        self.log(
            f'trainable params (exclude embeddings): {model.num_parameters(exclude_embed=True):,d}'
        )
        validate_params(self.args)
        file = os.path.join(self.args.summary_dir, 'args.json5')
        print(curLine(), "save to %s" % file)
        with open(file, 'w') as f:
            args = {
                k: v
                for k, v in vars(self.args).items() if not k.startswith('_')
            }
            json5.dump(args, f, indent=2)
        self.log(pformat(vars(self.args), indent=2, width=120))
        return model, interface, states
Example #13
def process(corpus_folder, raw_file_name, save_folder):
  raw_file = os.path.join(corpus_folder, raw_file_name)
  with open(raw_file, "r") as fr:
    lines = fr.readlines()
  corpus_list = []
  for line in lines:
    sent_list = line.strip().split("&&")
    sent_num = len(sent_list)
    for i in range(1, sent_num, 2):
      source= sent_list[i-1]
      target = sent_list[i]
      length = float(len(source) + len(target))
      lcs1 = _compute_lcs(source, target)
      lcs_rate= len(lcs1)/length
      if (lcs_rate<0.3):
        continue  # change too large, skip
      if len(source)*1.15 < len(target):
        new_t = source
        source = target
        target = new_t
      corpus = "%s\t%s\t%f\n" % (source, target, lcs_rate)
      corpus_list.append(corpus)
  save_file = os.path.join(save_folder, "baoan_airport.txt")
  with open(save_file, "w") as fw:
    fw.writelines(corpus_list)
  print(curLine(), "have save %d to %s" % (len(corpus_list), save_file))
Example #14
def my_pinyin(char):
    shengmu = pinyin(char, style=Style.INITIALS, strict=True)[0][0]
    yunmu = pinyin(char, style=Style.FINALS, strict=True)[0][0]
    total_pinyin = lazy_pinyin(char, errors='default')[0]
    if shengmu + yunmu != total_pinyin:
        print(curLine(), "char:", char,
              ",shengmu:%s, yunmu:%s" % (shengmu, yunmu), total_pinyin)
    return shengmu, yunmu, total_pinyin
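A quick sketch of the pypinyin behaviour my_pinyin relies on (assuming pypinyin is installed); in strict mode, zero-initial syllables return an empty initial, which is exactly the mismatch the warning above reports:

# Sketch of the underlying pypinyin calls.
from pypinyin import pinyin, lazy_pinyin, Style

print(pinyin("中", style=Style.INITIALS, strict=True))  # [['zh']]
print(pinyin("中", style=Style.FINALS, strict=True))    # [['ong']]
print(lazy_pinyin("中"))                                # ['zhong'] -> 'zh' + 'ong' matches

print(pinyin("音", style=Style.INITIALS, strict=True))  # [['']]  (strict mode: 'y' is not a real initial)
print(pinyin("音", style=Style.FINALS, strict=True))    # [['in']]
print(lazy_pinyin("音"))                                # ['yin'] -> '' + 'in' != 'yin', which triggers the warning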
Example #15
def test():
    """
    此函数为测试函数,将sh运行在服务器端后,用该程序在另一网络测试
    This function is a test function.
    Run this function for test in a network while ServerDemo.py is running on a server in a different network
    """
    sources_list = []
    target_list = []
    output_file = "/home/cloudminds/Mywork/corpus/rephrase_corpus/pred.tsv"
    input_file = "/home/cloudminds/Mywork/corpus/rephrase_corpus/test.txt"
    with tf.io.gfile.GFile(input_file) as f:
        for line in f:
            sources, target, lcs_rate = line.rstrip('\n').split('\t')
            sources_list.append(sources)  # [sources])
            target_list.append(target)
    number = len(target_list)  # total number of samples
    predict_batch_size = min(64, number)  # TODO
    batch_num = math.ceil(float(number) / predict_batch_size)
    num_predicted = 0
    with open(output_file, 'w') as writer:
        writer.write(f'source\tprediction\ttarget\n')
        start_time = time.time()
        for batch_id in range(batch_num):
            sources_batch = sources_list[batch_id *
                                         predict_batch_size:(batch_id + 1) *
                                         predict_batch_size]
            # prediction_batch = predictor.predict_batch(sources_batch=sources_batch)
            status, prediction_batch = http_post(sources_batch)
            assert len(prediction_batch) == len(sources_batch)
            num_predicted += len(prediction_batch)
            for id, [prediction,
                     sources] in enumerate(zip(prediction_batch,
                                               sources_batch)):
                target = target_list[batch_id * predict_batch_size + id]
                writer.write(f'{"".join(sources)}\t{prediction}\t{target}\n')
            if batch_id % 20 == 0:
                cost_time = (time.time() - start_time) / 60.0
                print(
                    "%s batch_id=%d/%d, predict %d/%d examples, cost %.2fmin."
                    % (curLine(), batch_id + 1, batch_num, num_predicted,
                       number, cost_time))
    cost_time = (time.time() - start_time) / 60.0
    print(
        curLine(), "%d predictions saved to %s, cost %f min, ave %f min." %
        (num_predicted, output_file, cost_time, cost_time / num_predicted))
Example #16
def test(args, config, demoer):
    dev = loader.load_data(args.data_dir, args.eval_file)
    targets = []
    for sample in dev:
        targets.append(int(sample['target']))
    predictions, probabilities, inference_time = demoer.serve(dev=dev,
                                                              batch_size=384)

    if "train" in args.eval_file:  # 将模型的置信度保存到文件
        with open(os.path.join(args.data_dir, "%s.txt" % args.eval_file),
                  "r") as fr:
            lines = fr.readlines()
        assert len(lines) == len(
            probabilities
        ), 'number of lines is %d, number of probabilities is %d' % (
            len(lines), len(probabilities))
        save_file = os.path.join(args.data_dir,
                                 "%s_score.txt" % args.eval_file)
        with open(save_file, "w") as writer:
            for line, prediction, prob in zip(lines, predictions,
                                              probabilities):
                writer.write("%s\t%f\n" % (line.strip(), prob[1]))
        print(curLine(),
              "save %d results to %s" % (len(probabilities), save_file))

    outputs = {
        'target': targets,
        'prob': probabilities,
        'pred': predictions,
        'args': args,
    }
    # total_loss = sum(losses[:-1]) / (len(losses) - 1) if len(losses) > 1 else sum(losses)
    states = {'inference_time': inference_time / len(targets)}
    for metric in args.watch_metrics:
        if metric not in states:  # multiple metrics could be computed by the same function
            states.update(metrics[metric](outputs))
    print(curLine(), "stats:", states)
    with open('%s/log.jsonl' % args.output_dir, 'a') as f:
        f.write(
            json5.dumps({
                'data': os.path.basename(args.data_dir),
                'params': config,
                'state': states
            }))
        f.write('\n')
Example #17
def main():
    argv = sys.argv
    print(curLine(), "argv:", argv)
    host_name = sys.argv[2]
    if len(argv) == 3:
        arg_groups = params.parse(sys.argv[1], host_name, mode="train")
        test_score_sum = 0.0
        max_test_score = 0.0
        experiment_times = 0
        eval_score_list = []
        best_experiment_times = None
        for args, config in arg_groups:
            if not os.path.exists(args.summary_dir):
                os.makedirs(args.summary_dir)
            args.pretrained_embeddings = os.path.join(
                "/home/%s/Word2Vector/Chinese" % host_name,
                args.pretrained_embeddings)
            # print(curLine(), "args.data_dir:%s, args.output_dir:%s" % (args.data_dir, args.output_dir))
            trainer = Trainer(args)
            states, best_eval_score = trainer.train(experiment_times)
            eval_score_list.append(best_eval_score)
            test_score_sum += best_eval_score
            if max_test_score < best_eval_score:
                max_test_score = best_eval_score
                best_experiment_times = experiment_times
            experiment_times += 1
            print(
                curLine(),
                "experiment_times=%d/%d, best_experiment_times=%d, ave_test_score=%f, max_test_score=%f"
                % (experiment_times, len(arg_groups), best_experiment_times,
                   test_score_sum / experiment_times, max_test_score))
            with open('%s/log.jsonl' % args.output_dir, 'a') as f:
                f.write(
                    json5.dumps({
                        'data': os.path.basename(args.data_dir),
                        'params': config,
                        'state': states,
                    }))
                f.write('\n')
            print(curLine(), "eval_score_list:", eval_score_list,
                  eval_score_list.index(max_test_score), "\n")
    else:
        print(curLine(),
              'Usage: "python train.py configs/xxx.json5 host_name"')
Example #18
def get_slot(param):
    slot = []
    if "<" not in param:
        return slot
    if ">" not in param:
        print(curLine(), "param:", param)
        return slot
    if "</" not in param:
        return slot
    start_segment = re.findall(r"<[\w_]*>", param)
    end_segment = re.findall(r"</[\w_]*>", param)
    if len(start_segment) != len(end_segment):
        print(curLine(), "start_segment:", start_segment)
        print(curLine(), "end_segment:", end_segment)
    search_location = 0
    for s,e in zip(start_segment, end_segment):
        entityType = s[1:-1]
        assert "</%s>" % entityType == e
        start_index = param[search_location:].index(s) + len(s)
        end_index = param[search_location:].index(e)
        entity_info = param[search_location:][start_index:end_index]
        search_location += end_index + len(e)
        before,after = entity_info, entity_info
        if "||" in entity_info:
            before, after = entity_info.split("||")
        if before in before2after:
            after = before2after[before]
        if before not in all_entity_dict[entityType]:
            all_entity_dict[entityType][before] = [after, 1]
        else:
            if after != all_entity_dict[entityType][before][0]:
                print(curLine(), entityType, before, after, all_entity_dict[entityType][before])
            assert after == all_entity_dict[entityType][before][0]
            all_entity_dict[entityType][before][1] += 1
        if before != after:
            before = after
            if before not in all_entity_dict[entityType]:
                all_entity_dict[entityType][before] = [after, 1]
            else:
                assert after == all_entity_dict[entityType][before][0]
                all_entity_dict[entityType][before][1] += 1
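A usage sketch for get_slot, with the module-level globals it mutates spelled out as assumptions (all_entity_dict and before2after are not defined in this excerpt); the function's main effect is to populate all_entity_dict rather than to return the parsed slots:

# Usage sketch; all_entity_dict / before2after are assumed module-level globals.
import re
from collections import defaultdict

before2after = {}                    # optional manual before -> after corrections
all_entity_dict = defaultdict(dict)  # entity_type -> {before: [after, count]}

get_slot("播放<singer>高安</singer>的<song>红尘情哥||红尘情歌</song>")
print(dict(all_entity_dict["song"]))
# {'红尘情哥': ['红尘情歌', 1], '红尘情歌': ['红尘情歌', 1]}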
Example #19
def read_data(path, lowercase):
    """Reads data from prediction TSV file.

  The prediction file should contain 3 or more columns:
  1: sources (concatenated)
  2: prediction
  3-n: targets (1 or more)

  Args:
    path: Path to the prediction file.
    lowercase: Whether to lowercase the data (to compute case insensitive
      scores).

  Returns:
    Tuple (list of sources, list of predictions, list of target lists)
  """
    sources = []
    predDomain_list = []
    predIntent_list = []
    domain_list = []
    right_intent_num = 0
    right_slot_num = 0
    exact_num = 0
    with tf.gfile.GFile(path) as f:
        for line in f:
            if "sessionId" in line and "pred" in line:
                continue
            sessionId, query, predDomain, predIntent, predSlot, domain, intent, Slot = line.rstrip(
                '\n').split('\t')
            # if lowercase:
            #   source = normal_transformer(source.lower())
            #   pred = normal_transformer(pred.lower())
            #   targets = [normal_transformer(t.lower()) for t in targets]
            # sources.append(source)
            if predIntent == intent:
                right_intent_num += 1
                if predSlot == Slot:
                    exact_num += 1
            # else:
            #   print(curLine(), predIntent, "intent:", intent)
            if predSlot == Slot:
                right_slot_num += 1
            # else:
            #   print(curLine(), predSlot, "Slot:", Slot, "predDomain:%s, domain:%s" % (predDomain, domain))
            if predDomain != domain:
                print(curLine(),
                      "predDomain:%s, domain:%s" % (predDomain, domain),
                      predSlot, "Slot:", Slot)
            predDomain_list.append(predDomain)
            predIntent_list.append(predIntent)
            domain_list.append(domain)
    return predDomain_list, predIntent_list, domain_list, right_intent_num, right_slot_num, exact_num
Example #20
def get_slot_info(query, domain):
    useEntityTypeList = domain2entity_map[domain]
    entityTypeMap = get_all_entity(query, useEntityTypeList=useEntityTypeList)
    if "phone_num" in useEntityTypeList:
        token_numbers = re_phoneNum.findall(query)
        for number in token_numbers:
            entityTypeMap["phone_num"].append({'before':number, 'after':number, 'priority': 2})
    # print(curLine(), "entityTypeMap", entityTypeMap)
    # for entity_type, entity_info_list in entityTypeMap.items():
    #     for entity_info in entity_info_list:
    #         entity_before = entity_info['before']
    #         priority = entity_info['priority']
    #         if len(entity_before) < 2 and entity_before not in ["家","妈"]:
    #             continue
    #         entity_map[entity_before] = (entity_type, entity_info['after'], priority) # TODO song should get a lower priority
    #         # if entity_before not in entity_map or (priority>entity_map[entity_before][2]):
    #         #     entity_map[entity_before] = (entity_type, entity_info['after'], priority)
    # print(curLine(), len(entity_map), "entity_map", entity_map)
    # if "phone_num" in useEntityTypeList:
    #     token_numbers = re_phoneNum.findall(query)
    #     for number in token_numbers:
    #         entity_map[number] = ("phone_num", number, 2)
    entity_list_all = []  # collect all entities
    for entity_type, entity_list in entityTypeMap.items():
        for entity in entity_list:
            entity_before = entity['before']
            if len(entity_before) < 2 and entity_before not in ["家","妈"]:
                continue
            entity_list_all.append((entity_type, entity_before, entity['after'], entity['priority']))
    entity_list_all = sorted(entity_list_all, key=lambda item: len(item[1])*100+item[3],
                             reverse=True)  # sort by entity length first, then by priority
    slot_info = query
    exist_entityType_set = set()
    replace_mask = [0] * len(query)
    for entity_type, entity_before, entity_after, priority in entity_list_all:
        if entity_before not in query:
            continue
        if entity_type in exist_entityType_set:
            continue  # this entity type already has a slot; skip  # TODO
        start_location = slot_info.find(entity_before)
        if start_location > -1: #  exist
            exist_entityType_set.add(entity_type)
            if entity_after == entity_before:
                entity_info_str = "<%s>%s</%s>" % (entity_type, entity_after, entity_type)
            else:
                entity_info_str = "<%s>%s||%s</%s>" % (entity_type, entity_before, entity_after, entity_type)
            slot_info = slot_info.replace(entity_before, entity_info_str)
            query = query.replace(entity_before, "")
        else:
            print(curLine(), replace_mask, slot_info, "entity_type:", entity_type, entity_before)
    return slot_info
Example #21
def bleu(hyps, refs_list):
    """
    calculate BLEU-1 and BLEU-2 and return their average
    """
    bleu_1 = []
    bleu_2 = []

    for hyp, refs in zip(hyps, refs_list):
        if len(hyp) <= 1:
            # print("ignore hyp:%s, refs:" % hyp, refs)
            bleu_1.append(0.0)
            bleu_2.append(0.0)
            continue

        score = bleu_score.sentence_bleu(
            refs,
            hyp,
            smoothing_function=None,  # bleu_score.SmoothingFunction().method7,
            weights=[1, 0, 0, 0])
        # input(curLine())
        if score > 1.0:
            print(curLine(), refs, hyp)
            print(curLine(), "score=", score)
            input(curLine())
        bleu_1.append(score)

        score = bleu_score.sentence_bleu(
            refs,
            hyp,
            smoothing_function=None,  # bleu_score.SmoothingFunction().method7,
            weights=[0.5, 0.5, 0, 0])
        bleu_2.append(score)
    bleu_1 = np.average(bleu_1)
    bleu_2 = np.average(bleu_2)
    bleu_average_score = (bleu_1 + bleu_2) * 0.5
    print("bleu_1=%f, bleu_2=%f, bleu_average_score=%f" %
          (bleu_1, bleu_2, bleu_average_score))
    return bleu_average_score
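A minimal usage sketch, assuming bleu_score is nltk.translate.bleu_score and numpy is imported as np, with hypotheses and references passed as plain strings so they are scored at the character level (as is common for Chinese):

# Usage sketch (assumptions: from nltk.translate import bleu_score; import numpy as np).
hyps = ["我想听红尘情歌", "打开车窗"]
refs_list = [["我要听红尘情歌"], ["帮我打开车窗"]]  # one or more references per hypothesis
avg = bleu(hyps, refs_list)
print("average of BLEU-1 and BLEU-2:", avg)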
Example #22
def process(source_file, train_file, dev_file):
    dev_lines = []
    train_num = 0
    intent_distribution = defaultdict(dict)
    with open(source_file, "r") as f, open(train_file, "w") as f_train:
        reader = csv.reader(f)
        train_write = csv.writer(f_train, dialect='excel')
        for row_id, line in enumerate(reader):
            if row_id==0:
                continue
            (sessionId, raw_query, domain_intent, param) = line
            get_slot(param)

            if domain_intent == other_tag:
                domain = other_tag
                intent = other_tag
            else:
                domain, intent = domain_intent.split(".")
            if intent in intent_distribution[domain]:
                intent_distribution[domain][intent] += 1
            else:
                intent_distribution[domain][intent] = 1  # count the first occurrence too
            sessionId = int(sessionId)
            if sessionId % 10>0:
                train_write.writerow(line)
                train_num += 1
            else:
                dev_lines.append(line)
    with open(dev_file, "w") as f_dev:
        write = csv.writer(f_dev, dialect='excel')
        for line in dev_lines:
            write.writerow(line)
    print(curLine(), "dev=%d, train=%d" % (len(dev_lines), train_num))
    for domain, intent_num in intent_distribution.items():
        print(curLine(), domain, intent_num)
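For orientation, the CSV row layout process() expects, inferred from the tuple unpacking above (the concrete values are made up; other_tag and get_slot come from elsewhere in the module):

# Hypothetical input row for process():
#   sessionId, raw_query,      domain_intent, param
#   "1001",    "播放红尘情歌",  "music.play",  "播放<song>红尘情歌</song>"
# Rows with sessionId % 10 == 0 go to dev_file; all other rows go to train_file.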
Example #23
def _added_token_counts(data_iterator, try_swapping, max_input_examples=10000):
    """Computes how many times different phrases have to be added.
    计算需要添加多少个不同的短语
    Args:
      data_iterator: Iterator to yield source lists and targets. See function
        yield_sources_and_targets in utils.py for the available iterators. The
        strings in the source list will be concatenated, possibly after swapping
        their order if swapping is enabled.
      try_swapping: Whether to try if swapping sources results in less added text.
      max_input_examples: Maximum number of examples to be read from the iterator.

    Returns:
      Tuple (collections.Counter for phrases, added phrases for each example,
      maximum source length observed).
    """
    phrase_counter = collections.Counter()
    num_examples = 0
    all_added_phrases = []
    max_seq_length = 0
    for sources, target in data_iterator:
        # sources may contain several sentences, which are joined with spaces later
        if num_examples >= max_input_examples:
            break
        # source_merge = ' '.join(sources)
        source_merge = sources

        #print("phrase_vocabulary_optimization.py source_merge",source_merge)
        if len(source_merge) > max_seq_length:
            print(
                curLine(),
                "max_seq_length=%d, len(source_merge)=%d,source_merge:%s" %
                (max_seq_length, len(source_merge), source_merge))
            max_seq_length = len(source_merge)
        logging.log_every_n(logging.INFO,
                            f'{num_examples} examples processed.', 10000)
        added_phrases = _get_added_phrases(source_merge, target)
        #print("added_phrases",added_phrases)
        if try_swapping and len(sources) == 2:
            added_phrases_swap = _get_added_phrases(' '.join(sources[::-1]),
                                                    target)
            # If we can align more and have to add less after swapping, we assume that
            # the sources would be swapped during conversion.
            if len(''.join(added_phrases_swap)) < len(''.join(added_phrases)):
                added_phrases = added_phrases_swap
        for phrase in added_phrases:
            phrase_counter[phrase] += 1
        all_added_phrases.append(added_phrases)
        num_examples += 1
    logging.info(f'{num_examples} examples processed.\n')
    return phrase_counter, all_added_phrases, max_seq_length
Example #24
    def process_sample(self, text1):
        text = normal_transformer(text1)
        if len(text) < 1:
            print(curLine(), "text:%s, text1:%s" % (text, text1))
        if self.args.language.lower() == "chinese":
            processed_text = [
                self.vocab.index(w) for w in list(text)[:self.args.max_len]
            ]
        else:
            processed_text = [
                self.vocab.index(w) for w in text.split()[:self.args.max_len]
            ]
        processed_len = len(processed_text)

        return processed_text, processed_len
Example #25
 def __init__(self, args, checkpoint_dir):
     self.args = args
     self.log = Logger(self.args)
     tf.reset_default_graph()
     with tf.Graph().as_default():
         config = tf.ConfigProto()
         config.gpu_options.allow_growth = True
         config.allow_soft_placement = True
         self.sess = tf.Session(config=config)
         with self.sess.as_default():
             self.model, self.interface, self.states = self.build_model(
                 self.sess)
             ckpt = tf.train.get_checkpoint_state(
                 checkpoint_dir)  # self.model_path)
             if ckpt is None:
                 print(curLine(),
                       "there is no model in %s" % checkpoint_dir)
             else:
                 file_name = ckpt.model_checkpoint_path.split("/")[-1]
                 model_checkpoint_file = os.path.join(
                     checkpoint_dir, file_name)
                 print(curLine(), "restore from %s" % model_checkpoint_file)
                 # saver = tf.train.import_meta_graph("{}.meta".format(ckpt.model_checkpoint_path))  # this saver makes inference slower than self.model.saver
                 self.model.saver.restore(self.sess, model_checkpoint_file)
Example #26
def predict_and_write(predictor, sources_batch, previous_line_list,context_list, writer, num_predicted, start_time, batch_num):
    prediction_batch = predictor.predict_batch(sources_batch=sources_batch)
    assert len(prediction_batch) == len(sources_batch)
    for id, [prediction, sources] in enumerate(zip(prediction_batch, sources_batch)):
        output = ""
        if len(prediction) > 1 and prediction != sources:  # TODO ignore predictions that keep the source entirely or are too short
            output= "%s%s" % (context_list[id], prediction)  # 需要和context拼接么
            # print(curLine(), "prediction:", prediction, "sources:", sources, ",output:", output, prediction != sources)
        writer.write("%s\t%s\n" % (previous_line_list[id], output))
    batch_num = batch_num + 1
    num_predicted += len(prediction_batch)
    if batch_num % 200 == 0:
        cost_time = (time.time() - start_time) / 3600.0
        print("%s batch_id=%d, predict %d examples, cost %.3fh." %
              (curLine(), batch_num, num_predicted, cost_time))
    return num_predicted, batch_num
Example #27
def pinyin_similar_word(entity_info_dict, word):
    similar_word = None
    best_similar_word = None
    if word in entity_info_dict:  # the entity already exists, no correction needed
        return 0, word
    all_combination = get_pinyin_combination(entity=word)
    top_similar_score = 0
    for current_combination in all_combination:  # each candidate pronunciation
        current_distance = 10000

        for entity_after,(com_list, pri) in entity_info_dict.items():
            for com in com_list:
                d = distance(com, current_combination)
                if d < current_distance:
                    current_distance = d
                    similar_word = entity_after
        current_similar_score = 1.0 - float(current_distance) / len(current_combination)
        print(curLine(), "current_combination:%s, %f" % (current_combination, current_similar_score), similar_word, current_distance)
        if current_similar_score > top_similar_score:  # keep the best match over all pronunciations
            top_similar_score = current_similar_score
            best_similar_word = similar_word
    return top_similar_score, best_similar_word
Example #28
def process(corpus_folder, raw_file_name):
    raw_file = os.path.join(corpus_folder, raw_file_name)
    # open the file and get the Excel workbook object
    workbook = xlrd.open_workbook(raw_file)  # file path

    # get the sheet object by index
    worksheet = workbook.sheet_by_index(0)
    nrows = worksheet.nrows  # total number of rows in the sheet
    ncols = worksheet.ncols  # total number of columns in the sheet
    print(
        curLine(), "raw_file_name:%s, worksheet:%s nrows=%d, ncols=%d" %
        (raw_file_name, worksheet.name, nrows, ncols))
    assert ncols == 3
    assert nrows > 0
    col_data = worksheet.col_values(0)  # contents of the first column
    corpus_list = []
    for line in col_data:
        sent_list = line.strip().split("&&")
        sent_num = len(sent_list)
        for i in range(1, sent_num, 2):
            source = sent_list[i - 1]
            target = sent_list[i]
            # source_length = len(source)
            # if source_length > 8 and (i+1)%4>0: # randomly delete a character from 50% of the long sentences
            #   rand = random.uniform(0.1, 0.9)
            #   source_pre = source
            #   swag_location = int(source_length*rand)
            #   source = "%s%s" % (source[:swag_location], source[swag_location+1:])
            #   print(curLine(), "source_pre:%s, source:%s" % (source_pre, source))

            length = float(len(source) + len(target))
            lcs1 = _compute_lcs(source, target)
            lcs_rate = len(lcs1) / length
            if (lcs_rate < 0.2):
                continue  # change too large, skip

            # if (lcs_rate<0.3):
            #   continue # change too large, skip
            # if len(source)*1.15 < len(target):
            #   new_t = source
            #   source = target
            #   target = new_t
            corpus = "%s\t%s\t%f\n" % (source, target, lcs_rate)
            corpus_list.append(corpus)
    return corpus_list
Example #29
    def _realize_sequence(self, tokens, tags):
        """Realizes output text corresponding to a single source text.

        Args:
          tokens: Tokens of the source text.
          tags: Tags indicating the edit operations.

        Returns:
          The realized text.
        """
        output_tokens = []
        for index, (token, tag) in enumerate(zip(tokens, tags)):
            loc = "0"
            if self.location is not None:
                loc = self.location[index]
            if tag.added_phrase and (
                    loc == "0" or index == 0 or
                (index > 0 and self.location[index - 1] == "0")):  # TODO
                if not tag.added_phrase.startswith("##", 0, 2):
                    output_tokens.append(tag.added_phrase)
                else:  # word piece
                    if len(output_tokens) > 0:
                        output_tokens[-1] += tag.added_phrase[2:]
                    else:
                        output_tokens.append(tag.added_phrase[2:])
            if tag.tag_type in (
                    TagType.KEEP, TagType.SWAP
            ) or loc == "1":  # TODO 根据需要修改代码,location为"1"的位置不能被删除, 但目前是可以插入的
                token = token.upper()  # TODO 因为当前语料中有不少都是大写的,所以把预测结果都转为大写
                if token.startswith("##", 0, 2):
                    output_tokens.append(token[2:])
                elif "UNK" in token:  # 处理UNK的情况
                    previoud_id = self.token_index_map[index]  # start position of the word behind the UNK
                    next_previoud_id = previoud_id + 1  # end position of the word behind the UNK
                    if index + 1 in self.token_index_map:
                        next_previoud_id = self.token_index_map[index + 1]
                    token = self.sources[0][previoud_id:
                                            next_previoud_id]  # TODO
                    print(
                        curLine(), "self.passage[%d,%d]=%s" %
                        (previoud_id, next_previoud_id, token))
                    output_tokens.append(token)
                else:  # word piece
                    output_tokens.append(token)
        return self.sep.join(output_tokens)
Example #30
def rules(raw_query, predict_domain, target_domain_name):
    predict_intent = predict_domain  # OTHERS
    slot_info = raw_query
    if predict_domain == "navigation":
        predict_intent = 'navigation'
        if "打开" in raw_query:
            predict_intent = "open"
        elif "开始" in raw_query:
            predict_intent = "start_navigation"
        for word in predict_utils.cancel_keywords:
            if word in raw_query:
                predict_intent = 'cancel_navigation'
                break
        # slot_info = raw_query
        # if predict_intent == 'navigation': TODO
        slot_info = exacter_acmation.get_slot_info(raw_query, domain=predict_domain)
        # if predict_intent != 'navigation': # TODO
        #     print(curLine(), "slot_info:", slot_info)
    elif predict_domain == 'music':
        predict_intent = 'play'
        for word in predict_utils.cancel_keywords:
            if word in raw_query:
                predict_intent = 'pause'
                break
        for word in ["下一", "换一首", "换一曲", "切歌", "其他歌"]:
            if word in raw_query:
                predict_intent = 'next'
                break
        slot_info = exacter_acmation.get_slot_info(raw_query, domain=predict_domain)
        if predict_intent not in ['play', 'pause'] and slot_info != raw_query:  # adjust the intent based on the slots, e.g. 换一首<singer>高安</singer>的<song>红尘情歌</song>
            print(curLine(), predict_intent, slot_info)
            predict_intent = 'play'
        # if predict_intent != 'play': # 换一首<singer>高安</singer>的<song>红尘情歌</song>
        #     print(curLine(), predict_intent, slot_info)
    elif predict_domain == 'phone_call':
        predict_intent = 'make_a_phone_call'
        for word in predict_utils.cancel_keywords:
            if word in raw_query:
                predict_intent = 'cancel'
                break
        slot_info = exacter_acmation.get_slot_info(raw_query, domain=predict_domain)
    return predict_intent, slot_info