Example #1
def generate_ner(args) -> None:
    """
    总共分成2个步骤:
        Step1 : 用模型进行实体识别
        Step2 : 对每篇文章按照中文句点进行分割
    Args:
        args:
            --file_root : root path of data
    """

    file_names = scan_files(args.file_root)  # type:List[str]
    for file in file_names:
        data = load_file(args.file_root, file, "txt")

        # Part1 : compute the NER result for the current article
        prepare_data = prepare(data)  # type:np.ndarray
        result = predict(prepare_data)  # type:np.ndarray
        _, ner_result = decode_result(result=result,
                                      sent_pre=prepare_data,
                                      sent=data)

        pickle.dump(ner_result, open(args.file_root + file + "_ner.pkl", 'wb'))

        # Part2 : split the current article on sentence-final punctuation (。/!/?) and record the (start, end) spans
        start, end = 0, 0
        sentence_split_result = []
        stop_tokens = ["。", "!", "?"]
        for idx, c in enumerate(data):
            if c in stop_tokens:
                end = idx
                sentence_split_result.append((start, end))
                start = end + 1

        pickle.dump(sentence_split_result,
                    open(args.file_root + file + "_sentence_split.pkl", 'wb'))
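A note on the helpers: scan_files and load_file are project utilities that these snippets only call; their implementations are not shown, and their behaviour differs between examples (here scan_files returns base names without extensions, while Examples #3-#6 pass a postfix and get back full paths). Below is a minimal sketch consistent with this example's usage; the bodies are hypothetical.

import os
from typing import List

# Hypothetical helpers inferred from how Example #1 calls them; not the real implementations.
def scan_files(root: str) -> List[str]:
    """Return the base names (without extension) of the .txt articles under root."""
    return [os.path.splitext(f)[0] for f in os.listdir(root) if f.endswith(".txt")]

def load_file(root: str, name: str, ext: str) -> str:
    """Read root/name.ext and return its contents as one string."""
    with open(os.path.join(root, name + "." + ext), encoding="utf-8") as f:
        return f.read()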
Example #2
    def prepare_data(self):
        # Write one article per line into the word2vec corpus file.
        with open(self._w2v_file_path, 'w', encoding='utf-8') as f:
            file_names = scan_files(self.root)
            for name in file_names:
                data = load_file(self.root, name, "txt")
                # If load_file returns the article as one string, joining on spaces
                # produces character-level tokens (common for Chinese word2vec corpora).
                data = " ".join(data)
                f.write(data)
                f.write("\n")
        return
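The file written above is a plain-text corpus with one whitespace-separated article per line. Assuming gensim is the intended downstream consumer, which the snippet does not show, training could look like the following sketch (paths are placeholders):

# Assumption: gensim word2vec training on the corpus file written by prepare_data.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

model = Word2Vec(sentences=LineSentence("w2v_corpus.txt"), min_count=1)  # placeholder path
model.save("w2v.model")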
Example #3
def put_neg_cells(background, label, size):
    # collect all negative cells for specific label
    neg_cells = []
    sub_paths = neg_cells_path_map[label]
    for sub_path in sub_paths:
        # neg_cells += scan_files(os.path.join(neg_cells_paths[sub_path[0]], sub_path[1]), postfix=".bmp")
        if sub_path[1] == "all":
            sub_neg_cells = scan_files(neg_cells_paths[sub_path[0]], postfix=".bmp")
        else:
            sub_neg_cells = scan_files(os.path.join(neg_cells_paths[sub_path[0]], sub_path[1]), postfix=".bmp")
        neg_cells += random.sample(sub_neg_cells, min(sub_path[2], len(sub_neg_cells)))

    # get number and names of negative cells
    neg_cells_cnt = random.randint(neg_cells_num[0], neg_cells_num[1])
    neg_cells_for_patch = random.sample(neg_cells, min(neg_cells_cnt, len(neg_cells)))
    # print("total", len(neg_cells), "choose", len(neg_cells_for_patch))
    
    # get possible cell positions in background
    neg_cells_possible = []
    dets = []
    for neg_cell in neg_cells_for_patch:
        # tokens = re.findall(r"\d+", neg_cell)  # should follow ..._w123_h234.bmp format
        # neg_w, neg_h = math.ceil(int(tokens[-2])/2), math.ceil(int(tokens[-1])/2)
        neg_img = cv2.imread(neg_cell)
        neg_h, neg_w, _ = neg_img.shape
        neg_x = random.randint(0, size-neg_w)
        neg_y = random.randint(0, size-neg_h)
        neg_cells_possible.append([neg_cell, (neg_x, neg_y, neg_w, neg_h)])
        dets.append([neg_x, neg_y, neg_x+neg_w, neg_y+neg_h, 1])

    # remove overlapping cells
    keep = py_cpu_nms(np.array(dets), thresh=0.1)

    neg_cells_ready = [neg_cells_possible[i] for i in keep]

    # put cells on background
    for neg_cell in neg_cells_ready:
        neg_img = cv2.imread(neg_cell[0])
        # neg_img = cv2.pyrDown(neg_img)
        neg_x, neg_y, neg_w, neg_h = neg_cell[1]
        background[neg_y:neg_y+neg_h, neg_x:neg_x+neg_w, :] = neg_img

    return background
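py_cpu_nms is not defined in the snippet; the name usually refers to the widely used pure-NumPy greedy non-maximum suppression routine, sketched below. Here every candidate box carries the same score of 1, so the routine simply drops placements that overlap an earlier one by more than the threshold.

import numpy as np

def py_cpu_nms(dets, thresh):
    """Greedy NMS over rows of [x1, y1, x2, y2, score]; returns the indices to keep."""
    x1, y1, x2, y2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    scores = dets[:, 4]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]  # highest score first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # intersection of the current box with all remaining boxes
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(0.0, xx2 - xx1 + 1) * np.maximum(0.0, yy2 - yy1 + 1)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        order = order[np.where(iou <= thresh)[0] + 1]
    return keep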
Example #4
def scan_param_files(path):
    if not path.endswith('/'):
        path += '/'
    model_files = scan_files(path,
                             r'dict_model(.*)\.py',
                             raise_not_found_error=False)
    optimizer_files = scan_files(path,
                                 r'dict_optimizer(.*)\.py',
                                 raise_not_found_error=False)
    trainer_files = scan_files(path,
                               r'dict_trainer(.*)\.py',
                               raise_not_found_error=False)
    data_loader_files = scan_files(path,
                                   r'dict_data_loader(.*)\.py',
                                   raise_not_found_error=False)
    config_files = scan_files(path,
                              r'config(.*)\.py',
                              raise_not_found_error=False)
    '''
    if raise_not_found_error: # raise error if did not find any param dict
        if len(model_files)==0:
            raise Exception('No available model param dict in %s'%str(path))
        if len(optimizer_files)==0:
            raise Exception('No available optimizer param dict in %s'%str(path))
        if len(trainer_files)==0:
            raise Exception('No available trainer param dict in %s'%str(path))
        if len(data_loader_files)==0:
            raise Exception('No available data_loader param dict in %s'%str(path)) 
    '''
    return {
        'model_files': model_files,
        'optimizer_files': optimizer_files,
        'trainer_files': trainer_files,
        'data_loader_files': data_loader_files,
        'config_files': config_files
    }
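A hypothetical usage of the returned mapping; the directory layout is an assumption:

# Hypothetical call; prints how many parameter-dict files were found per category.
found = scan_param_files('./params/')
for kind, files in found.items():
    print(kind, len(files), files)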
Example #5
def put_cells(pos_cells_path, save_path, postfix=".bmp"):
    os.makedirs(save_path, exist_ok=True)

    files = scan_files(pos_cells_path, postfix=postfix)
    print("# files:", len(files))

    executor = ProcessPoolExecutor(max_workers=cpu_count())
    tasks = []

    batch_size = 100
    for i in range(0, len(files), batch_size):
        batch = files[i : i+batch_size]
        tasks.append(executor.submit(batch_put_cell, batch, save_path))
    
    job_count = len(tasks)
    for future in as_completed(tasks):
        # result = future.result()  # get the return value from the called function
        job_count -= 1
        print("One Job Done, Remaining Job Count: %s" % (job_count))
Example #6
def get_background(cell_name, useback, size):
    if useback == "white":
        background = np.ones((size, size, 3))
    elif useback == "black":
        background = np.zeros((size, size, 3))
    elif useback == "positive":  # use negative images from positive wsis
        neg_files = []
        for sub_dir in os.listdir(neg_background_path):
            if os.path.basename(cell_name).startswith(sub_dir):
                neg_files = scan_files(os.path.join(neg_background_path, sub_dir), postfix=".bmp")
                break
        if len(neg_files) >= 1:
            neg_randf = random.sample(neg_files, 1)[0]
            background = cv2.imread(neg_randf)
        else:
            background = np.zeros((size, size, 3))
    else:
        background = np.zeros((size, size, 3))
    return background
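A hypothetical call, assuming the crop file is named after its source slide so that the startswith match against a sub-directory of neg_background_path can succeed:

# Hypothetical usage; the file name and size are placeholders.
background = get_background("slide007_x10240_y20480.bmp", useback="positive", size=608)

Note that the "white" and "black" branches return float64 arrays (the np.ones/np.zeros default) while cv2.imread returns uint8, so callers may need a dtype cast before pasting cells.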
Example #7
# cut the tif file into 608x608 images; each tif gets its own folder holding the 608x608 tiles #####################
print(colorama.Fore.GREEN + "[INFO] cut 608 images from tif file" +
      colorama.Fore.WHITE)
os.makedirs(output_tif_608s, exist_ok=True)
tif_name_ext = get_unrunned_tif(input_tif_files, save_path)
if len(tif_name_ext) == 0:
    print(colorama.Fore.RED + "[INFO] data processing finished" +
          colorama.Fore.WHITE)
    sys.exit()
asap_to_image(os.path.join(input_tif_files, tif_name_ext), output_tif_608s)

# get the list of 608 image full pathnames ########################################################################
tif_name = os.path.splitext(tif_name_ext)[0]
image_path = os.path.join(output_tif_608s, tif_name)
images = scan_files(image_path)

# generate txt file for current tif ###############################################################################
print(colorama.Fore.GREEN + "[INFO] generate txt for " + tif_name +
      colorama.Fore.WHITE)
gen_txt_for_dir(images, output_tif_608s, tif_name)

# run darknet test ################################################################################################
darknet_path = darknet_dir
segment(darknet_path, image_path)
os.remove(image_path + ".txt")

# evaluate predictions and convert coordinates to xmls ############################################################
print(colorama.Fore.GREEN +
      "[INFO] evaluate predictions and write coordinates into xmls" +
      colorama.Fore.WHITE)
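gen_txt_for_dir is not shown; below is a plausible sketch, assuming darknet's batch-test convention of one image path per line written to <output_tif_608s>/<tif_name>.txt, which is consistent with the os.remove(image_path + ".txt") call above.

import os

# Hypothetical sketch of gen_txt_for_dir; the real implementation is not in the snippet.
def gen_txt_for_dir(images, save_dir, name):
    with open(os.path.join(save_dir, name + ".txt"), "w") as f:
        for img in images:
            f.write(img + "\n")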
Example #8
    def generate_tuples(self, data_dir: str):
        """
        用于从源数据中,用多线程的方式生成tuples
        Args:
            data_dir: 数据存储的路径,其中包括:
                      eg. 源文章名称    __ data/round2/0.txt
                          NER结果名称   __ data/round2/0_ner.pkl
                          文章分句结果   __ data/round2/0_sentence_split.pkl
        """

        # Step1 : load word2idx and emb_matrix
        self.config.load_word2idx_embmatrix()

        # Step2 : generate candidate relation pairs
        instances = list()
        file_names = scan_files(data_dir)

        for file in file_names:
            passage = load_file(data_dir, file, "txt")  # type:str
            sent_split = pickle.load(
                open(data_dir + file + "_sentence_split.pkl",
                     "rb"))  # type:List[tuple]
            ner_result = pickle.load(open(data_dir + file + "_ner.pkl",
                                          "rb"))  # type:List[tuple]

            sent_split.sort(key=lambda x: x[0])

            # Step2.1 : collect the entities that belong to e1 and e2
            e1_entities, e2_entities = list(), list()
            for e in ner_result:
                # e is a 4-tuple, e.g. ('Disease', 1, 10, '糖尿病下肢动脉病变')
                if e[0] == self.config.e1_type:
                    e1_entities.append(e)
                elif e[0] == self.config.e2_type:
                    e2_entities.append(e)
            e1_entities.sort(key=lambda x: x[1])
            e2_entities.sort(key=lambda x: x[1])

            # Step2.2 : for each e1, find candidate e2 entities and build the triple <BEF, BET, AFT, sequence_tag>
            for e1 in e1_entities:
                e1_start, e1_end = e1[1], e1[2]
                cur_sentence_idx = -1
                for idx, s in enumerate(sent_split):
                    if s[0] <= e1_start and s[1] >= e1_end:
                        cur_sentence_idx = idx
                        break
                # the entity position bounds the e2 search range: previous sentence + current sentence + next sentence
                search_e2_start = sent_split[
                    cur_sentence_idx - 1 if cur_sentence_idx > 1 else 0][0]
                search_e2_end = sent_split[cur_sentence_idx + 1 if cur_sentence_idx < len(sent_split) - 1 \
                    else len(sent_split) - 1][1]

                for i in range(len(e2_entities)):
                    e2 = e2_entities[i]
                    e2_start = e2[1]
                    e2_end = e2[2]
                    if e2_end < search_e2_start:
                        continue
                    elif e2_start > search_e2_end:
                        break
                    elif e2_start >= search_e2_start and e2_end <= search_e2_end:
                        if e1_end == e2_start:
                            # case (1): e1 comes before e2 and is adjacent to it
                            before = passage[search_e2_start:e1_start]
                            between = ""
                            after = passage[e2_end:search_e2_end]
                            t = Tuple(e1[3],
                                      e2[3],
                                      sequence_tag=True,
                                      before=before,
                                      between=between,
                                      after=after,
                                      config=self.config)
                            instances.append(t)
                        elif e2_end == e1_start:
                            # case (2): e1 comes after e2 and is adjacent to it
                            before = passage[search_e2_start:e2_start]
                            between = ""
                            after = passage[e1_end:search_e2_end]
                            t = Tuple(e1[3],
                                      e2[3],
                                      sequence_tag=False,
                                      before=before,
                                      between=between,
                                      after=after,
                                      config=self.config)
                            instances.append(t)
                        elif e1_end < e2_start:
                            # case (3): e1 comes before e2, not adjacent
                            before = passage[search_e2_start:e1_start]
                            between = passage[e1_end:e2_start]
                            after = passage[e2_end:search_e2_end]
                            t = Tuple(e1[3],
                                      e2[3],
                                      sequence_tag=True,
                                      before=before,
                                      between=between,
                                      after=after,
                                      config=self.config)
                            instances.append(t)
                        elif e2_end < e1_start:
                            # case (4): e1 comes after e2, not adjacent
                            before = passage[search_e2_start:e2_start]
                            between = passage[e2_end:e1_start]
                            after = passage[e1_end:search_e2_end]
                            t = Tuple(e1[3],
                                      e2[3],
                                      sequence_tag=False,
                                      before=before,
                                      between=between,
                                      after=after,
                                      config=self.config)
                            instances.append(t)

        # Step3 : persist the candidate instances
        pickle.dump(
            instances,
            open("./saved_model_files/RE_candidate_instances.pkl", "wb"))