class Searcher(object):
    def __init__(self, args=_get_parser()):
        # Load the inverted index (word -> bucket file names) built by the
        # clustering step.
        p_bucket_path = os.path.join(args.infile, 'p_bucket.pickle')
        with open(p_bucket_path, 'rb') as infile:
            self.p_bucket = pickle.load(infile)
        self.seg = Segmentor(args)
        self.path = args.infile
        self.sim_th = args.sim_th
        self.stop_words = get_stop_words(args.stop_words)
        self.args = args

    def search(self, sentence):
        """Return the indexed lines most similar to `sentence`, best first."""
        if not sentence or not isinstance(sentence, str):
            return None

        res = list()
        c_bucket = list()

        # Tokenize the query and drop stop words.
        seg_sen = list(self.seg.cut(sentence))
        seg_sen = list(filter(lambda x: x not in self.stop_words, seg_sen))

        # Collect every bucket that shares at least one word with the query.
        for w in seg_sen:
            if w in self.p_bucket:
                c_bucket += self.p_bucket[w]
        c_bucket = list(set(c_bucket))

        # Score each candidate line by Jaccard similarity with the query.
        cmp, score = list(), list()
        for bucket in c_bucket:
            bucket_path = os.path.join(self.path, bucket)
            check_file(bucket_path)
            with open(bucket_path, 'r', encoding='utf-8') as infile:
                for inline in infile:
                    inline = inline.rstrip()
                    line = inline.split(':::')[0]
                    seg_list = list(self.seg.cut(line))
                    seg_list = list(
                        filter(lambda x: x not in self.stop_words, seg_list))
                    sc = jaccard(seg_sen, seg_list)
                    if sc < self.args.threshold:
                        continue
                    cmp.append(inline)
                    score.append(sc)

        # Sort by score, best first, and keep at most top_k results.
        zipped = sorted(zip(cmp, score), key=lambda x: x[1], reverse=True)
        right = None if self.args.top_k <= 0 else self.args.top_k
        for cp, sc in zipped[:right]:
            res.append(cp)
        return res
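# --- Illustrative sketch, not part of the original source ---
# `jaccard` is defined elsewhere in this project; it is assumed here to be
# the usual Jaccard similarity of two token lists, |A ∩ B| / |A ∪ B| over
# their sets. A minimal stand-in under that assumption:
#
#   def jaccard(a, b):
#       sa, sb = set(a), set(b)
#       return len(sa & sb) / len(sa | sb) if (sa | sb) else 0.0
#
# Hypothetical usage of Searcher, assuming --infile points at an index
# directory that contains p_bucket.pickle and the numbered bucket files:
#
#   searcher = Searcher()                 # args parsed by _get_parser()
#   hits = searcher.search('reset my password')
#   # -> highest-scoring indexed lines first, at most args.top_k of them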
    parser.add_argument('--lang', type=str, choices=['cn', 'en'],
                        default='cn', help='Segmentor language setting.')
    args = parser.parse_args()
    return args


def lstg(num, lst):
    # Yield successive chunks of `num` items from `lst`.
    for i in range(0, len(lst), num):
        yield lst[i:i + num]


args = _get_parser()
seg = Segmentor(args)
today = time.strftime("%Y%m%d", time.localtime(time.time()))

# stop-word cache
stop_words_cache = {}
jieba_cache = {}

# load stop words
stop_words = get_stop_words(args.stop_words) if os.path.exists(
    args.stop_words) else list()


def fenci(i):
    # Tokenize a batch of lines.
    result = {}
    for zzz in i:
        inline = zzz.rstrip()
    def run(self, questions):
        args = self._get_parser()

        # preliminary work
        ensure_dir(args.output)

        if args.name_len_update:
            line_cnt = line_counter(args.infile)
            args.name_len = len(str(line_cnt)) + 1

        clean_dir(args.output, args.name_len)
        # end preliminary work

        p_bucket = defaultdict(list)
        save_idx = 0
        id_name = '{0:0' + str(args.name_len) + 'd}'

        # load stop words
        stop_words = get_stop_words(args.stop_words) if os.path.exists(
            args.stop_words) else list()

        # load tokenizer
        seg = Segmentor(args)

        print('Splitting sentence into different clusters ...')
        infile = questions
        for inline in tqdm(infile):
            inline = inline.rstrip()
            line = inline.split(':::')[0]
            is_match = False
            seg_list = list(seg.cut(line))
            if stop_words:
                seg_list = list(
                    filter(lambda x: x not in stop_words, seg_list))
            for wd in seg_list:
                if is_match:
                    break
                w_bucket = p_bucket[wd]
                for bucket in w_bucket:
                    bucket_path = os.path.join(args.output, bucket)
                    check_file(bucket_path)
                    selected = sample_file(bucket_path, args.sample_number)
                    selected = list(map(lambda x: x.split(':::')[0], selected))
                    selected = list(map(lambda x: list(seg.cut(x)), selected))
                    # remove stop words
                    if stop_words:
                        filt_selected = list()
                        for sen in selected:
                            sen = list(
                                filter(lambda x: x not in stop_words, sen))
                            filt_selected.append(sen)
                        selected = filt_selected
                    # calculate similarity with each bucket
                    if all(jaccard(seg_list, cmp_list) > args.threshold
                           for cmp_list in selected):
                        is_match = True
                        with open(bucket_path, 'a',
                                  encoding='utf-8') as outfile:
                            outfile.write(line + '\n')
                        for w in seg_list:
                            if bucket not in p_bucket[w]:
                                p_bucket[w].append(bucket)
                        break
            if not is_match:
                # No similar bucket found: open a new one for this line.
                bucket_name = ('tmp' + id_name).format(save_idx)
                bucket_path = os.path.join(args.output, bucket_name)
                with open(bucket_path, 'a', encoding='utf-8') as outfile:
                    outfile.write(line + '\n')
                for w in seg_list:
                    p_bucket[w].append(bucket_name)
                save_idx += 1

        # sort and rename file
        file_list = os.listdir(args.output)
        file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
        cnt = dict()
        for file in file_list:
            file_path = os.path.join(args.output, file)
            cnt[file] = line_counter(file_path)

        sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
        name_map = dict()
        for idx, (file_name, times) in enumerate(sorted_cnt):
            origin_path = os.path.join(args.output, file_name)
            new_name = id_name.format(idx)
            new_path = os.path.join(args.output, new_name)
            os.rename(origin_path, new_path)
            name_map[file_name] = new_name

        for k, v in p_bucket.items():
            p_bucket[k] = list(map(lambda x: name_map[x], v))

        # merge all bucket files into one
        output_file = os.path.join(args.output, 'all_cluster.txt')
        try:
            if os.path.isfile(output_file):
                os.unlink(output_file)
        except Exception as e:
            print(e)

        file_list = os.listdir(args.output)
        fw = open(output_file, 'w+', encoding='utf-8')
        for file in file_list:
            with open(os.path.join(args.output, file),
                      encoding='utf-8') as f:
                for line in f.readlines():
                    # Prefix each question with its numeric cluster id.
                    fw.write(str(int(file)) + ',' + line)
        fw.close()

        df = pd.read_csv(output_file, names=['id', 'text'])
        df.columns = ['cluster_id', 'ques']
        print('All is well')
        # json.dumps(dict(ques=ques))

        # DataFrame format conversion: keep one representative question per
        # cluster id, e.g.
        #   df                           result
        #   0  aa                        aa  [aaa]
        #   0  aaa              =>       bb  []
        #   1  bb
        #   df_dict = {0: aa, 1: bb}
        df_dict = df.set_index('cluster_id').T.to_dict('records')[0]
        print(df_dict)

        result_dict = {}
        for cluster_id, ques in df_dict.items():
            # All questions in this cluster.
            li = df[df['cluster_id'] == cluster_id].ques.values.tolist()
            # if ques in li: li.remove(ques)
            result_dict[ques] = li

        my_list = [result_dict]
        my_df = pd.DataFrame(my_list).T
        my_df = my_df.reset_index()
        my_df.columns = ['ques', 'info']
        print(my_df)
        return my_df.to_json(orient="records", force_ascii=False)
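# --- Illustrative note, not part of the original source ---
# Based on the comment block above, the JSON returned by run() pairs each
# representative question with the other members of its cluster, e.g.:
#
#   [{"ques": "aa", "info": ["aaa"]}, {"ques": "bb", "info": []}]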
def find_non_stationary_clusters(args):
    if args['use_gpu']:
        print("Using CUDA" if torch.cuda.is_available() else "Using CPU")
        device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
    else:
        device = "cpu"

    # The cluster count is encoded in the folder name, e.g. "cn100-...".
    network_folder_name = args['folder'].split('/')[-1]
    tmp = re.search(r"cn(\d+)-", network_folder_name)
    n_clusters = int(tmp.group(1))

    save_folder = os.path.join(args['dest_root'], network_folder_name)
    if os.path.exists(
            os.path.join(save_folder, 'cluster_histogram_for_corr.npy')):
        print('{} already exists. skipping'.format(
            os.path.join(save_folder, 'cluster_histogram_for_corr.npy')))
        return

    check_mkdir(save_folder)

    with open(os.path.join(args['folder'], 'bestval.txt')) as f:
        best_val_dict_str = f.read()
        bestval = eval(best_val_dict_str.rstrip())

    # Network and weight loading
    model_config = model_configs.PspnetCityscapesConfig()
    net = model_config.init_network(
        n_classes=n_clusters,
        for_clustering=False,
        output_features=False,
        use_original_base=args['use_original_base']).to(device)
    net.load_state_dict(
        torch.load(os.path.join(args['folder'],
                                bestval['snapshot'] + '.pth')))  # load weights
    net.eval()

    # copy network file to save location
    copyfile(os.path.join(args['folder'], bestval['snapshot'] + '.pth'),
             os.path.join(save_folder, 'weights.pth'))
    if args['only_copy_weights']:
        print('Only copying weights')
        return

    # Data loading setup
    if args['corr_set'] == 'rc':
        corr_set_config = data_configs.RobotcarConfig()
    elif args['corr_set'] == 'cmu':
        corr_set_config = data_configs.CmuConfig()
    elif args['corr_set'] == 'both':
        corr_set_config1 = data_configs.CmuConfig()
        corr_set_config2 = data_configs.RobotcarConfig()

    sliding_crop_im = joint_transforms.SlidingCropImageOnly(
        713, args['stride_rate'])
    input_transform = model_config.input_transform
    pre_validation_transform = model_config.pre_validation_transform

    if args['corr_set'] == 'both':
        corr_set_val1 = correspondences.Correspondences(
            corr_set_config1.correspondence_path,
            corr_set_config1.correspondence_im_path,
            input_size=(713, 713),
            input_transform=None,
            joint_transform=None,
            listfile=corr_set_config1.correspondence_val_list_file)
        corr_set_val2 = correspondences.Correspondences(
            corr_set_config2.correspondence_path,
            corr_set_config2.correspondence_im_path,
            input_size=(713, 713),
            input_transform=None,
            joint_transform=None,
            listfile=corr_set_config2.correspondence_val_list_file)
        corr_set_val = merged.Merged([corr_set_val1, corr_set_val2])
    else:
        corr_set_val = correspondences.Correspondences(
            corr_set_config.correspondence_path,
            corr_set_config.correspondence_im_path,
            input_size=(713, 713),
            input_transform=None,
            joint_transform=None,
            listfile=corr_set_config.correspondence_val_list_file)

    # Segmentor
    segmentor = Segmentor(net, n_clusters, n_slices_per_pass=4)

    # save args
    open(os.path.join(save_folder, str(datetime.datetime.now()) + '.txt'),
         'w').write(str(args) + '\n\n')

    # Histograms of cluster ids at correspondence / non-correspondence pixels.
    cluster_histogram_for_correspondences = np.zeros((n_clusters, ),
                                                     dtype=np.int64)
    cluster_histogram_non_correspondences = np.zeros((n_clusters, ),
                                                     dtype=np.int64)

    for i in range(0, len(corr_set_val), args['step']):
        img1, img2, pts1, pts2, _ = corr_set_val[i]
        seg1 = segmentor.run_and_save(
            img1,
            None,
            pre_sliding_crop_transform=pre_validation_transform,
            input_transform=input_transform,
            sliding_crop=sliding_crop_im,
            use_gpu=args['use_gpu'])
        seg1 = np.array(seg1)

        # Mask of pixels that have a correspondence in the other image.
        # (np.bool is deprecated in modern NumPy; plain bool is equivalent.)
        corr_loc_mask = np.zeros(seg1.shape, dtype=bool)
        valid_inds = (pts1[0, :] >= 0) & (pts1[0, :] < seg1.shape[1]) & (
            pts1[1, :] >= 0) & (pts1[1, :] < seg1.shape[0])
        pts1 = pts1[:, valid_inds]
        for j in range(pts1.shape[1]):
            pt = pts1[:, j]
            corr_loc_mask[pt[1], pt[0]] = True

        cluster_ids_corr = seg1[corr_loc_mask]
        hist_tmp, _ = np.histogram(cluster_ids_corr,
                                   np.arange(n_clusters + 1))
        cluster_histogram_for_correspondences += hist_tmp

        cluster_ids_no_corr = seg1[~corr_loc_mask]
        hist_tmp, _ = np.histogram(cluster_ids_no_corr,
                                   np.arange(n_clusters + 1))
        cluster_histogram_non_correspondences += hist_tmp

        if ((i + 1) % 100) < args['step']:
            print('{}/{}'.format(i + 1, len(corr_set_val)))

    np.save(os.path.join(save_folder, 'cluster_histogram_for_corr.npy'),
            cluster_histogram_for_correspondences)
    np.save(os.path.join(save_folder, 'cluster_histogram_non_corr.npy'),
            cluster_histogram_non_correspondences)

    # A cluster counts as stationary when more than 1% of its pixels fall on
    # correspondence points.
    frac = cluster_histogram_for_correspondences / \
        (cluster_histogram_for_correspondences +
         cluster_histogram_non_correspondences)
    stationary_inds = np.argwhere(frac > 0.01)
    np.save(os.path.join(save_folder, 'stationary_inds.npy'),
            stationary_inds)
    print('{} stationary clusters out of {}'.format(
        len(stationary_inds), len(cluster_histogram_for_correspondences)))
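# --- Illustrative note, not part of the original source ---
# Reading the saved arrays back, a cluster i is stationary when
# frac[i] > 0.01, i.e. more than 1% of its pixels coincide with
# correspondence points:
#
#   for_corr = np.load('cluster_histogram_for_corr.npy')
#   non_corr = np.load('cluster_histogram_non_corr.npy')
#   frac = for_corr / (for_corr + non_corr)
#   stationary = np.argwhere(frac > 0.01)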
def segment_images_in_folder(network_file, img_folder, save_folder, args):
    # get current available device
    if args['use_gpu']:
        print("Using CUDA" if torch.cuda.is_available() else "Using CPU")
        device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
    else:
        device = "cpu"

    # Network and weight loading
    model_config = model_configs.PspnetCityscapesConfig()
    if 'n_classes' in args:
        print('Initializing model with %d classes' % args['n_classes'])
        net = model_config.init_network(
            n_classes=args['n_classes'],
            for_clustering=False,
            output_features=False,
            use_original_base=args['use_original_base']).to(device)
    else:
        net = model_config.init_network().to(device)

    print('load model ' + network_file)
    state_dict = torch.load(network_file,
                            map_location=lambda storage, loc: storage)
    # needed since we slightly changed the structure of the network in pspnet
    state_dict = rename_keys_to_match(state_dict)
    net.load_state_dict(state_dict)
    net.eval()

    # data loading
    input_transform = model_config.input_transform
    pre_validation_transform = model_config.pre_validation_transform
    # make sure crop size and stride same as during training
    sliding_crop = joint_transforms.SlidingCropImageOnly(
        713, args['sliding_transform_step'])

    check_mkdir(save_folder)
    t0 = time.time()

    # get all file names
    filenames_ims = list()
    filenames_segs = list()
    print('Scanning %s for images to segment.' % img_folder)
    for root, subdirs, files in os.walk(img_folder):
        filenames = [f for f in files if f.endswith(args['img_ext'])]
        if len(filenames) > 0:
            print('Found %d images in %s' % (len(filenames), root))
            seg_path = root.replace(img_folder, save_folder)
            check_mkdir(seg_path)
            filenames_ims += [os.path.join(root, f) for f in filenames]
            filenames_segs += [
                os.path.join(seg_path, f.replace(args['img_ext'], '.png'))
                for f in filenames
            ]

    # Create segmentor
    if net.n_classes == 19:  # This could be the 19 cityscapes classes
        segmentor = Segmentor(net,
                              net.n_classes,
                              colorize_fcn=cityscapes.colorize_mask,
                              n_slices_per_pass=args['n_slices_per_pass'])
    else:
        segmentor = Segmentor(net,
                              net.n_classes,
                              colorize_fcn=None,
                              n_slices_per_pass=args['n_slices_per_pass'])

    count = 1
    for im_file, save_path in zip(filenames_ims, filenames_segs):
        tnow = time.time()
        print("[%d/%d (%.1fs/%.1fs)] %s" %
              (count, len(filenames_ims), tnow - t0,
               (tnow - t0) / count * len(filenames_ims), im_file))
        segmentor.run_and_save(
            im_file,
            save_path,
            pre_sliding_crop_transform=pre_validation_transform,
            sliding_crop=sliding_crop,
            input_transform=input_transform,
            skip_if_seg_exists=True,
            use_gpu=args['use_gpu'])
        count += 1

    tend = time.time()
    print('Time: %f' % (tend - t0))
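# --- Illustrative sketch, not part of the original source ---
# Hypothetical invocation; the dict keys mirror the ones read above, and all
# paths and values are placeholders:
#
#   segment_images_in_folder(
#       'weights.pth', 'imgs/', 'segs/',
#       {'use_gpu': True, 'img_ext': '.jpg',
#        'sliding_transform_step': 2 / 3., 'n_slices_per_pass': 4})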
def main():
    args = _get_parser()

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)

    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1

    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(
        args.stop_words) else list()

    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    for line in tqdm(infile):
        line = line.rstrip()
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(
                            filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold
                       for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line + '\n')
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            # No similar bucket found: open a new one for this line.
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line + '\n')
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    infile.close()

    # sort and rename file
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    # Rename buckets so that 0 is the largest cluster, 1 the next, etc.
    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_path = os.path.join(args.output, id_name.format(idx))
        os.rename(origin_path, new_path)

    print('All is well')
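# --- Illustrative sketch, not part of the original source ---
# Hypothetical command lines; the script names are placeholders, but the
# flags (--infile, --output, --sim_th, --top_k, ...) match the options read
# by main() and Searcher above:
#
#   python cluster.py --infile questions.txt --output clusters/
#   python search.py --infile clusters/ --sim_th 0.5 --top_k 10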
    def run(self,
            net,
            optimizer,
            args,
            curr_iter,
            save_dir,
            f_handle,
            writer=None):
        # the following code is written assuming that batch size is 1
        net.eval()
        segmentor = Segmentor(net,
                              self.n_classes,
                              colorize_fcn=None,
                              n_slices_per_pass=10)

        confmat = np.zeros((self.n_classes, self.n_classes))
        for vi, data in enumerate(self.data_loader):
            img_slices, gt, slices_info = data
            gt.squeeze_(0)
            prediction_tmp = segmentor.run_on_slices(
                img_slices.squeeze_(0), slices_info.squeeze_(0))

            if prediction_tmp.shape != gt.size():
                # Resize the prediction to the label size with
                # nearest-neighbor interpolation so class ids are preserved.
                prediction_tmp = Image.fromarray(
                    prediction_tmp.astype(np.uint8)).convert('P')
                prediction_tmp = F.resize(prediction_tmp,
                                          gt.size(),
                                          interpolation=Image.NEAREST)

            acc, acc_cls, mean_iu, fwavacc, confmat, _ = evaluate_incremental(
                confmat, np.asarray(prediction_tmp), gt.numpy(),
                self.n_classes)

            str2write = 'validating: %d / %d' % (vi + 1,
                                                 len(self.data_loader))
            print(str2write)
            # f_handle.write(str2write + "\n")

        # Store confusion matrix
        confmatdir = os.path.join(save_dir, 'confmat')
        os.makedirs(confmatdir, exist_ok=True)
        with open(
                os.path.join(
                    confmatdir, self.extra_name_str + str(curr_iter) +
                    '_confmat.pkl'), 'wb') as confmat_file:
            pickle.dump(confmat, confmat_file)

        if self.save_snapshot:
            snapshot_name = 'iter_%d_acc_%.5f_acc-cls_%.5f_mean-iu_%.5f_fwavacc_%.5f_lr_%.10f' % (
                curr_iter, acc, acc_cls, mean_iu, fwavacc,
                optimizer.param_groups[1]['lr'])
            torch.save(net.state_dict(),
                       os.path.join(save_dir, snapshot_name + '.pth'))
            torch.save(optimizer.state_dict(),
                       os.path.join(save_dir,
                                    'opt_' + snapshot_name + '.pth'))

            if args['best_record']['mean_iu'] < mean_iu:
                args['best_record']['iter'] = curr_iter
                args['best_record']['acc'] = acc
                args['best_record']['acc_cls'] = acc_cls
                args['best_record']['mean_iu'] = mean_iu
                args['best_record']['fwavacc'] = fwavacc
                args['best_record']['snapshot'] = snapshot_name
                open(os.path.join(save_dir, 'bestval.txt'),
                     'w').write(str(args['best_record']) + '\n\n')

            str2write = '%s best record: [acc %.5f], [acc_cls %.5f], [mean_iu %.5f], [fwavacc %.5f]' % (
                self.extra_name_str, args['best_record']['acc'],
                args['best_record']['acc_cls'],
                args['best_record']['mean_iu'],
                args['best_record']['fwavacc'])
            print(str2write)
            f_handle.write(str2write + "\n")

        str2write = '%s [iter %d], [acc %.5f], [acc_cls %.5f], [mean_iu %.5f], [fwavacc %.5f]' % (
            self.extra_name_str, curr_iter, acc, acc_cls, mean_iu, fwavacc)
        print(str2write)
        f_handle.write(str2write + "\n")

        if writer is not None:
            writer.add_scalar(self.extra_name_str + ': acc', acc, curr_iter)
            writer.add_scalar(self.extra_name_str + ': acc_cls', acc_cls,
                              curr_iter)
            writer.add_scalar(self.extra_name_str + ': mean_iu', mean_iu,
                              curr_iter)
            writer.add_scalar(self.extra_name_str + ': fwavacc', fwavacc,
                              curr_iter)

        net.train()
        if 'freeze_bn' not in args or args['freeze_bn']:
            freeze_bn(net)

        return mean_iu
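# --- Illustrative note, not part of the original source ---
# run() returns the epoch's mean IoU, so a hypothetical training loop could
# use it for model selection or early stopping:
#
#   val_iu = validator.run(net, optimizer, args, curr_iter, save_dir, f_log)
#   if val_iu > best_iu:
#       best_iu = val_iu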