class Searcher(object):
    def __init__(self, args=_get_parser()):
        # Load the inverted index (word -> bucket file names) built by the
        # clustering step.
        p_bucket_path = os.path.join(args.infile, 'p_bucket.pickle')
        with open(p_bucket_path, 'rb') as infile:
            self.p_bucket = pickle.load(infile)
        self.seg = Segmentor(args)
        self.path = args.infile
        self.sim_th = args.sim_th
        self.stop_words = get_stop_words(args.stop_words)
        self.args = args

    def search(self, sentence):
        """Return the indexed lines most similar to `sentence`, best first."""
        if not sentence or not isinstance(sentence, str):
            return None

        res = list()
        c_bucket = list()

        # Tokenize the query and drop stop words.
        seg_sen = list(self.seg.cut(sentence))
        seg_sen = list(filter(lambda x: x not in self.stop_words, seg_sen))

        # Collect every bucket that shares at least one word with the query.
        for w in seg_sen:
            if w in self.p_bucket:
                c_bucket += self.p_bucket[w]
        c_bucket = list(set(c_bucket))

        # Score each candidate line by Jaccard similarity with the query.
        cmp, score = list(), list()
        for bucket in c_bucket:
            bucket_path = os.path.join(self.path, bucket)
            check_file(bucket_path)
            with open(bucket_path, 'r', encoding='utf-8') as infile:
                for inline in infile:
                    inline = inline.rstrip()
                    line = inline.split(':::')[0]
                    seg_list = list(self.seg.cut(line))
                    seg_list = list(
                        filter(lambda x: x not in self.stop_words, seg_list))
                    sc = jaccard(seg_sen, seg_list)
                    if sc < self.args.threshold:
                        continue
                    cmp.append(inline)
                    score.append(sc)

        # Sort by score, best first, and keep at most top_k results.
        zipped = sorted(zip(cmp, score), key=lambda x: x[1], reverse=True)
        right = None if self.args.top_k <= 0 else self.args.top_k
        for cp, sc in zipped[:right]:
            res.append(cp)
        return res
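# --- Illustrative sketch, not part of the original source ---
# `jaccard` is defined elsewhere in this project; it is assumed here to be
# the usual Jaccard similarity of two token lists, |A ∩ B| / |A ∪ B| over
# their sets. A minimal stand-in under that assumption:
#
#   def jaccard(a, b):
#       sa, sb = set(a), set(b)
#       return len(sa & sb) / len(sa | sb) if (sa | sb) else 0.0
#
# Hypothetical usage of Searcher, assuming --infile points at an index
# directory that contains p_bucket.pickle and the numbered bucket files:
#
#   searcher = Searcher()                 # args parsed by _get_parser()
#   hits = searcher.search('reset my password')
#   # -> highest-scoring indexed lines first, at most args.top_k of them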
    parser.add_argument('--lang', type=str, choices=['cn', 'en'],
                        default='cn', help='Segmentor language setting.')
    args = parser.parse_args()
    return args


def lstg(num, lst):
    # Yield successive chunks of `num` items from `lst`.
    for i in range(0, len(lst), num):
        yield lst[i:i + num]


args = _get_parser()
seg = Segmentor(args)
today = time.strftime("%Y%m%d", time.localtime(time.time()))

# stop-word cache
stop_words_cache = {}
jieba_cache = {}

# load stop words
stop_words = get_stop_words(args.stop_words) if os.path.exists(
    args.stop_words) else list()


def fenci(i):
    # Tokenize a batch of lines.
    result = {}
    for zzz in i:
        inline = zzz.rstrip()
    def run(self, questions):
        args = self._get_parser()

        # preliminary work
        ensure_dir(args.output)

        if args.name_len_update:
            line_cnt = line_counter(args.infile)
            args.name_len = len(str(line_cnt)) + 1

        clean_dir(args.output, args.name_len)
        # end preliminary work

        p_bucket = defaultdict(list)
        save_idx = 0
        id_name = '{0:0' + str(args.name_len) + 'd}'

        # load stop words
        stop_words = get_stop_words(args.stop_words) if os.path.exists(
            args.stop_words) else list()

        # load tokenizer
        seg = Segmentor(args)

        print('Splitting sentence into different clusters ...')
        infile = questions
        for inline in tqdm(infile):
            inline = inline.rstrip()
            line = inline.split(':::')[0]
            is_match = False
            seg_list = list(seg.cut(line))
            if stop_words:
                seg_list = list(
                    filter(lambda x: x not in stop_words, seg_list))
            for wd in seg_list:
                if is_match:
                    break
                w_bucket = p_bucket[wd]
                for bucket in w_bucket:
                    bucket_path = os.path.join(args.output, bucket)
                    check_file(bucket_path)
                    selected = sample_file(bucket_path, args.sample_number)
                    selected = list(map(lambda x: x.split(':::')[0], selected))
                    selected = list(map(lambda x: list(seg.cut(x)), selected))
                    # remove stop words
                    if stop_words:
                        filt_selected = list()
                        for sen in selected:
                            sen = list(
                                filter(lambda x: x not in stop_words, sen))
                            filt_selected.append(sen)
                        selected = filt_selected
                    # calculate similarity with each bucket
                    if all(jaccard(seg_list, cmp_list) > args.threshold
                           for cmp_list in selected):
                        is_match = True
                        with open(bucket_path, 'a',
                                  encoding='utf-8') as outfile:
                            outfile.write(line + '\n')
                        for w in seg_list:
                            if bucket not in p_bucket[w]:
                                p_bucket[w].append(bucket)
                        break
            if not is_match:
                # No similar bucket found: open a new one for this line.
                bucket_name = ('tmp' + id_name).format(save_idx)
                bucket_path = os.path.join(args.output, bucket_name)
                with open(bucket_path, 'a', encoding='utf-8') as outfile:
                    outfile.write(line + '\n')
                for w in seg_list:
                    p_bucket[w].append(bucket_name)
                save_idx += 1

        # sort and rename file
        file_list = os.listdir(args.output)
        file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
        cnt = dict()
        for file in file_list:
            file_path = os.path.join(args.output, file)
            cnt[file] = line_counter(file_path)

        sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
        name_map = dict()
        for idx, (file_name, times) in enumerate(sorted_cnt):
            origin_path = os.path.join(args.output, file_name)
            new_name = id_name.format(idx)
            new_path = os.path.join(args.output, new_name)
            os.rename(origin_path, new_path)
            name_map[file_name] = new_name

        for k, v in p_bucket.items():
            p_bucket[k] = list(map(lambda x: name_map[x], v))

        # merge all bucket files into one
        output_file = os.path.join(args.output, 'all_cluster.txt')
        try:
            if os.path.isfile(output_file):
                os.unlink(output_file)
        except Exception as e:
            print(e)

        file_list = os.listdir(args.output)
        fw = open(output_file, 'w+', encoding='utf-8')
        for file in file_list:
            with open(os.path.join(args.output, file),
                      encoding='utf-8') as f:
                for line in f.readlines():
                    # Prefix each question with its numeric cluster id.
                    fw.write(str(int(file)) + ',' + line)
        fw.close()

        df = pd.read_csv(output_file, names=['id', 'text'])
        df.columns = ['cluster_id', 'ques']
        print('All is well')
        # json.dumps(dict(ques=ques))

        # DataFrame format conversion: keep one representative question per
        # cluster id, e.g.
        #   df                           result
        #   0  aa                        aa  [aaa]
        #   0  aaa              =>       bb  []
        #   1  bb
        #   df_dict = {0: aa, 1: bb}
        df_dict = df.set_index('cluster_id').T.to_dict('records')[0]
        print(df_dict)

        result_dict = {}
        for cluster_id, ques in df_dict.items():
            # All questions in this cluster.
            li = df[df['cluster_id'] == cluster_id].ques.values.tolist()
            # if ques in li: li.remove(ques)
            result_dict[ques] = li

        my_list = [result_dict]
        my_df = pd.DataFrame(my_list).T
        my_df = my_df.reset_index()
        my_df.columns = ['ques', 'info']
        print(my_df)
        return my_df.to_json(orient="records", force_ascii=False)
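# --- Illustrative note, not part of the original source ---
# Based on the comment block above, the JSON returned by run() pairs each
# representative question with the other members of its cluster, e.g.:
#
#   [{"ques": "aa", "info": ["aaa"]}, {"ques": "bb", "info": []}]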
def find_non_stationary_clusters(args):
    if args['use_gpu']:
        print("Using CUDA" if torch.cuda.is_available() else "Using CPU")
        device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
    else:
        device = "cpu"

    # The cluster count is encoded in the folder name, e.g. "cn100-...".
    network_folder_name = args['folder'].split('/')[-1]
    tmp = re.search(r"cn(\d+)-", network_folder_name)
    n_clusters = int(tmp.group(1))

    save_folder = os.path.join(args['dest_root'], network_folder_name)
    if os.path.exists(
            os.path.join(save_folder, 'cluster_histogram_for_corr.npy')):
        print('{} already exists. skipping'.format(
            os.path.join(save_folder, 'cluster_histogram_for_corr.npy')))
        return

    check_mkdir(save_folder)

    with open(os.path.join(args['folder'], 'bestval.txt')) as f:
        best_val_dict_str = f.read()
        bestval = eval(best_val_dict_str.rstrip())

    # Network and weight loading
    model_config = model_configs.PspnetCityscapesConfig()
    net = model_config.init_network(
        n_classes=n_clusters,
        for_clustering=False,
        output_features=False,
        use_original_base=args['use_original_base']).to(device)
    net.load_state_dict(
        torch.load(os.path.join(args['folder'],
                                bestval['snapshot'] + '.pth')))  # load weights
    net.eval()

    # copy network file to save location
    copyfile(os.path.join(args['folder'], bestval['snapshot'] + '.pth'),
             os.path.join(save_folder, 'weights.pth'))
    if args['only_copy_weights']:
        print('Only copying weights')
        return

    # Data loading setup
    if args['corr_set'] == 'rc':
        corr_set_config = data_configs.RobotcarConfig()
    elif args['corr_set'] == 'cmu':
        corr_set_config = data_configs.CmuConfig()
    elif args['corr_set'] == 'both':
        corr_set_config1 = data_configs.CmuConfig()
        corr_set_config2 = data_configs.RobotcarConfig()

    sliding_crop_im = joint_transforms.SlidingCropImageOnly(
        713, args['stride_rate'])
    input_transform = model_config.input_transform
    pre_validation_transform = model_config.pre_validation_transform

    if args['corr_set'] == 'both':
        corr_set_val1 = correspondences.Correspondences(
            corr_set_config1.correspondence_path,
            corr_set_config1.correspondence_im_path,
            input_size=(713, 713),
            input_transform=None,
            joint_transform=None,
            listfile=corr_set_config1.correspondence_val_list_file)
        corr_set_val2 = correspondences.Correspondences(
            corr_set_config2.correspondence_path,
            corr_set_config2.correspondence_im_path,
            input_size=(713, 713),
            input_transform=None,
            joint_transform=None,
            listfile=corr_set_config2.correspondence_val_list_file)
        corr_set_val = merged.Merged([corr_set_val1, corr_set_val2])
    else:
        corr_set_val = correspondences.Correspondences(
            corr_set_config.correspondence_path,
            corr_set_config.correspondence_im_path,
            input_size=(713, 713),
            input_transform=None,
            joint_transform=None,
            listfile=corr_set_config.correspondence_val_list_file)

    # Segmentor
    segmentor = Segmentor(net, n_clusters, n_slices_per_pass=4)

    # save args
    open(os.path.join(save_folder, str(datetime.datetime.now()) + '.txt'),
         'w').write(str(args) + '\n\n')

    # Histograms of cluster ids at correspondence / non-correspondence pixels.
    cluster_histogram_for_correspondences = np.zeros((n_clusters, ),
                                                     dtype=np.int64)
    cluster_histogram_non_correspondences = np.zeros((n_clusters, ),
                                                     dtype=np.int64)

    for i in range(0, len(corr_set_val), args['step']):
        img1, img2, pts1, pts2, _ = corr_set_val[i]
        seg1 = segmentor.run_and_save(
            img1,
            None,
            pre_sliding_crop_transform=pre_validation_transform,
            input_transform=input_transform,
            sliding_crop=sliding_crop_im,
            use_gpu=args['use_gpu'])
        seg1 = np.array(seg1)

        # Mask of pixels that have a correspondence in the other image.
        # (np.bool is deprecated in modern NumPy; plain bool is equivalent.)
        corr_loc_mask = np.zeros(seg1.shape, dtype=bool)
        valid_inds = (pts1[0, :] >= 0) & (pts1[0, :] < seg1.shape[1]) & (
            pts1[1, :] >= 0) & (pts1[1, :] < seg1.shape[0])
        pts1 = pts1[:, valid_inds]
        for j in range(pts1.shape[1]):
            pt = pts1[:, j]
            corr_loc_mask[pt[1], pt[0]] = True

        cluster_ids_corr = seg1[corr_loc_mask]
        hist_tmp, _ = np.histogram(cluster_ids_corr,
                                   np.arange(n_clusters + 1))
        cluster_histogram_for_correspondences += hist_tmp

        cluster_ids_no_corr = seg1[~corr_loc_mask]
        hist_tmp, _ = np.histogram(cluster_ids_no_corr,
                                   np.arange(n_clusters + 1))
        cluster_histogram_non_correspondences += hist_tmp

        if ((i + 1) % 100) < args['step']:
            print('{}/{}'.format(i + 1, len(corr_set_val)))

    np.save(os.path.join(save_folder, 'cluster_histogram_for_corr.npy'),
            cluster_histogram_for_correspondences)
    np.save(os.path.join(save_folder, 'cluster_histogram_non_corr.npy'),
            cluster_histogram_non_correspondences)

    # A cluster counts as stationary when more than 1% of its pixels fall on
    # correspondence points.
    frac = cluster_histogram_for_correspondences / \
        (cluster_histogram_for_correspondences +
         cluster_histogram_non_correspondences)
    stationary_inds = np.argwhere(frac > 0.01)
    np.save(os.path.join(save_folder, 'stationary_inds.npy'),
            stationary_inds)
    print('{} stationary clusters out of {}'.format(
        len(stationary_inds), len(cluster_histogram_for_correspondences)))
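# --- Illustrative note, not part of the original source ---
# Reading the saved arrays back, a cluster i is stationary when
# frac[i] > 0.01, i.e. more than 1% of its pixels coincide with
# correspondence points:
#
#   for_corr = np.load('cluster_histogram_for_corr.npy')
#   non_corr = np.load('cluster_histogram_non_corr.npy')
#   frac = for_corr / (for_corr + non_corr)
#   stationary = np.argwhere(frac > 0.01)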
def segment_images_in_folder(network_file, img_folder, save_folder, args):
    # get current available device
    if args['use_gpu']:
        print("Using CUDA" if torch.cuda.is_available() else "Using CPU")
        device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
    else:
        device = "cpu"

    # Network and weight loading
    model_config = model_configs.PspnetCityscapesConfig()
    if 'n_classes' in args:
        print('Initializing model with %d classes' % args['n_classes'])
        net = model_config.init_network(
            n_classes=args['n_classes'],
            for_clustering=False,
            output_features=False,
            use_original_base=args['use_original_base']).to(device)
    else:
        net = model_config.init_network().to(device)

    print('load model ' + network_file)
    state_dict = torch.load(network_file,
                            map_location=lambda storage, loc: storage)
    # needed since we slightly changed the structure of the network in pspnet
    state_dict = rename_keys_to_match(state_dict)
    net.load_state_dict(state_dict)
    net.eval()

    # data loading
    input_transform = model_config.input_transform
    pre_validation_transform = model_config.pre_validation_transform
    # make sure crop size and stride same as during training
    sliding_crop = joint_transforms.SlidingCropImageOnly(
        713, args['sliding_transform_step'])

    check_mkdir(save_folder)
    t0 = time.time()

    # get all file names
    filenames_ims = list()
    filenames_segs = list()
    print('Scanning %s for images to segment.' % img_folder)
    for root, subdirs, files in os.walk(img_folder):
        filenames = [f for f in files if f.endswith(args['img_ext'])]
        if len(filenames) > 0:
            print('Found %d images in %s' % (len(filenames), root))
            seg_path = root.replace(img_folder, save_folder)
            check_mkdir(seg_path)
            filenames_ims += [os.path.join(root, f) for f in filenames]
            filenames_segs += [
                os.path.join(seg_path, f.replace(args['img_ext'], '.png'))
                for f in filenames
            ]

    # Create segmentor
    if net.n_classes == 19:  # This could be the 19 cityscapes classes
        segmentor = Segmentor(net,
                              net.n_classes,
                              colorize_fcn=cityscapes.colorize_mask,
                              n_slices_per_pass=args['n_slices_per_pass'])
    else:
        segmentor = Segmentor(net,
                              net.n_classes,
                              colorize_fcn=None,
                              n_slices_per_pass=args['n_slices_per_pass'])

    count = 1
    for im_file, save_path in zip(filenames_ims, filenames_segs):
        tnow = time.time()
        print("[%d/%d (%.1fs/%.1fs)] %s" %
              (count, len(filenames_ims), tnow - t0,
               (tnow - t0) / count * len(filenames_ims), im_file))
        segmentor.run_and_save(
            im_file,
            save_path,
            pre_sliding_crop_transform=pre_validation_transform,
            sliding_crop=sliding_crop,
            input_transform=input_transform,
            skip_if_seg_exists=True,
            use_gpu=args['use_gpu'])
        count += 1

    tend = time.time()
    print('Time: %f' % (tend - t0))
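# --- Illustrative sketch, not part of the original source ---
# Hypothetical invocation; the dict keys mirror the ones read above, and all
# paths and values are placeholders:
#
#   segment_images_in_folder(
#       'weights.pth', 'imgs/', 'segs/',
#       {'use_gpu': True, 'img_ext': '.jpg',
#        'sliding_transform_step': 2 / 3., 'n_slices_per_pass': 4})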
def main():
    args = _get_parser()

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)

    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1

    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(
        args.stop_words) else list()

    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    for line in tqdm(infile):
        line = line.rstrip()
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(
                            filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold
                       for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line + '\n')
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            # No similar bucket found: open a new one for this line.
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line + '\n')
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    infile.close()

    # sort and rename file
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    # Rename buckets so that 0 is the largest cluster, 1 the next, etc.
    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_path = os.path.join(args.output, id_name.format(idx))
        os.rename(origin_path, new_path)

    print('All is well')
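# --- Illustrative sketch, not part of the original source ---
# Hypothetical command lines; the script names are placeholders, but the
# flags (--infile, --output, --sim_th, --top_k, ...) match the options read
# by main() and Searcher above:
#
#   python cluster.py --infile questions.txt --output clusters/
#   python search.py --infile clusters/ --sim_th 0.5 --top_k 10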
    def run(self,
            net,
            optimizer,
            args,
            curr_iter,
            save_dir,
            f_handle,
            writer=None):
        # the following code is written assuming that batch size is 1
        net.eval()
        segmentor = Segmentor(net,
                              self.n_classes,
                              colorize_fcn=None,
                              n_slices_per_pass=10)

        confmat = np.zeros((self.n_classes, self.n_classes))
        for vi, data in enumerate(self.data_loader):
            img_slices, gt, slices_info = data
            gt.squeeze_(0)
            prediction_tmp = segmentor.run_on_slices(
                img_slices.squeeze_(0), slices_info.squeeze_(0))

            if prediction_tmp.shape != gt.size():
                # Resize the prediction to the label size with
                # nearest-neighbor interpolation so class ids are preserved.
                prediction_tmp = Image.fromarray(
                    prediction_tmp.astype(np.uint8)).convert('P')
                prediction_tmp = F.resize(prediction_tmp,
                                          gt.size(),
                                          interpolation=Image.NEAREST)

            acc, acc_cls, mean_iu, fwavacc, confmat, _ = evaluate_incremental(
                confmat, np.asarray(prediction_tmp), gt.numpy(),
                self.n_classes)

            str2write = 'validating: %d / %d' % (vi + 1,
                                                 len(self.data_loader))
            print(str2write)
            # f_handle.write(str2write + "\n")

        # Store confusion matrix
        confmatdir = os.path.join(save_dir, 'confmat')
        os.makedirs(confmatdir, exist_ok=True)
        with open(
                os.path.join(
                    confmatdir, self.extra_name_str + str(curr_iter) +
                    '_confmat.pkl'), 'wb') as confmat_file:
            pickle.dump(confmat, confmat_file)

        if self.save_snapshot:
            snapshot_name = 'iter_%d_acc_%.5f_acc-cls_%.5f_mean-iu_%.5f_fwavacc_%.5f_lr_%.10f' % (
                curr_iter, acc, acc_cls, mean_iu, fwavacc,
                optimizer.param_groups[1]['lr'])
            torch.save(net.state_dict(),
                       os.path.join(save_dir, snapshot_name + '.pth'))
            torch.save(optimizer.state_dict(),
                       os.path.join(save_dir,
                                    'opt_' + snapshot_name + '.pth'))

            if args['best_record']['mean_iu'] < mean_iu:
                args['best_record']['iter'] = curr_iter
                args['best_record']['acc'] = acc
                args['best_record']['acc_cls'] = acc_cls
                args['best_record']['mean_iu'] = mean_iu
                args['best_record']['fwavacc'] = fwavacc
                args['best_record']['snapshot'] = snapshot_name
                open(os.path.join(save_dir, 'bestval.txt'),
                     'w').write(str(args['best_record']) + '\n\n')

            str2write = '%s best record: [acc %.5f], [acc_cls %.5f], [mean_iu %.5f], [fwavacc %.5f]' % (
                self.extra_name_str, args['best_record']['acc'],
                args['best_record']['acc_cls'],
                args['best_record']['mean_iu'],
                args['best_record']['fwavacc'])
            print(str2write)
            f_handle.write(str2write + "\n")

        str2write = '%s [iter %d], [acc %.5f], [acc_cls %.5f], [mean_iu %.5f], [fwavacc %.5f]' % (
            self.extra_name_str, curr_iter, acc, acc_cls, mean_iu, fwavacc)
        print(str2write)
        f_handle.write(str2write + "\n")

        if writer is not None:
            writer.add_scalar(self.extra_name_str + ': acc', acc, curr_iter)
            writer.add_scalar(self.extra_name_str + ': acc_cls', acc_cls,
                              curr_iter)
            writer.add_scalar(self.extra_name_str + ': mean_iu', mean_iu,
                              curr_iter)
            writer.add_scalar(self.extra_name_str + ': fwavacc', fwavacc,
                              curr_iter)

        net.train()
        if 'freeze_bn' not in args or args['freeze_bn']:
            freeze_bn(net)

        return mean_iu
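# --- Illustrative note, not part of the original source ---
# run() returns the epoch's mean IoU, so a hypothetical training loop could
# use it for model selection or early stopping:
#
#   val_iu = validator.run(net, optimizer, args, curr_iter, save_dir, f_log)
#   if val_iu > best_iu:
#       best_iu = val_iu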