def main(args):
    #-----------------------------------------------------#
    #          2D/3D Convolutional Autoencoder            #
    #-----------------------------------------------------#
    if args.program == 'CAE':
        cae = CAE(input_dir=args.data_dir,
                  patch_size=ast.literal_eval(args.patch_size),
                  batch_size=args.batch_size,
                  test_size=args.test_size,
                  prepare_batches=args.prepare_batches)
        cae.prepare_data(args.sampler_type,
                         args.max_patches,
                         args.resample,
                         ast.literal_eval(args.patch_overlap),
                         args.min_lab_vox,
                         args.label_prob,
                         args.load_data)
        if args.model_dir is None:
            cae.train(args.epochs)
        cae.predict(args.model_dir)

    #-----------------------------------------------------#
    #                Patient classification               #
    #-----------------------------------------------------#
    """
    if args.program == 'AutSeg':
        asg = AutomaticSegmentation(model_name=args.model_name,
                                    patch_size=args.patch_size,
                                    patch_overlap=args.patch_overlap,
                                    input_dir=args.data_dir,
                                    model_dir=args.model_dir)
        asg.run()
        asg.run_postprocessing()
    """

    if args.program == 'CLUS':
        clustering = Clustering(num_iters=args.iterations,
                                num_clusters=args.num_clusters,
                                input_dir=args.data_dir)
        clustering.run()

    if args.program == 'FeEx':
        fe = FeatureExtraction(model_name=args.model_name,
                               patch_size=ast.literal_eval(args.patch_size),
                               patch_overlap=ast.literal_eval(args.patch_overlap),
                               num_clusters=args.num_clusters,
                               cluster_selection=args.cluster_selection,
                               resample=args.resample,
                               encoded_layer_num=args.encoded_layer_num,
                               model_dir=args.model_dir,
                               input_dir=args.data_dir)
        fe.run(batch_size=20)

    if args.program == 'SVM':
        svm = SvmClassifier(feature_dir=args.feature_dir,
                            ffr_dir=args.ffr_dir,
                            ffr_filename=args.ffr_filename,
                            input_dir=args.data_dir,
                            ffr_cut_off=args.ffr_cut_off,
                            test_size=args.test_size)
        svm.train()
        svm.predict()
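# A minimal sketch of the argparse wiring that could drive main(). The flag
# names mirror the attributes accessed above, but the types, defaults, and
# choices are assumptions, and most program-specific flags are omitted.
import argparse
import ast

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Run one stage of the pipeline')
    parser.add_argument('--program', choices=['CAE', 'CLUS', 'FeEx', 'SVM'],
                        required=True)
    parser.add_argument('--data_dir', required=True)
    parser.add_argument('--model_dir', default=None)
    parser.add_argument('--patch_size', default='(32, 32, 32)')    # parsed via ast.literal_eval
    parser.add_argument('--patch_overlap', default='(0, 0, 0)')    # parsed via ast.literal_eval
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--test_size', type=float, default=0.2)
    parser.add_argument('--epochs', type=int, default=100)
    main(parser.parse_args())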
def generate(self, keys, url):
    json_work("other_files/work_file.json", "w", [])  # reset the work file
    print(f'Keys received: {len(keys)}')
    if len(keys) > 0:
        # generate pre-templates from the keys, keeping unique stems only
        self.generate_pretmp(keys)
        print(f'Keys left after removing duplicates: {len(self.work_file)}')
        time.sleep(2)
        if len(self.work_file) > 0:
            with ThreadPoolExecutor(5) as executor:
                for _ in executor.map(self.template_generated, self.work_file):
                    pass
            work = json_work("other_files/work_file.json", "r")
            if len(work) > 0:
                gen_data = sorted(work, key=lambda x: x["frequency"]["basic"],
                                  reverse=True)
                json_work("other_files/work_file.json", "w", gen_data)
                gen_data += json_work("other_files/main.json", "r")
                gen_data = sorted(gen_data, key=lambda x: x["frequency"]["basic"],
                                  reverse=True)
                json_work("other_files/main.json", "w", gen_data)
                print(f"url {url} processed")
                clustering = Clustering(
                    json_work("other_files/work_file.json", "r"), url)
                clustering.run()
            else:
                print("Moving on to the next url")
                return
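# generate() delegates all file I/O to a json_work() helper that is not shown
# here. A plausible minimal implementation, assuming "r" returns the parsed
# JSON and "w" overwrites the file with the given payload (the repo's actual
# helper may differ):
import json

def json_work(path, mode, data=None):
    # "r": read and parse the JSON file; "w": rewrite it with `data`
    if mode == "r":
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)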
def run_pipeline(data, y, dataset_id, evaluate=False, verbose=False,
                 optimize_method=False, **kwargs):
    """
    Runs the basic pipeline: takes the input data, applies blocking and
    clustering, and stores the matching pairs.

    :param data: pd.DataFrame, the input dataset
    :param y: pd.DataFrame, the dataset with the actual pairs
    :param dataset_id: int, the id of the input dataset
    :param evaluate: boolean, whether evaluation should be run
    :param verbose: boolean, whether to log the evaluation scores
    :return: (pd.DataFrame, dict), the predicted matching pairs and the
        evaluation scores (empty if evaluate is False)
    """
    cluster_n = kwargs.get('cluster_num', 0)
    distance_threshold = kwargs.get('distance_threshold', 0)
    method = kwargs.get('clustering_method', 'agglomerative')
    encoding = kwargs.get('encoding', 'use')

    # blocking: instantiate the correct blocker based on the dataset
    if 'title' in data.columns:
        blocker = X2Blocker()
        # identify X3
        if optimize_method and ("source" in data.instance_id[0]):
            method = 'cosine'
    elif 'name' in data.columns:
        blocker = X4Blocker()
        data["title"] = data["name"]
        if optimize_method:
            method = 'agglomerative'
            cluster_n = 0
    else:
        raise ValueError("Please add a valid dataset id")

    blocker.fit(data=data)
    # blocks is of type [[instance_id]]: a list of lists of instance_ids
    # belonging to the same group; transform also returns the modified
    # data frame for further use
    blocks, data = blocker.transform()

    # apply clustering to each block to get the matching pairs
    clusters = list()
    print("Method: ", method)
    print("Encoding: ", encoding, "\n")
    cls = Clustering(method=method, cluster_n=cluster_n,
                     distance_threshold=distance_threshold, encoding=encoding)
    for block in blocks:
        if len(block) > 1:
            # filter data down to the instance_ids present in the block
            block_df = data[data['instance_id'].isin(block)]
            clusters_l = cls.run(block_df)
            for c in clusters_l:
                clusters.append(c)

    # create pairs from the clusters
    pairs_pred_df = create_pairs(clusters)

    dataset_scores = dict()
    if evaluate:
        # run performance evaluation
        dataset_scores = get_scores(actual=y, pred=pairs_pred_df)
        if verbose:
            print('Precision: {:.3f}'.format(dataset_scores['precision_score']))
            print('Recall: {:.3f}'.format(dataset_scores['recall_score']))
            print('F1 score: {:.3f}'.format(dataset_scores['f1_score']))
        dataset_scores['cluster_n'] = cluster_n
        dataset_scores['method'] = method
        dataset_scores['dataset'] = dataset_id
        dataset_scores['encoding'] = encoding
        dataset_scores['threshold'] = distance_threshold

    return pairs_pred_df, dataset_scores
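# create_pairs() is not defined in this file. A minimal sketch of the assumed
# behaviour: every unordered pair of instance_ids within one cluster becomes a
# predicted match. The column names below are hypothetical, not confirmed by
# the repo.
from itertools import combinations

import pandas as pd

def create_pairs(clusters):
    # expand each cluster into all unordered instance_id pairs
    pairs = [pair
             for cluster in clusters
             for pair in combinations(sorted(cluster), 2)]
    return pd.DataFrame(pairs, columns=['left_instance_id', 'right_instance_id'])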