def factorize(data, seed, post, norm, gt, rank, max_iter=200):
    """
    NMF factorization: data = W*H.

    The input gt (ground truth) is only used to monitor performance at each
    iteration. Note: since computing the cost function is too slow, the number
    of iterations is used as the stopping criterion, for efficiency.

    Return: W (m*k) and H (k*n) matrices.
    """
    V = data
    W, H = initialize(V, rank, seed=seed, norm=norm)
    iteration = 0
    while iteration <= max_iter:
        targets = dl.get_targets(W.T, post)
        # Optional debug: count the number of items in each cluster.
        # clusters = np.unique(targets)
        # count_arr = [0 for _ in range(len(clusters))]
        # for c in targets:
        #     count_arr[c] += 1
        # print(sorted(count_arr))
        if gt is not None:
            A = metrics.accuracy(gt, targets)
            F1 = metrics.f_measure(gt, targets)
            # print("Iter = %d, Acc = %f, F1 = %f" % (iteration, A, F1))
        W, H = euclidean_update(V, W, H, norm)
        W, H = _adjustment(W, H)
        iteration += 1
    return W, H
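# initialize, euclidean_update, _adjustment and dl.get_targets are project-specific
# helpers defined elsewhere. Purely as an illustration of the technique, a standard
# Euclidean-loss (Lee-Seung) NMF multiplicative update step could look like the
# sketch below; this is an assumption about what euclidean_update does, not its
# actual code:
import numpy as np

def _euclidean_update_sketch(V, W, H, eps=1e-9):
    """One multiplicative update step for ||V - WH||_F^2 (illustrative only)."""
    H = H * (W.T @ V) / (W.T @ W @ H + eps)   # update H with W fixed
    W = W * (V @ H.T) / (W @ H @ H.T + eps)   # update W with H fixed
    return W, H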
def calc_performance(self, segmentations, ground_truths, verbose=False):
    num_truths = len(ground_truths)
    f_per_seg = []
    c_per_seg = []
    for i, seg in enumerate(segmentations):
        k = self.k_means[i]
        f = 0
        c = 0
        for j in range(num_truths):
            fm = metrics.f_measure(seg, ground_truths[j])
            # Note: v_measure_score returns the V-measure of the two labelings.
            cond_entropy = v_measure_score(seg.flat, ground_truths[j].flat)
            f += fm
            c += cond_entropy
            if verbose:
                print("k:{} vs ground-truth:{} => f={:.4f}, c={:.4f}".format(
                    k, j + 1, fm, cond_entropy))
        f /= num_truths
        c /= num_truths
        f_per_seg.append(f)
        c_per_seg.append(c)
        print("k={}: Avg. f-measure={:.4f}, Avg. conditional entropy={:.4f}\n"
              .format(k, f, c))
    return f_per_seg, c_per_seg
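# A minimal, hedged check of the sklearn part used above; metrics.f_measure is a
# project-specific helper, while v_measure_score comes from sklearn.metrics and
# compares two flattened label arrays (the segmentations below are made up):
import numpy as np
from sklearn.metrics import v_measure_score

seg = np.array([[0, 0, 1], [1, 1, 1]])     # hypothetical predicted segmentation
gt = np.array([[0, 0, 1], [1, 1, 0]])      # hypothetical ground-truth segmentation
print(v_measure_score(seg.flat, gt.flat))  # score in [0, 1], higher is better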
def evaluation_scores(groundtruth, labels_pred):
    """
    Evaluate the predicted clustering against the ground truth.

    :param groundtruth (type: list): the ground-truth (GT) cluster assignment.
        Each element denotes an item's GT cluster_id.
    :param labels_pred (type: list): the predicted cluster assignment.
        Each element denotes an item's predicted cluster_id.
    """
    NMI = metrics.normalized_mutual_info_score(groundtruth, labels_pred)
    A = metrics.accuracy(groundtruth, labels_pred)
    F1 = metrics.f_measure(groundtruth, labels_pred)
    P = metrics.purity(groundtruth, labels_pred)
    RI = metrics.random_index(groundtruth, labels_pred)
    ARI = metrics.adjusted_rand_score(groundtruth, labels_pred)
    map_pairs = metrics.get_map_pairs(groundtruth, labels_pred)
    return NMI, A, F1, P, RI, ARI, map_pairs
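# Minimal usage sketch, assuming `metrics` here is the project's own metrics module
# (accuracy, f_measure, purity, random_index and get_map_pairs are not sklearn
# functions); the label lists below are made up:
groundtruth = [0, 0, 1, 1, 2, 2]
labels_pred = [1, 1, 0, 0, 2, 2]
NMI, A, F1, P, RI, ARI, map_pairs = evaluation_scores(groundtruth, labels_pred)
print("NMI=%.4f, Acc=%.4f, F1=%.4f, Purity=%.4f" % (NMI, A, F1, P))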
def conmf_factorize(method, datas, weights, regu_weights, seed, post, norm,
                    max_iter, rank, gt=None):
    """
    Factorization process of CoNMF.

    :param max_iter (type: int): maximum number of iterations of the CoNMF
        update rules.
    :param rank (type: int): number of latent factors in the NMF factorization.
        For clustering, it is typically set to the number of clusters.
    Other parameters have the same meaning as in conmf().
    """
    if method not in ["pair-wise", "cluster-wise"]:
        print("Error! Method not in [pair-wise, cluster-wise]!")
        return None
    Ws, Hs = conmf_initialize(datas, rank, seed, weights, norm)
    targets, As, F1s = [], [], []
    iter_num = 0
    while iter_num <= max_iter:
        targets = [dl.get_targets(W.T, post) for W in Ws]
        As = ["{0:.4f}".format(metrics.accuracy(gt, target)) for target in targets]
        F1s = ["{0:.4f}".format(metrics.f_measure(gt, target)) for target in targets]
        if iter_num == 0:
            print("\t\t CoNMF Inits \t Acc = %s;\t F1 = %s " % (str(As), str(F1s)))
        # print("\t\t Iter = %d: \t Acc = %s;\t F1 = %s " % (iter_num, str(As), str(F1s)))
        Ws, Hs = conmf_update(datas, Ws, Hs, weights, regu_weights, norm, method)
        # cost = conmf_cost(Vs, Ws, Hs, weights, mutual_weights, norm, method)
        if iter_num == max_iter:
            print("\t\t CoNMF Ends \t Acc = %s;\t F1 = %s " % (str(As), str(F1s)))
        iter_num += 1
    return Ws, Hs
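# conmf_update is defined elsewhere in the project. As a rough, hedged sketch only:
# if the pair-wise variant minimizes
#   sum_s w_s * ||V_s - W_s H_s||_F^2  +  sum_{s,t} r_st * ||W_s - W_t||_F^2
# (this objective is an assumption here, not taken from this file), a multiplicative
# update of one view's W would look roughly like:
import numpy as np

def _pairwise_W_update_sketch(V_s, W_s, H_s, w_s, others, eps=1e-9):
    """others: list of (r_st, W_t) pairs for the remaining views (illustrative only)."""
    num = w_s * (V_s @ H_s.T) + sum(r * W_t for r, W_t in others)
    den = w_s * (W_s @ H_s @ H_s.T) + sum(r for r, _ in others) * W_s + eps
    return W_s * num / den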
def results_to_metrics(results, methods, ref_motifs):
    _, _, ref_labels = motif.unpack_motif(ref_motifs)

    metric_dict = dict.fromkeys(methods)
    for m in methods:
        obs_motifs = results[m]
        _, _, obs_labels = motif.unpack_motif(obs_motifs)

        this_edit = metrics.edit_distance(obs_labels, ref_labels)
        this_recall = metrics.recall(obs_motifs, ref_motifs)
        this_precis = metrics.precision(obs_motifs, ref_motifs)
        this_f = metrics.f_measure(obs_motifs, ref_motifs)
        this_bm = metrics.boundary_distance(obs_motifs, ref_motifs)

        metric_dict[m] = [this_edit, this_recall, this_precis, this_f, this_bm]

    return metric_dict
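# Hedged usage sketch; `results`, `methods` and `ref_motifs` come from the rest of
# this project (the motif and metrics modules are project-specific), so the call
# below is illustrative only:
# metric_dict = results_to_metrics(results, methods=["method_a", "method_b"],
#                                  ref_motifs=ref_motifs)
# for m, (edit, recall, precision, f, boundary) in metric_dict.items():
#     print(f"{m}: edit={edit}, R={recall:.3f}, P={precision:.3f}, "
#           f"F={f:.3f}, boundary={boundary:.3f}")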
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_file_path", default="data/train.csv",
                        help="training file path")
    parser.add_argument("--test_file_path", default="data/test.csv",
                        help="testing file path")
    parser.add_argument("--approach", default="SVD",
                        help="Baseline | SVD | SlopeOne | NMF | CoClustering")
    parser.add_argument("--output_ranking_file", default="ranking",
                        help="output ranking for test")
    bsl_options = {'method': 'sgd', 'n_epochs': 20, 'reg_u': 100, 'reg_i': 50}
    options = {
        "Baseline": BaselineOnly(bsl_options, verbose=True),
        "SVD": SVD(verbose=True, n_factors=20, n_epochs=3),
        "SlopeOne": SlopeOne(),
        "NMF": NMF(),
        "CoClustering": CoClustering()
    }
    args = parser.parse_args()
    reader = Reader(line_format='user item rating timestamp', sep='\t')
    algo = options[args.approach]

    train_data = Dataset.load_from_file(args.train_file_path, reader=reader)
    test_data = Dataset.load_from_file(args.test_file_path, reader=reader)
    train_set = train_data.build_full_trainset()
    test_set = test_data.build_full_trainset().build_testset()

    print("training....")
    algo.fit(train_set)
    print("testing...")
    predictions = algo.test(test_set)
    accuracy.mae(predictions, verbose=True)
    accuracy.rmse(predictions, verbose=True)

    ### Extra Credit
    output_ranking(predictions,
                   args.output_ranking_file + "_" + args.approach + ".out")
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=2.5)
    print("Precision:",
          sum(prec for prec in precisions.values()) / len(precisions))
    print("Recall:",
          sum(rec for rec in recalls.values()) / len(recalls))
    print("F-measure:", f_measure(precisions, recalls))
    print("conversion_rate:", get_conversion_rate(predictions, k=10))
    print("ndcg:", get_ndcg(predictions, k_highest_scores=10))
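# precision_recall_at_k, f_measure, get_conversion_rate, get_ndcg and output_ranking
# are helpers defined elsewhere in this project. Purely as a hedged guess at how the
# F-measure is combined from the per-user precision/recall dicts (harmonic mean of
# the two averages), it might look like:
def f_measure_sketch(precisions, recalls):
    p = sum(precisions.values()) / len(precisions)
    r = sum(recalls.values()) / len(recalls)
    return 2 * p * r / (p + r) if (p + r) > 0 else 0.0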
def compute_ovr_metric(annotations: Dict[int, List[int]], n_obs: int,
                       metric: Metric) -> Dict[int, float]:
    """Compute the one-vs-rest annotator score for each annotator in the
    provided dictionary of annotations on the given metric.

    Parameters
    ----------
    annotations : Dict[int, List[int]]
        Mapping from annotator ID to a list of change point indices.
    n_obs : int
        Length of the time series in question. This is needed for the covering
        metric.
    metric : Metric
        The metric to use for the evaluation.

    Returns
    -------
    ovr_scores : Dict[int, float]
        Mapping from the annotator ID to the one-vs-rest annotator score on
        the chosen metric.
    """
    ovr = {}
    for j in annotations:
        others = [k for k in annotations if k != j]
        Ak = {u: annotations[u] for u in others}
        X = annotations[j]
        if metric == Metric.f1:
            ovr[j] = f_measure(Ak, X)
        elif metric == Metric.cover:
            ovr[j] = covering(Ak, X, n_obs)
        else:
            raise ValueError(f"Unknown metric: {metric}")
    return ovr
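# Minimal usage sketch with made-up annotations (annotator ID -> change point
# indices); Metric, f_measure and covering are this module's own definitions:
annotations = {1: [50, 150], 2: [55], 3: [45, 150, 300]}
ovr_f1 = compute_ovr_metric(annotations, n_obs=400, metric=Metric.f1)
ovr_cover = compute_ovr_metric(annotations, n_obs=400, metric=Metric.cover)
print(ovr_f1, ovr_cover)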
def main():
    args = parse_args()
    dataset = load_json(args.dataset_file)
    annotations = load_annotations(args.annotation_file, dataset["name"])

    out = {
        "dataset": dataset["name"],
        "dataset_nobs": dataset["n_obs"],
        "dataset_ndim": dataset["n_dim"],
        "annotations": annotations,
        "results": {},
    }

    data_results = next(
        (d for d in os.listdir(args.result_dir) if d == dataset["name"]), None)
    if data_results is None:
        print(
            "Couldn't find the result directory for dataset %s" % dataset["name"],
            file=sys.stderr,
        )
        raise SystemExit(1)

    dataset_dir = os.path.join(args.result_dir, data_results)
    for method in sorted(os.listdir(dataset_dir)):
        method_dir = os.path.join(dataset_dir, method)
        for result_file in sorted(os.listdir(method_dir)):
            # print("Processing result file: %s" % result_file)
            fname = os.path.join(method_dir, result_file)
            result = load_json(fname)
            if method not in out["results"]:
                out["results"][method] = []
            if result["status"].lower() == "success":
                locations = clean_cps(result["result"]["cplocations"], dataset)
                f1, precision, recall = f_measure(annotations, locations,
                                                  return_PR=True)
                n_obs = dataset["n_obs"]
                cover = covering(annotations, locations, n_obs)
                scores = {
                    "f1": f1,
                    "precision": precision,
                    "recall": recall,
                    "cover": cover,
                }
            else:
                locations = None
                scores = None
            out["results"][method].append({
                "parameters": result["parameters"],
                "task_file": result_file,
                "cplocations": locations,
                "scores": scores,
                "status": result["status"],
            })

    if args.output_file:
        with open(args.output_file, "w") as fp:
            json.dump(out, fp, indent="\t")
    else:
        print(json.dumps(out, indent="\t"))
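# For reference, the emitted JSON mirrors the `out` dict built above; roughly
# (all values here are hypothetical):
# {
#   "dataset": "...", "dataset_nobs": 400, "dataset_ndim": 1,
#   "annotations": {...},
#   "results": {
#     "<method>": [
#       {"parameters": {...}, "task_file": "...", "cplocations": [50, 150],
#        "scores": {"f1": 0.8, "precision": 0.75, "recall": 0.86, "cover": 0.7},
#        "status": "SUCCESS"}
#     ]
#   }
# }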
def run():
    while True:
        trial = pull_pending()
        if trial is None:
            break

        params = eval(trial['Parameters'])
        logging.info(trial)

        dataset = load(trial['Dataset'])
        fold = int(trial['Fold']) - 1
        (X_train, y_train), (X_test, y_test) = dataset[fold][0], dataset[fold][1]

        n_minority = Counter(y_train).most_common()[1][1]
        n_majority = Counter(y_train).most_common()[0][1]
        imblearn_ratios = [
            ((n_majority - n_minority) * ratio + n_minority) / n_majority
            for ratio in [0.5, 0.75, 1.0]
        ]

        clf = {
            'NB': NB(),
            'KNN': KNN(),
            'SVM': SVM(gamma='scale'),
            'CART': CART()
        }[params['classifier']]

        if (trial['Algorithm'] is None) or (trial['Algorithm'] == 'None'):
            algorithm = None
        else:
            algorithms = {
                'AKNN': ResamplingCV(AKNN, clf, n_neighbors=[1, 3, 5, 7]),
                'Bord': ResamplingCV(SMOTE, clf, kind=['borderline1'],
                                     k_neighbors=[1, 3, 5, 7, 9],
                                     m_neighbors=[5, 10, 15],
                                     sampling_strategy=imblearn_ratios),
                'CC': ResamplingCV(CC, clf, sampling_strategy=imblearn_ratios),
                'CNN': ResamplingCV(CNN, clf, n_neighbors=[1, 3, 5, 7]),
                'ENN': ResamplingCV(ENN, clf, n_neighbors=[1, 3, 5, 7]),
                'IHT': ResamplingCV(IHT, clf, sampling_strategy=imblearn_ratios,
                                    cv=[2]),
                'NCL': ResamplingCV(NCL, clf, n_neighbors=[1, 3, 5, 7]),
                'NM': ResamplingCV(NM, clf, n_neighbors=[1, 3, 5, 7]),
                'OSS': ResamplingCV(OSS, clf, n_neighbors=[1, 3, 5, 7]),
                'RBO': ResamplingCV(RBO, clf, gamma=[0.01, 0.1, 1.0, 10.0],
                                    ratio=[0.5, 0.75, 1.0]),
                'RBU': ResamplingCV(RBU, clf, gamma=params.get('gamma'),
                                    ratio=params.get('ratio')),
                'RENN': ResamplingCV(RENN, clf, n_neighbors=[1, 3, 5, 7]),
                'ROS': ResamplingCV(ROS, clf, sampling_strategy=imblearn_ratios),
                'RUS': ResamplingCV(RUS, clf, sampling_strategy=imblearn_ratios),
                'SMOTE': ResamplingCV(SMOTE, clf, k_neighbors=[1, 3, 5, 7, 9],
                                      sampling_strategy=imblearn_ratios),
                'SMOTE+ENN': ResamplingCV(
                    SMOTEENN, clf,
                    smote=[SMOTE(k_neighbors=k) for k in [1, 3, 5, 7, 9]],
                    sampling_strategy=imblearn_ratios),
                'SMOTE+TL': ResamplingCV(
                    SMOTETomek, clf,
                    smote=[SMOTE(k_neighbors=k) for k in [1, 3, 5, 7, 9]],
                    sampling_strategy=imblearn_ratios),
                'TL': TL(),
            }
            algorithm = algorithms.get(trial['Algorithm'])
            if algorithm is None:
                raise NotImplementedError

        if algorithm is not None:
            X_train, y_train = algorithm.fit_sample(X_train, y_train)

        clf = clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        scores = {
            'Precision': metrics.precision(y_test, predictions),
            'Recall': metrics.recall(y_test, predictions),
            'F-measure': metrics.f_measure(y_test, predictions),
            'AUC': metrics.auc(y_test, predictions),
            'G-mean': metrics.g_mean(y_test, predictions)
        }

        submit_result(trial, scores)
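# Worked example of the imblearn_ratios interpolation above (counts hypothetical):
# with n_majority = 100 and n_minority = 20, ratio = 0.5 gives
#   ((100 - 20) * 0.5 + 20) / 100 = 0.6,
# i.e. sampling_strategy = 0.6, so the resampler targets 60 minority samples per
# 100 majority samples (halfway between the original imbalance and full balance).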