Example #1
def factorize(data, seed, post, norm, gt, rank, max_iter=200):
    """
    NMF factorization: data = W*H.
    The input gt (ground truth) is only used to monitor performance at each iteration.

    Note: since evaluating the cost function is too slow, the number of iterations is used as the stopping criterion for efficiency.
    Return: W (m*k) and H (k*n) matrices.
    """
    V = data
    W, H = initialize(V, rank, seed=seed, norm=norm)
    iter = 0
    while iter <= max_iter:
        targets = dl.get_targets(W.T, post)
        # Optionally count the number of items in each cluster:
        # clusters = np.unique(targets)
        # count_arr = [0 for i in range(0, len(clusters))]
        # for c in targets:
        #     count_arr[c] += 1
        # print(sorted(count_arr))
        if gt is not None:
            A = metrics.accuracy(gt, targets)
            F1 = metrics.f_measure(gt, targets)
            # print("Iter = %d, Acc = %f, F1 = %f" % (iter, A, F1))

        W, H = euclidean_update(V, W, H, norm)
        W, H = _adjustment(W, H)
        iter += 1

    return W, H
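The helpers initialize, euclidean_update and _adjustment are project-specific and not shown in this example. As a hedged illustration only, a minimal NumPy sketch of one multiplicative update step for the Euclidean objective ||V - W*H||_F^2 (the classic Lee & Seung rules, not necessarily what euclidean_update actually does, and ignoring the norm argument) could look like:

import numpy as np

def euclidean_update_sketch(V, W, H, eps=1e-9):
    # One Lee & Seung multiplicative update step for min ||V - W*H||_F^2.
    # Illustrative sketch only; the project's euclidean_update may differ.
    H = H * (W.T @ V) / (W.T @ W @ H + eps)  # update H with W fixed
    W = W * (V @ H.T) / (W @ H @ H.T + eps)  # update W with H fixed
    return W, H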
Example #2
def calc_performance(self, segmentations, ground_truths, verbose=False):
    num_truths = len(ground_truths)
    f_per_seg = []
    c_per_seg = []
    for i, seg in enumerate(segmentations):
        k = self.k_means[i]
        f = 0
        c = 0
        for j in range(num_truths):
            fm = metrics.f_measure(seg, ground_truths[j])
            cond_entropy = v_measure_score(seg.flat, ground_truths[j].flat)
            f += fm
            c += cond_entropy
            if verbose:
                print(
                    "k:{} VS ground-truth:{} => f={:.4f}, c={:.4f}".format(
                        k, j + 1, fm, cond_entropy))
        f /= num_truths
        c /= num_truths
        f_per_seg.append(f)
        c_per_seg.append(c)
        print(
            "k={}: Avg. f-measure={:.4f} , Avg. conditional entropy={:.4f}\n"
            .format(k, f, c))
    return f_per_seg, c_per_seg
Example #3
def evaluation_scores(groundtruth, labels_pred):
    """
    Evaluation scores for the predicted results.

    :param groundtruth (type list): the ground-truth (GT) cluster assignment. Each element denotes an item's GT cluster_id.
    :param labels_pred (type list): the predicted cluster assignment. Each element denotes an item's predicted cluster_id.
    """
    NMI = metrics.normalized_mutual_info_score(groundtruth, labels_pred)
    A = metrics.accuracy(groundtruth, labels_pred)
    F1 = metrics.f_measure(groundtruth, labels_pred)
    P = metrics.purity(groundtruth, labels_pred)
    RI = metrics.random_index(groundtruth, labels_pred)
    ARI = metrics.adjusted_rand_score(groundtruth, labels_pred)
    map_pairs = metrics.get_map_pairs(groundtruth, labels_pred)
    return NMI, A, F1, P, RI, ARI, map_pairs
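A hypothetical toy call, with made-up cluster ids, assuming the project's metrics module is importable:

# Hypothetical example: six items, two ground-truth clusters, permuted predicted labels.
groundtruth = [0, 0, 0, 1, 1, 1]
labels_pred = [1, 1, 1, 0, 0, 0]
NMI, A, F1, P, RI, ARI, map_pairs = evaluation_scores(groundtruth, labels_pred)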
Example #4
def conmf_factorize(method,
                    datas,
                    weights,
                    regu_weights,
                    seed,
                    post,
                    norm,
                    max_iter,
                    rank,
                    gt=None):
    """
    Factorization process of CoNMF.

    :param max_iter (type: int): Maximum number of iterations of the CoNMF update rules.
    :param rank (type: int): Number of latent factors in the NMF factorization. For clustering, it is typically set to the number of clusters.

    Other parameters have the same meaning as in conmf().
    """
    if method not in ["pair-wise", "cluster-wise"]:
        print("Error! Method not in [pair-wise, cluster-wise]!")
        return None

    Ws, Hs = conmf_initialize(datas, rank, seed, weights, norm)

    targets, As, F1s = [], [], []
    iter_num = 0
    while iter_num <= max_iter:
        targets = [dl.get_targets(W.T, post) for W in Ws]
        if gt is not None:  # gt (ground truth) is only used for monitoring.
            As = [
                "{0:.4f}".format(metrics.accuracy(gt, target))
                for target in targets
            ]
            F1s = [
                "{0:.4f}".format(metrics.f_measure(gt, target))
                for target in targets
            ]
        if iter_num == 0:
            print("\t\t CoNMF Inits \t Acc = %s;\t F1 = %s " % (str(As), str(F1s)))
        # print("\t\t Iter = %d: \t Acc = %s;\t F1 = %s " % (iter_num, str(As), str(F1s)))
        Ws, Hs = conmf_update(datas, Ws, Hs, weights, regu_weights, norm,
                              method)
        # cost = conmf_cost(Vs, Ws, Hs, weights, mutual_weights, norm, method)
        if iter_num == max_iter:
            print("\t\t CoNMF Ends \t Acc = %s;\t F1 = %s " % (str(As), str(F1s)))
        iter_num += 1
    return Ws, Hs
Example #5
def results_to_metrics(results, methods, ref_motifs):
    _, _, ref_labels = motif.unpack_motif(ref_motifs)
    metric_dict = dict.fromkeys(methods)

    for m in methods:
        obs_motifs = results[m]
        _, _, obs_labels = motif.unpack_motif(obs_motifs)

        this_edit = metrics.edit_distance(obs_labels, ref_labels)
        this_recall = metrics.recall(obs_motifs, ref_motifs)
        this_precis = metrics.precision(obs_motifs, ref_motifs)
        this_f = metrics.f_measure(obs_motifs, ref_motifs)
        this_bm = metrics.boundary_distance(obs_motifs, ref_motifs)
        metric_dict[m] = [this_edit, this_recall, this_precis, this_f, this_bm]

    return metric_dict
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_file_path",
                        default="data/train.csv",
                        help="training file path")
    parser.add_argument("--test_file_path",
                        default="data/test.csv",
                        help="testing file path")
    parser.add_argument("--approach",
                        default="SVD",
                        help="Baseline | SVD | SlopeOne | NMF | CoClustering")
    parser.add_argument("--output_ranking_file",
                        default="ranking",
                        help="output ranking for test")
    bsl_options = {'method': 'sgd', 'n_epochs': 20, 'reg_u': 100, 'reg_i': 50}
    options = {
        "Baseline": BaselineOnly(bsl_options, verbose=True),
        "SVD": SVD(verbose=True, n_factors=20, n_epochs=3),
        "SlopeOne": SlopeOne(),
        "NMF": NMF(),
        "CoClustering": CoClustering()
    }
    args = parser.parse_args()
    reader = Reader(line_format='user item rating timestamp', sep='\t')
    algo = options[args.approach]
    train_data = Dataset.load_from_file(args.train_file_path, reader=reader)
    test_data = Dataset.load_from_file(args.test_file_path, reader=reader)
    train_set = train_data.build_full_trainset()
    test_set = test_data.build_full_trainset().build_testset()
    print("training....")
    algo.fit(train_set)
    print("testing...")
    predictions = algo.test(test_set)
    accuracy.mae(predictions, verbose=True)
    accuracy.rmse(predictions, verbose=True)
    ### Extra Credit
    output_ranking(predictions,
                   args.output_ranking_file + "_" + args.approach + ".out")
    precisions, recalls = precision_recall_at_k(predictions,
                                                k=10,
                                                threshold=2.5)
    print("Precision:",
          sum(prec for prec in precisions.values()) / len(precisions))
    print("Recall:", sum(rec for rec in recalls.values()) / len(recalls))
    print("F-measure:", f_measure(precisions, recalls))
    print("conversion_rate:", get_conversion_rate(predictions, k=10))
    print("ndcg:", get_ndcg(predictions, k_highest_scores=10))
Example #7
def compute_ovr_metric(annotations: Dict[int, List[int]], n_obs: int,
                       metric: Metric) -> Dict[int, float]:
    """Compute the one-vs-rest annotator score for each annotator in the
    provided dictionary of annotations on the given metric.

    Parameters
    ----------
    annotations : Dict[int, List[int]]
        Mapping from annotator ID to a list of change point indices.

    n_obs : int
        Length of the time series in question. This is needed for the covering
        metric.

    metric : Metric
        The metric to use for the evaluation.

    Returns
    -------
    ovr_scores : Dict[int, float]
        Mapping from the annotator ID to the one-vs-rest annotator score on the
        chosen metric.

    """
    ovr = {}
    for j in annotations:
        others = [k for k in annotations if k != j]
        Ak = {u: annotations[u] for u in others}
        X = annotations[j]
        if metric == Metric.f1:
            ovr[j] = f_measure(Ak, X)
        elif metric == Metric.cover:
            ovr[j] = covering(Ak, X, n_obs)
        else:
            raise ValueError(f"Unknown metric: {metric}")
    return ovr
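A hypothetical call with three annotators on a 100-observation series, scored with the F1 metric (the change-point indices below are made up):

# Hypothetical annotations: change-point indices reported by each annotator.
annotations = {1: [10, 50], 2: [12, 48], 3: [11]}
ovr_scores = compute_ovr_metric(annotations, n_obs=100, metric=Metric.f1)
# ovr_scores[j] is annotator j's F1 against the remaining annotators.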
Example #8
def main():
    args = parse_args()

    dataset = load_json(args.dataset_file)
    annotations = load_annotations(args.annotation_file, dataset["name"])

    out = {
        "dataset": dataset["name"],
        "dataset_nobs": dataset["n_obs"],
        "dataset_ndim": dataset["n_dim"],
        "annotations": annotations,
        "results": {},
    }

    data_results = next(
        (d for d in os.listdir(args.result_dir) if d == dataset["name"]), None)
    if data_results is None:
        print(
            "Couldn't find the result directory for dataset %s" %
            dataset["name"],
            file=sys.stderr,
        )
        raise SystemExit(1)

    dataset_dir = os.path.join(args.result_dir, data_results)

    for method in sorted(os.listdir(dataset_dir)):
        method_dir = os.path.join(dataset_dir, method)
        for result_file in sorted(os.listdir(method_dir)):
            # print("Processing result file: %s" % result_file)
            fname = os.path.join(method_dir, result_file)
            result = load_json(fname)
            if method not in out["results"]:
                out["results"][method] = []

            if result["status"].lower() == "success":
                locations = clean_cps(result["result"]["cplocations"], dataset)

                f1, precision, recall = f_measure(annotations,
                                                  locations,
                                                  return_PR=True)
                n_obs = dataset["n_obs"]
                cover = covering(annotations, locations, n_obs)
                scores = {
                    "f1": f1,
                    "precision": precision,
                    "recall": recall,
                    "cover": cover,
                }
            else:
                locations = None
                scores = None

            out["results"][method].append({
                "parameters": result["parameters"],
                "task_file": result_file,
                "cplocations": locations,
                "scores": scores,
                "status": result["status"],
            })

    if args.output_file:
        with open(args.output_file, "w") as fp:
            json.dump(out, fp, indent="\t")
    else:
        print(json.dumps(out, indent="\t"))
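f_measure and covering are called here with a dict of per-annotator change-point locations and a list of detected locations; a hypothetical toy call with made-up indices might look like:

# Hypothetical toy evaluation of one detection result on a 100-observation series.
annotations = {"1": [15, 60], "2": [14, 61]}
locations = [16, 59]
f1, precision, recall = f_measure(annotations, locations, return_PR=True)
cover = covering(annotations, locations, 100)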
Example #9
def run():
    while True:
        trial = pull_pending()

        if trial is None:
            break

        params = eval(trial['Parameters'])

        logging.info(trial)

        dataset = load(trial['Dataset'])
        fold = int(trial['Fold']) - 1

        (X_train, y_train), (X_test,
                             y_test) = dataset[fold][0], dataset[fold][1]

        n_minority = Counter(y_train).most_common()[1][1]
        n_majority = Counter(y_train).most_common()[0][1]

        imblearn_ratios = [
            ((n_majority - n_minority) * ratio + n_minority) / n_majority
            for ratio in [0.5, 0.75, 1.0]
        ]

        clf = {
            'NB': NB(),
            'KNN': KNN(),
            'SVM': SVM(gamma='scale'),
            'CART': CART()
        }[params['classifier']]

        if (trial['Algorithm'] is None) or (trial['Algorithm'] == 'None'):
            algorithm = None
        else:
            algorithms = {
                'AKNN':
                ResamplingCV(AKNN, clf, n_neighbors=[1, 3, 5, 7]),
                'Bord':
                ResamplingCV(SMOTE,
                             clf,
                             kind=['borderline1'],
                             k_neighbors=[1, 3, 5, 7, 9],
                             m_neighbors=[5, 10, 15],
                             sampling_strategy=imblearn_ratios),
                'CC':
                ResamplingCV(CC, clf, sampling_strategy=imblearn_ratios),
                'CNN':
                ResamplingCV(CNN, clf, n_neighbors=[1, 3, 5, 7]),
                'ENN':
                ResamplingCV(ENN, clf, n_neighbors=[1, 3, 5, 7]),
                'IHT':
                ResamplingCV(IHT,
                             clf,
                             sampling_strategy=imblearn_ratios,
                             cv=[2]),
                'NCL':
                ResamplingCV(NCL, clf, n_neighbors=[1, 3, 5, 7]),
                'NM':
                ResamplingCV(NM, clf, n_neighbors=[1, 3, 5, 7]),
                'OSS':
                ResamplingCV(OSS, clf, n_neighbors=[1, 3, 5, 7]),
                'RBO':
                ResamplingCV(RBO,
                             clf,
                             gamma=[0.01, 0.1, 1.0, 10.0],
                             ratio=[0.5, 0.75, 1.0]),
                'RBU':
                ResamplingCV(RBU,
                             clf,
                             gamma=params.get('gamma'),
                             ratio=params.get('ratio')),
                'RENN':
                ResamplingCV(RENN, clf, n_neighbors=[1, 3, 5, 7]),
                'ROS':
                ResamplingCV(ROS, clf, sampling_strategy=imblearn_ratios),
                'RUS':
                ResamplingCV(RUS, clf, sampling_strategy=imblearn_ratios),
                'SMOTE':
                ResamplingCV(SMOTE,
                             clf,
                             k_neighbors=[1, 3, 5, 7, 9],
                             sampling_strategy=imblearn_ratios),
                'SMOTE+ENN':
                ResamplingCV(
                    SMOTEENN,
                    clf,
                    smote=[SMOTE(k_neighbors=k) for k in [1, 3, 5, 7, 9]],
                    sampling_strategy=imblearn_ratios),
                'SMOTE+TL':
                ResamplingCV(
                    SMOTETomek,
                    clf,
                    smote=[SMOTE(k_neighbors=k) for k in [1, 3, 5, 7, 9]],
                    sampling_strategy=imblearn_ratios),
                'TL':
                TL(),
            }

            algorithm = algorithms.get(trial['Algorithm'])

            if algorithm is None:
                raise NotImplementedError

        if algorithm is not None:
            X_train, y_train = algorithm.fit_sample(X_train, y_train)

        clf = clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        scores = {
            'Precision': metrics.precision(y_test, predictions),
            'Recall': metrics.recall(y_test, predictions),
            'F-measure': metrics.f_measure(y_test, predictions),
            'AUC': metrics.auc(y_test, predictions),
            'G-mean': metrics.g_mean(y_test, predictions)
        }

        submit_result(trial, scores)
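The metrics module used above is project-specific and not shown. As an assumed stand-in rather than the project's implementation, a minimal scikit-learn sketch of the F-measure and G-mean for a binary task could be:

from sklearn.metrics import confusion_matrix, f1_score

def binary_scores_sketch(y_test, predictions):
    # Assumed stand-in: F-measure plus geometric mean of per-class recalls (binary labels).
    f = f1_score(y_test, predictions)
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) else 0.0
    specificity = tn / (tn + fp) if (tn + fp) else 0.0
    g_mean = (sensitivity * specificity) ** 0.5
    return f, g_mean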