Example #1
File: xtree.py Project: rahlk/WarnPlan
def xtree(train_df, test_df):
    """XTREE"""

    if isinstance(train_df, list):
        train_df = list2dataframe(
            train_df)  # create a pandas dataframe of the training data
    if isinstance(test_df, list):
        test_df = list2dataframe(
            test_df)  # create a pandas dataframe of the testing data
    if isinstance(test_df, basestring):
        test_df = list2dataframe(
            [test_df])  # create a pandas dataframe of the testing data

    # train_df = SMOTE(train_df, atleast=1000, atmost=1001)

    tree = pyC45.dtree(train_df)  # Create a decision tree

    patch = Patches(train=None,
                    test=None,
                    trainDF=train_df,
                    testDF=test_df,
                    tree=tree)

    modified = patch.main()

    return modified
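
A minimal usage sketch for the function above; the CSV paths are hypothetical placeholders rather than files from the original project, and list2dataframe is assumed to accept a list of such paths, as it does elsewhere in this listing:

# Hypothetical invocation of xtree(); the paths below are placeholders.
train_paths = ["data/train_v1.csv", "data/train_v2.csv"]
test_path = "data/test.csv"

# xtree() accepts lists of paths, a single path string (test only), or DataFrames,
# per the isinstance() checks above.
patched = xtree(train_paths, test_path)
print(patched.head())  # rows rewritten by the decision-tree planner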
Example #2
def transfer_lessons4():
    data = DefectData.get_all_projects()["Apache"]
    for proj, paths in data.iteritems():
        if not proj in paths.bellw:
            res = {proj[:6]: []}
            "If training data.dat doesn't exist, create it."
            pred, pred2, distr, distr2 = [], [], [], []

            if not "train_bellw" in locals():
                train_bellw = list2dataframe(data[paths.bellw].data)

            train_local = list2dataframe(paths.data[:-1])
            test = list2dataframe(paths.data[-1])

            patched_local = xtree.execute(train_local, test)
            patched_bellw = xtree.execute(train_bellw, test)

            pred, distr = rforest(train_bellw,
                                  patched_local)  # How good are the patches
            pred2, distr2 = rforest(train_bellw,
                                    patched_bellw)  # How good are the predictions
            pred3, distr3 = rforest(train_bellw,
                                    test)  # How good are the predictions

            res[proj[:6]].extend(pred_stats(before=test[test.columns[-1]],
                                            after=pred3,
                                            distr=distr3))

            res[proj[:6]].append(impact(test, pred))
            res[proj[:6]].append(impact(test, pred2))
            yield res
Example #3
def transfer_lessons3():
    data = DefectData.get_all_projects()["Apache"]
    for proj, paths in data.iteritems():
        if not proj in paths.bellw:
            res = {proj[:6]: []}
            "If training data.dat doesn't exist, create it."
            pred, pred2, distr, distr2 = [], [], [], []

            if not "train" in locals():
                train = list2dataframe(data[paths.bellw].data)

            test, validation = train_test_split(list2dataframe(paths.data),
                                                test_size=0.8)
            # test = list2dataframe(paths.data[-1])
            # validation = list2dataframe(paths.data[:-1])
            patched = xtree.execute(train, test)
            a, b = rforest(train, patched)  # How good are the patches
            aa, bb = rforest(train, test)  # How good are the predictions
            pred.append(a)
            pred2.append(aa)
            distr.append(b)
            distr2.append(bb)
            res[proj[:6]].extend(pred_stats(before=test[test.columns[-1]],
                                            after=pred2,
                                            distr=distr2))

            res[proj[:6]].extend(impact(test, pred))
            yield res
Example #4
def planning():
    data = get_all_projects(features="processed")
    results = dict()
    for proj, paths in data.iteritems():
        results.update({proj: []})
        for train, test, validation in TrainTestValidate.split(paths.data):

            "Convert to pandas type dataframe"
            train = list2dataframe(train)
            test = list2dataframe(test)
            validation = list2dataframe(validation)

            "Recommend changes with XTREE"
            new = xtree(train[train.columns[1:]], test)
            """
            Have the changes been implemented?" 
            """

            "Create a smaller dframe of all closed issues in validation set"
            closed_in_validation = validation[validation['category'].isin([0])]

            "Group the smaller dframe and the patched dframe by their file names"
            modules = list(set(closed_in_validation["Name"].tolist()))

            heeded = []
            for module_name in modules:
                count = []
                module_name_new = new[new["Name"].isin([module_name])]
                module_name_act = train[train["Name"].isin([module_name])]
                module_name_val = closed_in_validation[
                    closed_in_validation["Name"].isin([module_name])]
                for col_name in module_name_val.columns[1:-1]:
                    aa = module_name_new[col_name]
                    bb = module_name_val[col_name]
                    try:
                        ranges = sorted(eval(aa.values.tolist()[0]))
                        count.append(
                            any([
                                abs(ranges[0]) <= bbb <= abs(ranges[1])
                                for bbb in bb.tolist()
                            ]))
                    except TypeError:
                        count.append(
                            any([bbb == aa.values[0] for bbb in bb.tolist()]))
                    except IndexError:
                        pass
                if len(count) > 0:
                    heeded.append(sum(count) / len(count))
        results[proj] = heeded
        percentiles = np.percentile(results[proj], [25, 50, 75])
        print("{}\t{:0.2f}\t{:0.2f}\t{:0.2f}".format(proj[:5], percentiles[0],
                                                     percentiles[1],
                                                     percentiles[2]))
        "Find the deltas between patched and smaller validation dframe"
Example #5
def changes(data=None):
    if data is None:
        data = DefectData.get_all_projects()["Apache"]

    for proj, paths in data.iteritems():
        "Make sure we don't test on the bellwether dataset"

        if not proj in paths.bellw:
            res = {
                proj[:6]: {
                    "xtree_local": [],
                    "xtree_bellw": [],
                    "alves": [],
                    "olive": [],
                    "shatw": []
                }
            }

            bellw = list2dataframe(data[paths.bellw].data)
            test = list2dataframe(paths.data)
            test_local = list2dataframe(paths.data[-1])
            train_local = list2dataframe(paths.data[:-1])

            for train_bellw, validation in CrossValidation.split(bellw,
                                                                 ways=2):
                orig = DataFrame([
                    test.iloc[n].values.tolist()
                    for n in xrange(test.shape[0]) if test.iloc[n][-1] > 0
                ],
                                 columns=test.columns)

                patched_alves, changes_alves = alves(train_bellw, test_local)
                patched_shatw, changes_shatw = shatnawi(
                    train_bellw, test_local)
                patched_olive, changes_olive = oliveira(
                    train_bellw, test_local)
                patched_xtree, changes_xtree = xtree(train_bellw, test_local)
                patched_xtree_local, changes_xtree_local = xtree(
                    train_local, test_local)

                # How good are the patches from local lessons?
                res[proj[:6]]["alves"].append(
                    deltas_count(test.columns, changes_alves))
                res[proj[:6]]["olive"].append(
                    deltas_count(test.columns, changes_olive))
                res[proj[:6]]["shatw"].append(
                    deltas_count(test.columns, changes_shatw))
                res[proj[:6]]["xtree_bellw"].append(
                    deltas_count(test.columns, changes_xtree))
                res[proj[:6]]["xtree_local"].append(
                    deltas_count(test.columns, changes_xtree_local))

            yield res
Example #6
def xgboost_grid_tuned(train, target):
    try:
        source = list2dataframe(train)
    except IOError:
        source = train

    source = SMOTE(source)

    # Tune with grid search

    param_grid = {
        "n_estimators": [80],  #, 40, 20],
        "learning_rate": [0.1],
        # "max_depth": [4, 6],
        # "min_samples_leaf": [3, 5, 9, 17],
        # "max_features": [1.0, 0.3, 0.1]
    }

    clf = GradientBoostingClassifier()
    source.loc[source[source.columns[-1]] == 0, source.columns[-1]] = False
    features = source.columns[:-1]
    klass = list(source[source.columns[-1]])
    clf = GridSearchCV(clf, param_grid).fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]

    return preds, distr
Example #7
def rforest_grid_tuned(train, target):
    clf = RandomForestClassifier(n_estimators=800,
                                 max_depth=6,
                                 min_samples_leaf=6,
                                 max_features=0.33)
    try:
        source = list2dataframe(train)
    except IOError:
        source = train

    source = SMOTE(source)

    # use a full grid over all parameters
    param_grid = {
        "max_depth": [3, None],
        "max_features": [1, 3, 10],
        "min_samples_split": [1, 3, 10],
        "min_samples_leaf": [1, 3, 10],
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"]
    }

    source.loc[source[source.columns[-1]] == 0, source.columns[-1]] = False
    features = source.columns[:-1]
    klass = list(source[source.columns[-1]])
    clf = GridSearchCV(clf, param_grid).fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]

    return preds, distr
Example #8
def transfer_lessons(data=None):
    if data is None:
        data = DefectData.get_all_projects()["Apache"]

    for proj, paths in data.iteritems():
        "Make sure we don't test on the bellwether dataset"

        if not proj in paths.bellw:
            res = {proj[:6]: {
                "pd": [],
                "pf": [],
                "local": [],
                "bellw": []}
            }

            bellw = list2dataframe(data[paths.bellw].data)

            for train_bellw, validation in CrossValidation.split(bellw,
                                                                 ways=5):
                train_local = list2dataframe(paths.data[:-1])
                test = list2dataframe(paths.data[-1])

                patched_local = xtree(train_local, test)
                patched_bellw = xtree(train_bellw, list2dataframe(paths.data))

                # How good are the patches from local lessons?
                pred_local, distr_local = xgboost(validation, patched_local)

                # How good are the patches from the bellwether lessons?
                pred_bellw, distr_bellw = xgboost(validation, patched_bellw)

                # How good are the predictions
                pred_qual, distr_qual = xgboost(validation, test)

                pred = pred_stats(before=test[test.columns[-1]],
                                  after=pred_qual,
                                  distr=distr_qual)

                res[proj[:6]]["pd"].append(pred[0])
                res[proj[:6]]["pf"].append(pred[1])

                res[proj[:6]]["local"].append(impact(test, pred_local))
                res[proj[:6]]["bellw"].append(impact(test, pred_bellw))

            yield res
Example #9
def test_oracles(data=None):
    if data is None:
        data = DefectData.get_all_projects()["Apache"]

    for proj, paths in data.iteritems():
        "Make sure we don't test on the bellwether dataset"

        if not proj in paths.bellw:
            res = {
                proj[:6]: {
                    "pd_rf": [],
                    "pf_rf": [],
                    "pd_xg": [],
                    "pf_xg": []
                }
            }

            pred, pred2, distr, distr2 = [], [], [], []

            validate = list2dataframe(paths.data[:-1])
            test = list2dataframe(paths.data[-1])

            set_trace()

            # How good are the predictions

            pred1, distr1 = rforest(validate, test)
            pred2, distr2 = xgboost(validate, test)

            pred_rf = pred_stats(before=test[test.columns[-1]],
                                 after=pred1,
                                 distr=distr1)
            pred_xg = pred_stats(before=test[test.columns[-1]],
                                 after=pred2,
                                 distr=distr2)

            res[proj[:6]]["pd_rf"].append(pred_rf[0])
            res[proj[:6]]["pf_rf"].append(pred_rf[1])

            res[proj[:6]]["pd_xg"].append(pred_xg[0])
            res[proj[:6]]["pf_xg"].append(pred_xg[1])

            yield res
Example #10
def transfer_lessons2(n_folds=1):
    data = DefectData.get_all_projects()["Apache"]
    print("Name\tPd\tPf\tImprovement")
    for proj, paths in data.iteritems():
        if not proj in paths.bellw:
            print(proj[:4], end="\t")
            "If training data.dat doesn't exist, create it."

            train, validation = train_test_split(list2dataframe(paths.data),
                                                 test_size=0.8)
            test = paths.data[-1]
            validation = paths.data[:-1]
            patched = xtree.execute(train, test)
            test = list2dataframe(test)
            pred, distr = rforest(validation,
                                  patched)  # How good are the patches
            pred2, distr2 = rforest(validation,
                                    test)  # How good are the predictions

            pred_stats(before=test[test.columns[-1]],
                       after=pred2,
                       distr=distr2)

            impact(test, pred)
Example #11
def rforest(train, target):
    clf = RandomForestClassifier(n_estimators=100, random_state=1)
    try:
        source = list2dataframe(train)
    except IOError:
        source = train

    source = SMOTE(source)

    source.loc[source[source.columns[-1]] == 0, source.columns[-1]] = False
    features = source.columns[:-1]
    klass = list(source[source.columns[-1]])
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]

    return preds, distr
Example #12
def xgboost(train, target):
    try:
        source = list2dataframe(train)
    except IOError:
        source = train

    # source = SMOTE(source)

    clf = GradientBoostingClassifier(n_estimators=80,
                                     max_depth=6,
                                     min_samples_leaf=6,
                                     learning_rate=0.085,
                                     subsample=True,
                                     max_features=0.33)

    source.loc[source[source.columns[-1]] == 0, source.columns[-1]] = False
    features = source.columns[:-1]
    klass = list(source[source.columns[-1]])
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])[:, 1]
    preds = [1 if val > 0.77 else 0 for val in distr]
    return preds, distr
Example #13
def run_experiment():
    data = DefectData.get_all_projects()["Apache"]
    metrics = list2dataframe(data["ant"].data[-1]).columns
    for res in changes(data):
        for key, value in res.iteritems():
            print(key)
            for n, (attr, xtree_local, xtree_bellw, Olive, Alves, Shatw) in \
                    enumerate(
                        zip(metrics,
                            np.median(value["xtree_local"], axis=0),
                            np.median(value["xtree_bellw"], axis=0),
                            np.median(value["olive"], axis=0),
                            np.median(value["alves"], axis=0),
                            np.median(value["shatw"], axis=0))):
                print(n,
                      attr[1:],
                      int(xtree_local),
                      int(xtree_bellw),
                      int(Olive),
                      int(Alves),
                      int(Shatw),
                      sep="\t")

    set_trace()
Example #14
def oliveira(train, test):
    """
    Implements shatnavi's threshold based planner.
    :param train: 
    :param test: 
    :param rftrain: 
    :param tunings: 
    :param verbose: 
    :return: 
    """
    "Helper Functions"

    def compliance_rate(k, train_columns):
        return \
            len([t for t in train_columns if t <= k]) / len(train_columns)

    def penalty_1(p, k, Min, compliance):

        comply = Min - compliance
        if comply >= 0:
            return (Min - comply) / Min
        else:
            return 0

    def penalty_2(k, Med):
        if k > Med:
            return (k - Med) / Med
        else:
            return 0

    "Compute Thresholds"

    if isinstance(test, list):
        test = list2dataframe(test)

    if isinstance(test, basestring):
        test = list2dataframe([test])

    if isinstance(train, list):
        train = list2dataframe(train)

    lo, hi = train.min(), train.max()
    quantile_array = get_percentiles(train)
    changes = []

    pk_best = dict()

    for metric in train.columns:
        min_comply = 10e32
        vals = np.empty([10, 100])
        for p_id, p in enumerate(np.arange(0, 100, 10)):
            for k_id, k in enumerate(np.linspace(lo[metric], hi[metric], 100)):
                med = quantile_array[90][metric]
                compliance = compliance_rate(k, train[metric])
                penalty1 = penalty_1(p, k, compliance=compliance, Min=0.9)
                penalty2 = penalty_2(k, med)
                comply_rate_penalty = penalty1 + penalty2
                vals[p_id, k_id] = comply_rate_penalty

                if comply_rate_penalty <= min_comply:
                    min_comply = comply_rate_penalty
                    try:
                        pk_best[metric] = (p, k)
                    except KeyError:
                        pk_best.update({metric: (p, k)})
    """
    Apply Plans Sequentially
    """

    modified = []
    for n in xrange(test.shape[0]):
        C = Changes()
        if test.iloc[n][-1] > 0 or test.iloc[n][-1] == True:
            new_row = apply3(test.iloc[n].values.tolist(), test.columns,
                             pk_best)
            for name, new, old in zip(test.columns, new_row,
                                      test.iloc[n].values.tolist()):
                C.save(name, new=new, old=old)

            changes.append(C.log)
            modified.append(new_row)

        # Disable the next two lines if you're measuring the number of changes.
        else:
            if rand() > 0.7:
                modified.append(test.iloc[n].tolist())

    return pd.DataFrame(modified, columns=test.columns), changes
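
To make the (p, k) search above concrete, here is a small self-contained restatement of the same compliance-rate and penalty scoring on toy data; the metric values are made up purely for illustration, and the helpers mirror compliance_rate, penalty_1, and penalty_2 defined inside oliveira():

import numpy as np

# Toy values for a single metric column; illustrative only.
metric_values = np.array([1, 2, 2, 3, 5, 8, 13, 21], dtype=float)

def toy_compliance_rate(k, values):
    # Fraction of entities whose metric value falls at or below candidate threshold k.
    return len([v for v in values if v <= k]) / float(len(values))

def toy_penalty_1(min_rate, compliance):
    # Mirrors penalty_1 above: non-zero only when compliance falls short of min_rate.
    shortfall = min_rate - compliance
    return (min_rate - shortfall) / min_rate if shortfall >= 0 else 0.0

def toy_penalty_2(k, med):
    # Mirrors penalty_2 above: grows as k exceeds the 90th-percentile value.
    return (k - med) / med if k > med else 0.0

med = np.percentile(metric_values, 90)
best_k, best_score = None, float("inf")
for k in np.linspace(metric_values.min(), metric_values.max(), 100):
    score = toy_penalty_1(0.9, toy_compliance_rate(k, metric_values)) + toy_penalty_2(k, med)
    if score <= best_score:
        best_score, best_k = score, k
print(best_k, best_score)  # chosen threshold and its penalty score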
Example #15
File: InfoGain.py Project: rahlk/fss17
    featureTot = 0
    information_gain = []
    for i in range(0, len(nz[0])):
        if (i != 0 and nz[0][i] != pre):
            for notappear in range(pre + 1, nz[0][i]):
                information_gain.append(0)
            ig = _calIg()
            information_gain.append(ig)
            pre = nz[0][i]
            classCnt = {}
            featureTot = 0
        featureTot = featureTot + 1
        yclass = y[nz[1][i]]
        if yclass not in classCnt:
            classCnt[yclass] = 1
        else:
            classCnt[yclass] = classCnt[yclass] + 1
    ig = _calIg()
    information_gain.append(ig)

    return np.asarray(information_gain)


if __name__ == "__main__":
    data = DefectData.get_all_projects()["Apache"]
    test_data = list2dataframe(data["ant"].data)
    indep_var = test_data[test_data.columns[:-1]]
    depen_var = test_data[test_data.columns[-1]]
    information_gain(indep_var.values, depen_var.values)
    set_trace()
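
The fragment above is missing its setup (nz, pre, and classCnt come from earlier lines that are not shown) as well as the _calIg helper. As a reference only, here is a self-contained sketch that computes per-feature information gain directly from the textbook definition IG(y, x) = H(y) - H(y | x), treating each feature as present/absent; it is not the project's implementation:

import numpy as np

def entropy(labels):
    # Shannon entropy of a label vector; 0 for an empty vector.
    if len(labels) == 0:
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / float(counts.sum())
    return -np.sum(probs * np.log2(probs))

def information_gain_reference(X, y):
    # IG of each column of X (non-zero = "present") with respect to labels y.
    X, y = np.asarray(X), np.asarray(y)
    base = entropy(y)
    gains = []
    for j in range(X.shape[1]):
        present = X[:, j] != 0
        p = present.mean()
        conditional = p * entropy(y[present]) + (1 - p) * entropy(y[~present])
        gains.append(base - conditional)
    return np.asarray(gains)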
Example #16
                    deltas_magnitude(orig, patched_alves))
                res[proj[:6]]["olive"].append(
                    deltas_magnitude(orig, patched_olive))
                res[proj[:6]]["shatw"].append(
                    deltas_magnitude(orig, patched_shatw))
                res[proj[:6]]["xtree_bellw"].append(
                    deltas_magnitude(orig, patched_xtree))
                res[proj[:6]]["xtree_local"].append(
                    deltas_magnitude(orig, patched_xtree_local))

            yield res


if __name__ == "__main__":
    data = DefectData.get_all_projects()["Apache"]
    metrics = list2dataframe(data["ant"].data[-1]).columns
    for res in changes(data):
        for key, value in res.iteritems():
            set_trace()
            print(key)
            for n, (attr, xtree_local, xtree_bellw, Olive, Alves, Shatw) in \
                    enumerate(
                        zip(metrics,
                            np.median(value["xtree_local"], axis=0),
                            np.median(value["xtree_bellw"], axis=0),
                            np.median(value["olive"], axis=0),
                            np.median(value["alves"], axis=0),
                            np.median(value["shatw"], axis=0))):
                print(n,
                      attr[1:],
                      xtree_local,
Example #17
def transfer_lessons(data=None):
    if data is None:
        data = DefectData.get_all_projects()["Apache"]

    for proj, paths in data.iteritems():
        "Make sure we don't test on the bellwether dataset"

        if proj in ["ant", "ivy", "poi", "jedit"]:
            res = {
                proj[:6]: {
                    "xtree_local": [],
                    "xtree_bellw": [],
                    "alves": [],
                    "olive": [],
                    "shatw": []
                }
            }

            bellw = list2dataframe(data[paths.bellw].data)
            test = list2dataframe(paths.data)
            test_local = list2dataframe(paths.data[-1])
            train_local = list2dataframe(paths.data[:-1])

            for train_bellw, validation in CrossValidation.split(bellw,
                                                                 ways=5):
                patched_alves = alves(train_bellw, test_local)
                patched_shatw = shatnawi(train_bellw, test_local)
                patched_olive = oliveira(train_bellw, test_local)
                patched_xtree = xtree(train_bellw, test_local)
                patched_xtree_local = xtree(train_local, test_local)

                # How good are the patches from Alves?
                pred_alves, distr_alves = xgboost(validation, patched_alves)

                # How good are the patches from Shatnawi?
                pred_shatw, distr_shatw = xgboost(validation, patched_shatw)

                # How good are the patches from Oliveira?
                pred_olive, distr_olive = xgboost(validation, patched_olive)

                # How good are the patches from the bellwether XTREE?
                pred_xtree, distr_xtree = xgboost(validation, patched_xtree)

                # How good are the patches from the local XTREE?
                pred_xtree_local, distr_xtree_local = xgboost(
                    validation, patched_xtree_local)

                res[proj[:6]]["alves"].append(impact(test, pred_alves))
                res[proj[:6]]["shatw"].append(impact(test, pred_shatw))
                res[proj[:6]]["olive"].append(impact(test, pred_olive))
                res[proj[:6]]["xtree_bellw"].append(impact(test, pred_xtree))
                res[proj[:6]]["xtree_local"].append(
                    impact(test, pred_xtree_local))

                # Not yet...
                # # How good are the patches from the bellwether lessons?
                # pred_fontana, distr_fontana = xgboost(validation, patched_xtree)
                #
                # # res[proj[:6]]["fontana"].append(pred[1])

            yield res
Example #18
def shatnawi(train, test):
    """
    Implements shatnavi's threshold based planner.
    :param train: 
    :param test: 
    :param rftrain: 
    :param tunings: 
    :param verbose: 
    :return: 
    """
    "Compute Thresholds"

    if isinstance(test, list):
        test = list2dataframe(test)

    if isinstance(test, basestring):
        test = list2dataframe([test])

    if isinstance(train, list):
        train = list2dataframe(train)

    changed = []
    metrics = [col[1:] for col in train[train.columns[:-1]]]
    ubr = LogisticRegression()  # Init LogisticRegressor
    X = train[train.columns[:-1]]  # Independent Features (CK-Metrics)
    y = train[train.columns[-1]]  # Dependent Feature (Bugs)

    ubr.fit(X, y.values.tolist())  # Fit Logit curve
    inter = ubr.intercept_[0]  # Intercepts
    coef = ubr.coef_[0]  # Slopes
    pVal = f_classif(X, y)[1]  # P-Values
    changes = len(metrics) * [-1]

    "Find Thresholds using VARL"
    for Coeff, P_Val, idx in zip(coef, pVal, range(len(metrics))):
        thresh = VARL(Coeff, inter, p0=0.065)  # VARL threshold at p0=0.065
        if P_Val < 0.05:
            changes[idx] = thresh

    # set_trace()
    """
    Apply Plans Sequentially
    """

    modified = []
    for n in xrange(test.shape[0]):
        C = Changes()
        if test.iloc[n][-1] > 0 or test.iloc[n][-1] == True:
            new_row = apply2(changes, test.iloc[n].values.tolist())
            for name, new, old in zip(test.columns, new_row,
                                      test.iloc[n].values.tolist()):
                C.save(name, new=new, old=old)

            changed.append(C.log)
            modified.append(new_row)

        # Disable the next two lines if you're measuring the number of changes.
        else:
            if rand() > 0.7:
                modified.append(test.iloc[n].tolist())

    return pd.DataFrame(modified, columns=test.columns), changed
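
The VARL helper called above is not included in this listing. Shatnawi's method is based on Bender's "value of an acceptable risk level": for a logistic model with intercept b0 and slope b1, the metric value at which the predicted defect probability equals p0 is (log(p0 / (1 - p0)) - b0) / b1. Below is a minimal sketch matching the call signature used above; whether it matches the project's own helper is an assumption:

import numpy as np

def VARL(coef, intercept, p0=0.05):
    # Bender's Value of an Acceptable Risk Level for a logistic fit:
    # the metric value at which the fitted model predicts probability p0.
    # Sketch only; assumed to mirror the VARL helper used in shatnawi() above.
    return (np.log(p0 / (1 - p0)) - intercept) / coef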
Example #19
def alves(train, test):
    if isinstance(test, list):
        test = list2dataframe(test)

    if isinstance(test, basestring):
        test = list2dataframe([test])

    if isinstance(train, list):
        train = list2dataframe(train)

    metrics = [met[1:] for met in train[train.columns[:-1]]]
    X = train[train.columns[:-1]]  # Independent Features (CK-Metrics)
    changes = []
    """
    As weight we will consider 
    the source lines of code (LOC) of the entity.
    """

    tot_loc = train.sum()["$loc"]
    X = _ent_weight(X, scale=tot_loc)
    """
    Divide the entity weight by the sum of all weights of the same system.
    """
    denom = pd.DataFrame(X).sum().values
    norm_sum = pd.DataFrame(pd.DataFrame(X).values / denom, columns=metrics)
    """
    Find Thresholds
    """
    y = train[train.columns[-1]]  # Dependent Feature (Bugs)
    pVal = f_classif(X, y)[1]  # P-Values
    cutoff = []
    cumsum = lambda vals: [sum(vals[:i]) for i, __ in enumerate(vals)]

    def point(array):
        for idx, val in enumerate(array):
            if val > 0.95: return idx

    for idx in xrange(len(train.columns[:-1])):
        # Setup Cumulative Dist. Func.
        name = metrics[idx]
        loc = train["$loc"].values
        vals = norm_sum[name].values
        sorted_ids = np.argsort(vals)
        cumulative = [sum(vals[:i]) for i, __ in enumerate(sorted(vals))]
        cutpoint = point(cumulative)
        cutoff.append(vals[sorted_ids[cutpoint]] * tot_loc /
                      loc[sorted_ids[cutpoint]] * denom[idx])
    """ 
    Apply Plans Sequentially
    """

    modified = []
    for n in xrange(test.shape[0]):
        C = Changes()
        if test.iloc[n][-1] > 0 or test.iloc[n][-1] is True:
            new_row = apply2(cutoff, test.iloc[n].values.tolist())
            for name, new, old in zip(test.columns, new_row,
                                      test.iloc[n].values.tolist()):
                C.save(name, new=new, old=old)

            changes.append(C.log)
            modified.append(new_row)
            # set_trace()
        # Disable the next two lines if you're measuring the number of changes.
        else:
            if rand() > 0.7:
                modified.append(test.iloc[n].tolist())

    return pd.DataFrame(modified, columns=test.columns), changes
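
The three threshold-based planners above (alves, shatnawi, oliveira) share one interface: they accept training and test data as CSV paths or DataFrames and return a patched copy of the defective test rows (plus a random sample of the other rows) together with a per-row change log. A minimal, hypothetical invocation follows; the paths are placeholders rather than files from the original experiments:

# Placeholder paths; the real experiments obtain them via DefectData.get_all_projects().
train_paths = ["data/jedit-4.0.csv", "data/jedit-4.1.csv"]
test_path = "data/jedit-4.2.csv"

patched_alves, log_alves = alves(train_paths, test_path)
patched_shatw, log_shatw = shatnawi(train_paths, test_path)
patched_olive, log_olive = oliveira(train_paths, test_path)

# Each patched_* DataFrame keeps the test set's columns; each log_* is a list of
# Changes() logs recording old and new metric values for every patched row.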