示例#1
0
def loss(d, t, y_true, y_pred):
    """Hyperopt loss function: returns 2 - d2h so minimizing it maximizes d2h.

    Also appends the computed metrics to a per-(d, t) log file.

    :param d: dataset name, used in the log filename.
    :param t: time/config tag, used in the log filename.
    :param y_true: ground-truth labels.
    :param y_pred: predicted labels.
    :return: ``2. - d2h`` (float).
    """
    metr = ClassificationMetrics(y_true, y_pred)
    metr.add_metrics(['d2h', 'pd', 'pf'])
    d2h, pd, pf = metr.get_metrics()
    # Append mode so successive hyperopt evaluations accumulate in one log.
    # Use a context manager so the handle is closed even if print raises
    # (the original leaked the file descriptor on every call).
    with open(f'./hyperopt-log/{d}-{t}.txt', 'a') as file:
        print(f'd2h = {d2h}\tpd = {pd}\tpf = {pf}', file=file)
    return 2. - d2h
示例#2
0
def main():
    """Run hyperopt-sklearn over paired defect-prediction datasets.

    For each named dataset pair, redirects stdout to a per-dataset log file,
    fits a HyperoptEstimator, and prints the resulting metrics and best model.
    Failures for a dataset are skipped so the sweep continues.
    """
    file_dic = {"ivy":     ["ivy-1.4.csv", "ivy-2.0.csv"],
                "lucene":  ["lucene-2.0.csv", "lucene-2.2.csv"],
                "lucene2": ["lucene-2.2.csv", "lucene-2.4.csv"],
                "poi":     ["poi-1.5.csv", "poi-2.5.csv"],
                "poi2": ["poi-2.5.csv", "poi-3.0.csv"],
                "synapse": ["synapse-1.0.csv", "synapse-1.1.csv"],
                "synapse2": ["synapse-1.1.csv", "synapse-1.2.csv"],
                "camel": ["camel-1.2.csv", "camel-1.4.csv"],
                "camel2": ["camel-1.4.csv", "camel-1.6.csv"],
                "xerces": ["xerces-1.2.csv", "xerces-1.3.csv"],
                "jedit": ["jedit-3.2.csv", "jedit-4.0.csv"],
                "jedit2": ["jedit-4.0.csv", "jedit-4.1.csv"],
                "log4j": ["log4j-1.0.csv", "log4j-1.1.csv"],
                "xalan": ["xalan-2.4.csv", "xalan-2.5.csv"]
                }

    for dataset in file_dic:
        # Redirect all subsequent prints for this dataset to its own log file.
        # BUG FIX: the original referenced an undefined name `dat` here and in
        # the print below (NameError); the loop variable is `dataset`.
        sys.stdout = open(f'./hyperopt-log/{dataset}.txt', 'w')
        print(f'Running {dataset}')
        print('=' * 20)
        data = DataLoader.from_files(
            base_path='./issue_close_time/', files=file_dic[dataset])

        try:
            a = time.time()
            estim = HyperoptEstimator(classifier=any_classifier('clf'),
                                      preprocessing=any_preprocessing('pre'),
                                      algo=tpe.suggest,
                                      max_evals=30,
                                      loss_fn=loss,
                                      trial_timeout=30)

            estim.fit(data.x_train, data.y_train)
            preds = estim.predict(data.x_test)
            metr = ClassificationMetrics(data.y_test, preds)
            metr.add_metrics(['d2h', 'pd', 'pf'])
            print(metr.get_metrics())
            print(estim.best_model())
            b = time.time()

            print('Completed in', b - a, 'seconds.')
        # BUG FIX: the original `except:` was indented inside the try body
        # (SyntaxError). Aligned with `try:` and narrowed from a bare except
        # so KeyboardInterrupt/SystemExit still propagate.
        except Exception:
            # Best-effort sweep: skip datasets whose run fails.
            continue
示例#3
0
def main():
    """Run hyperopt-sklearn over every (dataset, horizon) issue-close-time pair.

    Redirects stdout to a per-pair log file, fits a HyperoptEstimator with a
    loss partially applied with the pair's identifiers, and prints metrics and
    the best model. Failing pairs are skipped so the sweep continues.
    """
    directories = [
        "1 day", "7 days", "14 days", "30 days", "90 days", "180 days",
        "365 days"
    ]
    datasets = [
        "camel", "cloudstack", "cocoon", "hadoop", "deeplearning", "hive",
        "node", "ofbiz", "qpid"
    ]

    for dat in datasets:
        for time_ in directories:
            # Redirect prints for this (dataset, horizon) pair to its own log.
            sys.stdout = open(f'./hyperopt-log/{dat}-{time_}.txt', 'w')
            print(f'Running {dat}-{time_}')
            print('=' * 30)
            data = DataLoader.from_file(
                "/Users/ryedida/PycharmProjects/raise-package/issue_close_time/"
                + time_ + "/" + dat + ".csv",
                target="timeOpen",
                col_start=0)

            try:
                a = time.time()
                estim = HyperoptEstimator(
                    classifier=any_classifier('clf'),
                    preprocessing=any_preprocessing('pre'),
                    algo=tpe.suggest,
                    max_evals=30,
                    # Bind (dat, time_) so the loss logs to the right file.
                    loss_fn=partial(loss, dat, time_),
                    trial_timeout=30)

                estim.fit(data.x_train, data.y_train)
                preds = estim.predict(data.x_test)
                metr = ClassificationMetrics(data.y_test, preds)
                metr.add_metrics(['d2h', 'pd', 'pf'])
                print(metr.get_metrics())
                print(estim.best_model())
                b = time.time()

                print('Completed in', b - a, 'seconds.')
            # Collapsed the redundant `except ValueError: continue` followed
            # by a bare `except: continue` (identical action) into one
            # narrowed handler; KeyboardInterrupt/SystemExit now propagate.
            except Exception:
                continue
示例#4
0
def main():
    """Run hyperopt-sklearn over every UCI CSV found under the Dodge data dir.

    Redirects stdout to a per-dataset log file, fits a HyperoptEstimator, and
    prints metrics and the best model. Any failure is re-raised (this variant
    deliberately does not skip errors).
    """
    for dataset in glob.glob('../../../Dodge/data/UCI/*.csv'):
        df = pd.read_csv(dataset)
        # Convention in these UCI CSVs: the last column is the target.
        target = df.columns[-1]
        sys.stdout = open(f'./hyperopt-log/{dataset.split("/")[-1]}.txt', 'w')
        try:
            print(f'Running {dataset}')
            print('=' * 20)
            data = DataLoader.from_file(dataset,
                                        target=target,
                                        col_start=0,
                                        col_stop=-1)

            a = time.time()
            estim = HyperoptEstimator(classifier=any_classifier('clf'),
                                      preprocessing=any_preprocessing('pre'),
                                      algo=tpe.suggest,
                                      max_evals=30,
                                      loss_fn=loss,
                                      trial_timeout=30)

            estim.fit(data.x_train, data.y_train)
            preds = estim.predict(data.x_test)
            metr = ClassificationMetrics(data.y_test, preds)
            metr.add_metrics(['d2h', 'pd', 'pf'])
            print('perf:', metr.get_metrics()[0])
            print(metr.get_metrics())
            print(estim.best_model())
            b = time.time()

            print('Completed in', b - a, 'seconds.')
        except Exception:
            # Re-raise so failures surface; the original had an unreachable
            # `continue` after this `raise`, which has been removed.
            raise
示例#5
0
def main():
    """Run hyperopt-sklearn over each code-smell dataset.

    Redirects stdout to a per-dataset log file, fits a HyperoptEstimator on
    the 'SMELLS' target, and prints metrics and the best model. Failing
    datasets are skipped so the sweep continues.
    """
    for dataset in [
            'DataClass.csv', 'FeatureEnvy.csv', 'GodClass.csv',
            'LongMethod.csv'
    ]:
        sys.stdout = open(f'./hyperopt-log/{dataset}.txt', 'w')
        try:
            print(f'Running {dataset}')
            print('=' * 20)
            data = DataLoader.from_file(f'../../../Dodge/data/smell/{dataset}',
                                        target='SMELLS',
                                        col_start=0,
                                        col_stop=-1)

            a = time.time()
            estim = HyperoptEstimator(classifier=any_classifier('clf'),
                                      preprocessing=any_preprocessing('pre'),
                                      algo=tpe.suggest,
                                      max_evals=30,
                                      loss_fn=loss,
                                      trial_timeout=30)

            estim.fit(data.x_train, data.y_train)
            preds = estim.predict(data.x_test)
            metr = ClassificationMetrics(data.y_test, preds)
            metr.add_metrics(['d2h', 'pd', 'pf'])
            print('perf:', metr.get_metrics()[0])
            print(metr.get_metrics())
            print(estim.best_model())
            b = time.time()

            print('Completed in', b - a, 'seconds.')
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; all ordinary failures are skipped best-effort.
        except Exception:
            continue
示例#6
0
def main():
    """Run hyperopt-sklearn 10 times over each PITS text-mining dataset.

    All 10 repeats of a dataset share one log file (opened once per dataset).
    Failing repeats are skipped so the sweep continues.
    """
    for dataset in ['pitsA', 'pitsB', 'pitsC', 'pitsD', 'pitsE', 'pitsF']:
        sys.stdout = open(f'./hyperopt-log/{dataset}.txt', 'w')
        # 10 independent repeats per dataset; data is reloaded each time so a
        # failed repeat cannot leave a partially-mutated dataset behind.
        for i in range(10):
            try:
                print(f'Running {dataset}')
                print('=' * 20)
                data = TextDataLoader.from_file(
                    f'../../../Dodge/data/textmining/{dataset}.txt')

                a = time.time()
                estim = HyperoptEstimator(
                    classifier=any_classifier('clf'),
                    preprocessing=any_text_preprocessing('pre'),
                    algo=tpe.suggest,
                    max_evals=30,
                    loss_fn=loss,
                    trial_timeout=30)

                estim.fit(data.x_train, data.y_train)
                preds = estim.predict(data.x_test)
                metr = ClassificationMetrics(data.y_test, preds)
                metr.add_metrics(['d2h', 'pd', 'pf'])
                print('perf:', metr.get_metrics()[0])
                print(metr.get_metrics())
                print(estim.best_model())
                b = time.time()

                print('Completed in', b - a, 'seconds.')
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; ordinary failures skip to the next repeat.
            except Exception:
                continue
示例#7
0
def loss(y_true, y_pred):
    """Hyperopt loss function: returns 2 - d2h so minimizing it maximizes d2h.

    :param y_true: ground-truth labels.
    :param y_pred: predicted labels.
    :return: ``2. - d2h`` (float).
    """
    metr = ClassificationMetrics(y_true, y_pred)
    metr.add_metrics(['d2h', 'pd', 'pf'])
    # Only d2h feeds the loss; the original unpacked pd/pf into unused locals
    # (with `pd` shadowing the common pandas alias).
    d2h = metr.get_metrics()[0]
    return 2. - d2h
示例#8
0
def run(data: Data, name: str, config: dict):
    '''
    Runs one experiment, given a Data instance.

    Depending on `config` flags, optionally ultrasamples the data in place,
    then either tunes 30 randomized FeedforwardDL learners with DODGE (which
    handles its own logging; this path returns None), or trains a single
    untuned FeedforwardDL variant and returns its metrics.

    :param {Data} data - The dataset to run on, NOT preprocessed. NOTE: this
        function mutates it in place (transforms, label flips, re-encoding).
    :param {str} name - The name of the experiment.
    :param {dict} config - The config to use. Must be one in the format used in `process_configs`.
    :return - list of ['f1', 'd2h', 'pd', 'pf', 'prec'] metric values, or
        None on the DODGE path.
    '''
    if config.get('ultrasample', False):
        # Apply WFO
        transform = Transform('wfo')
        transform.apply(data)

        # Reverse labels
        # Assumes binary 0/1 labels so `1. - y` flips them — TODO confirm.
        data.y_train = 1. - data.y_train
        data.y_test = 1. - data.y_test

        # Autoencode the inputs
        # Retrain from scratch until the final training loss drops below 1e3;
        # `loss` here is a local float, unrelated to the module-level loss fn.
        loss = 1e4
        while loss > 1e3:
            ae = Autoencoder(n_layers=2, n_units=[10, 7], n_out=5)
            ae.set_data(*data)
            ae.fit()

            loss = ae.model.history.history['loss'][-1]

        # The last-trained (i.e. accepted) autoencoder re-encodes both splits.
        data.x_train = ae.encode(np.array(data.x_train))
        data.x_test = ae.encode(np.array(data.x_test))

    if config.get('dodge', False):
        # Tune the hyper-params
        dodge_config = {
            'n_runs': 10,
            'data': [data],
            'metrics': ['f1', 'd2h', 'pd', 'pf', 'prec'],
            'learners': [],
            'log_path': './ghost-log/',
            # 3 transform names repeated 30x = 90 candidate transforms.
            'transforms': ['standardize', 'normalize', 'minmax'] * 30,
            'random': True,
            'name': name
        }

        # Build 30 candidate learners with randomized architecture ranges.
        for _ in range(30):
            wfo = config.get('wfo', True)
            smote = config.get('smote', True)
            weighted = config.get('weighted_loss', True)

            dodge_config['learners'].append(
                FeedforwardDL(weighted=weighted,
                              wfo=wfo,
                              smote=smote,
                              random={
                                  'n_units': (2, 6),
                                  'n_layers': (2, 5)
                              },
                              n_epochs=50))

        # DODGE writes its own results to log_path; nothing to return here.
        dodge = DODGE(dodge_config)
        dodge.optimize()
        return

    # Otherwise, it's one of the untuned approaches.
    elif config.get('wfo', False):
        # WFO implies the full treatment: weighted loss + SMOTE as well.
        learner = FeedforwardDL(weighted=True,
                                wfo=True,
                                smote=True,
                                n_epochs=50)
        learner.set_data(*data)
        learner.fit()

    elif config.get('weighted_loss', False):
        # Weighted loss only, no WFO/SMOTE.
        learner = FeedforwardDL(weighted=True,
                                wfo=False,
                                smote=False,
                                n_epochs=50)
        learner.set_data(*data)
        learner.fit()

    else:
        # Plain baseline with randomized architecture ranges.
        learner = FeedforwardDL(weighted=False,
                                wfo=False,
                                smote=False,
                                n_epochs=50,
                                random={
                                    'n_layers': (1, 5),
                                    'n_units': (5, 20)
                                })
        learner.set_data(*data)
        learner.fit()

    # Get the results.
    preds = learner.predict(data.x_test)
    m = ClassificationMetrics(data.y_test, preds)
    m.add_metrics(['f1', 'd2h', 'pd', 'pf', 'prec'])
    results = m.get_metrics()
    return results