def loss(d, t, y_true, y_pred):
    """Hyperopt loss function that also appends metrics to a per-run log file.

    :param d: dataset name (used to build the log filename).
    :param t: time/config tag (used to build the log filename).
    :param y_true: ground-truth labels.
    :param y_pred: predicted labels.
    :return: ``2. - d2h``, the quantity hyperopt minimizes.
    """
    metr = ClassificationMetrics(y_true, y_pred)
    metr.add_metrics(['d2h', 'pd', 'pf'])
    d2h, pd, pf = metr.get_metrics()
    # BUG FIX: the original opened the log file without ever closing it,
    # leaking a file handle per evaluation; a context manager guarantees
    # the handle is released even if print() raises.
    with open(f'./hyperopt-log/{d}-{t}.txt', 'a') as file:
        print(f'd2h = {d2h}\tpd = {pd}\tpf = {pf}', file=file)
    return 2. - d2h
def main():
    """Run hyperopt over paired defect-prediction dataset versions.

    For each dataset pair, redirects stdout to a per-dataset log file,
    loads the train/test versions, and fits a HyperoptEstimator; failures
    on one dataset do not abort the sweep.
    """
    file_dic = {"ivy": ["ivy-1.4.csv", "ivy-2.0.csv"],
                "lucene": ["lucene-2.0.csv", "lucene-2.2.csv"],
                "lucene2": ["lucene-2.2.csv", "lucene-2.4.csv"],
                "poi": ["poi-1.5.csv", "poi-2.5.csv"],
                "poi2": ["poi-2.5.csv", "poi-3.0.csv"],
                "synapse": ["synapse-1.0.csv", "synapse-1.1.csv"],
                "synapse2": ["synapse-1.1.csv", "synapse-1.2.csv"],
                "camel": ["camel-1.2.csv", "camel-1.4.csv"],
                "camel2": ["camel-1.4.csv", "camel-1.6.csv"],
                "xerces": ["xerces-1.2.csv", "xerces-1.3.csv"],
                "jedit": ["jedit-3.2.csv", "jedit-4.0.csv"],
                "jedit2": ["jedit-4.0.csv", "jedit-4.1.csv"],
                "log4j": ["log4j-1.0.csv", "log4j-1.1.csv"],
                "xalan": ["xalan-2.4.csv", "xalan-2.5.csv"]}
    for dataset in file_dic:
        # BUG FIX: the original referenced the undefined name `dat` in the
        # next two lines (NameError on first iteration); the loop variable
        # here is `dataset`.
        sys.stdout = open(f'./hyperopt-log/{dataset}.txt', 'w')
        print(f'Running {dataset}')
        print('=' * 20)
        data = DataLoader.from_files(base_path='./issue_close_time/',
                                     files=file_dic[dataset])
        try:
            a = time.time()
            estim = HyperoptEstimator(classifier=any_classifier('clf'),
                                      preprocessing=any_preprocessing('pre'),
                                      algo=tpe.suggest,
                                      max_evals=30,
                                      loss_fn=loss,
                                      trial_timeout=30)
            estim.fit(data.x_train, data.y_train)
            preds = estim.predict(data.x_test)
            metr = ClassificationMetrics(data.y_test, preds)
            metr.add_metrics(['d2h', 'pd', 'pf'])
            print(metr.get_metrics())
            print(estim.best_model())
            b = time.time()
            print('Completed in', b - a, 'seconds.')
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; any per-dataset failure moves to the next one.
            continue
def main():
    """Run hyperopt on every (dataset, close-time window) combination.

    Each run redirects stdout to its own log file; `loss` is partially
    applied with the dataset and window names so it can log per-run metrics.
    """
    directories = [
        "1 day", "7 days", "14 days", "30 days", "90 days",
        "180 days", "365 days"
    ]
    datasets = [
        "camel", "cloudstack", "cocoon", "hadoop", "deeplearning",
        "hive", "node", "ofbiz", "qpid"
    ]
    for dat in datasets:
        for time_ in directories:
            sys.stdout = open(f'./hyperopt-log/{dat}-{time_}.txt', 'w')
            print(f'Running {dat}-{time_}')
            print('=' * 30)
            data = DataLoader.from_file(
                "/Users/ryedida/PycharmProjects/raise-package/issue_close_time/"
                + time_ + "/" + dat + ".csv",
                target="timeOpen", col_start=0)
            try:
                a = time.time()
                estim = HyperoptEstimator(
                    classifier=any_classifier('clf'),
                    preprocessing=any_preprocessing('pre'),
                    algo=tpe.suggest,
                    max_evals=30,
                    # Bind the run identifiers so loss() logs to the right file.
                    loss_fn=partial(loss, dat, time_),
                    trial_timeout=30)
                estim.fit(data.x_train, data.y_train)
                preds = estim.predict(data.x_test)
                metr = ClassificationMetrics(data.y_test, preds)
                metr.add_metrics(['d2h', 'pd', 'pf'])
                print(metr.get_metrics())
                print(estim.best_model())
                b = time.time()
                print('Completed in', b - a, 'seconds.')
            except Exception:
                # BUG FIX: the original had `except ValueError: continue`
                # immediately followed by a bare `except: continue` — the
                # first clause was redundant and the bare clause swallowed
                # KeyboardInterrupt. One narrowed handler does both jobs.
                continue
def main():
    """Run hyperopt over every UCI CSV dataset found by glob.

    The last column of each CSV is taken as the classification target;
    stdout is redirected to a per-dataset log file.
    """
    for dataset in glob.glob('../../../Dodge/data/UCI/*.csv'):
        df = pd.read_csv(dataset)
        # Convention here: the target is the final column of the CSV.
        target = df.columns[-1]
        sys.stdout = open(f'./hyperopt-log/{dataset.split("/")[-1]}.txt', 'w')
        try:
            print(f'Running {dataset}')
            print('=' * 20)
            data = DataLoader.from_file(dataset, target=target,
                                        col_start=0, col_stop=-1)
            a = time.time()
            estim = HyperoptEstimator(classifier=any_classifier('clf'),
                                      preprocessing=any_preprocessing('pre'),
                                      algo=tpe.suggest,
                                      max_evals=30,
                                      loss_fn=loss,
                                      trial_timeout=30)
            estim.fit(data.x_train, data.y_train)
            preds = estim.predict(data.x_test)
            metr = ClassificationMetrics(data.y_test, preds)
            metr.add_metrics(['d2h', 'pd', 'pf'])
            print('perf:', metr.get_metrics()[0])
            print(metr.get_metrics())
            print(estim.best_model())
            b = time.time()
            print('Completed in', b - a, 'seconds.')
        except Exception:
            # BUG FIX: the original bare `except:` did `raise` followed by an
            # unreachable `continue` (dead code). Keep the re-raise — this
            # variant deliberately surfaces failures — and drop the dead line.
            raise
def main():
    """Run hyperopt on the four code-smell datasets.

    Each dataset's 'SMELLS' column is the target; stdout is redirected to
    a per-dataset log file, and failures skip to the next dataset.
    """
    for dataset in [
            'DataClass.csv', 'FeatureEnvy.csv', 'GodClass.csv',
            'LongMethod.csv'
    ]:
        sys.stdout = open(f'./hyperopt-log/{dataset}.txt', 'w')
        try:
            print(f'Running {dataset}')
            print('=' * 20)
            data = DataLoader.from_file(f'../../../Dodge/data/smell/{dataset}',
                                        target='SMELLS',
                                        col_start=0,
                                        col_stop=-1)
            a = time.time()
            estim = HyperoptEstimator(classifier=any_classifier('clf'),
                                      preprocessing=any_preprocessing('pre'),
                                      algo=tpe.suggest,
                                      max_evals=30,
                                      loss_fn=loss,
                                      trial_timeout=30)
            estim.fit(data.x_train, data.y_train)
            preds = estim.predict(data.x_test)
            metr = ClassificationMetrics(data.y_test, preds)
            metr.add_metrics(['d2h', 'pd', 'pf'])
            # Hoist: the original called get_metrics() twice in a row.
            metrics = metr.get_metrics()
            print('perf:', metrics[0])
            print(metrics)
            print(estim.best_model())
            b = time.time()
            print('Completed in', b - a, 'seconds.')
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C still interrupts.
            continue
def main():
    """Run hyperopt 10 times on each PITS text-mining dataset.

    All 10 repeats of a dataset log to the same per-dataset file (opened
    once, in 'w' mode, before the repeat loop).
    """
    for dataset in ['pitsA', 'pitsB', 'pitsC', 'pitsD', 'pitsE', 'pitsF']:
        sys.stdout = open(f'./hyperopt-log/{dataset}.txt', 'w')
        # `_`: the repeat index itself is unused (original named it `i`).
        for _ in range(10):
            try:
                print(f'Running {dataset}')
                print('=' * 20)
                data = TextDataLoader.from_file(
                    f'../../../Dodge/data/textmining/{dataset}.txt')
                a = time.time()
                estim = HyperoptEstimator(
                    classifier=any_classifier('clf'),
                    preprocessing=any_text_preprocessing('pre'),
                    algo=tpe.suggest,
                    max_evals=30,
                    loss_fn=loss,
                    trial_timeout=30)
                estim.fit(data.x_train, data.y_train)
                preds = estim.predict(data.x_test)
                metr = ClassificationMetrics(data.y_test, preds)
                metr.add_metrics(['d2h', 'pd', 'pf'])
                # Hoist: the original called get_metrics() twice in a row.
                metrics = metr.get_metrics()
                print('perf:', metrics[0])
                print(metrics)
                print(estim.best_model())
                b = time.time()
                print('Completed in', b - a, 'seconds.')
            except Exception:
                # Narrowed from a bare `except:` so Ctrl-C still interrupts;
                # a failed repeat moves on to the next one.
                continue
def loss(y_true, y_pred):
    """Hyperopt loss: ``2. - d2h`` computed from the classification metrics.

    :param y_true: ground-truth labels.
    :param y_pred: predicted labels.
    :return: the value hyperopt minimizes.
    """
    metrics = ClassificationMetrics(y_true, y_pred)
    metrics.add_metrics(['d2h', 'pd', 'pf'])
    # Only d2h (first metric registered above) feeds the loss; pd/pf are
    # requested but unused here.
    d2h = metrics.get_metrics()[0]
    return 2. - d2h
def run(data: Data, name: str, config: dict):
    ''' Runs one experiment, given a Data instance.

    :param {Data} data - The dataset to run on, NOT preprocessed.
    :param {str} name - The name of the experiment.
    :param {dict} config - The config to use. Must be one in the format used in `process_configs`.
    '''
    if config.get('ultrasample', False):
        # Apply WFO
        # NOTE(review): Transform('wfo') mutates `data` in place — confirm
        # against the Transform implementation.
        transform = Transform('wfo')
        transform.apply(data)
        # Reverse labels
        data.y_train = 1. - data.y_train
        data.y_test = 1. - data.y_test
        # Autoencode the inputs
        # Retrain from scratch until the final training loss drops below 1e3;
        # the sentinel 1e4 guarantees at least one training attempt.
        loss = 1e4
        while loss > 1e3:
            ae = Autoencoder(n_layers=2, n_units=[10, 7], n_out=5)
            # assumes Data is iterable/unpackable into what set_data expects
            # — TODO confirm.
            ae.set_data(*data)
            ae.fit()
            # Last epoch's training loss from the Keras-style history object.
            loss = ae.model.history.history['loss'][-1]
        # Replace raw features with the 5-dim encoded representation.
        data.x_train = ae.encode(np.array(data.x_train))
        data.x_test = ae.encode(np.array(data.x_test))
    if config.get('dodge', False):
        # Tune the hyper-params
        dodge_config = {
            'n_runs': 10,
            'data': [data],
            'metrics': ['f1', 'd2h', 'pd', 'pf', 'prec'],
            'learners': [],
            'log_path': './ghost-log/',
            # 90 transform entries: the three names repeated 30 times.
            'transforms': ['standardize', 'normalize', 'minmax'] * 30,
            'random': True,
            'name': name
        }
        # One randomly-configured feedforward learner per iteration, 30 total.
        for _ in range(30):
            wfo = config.get('wfo', True)
            smote = config.get('smote', True)
            weighted = config.get('weighted_loss', True)
            dodge_config['learners'].append(
                FeedforwardDL(weighted=weighted, wfo=wfo, smote=smote,
                              random={
                                  'n_units': (2, 6),
                                  'n_layers': (2, 5)
                              },
                              n_epochs=50))
        dodge = DODGE(dodge_config)
        dodge.optimize()
        # DODGE handles its own logging; nothing to return for this branch.
        return
    # Otherwise, it's one of the untuned approaches.
    elif config.get('wfo', False):
        learner = FeedforwardDL(weighted=True, wfo=True, smote=True,
                                n_epochs=50)
        learner.set_data(*data)
        learner.fit()
    elif config.get('weighted_loss', False):
        learner = FeedforwardDL(weighted=True, wfo=False, smote=False,
                                n_epochs=50)
        learner.set_data(*data)
        learner.fit()
    else:
        # Baseline: no weighting, no WFO, no SMOTE; architecture randomized.
        learner = FeedforwardDL(weighted=False, wfo=False, smote=False,
                                n_epochs=50,
                                random={
                                    'n_layers': (1, 5),
                                    'n_units': (5, 20)
                                })
        learner.set_data(*data)
        learner.fit()
    # Get the results.
    preds = learner.predict(data.x_test)
    m = ClassificationMetrics(data.y_test, preds)
    m.add_metrics(['f1', 'd2h', 'pd', 'pf', 'prec'])
    # Returns the metric values in the order added above
    # — presumably [f1, d2h, pd, pf, prec]; verify against ClassificationMetrics.
    results = m.get_metrics()
    return results