def dnn_model_kfold(k=10):
    """Run k-fold cross validation of the DNN model in parallel.

    Keyword Arguments:
        k {integer} -- the number of folds (default: {10})

    Returns:
        dict -- top-k accuracy metrics averaged over all folds,
            each value rounded to 3 decimal places.
    """
    samples = csv2dict(DATASET.features)

    # These collections speed up the top-k accuracy calculation.
    sample_dict, bug_reports, br2files_dict = helper_collections(samples)

    # Shuffle so folds are not biased by the on-disk row order.
    np.random.shuffle(samples)

    # K-fold cross validation in parallel; n_jobs=-2 uses all cores but one.
    acc_dicts = Parallel(n_jobs=-2)(
        delayed(train_dnn)(
            i, k, samples, start, step, sample_dict, bug_reports, br2files_dict
        )
        for i, (start, step) in enumerate(kfold_split_indexes(k, len(samples)))
    )

    # Average each metric across folds. A generator expression (not a
    # throwaway list) feeds sum(); iterating the dict directly replaces
    # the redundant .keys() call.
    avg_acc_dict = {
        key: round(sum(d[key] for d in acc_dicts) / len(acc_dicts), 3)
        for key in acc_dicts[0]
    }
    return avg_acc_dict
def rsvm_model():
    """Evaluate ranking by plain rVSM similarity (no trained model).

    Returns:
        dict -- top-k accuracy of the rVSM-only ranking.
    """
    samples = csv2dict(DATASET.features)

    # Helper collections speed up the top-k accuracy calculation; the
    # second argument selects the rVSM-only code path.
    sample_dict, bug_reports, br2files_dict = helper_collections(samples, True)

    return topk_accuarcy(bug_reports, sample_dict, br2files_dict)
# Script: build per-bug-report rVSM similarity lookups and prepare to
# score top-k accuracy of ranking files by rVSM similarity alone.
from util import csv2dict, tsv2dict
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, KFold

# Feature rows; each row maps column name -> string value.
# NOTE(review): assumes columns include 'report_id', 'file' and
# 'rVSM_similarity' -- confirm against util.csv2dict's output.
samples = csv2dict()

# All rVSM similarity scores as floats (one per sample row).
rvsm_list = [float(sample['rVSM_similarity']) for sample in samples]

# sample_dict: report_id -> list of {filename: [rVSM_similarity]} entries.
sample_dict = {}
for sample in samples:
    sample_dict[sample["report_id"]] = []
for sample in samples:
    temp_dict = {}
    temp_dict[sample["file"]] = [float(sample['rVSM_similarity'])]
    sample_dict[sample["report_id"]].append(temp_dict)

# Bug reports with their ground-truth buggy files.
# NOTE(review): assumes each report dict has 'id' and 'files' keys --
# verify against util.tsv2dict.
bug_reports = tsv2dict()
bug_reports_files_dict = {}
for bug_report in bug_reports:
    bug_reports_files_dict[bug_report["id"]] = bug_report["files"]

# topk_counters[k-1] counts hits within the top (k) ranked files;
# negative_total tracks reports that could not be scored.
topk_counters = [0] * 20
negative_total = 0
for bug_report in bug_reports:
    dnn_input = []
    corresponding_files = []
    bug_id = bug_report["id"]
if __name__ == '__main__':
    # CLI entry point: fill a Jinja2 template once per CSV row, then mail it.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "context",
        help="Path to a CSV file with the data context for the template. Each row in the CSV will result in one sent email. The template is filled with associated data from a given row. Must have a column named 'email' for To: address.",
    )
    parser.add_argument(
        "template",
        help="Path to a Jinja2 template file that can be filled by the context",
    )
    parser.add_argument(
        "subject",
        help="Quoted string with email's subject line, subject string is NOT filled as a template",
    )
    parser.add_argument(
        "-q",
        "--quiet",
        action="store_true",
        help="Supress printing of all filled templates.",
    )
    parser.add_argument(
        "-d",
        "--dryrun",
        action="store_true",
        help="Don't actually send emails, just print them to screen.",
    )
    args = parser.parse_args()

    # Module-level globals consumed by main().
    context = util.csv2dict(args.context)
    template = util.readtemplate(args.template)
    subject = args.subject
    main()
# Script: load feature rows, oversample the rare rows, build the design
# matrix, and split into train/test sets for the DNN regressor.
from util import csv2dict, tsv2dict
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, KFold

samples_ = csv2dict()

# Oversampling: every 51st row (i % 51 == 0) appears 10x in total,
# the rest once.
samples = []
for i, sample in enumerate(samples_):
    samples.append(sample)
    if i % 51 == 0:
        samples.extend([sample] * 9)

np.random.shuffle(samples)

# Design matrix (5 features per sample) and target vector.
x_ = np.zeros((len(samples), 5))
y_ = np.zeros((len(samples), 1))
for i, sample in enumerate(samples):
    x_[i, 0] = float(sample['rVSM_similarity'])
    x_[i, 1] = float(sample['collab_filter'])
    x_[i, 2] = float(sample['classname_similarity'])
    x_[i, 3] = float(sample['bug_recency'])
    x_[i, 4] = float(sample['bug_frequency'])
    y_[i] = float(sample['match'])

# Hold out 20% for evaluation; fixed seed keeps the split reproducible.
data_train, data_test, labels_train, labels_test = train_test_split(
    x_, y_, test_size=0.20, random_state=42
)