def organize_bin_versions(project: Project):
    """Write the first eight selected "bin" versions of *project* to ./versions.

    Reads the cached selected-versions CSV for the project, drops the binning
    bookkeeping columns (start/step/stop) and writes the result to
    versions/<github>.csv.  Returns the cache-relative input path on success,
    None on failure or when the input CSV is missing.
    """
    general_log = logging.getLogger(__name__)
    success_log = logging.getLogger("success")
    failure_log = logging.getLogger("failure")
    failure_verbose_log = logging.getLogger("failure_verbose")
    general_log.info("organizing bin versions for {0}".format(project.github()))
    try:
        data = Config().config['CACHING']['RepositoryData']
        selected = Config().config['DATA_EXTRACTION']['SelectedVersionsBin']
        path = os.path.join(data, selected, project.github() + ".csv")
        in_path = Config.get_work_dir_path(path)
        versions = "versions"
        Path(versions).mkdir(parents=True, exist_ok=True)
        dest_path = os.path.join(versions, project.github() + ".csv")
        # BUG FIX: success used to be logged (and the path returned) even when
        # the input CSV did not exist; treat a missing input as a failure.
        if not os.path.exists(in_path):
            failure_log.error("organize | bin | Failed to organize {0}".format(project.github()))
            return None
        df = pd.read_csv(in_path)
        # Keep only the first 8 versions; start/step/stop are bin metadata.
        df.head(8).drop(columns=["start", "step", "stop"]).to_csv(dest_path, index=False)
        success_log.info("organize | bin | Succeeded to organize {0}".format(project.github()))
        return path
    except Exception:
        failure_log.error("organize | bin | Failed to organize {0}".format(project.github()))
        failure_verbose_log.exception("organize | bin | Failed to organize {0}".format(project.github()))
def test_project_update(self):
    """Updating a stored project replaces its name and description."""
    original = Project('Project A', 'Work on A')
    pid = self._project_manager.add_project(original)
    replacement = Project('Project B', 'Work on B')
    self._project_manager.update_project(pid, replacement)
    stored = self._project_manager.find_project_by_id(pid)
    self.assertEqual(stored.name, replacement.name)
    self.assertEqual(stored.description, replacement.description)
def project(context, l, c, namespace, name):
    """List existing GitLab project paths (-l) and/or create one (-c)."""
    gitlab = context.obj['active_gitlab']
    proj = Project(projects=gitlab.projects, namespaces=gitlab.namespaces)
    if l:
        for path in proj.listProjectsPaths():
            print(path)
    if c:
        proj.createProject(namespace=namespace, name=name)
def test_listProjectsPaths(self, list):
    """Authenticate, then list project paths via the mocked list API."""
    print('starting now to auth')
    auth = Auth(username=logindetails.user, token=logindetails.token)
    gitlab = auth.authorizeUser()
    # Build a Project wrapper around the authorized session.
    proj = Project(projects=gitlab.projects, namespaces=gitlab.namespaces)
    # The API list call is mocked to return Mockproject objects.
    paths = proj.listProjectsPaths()
    print(paths)
def test_project_remove(self):
    """Removing one of two projects leaves the other intact."""
    first = Project('Project A', 'Work on A')
    first_id = self._project_manager.add_project(first)
    second = Project('Project B', 'Work on B')
    second_id = self._project_manager.add_project(second)
    self.assertEqual(self._project_manager.count_projects(), 2)
    self._project_manager.remove_project(first_id)
    survivor = self._project_manager.find_project_by_id(second_id)
    self.assertEqual(survivor.name, 'Project B')
    self.assertEqual(self._project_manager.count_projects(), 1)
def test_find_project_by_id(self):
    """Each stored project is retrievable by the id add_project returned."""
    first = Project('Project A', 'Work on A')
    second = Project('Project B', 'Work on B')
    first_id = self._project_manager.add_project(first)
    second_id = self._project_manager.add_project(second)
    for pid, name, description in (
            (first_id, 'Project A', 'Work on A'),
            (second_id, 'Project B', 'Work on B')):
        found = self._project_manager.find_project_by_id(pid)
        self.assertEqual(found.name, name)
        self.assertEqual(found.description, description)
def test_find_projects_by_name(self):
    """Name search is substring-based and case-insensitive."""
    for name, description in (('Initialization', 'Work on A'),
                              ('Progress', 'Work on B'),
                              ('Final steps', 'Work on C')):
        self._project_manager.add_project(Project(name, description))
    # 'in' matches both 'Initialization' and 'Final steps'.
    self.assertEqual(len(self._project_manager.find_projects_by_name('in')), 2)
    # A phrase that appears in no single name matches nothing.
    self.assertEqual(len(self._project_manager.find_projects_by_name('in progress')), 0)
    # Matching ignores case.
    self.assertEqual(len(self._project_manager.find_projects_by_name('PROGRESS')), 1)
def __init__(self, extractor_name, project: Project, version, repo=None):
    """Bind an extractor run to a project/version and resolve its repo path.

    When *repo* is not supplied, a Repo is built from the project's own
    jira/github/path metadata for the given version.
    """
    cfg = Config().config
    self.extractor_name = extractor_name
    self.project = project
    self.project_name = project.github()
    self.version = version
    self.config = cfg
    self.runner = self._get_runner(cfg, extractor_name)
    if repo is None:
        repo = Repo(project.jira(), project.github(), project.path(), version)
    self.local_path = os.path.realpath(repo.local_path)
    self.file_analyser = JavaParserFileAnalyser(
        self.local_path, self.project_name, self.version)
    # Populated later by the extraction step.
    self.data: Data = None
def main():
    """Load all project records and render the score tables and plots."""
    print("Loading data. This may take some time.")
    records = []
    for current in Project.available_projects():
        records.append(Record(current))
    show_rates_table(records)
    method_category_score(records)
    method_score_distributions(records)
    plt.show()
def download(self):
    # Fetch the Gaudi sources, dispatching on the URL scheme.
    # NOTE: Python 2 source (print statements).
    url = self.url()
    print 'gaudi url: "%s"'%url
    print 'gaudi tag: "%s"'%self.tag()
    # Gaudi has a peculiar repository
    if url[:3] == 'git':
        return self._download_git_monolithic()
    if url[:3] == 'svn' and 'cern.ch' in url:
        # NOTE(review): after the CERN-SVN download we still fall through to
        # the generic Project.download below -- confirm that is intended.
        self._download_cern_svn()
    return Project.download(self)
def __init__(self, server):
    """Cache the id of the built-in 'Tableau Samples' project, if present."""
    super(WorkbookManager, self).__init__(server)
    self.sample_project_id = None
    entry = Project.get_by_name(
        self.server.environment.envid, 'Tableau Samples')
    if entry:
        self.sample_project_id = entry.id
def loadProjectFile(self, filePath):
    """Load project settings from *filePath*; report errors via the UI."""
    try:
        settings = Project.load(filePath)
    except Exception as err:
        logger.error(err)
        self.ui.errorMsg(f'Error loading project {filePath}.')
        # Keep the recent-projects menu in sync even on failure.
        self.updateRecentProjectsActions()
    else:
        self.loadProject(settings)
def main():
    """Compare Descartes and Gregor records per project: times and mutants."""
    print('Loading data. This may take a while...')
    projects = []
    for entry in Project.available_projects():
        projects.append((entry.name, Record(entry.descartes), Record(entry.gregor)))
    print('Execution time:')
    time_table(projects)
    plot_times(projects)
    print('Number of mutants created:')
    mutants_table(projects)
    plot_mutants(projects)
    plt.show()
def add_project(self):
    # Interactively collect project fields and persist a new Project whose
    # lead and manager are the named account.  Python 2 source (raw_input).
    user_name = raw_input("User name: ")
    # NOTE(review): if no account matches, user_account is never bound and
    # the Project(...) call below raises NameError -- confirm intended.
    for account in Account.objects:
        if account.username == user_name:
            user_account = account
    self.title = raw_input("Project title: ")
    self.contact = raw_input("Project contact: ")
    self.results = raw_input("Results: ")
    self.nsf_Aggreement = raw_input("NSF Aggreement (Yes or No): ")
    self.slide_collection_aggreement = raw_input(
        "Slide Collection Aggreement (Yes or No): ")
    self.other = raw_input("Other: ")
    project = Project(project_title=self.title,
                      lead=user_account,
                      manager=user_account,
                      contact=self.contact,
                      results=self.results,
                      nsf_Aggreement=self.nsf_Aggreement,
                      slide_collection_aggreement=self.slide_collection_aggreement,
                      other=self.other)
    project.save()
    print project.project_title
def test_properties(self):
    """Constructor arguments are exposed unchanged as attributes."""
    team = [1, 2, 3]
    docs = [4, 5, 6]
    proj = Project('project_1', 'The first project', members=team, documents=docs)
    self.assertEqual(proj.name, 'project_1')
    self.assertEqual(proj.description, 'The first project')
    self.assertEqual(proj.members, [1, 2, 3])
    self.assertEqual(proj.documents, [4, 5, 6])
def do_create(self, arg):
    """\nCreate a new project from scratch. create name=<string> [target=<string>] [version=<string>] [author=<string>] name = new project name. Default = New Project. target = project target FPGA/CPLD. Default = APF9328. version = project version. Default = 1.0. author = component category. Default = User Component. """
    args = CREATION_ARGS.parse(arg)
    if args:
        proj = Project()
        proj.name = args.name
        proj.version = args.version
        # NOTE(review): the help text advertises an ``author`` option, but the
        # parsed namespace is read as ``category`` -- confirm which is right.
        proj.category = args.category
        proj.target = args.target
        # CONSISTENCY FIX: the error branch writes to self.stdout (the
        # cmd.Cmd output stream); the success branch used ``self.write``,
        # which cmd.Cmd does not define.  Use the same stream in both.
        self.stdout.write("New project created.\n")
        settings.active_project = proj
    else:
        self.stdout.write("*** Arguments extraction error, creation canceled.\n")
def get_data():
    """Yield per-project tuples of non-accessible-method ratios.

    For each available project: (name, ratio among pseudo-tested methods,
    ratio among methods under analysis, ratio among all methods).
    """
    for project in Project.available_projects():
        record = Record(project)
        not_accessible = {method_id(m) for m in project.methods
                          if 'ACCESSIBLE' not in m['classifications']}
        every_method = {method_id(m) for m in project.methods}

        def ratio(methods):
            # Fraction of *methods* that are not accessible.
            return len(methods & not_accessible) / len(methods)

        yield (project.name,
               ratio(record.pseudo_tested),
               ratio(record.methods_under_analysis),
               ratio(every_method))
def __init__(self):
    # Build the main window: wire the generated UI, install the video
    # annotation widget as the central widget, then restore saved settings.
    super(MainWindow, self).__init__()
    self.ui = Ui_MainWindow()
    self.ui.setupUi(self)
    self.connectActions()
    self.videoAnnoWidget = VideoAnnoWidget(self)
    self.setCentralWidget(self.videoAnnoWidget)
    # Fresh in-memory project until one is loaded from disk.
    self.project: Project = Project()
    self.appSettings: AppSettings = AppSettings()
    self.loadAppSettings()
def add_project(self):
    # Interactively collect project fields and persist a new Project whose
    # lead and manager are the named account.  Python 2 source (raw_input).
    user_name = raw_input("User name: ")
    # NOTE(review): if no account matches, user_account is never bound and
    # the Project(...) call below raises NameError -- confirm intended.
    for account in Account.objects:
        if account.username == user_name:
            user_account = account
    self.title = raw_input("Project title: ")
    self.contact = raw_input("Project contact: ")
    self.results = raw_input("Results: ")
    self.nsf_Aggreement = raw_input("NSF Aggreement (Yes or No): ")
    self.slide_collection_aggreement = raw_input(
        "Slide Collection Aggreement (Yes or No): ")
    self.other = raw_input("Other: ")
    project = Project(
        project_title=self.title,
        lead=user_account,
        manager=user_account,
        contact=self.contact,
        results=self.results,
        nsf_Aggreement=self.nsf_Aggreement,
        slide_collection_aggreement=self.slide_collection_aggreement,
        other=self.other)
    project.save()
    print project.project_title
def main():
    """Score every project with both tools, then tabulate, correlate, plot."""
    projects = list(Project.available_projects())
    # Compute the scores
    print('Computing the scores. This may take a while...')
    scores = []
    for proj in projects:
        scores.append(get_both_scores(proj))
    descartes_scores = [pair[0].score for pair in scores]
    gregor_scores = [pair[1].score for pair in scores]
    # Show the table
    render_table([proj.name for proj in projects], scores)
    # Show the correlation
    correlation = spearmanr(descartes_scores, gregor_scores)
    print(f'The Spearman correlation coefficient is {correlation.correlation} with a p-value of {correlation.pvalue}')
    # Show the plots
    show_plot(descartes_scores, gregor_scores)
    bland_altman_plot(descartes_scores, gregor_scores)
    plt.show()
def read_file(self, month, year, file):
    """Load the '<month><year>' sheet of *file* into self.main_list.

    Column 0 holds the dates; each remaining column is one project whose
    header row is its name and whose cells are word entries per date.
    """
    workbook = xlrd.open_workbook(file)
    sheet = workbook.sheet_by_name(month + year)
    n_rows = sheet.nrows
    n_cols = sheet.ncols
    self.project_dates_col = sheet.col_values(0)
    for col in range(1, n_cols):
        proj = Project(sheet.cell_value(0, col))
        for row in range(1, n_rows):
            date_value = sheet.cell_value(row, 0)
            words = sheet.cell_value(row, col)
            proj.list_of_occurrences.append([date_value, words])
        self.main_list.append(proj)
def update_file(self, day, month, new_project, new_words):
    """Record *new_words* for *new_project* on "<day> <month>".

    Updates the existing entry when both the project and the date already
    exist, appends a new date entry to an existing project, or registers a
    brand-new project otherwise.  (Debug prints preserved from original.)
    """
    today = day + " " + month
    temp_project = Project(new_project)
    if today not in self.project_dates_col:
        self.project_dates_col.append(today)
    # 9999 acts as a "not found" sentinel index (original convention).
    proj_index = 9999
    date_index = 9999
    proj_found = False
    date_found = False
    # Find the project's position in the main list (last match wins).
    for pos, proj in enumerate(self.main_list):
        if proj.get_name() == new_project:
            proj_index = pos
            proj_found = True
            print(proj_index)
    if proj_found:
        for pos, pair in enumerate(self.main_list[proj_index].list_of_occurrences):
            print(pair[0])
            if pair[0] == today:
                date_index = pos
                date_found = True
                print(date_index)
        if date_found:
            self.main_list[proj_index].list_of_occurrences[date_index][1] = new_words
        else:
            self.main_list[proj_index].list_of_occurrences.append([today, new_words])
    else:
        temp_project.list_of_occurrences.append([today, new_words])
        print(temp_project.list_of_occurrences)
        self.main_list.append(temp_project)
def __init__(self):
    """Initialize this object as the ``gaudi`` project."""
    Project.__init__(self, "gaudi")
class user_project():
    # Interactive console helper for creating/listing/deleting Project
    # documents (MongoEngine-style ``.objects`` API) tied to Account users.
    # Python 2 source (print statements, raw_input).
    #
    # NOTE(review): these class attributes are shared across instances;
    # several are commented-out placeholders for future fields.
    accounts = Account.objects()
    projects = Project.objects()
    title = ""
    category = ""
    #keywords = ""
    contact = ""
    #members = ""
    #alumni = ""
    #nsf_grant_number = ""
    #nsf_grant_url = ""
    results = ""
    nsf_Aggreement = ""  # Yes/No
    slide_collection_aggreement = ""  # Yes/No
    other = ""

    def add_project(self):
        # Prompt for project fields and persist a new Project led/managed
        # by the named account.
        user_name = raw_input("User name: ")
        # NOTE(review): if no account matches, user_account stays unbound
        # and the Project(...) call raises NameError -- confirm intended.
        for account in Account.objects:
            if account.username == user_name:
                user_account = account
        self.title = raw_input("Project title: ")
        self.contact = raw_input("Project contact: ")
        self.results = raw_input("Results: ")
        self.nsf_Aggreement = raw_input("NSF Aggreement (Yes or No): ")
        self.slide_collection_aggreement = raw_input(
            "Slide Collection Aggreement (Yes or No): ")
        self.other = raw_input("Other: ")
        project = Project(
            project_title=self.title,
            lead=user_account,
            manager=user_account,
            contact=self.contact,
            results=self.results,
            nsf_Aggreement=self.nsf_Aggreement,
            slide_collection_aggreement=self.slide_collection_aggreement,
            other=self.other)
        project.save()
        print project.project_title
        #account.project = self.title refernce and list field

    def generate_random(self):
        # Placeholder -- not implemented.
        pass

    def generate_project(self):
        # Placeholder -- not implemented.
        pass

    def list_project(self):
        # Print every stored project title and its repr, blank-line separated.
        project = Project.objects()
        for project in Project.objects():
            print
            print project.project_title, ":", project
            print

    def del_project(self):
        # Delete every project whose lead matches the named user.
        user_name = raw_input("User name: ")
        for project in Project.objects:
            if project.lead.username == user_name:
                project.delete()
def test_creation(self):
    """A project is constructible from just a name and a description."""
    created = Project('project_1', 'The first project')
def set_project(self, github_name, github_user, jira_name, jira_url):
    """Build the active Project from explicit GitHub/Jira coordinates."""
    target = Project(github_name, github_user, '', [jira_name], [], jira_url, '')
    self.project = target
    self.set_extractor()
def test_creation_with_documents(self):
    """A project accepts an initial documents collection."""
    attached = [1, 2, 3]
    created = Project('project_1', 'The first project', documents=attached)
def test_creation_with_members(self):
    """A project accepts an initial members collection."""
    team = [1, 2, 3]
    created = Project('project_1', 'The first project', members=team)
def list_project(self):
    # Print every stored project title and its repr, separated by blank
    # lines.  Python 2 source (print statements).
    # NOTE(review): the first assignment is immediately shadowed by the
    # loop variable -- likely dead code; left untouched.
    project = Project.objects()
    for project in Project.objects():
        print
        print project.project_title, ":", project
        print
def __init__(self):
    """Initialize this object as the ``lcgcmt`` project."""
    Project.__init__(self, "lcgcmt")
def set_project(self, github, jira):
    """Build the active Project from a GitHub repo name and a Jira key.

    GitHub names are normalized to lowercase, Jira keys to uppercase.
    """
    target = Project(github.lower(), jira.upper())
    self.project = target
    self.set_extractor()
class Main():
    """Command-line driver for per-version metric extraction and defect
    prediction over a chosen project.

    Workflow: pick a project (predefined enum or GitHub/Jira pair), select
    versions, extract class- and method-level features per version, then
    train/predict with the accumulated datasets.
    """

    def __init__(self):
        self.project = None
        self.extractor = None
        self.save_data_names()
        self.jira_url = None
        self.github_user_name = None

    def list_projects(self):
        """Print every predefined project as '<name>: <description>'."""
        print("\n".join(
            list(
                map(lambda e: "{0}: {1}".format(e.name, e.value.description()),
                    ProjectName))))

    def extract(self):
        """Run the full data extraction for the configured project."""
        self.extractor.extract(True)

    def set_project(self, github, jira):
        """Configure the project from a GitHub repo name and Jira key."""
        self.project = Project(github.lower(), jira.upper())
        self.set_extractor()

    def set_project_enum(self, name):
        """Configure the project from a predefined ProjectName entry."""
        self.project = ProjectName[name].value
        self.set_extractor()

    def set_extractor(self):
        self.extractor = DataExtractor(self.project, self.jira_url,
                                       self.github_user_name)

    def extract_metrics(self):
        """Extract features for every selected version (except the last,
        which is held out) and run class- and method-level prediction."""
        classes_data = Config.get_work_dir_path(
            os.path.join(Config().config['CACHING']['RepositoryData'],
                         Config().config['VERSION_METRICS']['ClassesData'],
                         self.project.github()))
        Path(classes_data).mkdir(parents=True, exist_ok=True)
        method_data = Config.get_work_dir_path(
            os.path.join(Config().config['CACHING']['RepositoryData'],
                         Config().config['VERSION_METRICS']['MethodData'],
                         self.project.github()))
        Path(method_data).mkdir(parents=True, exist_ok=True)
        classes_datasets = []
        methods_datasets = []
        for version in self.extractor.get_selected_versions()[:-1]:
            self.extractor.checkout_version(version)
            classes_df, methods_df = self.extract_features_to_version(
                classes_data, method_data, version)
            classes_datasets.append(classes_df)
            methods_datasets.append(methods_df)
        classes_instance = self.extract_classes_datasets(classes_datasets)
        classes_instance.predict()
        methods_instance = self.extract_methods_datasets(methods_datasets)
        methods_instance.predict()

    def aggrate_methods_df(self, df):
        """Aggregate method-level rows to (File, Class) granularity.

        Splits 'Method_ids' ('file@pkg.Class.method') into File/Class
        columns, then describes every numeric feature per group so each
        class row carries count/mean/std/min/... summary columns.
        (Name kept as-is -- 'aggrate' -- for caller compatibility.)
        """
        ids = df['Method_ids'].iteritems()
        files_id, classes_id = tee(ids, 2)
        files = pd.Series(list(map(lambda x: x[1].split('@')[0],
                                   files_id))).values
        classes = pd.Series(
            list(
                map(lambda x: x[1].split('@')[1].split('.')[:-1][-1],
                    classes_id))).values
        df.insert(0, 'File', files)
        df.insert(0, 'Class', classes)
        groupby = ['File', 'Class']
        columns_filter = [
            'File', 'Class', 'BuggedMethods', 'Method', 'Method_ids'
        ]
        columns = list(
            filter(lambda x: x not in columns_filter,
                   df.columns.values.tolist()))
        data = list()
        for key, group in df.groupby(groupby):
            key_data = {}
            key_data.update(dict(zip(groupby, key)))
            for feature in columns:
                pt = pd.DataFrame(group[feature].describe()).T
                cols = [
                    "{0}_{1}".format(feature, c)
                    for c in pt.columns.values.tolist()
                ]
                pt.columns = cols
                key_data.update(list(pt.iterrows())[0][1].to_dict())
            data.append(key_data)
        return pd.DataFrame(data)

    def fillna(self, df):
        """Fill NaNs in place: 0 for numeric columns, False otherwise."""
        for col in df:
            dt = df[col].dtype
            if dt == int or dt == float:
                df[col].fillna(0, inplace=True)
            else:
                df[col].fillna(False, inplace=True)
        return df

    def extract_features_to_version(self, classes_data, method_data, version):
        """Run all extractors for *version*, build and persist the class and
        method feature frames, and return (classes_df, methods_df)."""
        extractors = Extractor.get_all_extractors(self.project, version)
        for extractor in extractors:
            extractor.extract()
        db = DataBuilder(self.project, version)
        list(map(lambda d: db.append(d), DataNameEnum))
        classes_df, methods_df = db.build()
        intermediate_dir = Config.get_work_dir_path(
            os.path.join(Config().config['CACHING']['RepositoryData'],
                         Config().config['VERSION_METRICS']['Intermediate'],
                         self.project.github()))
        classes_intermediate_dir = os.path.join(intermediate_dir, "classes")
        methods_intermediate_dir = os.path.join(intermediate_dir, "methods")
        Path(classes_intermediate_dir).mkdir(parents=True, exist_ok=True)
        Path(methods_intermediate_dir).mkdir(parents=True, exist_ok=True)
        classes_df.to_csv(os.path.join(classes_intermediate_dir,
                                       version + ".csv"),
                          index=False, sep=';')
        methods_df.to_csv(os.path.join(methods_intermediate_dir,
                                       version + ".csv"),
                          index=False, sep=';')
        methods_df = self.fillna(methods_df)
        aggregated_methods_df = self.aggrate_methods_df(methods_df)
        classes_df.dropna(inplace=True)
        classes_df.to_csv(os.path.join(intermediate_dir, "classes_df.csv"),
                          index=False, sep=';')
        aggregated_methods_df.to_csv(os.path.join(
            intermediate_dir, "aggregated_methods_df.csv"),
            index=False, sep=';')
        # Merge method aggregates into the class frame; fall back to a
        # File-only join when either side lacks a Class column.
        if 'Class' in classes_df.columns and 'Class' in aggregated_methods_df.columns:
            classes_df = classes_df.merge(aggregated_methods_df,
                                          on=['File', 'Class'], how='outer')
        else:
            classes_df = classes_df.merge(aggregated_methods_df,
                                          on=['File'], how='outer')
        classes_df.to_csv(os.path.join(intermediate_dir,
                                       "classes_df_afterMerge.csv"),
                          index=False, sep=';')
        classes_df = self.fillna(classes_df)
        classes_df.to_csv(os.path.join(classes_data, version + ".csv"),
                          index=False, sep=';')
        methods_df = methods_df.drop('File', axis=1, errors='ignore')
        methods_df = methods_df.drop('Class', axis=1, errors='ignore')
        methods_df = methods_df.drop('Method', axis=1, errors='ignore')
        methods_df.to_csv(os.path.join(method_data, version + ".csv"),
                          index=False, sep=';')
        return classes_df, methods_df

    def extract_classes_datasets(self, classes_datasets):
        """Assemble the class-level train/test split: all versions but the
        last are training data, the last is the (named) test set."""
        dataset_dir = Config.get_work_dir_path(
            os.path.join(Config().config['CACHING']['RepositoryData'],
                         Config().config['VERSION_METRICS']['Dataset'],
                         self.project.github()))
        classes_dataset_dir = os.path.join(dataset_dir, "classes")
        Path(classes_dataset_dir).mkdir(parents=True, exist_ok=True)
        classes_training = pd.concat(classes_datasets[:-1],
                                     ignore_index=True).drop(
                                         ["File", "Class", "Method_ids"],
                                         axis=1, errors='ignore')
        classes_testing = classes_datasets[-1].drop("Method_ids", axis=1,
                                                    errors='ignore')
        file_names = classes_testing.pop("File").values.tolist()
        classes_names = classes_testing.pop("Class").values.tolist()
        # Test-instance names are 'file@Class'.
        classes_testing_names = list(
            map("@".join, zip(file_names, classes_names)))
        return ClassificationInstance(classes_training, classes_testing,
                                      classes_testing_names,
                                      classes_dataset_dir)

    def extract_methods_datasets(self, methods_datasets):
        """Assemble the method-level train/test split (last version held
        out), labelled by 'BuggedMethods'."""
        dataset_dir = Config.get_work_dir_path(
            os.path.join(Config().config['CACHING']['RepositoryData'],
                         Config().config['VERSION_METRICS']['Dataset'],
                         self.project.github()))
        methods_dataset_dir = os.path.join(dataset_dir, "methods")
        Path(methods_dataset_dir).mkdir(parents=True, exist_ok=True)
        methods_training = pd.concat(methods_datasets[:-1],
                                     ignore_index=True).drop(
                                         "Method_ids", axis=1,
                                         errors='ignore')
        methods_testing = methods_datasets[-1]
        methods_testing_names = methods_testing.pop(
            "Method_ids").values.tolist()
        return ClassificationInstance(methods_training, methods_testing,
                                      methods_testing_names,
                                      methods_dataset_dir,
                                      label="BuggedMethods")

    def choose_versions(self, version_num=5, algorithm="bin",
                        version_type=VersionType.Untyped, strict=True):
        """Run version selection with the given algorithm/constraints."""
        self.extractor.choose_versions(version_num=version_num,
                                       algorithm=algorithm, strict=strict,
                                       version_type=version_type)

    def set_version_selection(self, version_num=5, algorithm="bin",
                              version_type=VersionType.Untyped, strict=True,
                              selected_config=0):
        """Select versions and pin a specific selection configuration."""
        self.extractor.choose_versions(version_num=version_num,
                                       algorithm=algorithm, strict=strict,
                                       version_type=version_type,
                                       selected_config=selected_config)
        self.extractor.selected_config = selected_config
        assert self.extractor.get_selected_versions()

    def save_data_names(self):
        """Dump every DataNameEnum description to dataname.json."""
        j = list()
        out_path = Config.get_work_dir_path(
            os.path.join(Config().config['CACHING']['RepositoryData'],
                         "dataname.json"))
        for d in DataNameEnum:
            j.append(d.value.as_description_dict())
        with open(out_path, "w") as f:
            json.dump(j, f)

    def main(self):
        """Parse CLI arguments and drive the selection/extraction flow."""
        parser = argparse.ArgumentParser(description='Execute project data')
        parser.add_argument('-p', '--projects', dest='projects',
                            action='store_const', const=True, default=False,
                            help='list all aleready defined projects')
        parser.add_argument('-c', '--choose', dest='choose', action='store',
                            help='choose a project to extract')
        parser.add_argument(
            '-g', '--github_repo_name', dest='github', action='store',
            help='the github repository name to the project to extract (lowercase)')
        parser.add_argument(
            '-j', '--jira_name', dest='jira', action='store',
            help='the jira name to the project to extract (uppercase)')
        # BUG FIX: the short option was the mis-encoded '-עu' (stray Hebrew
        # letter); restored to plain '-u'.
        parser.add_argument(
            '-u', '--github_user_name', dest='github_user_name',
            action='store',
            help='the github user name to the project to extract (lowercase)',
            default="apache")
        parser.add_argument('-jl', '--jira_url', dest='jira_url',
                            action='store', help='the link to jira',
                            default="http://issues.apache.org/jira")
        parser.add_argument(
            '-l', '--list_select_verions', dest='list_selected',
            action='store',
            help='the algorithm to select the versions : [bin]',
            default='bin')
        parser.add_argument('-s', '--select_verions', dest='select',
                            action='store',
                            help='the configuration to choose',
                            default=-1, type=int)
        parser.add_argument('-n', '--num_verions', dest='num_versions',
                            action='store',
                            help='the number of versions to select',
                            default=5, type=int)
        parser.add_argument('-t', '--versions_type', dest='versions_type',
                            action='store',
                            help='the versions type to select',
                            default="Untyped")
        parser.add_argument('-f', '--free_choose', dest='free_choose',
                            action='store_true',
                            help='the versions type to select')
        args = parser.parse_args()
        self.github_user_name = args.github_user_name
        self.jira_url = args.jira_url
        if args.projects:
            self.list_projects()
        if args.choose:
            self.set_project_enum(args.choose)
        if args.github and args.jira:
            self.set_project(args.github, args.jira)
        if args.list_selected:
            self.choose_versions(version_num=args.num_versions,
                                 algorithm=args.list_selected,
                                 version_type=VersionType[args.versions_type],
                                 strict=args.free_choose)
        if args.select != -1:
            self.set_version_selection(
                version_num=args.num_versions,
                algorithm='bin',
                version_type=VersionType[args.versions_type],
                strict=args.free_choose,
                selected_config=args.select)
            # NOTE(review): extraction runs only once a selection config is
            # pinned (-s); confirm this matches the original collapsed layout.
            self.extract()
            self.extract_metrics()
def stats():
    """Render the stats page populated with SteveKipp's repo activity."""
    account = Project("SteveKipp")
    account.get_repo_activity()
    return render_template("stats.html", projects=account)
# Convert one parsed SVN log entry (entries[i]) into Project/file documents
# and a log_entry document, pushing both into MongoDB collections.
# NOTE(review): this fragment references ``entries``, ``i``, ``curr_id``,
# ``list_file``, ``entry_objs``, ``files`` and ``logs`` from an enclosing
# scope not visible here.
x = entries[i]
kinds = [my_dict['kind'] for my_dict in x[4]]
actions = [my_dict['action'] for my_dict in x[4]]
projects = []
# NOTE(review): this loop variable shadows the outer ``i`` used above --
# confirm that is intentional.
for i in range(len(x[3])):
    curr_path = x[3][i]
    size_to_add = 0
    # Only regular files get a size looked up from the listing.
    if kinds[i] == 'file':
        size_to_add = parse_list(list_file, curr_path.replace('/mjschau2/', ''))
    svn_link = str(
        'https://subversion.ews.illinois.edu/svn/sp17-cs242' + curr_path +
        '/?p=' + x[0]['revision'])
    temp_proj = proj.Project(curr_path, size_to_add, actions[i], kinds[i],
                             text=svn_link, file_id=curr_id)
    result = files.insert_one(temp_proj.__dict__)
    #print(result)
    curr_id += 1
    projects.append(temp_proj.__dict__)
temp_obj = le.log_entry(int(x[0]['revision']), x[1], x[2], x[5], projects)
entry_objs.append(temp_obj.__dict__)
project_data = entry_objs
#now put up on mongodb database
result = logs.insert_many(project_data)
def build(repositories, theme, all_languages, output_dir):
    # Render the full static site: copy assets and CSS, build every
    # language's terms/projects/extras, then emit per-language indexes and a
    # front page ordered by number of built projects.
    termlangs = get_termlangs(repositories, all_languages)
    print_info("Copying assets")
    copydir(html_assets, output_dir)
    css_dir = os.path.join(output_dir, "css")
    makedirs(css_dir)
    make_css(css_assets, theme, css_dir)
    languages = {}
    project_count = {}
    for language_code, terms in termlangs.items():
        # Unknown language codes get a stub Language so the build proceeds.
        if language_code not in all_languages:
            all_languages[language_code] = Language(
                code=language_code,
                name=language_code,
                legal={},
                translations={}
            )
        language = all_languages[language_code]
        print_info("Language "+language.name)
        out_terms = []
        count = 0
        lang_dir = os.path.join(output_dir, language.code)
        for term in terms:
            term_dir = os.path.join(lang_dir, "%s.%d"%(term.id, term.number))
            makedirs(term_dir)
            print_info("Building Term:\t\t" + str(term.title))
            projects = []
            for p in term.projects:
                built_project = Project.build_from_resource(p,
                                                            term,
                                                            term_dir,
                                                            language,
                                                            theme)
                # Only successfully built projects count toward the total.
                if built_project:
                    count += 1
                    projects.append(built_project)
                else:
                    continue
            extras = []
            for r in term.extras:
                extras.append(build_extra(term, r, language, theme, term_dir))
            # Rebuild the Term with the rendered projects/extras attached.
            term = Term(
                id=term.id,
                manifest=term.manifest,
                number=term.number,
                language=term.language,
                title=term.title,
                description=term.description,
                projects=projects,
                extras=extras,
            )
            out_terms.append(make_term_index(term, language, theme, term_dir))
            print_info("Term built!")
        print_info("Building " + language.name +" index")
        languages[language_code] = \
            make_lang_index(language, out_terms, theme, lang_dir)
        project_count[language_code] = count
    print_info("Building " + theme.name + " index: " + output_dir)
    # Languages with the most built projects come first on the front page.
    sorted_languages = []
    for lang in sorted(project_count.keys(),
                       key=lambda x: project_count[x], reverse=True):
        sorted_languages.append((all_languages[lang], languages[lang]))
    make_index(sorted_languages,
               all_languages[theme.language],
               theme,
               output_dir)
    print_info("Complete")
def newProject(self):
    """Save the current project, then switch to a fresh empty one."""
    self.saveProject()
    fresh = Project()
    self.loadProject(fresh)
class Main():
    """Command-line driver for the metric-extraction pipeline.

    Wires together project selection (predefined enum or github/jira pair),
    version selection, per-version feature extraction, and construction of
    the classes/methods training/testing datasets used for defect prediction.
    """

    def __init__(self):
        # Project/extractor are configured later via set_project()/set_project_enum().
        self.project = None
        self.extractor = None
        # Side effect: writes dataname.json describing all data names.
        self.save_data_names()
        self.jira_url = None
        self.github_user_name = None

    def list_projects(self):
        """Print every predefined project as '<name>: <description>', one per line."""
        print("\n".join(list(map(lambda e: "{0}: {1}".format(e.name, e.value.description()), ProjectName))))

    def extract(self):
        # The True flag is forwarded to DataExtractor.extract -- presumably
        # enables the full extraction path; TODO confirm against DataExtractor.
        self.extractor.extract(True)

    def set_project(self, github, jira):
        """Select a project by its github (lowercased) and jira (uppercased) names."""
        self.project = Project(github.lower(), jira.upper())
        self.set_extractor()

    def set_project_enum(self, name):
        """Select one of the predefined ProjectName enum entries by name."""
        self.project = ProjectName[name].value
        self.set_extractor()

    def set_extractor(self):
        # Rebuild the extractor whenever the project changes.
        self.extractor = DataExtractor(self.project, self.jira_url, self.github_user_name)

    def extract_metrics(self, rest_versions, rest_only, data_types):
        """Extract features for the selected versions, then run the predictors.

        rest_versions: extra versions to extract features for (no bug labels).
        rest_only: when True, only rest_versions are processed; no prediction.
        data_types: set of feature-group names to extract.
        """
        classes_datasets = []
        aggregated_classes_datasets = []
        methods_datasets = []
        if not rest_only:
            # Every selected version but the last becomes training data; the
            # last one is held out as the testing set (see the calls below).
            for version in self.extractor.get_selected_versions()[:-1]:
                classes_df, methods_df, aggregated_classes_df = self.extract_features_to_version(version, True, data_types)
                classes_datasets.append(classes_df)
                methods_datasets.append(methods_df)
                aggregated_classes_datasets.append(aggregated_classes_df)
        for version in rest_versions:
            try:
                self.extract_features_to_version(version, False, data_types)
            except:
                # NOTE(review): bare except silently drops any failure for a
                # rest version -- deliberate best-effort, but consider logging.
                pass
        if rest_only:
            return
        self.extract_classes_datasets(aggregated_classes_datasets[:-1], aggregated_classes_datasets[-1]).predict()
        # self.extract_classes_datasets(classes_datasets[:-1], classes_datasets[-1], "classes_no_aggregate").predict()
        self.extract_methods_datasets(methods_datasets[:-1], methods_datasets[-1]).predict()

    def create_all_but_one_dataset(self, data_types):
        """Run feature-group ablations for methods and classes.

        For each data type builds a 'one' dataset (only that group's features)
        and an 'all' dataset (every other group's features), predicts on each,
        and writes the collected scores to '<sub_dir>_metrics.csv'.
        """
        alls = {}
        ones = {}
        detailed = {}
        # Map each requested data type to the set of feature names it contains.
        for d in DataNameEnum:
            if d.value.data_type.value in data_types:
                detailed.setdefault(d.value.data_type.value, set()).add(d.value.name)
        for d in detailed:
            ones[d] = detailed[d]
            all_but_d = list(detailed.keys())
            all_but_d.remove(d)
            # Union of every other group's feature names ("all but one").
            alls[d] = reduce(set.__or__, list(map(detailed.get, all_but_d)), set())
        for sub_dir, label in [("methods", "BuggedMethods"), ("classes", "Bugged")]:
            scores = []
            training_df = pd.read_csv(os.path.join(self.get_dataset_path(sub_dir), "training.csv"), sep=';')
            testing_df = pd.read_csv(os.path.join(self.get_dataset_path(sub_dir), "testing.csv"), sep=';')
            dataset_cols = set(training_df.columns.to_list()).intersection(set(testing_df.columns.to_list()))
            names = pd.read_csv(os.path.join(self.get_dataset_path(sub_dir), "prediction.csv"), sep=';')['name'].to_list()
            for dir_name, columns in (('one', ones), ('all', alls)):
                for d in columns:
                    # Keep dataset columns whose name contains any of the
                    # group's feature names (substring match).
                    cols = set(filter(lambda dc: any(map(lambda c: c in dc, columns[d])), dataset_cols))
                    if len(cols) == 0:
                        continue
                    cols.add(label)
                    cols = list(cols)
                    train = training_df[cols]
                    test = testing_df[cols]
                    ci = ClassificationInstance(train, test, names, self.get_dataset_path(os.path.join(dir_name, sub_dir, d)), label=label)
                    try:
                        ci.predict()
                        ci_scores = dict(ci.scores)
                        ci_scores.update({"type": dir_name, "data_type": d})
                        scores.append(ci_scores)
                    except Exception as e:
                        # Best-effort: a failed ablation run is reported but
                        # does not stop the remaining combinations.
                        print(e)
            pd.DataFrame(scores).to_csv(self.get_dataset_path(sub_dir + "_metrics.csv", False), index=False, sep=';')

    def get_data_dirs(self):
        """Resolve (and create if missing) the five per-project output
        directories; returns (classes_data, method_data,
        classes_intermediate_dir, methods_intermediate_dir, intermediate_dir)."""
        classes_data = Config.get_work_dir_path(os.path.join(Config().config['CACHING']['RepositoryData'], Config().config['VERSION_METRICS']['ClassesData'], self.project.github()))
        method_data = Config.get_work_dir_path(
            os.path.join(Config().config['CACHING']['RepositoryData'], Config().config['VERSION_METRICS']['MethodData'], self.project.github()))
        intermediate_dir = Config.get_work_dir_path(
            os.path.join(Config().config['CACHING']['RepositoryData'], Config().config['VERSION_METRICS']['Intermediate'], self.project.github()))
        classes_intermediate_dir = os.path.join(intermediate_dir, "classes")
        methods_intermediate_dir = os.path.join(intermediate_dir, "methods")
        Path(classes_intermediate_dir).mkdir(parents=True, exist_ok=True)
        Path(methods_intermediate_dir).mkdir(parents=True, exist_ok=True)
        Path(classes_data).mkdir(parents=True, exist_ok=True)
        Path(method_data).mkdir(parents=True, exist_ok=True)
        return classes_data, method_data, classes_intermediate_dir, methods_intermediate_dir, intermediate_dir

    def aggrate_methods_df(self, df):
        """Aggregate per-method metrics into one row per (File, Class).

        Derives File/Class columns from the 'Method_ids' strings, then for
        each group summarizes every metric column via describe(), flattening
        the statistics into '<feature>_<stat>' columns.
        Note: mutates df (inserts File/Class columns) and returns a new frame.
        """
        def clean(s):
            # s is an (index, value) pair from iteritems(); the class name is
            # the last dotted component before the method name.
            # NOTE(review): '"@" in s' tests tuple membership (index or whole
            # value equal to "@"), not substring containment in the id string;
            # looks like it was meant to be '"@" in s[1]' -- confirm intent.
            if "@" in s:
                return s[1].split('@')[1].split('.')[:-1][-1]
            return s[1].split('.')[:-1][-1]
        ids = df['Method_ids'].iteritems()
        files_id, classes_id = tee(ids, 2)
        # Method id format appears to be '<file>@<dotted.class.method>' -- TODO confirm.
        files = pd.Series(list(map(lambda x: x[1].split('@')[0], files_id))).values
        classes = pd.Series(list(map(clean, classes_id))).values
        df.insert(0, 'File', files)
        df.insert(0, 'Class', classes)
        groupby = ['File', 'Class']
        columns_filter = ['File', 'Class', 'BuggedMethods', 'Method', 'Method_ids']
        columns = list(
            filter(lambda x: x not in columns_filter, df.columns.values.tolist()))
        data = list()
        for key, group in df.groupby(groupby):
            key_data = {}
            key_data.update(dict(zip(groupby, key)))
            for feature in columns:
                # Transpose describe() so its statistics become a single row
                # of '<feature>_<stat>' columns for this group.
                pt = pd.DataFrame(group[feature].describe(include = 'all')).T
                cols = ["{0}_{1}".format(feature, c) for c in pt.columns.values.tolist()]
                pt.columns = cols
                key_data.update(list(pt.iterrows())[0][1].to_dict())
            data.append(key_data)
        return pd.DataFrame(data)

    def fillna(self, df, default=False):
        """Drop rows whose label column is NaN, then fill remaining NaNs:
        0 for int/float columns, `default` for everything else."""
        if 'Bugged' in df:
            df = df[df['Bugged'].notna()]
        if 'BuggedMethods' in df :
            df = df[df['BuggedMethods'].notna()]
        for col in df:
            dt = df[col].dtype
            if dt == int or dt == float:
                df[col].fillna(0, inplace=True)
            else:
                df[col].fillna(default, inplace=True)
        return df

    def extract_features_to_version(self, version, extract_bugs, data_types):
        """Check out `version`, run the relevant extractors, and build the
        per-version DataFrames.

        Returns (classes_df, methods_df, aggregated_classes_df); also writes
        all frames to disk via save_dfs().
        """
        self.extractor.checkout_version(version)
        db, extractors_to_run = self.get_extractors(data_types, extract_bugs, version)
        for extractor in extractors_to_run:
            start = time.time()
            extractor.extract()
            # Crude per-extractor timing printed to stdout.
            print(time.time() - start, extractor.__class__.__name__)
        classes_df, methods_df = db.build()
        # Aggregate method metrics to class level BEFORE fillna so the
        # aggregation sees the raw values.
        aggregated_methods_df = self.aggrate_methods_df(methods_df)
        methods_df = self.fillna(methods_df)
        aggregated_classes_df = self.merge_aggregated_methods_to_class(aggregated_methods_df, classes_df)
        classes_df = self.fillna(classes_df)
        # Strip identity columns from the method-level frame; the names are
        # carried separately via 'Method_ids'.
        methods_df = methods_df.drop('File', axis=1, errors='ignore')
        methods_df = methods_df.drop('Class', axis=1, errors='ignore')
        methods_df = methods_df.drop('Method', axis=1, errors='ignore')
        self.save_dfs(classes_df, methods_df, aggregated_classes_df, aggregated_methods_df, version)
        return classes_df, methods_df, aggregated_classes_df

    def merge_aggregated_methods_to_class(self, aggregated_methods_df, classes_df):
        """Outer-join aggregated method metrics onto the class metrics, on
        (File, Class) when both sides have a Class column, else on File only."""
        aggregated_classes_df = classes_df.copy(deep=True)
        if 'Class' in aggregated_classes_df.columns and 'Class' in aggregated_methods_df.columns:
            aggregated_classes_df = aggregated_classes_df.merge(aggregated_methods_df, on=['File', 'Class'], how='outer')
        else:
            aggregated_classes_df = aggregated_classes_df.merge(aggregated_methods_df, on=['File'], how='outer')
        return self.fillna(aggregated_classes_df)

    def save_dfs(self, classes_df, methods_df, aggregated_classes_df, aggregated_methods_df, version):
        """Write all four frames as ';'-separated CSVs to both the
        intermediate and the final data directories."""
        classes_data, method_data, classes_intermediate_dir, methods_intermediate_dir, intermediate_dir = self.get_data_dirs()
        classes_df.to_csv(os.path.join(classes_intermediate_dir, version + ".csv"), index=False, sep=';')
        aggregated_classes_df.to_csv(os.path.join(classes_intermediate_dir, version + "_aggregated_classes.csv"), index=False, sep=';')
        methods_df.to_csv(os.path.join(methods_intermediate_dir, version + ".csv"), index=False, sep=';')
        aggregated_methods_df.to_csv(os.path.join(intermediate_dir, version + "_aggregated_methods_df.csv"), index=False, sep=';')
        classes_df.to_csv(os.path.join(classes_data, version + ".csv"), index=False, sep=';')
        aggregated_classes_df.to_csv(os.path.join(classes_data, version + "_aggregated_classes_.csv"), index=False, sep=';')
        methods_df.to_csv(os.path.join(method_data, version + ".csv"), index=False, sep=';')

    def get_extractors(self, data_types, extract_bugs, version):
        """Select the extractors needed for the requested data types.

        Mutates `data_types` (adds the bug-label types) when extract_bugs is
        set. Returns (DataBuilder, set of extractors to run).
        """
        db = DataBuilder(self.project, version)
        if extract_bugs:
            data_types.add("bugged")
            data_types.add("bugged_methods")
        extractors_to_run = set()
        for extractor in Extractor.get_all_extractors(self.project, version):
            # Skip bug-label extractors when labels were not requested.
            if not extract_bugs and "bugged" in extractor.__class__.__name__.lower():
                continue
            extractor_data_types = []
            for dt in extractor.data_types:
                if dt.value in data_types:
                    extractor_data_types.append(dt)
                    extractors_to_run.add(extractor)
            db.add_data_types(extractor_data_types)
        return db, extractors_to_run

    def extract_classes_datasets(self, training_datasets, testing_dataset, sub_dir="classes"):
        """Concatenate training frames, clean both sides, and wrap them in a
        ClassificationInstance keyed by file name."""
        training = pd.concat(training_datasets, ignore_index=True).drop(["File", "Class", "Method_ids"], axis=1, errors='ignore')
        training = self.fillna(training)
        testing = testing_dataset.drop(["Method_ids", "Class"], axis=1, errors='ignore')
        testing = self.fillna(testing, default='')
        file_names = testing.pop("File").values.tolist()
        # classes_names = testing.pop("Class").values.tolist()
        # classes_testing_names = list(map("@".join, zip(file_names, ['' if x in (False, True) else x for x in classes_names])))
        return ClassificationInstance(training, testing, file_names, self.get_dataset_path(sub_dir))

    def get_dataset_path(self, name, is_dir=True):
        """Resolve `name` under the project's dataset directory, creating the
        directory when is_dir is True; returns the absolute path."""
        dataset_dir = Config.get_work_dir_path(
            os.path.join(Config().config['CACHING']['RepositoryData'], Config().config['VERSION_METRICS']['Dataset'], self.project.github()))
        path = os.path.join(dataset_dir, name)
        if is_dir:
            Path(path).mkdir(parents=True, exist_ok=True)
        return path

    def extract_methods_datasets(self, training_datasets, testing_dataset):
        """Build the method-level ClassificationInstance, keyed by Method_ids
        and labeled by 'BuggedMethods'."""
        training = pd.concat(training_datasets, ignore_index=True).drop("Method_ids", axis=1, errors='ignore')
        training = self.fillna(training)
        testing = testing_dataset
        testing = self.fillna(testing)
        methods_testing_names = testing.pop("Method_ids").values.tolist()
        return ClassificationInstance(training, testing, methods_testing_names, self.get_dataset_path("methods"), label="BuggedMethods")

    def choose_versions(self, version_num=5, algorithm="bin", version_type=VersionType.Untyped, strict=True):
        """Initialize jira commits, then run version selection on the extractor."""
        self.extractor.init_jira_commits()
        self.extractor.choose_versions(version_num=version_num, algorithm=algorithm, strict=strict, version_type=version_type)

    def set_version_selection(self, version_num=5, algorithm="bin", version_type=VersionType.Untyped, strict=True, selected_config=0):
        """Apply a previously computed version-selection configuration."""
        self.extractor.set_selected_config(selected_config)
        self.extractor.choose_versions(version_num=version_num, algorithm=algorithm, strict=strict, version_type=version_type)
        # NOTE(review): assert is stripped under -O; selection failure would
        # then pass silently.
        assert self.extractor.get_selected_versions()

    def save_data_names(self):
        """Dump every DataNameEnum entry's description dict to dataname.json
        under the repository-data working directory."""
        j = list()
        out_path = Config.get_work_dir_path(
            os.path.join(Config().config['CACHING']['RepositoryData'], "dataname.json"))
        for d in DataNameEnum:
            j.append(d.value.as_description_dict())
        with open(out_path, "w") as f:
            json.dump(j, f)

    def main(self):
        """Parse CLI arguments and drive the whole pipeline:
        project selection -> version selection -> extraction -> prediction
        -> ablation datasets."""
        parser = argparse.ArgumentParser(description='Execute project data')
        parser.add_argument('-p', '--projects', dest='projects', action='store_const', const=True, default=False,
                            help='list all aleready defined projects')
        parser.add_argument('-c', '--choose', dest='choose', action='store', help='choose a project to extract')
        parser.add_argument('-g', '--github_repo_name', dest='github', action='store',
                            help='the github repository name to the project to extract (lowercase)')
        parser.add_argument('-j', '--jira_name', dest='jira', action='store',
                            help='the jira name to the project to extract (uppercase)')
        parser.add_argument('-u', '--github_user_name', dest='github_user_name', action='store',
                            help='the github user name to the project to extract (lowercase)', default="apache")
        parser.add_argument('-jl', '--jira_url', dest='jira_url', action='store', help='the link to jira',
                            default="http://issues.apache.org/jira")
        parser.add_argument('-l', '--list_select_verions', dest='list_selected', action='store',
                            help='the algorithm to select the versions : [bin]', default='bin')
        parser.add_argument('-d', '--data_types_to_extract', dest='data_types', action='store',
                            help='Json file of the data types to extract as features. Choose a sublist of '
                                 '[checkstyle, designite_design, designite_implementation, '
                                 'designite_type_organic, designite_method_organic, designite_type_metrics,'
                                 'designite_method_metrics, source_monitor_files, source_monitor, ck, mood, halstead,'
                                 'jasome_files, jasome_methods, process_files, issues_files]. You can use the files under externals\configurations',
                            default=r"externals\configurations\default.json")
        parser.add_argument('-s', '--select_verions', dest='select', action='store', help='the configuration to choose',
                            default=0, type=int)
        parser.add_argument('-n', '--num_verions', dest='num_versions', action='store',
                            help='the number of versions to select', default=5, type=int)
        parser.add_argument('-t', '--versions_type', dest='versions_type', action='store',
                            help='the versions type to select', default="Untyped")
        parser.add_argument('-f', '--free_choose', dest='free_choose', action='store_true',
                            help='the versions type to select')
        parser.add_argument('-r', '--only_rest', dest='only_rest', action='store_true',
                            help='extract only rest versions')
        parser.add_argument('rest', nargs=argparse.REMAINDER)
        args = parser.parse_args()
        self.github_user_name = args.github_user_name
        self.jira_url = args.jira_url
        if args.projects:
            self.list_projects()
        if args.choose:
            self.set_project_enum(args.choose)
        if args.github and args.jira:
            self.set_project(args.github, args.jira)
        if args.list_selected:
            self.choose_versions(version_num=args.num_versions, algorithm=args.list_selected,
                                 version_type=VersionType[args.versions_type], strict=args.free_choose)
        # if args.select != -1:
        self.set_version_selection(version_num=args.num_versions, algorithm='bin',
                                   version_type=VersionType[args.versions_type], strict=args.free_choose,
                                   selected_config=args.select)
        self.extract()
        data_types = None
        if os.path.exists(args.data_types):
            with open(args.data_types) as f:
                data_types = set(json.loads(f.read()))
        self.extract_metrics(args.rest, args.only_rest, data_types)
        self.create_all_but_one_dataset(data_types)