def collect_data(self):
    """Run ``self.process`` on every repository not yet reported as processed.

    Walks ``self.RepoList`` in order. A repository for which
    ``self.is_processed`` returns a truthy value is skipped silently; any
    other one is announced on stdout, passed to ``self.process`` and then
    handed to ``System.set_tag`` (presumably the marker that
    ``is_processed`` looks up later -- confirm against ``System``).
    """
    total = len(self.RepoList)

    # The position is 1-based and counts skipped repositories as well, so the
    # printed progress always reflects the place inside the full list.
    for position, repo in enumerate(self.RepoList, start=1):
        repo_key = str(repo['id'])
        if self.is_processed(repo_key):
            continue

        print("[Task%d-%d/%d]repo -> %s : %s"
              % (self.Task, position, total, repo['id'], repo['url']))
        self.process(repo_key, repo['created_at'], repo['url'])
        System.set_tag(repo_key)
def is_processed(self, id):
    """Return what ``System.access_tag`` reports for the stringified *id*."""
    tag_key = str(id)
    return System.access_tag(tag_key)
def get_stats_path(self, RepoId, Index):
    """Return the path ``<stats dir>/<Index>.csv`` for repository *RepoId*.

    The directory part is whatever ``System.setdir_cmmt_stats`` returns for
    ``str(RepoId)``.
    """
    stats_dir = System.setdir_cmmt_stats(str(RepoId))
    file_name = str(Index) + ".csv"
    return stats_dir + "/" + file_name
def is_exist(self, file):
    """Delegate to ``System.is_exist`` and return its result for *file*."""
    found = System.is_exist(file)
    return found
def get_content_path(self, RepoId, Index):
    """Return the path ``<content dir>/<Index>.csv`` for repository *RepoId*.

    The directory part is whatever ``System.setdir_cmmt_content`` returns for
    ``str(RepoId)``.
    """
    content_dir = System.setdir_cmmt_content(str(RepoId))
    file_name = str(Index) + ".csv"
    return content_dir + "/" + file_name
def get_commit_path(self, RepoId):
    """Return the path ``<commit dir>/<RepoId>.csv`` for repository *RepoId*.

    The directory part is whatever ``System.setdir_cmmt`` returns for
    ``str(RepoId)``.
    """
    repo_key = str(RepoId)
    commit_dir = System.setdir_cmmt(repo_key)
    file_name = repo_key + '.csv'
    return commit_dir + "/" + file_name
def is_prenbr_ready(self):
    """Return ``System.is_exist`` evaluated on ``Collect_Nbr.prenbr_stats``."""
    prenbr_path = Collect_Nbr.prenbr_stats
    return System.is_exist(prenbr_path)
def get_cmmtinfo(self, NbrStats):
    """Fill *NbrStats* with commit-derived statistics and register it.

    Two CSV files are read for ``NbrStats.repo_id``:

    * the commit file (``System.cmmt_file``), from which the capped commit
      count, the number of distinct ``author`` values and the age in days
      (latest ``date`` minus earliest ``date``) are derived;
    * the commit-stat file (``System.cmmt_stat_file`` + ``".csv"``), whose
      row count is reported as ``se_num`` and whose ``fuzzy`` column is
      classified row by row through ``self.get_secategory``.

    Args:
        NbrStats: object exposing ``repo_id``, ``update(age, commits_num,
            developer_num, se_num)`` and ``update_secategory(rem, iibc, pd,
            other)``.

    Returns:
        None. When either CSV file is missing the method returns early and
        nothing is stored in ``self.pre_nbr_stats``; if only the second file
        is missing, *NbrStats* is left untouched as well.
    """
    repo_id = NbrStats.repo_id

    cmmt_file = System.cmmt_file(repo_id)
    if not System.is_exist(cmmt_file):
        return

    # --- developers & commit count -------------------------------------
    commits = pd.read_csv(cmmt_file)
    commits_num = min(commits.shape[0], self.max_cmmt_num)

    # Track the real earliest/latest dates instead of starting from fixed
    # sentinel dates: sentinels give a wrong age as soon as every commit
    # lies outside the sentinel window (or when there are no commits).
    # Dates are compared as strings, which orders correctly as long as they
    # all share the '%Y-%m-%dT%H:%M:%SZ' layout parsed below.
    authors = set()
    first_date = None
    last_date = None
    for author, date in zip(commits['author'], commits['date']):
        authors.add(author)
        if last_date is None or date > last_date:
            last_date = date
        if first_date is None or date < first_date:
            first_date = date
    developer_num = len(authors)

    if first_date is None:
        # No commits at all: there is no time span to measure.
        age = 0
    else:
        date_format = '%Y-%m-%dT%H:%M:%SZ'
        latest = datetime.strptime(last_date, date_format)
        earliest = datetime.strptime(first_date, date_format)
        age = (latest - earliest).days

    # --- security bug count --------------------------------------------
    cmmt_stat_file = System.cmmt_stat_file(repo_id) + ".csv"
    if not System.is_exist(cmmt_stat_file):
        return

    stats = pd.read_csv(cmmt_stat_file)
    se_num = stats.shape[0]
    NbrStats.update(age, commits_num, developer_num, se_num)

    # --- security categories -------------------------------------------
    se_rem_num = 0
    se_iibc_num = 0
    se_pd_num = 0
    se_other = 0
    for fuzzy in stats['fuzzy']:
        # Each row is counted exactly once. Category 3 is the value for
        # which the search over keywords continues, so it is also used as
        # the starting value: a row without keywords is counted as "other"
        # rather than reusing the previous row's category (or failing on
        # the very first row).
        category = 3
        for key in ast.literal_eval(fuzzy):
            category = self.get_secategory(key)
            if category != 3:
                break

        if category == 0:
            se_rem_num += 1
        elif category == 1:
            se_iibc_num += 1
        elif category == 2:
            se_pd_num += 1
        else:
            se_other += 1

    NbrStats.update_secategory(se_rem_num, se_iibc_num, se_pd_num, se_other)
    self.pre_nbr_stats[repo_id] = NbrStats
def CollectRepository(username="username", token="token"):
    """Collect repositories through ``CollectRepo``.

    Builds a ``CollectRepo`` from ``System.get_repopath()`` and the given
    credentials, then calls its ``collect_repositories`` method.

    Args:
        username: second argument passed to ``CollectRepo``. Defaults to the
            placeholder ``"username"`` that used to be hard-coded here.
        token: third argument passed to ``CollectRepo``. Defaults to the
            placeholder ``"token"`` that used to be hard-coded here.
    """
    print(">>>>>>>>>>>> CollectRepo fom github...")

    # Retrieves repo data from Github by page
    collector = CollectRepo(System.get_repopath(), username, token)
    collector.collect_repositories()