示例#1
0
    def collect_data(self):
        """Process every repository in ``self.RepoList`` that is not yet tagged.

        For each entry the stringified ``repo['id']`` is checked with
        ``self.is_processed``; entries reported as processed are skipped.
        Otherwise a progress line is printed, ``self.process`` is called with
        the id, ``created_at`` and ``url`` of the entry, and the id is tagged
        through ``System.set_tag``.
        """
        total = len(self.RepoList)
        # The position counts skipped entries too, so the progress line always
        # shows the 1-based index within the full list.
        for position, repo in enumerate(self.RepoList, start=1):
            repo_key = str(repo['id'])
            if self.is_processed(repo_key):
                continue

            progress = (self.Task, position, total, repo['id'], repo['url'])
            print("[Task%d-%d/%d]repo -> %s : %s" % progress)
            self.process(repo_key, repo['created_at'], repo['url'])

            # Tag only after processing has returned.
            System.set_tag(repo_key)
示例#2
0
 def is_processed(self, id):
     """Return what ``System.access_tag`` reports for the stringified *id*."""
     tag_name = str(id)
     return System.access_tag(tag_name)
示例#3
0
 def get_stats_path(self, RepoId, Index):
     """Build the path ``<stats dir>/<Index>.csv`` for a repository.

     The directory part is whatever ``System.setdir_cmmt_stats`` returns for
     the stringified *RepoId*.
     """
     stats_dir = System.setdir_cmmt_stats(str(RepoId))
     file_name = str(Index) + ".csv"
     return stats_dir + "/" + file_name
示例#4
0
 def is_exist(self, file):
     """Forward *file* to ``System.is_exist`` and return its answer."""
     found = System.is_exist(file)
     return found
示例#5
0
 def get_content_path(self, RepoId, Index):
     """Build the path ``<content dir>/<Index>.csv`` for a repository.

     The directory part is whatever ``System.setdir_cmmt_content`` returns
     for the stringified *RepoId*.
     """
     content_dir = System.setdir_cmmt_content(str(RepoId))
     file_name = str(Index) + ".csv"
     return content_dir + "/" + file_name
示例#6
0
 def get_commit_path(self, RepoId):
     """Build the path ``<commit dir>/<RepoId>.csv`` for a repository.

     The directory part is whatever ``System.setdir_cmmt`` returns for the
     stringified *RepoId*; the same string names the CSV file.
     """
     repo_key = str(RepoId)
     commit_dir = System.setdir_cmmt(repo_key)
     return commit_dir + "/" + repo_key + '.csv'
示例#7
0
 def is_prenbr_ready(self):
     """Ask ``System.is_exist`` about the path in ``Collect_Nbr.prenbr_stats``."""
     stats_path = Collect_Nbr.prenbr_stats
     return System.is_exist(stats_path)
示例#8
0
    def get_cmmtinfo(self, NbrStats):
        """Fill *NbrStats* with commit-derived figures and register it.

        The commit CSV of the repository (columns ``author`` and ``date``)
        yields the commit count capped at ``self.max_cmmt_num``, the number of
        distinct authors and the age in days between the oldest and newest
        commit. The commit-statistics CSV (column ``fuzzy``, a dict literal
        per row) yields the number of rows and a per-category tally obtained
        through ``self.get_secategory``.

        Args:
            NbrStats: object exposing ``repo_id``, ``update`` and
                ``update_secategory``; it is stored in
                ``self.pre_nbr_stats[repo_id]`` on success.

        Returns:
            None. The method returns early, without registering *NbrStats*,
            when either CSV file is reported missing by ``System.is_exist``.
        """
        repo_id = NbrStats.repo_id

        cmmt_file = System.cmmt_file(repo_id)
        if not System.is_exist(cmmt_file):
            return

        # developers & commit_num
        cdf = pd.read_csv(cmmt_file)
        commits_num = min(cdf.shape[0], self.max_cmmt_num)
        developer_num = len(set(cdf['author']))

        # Age comes from the actual oldest/newest commit rather than from
        # fixed sentinel dates, so histories outside any particular year range
        # are measured correctly. An empty commit file has age 0.
        # Timestamps are compared as strings, which matches chronological
        # order as long as they share the fixed format parsed below.
        dates = list(cdf['date'])
        if dates:
            time_format = '%Y-%m-%dT%H:%M:%SZ'
            newest = datetime.strptime(max(dates), time_format)
            oldest = datetime.strptime(min(dates), time_format)
            age = (newest - oldest).days
        else:
            age = 0

        # security bug num
        cmmt_stat_file = System.cmmt_stat_file(repo_id) + ".csv"
        if not System.is_exist(cmmt_stat_file):
            return
        cdf = pd.read_csv(cmmt_stat_file)
        se_num = cdf.shape[0]
        NbrStats.update(age, commits_num, developer_num, se_num)

        # se categories
        se_rem_num = 0
        se_iibc_num = 0
        se_pd_num = 0
        se_other = 0
        for fuzzy in cdf['fuzzy']:
            # Start every row at 3, the code the loop below treats as "no
            # specific category"; a row without keywords is then counted as
            # "other" instead of reusing the previous row's category.
            category = 3
            for key in ast.literal_eval(fuzzy).keys():
                category = self.get_secategory(key)
                # The first keyword with a specific category decides the row.
                if category != 3:
                    break
            if category == 0:
                se_rem_num += 1
            elif category == 1:
                se_iibc_num += 1
            elif category == 2:
                se_pd_num += 1
            else:
                se_other += 1
        NbrStats.update_secategory(se_rem_num, se_iibc_num, se_pd_num,
                                   se_other)

        self.pre_nbr_stats[repo_id] = NbrStats
示例#9
0
def CollectRepository(username="username", token="token"):
    """Run the repository collection step through ``CollectRepo``.

    Args:
        username: second argument handed to ``CollectRepo`` (presumably the
            GitHub account name). Defaults to the placeholder value this
            function previously hard-coded.
        token: third argument handed to ``CollectRepo`` (presumably a GitHub
            access token). Defaults to the previous placeholder value.
    """
    print(">>>>>>>>>>>> CollectRepo fom github...")
    # Retrieves repo data from Github by page
    collector = CollectRepo(System.get_repopath(), username, token)
    collector.collect_repositories()