# Example #1
# 0
def getTokensForTokenization(datasetParam):
    """Build a token corpus and matching defect labels from a dataset CSV.

    Each row of the CSV names a source file (column 1) and carries a
    defect label in the last column.  Every file is stripped of comments
    and tokenized, producing one token collection per file.

    Parameters
    ----------
    datasetParam : str
        Path to the dataset CSV; the first row is a header and is skipped.

    Returns
    -------
    tuple(list, list)
        (completeCorpus, completeLabels): per-file token collections and
        the parallel integer defect labels.
    """
    completeLabels = []
    completeCorpus = [
    ]  ## a list of lists with tokens from defected and non defected files
    with open(datasetParam, 'rU') as f:
        reader_ = csv.reader(f)
        next(reader_, None)  # skip the header row
        for row in reader_:
            # int() is required here: sklearn's AUC helper rejects
            # string-typed labels.
            defectStatus = int(row[-1])
            fileToRead = row[1]
            fileContentAsStr = utility.giveCommentFreeContentFromFile(
                fileToRead)
            filtered_str_from_one_file = tokenization_preprocessor.processTokensOfOneFile(
                fileContentAsStr)
            completeCorpus.append(filtered_str_from_one_file)
            ### after getting the text, get the labels
            completeLabels.append(defectStatus)

    return completeCorpus, completeLabels
# Example #2
# 0
def getCommitLevelTokensForTokenization(categ_file_param, the_flag):
    """Build a commit-level token corpus with defect labels.

    Reads the category CSV (header skipped); each row supplies a commit
    id (col 0), a repository path (col 1), and a category flag (col 3).
    Category 'N' is labeled 0, anything else 1.  The commit's diff file
    under <repo>/diffs/<id>.txt is comment-stripped and tokenized.

    Returns (completeCorpus, completeLabels) as parallel lists.
    """
    corpus, labels = [], []
    with open(categ_file_param, 'rU') as fp:
        rows = csv.reader(fp)
        next(rows, None)  # drop header row
        for record in rows:
            commit_id = record[0]
            repo_path = record[1]
            category = record[3]
            label = 0 if category == 'N' else 1
            # make sure the repo path ends with a slash
            if repo_path[-1] != '/':
                repo_path = repo_path + '/'
            diff_path = repo_path + 'diffs/' + str(commit_id) + '.txt'
            diff_text = utility.giveCommentFreeContentFromFile(
                diff_path, the_flag)
            diff_tokens = tokenization_preprocessor.processTokensOfOneFile(
                diff_text)
            corpus.append(diff_tokens)
            ### labels are collected alongside the token text
            labels.append(label)
    return corpus, labels
# Example #3
# 0
def getICSTFilesForExa(datasetParam, kw_param):
    """Print dataset files that are labeled defective AND whose filtered
    token stream contains kw_param.

    NOTE(review): no return statement is visible here -- the function
    appears to continue past this chunk; confirm against the full file.
    """
    completeLabels = []
    completeCorpus = [
    ]  ## a list of lists with tokens from defected and non defected files
    with open(datasetParam, 'rU') as f:
        reader_ = csv.reader(f)
        # skip the CSV header row
        next(reader_, None)
        for row in reader_:
            # defectStatus = int(row[20])  ### need to convert to int , otherwise gives error for sklearn.are_under_roc
            defectStatus = int(
                row[-1]
            )  ### need to convert to int , otherwise gives error for sklearn.are_under_roc
            fileToRead = row[1]
            # strip comments from the file before tokenization
            fileContentAsStr = utility.giveCommentFreeContentFromFile(
                fileToRead)
            #print "!"*75
            #print fileContentAsStr
            filtered_str_from_one_file = tokenization_preprocessor.processTokensOfOneFile(
                fileContentAsStr)
            #print len(filtered_str_from_one_file)
            #print "="*75
            # report only defective files whose tokens include the keyword
            if ((kw_param in filtered_str_from_one_file)
                    and (defectStatus == 1)):
                print '-' * 25
                print fileToRead
                print '-' * 25
                print filtered_str_from_one_file
                print '-' * 25