def getTokensForTokenization(datasetParam):
    """Read a dataset CSV and build a file-level token corpus with defect labels.

    Each CSV row is expected to hold a source-file path in column 1 and the
    defect status in the last column (assumed to be 0/1 — TODO confirm against
    the dataset writer).

    Parameters:
        datasetParam: path to the CSV file listing source files and labels.

    Returns:
        (completeCorpus, completeLabels): a list of token lists (one entry per
        file, defective and non-defective alike) and a parallel list of
        integer defect labels.
    """
    completeLabels = []
    completeCorpus = []  # one token list per file
    with open(datasetParam, 'rU') as f:
        reader_ = csv.reader(f)
        next(reader_, None)  # skip the header row
        for row in reader_:
            # must be int, otherwise sklearn's AUC computation raises an error
            defectStatus = int(row[-1])
            fileToRead = row[1]
            fileContentAsStr = utility.giveCommentFreeContentFromFile(fileToRead)
            filtered_str_from_one_file = tokenization_preprocessor.processTokensOfOneFile(fileContentAsStr)
            completeCorpus.append(filtered_str_from_one_file)
            # after collecting the text, collect the matching label
            completeLabels.append(defectStatus)
    return completeCorpus, completeLabels
def getCommitLevelTokensForTokenization(categ_file_param, the_flag):
    """Build a commit-level token corpus plus defect labels from a category CSV.

    Each row supplies a commit id (col 0), a repository path (col 1), and a
    category letter (col 3). The commit's diff is loaded from
    '<repo>/diffs/<id>.txt', tokenized, and labeled 0 when the category is
    'N', 1 otherwise.

    Parameters:
        categ_file_param: path to the category CSV (header row is skipped).
        the_flag: forwarded to utility.giveCommentFreeContentFromFile.

    Returns:
        (corpus, labels): list of per-commit token lists and the parallel
        list of integer defect labels.
    """
    corpus_, labels_ = [], []
    with open(categ_file_param, 'rU') as csv_file:
        rows = csv.reader(csv_file)
        next(rows, None)  # drop header
        for record in rows:
            commit_id = record[0]
            repo_root = record[1]
            category = record[3]
            label = 0 if category == 'N' else 1
            # normalize the repo path so it ends with a slash
            if repo_root[-1] != '/':
                repo_root = repo_root + '/'
            diff_path = repo_root + 'diffs/' + str(commit_id) + '.txt'
            diff_text = utility.giveCommentFreeContentFromFile(diff_path, the_flag)
            commit_tokens = tokenization_preprocessor.processTokensOfOneFile(diff_text)
            corpus_.append(commit_tokens)
            labels_.append(label)
    return corpus_, labels_
def getICSTFilesForExa(datasetParam, kw_param):
    """Print details of defective files whose token stream contains kw_param.

    Scans the dataset CSV (file path in column 1, 0/1 defect label in the
    last column); for every file labeled defective whose filtered token
    stream contains kw_param, prints the file path and its tokens.

    Parameters:
        datasetParam: path to the dataset CSV (header row is skipped).
        kw_param: keyword to look for in each file's token stream.

    Returns:
        None — this is a diagnostic/exploration helper with print side
        effects only.
    """
    with open(datasetParam, 'rU') as f:
        reader_ = csv.reader(f)
        next(reader_, None)  # skip the header row
        for row in reader_:
            # must be int, otherwise sklearn's AUC computation raises an error
            defectStatus = int(row[-1])
            fileToRead = row[1]
            fileContentAsStr = utility.giveCommentFreeContentFromFile(fileToRead)
            filtered_str_from_one_file = tokenization_preprocessor.processTokensOfOneFile(fileContentAsStr)
            if (kw_param in filtered_str_from_one_file) and (defectStatus == 1):
                # single-argument print(...) emits identical output on
                # Python 2 and also parses on Python 3
                print('-' * 25)
                print(fileToRead)
                print('-' * 25)
                print(filtered_str_from_one_file)
                print('-' * 25)