Python processTokensOfOneFile 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: tokenization_preprocessor

메소드/함수: processTokensOfOneFile

hotexamples.com에 등록된 예제 수: 4

Python processTokensOfOneFile - 4개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python tokenization_preprocessor.processTokensOfOneFile의 실제 사용 예제 중 평점이 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

def getCommitLevelTokensForTokenization(categ_file_param, the_flag):
    """Collect processed commit contents and defect labels from a CSV file.

    The first row of the CSV is skipped. For every remaining row, column 0 is
    used as the commit id, column 1 as the repository path and column 3 as
    the category. The file ``<repo path>/diffs/<id>.txt`` is read through
    ``utility.giveCommentFreeContentFromFile`` and the result is handed to
    ``tokenization_preprocessor.processTokensOfOneFile``.

    Args:
        categ_file_param: Path of the CSV file to read.
        the_flag: Forwarded unchanged to
            ``utility.giveCommentFreeContentFromFile``; its meaning is
            defined by that helper.

    Returns:
        A ``(corpus, labels)`` tuple of two parallel lists. ``corpus[i]`` is
        whatever ``processTokensOfOneFile`` returned for the i-th data row;
        ``labels[i]`` is 0 when that row's category equals ``'N'`` and 1
        otherwise.
    """
    corpus = []
    labels = []
    # 'rU' is the Python 2 universal-newline read mode (open() rejects it on
    # Python 3.11+).
    with open(categ_file_param, 'rU') as handle:
        records = csv.reader(handle)
        next(records, None)  # drop the header row, if there is one
        for record in records:
            commit_id, repo_dir, category = record[0], record[1], record[3]
            label = 0 if category == 'N' else 1
            # Make sure the repository path ends with exactly one separator
            # before the fixed 'diffs/' sub-directory is appended.
            if repo_dir[-1] != '/':
                repo_dir += '/'
            diff_path = ''.join((repo_dir, 'diffs/', str(commit_id), '.txt'))
            raw_content = utility.giveCommentFreeContentFromFile(
                diff_path, the_flag)
            processed = tokenization_preprocessor.processTokensOfOneFile(
                raw_content)
            # Keep both lists index-aligned: content first, then its label.
            corpus.append(processed)
            labels.append(label)
    return corpus, labels

예제 #2

파일 보기

def getICSTFilesForExa(datasetParam, kw_param):
    """Print every defective file whose processed content contains a keyword.

    The first row of the CSV is skipped. For every remaining row, column 1 is
    the path of the file to read and the last column is the defect status.
    Each file is read through ``utility.giveCommentFreeContentFromFile`` and
    passed to ``tokenization_preprocessor.processTokensOfOneFile``. When the
    processed result contains ``kw_param`` and the defect status is 1, the
    file path and the processed result are printed between separator lines.

    Args:
        datasetParam: Path of the CSV file to read.
        kw_param: Value looked up with ``in`` against the processed result of
            each file.

    Returns:
        None. Matches are only reported on standard output.
    """
    separator = '-' * 25
    # 'rU' is the Python 2 universal-newline read mode (open() rejects it on
    # Python 3.11+).
    with open(datasetParam, 'rU') as f:
        reader_ = csv.reader(f)
        next(reader_, None)  # drop the header row, if there is one
        for row in reader_:
            # csv yields text; cast so the `== 1` comparison below can match.
            defectStatus = int(row[-1])
            fileToRead = row[1]
            fileContentAsStr = utility.giveCommentFreeContentFromFile(
                fileToRead)
            filtered_str_from_one_file = tokenization_preprocessor.processTokensOfOneFile(
                fileContentAsStr)
            if ((kw_param in filtered_str_from_one_file)
                    and (defectStatus == 1)):
                # Single-argument print(...) produces the same output under
                # the Python 2 print statement and the Python 3 function.
                print(separator)
                print(fileToRead)
                print(separator)
                print(filtered_str_from_one_file)
                print(separator)

예제 #3

파일 보기

def getTokensForTokenization(datasetParam):
    """Build the processed corpus and defect labels for a dataset CSV.

    The first row of the CSV is skipped. For every remaining row, column 1 is
    the path of the file to read and the last column is the defect status.
    Each file is read through ``utility.giveCommentFreeContentFromFile`` and
    passed to ``tokenization_preprocessor.processTokensOfOneFile``.

    Args:
        datasetParam: Path of the CSV file to read.

    Returns:
        A ``(completeCorpus, completeLabels)`` tuple of two parallel lists.
        ``completeCorpus[i]`` is whatever ``processTokensOfOneFile`` returned
        for the i-th data row and ``completeLabels[i]`` is that row's defect
        status as an ``int``.
    """
    completeLabels = []
    completeCorpus = []  # one processed entry per file, defective or not
    # 'rU' is the Python 2 universal-newline read mode (open() rejects it on
    # Python 3.11+).
    with open(datasetParam, 'rU') as f:
        reader_ = csv.reader(f)
        next(reader_, None)  # drop the header row, if there is one
        for row in reader_:
            # Labels are cast to int; per the original author's note, leaving
            # them as text causes an error in a later sklearn ROC computation
            # (that code is not visible here).
            defectStatus = int(row[-1])
            fileToRead = row[1]
            fileContentAsStr = utility.giveCommentFreeContentFromFile(
                fileToRead)
            filtered_str_from_one_file = tokenization_preprocessor.processTokensOfOneFile(
                fileContentAsStr)
            # Keep both lists index-aligned: content first, then its label.
            completeCorpus.append(filtered_str_from_one_file)
            completeLabels.append(defectStatus)

    return completeCorpus, completeLabels

예제 #4

파일 보기

파일: main_compare_corpus.py 프로젝트: akondrahman/IaC-Defect-Semantics

def getTokensForTokenization(datasetParam):
    """Split the processed corpus of a dataset CSV by defect status.

    The first row of the CSV is skipped. For every remaining row, column 1 is
    the path of the file to read and column 20 is the defect status. Each
    file is read through ``utility.giveCommentFreeFileContent`` and passed to
    ``tokenization_preprocessor.processTokensOfOneFile``.

    Args:
        datasetParam: Path of the CSV file to read.

    Returns:
        A ``(defective, non_defective, everything)`` tuple of lists.
        ``everything`` holds the processed result of every data row in file
        order. ``defective`` holds only the rows whose status equals 1 and
        ``non_defective`` holds all the others.
    """
    defective = []
    non_defective = []
    everything = []
    # 'rU' is the Python 2 universal-newline read mode (open() rejects it on
    # Python 3.11+).
    with open(datasetParam, 'rU') as handle:
        records = csv.reader(handle)
        next(records, None)  # drop the header row, if there is one
        for record in records:
            # Labels are cast to int; per the original author's note, leaving
            # them as text causes an error in a later sklearn ROC computation
            # (that code is not visible here).
            status = int(record[20])
            source_path = record[1]
            raw_content = utility.giveCommentFreeFileContent(source_path)
            processed = tokenization_preprocessor.processTokensOfOneFile(
                raw_content)
            everything.append(processed)
            # Route the same entry into the status-specific list as well.
            bucket = defective if status == 1 else non_defective
            bucket.append(processed)

    return defective, non_defective, everything