def main(argv=None): logging.basicConfig(filename='out.log', filemode='w', level=logging.ERROR) parser = argparse.ArgumentParser( description='pre-process java files for mallet') parser.add_argument('--configfile', dest='configfile', help='configuration filename', required=True) args = parser.parse_args() logging.debug('args.configfile: ' + args.configfile) config.configFilename = args.configfile config.get_config_from_filename() #fsock = open('out.log','w') #sys.stdout = fsock '''mvtosingle.moveJavaFilesToSingleDir() class_sep.extractInnerClasses()''' '''if config.langType == config.Langs.java: print 'Running java class separator...' java_class_sep.runClassSeparator()''' print 'Performing camel case splitting...' camelcase_sep.splitWordsByCamelCase() print 'Stemming...' stemmer.runStemmerOnDir() print 'Exiting'
break for c in line: if c.isalpha(): word += c.lower() else: if word: output += p.stem(word, 0,len(word)-1) word = '' output += c.lower() logging.debug( output,) outfile.write(output) infile.close() outfile.close() if __name__ == '__main__': try: opts, args = getopt.getopt(sys.argv[1:], "ho:v", ["help", "output="]) except getopt.GetoptError, err: # print help information and exit: print str(err) # will print something like "option -a not recognized" sys.exit(2) config.get_config_from_filename() fsock = open(config.workspaceLoc + '/PySourceSeparator/out.log','w') sys.stdout = fsock runStemmerOnDir()
break for c in line: if c.isalpha(): word += c.lower() else: if word: output += p.stem(word, 0, len(word) - 1) word = '' output += c.lower() logging.debug(output, ) outfile.write(output) infile.close() outfile.close() if __name__ == '__main__': try: opts, args = getopt.getopt(sys.argv[1:], "ho:v", ["help", "output="]) except getopt.GetoptError, err: # print help information and exit: print str(err) # will print something like "option -a not recognized" sys.exit(2) config.get_config_from_filename() fsock = open(config.workspaceLoc + '/PySourceSeparator/out.log', 'w') sys.stdout = fsock runStemmerOnDir()
def splitWordsByCamelCase(): innerClassSep = True config.get_config_from_filename() prefixOutput = config.subjectsDir #prefixInput = "/home/joshua/Documents/joshuaga-jpl-macbookpro/Documents/workspace" #prefixInput = "/home/joshua/Documents/joshuaga-jpl-macbookpro/Documents/workspace" prefixInput = config.subjectsDir; outputDir = '' if innerClassSep: outputDir = prefixOutput + os.sep + config.projectDirName + os.sep + 'cc_ics' else: outputDir = prefixOutput + os.sep + config.projectDirName + os.sep + 'camelcase_separated' logging.debug("outputDir: " + outputDir) startDir = prefixOutput + os.sep + config.projectDirName + os.sep + 'separated' logging.debug("startDir: " + startDir) javaKeywordFile = open('res' + os.sep + 'javakeywords','r') javaKeywords = [] for line in javaKeywordFile: javaKeywords.append(line.strip()) logging.debug("javaKeywords: " + str(javaKeywords)) cppKeywordFile = open('res'+os.sep+'cppkeywords','r') cppKeywords = [] for line in cppKeywordFile: cppKeywords.append(line.strip()) logging.debug("cppKeywords: " + str(cppKeywords)) cKeywordsFile = open('res'+os.sep+'ckeywords','r') cKeywords = [] for line in cKeywordsFile: cKeywords.append(line.strip()) logging.debug("cKeywords: " + str(cKeywords)) #outputDir = "/Users/joshuaga/Documents/Software Engineering Research/Subjects/freecs/camelcase_separated" #startDir = "/Users/joshuaga/Documents/Software Engineering Research/Subjects/freecs/separated" dirlisting = os.listdir(startDir) logging.debug(str(dirlisting)) expr = "" if config.langType == config.Langs.cpp: expr = ".cpp" elif config.langType == config.Langs.java: expr = ".java" elif config.langType == config.Langs.cfunc: expr = ".func" elif config.langType == config.Langs.cfile: expr = "(.h)|(.c)|(.S)|(.tbl)|(.p)|(.cpp)|(.cc)" for root, dirs, files in os.walk(startDir): printWalkData(root, dirs, files) for file in files: currdir, ext = os.path.splitext(file) if re.search(expr,ext): logging.debug("top-level class name and location: ") logging.debug(root + os.sep + file) #if (not (file == "LlamaChat.java") ): # continue fd = open(root + os.sep + file,'r') if not os.path.exists(outputDir): os.makedirs(outputDir) relativePath = root.replace(startDir,"") + os.sep absDirContainingFile = outputDir + os.sep + relativePath outFullFilename = absDirContainingFile + os.sep + file if not os.path.exists(absDirContainingFile): os.makedirs(absDirContainingFile) outf = open(outFullFilename, 'w') for lineNo, line in enumerate(fd): pattern = re.compile('[\W_]+') alphanumericOnlyLine = pattern.sub(' ',line) splitLineList = alphanumericOnlyLine.split(' ') removeList = [] logging.debug('splitLineList: ' + str(splitLineList)) for strSplit in splitLineList: logging.debug('str in splitLineList: ' + strSplit) strippedStr = strSplit.strip() if config.langType == config.Langs.java: for keyword in javaKeywords: #print 'keyword: ', keyword #print 'strippedStr: ', strippedStr if keyword.lower() == strippedStr.lower(): logging.debug('Adding ' + strSplit + ' to removeList') removeList.append(strSplit) if config.langType == config.Langs.cpp: for keyword in cppKeywords: #print 'keyword: ', keyword #print 'strippedStr: ', strippedStr if keyword.lower() == strippedStr.lower(): logging.debug('Adding ' + strSplit + ' to removeList') if config.langType == config.Langs.cfunc or config.langType == config.Langs.cfile: for keyword in cKeywords: #print 'keyword: ', keyword #print 'strippedStr: ', strippedStr if keyword.lower() == strippedStr.lower(): logging.debug('Adding ' + strSplit + ' to removeList') removeList.append(strSplit) logging.debug('removeList: ' + str(removeList)) for strRemoved in removeList: splitLineList = filter(lambda item : item.lower() != strRemoved.lower(),splitLineList) logging.debug('line with keywords removed: ') logging.debug(str(splitLineList)) splitLine = '' for i in range(len(splitLineList)): strSplit = splitLineList[i] splitLine = splitLine + strSplit if i != (len(splitLineList) - 1 ): splitLine = splitLine + ' ' splitLine = splitLine upperCount = 0 lineToWrite = '' LOWER, UPPER, NOTALPHA, UNINIT = range(4) lastCharType = UNINIT for index, c in enumerate(splitLine): if not re.search(r"[a-zA-Z]",c): # if not a letter lineToWrite = lineToWrite + c lastCharType = NOTALPHA elif c.islower(): # current character is lowercase if (upperCount > 2 and lastCharType == UPPER): lineToWrite = lineToWrite[:len(lineToWrite)-1] + " " + lineToWrite[len(lineToWrite)-1:] + c else: lineToWrite = lineToWrite + c lastCharType = LOWER upperCount = 0 elif c.isupper(): # current character is uppercase if (lastCharType == LOWER): lineToWrite = lineToWrite + " " lastCharType = UPPER upperCount = upperCount + 1 lineToWrite = lineToWrite + c else: errorStr = 'Unexpected character ' + c + ' in ' + splitLine + '\n' print errorStr logging.error(errorStr) sys.exit(1) outf.write(lineToWrite) fd.close()
def splitWordsByCamelCase(): innerClassSep = True config.get_config_from_filename() prefixOutput = config.subjectsDir #prefixInput = "/home/joshua/Documents/joshuaga-jpl-macbookpro/Documents/workspace" #prefixInput = "/home/joshua/Documents/joshuaga-jpl-macbookpro/Documents/workspace" prefixInput = config.subjectsDir outputDir = '' if innerClassSep: outputDir = prefixOutput + os.sep + config.projectDirName + os.sep + 'cc_ics' else: outputDir = prefixOutput + os.sep + config.projectDirName + os.sep + 'camelcase_separated' logging.debug("outputDir: " + outputDir) startDir = prefixOutput + os.sep + config.projectDirName + os.sep + 'separated' logging.debug("startDir: " + startDir) javaKeywordFile = open('res' + os.sep + 'javakeywords', 'r') javaKeywords = [] for line in javaKeywordFile: javaKeywords.append(line.strip()) logging.debug("javaKeywords: " + str(javaKeywords)) cppKeywordFile = open('res' + os.sep + 'cppkeywords', 'r') cppKeywords = [] for line in cppKeywordFile: cppKeywords.append(line.strip()) logging.debug("cppKeywords: " + str(cppKeywords)) cKeywordsFile = open('res' + os.sep + 'ckeywords', 'r') cKeywords = [] for line in cKeywordsFile: cKeywords.append(line.strip()) logging.debug("cKeywords: " + str(cKeywords)) #outputDir = "/Users/joshuaga/Documents/Software Engineering Research/Subjects/freecs/camelcase_separated" #startDir = "/Users/joshuaga/Documents/Software Engineering Research/Subjects/freecs/separated" dirlisting = os.listdir(startDir) logging.debug(str(dirlisting)) expr = "" if config.langType == config.Langs.cpp: expr = ".cpp" elif config.langType == config.Langs.java: expr = ".java" elif config.langType == config.Langs.cfunc: expr = ".func" elif config.langType == config.Langs.cfile: expr = "(.h)|(.c)|(.S)|(.tbl)|(.p)|(.cpp)|(.cc)" for root, dirs, files in os.walk(startDir): printWalkData(root, dirs, files) for file in files: currdir, ext = os.path.splitext(file) if re.search(expr, ext): logging.debug("top-level class name and location: ") logging.debug(root + os.sep + file) #if (not (file == "LlamaChat.java") ): # continue fd = open(root + os.sep + file, 'r') if not os.path.exists(outputDir): os.makedirs(outputDir) relativePath = root.replace(startDir, "") + os.sep absDirContainingFile = outputDir + os.sep + relativePath outFullFilename = absDirContainingFile + os.sep + file if not os.path.exists(absDirContainingFile): os.makedirs(absDirContainingFile) outf = open(outFullFilename, 'w') for lineNo, line in enumerate(fd): pattern = re.compile('[\W_]+') alphanumericOnlyLine = pattern.sub(' ', line) splitLineList = alphanumericOnlyLine.split(' ') removeList = [] logging.debug('splitLineList: ' + str(splitLineList)) for strSplit in splitLineList: logging.debug('str in splitLineList: ' + strSplit) strippedStr = strSplit.strip() if config.langType == config.Langs.java: for keyword in javaKeywords: #print 'keyword: ', keyword #print 'strippedStr: ', strippedStr if keyword.lower() == strippedStr.lower(): logging.debug('Adding ' + strSplit + ' to removeList') removeList.append(strSplit) if config.langType == config.Langs.cpp: for keyword in cppKeywords: #print 'keyword: ', keyword #print 'strippedStr: ', strippedStr if keyword.lower() == strippedStr.lower(): logging.debug('Adding ' + strSplit + ' to removeList') if config.langType == config.Langs.cfunc or config.langType == config.Langs.cfile: for keyword in cKeywords: #print 'keyword: ', keyword #print 'strippedStr: ', strippedStr if keyword.lower() == strippedStr.lower(): logging.debug('Adding ' + strSplit + ' to removeList') removeList.append(strSplit) logging.debug('removeList: ' + str(removeList)) for strRemoved in removeList: splitLineList = filter( lambda item: item.lower() != strRemoved.lower(), splitLineList) logging.debug('line with keywords removed: ') logging.debug(str(splitLineList)) splitLine = '' for i in range(len(splitLineList)): strSplit = splitLineList[i] splitLine = splitLine + strSplit if i != (len(splitLineList) - 1): splitLine = splitLine + ' ' splitLine = splitLine upperCount = 0 lineToWrite = '' LOWER, UPPER, NOTALPHA, UNINIT = range(4) lastCharType = UNINIT for index, c in enumerate(splitLine): if not re.search(r"[a-zA-Z]", c): # if not a letter lineToWrite = lineToWrite + c lastCharType = NOTALPHA elif c.islower(): # current character is lowercase if (upperCount > 2 and lastCharType == UPPER): lineToWrite = lineToWrite[:len( lineToWrite) - 1] + " " + lineToWrite[ len(lineToWrite) - 1:] + c else: lineToWrite = lineToWrite + c lastCharType = LOWER upperCount = 0 elif c.isupper(): # current character is uppercase if (lastCharType == LOWER): lineToWrite = lineToWrite + " " lastCharType = UPPER upperCount = upperCount + 1 lineToWrite = lineToWrite + c else: errorStr = 'Unexpected character ' + c + ' in ' + splitLine + '\n' print errorStr logging.error(errorStr) sys.exit(1) outf.write(lineToWrite) fd.close()