def getFilesToLex(top_lvl_dir, extension):
    """Collect the file paths to lex under top_lvl_dir.

    If top_lvl_dir is a directory, every file below it (recursively)
    whose name matches the wildcard `extension` (e.g. "*.java") is
    returned; for C ("*.c") the matching "*.h" headers are included as
    well.  If top_lvl_dir is instead a single file, a one-element list
    holding its absolute path is returned.

    Parameters:
    ----------
    top_lvl_dir - head directory of a project/corpus, or a single file
    extension - wildcard pattern for the extension to collect (e.g. *.java)

    Returns:
    -------
    list of matching file paths
    """
    root = os.path.abspath(top_lvl_dir)
    if not os.path.isdir(root):
        # It's a single file, not a corpus directory.
        return [root]
    folder = Folder(root)
    matches = folder.fullFileNames(extension, recursive=True)
    if extension == "*.c":
        # C sources come with headers; pick those up too.
        matches += folder.fullFileNames("*.h", recursive=True)
    return matches
# Scan every matching source file in a corpus and dump its import lines to
# output_file, tagging each as an internal or external API use.
# NOTE(review): the first two lines below almost certainly sit inside a
# usage/arg-count check whose `if` header is outside this chunk, and
# `isInternalAPI` / `Folder` are defined elsewhere in the file.
print(
    "Example: python dumpImports.py ~/CodeNLP/HaskellProjects/ *.hs haskellImports.txt"
)
quit()

print(sys.argv)

codeFolder = Folder(os.path.abspath(sys.argv[1]))
# File type to be considered
fileExtension = sys.argv[2]
output_file = sys.argv[3]

# Running tallies of internally vs. externally defined imports.
internalCount = 0
externalCount = 0
with open(output_file, 'w') as out:
    for path in codeFolder.fullFileNames(fileExtension, recursive=True):
        out.write("File: " + path + "\n")
        try:
            with open(path, 'r') as f:
                for line in f:
                    line = line.replace("\n", "")
                    # Collects both "import " and "open " module references
                    # (presumably Haskell / ML-family sources -- TODO confirm).
                    if (line.strip().startswith("import ")
                            or line.strip().startswith("open ")):
                        # Drop the keyword, leaving just the module name.
                        shortened = line.replace("import ",
                                                 "").replace("open ", "")
                        if (isInternalAPI(path, shortened)):
                            internalCount += 1
                            out.write(line + " (Internal)\n")
                        else:
                            externalCount += 1
                            out.write(line + " (External)\n")
            # NOTE(review): chunk ends mid-`try`; the matching `except`
            # is outside this view.
        # --- Tail of the per-file worker: decide whether one JS file is
        # minified.  NOTE(review): this chunk starts mid-function; the
        # enclosing `def processFile(js_file_path)`, its `try:`, and the
        # definitions of `ok`, `pid`, `cleanup`, `MiniChecker`,
        # `TimeExceededError`, `UnicodeWriter` and `Folder` are outside
        # this view.
        if ok:
            # Beautification succeeded; compare the beautified temp copy.
            mc = MiniChecker('tmp_%d.b.js' % pid)
            try:
                isMini = mc.compare(keep_mini=False)
            except Exception as e:
                # Record the failure message in place of a boolean verdict.
                isMini = str(e)
            cleanup(pid)  # remove the per-pid temp files
            return [os.path.basename(js_file_path), isMini]
        else:
            cleanup(pid)
            return [os.path.basename(js_file_path), 'Beautifier failed']
    except TimeExceededError:
        # Processing took too long; record a timeout for this file.
        cleanup(pid)
        return [os.path.basename(js_file_path), 'Timeout']


# --- Driver: classify every *.js file in the corpus in parallel and
# write one CSV row per file.
corpus_dir = Folder(sys.argv[1])
pool = multiprocessing.Pool(processes=8)
# 'wb' + UnicodeWriter suggests Python 2 csv handling -- TODO confirm.
with open('isMinified.csv', 'wb') as f:
    writer = UnicodeWriter(f)
    # imap streams results from the workers in input order.
    for line in pool.imap(processFile, corpus_dir.fullFileNames("*.js")):
        writer.writerow(line)
# Pull the command-line configuration out of the parsed argparse namespace.
# NOTE(review): `args`, `inputDir`, `fileType` and `listUtils` are defined
# outside this chunk.
outputFile = args.outputFile
ngramOrder = args.ngramOrder
testLocation = args.testLocation
projectMap = args.projectMap
trackTypes = args.trackTypes
independentSplit = args.independentSplit

# An n-gram order of zero or less is meaningless.
assert (ngramOrder > 0)

if (args.independentSplit):
    # Divide the corpus into two.
    # 1) Get set of files in the base directory.
    basePath = os.path.abspath(inputDir)
    codeFolder = Folder(basePath)
    # This variant requires regexes of the form *.<ext> not .*.<ext>,
    # hence fileType[1:] drops the leading character.
    fileList = codeFolder.fullFileNames(fileType[1:], recursive=False)
    # 2) Divide into 2 randomly.
    splitFiles = listUtils.partitionList(fileList)
    # 3) Save each half in a sibling directory
    #    (cd ../input_dir; mkdir [rank|freq]_input_dir; use ln -s to copy).
    (parentDir, localDir) = os.path.split(basePath)
    rankDir = os.path.join(parentDir, "rank_" + localDir)
    freqDir = os.path.join(parentDir, "freq_" + localDir)
    #print(splitFiles[0])
    #print(splitFiles[1])
    #print(len(splitFiles[0]))
    #print(len(splitFiles[1]))
    #print(rankDir)
    #print(freqDir)
    #quit()

#-------------------------------------------------------------------------------------------
#Project -> file mapping projectFiles = {} #String -> #List of Strings #Functions defined in this corpus corpusDefintions = {} #Count of Error tokens errorCount = 0 if (token_split.lower() == "api"): #Load in internally defined functions corpusDefinitions = pickle.load( open(os.path.join(basePath, "definitions.pickle"), 'r')) i = 0 fileList = codeFolder.fullFileNames(fileExtension, recursive=False) if (fileExtension == "*.c"): #For c, check for header files too. fileList += codeFolder.fullFileNames("*.h", recursive=False) #Each top - level directory corresponds to a project. for path in fileList: print(path) #print("In Loop!") try: #if(True): #print("Path: " + path) #fileContents = "" components = path.split(".") fileContents = "" #Minimized Javascript files require some preprocessing to estimate where the new lines were #if(len(components) >= 3 and components[-2].lower() == "min" and components[-1].lower() == "js"): # fileContents = preprocessJSMinFile(path)
# Build the stopword list (one lowercase word per line) when stopword
# removal is enabled.  NOTE(review): `noStopwords`, `stopwordsFile`,
# `inputDir`, `outputDir`, `lexWikiFile`, `Folder` and `codecs` are
# defined/imported outside this chunk.
stopwords = []
if (noStopwords == 1):
    with open(stopwordsFile, 'r') as f:
        for line in f:
            stopwords.append(line.lower().strip())

basePath = os.path.abspath(inputDir)
corpusFolder = Folder(basePath)

# Stream the Wikipedia extract files article by article: lines between a
# "<doc" open tag and a "</doc" close tag are accumulated, then lexed.
inArticle = False
articleText = []
i = 0  # running counter threaded through lexWikiFile calls
for path in corpusFolder.fullFileNames("*.txt", recursive=False):
    # Read in inputFile
    with codecs.open(
            path, 'r', encoding='latin1', errors='ignore'
    ) as f:  # Wikipedia English is UTF-8, so there shouldn't be errors?
        for line in f:
            if (line.startswith("<doc")):
                # Some metadata here that might be useful
                inArticle = True
            elif (line.startswith("</doc")):
                # End of an article: lex what was collected and reset.
                inArticle = False
                i = lexWikiFile(articleText, i, outputDir, noStopwords,
                                stopwords)
                articleText = []
            elif (inArticle and line.strip() != "ENDOFARTICLE."):
                articleText.append(line)
corpus_dir = Folder(sys.argv[1]) def str_to_bool(s): if s == 'True': return True return False isMini = {} reader = UnicodeReader(open('isMinified.csv', 'r')) for row in reader: isMini[row[0]] = str_to_bool(row[1]) eligible = [ os.path.basename(f) for f in corpus_dir.fullFileNames("*.js") if not isMini.get(os.path.basename(f), False) ] size = len(eligible) tt = int(0.8 * size) training_size = int(0.9 * tt) tuning_size = int(tt - training_size) testing_size = size - tt print 'Total:', size print 'Training:', training_size print 'Tuning:', tuning_size print 'Testing:', testing_size training_sample = random.sample(eligible, training_size)
# Lex every matching source file under a corpus root with Pygments and dump
# the comment-stripped token stream to an output file (Python 2 script).
# NOTE(review): the two lines below almost certainly sit inside an
# argument-count usage check whose `if` header is outside this chunk;
# `Folder`, `UnicodeWriter`, `lex`, `get_lexer_for_filename`,
# `languageForLexer`, `tokensExceptTokenType` and `Token` are defined or
# imported elsewhere in the file.
print 'Usage: python lex.py path_to_code_folder file_name_extension output_file'
exit()

print sys.argv

# Path to root folder containing the source code
codeFolder = Folder(os.path.abspath(sys.argv[1]))
# File type to be considered
fileExtension = sys.argv[2]
# Path to output file with tokenized code
outputFile = open(os.path.abspath(sys.argv[3]), 'wb')
writer = UnicodeWriter(outputFile)

for path in codeFolder.fullFileNames(fileExtension, recursive=True):
    try:
        fileContents = ''.join(open(path, 'r').readlines())
        lexer = get_lexer_for_filename(path)
        tokens = lex(fileContents, lexer)  # returns a generator of tuples
        tokensList = list(tokens)
        language = languageForLexer(lexer)
        # Strip comments (this is language dependent; here only Python)
        lexedWoComments = tokensExceptTokenType(tokensList, Token.Comment)
        lexedWoComments = tokensExceptTokenType(lexedWoComments,
                                                Token.Literal.String.Doc)
        ## Token.Name.Decorator ?
        # Write to file
        for token in [t[1] for t in lexedWoComments]:
            outputFile.write(token.encode("utf-8"))
        # NOTE(review): chunk ends mid-`try`; the matching `except` is
        # outside this view.  The per-file handle opened above is never
        # explicitly closed.
"""Copy a random sample of *.js files from a corpus folder to a new directory.

Usage: python <script> corpus_dir sample_size out_dir
"""
import os
import random
from shutil import copyfile
import sys

# Make the parent directory importable so folderManager can be found.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
from folderManager import Folder

source_folder = Folder(sys.argv[1])
how_many = int(sys.argv[2])
target_dir = Folder(sys.argv[3]).create()

# Draw the sample without replacement and copy each file over, keeping
# its basename.
for chosen in random.sample(source_folder.fullFileNames("*.js"), how_many):
    copyfile(chosen, os.path.join(target_dir, os.path.basename(chosen)))
mosesStatus = checkMosesServers( moses_url_dict) #Eventually turn into list of failed servers #Do a simple kill and restart for the moment (can change to something more selective later). if (args.debug): print(mosesStatus) for port, status in mosesStatus.iteritems(): if (status == "E" or status == "F"): mosesFail = True if (not mosesFail): #Stop checking once the servers are online. break print("Servers are online.") if (args.batch): inputFolder = Folder(os.path.abspath(args.input)) fileList = inputFolder.fullFileNames("*.js", recursive=False) for next_file in fileList: print("Renaming " + str(next_file)) base_file = ntpath.basename(next_file) output_file = \ os.path.join(args.output, base_file[:base_file.rfind(".")] + ".out.js") try: processFile(next_file, output_file, args) except: print("Renaming of " + str(next_file) + " failed.") else: print("Renaming " + str(args.input)) processFile(args.input, args.output, args)
    # NOTE(review): this chunk starts mid-`try` -- the `try:` matching the
    # `except:` below, plus `getProjectName`, `getName`, `dictUtils` and
    # `Folder`, are outside this view.
    inputDir = sys.argv[1]
    fileExt = sys.argv[2]
    dirLevel = int(sys.argv[3])
    # The directory level compared must be at least one component deep.
    assert (dirLevel >= 1)
except:
    # Any parse failure (missing args, non-integer level) prints usage.
    print("usage: python projectSimilarity.py inputDir fileExt dirLevel")
    print("dirlevel must be >= 1")
    quit()

basePath = os.path.abspath(inputDir)
codeFolder = Folder(basePath)

project_profiles = {}  # project name -> set of names seen at dirLevel

#Read in directory structure
for path in codeFolder.fullFileNames(fileExt, recursive=False):
    # Make the path relative to the corpus root.
    reducedPath = path.replace(basePath + "/", "")
    project = getProjectName(reducedPath)
    name = getName(reducedPath, dirLevel)
    #print(project)
    #print(name)
    project_profiles = dictUtils.addItemToDictSet(project_profiles, project,
                                                  name)

jaccards = []  #list of tuples

#Perform comparisons (Jaccard index) between every pair of projects:
for k1, k2 in itertools.combinations(project_profiles, 2):
    unionSize = len(set.union(project_profiles[k1], project_profiles[k2]))
    intersectionSize = len(
        set.intersection(project_profiles[k1], project_profiles[k2]))
    # NOTE(review): chunk ends here; the ratio computation presumably
    # follows outside this view.
"""Strip <s>/</s> sentence markers, in place, from every *.txt file of the
TECCL POS corpus."""
import os
from folderManager import Folder

posFolder = Folder(
    "/Users/caseycas/CodeNLP/EFLCorpus/TECCL_Corpus_V1.1/02TECCL_V1.1_POS/")
txtFiles = posFolder.fullFileNames("*.txt")
print(len(txtFiles))

for path in txtFiles:
    print(path)
    # Read the whole file first, then rewrite it without the markers.
    with open(path, 'r') as src:
        cleaned = [ln.replace("<s>", "").replace("</s>", "") for ln in src]
    with open(path, 'w') as dst:
        dst.writelines(cleaned)
"""Rewrite every *.tokens file under the sample directory, replacing the
literal <UNK> marker with UNK (in place)."""
import os
from folderManager import Folder

inputFolder = Folder("/Users/caseycas/CodeNLP/EnglishSample/all/")
fileList = inputFolder.fullFileNames("*.tokens", recursive=True)

for path in fileList:
    # Read, transform, then write back in place.  `with` also closes the
    # read handle, which the original left open.
    with open(path, 'r') as f:
        fileContents = f.read()
    fileContents = fileContents.replace("<UNK>", "UNK")
    with open(path, 'w') as f:
        # BUG FIX: the original wrote the undefined name `fileContents4`,
        # which raised NameError on the first file.
        f.write(fileContents)