def getFilesToLex(top_lvl_dir, extension):
    """
    Desc: Get all filepaths recursively from top_lvl_dir that
    match extension

    Parameters:
    ----------
    top_lvl_dir - a directory that is the head of a project or
    corpus.  We will recursively examine all files under it

    extension - A wildcard regex for the type of extension
    to collect (e.g. *.java)

    Returns:
    --------
    list of filepaths matching extension in top_lvl_dir and
    any subdirector of it.
    """
    root = os.path.abspath(top_lvl_dir)

    # Not a directory: treat the argument as a single source file.
    if not os.path.isdir(root):
        return [root]

    folder = Folder(root)
    matches = folder.fullFileNames(extension, recursive=True)
    # For C, header files belong to the corpus as well.
    if extension == "*.c":
        matches += folder.fullFileNames("*.h", recursive=True)
    return matches
示例#2
0
    print(
        "Example: python dumpImports.py ~/CodeNLP/HaskellProjects/ *.hs haskellImports.txt"
    )
    quit()

# Echo the command-line arguments for traceability.
print(sys.argv)

# Root folder of the corpus to scan for import/open statements.
codeFolder = Folder(os.path.abspath(sys.argv[1]))
# File type to be considered
fileExtension = sys.argv[2]
# Report file: one header line per source file plus one line per import found.
output_file = sys.argv[3]

# Running tallies of imports resolved inside vs. outside this corpus.
internalCount = 0
externalCount = 0
with open(output_file, 'w') as out:
    for path in codeFolder.fullFileNames(fileExtension, recursive=True):
        out.write("File: " + path + "\n")
        try:
            with open(path, 'r') as f:
                for line in f:
                    line = line.replace("\n", "")
                    # Haskell-style "import X" or OCaml/F#-style "open X" lines.
                    if (line.strip().startswith("import ")
                            or line.strip().startswith("open ")):
                        # NOTE(review): replace() strips EVERY occurrence of
                        # "import " / "open ", not only the leading keyword —
                        # confirm module names can never contain these substrings.
                        shortened = line.replace("import ",
                                                 "").replace("open ", "")
                        if (isInternalAPI(path, shortened)):
                            internalCount += 1
                            out.write(line + " (Internal)\n")
                        else:
                            externalCount += 1
                            out.write(line + " (External)\n")
示例#3
0
        if ok:
            mc = MiniChecker('tmp_%d.b.js' % pid)
            try:
                isMini = mc.compare(keep_mini=False)
            except Exception as e:
                isMini = str(e)
                
            cleanup(pid)
            return [os.path.basename(js_file_path), isMini]
        
        else:
            cleanup(pid)
            return [os.path.basename(js_file_path), 'Beautifier failed']
        
    except TimeExceededError:
        
        cleanup(pid)
        return [os.path.basename(js_file_path), 'Timeout']
        

    
    
# Root folder of the *.js corpus to classify.
corpus_dir = Folder(sys.argv[1])

# Fan the per-file minification check out over 8 worker processes;
# imap preserves input order while streaming results back.
pool = multiprocessing.Pool(processes=8)
# 'wb' mode: Python 2 csv writers expect a binary file object.
with open('isMinified.csv', 'wb') as f:
    writer = UnicodeWriter(f)
    for line in pool.imap(processFile, corpus_dir.fullFileNames("*.js")):
        writer.writerow(line)
示例#4
0
# Unpack parsed command-line options into local names.
outputFile = args.outputFile
ngramOrder = args.ngramOrder
testLocation = args.testLocation
projectMap = args.projectMap
trackTypes = args.trackTypes
independentSplit = args.independentSplit

# An n-gram order of 0 or less is meaningless.
assert (ngramOrder > 0)

if (args.independentSplit):
    #Divide the corpus into two.
    #1) Get set of files in the base directory.
    basePath = os.path.abspath(inputDir)
    codeFolder = Folder(basePath)
    #This variant requires wildcards of the form *.<ext>, not .*.<ext>,
    #hence fileType[1:] to drop the leading character.
    fileList = codeFolder.fullFileNames(fileType[1:], recursive=False)
    #2) Divide into 2 randomly.
    splitFiles = listUtils.partitionList(fileList)
    #3) Save each half in a sibling directory of the input
    #   (rank_<dir> and freq_<dir>). (use ln -s to copy)
    (parentDir, localDir) = os.path.split(basePath)
    rankDir = os.path.join(parentDir, "rank_" + localDir)
    freqDir = os.path.join(parentDir, "freq_" + localDir)
    #print(splitFiles[0])
    #print(splitFiles[1])
    #print(len(splitFiles[0]))
    #print(len(splitFiles[1]))
    #print(rankDir)
    #print(freqDir)
    #quit()

    #-------------------------------------------------------------------------------------------
示例#5
0
#Project -> file mapping
projectFiles = {}  #String -> #List of Strings
#Functions defined in this corpus.
#FIX: was misspelled "corpusDefintions", so the initializer created a
#different variable than the one assigned by pickle.load below.
corpusDefinitions = {}

#Count of Error tokens
errorCount = 0

if (token_split.lower() == "api"):
    #Load in internally defined functions
    corpusDefinitions = pickle.load(
        open(os.path.join(basePath, "definitions.pickle"), 'r'))

i = 0

# Collect the corpus file list; C corpora include their headers too.
fileList = codeFolder.fullFileNames(fileExtension, recursive=False)
if (fileExtension == "*.c"):  #For c, check for header files too.
    fileList += codeFolder.fullFileNames("*.h", recursive=False)
#Each top - level directory corresponds to a project.
for path in fileList:
    print(path)
    #print("In Loop!")
    try:
        #if(True):
        #print("Path: " + path)
        #fileContents = ""
        components = path.split(".")
        fileContents = ""
        #Minimized Javascript files require some preprocessing to estimate where the new lines were
        #if(len(components) >= 3 and components[-2].lower() == "min" and components[-1].lower() == "js"):
        #    fileContents = preprocessJSMinFile(path)
# Stopword list (lowercased), loaded only when stopword removal is enabled.
stopwords = []

if (noStopwords == 1):
    with open(stopwordsFile, 'r') as f:
        for line in f:
            stopwords.append(line.lower().strip())

basePath = os.path.abspath(inputDir)
corpusFolder = Folder(basePath)

# State for the article scanner below.
inArticle = False
articleText = []
i = 0  # running index threaded through lexWikiFile calls

# Each *.txt file holds many articles wrapped in <doc ...> ... </doc> tags.
for path in corpusFolder.fullFileNames("*.txt", recursive=False):
    #Read in inputFile
    with codecs.open(
            path, 'r', encoding='latin1', errors='ignore'
    ) as f:  #NOTE(review): opened as latin1 but described as UTF-8 — confirm the dump's real encoding
        for line in f:
            if (line.startswith("<doc")
                ):  #Some metadata here that might be useful
                inArticle = True
            elif (line.startswith("</doc")):
                # Article finished: lex it and reset the buffer.
                inArticle = False
                i = lexWikiFile(articleText, i, outputDir, noStopwords,
                                stopwords)
                articleText = []
            elif (inArticle and line.strip() != "ENDOFARTICLE."):
                articleText.append(line)
示例#7
0
corpus_dir = Folder(sys.argv[1])


def str_to_bool(s):
    """Return True only for the exact string 'True'; anything else is False."""
    return s == 'True'


# filename (basename) -> True if that file was judged minified.
isMini = {}
reader = UnicodeReader(open('isMinified.csv', 'r'))
for row in reader:
    isMini[row[0]] = str_to_bool(row[1])

# Files eligible for the split: every *.js not flagged as minified.
eligible = [
    os.path.basename(f) for f in corpus_dir.fullFileNames("*.js")
    if not isMini.get(os.path.basename(f), False)
]

# 80/20 split into train+tune vs. test; the 80% is further split
# 90/10 into training vs. tuning.
size = len(eligible)
tt = int(0.8 * size)
training_size = int(0.9 * tt)
tuning_size = int(tt - training_size)
testing_size = size - tt

# Python 2 print statements.
print 'Total:', size
print 'Training:', training_size
print 'Tuning:', tuning_size
print 'Testing:', testing_size

# Uniform sample (without replacement) for the training set.
training_sample = random.sample(eligible, training_size)
    print 'Usage: python lex.py path_to_code_folder file_name_extension output_file'
    exit()

# Python 2 print statement: echo arguments for traceability.
print sys.argv

# Path to root folder containing the source code
codeFolder = Folder(os.path.abspath(sys.argv[1]))

# File type to be considered (wildcard, e.g. "*.py")
fileExtension = sys.argv[2]

# Path to output file with tokenized code ('wb': UnicodeWriter and the
# .encode("utf-8") writes below expect a binary file object)
outputFile = open(os.path.abspath(sys.argv[3]), 'wb')
writer = UnicodeWriter(outputFile)

for path in codeFolder.fullFileNames(fileExtension, recursive=True):
    try:
        fileContents = ''.join(open(path, 'r').readlines())
        lexer = get_lexer_for_filename(path)
        tokens = lex(fileContents, lexer) # returns a generator of tuples
        tokensList = list(tokens)
        language = languageForLexer(lexer)
    
        # Strip comments (this is language dependent; here only Python)
        lexedWoComments = tokensExceptTokenType(tokensList, Token.Comment)
        lexedWoComments = tokensExceptTokenType(lexedWoComments, Token.Literal.String.Doc)
        ## Token.Name.Decorator ?

        # Write to file
        for token in [t[1] for t in lexedWoComments]:
            outputFile.write(token.encode("utf-8"))
示例#9
0
'''
Select a random sample from a corpus folder.
'''

import os
import random
from shutil import copyfile

import sys
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))

from folderManager import Folder

# argv: corpus folder, sample size, output folder (created if missing).
corpus_dir = Folder(sys.argv[1])
sample_size = int(sys.argv[2])
out_dir = Folder(sys.argv[3]).create()

# Uniform random sample (without replacement) of the corpus's *.js files.
corpus_sample = random.sample(corpus_dir.fullFileNames("*.js"), sample_size)

# Copy each sampled file flat into the output directory.
for f in corpus_sample:
    copyfile(f, os.path.join(out_dir, os.path.basename(f)))


示例#10
0
        mosesStatus = checkMosesServers(
            moses_url_dict)  #Eventually turn into list of failed servers
        #Do a simple kill and restart for the moment (can change to something more selective later).
        if (args.debug):
            print(mosesStatus)
        for port, status in mosesStatus.iteritems():
            if (status == "E" or status == "F"):
                mosesFail = True

        if (not mosesFail):  #Stop checking once the servers are online.
            break

print("Servers are online.")

if (args.batch):
    # Batch mode: rename every *.js file in the input folder.
    inputFolder = Folder(os.path.abspath(args.input))
    fileList = inputFolder.fullFileNames("*.js", recursive=False)
    for next_file in fileList:
        print("Renaming " + str(next_file))
        base_file = ntpath.basename(next_file)
        # foo.js -> <output dir>/foo.out.js
        output_file = \
            os.path.join(args.output,
                base_file[:base_file.rfind(".")] + ".out.js")
        try:
            processFile(next_file, output_file, args)
        # FIX: was a bare "except:", which also swallows SystemExit and
        # KeyboardInterrupt; per-file best-effort behavior is preserved.
        except Exception:
            print("Renaming of " + str(next_file) + " failed.")
else:
    # Single-file mode.
    print("Renaming " + str(args.input))
    processFile(args.input, args.output, args)
    inputDir = sys.argv[1]
    fileExt = sys.argv[2]
    dirLevel = int(sys.argv[3])
    assert (dirLevel >= 1)
except:
    print("usage:  python projectSimilarity.py inputDir fileExt dirLevel")
    print("dirlevel must be >= 1")
    quit()

basePath = os.path.abspath(inputDir)
codeFolder = Folder(basePath)

# project name -> set of path components seen at the requested depth
project_profiles = {}

#Read in directory structure
for path in codeFolder.fullFileNames(fileExt, recursive=False):
    # Make the path relative to the corpus root.
    reducedPath = path.replace(basePath + "/", "")
    project = getProjectName(reducedPath)
    name = getName(reducedPath, dirLevel)
    #print(project)
    #print(name)
    project_profiles = dictUtils.addItemToDictSet(project_profiles, project,
                                                  name)

jaccards = []  #list of tuples

#Perform comparisons (Jaccard index) over every pair of projects:
#Jaccard(A, B) = |A & B| / |A | B|
for k1, k2 in itertools.combinations(project_profiles, 2):
    unionSize = len(set.union(project_profiles[k1], project_profiles[k2]))
    intersectionSize = len(
        set.intersection(project_profiles[k1], project_profiles[k2]))
示例#12
0
import os
from folderManager import Folder

codeFolder = Folder("/Users/caseycas/CodeNLP/EFLCorpus/TECCL_Corpus_V1.1/02TECCL_V1.1_POS/")

fileList = codeFolder.fullFileNames("*.txt")
print(len(fileList))

# Strip the SGML sentence tags (<s>, </s>) from every POS file, in place.
# FIX: the original mixed tab and space indentation, which raises
# TabError under Python 3; normalized to 4-space indents.
for path in fileList:
    print(path)
    lines = []
    with open(path, 'r') as f:
        for line in f:
            lines.append(line.replace("<s>", "").replace("</s>", ""))
    # Rewrite the file with the tags removed.
    with open(path, 'w') as f:
        for line in lines:
            f.write(line)
示例#13
0
import os
from folderManager import Folder

inputFolder = Folder("/Users/caseycas/CodeNLP/EnglishSample/all/")
fileList = inputFolder.fullFileNames("*.tokens", recursive=True)

# Replace the "<UNK>" marker with plain "UNK" in every token file, in place.
for path in fileList:
    # FIX: use a context manager instead of the original leaked file handle.
    with open(path, 'r') as f:
        fileContents = f.read()
    fileContents = fileContents.replace("<UNK>", "UNK")
    with open(path, 'w') as f:
        # FIX: original wrote undefined name "fileContents4" (NameError).
        f.write(fileContents)