def main(moduleInfoFile, outFile):
  nAttempts = 0
  nSuccesses = 0
  nFailures = 0
  with open(moduleInfoFile, 'r') as inStream, \
       open(outFile, 'w') as outStream:
    for line in inStream:
      try:
        line = line.strip()
        if len(line) > 0:
          nAttempts += 1
          mi = libLF.ModuleInfo().initFromJSON(line)
          ghp = libLF.GitHubProject().initFromRaw(
            owner='UNKNOWN',
            name='UNKNOWN',
            registry=mi.registry,
            modules=[mi.toNDJSON()],
            tarballPath=mi.tarballPath)
          outStream.write(ghp.toNDJSON() + '\n')
          nSuccesses += 1
      except:
        libLF.log('Discarding: {}'.format(line))
        nFailures += 1
  libLF.log(
    'Out of {} ModuleInfo\'s: {} successful conversions, {} failures'.format(
      nAttempts, nSuccesses, nFailures))

def main(regexFile, outFile, parallelism, rngSeed, inputsPerGenerator, generatorTimeout):
  libLF.log('regexFile {} outFile {} parallelism {} rngSeed {} inputsPerGenerator {} generatorTimeout {}' \
    .format(regexFile, outFile, parallelism, rngSeed, inputsPerGenerator, generatorTimeout))
  if 0 <= rngSeed:
    random.seed(rngSeed)

  #### Load data
  # NB: argument order must match getTasks(regexFile, maxInputsPerGenerator, rngSeed, generatorTimeout)
  tasks = getTasks(regexFile, inputsPerGenerator, rngSeed, generatorTimeout)
  nRegexes = len(tasks)

  #### Process data
  # CPU-bound, no limits
  libLF.log('Submitting to map')
  results = libLF.parallel.map(tasks, parallelism,
    libLF.parallel.RateLimitEnums.NO_RATE_LIMIT,
    libLF.parallel.RateLimitEnums.NO_RATE_LIMIT,
    jitter=False)

  #### Emit results
  libLF.log('Writing results to {}'.format(outFile))
  nSuccesses = 0
  nExceptions = 0
  with open(outFile, 'w') as outStream:
    for rpai in results:
      # Emit
      if type(rpai) is libLF.RegexPatternAndInputs:
        nSuccesses += 1
        libLF.log('  Generated {} unique inputs for regex /{}/' \
          .format(len(rpai.getUniqueInputs()), rpai.pattern))
        outStream.write(rpai.toNDJSON() + '\n')
      else:
        nExceptions += 1
  libLF.log('Successfully performed input generation for {} regexes, {} exceptions'.format(nSuccesses, nExceptions))

def _findAnySLInputUsingCSharp(self, regex):
  libLF.log(
    'Testing whether this regex exhibits SL behavior in C#: /{}/'.format(regex.pattern))
  assert regex.evilInputs, "regex has no evil inputs"

  if EXPAND_EVIL_INPUT:
    # Expand the evil inputs.
    # The longer expansions may have higher growth rates (larger polynomials),
    # but may be buggy and not trigger mismatches properly.
    expandedEIs = []
    for ei in regex.evilInputs:
      expandedEIs += ei.expand()
    libLF.log("Considering {} EvilInput's, expanded from {}".format(
      len(expandedEIs), len(regex.evilInputs)))
    evilInputs = expandedEIs
  else:
    evilInputs = regex.evilInputs

  eng = 'csharp'
  # Iterate the (possibly expanded) evil inputs computed above
  for ei in evilInputs:
    engineBehavior = self._measureProductionEngineBehavior(
      regex, ei, maxQuerySec=10, useCSharpTimeout=False, engines=[eng])
    if engineBehavior[eng] == EngineBehavior.SuperLinear:
      return ei
  return None

def _evaluateRegex(self, pattern, testStrings, languages):
  """Evaluate this regex with these strings in these languages

  pattern: str
  testStrings: str[]
  languages: str[]

  @returns dict { lang: libLF.RegexEvaluationResult[], ... } for this request
  """
  with tempfile.NamedTemporaryFile(
      prefix='SemanticAnalysis-evaluateRegex-queryFile-',
      suffix='.json',
      delete=DELETE_TMP_FILES) as queryFile:
    # We can use the same queryFile for each language
    query = {
      "pattern": pattern,
      "inputs": testStrings,
    }
    libLF.writeToFile(queryFile.name, json.dumps(query))

    lang2rers = {}
    for lang in languages:
      try:
        # Might time out -- if so, just ignore the language
        lang2rers[lang] = self._queryRegexInLang(pattern, queryFile.name, lang)
      except Exception as err:
        libLF.log('_evaluateRegex: exception in {}: {}'.format(lang, err))
    return lang2rers

def queryHelper(self, regex, commandToRunFmt):
  """query() might benefit from this

  regex: the libLF.Regex to query
  commandToRunFmt: format string with the invocation
    We apply commandToRunFmt.format(inFile, outFile)
      inFile: contains a libLF.Regex, NDJSON formatted
      outFile: contains a libLF.RegexPatternAndInputs, NDJSON formatted

  @returns: GeneratorQueryResponse[]
  """
  libLF.log('queryHelper for {}:\n  regex /{}/\n  command {}' \
    .format(self.name, regex.pattern, commandToRunFmt))
  gqrs = []
  with tempfile.NamedTemporaryFile(prefix='GenInput-DriverQueryFile-',
                                   suffix='.json',
                                   delete=DELETE_TMP_FILES) as inFile, \
       tempfile.NamedTemporaryFile(prefix='GenInput-DriverOutFile-',
                                   suffix='.json',
                                   delete=DELETE_TMP_FILES) as outFile:
    libLF.writeToFile(inFile.name, regex.toNDJSON())
    rc, out = libLF.runcmd(commandToRunFmt.format(inFile.name, outFile.name))
    if rc == 0:
      with open(outFile.name, 'r') as inStream:
        contents = inStream.read()
        rpai = libLF.RegexPatternAndInputs().initFromNDJSON(contents)
        for producer in rpai.stringsByProducer:
          gqr = GeneratorQueryResponse(producer, rpai.stringsByProducer[producer])
          gqrs.append(gqr)
  return gqrs

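# Usage sketch for queryHelper (illustrative only: MyGenerator and the
# 'regex-generator-cli' command are hypothetical names, not part of this
# codebase). It shows the format-string contract: two positional {} slots,
# the input file first and the output file second.
#
#   class MyGenerator(InputGenerator):
#     def query(self, regex):
#       return self.queryHelper(regex, "regex-generator-cli --in {} --out {}")
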
def main(slraFile, visDir):
  libLF.log("slraFile {} visDir {}".format(slraFile, visDir))

  #### Load data
  allSLRAs = loadSLRAFile(slraFile)

  allSLRAs_timedOut = getTimedOutSet(allSLRAs)
  libLF.log('{} timed out'.format(len(allSLRAs_timedOut)))

  allSLRAs_crossRegistry = getCrossRegistrySet(allSLRAs)
  libLF.log('{} cross registry'.format(len(allSLRAs_crossRegistry)))

  allSLRAs_crossRegistry_timedOut = allSLRAs_timedOut & allSLRAs_crossRegistry
  libLF.log('{} timed out and cross registry'.format(
    len(allSLRAs_crossRegistry_timedOut)))

  #### Reports
  libLF.log("Creating reports (to stdout)")
  print("\n\n")
  makeReport_variantEffectiveness(allSLRAs)
  print("\n\n")
  makeReport_detectorEffectiveness(allSLRAs)
  print("\n\n")
  makeReport_perRegistryTimeouts(allSLRAs, visDir)
  print("\n\n")
  makeReport_portability(allSLRAs, visDir)

def isSupportedInLanguage(self, lang, vrdPath=DEFAULT_VULN_REGEX_DETECTOR_ROOT):
  """Returns True if regex can be used in lang

  Also updates internal member.
  """
  checkRegexSupportScript = os.path.join(vrdPath, 'src', 'validate', 'check-regex-support.pl')

  # Build query
  query = {"language": lang, "pattern": self.pattern}
  libLF.log('Query: {}'.format(json.dumps(query)))

  # Query from tempfile
  with tempfile.NamedTemporaryFile(prefix='SyntaxAnalysis-queryLangs-',
                                   suffix='.json',
                                   delete=True) as ntf:
    libLF.writeToFile(ntf.name, json.dumps(query))
    rc, out = libLF.runcmd("VULN_REGEX_DETECTOR_ROOT={} '{}' '{}'" \
      .format(vrdPath, checkRegexSupportScript, ntf.name))
  out = out.strip()
  libLF.log('Got rc {} out\n{}'.format(rc, out))

  # TODO Not sure if this can go wrong.
  assert rc == 0

  obj = json.loads(out)
  if bool(obj['validPattern']):
    if lang not in self.supportedLangs:
      self.supportedLangs.append(lang)
    return True
  else:
    return False

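# Usage sketch (illustrative): probing a regex's portability across languages.
# It assumes a libLF.Regex-like object exposing this method and the
# supportedLangs member it updates; the lowercase language names follow this
# codebase's convention, but the exact set to test is an assumption here.
#
#   portableLangs = [lang for lang in ['javascript', 'python', 'java']
#                    if regex.isSupportedInLanguage(lang)]
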
def _extractStatic(self, outputFile, timeout):
  """Static regex extraction"""
  cmd = [staticRegexExtractorCLI,
         "--out-file", outputFile,
         "--registry", self.ghp.registry,
         "--src-path", self.ghp.tarballPath]

  fd, logFileName = tempfile.mkstemp(prefix="extract-regexes-static-log", suffix=".log")
  os.close(fd)
  with open(logFileName, 'w') as logFile:
    libLF.log("CMD: {} > {} 2>&1".format(" ".join(cmd), logFileName))
    rc = 1
    try:
      tmo = None if timeout <= 0 else timeout
      res = subprocess.run(cmd, stdout=logFile, stderr=logFile, timeout=tmo)
      rc = res.returncode
    except subprocess.TimeoutExpired:
      libLF.log("Static extraction timed out")
      rc = -1

  if rc != 0:
    raise IOError(
      'Error, static extractor yielded rc {}. Examine {}'.format(rc, logFileName))

  if DELETE_TMP_FILES:
    os.unlink(logFileName)

def setPropertiesJavaVersionToAtLeast(etree, minVers):
  """Update the Java source version specified with global properties to minVers+.

  Returns the numeric versions[] for maven.compiler.{source,target} and java.version.
  The versions[] entries are None if the field is not in the etree.
  """
  # https://maven.apache.org/plugins/maven-compiler-plugin/examples/set-compiler-source-and-target.html
  libLF.log("etree.findall")
  root = etree.getroot()
  defaultNamespace = getDefaultNamespace(root)

  versions = []
  for subPath in [
    ["properties", "maven.compiler.source"],
    ["properties", "maven.compiler.target"],
    ["properties", "java.version"],
  ]:
    fullPath = ["."] + [defaultNamespace + p for p in subPath]
    libLF.log("Searching fullPath: {}".format(fullPath))
    xmlMatch = root.find("/".join(fullPath))
    if xmlMatch is not None:
      libLF.log("Found {} - text {}".format(subPath, xmlMatch.text))
      currVers = textToJavaVersion(xmlMatch.text)
      if currVers is not None and currVers < minVers:
        libLF.log("Updating version from {} to {}".format(currVers, minVers))
        xmlMatch.text = javaVersionToText(minVers)
      versions.append(textToJavaVersion(xmlMatch.text))
    else:
      libLF.log("Did not find {}".format(subPath))
      versions.append(None)
  return versions

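# Usage sketch (illustrative): bumping a pom.xml to Java 8+. The "pom.xml"
# path is hypothetical; getDefaultNamespace, textToJavaVersion, and
# javaVersionToText are the helpers the function above relies on.
#
#   import xml.etree.ElementTree as ET
#   tree = ET.parse("pom.xml")
#   versions = setPropertiesJavaVersionToAtLeast(tree, 8)
#   tree.write("pom.xml")  # Persist the in-place edits
#                          # (NB: ElementTree may rewrite namespace prefixes)
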
def validateDetectorOpinionsInLang(self, lang):
  """Test the DOs in this language. Returns self"""
  lang = lang.lower()
  if lang not in self.SUPPORTED_LANGS:
    raise ValueError('Unsupported language {}'.format(lang))
  libLF.log('Validating detector opinions for <{}> in {}'.format(self.regex.pattern, lang))

  self.lang_pump2timedOut[lang] = {}
  for do in self.detectorOpinions:
    if do.isVuln:
      for ei in do.evilInputs:
        if ei.couldParse:
          # Try this EvilInput from this SLRegexDetectorOpinion
          slrvs = self._testEvilInputInLang(ei, lang)
          for slrv in slrvs:
            # Is this a valid pattern?
            if slrv.validPattern:
              self.lang_validPattern[lang] = True
              # Did we get a timeout this time?
              if slrv.nPumps not in self.lang_pump2timedOut[lang]:
                # Never tried this nPumps before, keep whatever we got
                self.lang_pump2timedOut[lang][slrv.nPumps] = slrv.timedOut
              elif self.lang_pump2timedOut[lang][slrv.nPumps] is False:
                # Tried this nPumps before but didn't get a timeout, keep whatever we got
                self.lang_pump2timedOut[lang][slrv.nPumps] = slrv.timedOut
              else:
                # We have already seen this nPumps time out, so don't care
                pass
            else:
              # Invalid pattern, mark it as such
              self.lang_validPattern[lang] = False

  libLF.log('validateDetectorOpinionsInLang: regex <{}> lang_pump2timedOut[{}] = {}'.format(
    self.regex.pattern, lang, self.lang_pump2timedOut[lang]))
  return self

def query(selectionScheme, encodingScheme, queryFile, timeout=None):
  """Query the engine

  selectionScheme: SELECTION_SCHEME
  encodingScheme: ENCODING_SCHEME
  queryFile: file path
  timeout: integer seconds before raising subprocess.TimeoutExpired

  returns: EngineMeasurements
  raises: on rc != 0, or on timeout
  """
  rc, stdout, stderr = libLF.runcmd_OutAndErr(args=[
    ProtoRegexEngine.CLI,
    ProtoRegexEngine.SELECTION_SCHEME.scheme2cox[selectionScheme],
    ProtoRegexEngine.ENCODING_SCHEME.scheme2cox[encodingScheme],
    '-f', queryFile
  ], timeout=timeout)

  if rc != 0:
    if "syntax error" in stderr:
      raise SyntaxError(
        "Engine raised syntax error\n  rc: {}\nstdout:\n{}\n\nstderr:\n{}"
        .format(rc, stdout, stderr))
    else:
      raise BaseException(
        'Invocation failed; rc {} stdout\n  {}\n\nstderr\n  {}'.format(rc, stdout, stderr))

  res = re.search(r"Need (\d+) bits", stdout)
  if res:
    libLF.log("Wished for {} bits".format(res.group(1)))

  # libLF.log("stderr: <" + stderr + ">")
  return ProtoRegexEngine.EngineMeasurements(stderr.strip(), "-no match-" in stdout)

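# Usage sketch (illustrative): a single full-memoization query against the
# prototype engine, mirroring how _collectData in this codebase drives it.
# The regex, input string, and 1-second timeout are assumptions for the
# example; buildQueryFile takes (pattern, input) as used elsewhere here.
#
#   queryFile = libMemo.ProtoRegexEngine.buildQueryFile(r"(a+)+$", "a" * 20 + "!")
#   try:
#     em = libMemo.ProtoRegexEngine.query(
#       libMemo.ProtoRegexEngine.SELECTION_SCHEME.SS_Full,
#       libMemo.ProtoRegexEngine.ENCODING_SCHEME.ES_None,
#       queryFile, timeout=1)
#     print(em.si_simTimeUS)
#   except subprocess.TimeoutExpired:
#     pass  # Treat as a super-linear blow-up
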
def getMutRexInputs(pattern, timeout):
  """Return inputs: str[]"""
  # Build command to run
  cmd = ["java", "-jar", MUTREX_PATH, pattern]
  libLF.log('cmd: ' + " ".join(cmd))

  # Get inputs, guarded by a timeout
  tmo = None if timeout < 0 else timeout
  inputs = []
  try:
    completedProcess = subprocess.run(cmd, stdout=subprocess.PIPE, timeout=tmo)
    libLF.log('rc {}'.format(completedProcess.returncode))
    if completedProcess.returncode == 0:
      inputs = processMutRexOutput(completedProcess.stdout.decode('utf-8'))
  except subprocess.TimeoutExpired as tmoErr:
    libLF.log("MutRex timed out")
    try:
      libLF.log("Salvaging any strings streamed by MutRex before the timeout")
      # The TimeoutExpired exception carries whatever stdout was captured
      inputs = processMutRexOutput(tmoErr.stdout.decode('utf-8'))
    except Exception:
      pass
  except Exception as err:
    libLF.log('Exception: ' + str(err))
  return inputs

def _collectData(self, pumps, doMemo):
  matchTimes = []

  if doMemo:
    ss = libMemo.ProtoRegexEngine.SELECTION_SCHEME.SS_Full
  else:
    ss = libMemo.ProtoRegexEngine.SELECTION_SCHEME.SS_None
  es = libMemo.ProtoRegexEngine.ENCODING_SCHEME.ES_None

  matchTimeMS = 0
  for pump in pumps:
    if matchTimeMS > MAX_MATCH_MS:
      break
    times = []

    # Query
    libLF.log("nick {} nPumps {}".format(self.nick, pump))
    queryFile = libMemo.ProtoRegexEngine.buildQueryFile(
      self.regex.pattern, self.regex.evilInputs[0].build(pump))
    for _ in range(NTRIALS):
      em = libMemo.ProtoRegexEngine.query(ss, es, queryFile)
      matchTimeMS = max(int(em.si_simTimeUS / 1000), 10)
      times.append(matchTimeMS)
    med = statistics.median(times)
    os.unlink(queryFile)

    # Record
    matchTimes.append(med)
    libLF.log("nPumps {} matchTimeMS {}".format(pump, matchTimes[-1]))
  return matchTimes

def _testEvilInputInLang(self, evilInput, lang):
  """Returns an SLRegexValidation[] with EXP and POW pumps"""
  # Build query
  query = {
    'language': lang.lower(),
    'pattern': self.regex.pattern,
    'evilInput': json.loads(evilInput.toNDJSON()),
    'nPumps': -1,  # Needs a valid value
    'timeLimit': self.slTimeout,
  }

  slRegexVals = []
  for nPumps in [self.EXP_PUMPS, self.powerPumps]:
    query['nPumps'] = nPumps
    libLF.log('query: {}'.format(json.dumps(query)))
    with tempfile.NamedTemporaryFile(prefix='SLRegexAnalysis-validateOpinion-',
                                     suffix='.json',
                                     delete=True) as ntf:
      libLF.writeToFile(ntf.name, json.dumps(query))
      rc, out = libLF.runcmd("VULN_REGEX_DETECTOR_ROOT={} '{}' '{}' 2>>/tmp/err" \
        .format(self.vrdPath, self.testInLanguageScript, ntf.name))
    out = out.strip()
    libLF.log('Got rc {} out\n{}'.format(rc, out))
    slRegexVals.append(
      SLRegexValidation(self.regex.pattern, evilInput, json.loads(out)))
  return slRegexVals

def visitFrontier(self, tokenNodes):
  # Consider every sequence of four nodes: Regex, ::, new, and (...)
  for i in range(0, len(tokenNodes) - 4 + 1):
    regexNode = tokenNodes[i]
    colonsNode = tokenNodes[i + 1]
    newNode = tokenNodes[i + 2]
    delimitedNode = tokenNodes[i + 3]
    if type(regexNode) is dict \
       and type(colonsNode) is dict \
       and type(newNode) is dict \
       and type(delimitedNode) is dict:
      try:
        regexNode_true = regexNode['variant'] == "Token" \
          and regexNode['fields'][1]['variant'] == "Ident" \
          and regexNode['fields'][1]['fields'][0] == "Regex"
        colonsNode_true = colonsNode['variant'] == "Token" \
          and colonsNode['fields'][1] == "ModSep"
        newNode_true = newNode['variant'] == "Token" \
          and newNode['fields'][1]['variant'] == "Ident" \
          and newNode['fields'][1]['fields'][0] == "new"
        delimitedNode_true = delimitedNode['variant'] == "Delimited" \
          and delimitedNode['fields'][1]['delim'] == "Paren"

        if regexNode_true and colonsNode_true and newNode_true and delimitedNode_true:
          libLF.log('Found Regex::new(...)')
          self._extractRegexPattern(delimitedNode)
      except BaseException:
        pass

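# For reference, a sketch of the four-token window this visitor matches in the
# Rust AST's JSON dump. The field layout is inferred from the checks above;
# the 'span' placeholders stand in for whatever occupies fields[0].
#
#   [
#     {"variant": "Token", "fields": [span, {"variant": "Ident", "fields": ["Regex"]}]},
#     {"variant": "Token", "fields": [span, "ModSep"]},                       # '::'
#     {"variant": "Token", "fields": [span, {"variant": "Ident", "fields": ["new"]}]},
#     {"variant": "Delimited", "fields": [span, {"delim": "Paren", ...}]}     # '(...)'
#   ]
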
def main(semanticsTestsFile, semanticOnly, performanceTestsFile, perfOnly):
  libLF.log('semanticsTestsFile {} semanticOnly {} performanceTestsFile {} perfOnly {}' \
    .format(semanticsTestsFile, semanticOnly, performanceTestsFile, perfOnly))

  #### Check dependencies
  libLF.checkShellDependencies(shellDeps)

  #### Load and run each test
  summary = []
  for testType, testsFile in [(TestSuite.SEMANTIC_TEST, semanticsTestsFile),
                              (TestSuite.PERF_TEST, performanceTestsFile)]:
    if perfOnly and testType != TestSuite.PERF_TEST:
      continue
    if semanticOnly and testType != TestSuite.SEMANTIC_TEST:
      continue

    libLF.log("Loading {} tests from {}".format(testType, testsFile))
    ts = TestSuite(testsFile, testType)

    libLF.log("Running {} tests".format(testType))
    nFailures = ts.run()

    summary.append("{} tests from {}: {} failures".format(testType, testsFile, nFailures))

  libLF.log("****************************************")
  for line in summary:
    libLF.log("  " + line)

def enhanceFromRegistry(self):
  """Enhance this object's members by contacting the registry"""
  # PyPI API is documented here: https://wiki.python.org/moin/PyPIJSON
  # And here: https://warehouse.readthedocs.io/api-reference/json/
  jsonEndpoint = 'https://pypi.org/pypi/{}/json'.format(self.name)
  try:
    response = requests.get(jsonEndpoint, verify=False, timeout=0.5)
    obj = json.loads(response.text)

    # There are several viable places for a GitHub URI to live.
    allUris = []
    try:
      allUris.append(obj['info']['home_page'])
    except:
      pass
    try:
      allUris.append(obj['info']['project_urls']['Homepage'])
    except:
      pass
    try:
      allUris.append(obj['info']['project_url'])
    except:
      pass

    # Filter out 'UNKNOWN'
    allUris = [uri for uri in allUris
               if uri is not None and uri.upper() != 'UNKNOWN']

    # Pick out prioritized lists using regexes.
    githubUris = [uri for uri in allUris if re.search(r'github', uri)]
    popularVCSUris = [uri for uri in allUris
                      if re.search(r'github|bitbucket|gitlab|sourceforge', uri)]

    # How'd we do?
    vcsUri = None
    if len(githubUris):
      vcsUri = githubUris[0]
    elif len(popularVCSUris):
      vcsUri = popularVCSUris[0]
    elif len(allUris):
      vcsUri = allUris[0]

    if vcsUri:
      libLF.log('Got a vcsUri: {}'.format(vcsUri))
      self.vcsUri = vcsUri
    else:
      libLF.log('No vcsUri')
  except KeyboardInterrupt:
    raise
  except:
    pass
  return self

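# For reference, an abridged sketch of the PyPI JSON fields inspected above
# (values are hypothetical; only the keys actually read are shown):
#
#   {
#     "info": {
#       "home_page": "https://github.com/owner/project",
#       "project_urls": {"Homepage": "https://github.com/owner/project"},
#       "project_url": "https://pypi.org/project/project/"
#     }
#   }
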
def makeReport_regexStringLen(df, visDir):
  libLF.log("Making report on regex string len distr ({} rows)".format(df.shape[0]))

  libLF.log("Regex string lengths, by language")
  print(df.groupby('lang')['len'].describe())

  sns.boxplot(data=df, x="lang", y="len")
  plt.show()

def main(mavenProjDir):
  # cd to the dir
  libLF.log("cd {}".format(mavenProjDir))
  os.chdir(mavenProjDir)

  if MIN_JAVA_VERSION_FOR_INSTRUMENTATION_APPROACH >= 0:
    libLF.log("Updating source/target to Java {}".format(MIN_JAVA_VERSION_FOR_INSTRUMENTATION_APPROACH))
    updateSourceAndTargetToMinVers()

def cleanUp(tmpDir):
  if CLEAN_TMP_DIR:
    libLF.log('cleanUp: Wiping {}'.format(tmpDir))
    try:
      if os.path.isdir(tmpDir):  # Make sure we don't touch tarballs
        shutil.rmtree(tmpDir)
    except:
      pass

def getTasks(regexFile, maxInputsPerGenerator, rngSeed, generatorTimeout):
  regexes = loadRegexFile(regexFile)
  tasks = [MyTask(regex, maxInputsPerGenerator, rngSeed, generatorTimeout)
           for regex in regexes]
  libLF.log('Prepared {} tasks'.format(len(tasks)))
  return tasks

def unpackTarball(ghp):
  """Unpack tarball. Returns untarred root dir."""
  tmpDir = tempfile.mkdtemp(prefix=tmpFilePrefix + "-untar-dir-")
  libLF.log('Unpacking {} to {}'.format(ghp.tarballPath, tmpDir))
  with tarfile.open(ghp.tarballPath, "r:gz") as tar:
    tar.extractall(path=tmpDir)
  return tmpDir

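# Usage sketch (illustrative): the unpack/clean-up pairing. cleanUp is the
# helper shown earlier in this section; whether the two live in the same
# module is an assumption here.
#
#   untarDir = unpackTarball(ghp)
#   try:
#     ...  # analyze the project under untarDir
#   finally:
#     cleanUp(untarDir)
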
def findBuildDir(self, projRoot):
  if self.buildFile is not None:
    libLF.log("Searching for {} under {}".format(self.buildFile, projRoot))
    buildDir = self._findShallowestDirContaining(projRoot, self.buildFile)
    return buildDir
  return projRoot

def unpackTarball(tarball):
  tmpDir = os.path.join(os.sep, 'tmp', 'regex-extractor', str(os.getpid()))
  libLF.log('Unpacking {} to {}'.format(tarball, tmpDir))
  if not os.path.exists(tmpDir):
    os.makedirs(tmpDir)
  with tarfile.open(tarball, "r:gz") as tar:
    tar.extractall(path=tmpDir)
  return tmpDir

def processMutRexOutput(outputFromMutRex):
  inputs = set()
  # Lines look like: "..." (REJECT) or "..." (CONF) -- the "..." may include newlines
  libLF.log('processMutRexOutput: out: \n' + outputFromMutRex)
  for match in re.findall(r"^\"(.*?)\" \((?:REJECT|CONF)\)",
                          outputFromMutRex, re.MULTILINE | re.DOTALL):
    libLF.log("match: <{}>".format(match))
    inputs.add(match)
  return sorted(list(inputs))

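# A worked example of the parse (hypothetical MutRex output; per-match log
# lines elided). Only the REJECT/CONF tags the regex above accepts survive:
#
#   >>> processMutRexOutput('"aaa!" (REJECT)\n"ab" (CONF)\n"skipped" (OTHER)')
#   ['aaa!', 'ab']
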
def preprocessProject(ghp, untarDir):
  """Preprocess project using the appropriate plugin"""
  preprocessor = registryToPaths[ghp.registry]['preprocessor']
  projDir = getProjDir(untarDir)

  cmd = [preprocessor, "--proj-dir", projDir]
  libLF.log(" ".join(cmd))
  subprocess.run(cmd)

def tryBuild(self, projRoot, pythonBinary):
  self._cdToBuildDir(projRoot)

  fullCmd = "{} {} test".format(pythonBinary, self.buildFile)
  libLF.log("CMD: {}".format(fullCmd))
  cmdWords = fullCmd.strip().split(" ")
  res = subprocess.run(cmdWords)
  return res.returncode == 0

def main(regexFile, visDir):
  libLF.log('regexFile {} visDir {}'.format(regexFile, visDir))

  #### Load data
  regexes = loadRegexFile(regexFile)

  #### Generate reports
  makeReport_regexesNotSupportedInSourceRegistries(regexes, visDir)
  makeReport_syntaxSupportSummary(regexes, visDir)

def processBricsOutput(outputFromBrics):
  inputs = set()
  # Lines look like: (N) getRandomStringX: STR: "..."
  # (The "..." is JSON-formatted)
  #libLF.log('processBricsOutput: out: \n' + outputFromBrics)
  for match in re.findall(r"^\(\w+\) getRandomString\w+: STR: \"(.*?)\"$",
                          outputFromBrics, re.MULTILINE):
    libLF.log("match: <{}>".format(match))
    inputs.add(match)
  return sorted(list(inputs))

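# A worked example of the parse (hypothetical brics output; per-match log
# lines elided). The line format is the one the regex above accepts:
#
#   >>> processBricsOutput('(1) getRandomStringAccept: STR: "foo"\n(2) getRandomStringReject: STR: "ba r"')
#   ['ba r', 'foo']
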
def checkRegistryDependencies(registry):
  libLF.checkShellDependencies(['cloc'], mustBeExecutable=True)

  paths = [
    registryToPaths[registry]['preprocessor'],
    *registryToPaths[registry]['instrumentor'].values(),
    registryToPaths[registry]['moduleRunner'],
  ]
  libLF.log("Checking paths for registry {}: {}".format(registry, paths))
  libLF.checkShellDependencies(paths, mustBeExecutable=False)