예제 #1
0
def loadRegexFile(regexFile, setStaticToAll):
  """Return a list of Regex's"""
  loaded = []
  libLF.log('Loading regexes from {}'.format(regexFile))
  with open(regexFile, 'r') as inStream:
    for rawLine in inStream:
      ndjson = rawLine.strip()
      # Blank lines carry no regex
      if not ndjson:
        continue

      try:
        # One NDJSON-encoded Regex per non-blank line
        parsed = libLF.Regex()
        parsed.initFromNDJSON(ndjson)

        # Keep only entries whose pattern is a non-empty string
        if not (type(parsed.pattern) is str and len(parsed.pattern) >= 1):
          continue

        # Populate static langs used in if it is not set.
        # This should only be because it was not set during the LF project.
        if setStaticToAll:
          if len(parsed.useCount_registry_to_nModules_static) != 0:
            raise ValueError("Error, you told me to setStaticToAll but it looks like static language use is non-empty")
          parsed.useCount_registry_to_nModules_static = parsed.useCount_registry_to_nModules

        loaded.append(parsed)
      except KeyboardInterrupt:
        raise
      except BaseException as err:
        # Best-effort: log the bad line and move on
        libLF.log('Exception parsing line:\n  {}\n  {}'.format(ndjson, err))
        traceback.print_exc()

    libLF.log('Loaded {} regexes from {}'.format(len(loaded), regexFile))
    return loaded
예제 #2
0
  def initFromNDJSON(self, jsonStr):
    """Initialize this object from one NDJSON-encoded line.

    jsonStr: an NDJSON string as emitted by the matching toNDJSON().
    Mutates self; relies on class-level defaults MATCH_TIMEOUT_SEC and
    POW_PUMPS when the optional fields are absent.
    """
    obj = libLF.fromNDJSON(jsonStr)
    self.regex = libLF.Regex().initFromDict(obj['regex'])

    # Optional fields fall back to the class-level defaults.
    self.slTimeout = obj.get('slTimeout', self.MATCH_TIMEOUT_SEC)
    self.powerPumps = obj.get('powerPumps', self.POW_PUMPS)

    self.detectorOpinions = [
      SLRegexDetectorOpinion().initFromDict(doDict) for doDict in obj['detectorOpinions']
    ]

    # Get the lang_validPattern dict.
    # The keys are bools, easy conversion.
    self.lang_validPattern = obj['lang_validPattern']

    # Get the lang_pump2timedOut dict.
    # The keys on pump2timedOut should be integers, but they may have been
    # converted to strings (JSON object keys always are). Convert back again.
    # BUG FIX: the old code inserted and del'd keys while iterating the dict,
    # which raises "RuntimeError: dictionary changed size during iteration"
    # in Python 3. Rebuild each inner dict instead. (Replacing the value of
    # an existing outer key during items() iteration is safe.)
    self.lang_pump2timedOut = obj['lang_pump2timedOut']
    for lang, pump2timedOut in self.lang_pump2timedOut.items():
      self.lang_pump2timedOut[lang] = {
        (int(k) if type(k) is str else k): timedOut
        for k, timedOut in pump2timedOut.items()
      }
예제 #3
0
def loadRegexFile(regexFile):
    """Return a list of Regex's"""
    libLF.log('Loading regexes from {}'.format(regexFile))
    collected = []
    with open(regexFile, 'r') as inStream:
        for rawLine in inStream:
            stripped = rawLine.strip()
            # Skip blank lines
            if not stripped:
                continue

            try:
                # One NDJSON-encoded Regex per non-blank line
                candidate = libLF.Regex()
                candidate.initFromNDJSON(stripped)

                collected.append(candidate)
            except KeyboardInterrupt:
                raise
            except BaseException as err:
                # Best-effort: log the bad line and keep going
                libLF.log('Exception parsing line:\n  {}\n  {}'.format(
                    stripped, err))
                traceback.print_exc()

        libLF.log('Loaded {} regexes from {}'.format(len(collected), regexFile))
        return collected
def main(internetPatternsFile, realPatternsFile, writingDifficultyThreshold):
    """Report how many real regexes also occur among internet-sourced patterns.

    Only patterns scoring at least writingDifficultyThreshold are compared;
    easier patterns could plausibly be written independently.
    """
    with ExitStack() as stack:
        internetStream = stack.enter_context(open(internetPatternsFile, 'r'))
        realStream = stack.enter_context(open(realPatternsFile, 'r'))

        internetPatternsDict = getInternetPatternsDict(internetStream)

        nMatched = 0
        nRegexes = 0
        nRealRegexesAtLeastXDifficult = 0
        for line in realStream:
            # Skip blank lines
            if re.match(r'^\s*$', line):
                continue

            try:
                regex = libLF.Regex().initFromNDJSON(line)
                nRegexes += 1

                # Discard patterns that could be independently derived.
                difficulty = libLF.scorePatternWritingDifficulty(regex.pattern)
                if difficulty < writingDifficultyThreshold:
                    continue
                nRealRegexesAtLeastXDifficult += 1

                if regex.pattern in internetPatternsDict:
                    libLF.log(
                        'realPattern /{}/ matches internet source'.format(
                            regex.pattern))
                    nMatched += 1
                elif VERBOSE:
                    libLF.log(
                        'realPattern /{}/ does not match internet source'.
                        format(regex.pattern))
            except Exception as e:
                # Best-effort: a malformed line is logged and skipped
                libLF.log("Exception?: {}".format(e))

        # Second pass: how many internet patterns clear the same bar?
        nInternetRegexesAtLeastXDifficult = sum(
            1 for pat in internetPatternsDict
            if libLF.scorePatternWritingDifficulty(pat) >= writingDifficultyThreshold
        )

        # Print summary
        print(
            '{}/{} real regexes matched any of the {} internet regexes (among the {} real regexes and {} internet regexes at least {} difficult)'
            .format(nMatched, nRegexes,
                    len(internetPatternsDict), nRealRegexesAtLeastXDifficult,
                    nInternetRegexesAtLeastXDifficult,
                    writingDifficultyThreshold))
def main(regexFiles, outFile):
    """Collect the globally-unique regex patterns from regexFiles into outFile.

    regexFiles: iterable of paths to NDJSON regex-usage files.
    outFile: destination path; one NDJSON-encoded libLF.Regex per line.
    """
    # Load
    nRegexUsages = 0
    file2regexUsages = {}
    for f in regexFiles:
        file2regexUsages[f] = loadRegexUsages(f)
        nRegexUsages += len(file2regexUsages[f])
    libLF.log('Loaded {} regexUsage\'s from {} files'.format(
        nRegexUsages, len(regexFiles)))

    # Identify unique regexes in each file.
    nPerFileUniquePatterns = 0
    file2patterns = {}
    for f in file2regexUsages:
        file2patterns[f] = set([ru.pattern for ru in file2regexUsages[f]])
        # 'DYNAMIC' marks usages whose pattern could not be extracted; drop it.
        file2patterns[f].discard('DYNAMIC')
        nPerFileUniquePatterns += len(file2patterns[f])
        libLF.log('{} unique patterns in {}:\n{}' \
            .format(len(file2patterns[f]), f, pprint.pformat(sorted(file2patterns[f]))))
    libLF.log('Counting unique regexes per file, got {} unique regexes'.format(
        nPerFileUniquePatterns))

    # Identify global unique regexes.
    uniqPatterns = set()
    for f in file2patterns:
        uniqPatterns |= file2patterns[f]
    libLF.log('Globally, got {} unique regexes'.format(len(uniqPatterns)))

    # Did we find any intersections among files? Pigeonhole principle.
    if len(uniqPatterns) < nPerFileUniquePatterns:
        # A pattern is a duplicate iff it appears in more than one file's set.
        # Single O(n) pass over the per-file sets instead of the former
        # O(n^2) list.count() scan.
        seen = set()
        duplicates = set()
        for perFilePatterns in file2patterns.values():
            duplicates |= seen & perFilePatterns
            seen |= perFilePatterns
        libLF.log('{} regexes appeared in multiple files: {}'.format(
            len(duplicates), duplicates))
    else:
        # (The old code passed an argument to .format() on a string with no
        # placeholder -- the argument was dead. Message is unchanged.)
        libLF.log('Each unique regex appeared in only 1 file')

    # Emit
    regexes = [libLF.Regex().initFromRaw(p, {}, {}) for p in uniqPatterns]
    libLF.log('Emitting to {}'.format(outFile))
    with open(outFile, 'w') as outStream:
        for regex in regexes:
            outStream.write(regex.toNDJSON() + '\n')
예제 #6
0
def main(regexFile, outFile, seed, nInputs, timeout):
    """Generate Rex inputs for the regex in regexFile; write NDJSON to outFile."""
    libLF.log('regexFile {} outFile {} seed {} nInputs {} timeout {}' \
      .format(regexFile, outFile, seed, nInputs, timeout))

    # Get the libLF.Regex
    with open(regexFile, 'r') as inStream:
        ndjson = inStream.read()
    regex = libLF.Regex().initFromNDJSON(ndjson)
    libLF.log('Generating inputs for regex /{}/'.format(regex.pattern))

    # Query Rex
    producerToInputs = getRexInputs(regex.pattern, seed, nInputs, timeout)

    # Emit
    rpai = libLF.RegexPatternAndInputs().initFromRaw(
        regex.pattern, producerToInputs)
    libLF.log('Rex generated {} unique inputs for regex /{}/ ({} including duplicates)' \
      .format(len(rpai.getUniqueInputs()), regex.pattern, rpai.getNTotalInputs()))
    with open(outFile, 'w') as outStream:
        outStream.write(rpai.toNDJSON())
예제 #7
0
def main(regexFile, outFile, timeout):
    """Generate ReScue inputs for the regex in regexFile; write NDJSON to outFile."""
    libLF.log('regexFile {} outFile {} timeout {}' \
      .format(regexFile, outFile, timeout))

    # Get the libLF.Regex
    with open(regexFile, 'r') as inStream:
        ndjson = inStream.read()
    regex = libLF.Regex().initFromNDJSON(ndjson)
    libLF.log('Generating inputs for regex /{}/'.format(regex.pattern))

    # Query ReScue
    generatedInputs = getReScueInputs(regex.pattern, timeout)
    libLF.log('ReScue generated {} inputs for regex /{}/'.format(
        len(generatedInputs), regex.pattern))

    # Emit
    rpai = libLF.RegexPatternAndInputs().initFromRaw(
        regex.pattern, {"ReScue": generatedInputs})
    with open(outFile, 'w') as outStream:
        outStream.write(rpai.toNDJSON())
예제 #8
0
def main(internetPatternsFile, realPatternsFile, writingDifficultyThreshold):
    """Count how many real regexes match an internet-sourced pattern.

    internetPatternsFile: file of internet-sourced patterns
      (parsed by getInternetPatternsDict).
    realPatternsFile: NDJSON file of libLF.Regex entries.
    writingDifficultyThreshold: patterns scoring below this are discarded
      as independently derivable.
    """
    with ExitStack() as stack:
        internetPatternsStream = stack.enter_context(
            open(internetPatternsFile, 'r'))
        realPatternsStream = stack.enter_context(open(realPatternsFile, 'r'))

        internetPatternsDict = getInternetPatternsDict(internetPatternsStream)
        nRegexesMatchingInternetRegex = 0

        for line in realPatternsStream:
            # Skip blank lines
            if re.match(r'^\s*$', line):
                continue

            try:
                regex = libLF.Regex().initFromNDJSON(line)

                # Discard patterns that could be independently derived.
                if libLF.scorePatternWritingDifficulty(
                        regex.pattern) < writingDifficultyThreshold:
                    continue

                if regex.pattern in internetPatternsDict:
                    libLF.log(
                        'realPattern /{}/ matches internet source'.format(
                            regex.pattern))
                    nRegexesMatchingInternetRegex += 1
                else:
                    # BUG FIX: this branch referenced the undefined name
                    # 'obj', raising a NameError that the old bare except
                    # silently swallowed -- non-matching patterns were
                    # never actually logged.
                    libLF.log(
                        'realPattern /{}/ does not match internet source'.
                        format(regex.pattern))
            except Exception:
                # Best-effort: skip malformed lines. Narrowed from a bare
                # 'except:' so KeyboardInterrupt/SystemExit still propagate.
                pass

        libLF.log('{} regexes matched internet sources'.format(
            nRegexesMatchingInternetRegex))
예제 #9
0
  def run(self):
    """Compute per-regex metrics for every regex in self.regexList.

    Returns a list of RegexMetrics, one per regex and in the same order.
    On any exception it logs the error and returns self.regexList instead,
    so callers must distinguish the two result types.

    Which analyses run is controlled by self.analyses (AnalysisStages
    members); skipped stages are filled with sentinel values
    (-1 / {} / PRED_COMPLEXITY_UNKNOWN).
    """
    try:
      # Obtain C# patterns
      libLF.log("Generating C# patterns")
      # Replace u flag with i for compatibility with C# and to preserve the
      # presence or absence of flags.
      csharpPatterns = [
        libLF.RegexTranslator.translateRegex(regex.pattern, "", "C#", altUnicodeFlag='i')
        for regex in self.regexList
      ]

      for r, c in zip(self.regexList, csharpPatterns):
        libLF.log("MyTask: /{}/ -> /{}/".format(r.pattern, c))

      # Run the analyses
      if AnalysisStages.ANALYZE_AUTOMATON in self.analyses:
        libLF.log("ANALYZE_AUTOMATON")
        automataMeasures = self.runAutomataCLI(csharpPatterns)
        # Graph metrics need both automaton results and the
        # ANALYZE_SIMPLE_PATHS stage to have been requested.
        if len(automataMeasures) and AnalysisStages.ANALYZE_SIMPLE_PATHS in self.analyses:
          libLF.log("ANALYZE_SIMPLE_PATHS")
          nSimplePathsList, averageOutDegreeDensityList = self.computeGraphMetrics(automataMeasures)
        else:
          libLF.log("{} automataMeasures, analyses {} -- skipping computeGraphMetrics".format(len(automataMeasures), self.analyses))
          # -1 is the "not computed" sentinel for graph metrics
          nSimplePathsList = [ -1 for i in range(len(self.regexList)) ]
          averageOutDegreeDensityList = [ -1 for i in range(len(self.regexList)) ]
      else:
        # Automaton stage skipped entirely: placeholder values throughout
        automataMeasures = [ {} for i in range(len(self.regexList)) ]
        nSimplePathsList = [ -1 for i in range(len(self.regexList)) ]
        averageOutDegreeDensityList = [ -1 for i in range(len(self.regexList)) ]

      if AnalysisStages.ANALYZE_WORST_CASE in self.analyses:
        libLF.log("ANALYZE_WORST_CASE")
        # Perform worst-case analysis on the C#-translated regexes
        regexes_csharp = [
          libLF.Regex().initFromRaw(csharpPattern, {}, {})
          for csharpPattern in csharpPatterns 
        ]
        worstCaseSpencerList = self.predictWorstCaseSpencerPerformance(regexes_csharp)
      else:
        worstCaseSpencerList = [ libLF.SLRegexDetectorOpinion.PRED_COMPLEXITY_UNKNOWN for i in range(len(self.regexList)) ]
      
      # All per-regex lists must align with self.regexList for the zip below.
      libLF.log("Asserting lengths")
      assert(len(self.regexList) == len(csharpPatterns))
      assert(len(self.regexList) == len(automataMeasures))
      assert(len(self.regexList) == len(nSimplePathsList))
      assert(len(self.regexList) == len(worstCaseSpencerList))

      # Prep and return RegexMetrics[]
      libLF.log("Prepping regexMetricsList")
      regexMetricsList = []
      for regex, csharpPattern, autMeasure, nSimplePaths, averageOutDegreeDensity, worstCaseSpencer in zip(
        self.regexList, csharpPatterns, automataMeasures, nSimplePathsList, averageOutDegreeDensityList, worstCaseSpencerList):
        # Prep members for a RegexMetrics
        csharpRegexLen = len(csharpPattern)
        if AnalysisStages.ANALYZE_AUTOMATON in self.analyses:
          # An invalid C# regex has no feature vector or automaton metrics
          validInCSharp = autMeasure['validCSharpRegex']
          if validInCSharp:
            featureVector = autMeasure['featureVector']
            automatonMetrics = autMeasure['automataMeasures']
          else:
            featureVector = {}
            automatonMetrics = {}

          if AnalysisStages.ANALYZE_SIMPLE_PATHS not in self.analyses:
            nSimplePaths = -1
        else:
          validInCSharp = False
          featureVector = {}
          automatonMetrics = {}
        
        # Misc metrics
        # Count features with a positive usage count
        nDistinctFeaturesUsed = 0
        for v in featureVector.values():
          if v is not None and v > 0:
            nDistinctFeaturesUsed += 1

        # NOTE(review): these abbreviations presumably mean lookaheads
        # (NLKA/LKA), lookbehinds (NLKB/LKB), and backreferences (BKR) --
        # confirm against the feature-vector producer's naming scheme.
        usesSuperLinearFeatures = False
        for abbrv in ["NLKA", "LKA", "NLKB", "LKB", "BKR"]:
          if abbrv in featureVector and featureVector[abbrv] > 0:
            usesSuperLinearFeatures = True

        regexMetrics = RegexMetrics(
          regex.pattern,
          regex.langsUsedInStatic(), regex.langsUsedInDynamic(),
          csharpPattern, csharpRegexLen,
          validInCSharp,
          featureVector, automatonMetrics, nSimplePaths,
          nDistinctFeaturesUsed,
          worstCaseSpencer, averageOutDegreeDensity, usesSuperLinearFeatures
        )
        regexMetricsList.append(regexMetrics)
      libLF.log("Returning regexMetricsList")
      return regexMetricsList
    except BaseException as e:
      libLF.log("Uh oh, hit an exception")
      libLF.log(e)
      traceback.print_exc()
      # Fallback: hand back the raw regex list so the caller gets something
      return self.regexList
def wangPatternToLibLFRegex(pattern):
    """Wrap a raw pattern string in a libLF.Regex with empty usage metadata."""
    regex = libLF.Regex()
    return regex.initFromRaw(pattern, {}, {})