def main(argv):
    """Tokenize a single input file and print every k-gram of its token stream.

    Parses the command line in *argv*; exactly one positional argument (the
    input file path) is required. Exits with ``os.EX_USAGE`` on a usage error.
    """
    import os
    import sys  # fix: original called sys.exit without a visible sys import
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("-l", "--language", dest="language", default="c",
                      help="tokenize using lexer for language", metavar="LANG")
    parser.add_option("-s", "--size", dest="size", default=5,
                      help="size of each kgram", metavar="N")
    parser.add_option("-c", action="store_true", dest="comments", default=False,
                      help="consider comments when tokenizing")
    parser.add_option("-e", action="store_true", dest="endlines", default=False,
                      help="consider endlines when tokenizing")
    parser.add_option("-w", action="store_true", dest="whitespace", default=False,
                      help="consider whitespace when tokenizing")
    parser.add_option("-t", action="store_true", dest="text", default=False,
                      help="consider text when tokenizing")
    (options, args) = parser.parse_args(argv)
    if len(args) != 1:
        # fix: re-enable the user-facing diagnostic instead of exiting silently
        print("Please specify exactly one input file.", file=sys.stderr)
        sys.exit(os.EX_USAGE)
    # Read raw bytes; the lexer layer is responsible for any decoding.
    with open(args[0], 'rb') as fin:
        data = fin.read()
    for kgram in kgrams(tokenize(options.language, data, options.comments,
                                 options.endlines, options.whitespace,
                                 options.text),
                        int(options.size)):
        print(kgram)
def scrutinize(filelist, options):
    """Fingerprint every file in *filelist* and report likely overlaps.

    A file is reported when it shares a fingerprint hash with another file
    on more than 10% of its own fingerprints.
    """
    hash_to_files = collections.defaultdict(list)
    file_to_prints = collections.defaultdict(list)

    # Pass 1: fingerprint each file and index fingerprints by hash.
    for path in filelist:
        contents = normalizeFileLines(path)
        token_stream = tokenize(options.language, contents, options.comments,
                                options.endlines, options.whitespace,
                                options.text)
        for fp in winnowing(kgrams(token_stream, int(options.size)),
                            int(options.window)):
            file_to_prints[path].append(fp)
            hash_to_files[fp.hash].append(path)

    # Pass 2: for each file, count how often its hashes occur elsewhere.
    for doc, prints in file_to_prints.items():
        hit_counts = collections.defaultdict(int)
        for fp in prints:
            for other in hash_to_files[fp.hash]:
                hit_counts[other] += 1
        threshold = len(prints) * 0.1
        suspects = [(name, count)
                    for name, count in sorted(hit_counts.items())
                    if count > threshold and name != doc]
        if suspects:
            print(doc, ":", len(prints))
            for name, count in suspects:
                print(' ', name, count)
def scrutinize(filelist, options):
    """Winnow every file in *filelist* and print suspected matches.

    For each document, any other file sharing a fingerprint hash on more
    than 10% of the document's fingerprints is listed beneath it.
    """
    by_hash = collections.defaultdict(list)
    by_file = collections.defaultdict(list)

    for fname in filelist:
        raw = normalizeFileLines(fname)
        tokens = tokenize(options.language, raw, options.comments,
                          options.endlines, options.whitespace, options.text)
        grams = kgrams(tokens, int(options.size))
        for fingerprint in winnowing(grams, int(options.window)):
            by_file[fname].append(fingerprint)
            by_hash[fingerprint.hash].append(fname)

    for doc in by_file:
        prints = by_file[doc]
        # Tally every file that shares a hash with this document.
        tally = collections.defaultdict(int)
        for fingerprint in prints:
            for shared_with in by_hash[fingerprint.hash]:
                tally[shared_with] += 1
        report = []
        for other, hits in sorted(tally.items()):
            if other == doc:
                continue  # a document always matches itself
            if hits > len(prints) * 0.1:
                report.append((other, hits))
        if report:
            print(doc, ":", len(prints))
            for other, hits in report:
                print(' ', other, hits)
def main(argv):
    """Tokenize a single input file and print its winnowed fingerprints.

    Parses the command line in *argv*; exactly one positional argument (the
    input file path) is required. Exits with ``os.EX_USAGE`` on a usage error.
    """
    import os
    import sys  # fix: original called sys.exit without a visible sys import
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("-l", "--language", dest="language", default="c",
                      help="tokenize using lexer for language", metavar="LANG")
    parser.add_option("-s", "--size", dest="size", default=5,
                      help="size of each kgram", metavar="N")
    parser.add_option("-c", action="store_true", dest="comments", default=False,
                      help="consider comments when tokenizing")
    parser.add_option("-e", action="store_true", dest="endlines", default=False,
                      help="consider endlines when tokenizing")
    parser.add_option("-w", action="store_true", dest="whitespace", default=False,
                      help="consider whitespace when tokenizing")
    parser.add_option("-W", "--window", dest="window", default=5,
                      help="size of the winnowing window", metavar="W")
    parser.add_option("-t", action="store_true", dest="text", default=False,
                      help="consider text when tokenizing")
    (options, args) = parser.parse_args(argv)
    if len(args) != 1:
        # fix: re-enable the user-facing diagnostic instead of exiting silently
        print("Please specify exactly one input file.", file=sys.stderr)
        sys.exit(os.EX_USAGE)
    # Read raw bytes; the lexer layer is responsible for any decoding.
    with open(args[0], 'rb') as fin:
        data = fin.read()
    for fprint in winnowing(
            kgrams(
                tokenize(options.language, data, options.comments,
                         options.endlines, options.whitespace, options.text),
                int(options.size)),
            int(options.window)):
        print(fprint)
def examine(filename, options):
    """Return the list of winnowed fingerprints for *filename*.

    The file is normalized, tokenized according to *options*, chopped into
    k-grams of ``options.size``, and winnowed with window ``options.window``.
    """
    normalized = normalizeFileLines(filename)
    token_stream = tokenize(options.language, normalized, options.comments,
                            options.endlines, options.whitespace,
                            options.text)
    # Materialize the fingerprint stream so callers get a reusable list.
    return list(winnowing(kgrams(token_stream, int(options.size)),
                          int(options.window)))
def examine(filename, options):
    """Collect and return every winnowed fingerprint of *filename*.

    Tokenization honors the comment/endline/whitespace/text flags carried
    on *options*; k-gram size and winnowing window also come from *options*.
    """
    source = normalizeFileLines(filename)
    stream = winnowing(
        kgrams(
            tokenize(options.language, source, options.comments,
                     options.endlines, options.whitespace, options.text),
            int(options.size)),
        int(options.window))
    # Gather the lazily produced fingerprints into a concrete list.
    return [fp for fp in stream]