Exemplo n.º 1
0
def main(argv):
    """Tokenize one input file and print every k-gram built from its tokens.

    The file named by the single positional argument is read as bytes,
    passed to ``tokenize`` together with the selected options, and the
    resulting tokens are grouped by ``kgrams`` into groups of ``--size``.

    Args:
        argv: Argument list handed to ``OptionParser.parse_args``.
            NOTE(review): presumably this excludes the program name (e.g.
            ``sys.argv[1:]``), because exactly one positional argument is
            required -- confirm against the caller.

    Raises:
        SystemExit: With the usage exit status when the number of positional
            arguments is not exactly one. optparse itself exits with status 2
            on an unknown option or a non-integer ``--size``.
    """
    import os
    import sys
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("-l", "--language", dest="language", default="c",
                      help="tokenize using lexer for language", metavar="LANG")
    # type="int" lets optparse reject a malformed size up front instead of
    # failing with a traceback after the input file has already been read.
    parser.add_option("-s", "--size", dest="size", type="int", default=5,
                      help="size of each kgram", metavar="N")
    parser.add_option("-c", action="store_true", dest="comments", default=False,
                      help="consider comments when tokenizing")
    parser.add_option("-e", action="store_true", dest="endlines", default=False,
                      help="consider endlines when tokenizing")
    parser.add_option("-w", action="store_true", dest="whitespace",
                      default=False,
                      help="consider whitespace when tokenizing")
    parser.add_option("-t", action="store_true", dest="text", default=False,
                      help="consider text when tokenizing")
    (options, args) = parser.parse_args(argv)

    if len(args) != 1:
        # Tell the user why we are exiting rather than failing silently.
        sys.stderr.write("Please specify exactly one input file.\n")
        # os.EX_USAGE is only defined on Unix; 64 is its sysexits.h value.
        sys.exit(getattr(os, "EX_USAGE", 64))

    with open(args[0], 'rb') as fin:
        data = fin.read()
    tokens = tokenize(options.language, data, options.comments,
                      options.endlines, options.whitespace, options.text)
    for kgram in kgrams(tokens, options.size):
        print(kgram)
Exemplo n.º 2
0
def scrutinize(filelist, options):
    """Fingerprint each file and print files that share many fingerprints.

    Every file is normalized, tokenized, grouped into k-grams and winnowed;
    each resulting fingerprint is recorded per file and per hash value.
    Then, for each file, the other files are reported whose number of
    shared fingerprint hashes exceeds 10% of that file's fingerprint count.

    Args:
        filelist: Iterable of file names to compare with one another.
        options: Object providing ``language``, ``comments``, ``endlines``,
            ``whitespace``, ``text``, ``size`` and ``window`` attributes.
    """
    owners_by_hash = collections.defaultdict(list)
    marks_by_file = collections.defaultdict(list)

    for path in filelist:
        content = normalizeFileLines(path)
        #print(filename, file=sys.stderr)
        tokens = tokenize(options.language, content, options.comments,
                          options.endlines, options.whitespace, options.text)
        grams = kgrams(tokens, int(options.size))
        for mark in winnowing(grams, int(options.window)):
            marks_by_file[path].append(mark)
            owners_by_hash[mark.hash].append(path)

    for path, marks in marks_by_file.items():
        # Count, per file name, how often it owns one of this file's hashes.
        tally = collections.Counter(
            owner for mark in marks for owner in owners_by_hash[mark.hash])
        suspects = [(other, shared)
                    for other, shared in sorted(tally.items())
                    if shared > len(marks) * 0.1 and other != path]
        if suspects:
            print(path, ":", len(marks))
            for other, shared in suspects:
                print('   ', other, shared)
Exemplo n.º 3
0
def scrutinize(filelist, options):
    """Report files whose winnowed fingerprints overlap with other files.

    Phase one builds two indexes from the fingerprints of every file in
    ``filelist``: file name -> fingerprints, and fingerprint hash -> file
    names. Phase two prints, for each file, every other file whose count of
    shared hashes is greater than 10% of the file's own fingerprint count.

    Args:
        filelist: Iterable of file names to analyse.
        options: Object exposing the tokenizer flags (``language``,
            ``comments``, ``endlines``, ``whitespace``, ``text``) plus the
            ``size`` and ``window`` values, both converted with ``int``.
    """
    by_hash = collections.defaultdict(list)
    by_document = collections.defaultdict(list)

    # Phase one: index the fingerprints of every file.
    for name in filelist:
        text = normalizeFileLines(name)
        #print(filename, file=sys.stderr)
        token_stream = tokenize(options.language, text, options.comments,
                                options.endlines, options.whitespace,
                                options.text)
        selected = winnowing(kgrams(token_stream, int(options.size)),
                             int(options.window))
        for fp in selected:
            by_document[name].append(fp)
            by_hash[fp.hash].append(name)

    # Phase two: compare each file against everything sharing its hashes.
    for doc_name, doc_prints in by_document.items():
        shared = collections.defaultdict(int)
        for fp in doc_prints:
            for other in by_hash[fp.hash]:
                shared[other] += 1

        hits = []
        for other, count in sorted(shared.items()):
            if other == doc_name:
                continue
            if count > len(doc_prints) * 0.1:
                hits.append((other, count))

        if not hits:
            continue
        print(doc_name, ":", len(doc_prints))
        for other, count in hits:
            print('   ', other, count)
Exemplo n.º 4
0
def main(argv):
    """Tokenize one input file and print each fingerprint chosen by winnowing.

    The file named by the single positional argument is read as bytes,
    passed to ``tokenize`` with the selected options, grouped by ``kgrams``
    into groups of ``--size``, and filtered by ``winnowing`` with a window
    of ``--window``.

    Args:
        argv: Argument list handed to ``OptionParser.parse_args``.
            NOTE(review): presumably this excludes the program name (e.g.
            ``sys.argv[1:]``), because exactly one positional argument is
            required -- confirm against the caller.

    Raises:
        SystemExit: With the usage exit status when the number of positional
            arguments is not exactly one. optparse itself exits with status 2
            on an unknown option or a non-integer ``--size``/``--window``.
    """
    import os
    import sys
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("-l",
                      "--language",
                      dest="language",
                      default="c",
                      help="tokenize using lexer for language",
                      metavar="LANG")
    # type="int" makes optparse reject malformed numbers up front instead of
    # failing with a traceback after the input file has already been read.
    parser.add_option("-s",
                      "--size",
                      dest="size",
                      type="int",
                      default=5,
                      help="size of each kgram",
                      metavar="N")
    parser.add_option("-c",
                      action="store_true",
                      dest="comments",
                      default=False,
                      help="consider comments when tokenizing")
    parser.add_option("-e",
                      action="store_true",
                      dest="endlines",
                      default=False,
                      help="consider endlines when tokenizing")
    parser.add_option("-w",
                      action="store_true",
                      dest="whitespace",
                      default=False,
                      help="consider whitespace when tokenizing")
    parser.add_option("-W",
                      "--window",
                      dest="window",
                      type="int",
                      default=5,
                      help="size of the winnowing window",
                      metavar="W")
    parser.add_option("-t",
                      action="store_true",
                      dest="text",
                      default=False,
                      help="consider text when tokenizing")
    (options, args) = parser.parse_args(argv)

    if len(args) != 1:
        # Tell the user why we are exiting rather than failing silently.
        sys.stderr.write("Please specify exactly one input file.\n")
        # os.EX_USAGE is only defined on Unix; 64 is its sysexits.h value.
        sys.exit(getattr(os, "EX_USAGE", 64))

    with open(args[0], 'rb') as fin:
        data = fin.read()
    tokens = tokenize(options.language, data, options.comments,
                      options.endlines, options.whitespace, options.text)
    for fprint in winnowing(kgrams(tokens, options.size), options.window):
        print(fprint)
Exemplo n.º 5
0
def examine(filename, options):
    """Return the list of fingerprints that winnowing yields for one file.

    Args:
        filename: Name of the file, passed to ``normalizeFileLines``.
        options: Object providing ``language``, ``comments``, ``endlines``,
            ``whitespace``, ``text``, ``size`` and ``window`` attributes.

    Returns:
        A list holding every item produced by ``winnowing``, in order.
    """
    content = normalizeFileLines(filename)
    tokens = tokenize(options.language, content, options.comments,
                      options.endlines, options.whitespace, options.text)
    grams = kgrams(tokens, int(options.size))
    # Materialize the winnowed fingerprints so the caller gets a list.
    return list(winnowing(grams, int(options.window)))
Exemplo n.º 6
0
def examine(filename, options):
    """Collect the winnowed fingerprints of a single file into a list.

    Args:
        filename: Name of the file, passed to ``normalizeFileLines``.
        options: Object exposing the tokenizer flags (``language``,
            ``comments``, ``endlines``, ``whitespace``, ``text``) plus the
            ``size`` and ``window`` values, both converted with ``int``.

    Returns:
        A list with each fingerprint produced by ``winnowing``, in order.
    """
    collected = []

    normalized = normalizeFileLines(filename)
    token_stream = tokenize(
        options.language,
        normalized,
        options.comments,
        options.endlines,
        options.whitespace,
        options.text,
    )
    selected = winnowing(kgrams(token_stream, int(options.size)),
                         int(options.window))
    # Add all the fingerprints to the list and return it.
    collected.extend(selected)
    return collected