Example #1
def calculate_scores(gt_path, r_path, idmsg):
    if not os.path.isfile(gt_path):
        print('Groundtruth file not found. Skipping scoring.')
        print(idmsg)
        return 0, 0, 0
    # Read the groundtruth
    truth_dict = {}
    idx_conn_uid = 0
    with open(gt_path) as f:
        for pos, line in enumerate(f):
            if not pos:
                headers = line.strip('\n').split('\t')
                for idxh, h in enumerate(headers):
                    if h == 'conn_uid':
                        idx_conn_uid = idxh
                        break
                continue
            values = line.strip('\n').split('\t')
            # Concatenating sha2 and conn_uid
            key = '{}:{}'.format(values[1], values[idx_conn_uid])
            truth_dict[key] = values[0]
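            # e.g., truth_dict = {'abc123:conn42': 'malware'} (hypothetical)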

    # Read the estimated results to create a dict for the scoring
    guess_dict = {}
    ids = set()
    with open(r_path) as f:
        for line in f:
            values = line.strip('\n').split(',')
            # The results file only has sha2:conn_uid,label
            guess_dict[values[0]] = values[1]
            ids.add(values[0])

    p, r, f = ec.eval_precision_recall_fmeasure(truth_dict, guess_dict, ids)
    print("------ Scores ------")
    print(idmsg)
    print("--------------------")
    print("Precision: %s%%" % p)
    print("Recall: %s%%" % r)
    print("F-Measure: %s%%" % f)
    print("--------------------")
    return p, r, f
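
A minimal usage sketch for calculate_scores; the file names and values below are hypothetical, and the column layouts are inferred from the parsing code above:

# groundtruth file: tab-separated with a header row that names a
# 'conn_uid' column; column 0 holds the label, column 1 the sha2, e.g.:
#   label\tsha2\tconn_uid
#   malware\tabc123\tconn42
# results file: comma-separated '<sha2>:<conn_uid>,<label>' lines, e.g.:
#   abc123:conn42,malware
p, r, f = calculate_scores('groundtruth.tsv', 'results.csv',
                           'Run over hypothetical data')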
Example #2
def main(args):
    # Select hash used to identify sample, by default MD5
    hash_type = args.hash if args.hash else 'md5'

    # If ground truth provided, read it from file
    gt_dict = {}
    if args.gt:
        with open(args.gt, 'r') as gt_fd:
            for line in gt_fd:
                gt_hash, family = map(str.lower, line.strip().split('\t', 1))
                gt_dict[gt_hash] = family

        # Guess type of hash in ground truth file
        hash_type = guess_hash(gt_dict.keys()[0])

    # Create AvLabels object
    av_labels = AvLabels(args.gen, args.alias, args.av)

    # Build list of input files
    # NOTE: duplicate input files are not removed
    ifile_l = []
    if (args.vt):
        ifile_l += args.vt
        ifile_are_vt = True
    if (args.lb):
        ifile_l += args.lb
        ifile_are_vt = False
    if (args.vtdir):
        ifile_l += [os.path.join(args.vtdir, f) for f in os.listdir(args.vtdir)]
        ifile_are_vt = True
    if (args.lbdir):
        ifile_l += [os.path.join(args.lbdir, f) for f in os.listdir(args.lbdir)]
        ifile_are_vt = False

    # Select output prefix
    out_prefix = os.path.basename(os.path.splitext(ifile_l[0])[0])

    # If verbose, open log file
    if args.verbose:
        log_filename = out_prefix + '.verbose'
        verb_fd = open(log_filename, 'w+')

    # Initialize state
    first_token_dict = {}
    token_count_map = {}
    pair_count_map = {}
    token_family_map = {}
    fam_stats = {}
    vt_all = 0
    vt_empty = 0
    singletons = 0

    # Process each input file
    for ifile in ifile_l:
        # Open file
        fd = open(ifile, 'r')

        # Debug info, file processed
        sys.stderr.write('[-] Processing input file %s\n' % ifile)

        # Process all lines in file
        for line in fd:

            # If blank line, skip
            if line == '\n':
                continue

            # Debug info
            if vt_all % 100 == 0:
                sys.stderr.write('\r[-] %d JSON read' % vt_all)
                sys.stderr.flush()
            vt_all += 1

            # Read JSON line and extract sample info (i.e., hashes and labels)
            vt_rep = json.loads(line)
            sample_info = av_labels.get_sample_info(vt_rep, ifile_are_vt)
            if sample_info is None:
                try:
                    name = vt_rep['md5']
                    sys.stderr.write('\nNo AV labels for %s\n' % name)
                except KeyError:
                    sys.stderr.write('\nCould not process: %s\n' % line)
                sys.stderr.flush()
                vt_empty += 1
                continue

            # Sample's name is selected hash type (md5 by default)
            name = getattr(sample_info, hash_type)

            # If the VT report has no AV labels, continue
            if not sample_info[3]:
                vt_empty += 1
                sys.stderr.write('\nNo AV labels for %s\n' % name)
                sys.stderr.flush()
                continue
            
            # Get the distinct tokens from all the av labels in the report
            # And print them. If not verbose, print the first token.
            # If verbose, print the whole list
            try:
                # Get distinct tokens from AV labels
                tokens = av_labels.get_family_ranking(sample_info).items()

                # If alias detection, populate maps
                if args.aliasdetect:
                    prev_tokens = set()
                    for entry in tokens:
                        curr_tok = entry[0]
                        token_count_map[curr_tok] = \
                            token_count_map.get(curr_tok, 0) + 1
                        for prev_tok in prev_tokens:
                            if prev_tok < curr_tok:
                                pair = (prev_tok, curr_tok)
                            else:
                                pair = (curr_tok, prev_tok)
                            pair_count_map[pair] = \
                                pair_count_map.get(pair, 0) + 1
                        prev_tokens.add(curr_tok)
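                    # e.g., ranked tokens [('zbot', 5), ('zeus', 3)] leave
                    # token_count_map as {'zbot': 1, 'zeus': 1} and
                    # pair_count_map as {('zbot', 'zeus'): 1}; keeping each
                    # pair in sorted order collapses (a, b) and (b, a) into
                    # one key (token names here are hypothetical)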

                # If generic token detection, populate map
                if args.gendetect and args.gt:
                    for entry in tokens:
                        curr_tok = entry[0]
                        curr_fam_set = token_family_map.get(curr_tok)
                        family = gt_dict.get(name)
                        if curr_fam_set and family:
                            curr_fam_set.add(family)
                        elif family:
                            # Note: set([family]), not set(family), which
                            # would build a set of the string's characters
                            token_family_map[curr_tok] = set([family])

                # Top candidate is most likely family name
                if tokens:
                    family = tokens[0][0]
                    is_singleton = False
                else:
                    family = "SINGLETON:" + name
                    is_singleton = True
                    singletons += 1

                # Check if sample is PUP, if requested
                if args.pup:
                    is_pup = av_labels.is_pup(sample_info[3])
                    if is_pup:
                        is_pup_str = "\t1"
                    else:
                        is_pup_str = "\t0"
                else:
                    is_pup = None
                    is_pup_str = ""

                # Build family map for precision/recall computation
                first_token_dict[name] = family

                # Get ground truth family, if available
                if args.gt:
                    gt_family = '\t' + gt_dict[name] if name in gt_dict else ""
                else:
                    gt_family = ""

                # Print family (and ground truth if available) to stdout
                print '%s\t%s%s%s' % (name, family, gt_family, is_pup_str)

                # If verbose, print tokens (and ground truth if available) 
                # to log file
                if args.verbose:
                    verb_fd.write('%s\t%s%s%s\n' % (
                        name, tokens, gt_family, is_pup_str))

                # Store family stats (if required)
                if args.fam:
                    if is_singleton:
                        ff = 'SINGLETONS'
                    else:
                        ff = family
                    try:
                        numAll, numMal, numPup = fam_stats[ff]
                    except KeyError:
                        numAll = 0
                        numMal = 0
                        numPup = 0

                    numAll += 1
                    if args.pup:
                        if is_pup:
                            numPup += 1
                        else:
                            numMal += 1
                    fam_stats[ff] = (numAll, numMal, numPup)

            except Exception:
                traceback.print_exc(file=sys.stderr)
                continue

        # Debug info
        sys.stderr.write('\r[-] %d JSON read' % vt_all)
        sys.stderr.flush()
        sys.stderr.write('\n')

        # Close file
        fd.close()

    # Print statistics
    sys.stderr.write(
            "[-] Samples: %d NoLabels: %d Singletons: %d "
            "GroundTruth: %d\n" % (
                vt_all, vt_empty, singletons, len(gt_dict)))

    # If ground truth, print precision, recall, and F1-measure
    if args.gt and args.eval:
        precision, recall, fmeasure = \
            ec.eval_precision_recall_fmeasure(gt_dict, first_token_dict)
        sys.stderr.write(
            "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" %
            (precision, recall, fmeasure))

    # If generic token detection, print map
    if args.gendetect:
        # Open generic tokens file
        gen_filename = out_prefix + '.gen'
        gen_fd = open(gen_filename, 'w+')
        # Output header line
        gen_fd.write("Token\t#Families\n")
        sorted_pairs = sorted(token_family_map.iteritems(), 
                              key=lambda x: len(x[1]) if x[1] else 0, 
                              reverse=True)
        for (t,fset) in sorted_pairs:
            gen_fd.write("%s\t%d\n" % (t, len(fset)))

        # Close generic tokens file
        gen_fd.close()
        sys.stderr.write('[-] Generic token data in %s\n' % (gen_filename))

    # If alias detection, print map
    if args.aliasdetect:
        # Open alias file
        alias_filename = out_prefix + '.alias'
        alias_fd = open(alias_filename, 'w+')
        # Sort token pairs by number of times they appear together
        sorted_pairs = sorted(
                pair_count_map.items(), key=itemgetter(1))
        # Output header line
        alias_fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\n")
        # Compute token pair statistic and output to alias file
        for (t1,t2),c in sorted_pairs:
            n1 = token_count_map[t1]
            n2 = token_count_map[t2]
            if (n1 < n2):
                x = t1
                y = t2
                xn = n1
                yn = n2
            else:
                x = t2
                y = t1
                xn = n2
                yn = n1
            f = float(c) / float(xn)
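            # A ratio near 1.0 means the rarer token nearly always
            # co-occurs with the more common one, suggesting the two
            # tokens may be aliases for the same family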
            alias_fd.write("%s\t%s\t%d\t%d\t%d\t%0.2f\n" % (
                x,y,xn,yn,c,f))
        # Close alias file
        alias_fd.close()
        sys.stderr.write('[-] Alias data in %s\n' % (alias_filename))

    # If family statistics, output to file
    if args.fam:
        # Open family file
        fam_filename = out_prefix + '.families'
        fam_fd = open(fam_filename, 'w+')
        # Output header line
        if args.pup:
            fam_fd.write("# Family\tTotal\tMalware\tPUP\tFamType\n")
        else:
            fam_fd.write("# Family\tTotal\n")
        # Sort map
        sorted_pairs = sorted(fam_stats.items(), key=itemgetter(1),
                              reverse=True)
        # Print map contents
        for (f,fstat) in sorted_pairs:
            if args.pup:
                if fstat[1] > fstat[2]:
                    famType = "malware"
                else:
                    famType = "pup"
                fam_fd.write("%s\t%d\t%d\t%d\t%s\n" % (f, fstat[0], fstat[1],
                                                fstat[2], famType))
            else:
                fam_fd.write("%s\t%d\n" % (f, fstat[0]))
        # Close file
        fam_fd.close()
        sys.stderr.write('[-] Family data in %s\n' % (fam_filename))

    # Close log file
    if args.verbose:
        sys.stderr.write('[-] Verbose output in %s\n' % (log_filename))
        verb_fd.close()
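
A plausible invocation of this version, assuming the argument parser exposes flags matching the args.* attributes used above (the exact flag spellings are assumptions):

# python avclass_labeler.py -vt reports.json -gt ground.truth \
#     -eval -aliasdetect -gendetect -v
# stdout carries one '<hash>\t<family>[\t<gt_family>][\t<0|1>]' line per
# sample; .gen, .alias, .families, and .verbose files are written on demand
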
Example #3
def main(args):
    # Select hash used to identify sample, by default MD5
    hash_type = args.hash if args.hash else 'md5'

    # If ground truth provided, read it from file
    gt_dict = {}
    if args.gt:
        with open(args.gt, 'r') as gt_fd:
            for line in gt_fd:
                gt_hash, family = map(str, line.strip().split('\t', 1))
                gt_dict[gt_hash] = family

        # Guess type of hash in ground truth file
        hash_type = guess_hash(list(gt_dict.keys())[0])

    # Create AvLabels object
    av_labels = AvLabels(args.tag, args.exp, args.tax, args.av,
                         args.aliasdetect)

    # Build list of input files
    # NOTE: duplicate input files are not removed
    ifile_l = []
    if (args.vt):
        ifile_l += args.vt
        ifile_are_vt = True
    if (args.lb):
        ifile_l += args.lb
        ifile_are_vt = False
    if (args.vtdir):
        ifile_l += [
            os.path.join(args.vtdir, f) for f in os.listdir(args.vtdir)
        ]
        ifile_are_vt = True
    if (args.lbdir):
        ifile_l += [
            os.path.join(args.lbdir, f) for f in os.listdir(args.lbdir)
        ]
        ifile_are_vt = False

    # Select correct sample info extraction function
    if not ifile_are_vt:
        get_sample_info = av_labels.get_sample_info_lb
    elif args.vt3:
        get_sample_info = av_labels.get_sample_info_vt_v3
    else:
        get_sample_info = av_labels.get_sample_info_vt_v2

    # Select output prefix
    out_prefix = os.path.basename(os.path.splitext(ifile_l[0])[0])

    # Initialize state
    first_token_dict = {}
    token_count_map = {}
    pair_count_map = {}
    vt_all = 0
    avtags_dict = {}
    stats = {
        'samples': 0,
        'noscans': 0,
        'tagged': 0,
        'maltagged': 0,
        'FAM': 0,
        'CLASS': 0,
        'BEH': 0,
        'FILE': 0,
        'UNK': 0
    }
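    # Per-category tag counters follow the taxonomy: FAM (family),
    # CLASS (malware class), BEH (behavior), FILE (file properties),
    # and UNK (tokens of unknown category)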

    # Process each input file
    for ifile in ifile_l:
        # Open file
        if args.gzip:
            fd = gzip.open(ifile, 'rt')
        else:
            fd = open(ifile, 'r')

        # Debug info, file processed
        sys.stderr.write('[-] Processing input file %s\n' % ifile)

        # Process all lines in file
        for line in fd:

            # If blank line, skip
            if line == '\n':
                continue

            # Debug info
            if vt_all % 100 == 0:
                sys.stderr.write('\r[-] %d JSON read' % vt_all)
                sys.stderr.flush()
            vt_all += 1

            # Read JSON line
            vt_rep = json.loads(line)

            # Extract sample info
            sample_info = get_sample_info(vt_rep)

            # If no sample info, log error and continue
            if sample_info is None:
                try:
                    name = vt_rep['md5']
                    sys.stderr.write('\nNo scans for %s\n' % name)
                except KeyError:
                    sys.stderr.write('\nCould not process: %s\n' % line)
                sys.stderr.flush()
                stats['noscans'] += 1
                continue

            # Sample's name is selected hash type (md5 by default)
            name = getattr(sample_info, hash_type)

            # If the VT report has no AV labels, output and continue
            if not sample_info.labels:
                sys.stdout.write('%s\t-\t[]\n' % (name))
                # sys.stderr.write('\nNo AV labels for %s\n' % name)
                # sys.stderr.flush()
                continue

            # Compute VT_Count
            vt_count = len(sample_info.labels)

            # Get the distinct tokens from all the av labels in the report
            # And print them.
            try:
                av_tmp = av_labels.get_sample_tags(sample_info)
                tags = av_labels.rank_tags(av_tmp)

                # AV VENDORS PER TOKEN
                if args.avtags:
                    for t in av_tmp:
                        tmap = avtags_dict.get(t, {})
                        for av in av_tmp[t]:
                            ctr = tmap.get(av, 0)
                            tmap[av] = ctr + 1
                        avtags_dict[t] = tmap

                if args.aliasdetect:
                    prev_tokens = set()
                    for entry in tags:
                        curr_tok = entry[0]
                        curr_count = token_count_map.get(curr_tok, 0)
                        token_count_map[curr_tok] = curr_count + 1
                        for prev_tok in prev_tokens:
                            if prev_tok < curr_tok:
                                pair = (prev_tok, curr_tok)
                            else:
                                pair = (curr_tok, prev_tok)
                            pair_count = pair_count_map.get(pair, 0)
                            pair_count_map[pair] = pair_count + 1
                        prev_tokens.add(curr_tok)

                # Collect stats
                # FIX: should iterate once over tags,
                # for both stats and aliasdetect
                if tags:
                    stats["tagged"] += 1
                    if args.stats:
                        if (vt_count > 3):
                            stats["maltagged"] += 1
                            cat_map = {
                                'FAM': False,
                                'CLASS': False,
                                'BEH': False,
                                'FILE': False,
                                'UNK': False
                            }
                            for t in tags:
                                path, cat = av_labels.taxonomy.get_info(t[0])
                                cat_map[cat] = True
                            for c in cat_map:
                                if cat_map[c]:
                                    stats[c] += 1

                # Check if sample is PUP, if requested
                if args.pup:
                    if av_labels.is_pup(tags, av_labels.taxonomy):
                        is_pup_str = "\t1"
                    else:
                        is_pup_str = "\t0"
                else:
                    is_pup_str = ""

                # Select family for sample if needed,
                # i.e., for compatibility mode or for ground truth
                if args.c or args.gt:
                    fam = "SINGLETON:" + name
                    for (t, s) in tags:
                        cat = av_labels.taxonomy.get_category(t)
                        if (cat == "UNK") or (cat == "FAM"):
                            fam = t
                            break

                # Get ground truth family, if available
                if args.gt:
                    first_token_dict[name] = fam
                    gt_family = '\t' + gt_dict.get(name, "")
                else:
                    gt_family = ""

                # Get VT tags as string
                if args.vtt:
                    vtt = list_str(sample_info.vt_tags, prefix="\t")
                else:
                    vtt = ""

                # Print family (and ground truth if available) to stdout
                if not args.c:
                    if args.path:
                        tag_str = format_tag_pairs(tags, av_labels.taxonomy)
                    else:
                        tag_str = format_tag_pairs(tags)
                    sys.stdout.write(
                        '%s\t%d\t%s%s%s%s\n' %
                        (name, vt_count, tag_str, gt_family, is_pup_str, vtt))
                else:
                    sys.stdout.write('%s\t%s%s%s\n' %
                                     (name, fam, gt_family, is_pup_str))
            except Exception:
                traceback.print_exc(file=sys.stderr)
                continue

        # Debug info
        sys.stderr.write('\r[-] %d JSON read' % vt_all)
        sys.stderr.flush()
        sys.stderr.write('\n')

        # Close file
        fd.close()

    # Print statistics
    sys.stderr.write(
        "[-] Samples: %d NoScans: %d NoTags: %d GroundTruth: %d\n" %
        (vt_all, stats['noscans'], vt_all - stats['tagged'], len(gt_dict)))

    # If ground truth, print precision, recall, and F1-measure
    if args.gt:
        precision, recall, fmeasure = \
            ec.eval_precision_recall_fmeasure(gt_dict, first_token_dict)
        sys.stderr.write(
            "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" %
            (precision, recall, fmeasure))

    # Output stats
    if args.stats:
        stats_fd = open("%s.stats" % out_prefix, 'w')
        num_samples = vt_all
        stats_fd.write('Samples: %d\n' % num_samples)
        num_tagged = stats['tagged']
        frac = float(num_tagged) / float(num_samples) * 100
        stats_fd.write('Tagged (all): %d (%.01f%%)\n' % (num_tagged, frac))
        num_maltagged = stats['maltagged']
        frac = float(num_maltagged) / float(num_samples) * 100
        stats_fd.write('Tagged (VT>3): %d (%.01f%%)\n' % (num_maltagged, frac))
        for c in ['FILE', 'CLASS', 'BEH', 'FAM', 'UNK']:
            count = stats[c]
            frac = float(count) / float(num_maltagged) * 100
            stats_fd.write('%s: %d (%.01f%%)\n' % (c, count, frac))
        stats_fd.close()

    # Output vendor info
    if args.avtags:
        avtags_fd = open("%s.avtags" % out_prefix, 'w')
        for t in sorted(avtags_dict.keys()):
            avtags_fd.write('%s\t' % t)
            pairs = sorted(avtags_dict[t].items(),
                           key=lambda pair: pair[1],
                           reverse=True)
            for pair in pairs:
                avtags_fd.write('%s|%d,' % (pair[0], pair[1]))
            avtags_fd.write('\n')
        avtags_fd.close()

    # If alias detection, print map
    if args.aliasdetect:
        # Open alias file
        alias_filename = out_prefix + '.alias'
        alias_fd = open(alias_filename, 'w+')
        # Sort token pairs by number of times they appear together
        sorted_pairs = sorted(pair_count_map.items(), key=itemgetter(1))

        # Output header line
        alias_fd.write("# t1\tt2\t|t1|\t|t2|\t"
                       "|t1^t2|\t|t1^t2|/|t1|\t|t1^t2|/|t2|\n")
        # Compute token pair statistic and output to alias file
        for (t1, t2), c in sorted_pairs:
            n1 = token_count_map[t1]
            n2 = token_count_map[t2]
            if (n1 < n2):
                x = t1
                y = t2
                xn = n1
                yn = n2
            else:
                x = t2
                y = t1
                xn = n2
                yn = n1
            f = float(c) / float(xn)
            finv = float(c) / float(yn)
            if args.path:
                x = av_labels.taxonomy.get_path(x)
                y = av_labels.taxonomy.get_path(y)
            alias_fd.write("%s\t%s\t%d\t%d\t%d\t%0.2f\t%0.2f\n" %
                           (x, y, xn, yn, c, f, finv))
        # Close alias file
        alias_fd.close()
        sys.stderr.write('[-] Alias data in %s\n' % (alias_filename))
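
A plausible invocation of this tag-based version (flag spellings are again assumptions inferred from the args.* attributes; -vt3 would select VirusTotal v3 report parsing and -gzip gzip-compressed input):

# python avclass2_labeler.py -vt reports.json.gz -gzip -vt3 \
#     -aliasdetect -stats -avtags
# stdout carries one '<hash>\t<vt_count>\t<ranked tags>...' line per sample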
Example #4
def main(args):
    # Select hash used to identify sample, by default MD5
    hash_type = args.hash if args.hash else 'md5'

    # If ground truth provided, read it from file
    gt_dict = {}
    if args.gt:
        with open(args.gt, 'r') as gt_fd:
            for line in gt_fd:
                gt_hash, family = map(str.lower, line.strip().split('\t', 1))
                gt_dict[gt_hash] = family

        # Guess type of hash in ground truth file
        hash_type = guess_hash(gt_dict.keys()[0])

    # Create AvLabels object
    av_labels = AvLabels(args.gen, args.alias, args.av)

    # Select input file with AV labels
    ifile = args.vt if args.vt else args.lb

    # If verbose, open log file
    if args.verbose:
        log_filename = os.path.basename(os.path.splitext(ifile)[0]) + \
                            '.verbose'
        verb_fd = open(log_filename, 'w+')

    # Process each JSON
    vt_all = 0
    vt_empty = 0
    singletons = 0
    with open(ifile, 'r') as fd:
        first_token_dict = {}
        token_count_map = {}
        pair_count_map = {}
        token_family_map = {}

        for line in fd:

            # If blank line, skip
            if line == '\n':
                continue

            # Debug info
            if vt_all % 100 == 0:
                sys.stderr.write('\r[-] %d JSON read' % vt_all)
                sys.stderr.flush()
            vt_all += 1

            # Read JSON line and extract sample info (i.e., hashes and labels)
            vt_rep = json.loads(line)
            sample_info = av_labels.get_sample_info(vt_rep, args.vt)
            # Guard: getattr on None would raise if no info was extracted
            if sample_info is None:
                vt_empty += 1
                continue
            name = getattr(sample_info, hash_type)

            # If the VT report has no AV labels, continue
            if not sample_info[3]:
                vt_empty += 1
                sys.stderr.write('\nNo AV labels for %s\n' % name)
                sys.stderr.flush()
                continue

            # Get the distinct tokens from all the av labels in the report
            # And print them. If not verbose, print the first token.
            # If verbose, print the whole list
            try:
                # Get distinct tokens from AV labels
                tokens = av_labels.get_family_ranking(sample_info).items()

                # If alias detection, populate maps
                if args.aliasdetect:
                    prev_tok = ""
                    for entry in tokens:
                        curr_tok = entry[0]
                        curr_count = token_count_map.get(curr_tok)
                        if curr_count:
                            token_count_map[curr_tok] = curr_count + 1
                        else:
                            token_count_map[curr_tok] = 1
                        if prev_tok != "":
                            if prev_tok < curr_tok:
                                pair = (prev_tok, curr_tok)
                            else:
                                pair = (curr_tok, prev_tok)
                            pair_count = pair_count_map.get(pair)
                            if pair_count:
                                pair_count_map[pair] = pair_count + 1
                            else:
                                pair_count_map[pair] = 1
                        prev_tok = curr_tok

                # If generic token detection, populate map
                if args.gendetect and args.gt:
                    for entry in tokens:
                        curr_tok = entry[0]
                        curr_fam_set = token_family_map.get(curr_tok)
                        family = gt_dict.get(name)
                        if curr_fam_set and family:
                            curr_fam_set.add(family)
                        elif family:
                            # Note: set([family]), not set(family), which
                            # would build a set of the string's characters
                            token_family_map[curr_tok] = set([family])

                # Top candidate is most likely family name
                if tokens:
                    family = tokens[0][0]
                else:
                    family = "SINGLETON:" + name
                    singletons += 1

                # Check if sample is PUP, if requested
                if args.pup:
                    if av_labels.is_pup(sample_info[3]):
                        is_pup_str = "\t1"
                    else:
                        is_pup_str = "\t0"
                else:
                    is_pup_str = ""

                # Build family map for precision/recall computation
                first_token_dict[name] = family

                # Get ground truth family, if available
                if args.gt:
                    gt_family = '\t' + gt_dict[name] if name in gt_dict else ""
                else:
                    gt_family = ""

                # Print family (and ground truth if available) to stdout
                print '%s\t%s%s%s' % (name, family, gt_family, is_pup_str)

                # If verbose, print tokens (and ground truth if available)
                # to log file
                if args.verbose:
                    verb_fd.write('%s\t%s%s%s\n' %
                                  (name, tokens, gt_family, is_pup_str))

            except Exception:
                traceback.print_exc(file=sys.stderr)
                continue

        # Debug info
        sys.stderr.write('\r[-] %d JSON read' % vt_all)
        sys.stderr.flush()
        sys.stderr.write('\n')

    # Print statistics
    sys.stderr.write("[-] Samples: %d NoLabels: %d Singletons: %d "
                     "GroundTruth: %d\n" %
                     (vt_all, vt_empty, singletons, len(gt_dict)))

    # If ground truth, print precision, recall, and F1-measure
    if args.gt and args.eval:
        precision, recall, fmeasure = \
            ec.eval_precision_recall_fmeasure(gt_dict, first_token_dict)
        sys.stderr.write(
            "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" %
            (precision, recall, fmeasure))

    # If generic token detection, print map
    if args.gendetect:
        # Open generic tokens file
        gen_filename = os.path.basename(os.path.splitext(ifile)[0]) + \
                            '.gen'
        gen_fd = open(gen_filename, 'w+')
        # Output header line
        gen_fd.write("Token\t#Families\n")
        sorted_pairs = sorted(token_family_map.iteritems(),
                              key=lambda x: len(x[1]) if x[1] else 0,
                              reverse=True)
        for (t, fset) in sorted_pairs:
            gen_fd.write("%s\t%d\n" % (t, len(fset)))

        # Close generic tokens file
        gen_fd.close()

    # If alias detection, print map
    if args.aliasdetect:
        # Open alias file
        alias_filename = os.path.basename(os.path.splitext(ifile)[0]) + \
                            '.alias'
        alias_fd = open(alias_filename, 'w+')
        # Sort token pairs by number of times they appear together
        sorted_pairs = sorted(pair_count_map.items(), key=itemgetter(1))
        # Output header line
        alias_fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\n")
        # Compute token pair statistic and output to alias file
        for (t1, t2), c in sorted_pairs:
            n1 = token_count_map[t1]
            n2 = token_count_map[t2]
            if (n1 < n2):
                x = t1
                y = t2
                xn = n1
                yn = n2
            else:
                x = t2
                y = t1
                xn = n2
                yn = n1
            f = float(c) / float(xn)
            alias_fd.write("%s\t%s\t%d\t%d\t%d\t%0.2f\n" %
                           (x, y, xn, yn, c, f))
        # Close alias file
        alias_fd.close()

    # Close log file
    if args.verbose:
        sys.stderr.write('[-] Verbose output in %s\n' % (log_filename))
        verb_fd.close()
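
A plausible invocation of this single-input variant (flag names assumed; unlike Example #2, -vt and -lb here name one file rather than a list):

# python avclass_labeler.py -lb samples.json -gt ground.truth -eval -v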