Example #1
import mtg_validate  # project-local validator module (assumed importable)


def select_card(cards, stats, i):
    # Look up the i-th card and its precomputed statistics.
    card = cards[i]
    nearest = stats['dists']['cbow'][i]
    perp = stats['ngram']['perp'][i]
    perp_per = stats['ngram']['perp_per'][i]
    perp_max = stats['ngram']['perp_max'][i]

    # Reject cards that are near-duplicates of existing cards (cbow distance)
    # or too improbable under the n-gram language model.
    if nearest > 0.9 or perp_per > 2.0 or perp_max > 10.0:
        return None

    # Keep only cards whose properties all pass validation.
    ((_, total_good, _, _), _) = mtg_validate.process_props([card])
    if total_good != 1:
        return False

    # print '===='
    # print nearest
    # print perp
    # print perp_per
    # print perp_max
    # print '----'
    # print card.format()

    return True
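
A minimal usage sketch (an assumption, not part of the original snippet): given a list of generated cards and a stats dict shaped like the one built by get_statistics in Example #2, with the 'dists' and 'ngram' fields populated, the selector can be applied index by index to keep only the cards that pass every filter.

# Hypothetical driver code; `cards` and `stats` are assumed to come from
# get_statistics(fname, lm=some_lm), so the per-index lists line up with `cards`.
selected = [cards[i] for i in range(len(cards)) if select_card(cards, stats, i)]
print('kept {} of {} cards'.format(len(selected), len(cards)))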
Example #2
import os
from collections import OrderedDict

import jdecode       # project-local card decoder (assumed importable)
import mtg_validate  # project-local validator (assumed importable)
# mean_nonan and gmean_nonzero are helper functions defined elsewhere in the
# same project; they are assumed to be available in this module's scope.


def get_statistics(fname, lm=None, sep=False, verbose=False):
    stats = OrderedDict()
    cards = jdecode.mtg_open_file(fname, verbose=verbose)
    stats['cards'] = cards

    # unpack the name of the checkpoint - terrible and hacky
    try:
        final_name = os.path.basename(fname)
        halves = final_name.split('_epoch')
        cp_name = halves[0]
        cp_info = halves[1][:-4]
        info_halves = cp_info.split('_')
        cp_epoch = float(info_halves[0])
        fragments = info_halves[1].split('.')
        cp_vloss = float('.'.join(fragments[:2]))
        cp_temp = float('.'.join(fragments[-2:]))
        cp_ident = '.'.join(fragments[2:-2])
        stats['cp'] = OrderedDict([('name', cp_name),
                                   ('epoch', cp_epoch),
                                   ('vloss', cp_vloss),
                                   ('temp', cp_temp),
                                   ('ident', cp_ident)])
    except Exception:
        # filename doesn't match the expected pattern; skip checkpoint metadata
        pass

    # validate
    ((total_all, total_good, total_bad, total_uncovered), 
         values) = mtg_validate.process_props(cards)
    
    stats['props'] = annotate_values(values)
    stats['props']['overall'] = OrderedDict([('total', total_all), 
                                             ('good', total_good), 
                                             ('bad', total_bad), 
                                             ('uncovered', total_uncovered)])

    # distances
    distfname = fname + '.dist'
    if os.path.isfile(distfname):
        name_dupes = 0
        card_dupes = 0
        with open(distfname, 'rt') as f:
            distlines = f.read().split('\n')
        dists = OrderedDict([('name', []), ('cbow', [])])
        for line in distlines:
            fields = line.split('|')
            if len(fields) < 4:
                continue
            idx = int(fields[0])
            name = str(fields[1])
            ndist = float(fields[2])
            cdist = float(fields[3])
            dists['name'] += [ndist]
            dists['cbow'] += [cdist]
            if ndist == 1.0:
                name_dupes += 1
            if cdist == 1.0:
                card_dupes += 1

        dists['name_mean'] = mean_nonan(dists['name'])
        dists['cbow_mean'] = mean_nonan(dists['cbow'])
        dists['name_geomean'] = gmean_nonzero(dists['name'])
        dists['cbow_geomean'] = gmean_nonzero(dists['cbow'])
        stats['dists'] = dists
        
    # n-grams
    if lm is not None:
        ngram = OrderedDict([('perp', []), ('perp_per', []),
                             ('perp_max', []), ('perp_per_max', [])])
        for card in cards:
            if len(card.text.text) == 0:
                # cards with no rules text get zero perplexity across the board
                perp = 0.0
                perp_per = 0.0
                perp_max = 0.0
                perp_per_max = 0.0
            elif sep:
                # score each line of rules text separately, then aggregate
                vtexts = [line.vectorize().split() for line in card.text_lines
                          if len(line.vectorize().split()) > 0]
                perps = [lm.perplexity(vtext) for vtext in vtexts]
                perps_per = [perps[i] / float(len(vtexts[i])) for i in range(len(vtexts))]
                perp = gmean_nonzero(perps)
                perp_per = gmean_nonzero(perps_per)
                perp_max = max(perps)
                perp_per_max = max(perps_per)
            else:
                # score the whole rules text as a single sequence
                vtext = card.text.vectorize().split()
                perp = lm.perplexity(vtext)
                perp_per = perp / float(len(vtext))
                perp_max = perp
                perp_per_max = perp_per

            ngram['perp'] += [perp]
            ngram['perp_per'] += [perp_per]
            ngram['perp_max'] += [perp_max]
            ngram['perp_per_max'] += [perp_per_max]

        ngram['perp_mean'] = mean_nonan(ngram['perp'])
        ngram['perp_per_mean'] = mean_nonan(ngram['perp_per'])
        ngram['perp_geomean'] = gmean_nonzero(ngram['perp'])
        ngram['perp_per_geomean'] = gmean_nonzero(ngram['perp_per'])
        stats['ngram'] = ngram

    return stats
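
A hedged usage sketch (assumptions: the file path is a placeholder, and any language model passed as lm only needs the perplexity(tokens) method used above):

# Hypothetical driver code; 'generated_cards.txt' is a placeholder path.
stats = get_statistics('generated_cards.txt', lm=None, verbose=True)
overall = stats['props']['overall']
print('{good} good / {total} total cards'.format(**overall))
if 'dists' in stats:
    print('mean cbow distance: {}'.format(stats['dists']['cbow_mean']))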