Exemplo n.º 1
0
def main():
    """Stochastically replace word counts with sampled feature counts.

    Reads an Andrews-format corpus; for every word id that has an
    associated feature distribution, redistributes its count across
    features sampled from that distribution and writes the result.
    """
    parser = argparse.ArgumentParser(
        description='Stochastically adds features to a corpus.')
    parser.add_argument('--vocab',
                        '-v',
                        metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument('--input',
                        '-i',
                        metavar='FILE',
                        help='The input corpus (in Andrews format).')
    parser.add_argument('--output',
                        '-o',
                        metavar='FILE',
                        help='The output corpus (in Andrews format).')
    parser.add_argument('--features',
                        '-f',
                        metavar='FILE',
                        help='The (dense) vector space of features.')

    args = parser.parse_args()
    vocab_labels = load_labels(args.vocab)
    features = load_features(args.features)
    feature_map = word_ids_to_features(vocab_labels, features)

    logging.info("First pass; gathering statistics.")
    # Count lines by streaming instead of readlines(); the corpus may be
    # large and readlines() would hold it entirely in memory.
    inpt = utfopen(args.input)
    numlines = sum(1 for _ in inpt)
    inpt.close()

    logging.info("Starting second pass; actually writing output.")
    output = open(args.output, 'w', 1024 * 1024)
    inpt = utfopen(args.input)
    try:
        # Iterate the file object directly (again avoiding readlines()).
        for lno, line in enumerate(inpt, 1):
            if lno % 1000 == 0:
                logging.info("Processing doc# %d/%d (%4.1f%%)" %
                             (lno, numlines, 100 * float(lno) / numlines))
            for chunk in itersplit(line, ' '):
                chunk = chunk.rstrip()
                if not chunk:
                    continue
                # Each chunk is "<word-id>:<count>"; split on the LAST colon.
                idx = chunk.rindex(":")
                wid, cnt = chunk[:idx], chunk[idx + 1:]
                if wid not in feature_map:
                    # No feature distribution for this word: copy through.
                    output.write(chunk + ' ')
                else:
                    cnt = int(cnt)
                    dist = feature_map[wid]
                    # Sample one feature per occurrence and aggregate counts.
                    # (Loop variable renamed so it no longer clobbers `cnt`.)
                    fcnts = Counter(stochastic_choice(dist)
                                    for i in xrange(cnt))
                    for fid, fcnt in fcnts.iteritems():
                        output.write('%s,%d:%d ' % (wid, fid, fcnt))
            output.write('\n')
    finally:
        # Release both handles even if a line fails to parse.
        inpt.close()
        output.close()
Exemplo n.º 2
0
def main():
    """Renumber multimodal tokens of an Andrews-format corpus.

    Keeps only chunks containing a ',' (word,feature tokens), remaps their
    word ids to a dense 1-based id space, and writes the new vocab mapping
    alongside the converted corpus.
    """
    parser = argparse.ArgumentParser(description='Stochastically adds features to a corpus.')
    parser.add_argument('--vocab', '-v', metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument('--input', '-i', metavar='FILE',
                        help='The input corpus (in Andrews format). Must be multimodal.')
    parser.add_argument('--output', '-o', metavar='FILE',
                        help='The output corpus (in Andrews format).')
    parser.add_argument('--outvocab', '-V', metavar='FILE',
                        help='The output vocab labels; necessary for OOV processing later.')

    args = parser.parse_args()
    vocab_labels = load_labels(args.vocab)

    logging.info("First pass; gathering statistics.")
    # Stream the file to count lines instead of materializing it with
    # readlines().
    inpt = open(args.input)
    numlines = sum(1 for _ in inpt)
    inpt.close()

    output_labels = {}
    output_labels_file = utfopenwrite(args.outvocab)

    logging.info("Starting second pass; actually writing output.")
    output = open(args.output, 'w', 1024*1024)
    inpt = open(args.input)
    try:
        for lno, line in enumerate(inpt, 1):
            if lno % 1000 == 0:
                logging.info("Processing doc# %d/%d (%4.1f%%)" % (lno, numlines, 100*float(lno)/numlines))

            outline = []
            for chunk in itersplit(line, ' '):
                chunk = chunk.rstrip()
                if not chunk:
                    continue
                if ',' not in chunk:
                    continue  # strip just words
                # chunk looks like "<wid>,<rest>"; keep the tail verbatim.
                idx = chunk.index(',')
                wid = int(chunk[:idx])
                rest = chunk[idx:]

                if wid not in output_labels:
                    # First sighting: assign the next dense (1-based) id and
                    # record the mapping in the output vocab file.
                    output_labels[wid] = len(output_labels) + 1
                    output_labels_file.write("%d\t" % output_labels[wid])
                    output_labels_file.write(vocab_labels[wid])
                    output_labels_file.write("\n")
                outline.append(str(output_labels[wid]) + rest)

            if outline:
                output.write(' '.join(outline))
                output.write('\n')
    finally:
        inpt.close()
        output.close()
        # Bug fix: the vocab file was never closed before, risking a
        # truncated mapping on interpreter exit.
        output_labels_file.close()
Exemplo n.º 3
0
def main():
    """Stochastically replace word counts with sampled feature counts.

    Reads an Andrews-format corpus; for every word id with a feature
    distribution, redistributes its count across sampled features.
    """
    parser = argparse.ArgumentParser(description='Stochastically adds features to a corpus.')
    parser.add_argument('--vocab', '-v', metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument('--input', '-i', metavar='FILE',
                        help='The input corpus (in Andrews format).')
    parser.add_argument('--output', '-o', metavar='FILE',
                        help='The output corpus (in Andrews format).')
    parser.add_argument('--features', '-f', metavar='FILE',
                        help='The (dense) vector space of features.')

    args = parser.parse_args()
    vocab_labels = load_labels(args.vocab)
    features = load_features(args.features)
    feature_map = word_ids_to_features(vocab_labels, features)

    logging.info("First pass; gathering statistics.")
    # Count lines by streaming rather than readlines(), which would hold
    # the whole corpus in memory.
    inpt = utfopen(args.input)
    numlines = sum(1 for _ in inpt)
    inpt.close()

    logging.info("Starting second pass; actually writing output.")
    output = open(args.output, 'w', 1024*1024)
    inpt = utfopen(args.input)
    try:
        for lno, line in enumerate(inpt, 1):
            if lno % 1000 == 0:
                logging.info("Processing doc# %d/%d (%4.1f%%)" % (lno, numlines, 100*float(lno)/numlines))
            for chunk in itersplit(line, ' '):
                chunk = chunk.rstrip()
                if not chunk:
                    continue
                # Each chunk is "<word-id>:<count>"; split on the LAST colon.
                idx = chunk.rindex(":")
                wid, cnt = chunk[:idx], chunk[idx+1:]
                if wid not in feature_map:
                    # No feature distribution for this word: copy through.
                    output.write(chunk + ' ')
                else:
                    cnt = int(cnt)
                    dist = feature_map[wid]
                    # Sample one feature per occurrence and aggregate counts
                    # (loop variable renamed so it no longer clobbers `cnt`).
                    fcnts = Counter(stochastic_choice(dist) for i in xrange(cnt))
                    for fid, fcnt in fcnts.iteritems():
                        output.write('%s,%d:%d ' % (wid, fid, fcnt))
            output.write('\n')
    finally:
        # Release both handles even on a parse failure.
        inpt.close()
        output.close()
Exemplo n.º 4
0
def main():
    """Correlate compound/constituent model similarities with human ratings.

    Loads compound-constituent pairs, computes model similarities for each
    word against the whole noun vocabulary, and prints Spearman
    correlations against the gold mean ratings.
    """
    comps = pd.read_table(COMP_FILE)
    # Drop degenerate rows where the compound is its own constituent.
    comps = comps[comps.compound != comps.const]
    # Interleave compounds and constituents into one flat work list.
    calcsims = list(chain(*zip(comps['compound'], comps['const'])))
    label_vocab = load_labels("target-labels.txt")
    # Map noun labels (with the "/NN..." POS suffix stripped) to their row
    # index in phi.
    phi_nn = { w[:w.rindex('/')] : i for i, w in label_vocab.iteritems() if '/NN' in w }
    model = np.load("model_250.npy.npz")
    phi = row_norm(model["phi"].T)

    ranked_sims = {}
    done = set()  # words already processed (the work list has duplicates)
    for z, word in enumerate(calcsims):
        if word in done or word not in phi_nn:
            continue
        done.add(word)
        i = phi_nn[word]
        w_dist = phi[i]
        sims = calc_similarities(w_dist, phi)
        # NOTE(review): `percentile` is computed but never used, and the raw
        # `sims` are stored under the name `ranked_sims` -- possibly the
        # percentile-ranked values were intended here; confirm before use.
        percentile = percentile_ranked(sims)
        ranked_sims[word] = sims
        logging.info("Done with %d/%d [%s]" % (z + 1, len(calcsims), word))

    ratings_compound = []
    ratings_const = []
    gold = []
    for compound, const, mean in zip(comps.compound, comps.const, comps['mean']):
        # Only score pairs where both words had similarities computed.
        if compound not in ranked_sims or const not in ranked_sims:
            continue
        ranked_sims_compound = ranked_sims[compound]
        ranked_sims_const = ranked_sims[const]

        # Similarity of the compound to its constituent, and vice versa.
        ratings_compound.append(ranked_sims_compound[phi_nn[const]])
        ratings_const.append(ranked_sims_const[phi_nn[compound]])
        gold.append(mean)

    print ratings_compound
    print ratings_const
    print gold
    print spearmanr(ratings_compound, gold)
    print spearmanr(ratings_const, gold)
Exemplo n.º 5
0
def main():
    """Evaluate saved topic models against a human word-similarity data set.

    For each model file, computes several distributional similarity
    measures for every pair in the evaluation table, reports Spearman's
    rho against the human judgements plus pairwise-ranking accuracies,
    and writes all results as CSV to stdout.
    """
    parser = argparse.ArgumentParser(description='Outputs a human readable model.')
    parser.add_argument('models', metavar='FILE', help='The saved models.', nargs='+')
    parser.add_argument('--eval', '-e', metavar='EVALDATA', default='comp',
                        choices=['disco', 'discotrain', 'discovalid', 'discotest', 'discotrainvalid',
                                 'comp', 'compmod', 'comphead',
                                 'schm280'],
                        help="The data set to evaluate against.")
    parser.add_argument('--vocab', '-v', metavar='FILE',
                        help='The vocab labels.')
    # Bug fix: this option used to be parsed but silently ignored in favor
    # of a hard-coded 1.0 threshold. The default now matches that old
    # hard-coded value so existing invocations behave identically.
    parser.add_argument('--acc-thresh', type=float, default=1.0,
                        help="Don't include pairwise comparisons whose judgements are closer than this threshold.")
    args = parser.parse_args()

    vocab_labels = load_labels(args.vocab)
    # Invert id -> label into label -> id.
    vocab_labels = {w : i for i, w in vocab_labels.iteritems()}
    eval_tab = load_eval_table(vocab_labels, args.eval)

    model_evaluations = []
    for model in args.models:
        logging.info("Processing model '%s'..." % model)
        m = np.load(model)
        k = m['k']
        # Mean over the last 5 log-likelihoods, falling back to the
        # per-word bounds when log-likelihoods were not recorded.
        ll = np.mean('loglikelihoods' in m and m['loglikelihoods'][-5:] or m['perwordbounds'][-5:])
        n_iter = m['max_iteration']  # renamed from `iter`: don't shadow the builtin
        time = np.sum(m['timediffs'])
        phi = np.ascontiguousarray(m['phi'])
        topic_normed = row_norm(phi)
        word_normed = col_norm(phi)

        lmid = topic_lmi(phi)

        model_eval = dict(k=k, ll=ll, iter=n_iter, time=time,
                          alpha=m['alpha'], eta=m['eta'], mu=m['mu'],
                          eval=args.eval, input=m['input_filename'])

        similarities = {}
        for i, pair in eval_tab.iterrows():
            try:
                left_id = vocab_labels[pair['left']]
                right_id = vocab_labels[pair['right']]
            except KeyError:
                # Skip pairs containing out-of-vocabulary words.
                continue

            pair_k = (pair['left'], pair['right'])
            right_given_left = np.dot(topic_normed[:,right_id], word_normed[:,left_id])
            left_given_right = np.dot(topic_normed[:,left_id], word_normed[:,right_id])
            jsdiv_sim = jsdiv(word_normed[:,right_id], word_normed[:,left_id])
            symkldiv_sim = symkldiv(word_normed[:,right_id], word_normed[:,left_id])
            kldiv1 = kldiv(word_normed[:,right_id], word_normed[:,left_id])
            kldiv2 = kldiv(word_normed[:,left_id], word_normed[:,right_id])
            cos_lmi = cos(lmid[:,right_id], lmid[:,left_id])

            similarities[pair_k] = {'right':      pair['right'],
                                    'left':       pair['left'],
                                    'right|left': right_given_left,
                                    'left|right': left_given_right,
                                    'jsdiv':      jsdiv_sim,
                                    'symkldiv':   symkldiv_sim,
                                    'kldiv1':     kldiv1,
                                    'kldiv2':     kldiv2,
                                    'coslmi':     cos_lmi,
                                    'human':      pair['similarity'],
                                    }

        # let's compute spearman's rho for each of the measures:
        tmp = pd.DataFrame(similarities.values())
        # Loop variable renamed from `m`, which clobbered the loaded model.
        for measure in ['right|left', 'left|right', 'jsdiv', 'symkldiv', 'kldiv1', 'kldiv2', 'coslmi']:
            rho, p = scipy.stats.spearmanr(tmp[measure], tmp['human'])
            model_eval['rho_' + measure] = rho
            model_eval['p_' + measure] = p
            model_eval['n'] = len(tmp[measure])

        # okay now let's do accuracy style measures
        # (dead accumulators kldiv1_correct/kldiv2_correct and the unused
        # `prod` were removed; they were never incremented or read.)
        baseline_correct = 0
        jsdiv_correct = 0
        symkldiv_correct = 0
        rightleft_correct = 0
        leftright_correct = 0
        lmicos_correct = 0
        pairs_compared = 0.0
        for (i, pair1), (j, pair2) in combinations(eval_tab.iterrows(), 2):
            # Skip ties and near-ties (judgements closer than --acc-thresh).
            if pair1['similarity'] == pair2['similarity'] or abs(pair1['similarity'] - pair2['similarity']) < args.acc_thresh:
                continue

            try:
                pair1_k = (pair1['left'], pair1['right'])
                similarities1 = similarities[pair1_k]
                pair2_k = (pair2['left'], pair2['right'])
                similarities2 = similarities[pair2_k]
            except KeyError:
                # One of the pairs was OOV above.
                continue

            gold = pair1['similarity'] < pair2['similarity']

            pairs_compared += 1
            baseline_correct += (gold == 1)

            # Divergences are dissimilarities (greater = less similar); the
            # probability/cosine measures are similarities, hence < vs >.
            jsdiv_correct += (gold == (similarities1['jsdiv'] > similarities2['jsdiv']))
            symkldiv_correct += (gold == (similarities1['symkldiv'] > similarities2['symkldiv']))
            rightleft_correct += (gold == (similarities1['right|left'] < similarities2['right|left']))
            leftright_correct += (gold == (similarities1['left|right'] < similarities2['left|right']))
            lmicos_correct += (gold == (similarities1['coslmi'] < similarities2['coslmi']))

        # NOTE(review): this still divides by zero when no pair survives the
        # filters; kept to preserve the original failure mode.
        model_eval['filename'] = model
        model_eval['model_type'] = os.path.dirname(model)
        model_eval['acc_baseline'] = baseline_correct / pairs_compared
        model_eval['acc_jsdiv'] = jsdiv_correct / pairs_compared
        # Bug fix: acc_symkldiv previously reused jsdiv_correct.
        model_eval['acc_symkldiv'] = symkldiv_correct / pairs_compared
        model_eval['acc_right|left'] = rightleft_correct / pairs_compared
        model_eval['acc_left|right'] = leftright_correct / pairs_compared
        model_eval['acc_coslmi'] = lmicos_correct / pairs_compared

        model_evaluations.append(model_eval)

    pd.DataFrame(model_evaluations).to_csv(sys.stdout, index=False)
Exemplo n.º 6
0
def row_norm(a):
    """Scale each row of *a* so that it sums to one."""
    return a / a.sum(axis=1, keepdims=True)


def col_norm(a):
    """Scale each column of *a* so that it sums to one."""
    return a / a.sum(axis=0)


# Script setup: load a saved model and the composition ratings table.
model_file = sys.argv[1]  # path to the saved .npz model, from the CLI
comp_file = '/home/01813/roller/tmp/imsgrounded/data/comp/comp-values_all_sorted.tsv'
target_labels_file = '/scratch/01813/roller/corpora/webko/TermDoc/target-labels.txt'

vocab_labels = load_labels(target_labels_file)
# Invert id -> label into label -> id.
vocab_labels = {w: i for i, w in vocab_labels.iteritems()}

from onlineldavb import dirichlet_expectation_2
phi = np.ascontiguousarray(np.load(model_file)['phi'])
#phi = np.exp(dirichlet_expectation_2(phi))

# NOTE(review): here topic_normed is COLUMN-normalized and word_normed is
# ROW-normalized -- the opposite assignment from the main() evaluators in
# this file; confirm which orientation this script's phi uses.
topic_normed = col_norm(phi)
word_normed = row_norm(phi)

comp_tab = pd.read_table(comp_file, encoding='utf-8')
# Drop degenerate rows where the compound is its own constituent.
comp_tab = comp_tab[comp_tab['const'] != comp_tab['compound']]

# Accumulators filled in later (presumably by code past this chunk).
compound = []
const = []
ratings = []
Exemplo n.º 7
0
def main():
    """Evaluate saved topic models against a human word-similarity data set.

    For each model file, computes several distributional similarity
    measures for every pair in the evaluation table, reports Spearman's
    rho against the human judgements plus pairwise-ranking accuracies,
    and writes all results as CSV to stdout.
    """
    parser = argparse.ArgumentParser(
        description='Outputs a human readable model.')
    parser.add_argument('models',
                        metavar='FILE',
                        help='The saved models.',
                        nargs='+')
    parser.add_argument('--eval',
                        '-e',
                        metavar='EVALDATA',
                        default='comp',
                        choices=[
                            'disco', 'discotrain', 'discovalid', 'discotest',
                            'discotrainvalid', 'comp', 'compmod', 'comphead',
                            'schm280'
                        ],
                        help="The data set to evaluate against.")
    parser.add_argument('--vocab',
                        '-v',
                        metavar='FILE',
                        help='The vocab labels.')
    # Bug fix: this option used to be parsed but silently ignored in favor
    # of a hard-coded 1.0 threshold. The default now matches that old
    # hard-coded value so existing invocations behave identically.
    parser.add_argument(
        '--acc-thresh',
        type=float,
        default=1.0,
        help=
        "Don't include pairwise comparisons whose judgements are closer than this threshold."
    )
    args = parser.parse_args()

    vocab_labels = load_labels(args.vocab)
    # Invert id -> label into label -> id.
    vocab_labels = {w: i for i, w in vocab_labels.iteritems()}
    eval_tab = load_eval_table(vocab_labels, args.eval)

    model_evaluations = []
    for model in args.models:
        logging.info("Processing model '%s'..." % model)
        m = np.load(model)
        k = m['k']
        # Mean over the last 5 log-likelihoods, falling back to the
        # per-word bounds when log-likelihoods were not recorded.
        ll = np.mean('loglikelihoods' in m and m['loglikelihoods'][-5:]
                     or m['perwordbounds'][-5:])
        n_iter = m['max_iteration']  # renamed from `iter`: don't shadow the builtin
        time = np.sum(m['timediffs'])
        phi = np.ascontiguousarray(m['phi'])
        topic_normed = row_norm(phi)
        word_normed = col_norm(phi)

        lmid = topic_lmi(phi)

        model_eval = dict(k=k,
                          ll=ll,
                          iter=n_iter,
                          time=time,
                          alpha=m['alpha'],
                          eta=m['eta'],
                          mu=m['mu'],
                          eval=args.eval,
                          input=m['input_filename'])

        similarities = {}
        for i, pair in eval_tab.iterrows():
            try:
                left_id = vocab_labels[pair['left']]
                right_id = vocab_labels[pair['right']]
            except KeyError:
                # Skip pairs containing out-of-vocabulary words.
                continue

            pair_k = (pair['left'], pair['right'])
            right_given_left = np.dot(topic_normed[:, right_id],
                                      word_normed[:, left_id])
            left_given_right = np.dot(topic_normed[:, left_id],
                                      word_normed[:, right_id])
            jsdiv_sim = jsdiv(word_normed[:, right_id],
                              word_normed[:, left_id])
            symkldiv_sim = symkldiv(word_normed[:, right_id],
                                    word_normed[:, left_id])
            kldiv1 = kldiv(word_normed[:, right_id], word_normed[:, left_id])
            kldiv2 = kldiv(word_normed[:, left_id], word_normed[:, right_id])
            cos_lmi = cos(lmid[:, right_id], lmid[:, left_id])

            similarities[pair_k] = {
                'right': pair['right'],
                'left': pair['left'],
                'right|left': right_given_left,
                'left|right': left_given_right,
                'jsdiv': jsdiv_sim,
                'symkldiv': symkldiv_sim,
                'kldiv1': kldiv1,
                'kldiv2': kldiv2,
                'coslmi': cos_lmi,
                'human': pair['similarity'],
            }

        # let's compute spearman's rho for each of the measures:
        tmp = pd.DataFrame(similarities.values())
        # Loop variable renamed from `m`, which clobbered the loaded model.
        for measure in [
                'right|left', 'left|right', 'jsdiv', 'symkldiv', 'kldiv1',
                'kldiv2', 'coslmi'
        ]:
            rho, p = scipy.stats.spearmanr(tmp[measure], tmp['human'])
            model_eval['rho_' + measure] = rho
            model_eval['p_' + measure] = p
            model_eval['n'] = len(tmp[measure])

        # okay now let's do accuracy style measures
        # (dead accumulators kldiv1_correct/kldiv2_correct and the unused
        # `prod` were removed; they were never incremented or read.)
        baseline_correct = 0
        jsdiv_correct = 0
        symkldiv_correct = 0
        rightleft_correct = 0
        leftright_correct = 0
        lmicos_correct = 0
        pairs_compared = 0.0
        for (i, pair1), (j, pair2) in combinations(eval_tab.iterrows(), 2):
            # Skip ties and near-ties (judgements closer than --acc-thresh).
            if pair1['similarity'] == pair2['similarity'] or abs(
                    pair1['similarity'] - pair2['similarity']) < args.acc_thresh:
                continue

            try:
                pair1_k = (pair1['left'], pair1['right'])
                similarities1 = similarities[pair1_k]
                pair2_k = (pair2['left'], pair2['right'])
                similarities2 = similarities[pair2_k]
            except KeyError:
                # One of the pairs was OOV above.
                continue

            gold = pair1['similarity'] < pair2['similarity']

            pairs_compared += 1
            baseline_correct += (gold == 1)

            # Divergences are dissimilarities (greater = less similar); the
            # probability/cosine measures are similarities, hence < vs >.
            jsdiv_correct += (gold == (similarities1['jsdiv'] >
                                       similarities2['jsdiv']))
            symkldiv_correct += (gold == (similarities1['symkldiv'] >
                                          similarities2['symkldiv']))
            rightleft_correct += (gold == (similarities1['right|left'] <
                                           similarities2['right|left']))
            leftright_correct += (gold == (similarities1['left|right'] <
                                           similarities2['left|right']))
            lmicos_correct += (gold == (similarities1['coslmi'] <
                                        similarities2['coslmi']))

        # NOTE(review): this still divides by zero when no pair survives the
        # filters; kept to preserve the original failure mode.
        model_eval['filename'] = model
        model_eval['model_type'] = os.path.dirname(model)
        model_eval['acc_baseline'] = baseline_correct / pairs_compared
        model_eval['acc_jsdiv'] = jsdiv_correct / pairs_compared
        # Bug fix: acc_symkldiv previously reused jsdiv_correct.
        model_eval['acc_symkldiv'] = symkldiv_correct / pairs_compared
        model_eval['acc_right|left'] = rightleft_correct / pairs_compared
        model_eval['acc_left|right'] = leftright_correct / pairs_compared
        model_eval['acc_coslmi'] = lmicos_correct / pairs_compared

        model_evaluations.append(model_eval)

    pd.DataFrame(model_evaluations).to_csv(sys.stdout, index=False)
Exemplo n.º 8
0
from scipy.stats import spearmanr
from nicemodel import load_labels

def row_norm(a):
    """Normalize the rows of *a* so each sums to 1."""
    per_row = a.sum(axis=1)[:, np.newaxis]
    return a / per_row

def col_norm(a):
    """Normalize the columns of *a* so each sums to 1."""
    per_col = a.sum(axis=0)
    return a / per_col

# Script setup: load a saved model and the composition ratings table.
model_file = sys.argv[1]  # path to the saved .npz model, from the CLI
comp_file = '/home/01813/roller/tmp/imsgrounded/data/comp/comp-values_all_sorted.tsv'
target_labels_file = '/scratch/01813/roller/corpora/webko/TermDoc/target-labels.txt'

vocab_labels = load_labels(target_labels_file)
# Invert id -> label into label -> id.
vocab_labels = {w : i for i, w in vocab_labels.iteritems()}

from onlineldavb import dirichlet_expectation_2
phi = np.ascontiguousarray(np.load(model_file)['phi'])
#phi = np.exp(dirichlet_expectation_2(phi))

# NOTE(review): here topic_normed is COLUMN-normalized and word_normed is
# ROW-normalized -- the opposite assignment from the main() evaluators in
# this file; confirm which orientation this script's phi uses.
topic_normed = col_norm(phi)
word_normed = row_norm(phi)

comp_tab = pd.read_table(comp_file, encoding='utf-8')
# Drop degenerate rows where the compound is its own constituent.
comp_tab = comp_tab[comp_tab['const'] != comp_tab['compound']]

# Accumulators filled in later (presumably by code past this chunk).
compound = []
const = []
ratings = []
Exemplo n.º 9
0
def main():
    """Check how well model similarities predict free-association norms.

    Ranks every vocabulary word by similarity to each association target,
    then reports percentile statistics of where the human associates land.
    """
    parser = argparse.ArgumentParser(description='Checks for prediction of association norms.')
    parser.add_argument('--model', '-m', metavar='FILE',
                        help='The saved model.')
    parser.add_argument('--vocab', '-v', metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument('--features', '-f', metavar='FILE',
                        help='The feature labels.')
    #parser.add_argument('--docs', '-D', metavar='FILE',
    #                    help='Output the document distributions for these documents.')
    #parser.add_argument('--docids', '-d', metavar='FILE',
    #                    help='The document labels.')
    args = parser.parse_args()

    model = np.load(args.model)
    # Rows of phi become per-word distributions after transpose + row_norm.
    phi = row_norm(np.ascontiguousarray(model["phi"].T))
    #pi = safe_pi_read(args.model)

    label_vocab = load_labels(args.vocab)
    #docids = codecs.getreader('utf-8')(open(args.docids)).readlines()

    # Map noun labels (with the "/NN..." POS suffix stripped) to row index.
    phi_nn = { w[:w.rindex('/')] : i for i, w in label_vocab.iteritems() if '/NN' in w }

    # POS-stripped label -> all matching ids (mdict presumably maps one key
    # to multiple values -- TODO confirm against its definition).
    nopos_labels = mdict()
    for i, v in label_vocab.iteritems():
        nopos = v[:v.rindex('/')]
        nopos_labels[nopos] = i

    # assocs yields (target, associate, count) triples.
    assocs = load_associations()
    to_compute_similarities = list(set(t for t, a, c in assocs))

    ranked_sims = {}

    logging.info("compute similarities...")

    for z, w_i in enumerate(to_compute_similarities):
        if w_i not in phi_nn:
            continue
        i = phi_nn[w_i]
        w_i_dist = norm1(phi[i])
        # JS divergence of the target against every word in the vocabulary.
        similarities = np.array([cached_jsdiv(i, j, w_i_dist, w_j_dist) for j, w_j_dist in enumerate(phi)])
        ranked_sims[w_i] = percentile_ranked(similarities)
        logging.debug("%d / %d done." % (z + 1, len(to_compute_similarities)))

    logging.info("finished computing similarities.")

    measures = []
    oov_count = 0   # associations skipped because target or associate is OOV
    noov_count = 0  # associations actually scored
    for t, a, c in assocs:
        if t not in ranked_sims or a not in nopos_labels:
            oov_count += 1
            continue
        noov_count += 1
        ranked = ranked_sims[t]
        # Best percentile over all POS variants of the associate; weight by
        # the number of people who produced this association.
        m = max(ranked[i] for i in nopos_labels[a])
        measures += [m] * c

    measures = np.array(measures)
    print "mean: %f" % measures.mean()
    print "std: %f" % measures.std()
    print "oov: %d" % oov_count
    print "len(measures) = %d" % len(measures)
    print "# hit: %d" % noov_count
    print "Percentiles [.05, .10, .25, .5, .75, .90, .95] ="
    print "     [%.8f, %.8f, %.8f, %.8f, %.8f, %.8f, %.8f]" % tuple([scipy.stats.scoreatpercentile(measures, p) for p in [5, 10, 25, 50, 75, 90, 95]])
Exemplo n.º 10
0
def main():
    """Check how well model similarities predict free-association norms.

    Ranks every vocabulary word by similarity to each association target,
    then reports percentile statistics of where the human associates land.
    """
    parser = argparse.ArgumentParser(
        description='Checks for prediction of association norms.')
    parser.add_argument('--model',
                        '-m',
                        metavar='FILE',
                        help='The saved model.')
    parser.add_argument('--vocab',
                        '-v',
                        metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument('--features',
                        '-f',
                        metavar='FILE',
                        help='The feature labels.')
    #parser.add_argument('--docs', '-D', metavar='FILE',
    #                    help='Output the document distributions for these documents.')
    #parser.add_argument('--docids', '-d', metavar='FILE',
    #                    help='The document labels.')
    args = parser.parse_args()

    model = np.load(args.model)
    # Rows of phi become per-word distributions after transpose + row_norm.
    phi = row_norm(np.ascontiguousarray(model["phi"].T))
    #pi = safe_pi_read(args.model)

    label_vocab = load_labels(args.vocab)
    #docids = codecs.getreader('utf-8')(open(args.docids)).readlines()

    # Map noun labels (with the "/NN..." POS suffix stripped) to row index.
    phi_nn = {
        w[:w.rindex('/')]: i
        for i, w in label_vocab.iteritems() if '/NN' in w
    }

    # POS-stripped label -> all matching ids (mdict presumably maps one key
    # to multiple values -- TODO confirm against its definition).
    nopos_labels = mdict()
    for i, v in label_vocab.iteritems():
        nopos = v[:v.rindex('/')]
        nopos_labels[nopos] = i

    # assocs yields (target, associate, count) triples.
    assocs = load_associations()
    to_compute_similarities = list(set(t for t, a, c in assocs))

    ranked_sims = {}

    logging.info("compute similarities...")

    for z, w_i in enumerate(to_compute_similarities):
        if w_i not in phi_nn:
            continue
        i = phi_nn[w_i]
        w_i_dist = norm1(phi[i])
        # JS divergence of the target against every word in the vocabulary.
        similarities = np.array([
            cached_jsdiv(i, j, w_i_dist, w_j_dist)
            for j, w_j_dist in enumerate(phi)
        ])
        ranked_sims[w_i] = percentile_ranked(similarities)
        logging.debug("%d / %d done." % (z + 1, len(to_compute_similarities)))

    logging.info("finished computing similarities.")

    measures = []
    oov_count = 0   # associations skipped because target or associate is OOV
    noov_count = 0  # associations actually scored
    for t, a, c in assocs:
        if t not in ranked_sims or a not in nopos_labels:
            oov_count += 1
            continue
        noov_count += 1
        ranked = ranked_sims[t]
        # Best percentile over all POS variants of the associate; weight by
        # the number of people who produced this association.
        m = max(ranked[i] for i in nopos_labels[a])
        measures += [m] * c

    measures = np.array(measures)
    print "mean: %f" % measures.mean()
    print "std: %f" % measures.std()
    print "oov: %d" % oov_count
    print "len(measures) = %d" % len(measures)
    print "# hit: %d" % noov_count
    print "Percentiles [.05, .10, .25, .5, .75, .90, .95] ="
    print "     [%.8f, %.8f, %.8f, %.8f, %.8f, %.8f, %.8f]" % tuple([
        scipy.stats.scoreatpercentile(measures, p)
        for p in [5, 10, 25, 50, 75, 90, 95]
    ])
Exemplo n.º 11
0
def main():
    """Renumber multimodal tokens of an Andrews-format corpus.

    Keeps only chunks containing a ',' (word,feature tokens), remaps their
    word ids to a dense 1-based id space, and writes the new vocab mapping
    alongside the converted corpus.
    """
    parser = argparse.ArgumentParser(
        description='Stochastically adds features to a corpus.')
    parser.add_argument('--vocab',
                        '-v',
                        metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument(
        '--input',
        '-i',
        metavar='FILE',
        help='The input corpus (in Andrews format). Must be multimodal.')
    parser.add_argument('--output',
                        '-o',
                        metavar='FILE',
                        help='The output corpus (in Andrews format).')
    parser.add_argument(
        '--outvocab',
        '-V',
        metavar='FILE',
        help='The output vocab labels; necessary for OOV processing later.')

    args = parser.parse_args()
    vocab_labels = load_labels(args.vocab)

    logging.info("First pass; gathering statistics.")
    # Stream the file to count lines instead of materializing it with
    # readlines().
    inpt = open(args.input)
    numlines = sum(1 for _ in inpt)
    inpt.close()

    output_labels = {}
    output_labels_file = utfopenwrite(args.outvocab)

    logging.info("Starting second pass; actually writing output.")
    output = open(args.output, 'w', 1024 * 1024)
    inpt = open(args.input)
    try:
        for lno, line in enumerate(inpt, 1):
            if lno % 1000 == 0:
                logging.info("Processing doc# %d/%d (%4.1f%%)" %
                             (lno, numlines, 100 * float(lno) / numlines))

            outline = []
            for chunk in itersplit(line, ' '):
                chunk = chunk.rstrip()
                if not chunk:
                    continue
                if ',' not in chunk:
                    continue  # strip just words
                # chunk looks like "<wid>,<rest>"; keep the tail verbatim.
                idx = chunk.index(',')
                wid = int(chunk[:idx])
                rest = chunk[idx:]

                if wid not in output_labels:
                    # First sighting: assign the next dense (1-based) id and
                    # record the mapping in the output vocab file.
                    output_labels[wid] = len(output_labels) + 1
                    output_labels_file.write("%d\t" % output_labels[wid])
                    output_labels_file.write(vocab_labels[wid])
                    output_labels_file.write("\n")
                outline.append(str(output_labels[wid]) + rest)

            if outline:
                output.write(' '.join(outline))
                output.write('\n')
    finally:
        inpt.close()
        output.close()
        # Bug fix: the vocab file was never closed before, risking a
        # truncated mapping on interpreter exit.
        output_labels_file.close()