# Stochastically splices features into an Andrews-format corpus: every
# "wid:cnt" chunk whose word id has a feature distribution is replaced by
# "wid,fid:cnt" chunks, with feature ids sampled from that distribution.
# load_labels, load_features, word_ids_to_features, utfopen, itersplit, and
# stochastic_choice are project helpers defined elsewhere.

import argparse
import logging
from collections import Counter


def main():
    parser = argparse.ArgumentParser(
        description='Stochastically adds features to a corpus.')
    parser.add_argument('--vocab', '-v', metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument('--input', '-i', metavar='FILE',
                        help='The input corpus (in Andrews format).')
    parser.add_argument('--output', '-o', metavar='FILE',
                        help='The output corpus (in Andrews format).')
    parser.add_argument('--features', '-f', metavar='FILE',
                        help='The (dense) vector space of features.')
    args = parser.parse_args()

    vocab_labels = load_labels(args.vocab)
    features = load_features(args.features)
    feature_map = word_ids_to_features(vocab_labels, features)

    logging.info("First pass; gathering statistics.")
    inpt = utfopen(args.input)
    numlines = len(inpt.readlines())
    inpt.close()

    logging.info("Starting second pass; actually writing output.")
    output = open(args.output, 'w', 1024 * 1024)
    inpt = utfopen(args.input)
    for lno, line in enumerate(inpt.readlines(), 1):
        if lno % 1000 == 0:
            logging.info("Processing doc# %d/%d (%4.1f%%)" %
                         (lno, numlines, 100 * float(lno) / numlines))
        for chunk in itersplit(line, ' '):
            chunk = chunk.rstrip()
            if not chunk:
                continue
            idx = chunk.rindex(":")
            wid, cnt = chunk[:idx], chunk[idx + 1:]
            if wid not in feature_map:
                # No feature distribution for this word; copy it through.
                output.write(chunk + ' ')
            else:
                cnt = int(cnt)
                dist = feature_map[wid]
                # Sample a feature id for each occurrence of the word.
                cnts = Counter(stochastic_choice(dist) for _ in xrange(cnt))
                for fid, fcnt in cnts.iteritems():
                    output.write('%s,%d:%d ' % (wid, fid, fcnt))
        output.write('\n')
    inpt.close()
    output.close()
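# `stochastic_choice` is not defined in this file. A minimal sketch, assuming
# from the call site that it draws one index from a discrete distribution
# given as a 1-d array of probabilities; the implementation below is a guess,
# not the project's own:

import numpy as np


def stochastic_choice(dist):
    """Sample an index i with probability dist[i]."""
    r = np.random.random()
    acc = 0.0
    for i, p in enumerate(dist):
        acc += p
        if r < acc:
            return i
    return len(dist) - 1  # guard against floating-point rounding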
# Strips bare word tokens from a multimodal Andrews-format corpus, keeping
# only the "wid,fid:cnt" feature chunks. Surviving word ids are renumbered
# compactly and the new id -> label mapping is written out for later OOV
# processing.

import argparse
import logging


def main():
    parser = argparse.ArgumentParser(
        description='Strips bare words from a multimodal corpus.')
    parser.add_argument('--vocab', '-v', metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument('--input', '-i', metavar='FILE',
                        help='The input corpus (in Andrews format). '
                             'Must be multimodal.')
    parser.add_argument('--output', '-o', metavar='FILE',
                        help='The output corpus (in Andrews format).')
    parser.add_argument('--outvocab', '-V', metavar='FILE',
                        help='The output vocab labels; necessary for OOV '
                             'processing later.')
    args = parser.parse_args()

    vocab_labels = load_labels(args.vocab)

    logging.info("First pass; gathering statistics.")
    inpt = open(args.input)
    numlines = len(inpt.readlines())
    inpt.close()

    output_labels = {}
    output_labels_file = utfopenwrite(args.outvocab)

    logging.info("Starting second pass; actually writing output.")
    output = open(args.output, 'w', 1024 * 1024)
    inpt = open(args.input)
    for lno, line in enumerate(inpt.readlines(), 1):
        if lno % 1000 == 0:
            logging.info("Processing doc# %d/%d (%4.1f%%)" %
                         (lno, numlines, 100 * float(lno) / numlines))
        outline = []
        for chunk in itersplit(line, ' '):
            chunk = chunk.rstrip()
            if not chunk:
                continue
            if ',' not in chunk:
                continue  # a bare word with no feature id; strip it
            idx = chunk.index(',')
            wid = int(chunk[:idx])
            rest = chunk[idx:]
            if wid not in output_labels:
                # First time we see this word: assign the next compact id
                # and record its label.
                output_labels[wid] = len(output_labels) + 1
                output_labels_file.write("%d\t" % output_labels[wid])
                output_labels_file.write(vocab_labels[wid])
                output_labels_file.write("\n")
            outline.append(str(output_labels[wid]) + rest)
        if outline:
            output.write(' '.join(outline))
            output.write('\n')
    inpt.close()
    output.close()
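# `utfopen`, `utfopenwrite`, and `itersplit` are project helpers defined
# elsewhere. Minimal sketches, assuming from the call sites (and from the
# codecs idiom used in the association-norm script below) that the first two
# are UTF-8 codec wrappers around open() and that itersplit lazily splits a
# string on a delimiter:

import codecs


def utfopen(path):
    return codecs.getreader('utf-8')(open(path, 'rb'))


def utfopenwrite(path):
    return codecs.getwriter('utf-8')(open(path, 'wb'))


def itersplit(s, delim):
    """Yield the pieces of s split on delim, without building a list."""
    start = 0
    while True:
        idx = s.find(delim, start)
        if idx == -1:
            yield s[start:]
            return
        yield s[start:idx]
        start = idx + len(delim)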
# Correlates topic-model similarities with human compositionality ratings:
# for each compound/constituent pair, look up how similar the model finds the
# two words and compare against the mean rating with Spearman's rho.
# COMP_FILE, load_labels, row_norm, calc_similarities, and percentile_ranked
# are defined elsewhere in the project.

import logging
from itertools import chain

import numpy as np
import pandas as pd
from scipy.stats import spearmanr


def main():
    comps = pd.read_table(COMP_FILE)
    comps = comps[comps.compound != comps.const]
    # Interleave compounds and constituents; we need similarities for both.
    calcsims = list(chain(*zip(comps['compound'], comps['const'])))

    label_vocab = load_labels("target-labels.txt")
    # Nouns only, keyed by the label with its POS tag stripped.
    phi_nn = {w[:w.rindex('/')]: i
              for i, w in label_vocab.iteritems() if '/NN' in w}

    model = np.load("model_250.npy.npz")
    phi = row_norm(model["phi"].T)

    ranked_sims = {}
    done = set()
    for z, word in enumerate(calcsims):
        if word in done or word not in phi_nn:
            continue
        done.add(word)
        i = phi_nn[word]
        w_dist = phi[i]
        sims = calc_similarities(w_dist, phi)
        # Rank-transform the similarities so scores are comparable across
        # words (as in the association-norm script below).
        ranked_sims[word] = percentile_ranked(sims)
        logging.info("Done with %d/%d [%s]" % (z + 1, len(calcsims), word))

    ratings_compound = []
    ratings_const = []
    gold = []
    for compound, const, mean in zip(comps.compound, comps.const,
                                     comps['mean']):
        if compound not in ranked_sims or const not in ranked_sims:
            continue
        ratings_compound.append(ranked_sims[compound][phi_nn[const]])
        ratings_const.append(ranked_sims[const][phi_nn[compound]])
        gold.append(mean)

    print ratings_compound
    print ratings_const
    print gold
    print spearmanr(ratings_compound, gold)
    print spearmanr(ratings_const, gold)
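# `percentile_ranked` is shared by several of these scripts but not defined
# here. A minimal sketch, assuming (from how its output is indexed) that it
# maps each score to its percentile rank within the array:

import numpy as np
from scipy.stats import rankdata


def percentile_ranked(sims):
    """Map each value to its percentile rank in (0, 1]."""
    return rankdata(sims) / float(len(sims))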
# Evaluates saved topic models against human similarity judgements: Spearman
# correlation and pairwise-ordering accuracy for several similarity measures
# derived from the topic-word matrix phi. load_labels, load_eval_table,
# row_norm, col_norm, topic_lmi, jsdiv, symkldiv, kldiv, and cos come from
# elsewhere in the project.

import argparse
import logging
import os
import sys
from itertools import combinations

import numpy as np
import pandas as pd
import scipy.stats


def main():
    parser = argparse.ArgumentParser(
        description='Evaluates saved models against human similarity '
                    'judgements.')
    parser.add_argument('models', metavar='FILE', nargs='+',
                        help='The saved models.')
    parser.add_argument('--eval', '-e', metavar='EVALDATA', default='comp',
                        choices=['disco', 'discotrain', 'discovalid',
                                 'discotest', 'discotrainvalid', 'comp',
                                 'compmod', 'comphead', 'schm280'],
                        help='The data set to evaluate against.')
    parser.add_argument('--vocab', '-v', metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument('--acc-thresh', type=float, default=0,
                        help="Don't include pairwise comparisons whose "
                             "judgements are closer than this threshold.")
    args = parser.parse_args()

    vocab_labels = load_labels(args.vocab)
    vocab_labels = {w: i for i, w in vocab_labels.iteritems()}
    eval_tab = load_eval_table(vocab_labels, args.eval)

    model_evaluations = []
    for model in args.models:
        logging.info("Processing model '%s'..." % model)
        m = np.load(model)
        k = m['k']
        ll = np.mean(m['loglikelihoods'][-5:] if 'loglikelihoods' in m
                     else m['perwordbounds'][-5:])
        iteration = m['max_iteration']
        time = np.sum(m['timediffs'])
        phi = np.ascontiguousarray(m['phi'])
        topic_normed = row_norm(phi)
        word_normed = col_norm(phi)
        lmid = topic_lmi(phi)

        model_eval = dict(k=k, ll=ll, iter=iteration, time=time,
                          alpha=m['alpha'], eta=m['eta'], mu=m['mu'],
                          eval=args.eval, input=m['input_filename'])

        similarities = {}
        for i, pair in eval_tab.iterrows():
            try:
                left_id = vocab_labels[pair['left']]
                right_id = vocab_labels[pair['right']]
            except KeyError:
                continue
            pair_k = (pair['left'], pair['right'])
            right_given_left = np.dot(topic_normed[:, right_id],
                                      word_normed[:, left_id])
            left_given_right = np.dot(topic_normed[:, left_id],
                                      word_normed[:, right_id])
            jsdiv_sim = jsdiv(word_normed[:, right_id],
                              word_normed[:, left_id])
            symkldiv_sim = symkldiv(word_normed[:, right_id],
                                    word_normed[:, left_id])
            kldiv1 = kldiv(word_normed[:, right_id], word_normed[:, left_id])
            kldiv2 = kldiv(word_normed[:, left_id], word_normed[:, right_id])
            cos_lmi = cos(lmid[:, right_id], lmid[:, left_id])
            similarities[pair_k] = {
                'right': pair['right'],
                'left': pair['left'],
                'right|left': right_given_left,
                'left|right': left_given_right,
                'jsdiv': jsdiv_sim,
                'symkldiv': symkldiv_sim,
                'kldiv1': kldiv1,
                'kldiv2': kldiv2,
                'coslmi': cos_lmi,
                'human': pair['similarity'],
            }

        # Spearman's rho for each of the measures:
        tmp = pd.DataFrame(similarities.values())
        for measure in ['right|left', 'left|right', 'jsdiv', 'symkldiv',
                        'kldiv1', 'kldiv2', 'coslmi']:
            rho, p = scipy.stats.spearmanr(tmp[measure], tmp['human'])
            model_eval['rho_' + measure] = rho
            model_eval['p_' + measure] = p
        model_eval['n'] = len(tmp)

        # Accuracy-style measures: over all pairs of items, does the model
        # order them the same way the human judgements do?
        baseline_correct = 0
        jsdiv_correct = 0
        symkldiv_correct = 0
        kldiv1_correct = 0
        kldiv2_correct = 0
        rightleft_correct = 0
        leftright_correct = 0
        lmicos_correct = 0
        pairs_compared = 0.0
        for (i, pair1), (j, pair2) in combinations(eval_tab.iterrows(), 2):
            if (pair1['similarity'] == pair2['similarity'] or
                    abs(pair1['similarity'] - pair2['similarity']) <
                    args.acc_thresh):
                continue
            try:
                similarities1 = similarities[(pair1['left'], pair1['right'])]
                similarities2 = similarities[(pair2['left'], pair2['right'])]
            except KeyError:
                continue
            gold = pair1['similarity'] < pair2['similarity']
            pairs_compared += 1
            # Baseline: always guess that the second pair is more similar.
            baseline_correct += (gold == 1)
            # Divergences are dissimilarities, so their comparison flips.
            jsdiv_correct += (gold == (similarities1['jsdiv'] >
                                       similarities2['jsdiv']))
            symkldiv_correct += (gold == (similarities1['symkldiv'] >
                                          similarities2['symkldiv']))
            kldiv1_correct += (gold == (similarities1['kldiv1'] >
                                        similarities2['kldiv1']))
            kldiv2_correct += (gold == (similarities1['kldiv2'] >
                                        similarities2['kldiv2']))
            rightleft_correct += (gold == (similarities1['right|left'] <
                                           similarities2['right|left']))
            leftright_correct += (gold == (similarities1['left|right'] <
                                           similarities2['left|right']))
            lmicos_correct += (gold == (similarities1['coslmi'] <
                                        similarities2['coslmi']))

        model_eval['filename'] = model
        model_eval['model_type'] = os.path.dirname(model)
        model_eval['acc_baseline'] = baseline_correct / pairs_compared
        model_eval['acc_jsdiv'] = jsdiv_correct / pairs_compared
        model_eval['acc_symkldiv'] = symkldiv_correct / pairs_compared
        model_eval['acc_kldiv1'] = kldiv1_correct / pairs_compared
        model_eval['acc_kldiv2'] = kldiv2_correct / pairs_compared
        model_eval['acc_right|left'] = rightleft_correct / pairs_compared
        model_eval['acc_left|right'] = leftright_correct / pairs_compared
        model_eval['acc_coslmi'] = lmicos_correct / pairs_compared
        model_evaluations.append(model_eval)

    pd.DataFrame(model_evaluations).to_csv(sys.stdout, index=False)
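# jsdiv, symkldiv, kldiv, and cos above are imported from elsewhere in the
# project; topic_lmi (an LMI weighting of phi) likewise and is not sketched
# here. Minimal sketches assuming the standard definitions (the divergences
# assume q > 0 wherever p > 0):

import numpy as np


def kldiv(p, q):
    """KL(p || q) for dense discrete distributions."""
    mask = p > 0
    return np.sum(p[mask] * np.log(p[mask] / q[mask]))


def symkldiv(p, q):
    """Symmetrized KL: KL(p || q) + KL(q || p)."""
    return kldiv(p, q) + kldiv(q, p)


def jsdiv(p, q):
    """Jensen-Shannon divergence."""
    m = (p + q) / 2.0
    return 0.5 * kldiv(p, m) + 0.5 * kldiv(q, m)


def cos(u, v):
    """Cosine similarity."""
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))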
# Fragment: sets up a topic model and the compound compositionality table for
# a similarity/rating comparison; the script breaks off after initializing
# its accumulators.

import sys

import numpy as np
import pandas as pd
from scipy.stats import spearmanr

from nicemodel import load_labels


def row_norm(a):
    row_sums = a.sum(axis=1)
    return a / row_sums[:, np.newaxis]


def col_norm(a):
    col_sums = a.sum(axis=0)
    return a / col_sums


model_file = sys.argv[1]
comp_file = '/home/01813/roller/tmp/imsgrounded/data/comp/comp-values_all_sorted.tsv'
target_labels_file = '/scratch/01813/roller/corpora/webko/TermDoc/target-labels.txt'

vocab_labels = load_labels(target_labels_file)
vocab_labels = {w: i for i, w in vocab_labels.iteritems()}

from onlineldavb import dirichlet_expectation_2

phi = np.ascontiguousarray(np.load(model_file)['phi'])
#phi = np.exp(dirichlet_expectation_2(phi))
topic_normed = col_norm(phi)
word_normed = row_norm(phi)

comp_tab = pd.read_table(comp_file, encoding='utf-8')
comp_tab = comp_tab[comp_tab['const'] != comp_tab['compound']]

compound = []
const = []
ratings = []
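# Quick sanity check of row_norm / col_norm on a toy matrix: after
# normalization, the rows (respectively columns) each sum to 1.

import numpy as np

a = np.array([[1.0, 3.0],
              [2.0, 2.0]])
assert np.allclose(row_norm(a).sum(axis=1), 1.0)
assert np.allclose(col_norm(a).sum(axis=0), 1.0)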
# Checks whether topic-model similarities predict free-association norms:
# for each cue word, where does each human associate fall in the cue's
# percentile-ranked similarity list? load_labels, load_associations,
# row_norm, norm1, mdict, cached_jsdiv, and percentile_ranked are project
# helpers defined elsewhere.

import argparse
import logging

import numpy as np
import scipy.stats


def main():
    parser = argparse.ArgumentParser(
        description='Checks for prediction of association norms.')
    parser.add_argument('--model', '-m', metavar='FILE',
                        help='The saved model.')
    parser.add_argument('--vocab', '-v', metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument('--features', '-f', metavar='FILE',
                        help='The feature labels.')
    #parser.add_argument('--docs', '-D', metavar='FILE',
    #                    help='Output the document distributions for these documents.')
    #parser.add_argument('--docids', '-d', metavar='FILE',
    #                    help='The document labels.')
    args = parser.parse_args()

    model = np.load(args.model)
    phi = row_norm(np.ascontiguousarray(model["phi"].T))
    #pi = safe_pi_read(args.model)
    label_vocab = load_labels(args.vocab)
    #docids = codecs.getreader('utf-8')(open(args.docids)).readlines()

    # Nouns only, keyed by the label with its POS tag stripped.
    phi_nn = {w[:w.rindex('/')]: i
              for i, w in label_vocab.iteritems() if '/NN' in w}
    # One POS-stripped label can map to several tagged vocab ids.
    nopos_labels = mdict()
    for i, v in label_vocab.iteritems():
        nopos = v[:v.rindex('/')]
        nopos_labels[nopos] = i

    assocs = load_associations()
    to_compute_similarities = list(set(t for t, a, c in assocs))

    ranked_sims = {}
    logging.info("compute similarities...")
    for z, w_i in enumerate(to_compute_similarities):
        if w_i not in phi_nn:
            continue
        i = phi_nn[w_i]
        w_i_dist = norm1(phi[i])
        similarities = np.array([cached_jsdiv(i, j, w_i_dist, w_j_dist)
                                 for j, w_j_dist in enumerate(phi)])
        ranked_sims[w_i] = percentile_ranked(similarities)
        logging.debug("%d / %d done." % (z + 1, len(to_compute_similarities)))
    logging.info("finished computing similarities.")

    measures = []
    oov_count = 0
    noov_count = 0
    for t, a, c in assocs:
        if t not in ranked_sims or a not in nopos_labels:
            oov_count += 1
            continue
        noov_count += 1
        ranked = ranked_sims[t]
        # An associate may map to several tagged labels; take the best rank,
        # weighted by how many subjects gave this associate.
        m = max(ranked[i] for i in nopos_labels[a])
        measures += [m] * c
    measures = np.array(measures)

    print "mean: %f" % measures.mean()
    print "std: %f" % measures.std()
    print "oov: %d" % oov_count
    print "len(measures) = %d" % len(measures)
    print "# hit: %d" % noov_count
    print "Percentiles [.05, .10, .25, .5, .75, .90, .95] ="
    print "  [%.8f, %.8f, %.8f, %.8f, %.8f, %.8f, %.8f]" % tuple(
        [scipy.stats.scoreatpercentile(measures, p)
         for p in [5, 10, 25, 50, 75, 90, 95]])
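# `mdict`, `norm1`, and `cached_jsdiv` are project helpers not shown in this
# file. Minimal sketches, assuming from the call sites that `mdict` is a
# multi-valued dict (assignment accumulates values in a set), `norm1`
# rescales a vector to sum to 1, and `cached_jsdiv` memoizes a symmetric
# jsdiv (like the one sketched earlier) on the index pair:


class mdict(dict):
    """Multi-valued dict: d[k] = v accumulates values in a set."""
    def __setitem__(self, key, value):
        # dict.setdefault inserts at the C level, bypassing this override.
        self.setdefault(key, set()).add(value)


def norm1(v):
    """Rescale a vector so it sums to 1."""
    return v / v.sum()


_jsdiv_cache = {}


def cached_jsdiv(i, j, p, q):
    """Memoized JS divergence, keyed on the (unordered) index pair."""
    key = (min(i, j), max(i, j))
    if key not in _jsdiv_cache:
        _jsdiv_cache[key] = jsdiv(p, q)
    return _jsdiv_cache[key]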