import argparse
import logging


def main():
    parser = argparse.ArgumentParser(
        description='Outputs a human readable model.')
    parser.add_argument('--input', '-i', metavar='FILE',
                        help='The input corpus (in Andrews format).')
    parser.add_argument('--output', '-o', metavar='FILE',
                        help='The output corpus (in Andrews format).')
    args = parser.parse_args()

    inpt = open(args.input)
    output = open(args.output, 'w')
    for i, line in enumerate(inpt.readlines()):
        line = line.rstrip()
        output_lst = []
        for w in itersplit(line, ' '):
            if not w.strip():
                continue
            if ',' not in w:
                continue
            # Keep only the portion after the comma (drop the word id).
            output_lst.append(w[w.index(',') + 1:])
        output.write(" ".join(output_lst) + "\n")
        logging.info("Document %d: %d words written." % (i, len(output_lst)))
    inpt.close()
    output.close()
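# All of the scripts in this section call an itersplit() helper that is
# defined elsewhere in the project and not shown here. A minimal sketch of
# the assumed behavior: lazily yield the delimiter-separated pieces of a
# string, so very long corpus lines never have to be materialized as one big
# list. The name and exact semantics below are assumptions, not the
# project's actual code.
def itersplit(s, delim):
    """Yield the pieces of `s` separated by `delim`, one at a time.

    Mirrors str.split(delim), including empty pieces between adjacent
    delimiters, which is why callers skip blank chunks themselves.
    """
    start = 0
    while True:
        end = s.find(delim, start)
        if end == -1:
            yield s[start:]
            return
        yield s[start:end]
        start = end + len(delim)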
import argparse
import logging
from collections import Counter


def main():
    parser = argparse.ArgumentParser(
        description='Stochastically adds features to a corpus.')
    parser.add_argument('--vocab', '-v', metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument('--input', '-i', metavar='FILE',
                        help='The input corpus (in Andrews format).')
    parser.add_argument('--output', '-o', metavar='FILE',
                        help='The output corpus (in Andrews format).')
    parser.add_argument('--features', '-f', metavar='FILE',
                        help='The (dense) vector space of features.')
    args = parser.parse_args()

    vocab_labels = load_labels(args.vocab)
    features = load_features(args.features)
    feature_map = word_ids_to_features(vocab_labels, features)

    logging.info("First pass; gathering statistics.")
    inpt = utfopen(args.input)
    numlines = len(inpt.readlines())
    inpt.close()

    logging.info("Starting second pass; actually writing output.")
    output = open(args.output, 'w', 1024 * 1024)
    inpt = utfopen(args.input)
    for lno, line in enumerate(inpt.readlines(), 1):
        if lno % 1000 == 0:
            logging.info("Processing doc# %d/%d (%4.1f%%)" %
                         (lno, numlines, 100 * float(lno) / numlines))
        for chunk in itersplit(line, ' '):
            chunk = chunk.rstrip()
            if not chunk:
                continue
            idx = chunk.rindex(":")
            wid, cnt = chunk[:idx], chunk[idx + 1:]
            if wid not in feature_map:
                # No feature distribution for this word; pass it through.
                output.write(chunk + ' ')
            else:
                # Draw one feature per occurrence of the word, then
                # aggregate the draws into (word, feature) counts.
                cnt = int(cnt)
                dist = feature_map[wid]
                cnts = Counter(stochastic_choice(dist) for i in xrange(cnt))
                for fid, fcnt in cnts.iteritems():
                    output.write('%s,%d:%d ' % (wid, fid, fcnt))
        output.write('\n')
    inpt.close()
    output.close()
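# stochastic_choice() above is a project helper that is not shown. A minimal
# sketch, under the assumption that each feature_map entry is a dense vector
# of non-negative (possibly unnormalized) weights and that the sampled value
# is the chosen feature's index:
import random


def stochastic_choice(dist):
    """Draw an index i with probability dist[i] / sum(dist)."""
    r = random.random() * sum(dist)
    acc = 0.0
    for i, w in enumerate(dist):
        acc += w
        if r < acc:
            return i
    return len(dist) - 1  # guard against floating point round-off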
import argparse
import logging


def main():
    # Note: the original description ('Stochastically adds features to a
    # corpus.') looked copy-pasted from the feature-sampling script; this
    # one strips word-only entries and remaps the remaining vocabulary ids.
    parser = argparse.ArgumentParser(
        description='Strips word-only entries from a multimodal corpus and '
                    'remaps the remaining vocabulary ids.')
    parser.add_argument('--vocab', '-v', metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument('--input', '-i', metavar='FILE',
                        help='The input corpus (in Andrews format). '
                             'Must be multimodal.')
    parser.add_argument('--output', '-o', metavar='FILE',
                        help='The output corpus (in Andrews format).')
    parser.add_argument('--outvocab', '-V', metavar='FILE',
                        help='The output vocab labels; necessary for OOV '
                             'processing later.')
    args = parser.parse_args()

    vocab_labels = load_labels(args.vocab)

    logging.info("First pass; gathering statistics.")
    inpt = open(args.input)
    numlines = len(inpt.readlines())
    inpt.close()

    output_labels = {}
    output_labels_file = utfopenwrite(args.outvocab)

    logging.info("Starting second pass; actually writing output.")
    output = open(args.output, 'w', 1024 * 1024)
    inpt = open(args.input)
    for lno, line in enumerate(inpt.readlines(), 1):
        if lno % 1000 == 0:
            logging.info("Processing doc# %d/%d (%4.1f%%)" %
                         (lno, numlines, 100 * float(lno) / numlines))
        outline = []
        for chunk in itersplit(line, ' '):
            chunk = chunk.rstrip()
            if not chunk:
                continue
            if ',' not in chunk:
                continue  # strip chunks that are just words
            idx = chunk.index(',')
            wid = int(chunk[:idx])
            rest = chunk[idx:]
            if wid not in output_labels:
                # First time we see this word: assign it the next dense id
                # and record the mapping in the output vocab file.
                output_labels[wid] = len(output_labels) + 1
                output_labels_file.write("%d\t" % output_labels[wid])
                output_labels_file.write(vocab_labels[wid])
                output_labels_file.write("\n")
            outline.append(str(output_labels[wid]) + rest)
        if outline:
            output.write(' '.join(outline))
            output.write('\n')
    inpt.close()
    output.close()
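# utfopen()/utfopenwrite(), used by the scripts above, are assumed to be
# thin UTF-8 wrappers around codecs.open() (the usual Python 2 idiom for
# encoded file I/O). A sketch of that assumption; the real helpers may
# differ:
import codecs


def utfopen(path):
    """Open a file for reading, decoding its contents as UTF-8."""
    return codecs.open(path, 'r', encoding='utf-8')


def utfopenwrite(path):
    """Open a file for writing, encoding output as UTF-8."""
    return codecs.open(path, 'w', encoding='utf-8')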