import os
import re
import sys
from collections import defaultdict

import yaml


def yaml_iter():
    # Relies on `input_lines`, `n_data`, `read_line_document`, and `process`
    # being available from the enclosing scope/module.
    for i, line in enumerate(input_lines, 1):
        sys.stdout.write("Aligned {:8.4f}% \r".format(100 * i / n_data))
        sys.stdout.flush()
        doc = read_line_document(line)
        res = process(doc)
        yield (doc.filename, res)
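# A minimal sketch (not part of the original scripts) of how yaml_iter() might
# be consumed: yaml.dump_all writes one YAML document per (filename, result)
# pair, which is the multi-document format the formatting script below reads
# back with yaml.load_all. The function name and "alignments.yaml" path are
# placeholders.
def write_alignments_sketch(alignments_path="alignments.yaml"):
    with open(alignments_path, "w") as f:
        yaml.dump_all(yaml_iter(), f)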
def load_documents(documents_path):
    name2doc = dict()
    docs = list()
    print("Loading documents from {} ...".format(documents_path))
    with open(documents_path, "r") as f:
        for i, line in enumerate(f, 1):
            sys.stdout.write("{:10d} documents read.\r".format(i))
            sys.stdout.flush()
            doc = read_line_document(line)
            name2doc[doc.filename] = doc
            docs.append(doc)
    print("")
    return docs, name2doc
def main():
    import argparse
    hlp = "Compute input vocabulary."
    parser = argparse.ArgumentParser(description=hlp)
    parser.add_argument("--documents", required=True,
                        help="Path to preprocessed documents.")
    parser.add_argument("--output", required=True,
                        help="Path to write vocab.")
    parser.add_argument("--size", required=True, type=int,
                        help="Number of most frequent vocab words to keep.")
    parser.add_argument(
        "--special", nargs="+",
        default=["<E>", "<D>", "<S>", "<B>", "__UNK__", "__ENT__"])
    args = parser.parse_args()
    assert args.size > 0

    counts = defaultdict(int)

    vocab_dir = os.path.dirname(args.output)
    if vocab_dir != "" and not os.path.exists(vocab_dir):
        os.makedirs(vocab_dir)

    with open(args.documents, "r") as f:
        for i, line in enumerate(f, 1):
            sys.stdout.write("\rRead {:7d} documents".format(i))
            sys.stdout.flush()
            doc = read_line_document(line)
            for tokens in doc.highlights:
                for token in tokens:
                    # `ents` (the named-entity tags to exclude) is expected to
                    # be defined at module level.
                    if token.ne not in ents:
                        # Collapse digits to "D" to reduce sparsity.
                        counts[re.sub(r"\d", "D", token.lower())] += 1

    # dict.items() returns a view in Python 3, so sort with sorted() instead
    # of list.sort().
    sorted_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    vocab = args.special + [w for w, c in sorted_counts[:args.size]]

    with open(args.output, "w") as f:
        f.write("\n".join(vocab))
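# read_vocab is called by the formatting script below but is not defined in
# this excerpt. A plausible sketch, assuming the vocab file is one token per
# line (as written by the vocabulary script above) and that ids follow line
# order; the real implementation may differ.
def read_vocab_sketch(path):
    id2vocab = []
    vocab2id = {}
    with open(path, "r") as f:
        for idx, line in enumerate(f):
            token = line.rstrip("\n")
            id2vocab.append(token)
            vocab2id[token] = idx
    return id2vocab, vocab2id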
def main():
    import argparse
    help_msg = "Format data for sentence generation task using neural " \
               "network implemented in Torch/Lua."
    parser = argparse.ArgumentParser(description=help_msg)
    parser.add_argument("--documents", required=True,
                        help="Path to preprocessed data.")
    parser.add_argument("--alignments", required=True,
                        help="Path to alignments data.")
    parser.add_argument("--output", required=True,
                        help="File to write data.")
    parser.add_argument("--input-vocab", required=True,
                        help="Path to input vocab.")
    parser.add_argument("--output-vocab", required=True,
                        help="Path to output vocab.")
    parser.add_argument("--entity-mode", required=True,
                        choices=["1-tag", "3-tags"])
    args = parser.parse_args()

    output_dir = os.path.dirname(args.output)
    if output_dir != "" and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Reading alignments from {} ...".format(args.alignments))
    name2alignments = dict()
    with open(args.alignments, "r") as f:
        # Newer PyYAML versions require an explicit Loader.
        for i, (filename, alignments) in enumerate(
                yaml.load_all(f, Loader=yaml.FullLoader), 1):
            sys.stdout.write("\rRead {:7d} alignments".format(i))
            sys.stdout.flush()
            name2alignments[filename] = alignments
    print("")

    print("Reading input vocab from {} ...".format(args.input_vocab))
    id2vocab_in, vocab2id_in = read_vocab(args.input_vocab)
    print("Reading output vocab from {} ...".format(args.output_vocab))
    id2vocab_out, vocab2id_out = read_vocab(args.output_vocab)

    print("Reading documents from {} ...".format(args.documents))
    print("Writing data to {} ...".format(args.output))
    with open(args.documents, "r") as f, open(args.output, "w") as o:
        for i, line in enumerate(f, 1):
            sys.stdout.write("\rRead {:7d} documents".format(i))
            sys.stdout.flush()
            doc = read_line_document(line)
            if doc.filename not in name2alignments:
                print("\nSkipping {}, no alignment found.".format(
                    doc.filename))
                continue
            alignments = name2alignments[doc.filename]
            dls = process_example(doc, alignments, vocab2id_in, id2vocab_in,
                                  vocab2id_out, id2vocab_out,
                                  args.entity_mode)
            for dl in dls:
                o.write(dl)
    print("")
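# A hypothetical entry point for the formatting script above; the original
# file may already define an equivalent guard outside this excerpt. Example
# invocation (script name and paths are placeholders):
#   python format_data.py --documents docs.txt --alignments alignments.yaml \
#       --output train.dat --input-vocab vocab.in.txt \
#       --output-vocab vocab.out.txt --entity-mode 1-tag
if __name__ == "__main__":
    main()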