# Assumes `import json`, the project's `Document` class and the `SAVE_FILES`
# flag at module level; `_log` and `_run` are presumably injected by Sacred.
def read_jsonl(path, _log, _run, name='test', encoding='utf-8', lower=True):
    """Read a JSONL dataset file and lazily yield `Document` objects."""
    _log.info('Reading %s JSONL file from %s', name, path)
    with open(path, encoding=encoding) as f:
        for line in f:
            yield Document.from_mapping(json.loads(line.strip()), lower=lower)
    # Register the input file as a Sacred resource once it has been read.
    if SAVE_FILES:
        _run.add_resource(path)
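
The `_log`/`_run` parameters and the `_run.add_resource` call point to the Sacred experiment framework, which injects those special arguments into captured functions. Below is a minimal sketch of that wiring; the ingredient name and the captured function are made up for illustration and are not taken from the repository.

from sacred import Ingredient

# Hypothetical ingredient; the real experiment/ingredient setup lives elsewhere.
data_ingredient = Ingredient('data')

@data_ingredient.capture
def describe(path, _log, _run):
    # Sacred fills in `_log` (a logger) and `_run` (the current Run) automatically.
    _log.info('Using dataset at %s', path)
    _run.add_resource(path)
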
import json
import os

# `Document` and `write_neuralsum_oracle` are helpers defined elsewhere in the project.

def main(args):
    """Convert a JSONL dataset into per-document NeuralSum oracle files."""
    os.makedirs(args.output_dir, exist_ok=True)
    with open(args.path, encoding=args.encoding) as f:
        for line in f:
            doc = Document.from_mapping(json.loads(line.strip()), lower=args.lower)
            write_neuralsum_oracle(doc, args.output_dir, encoding=args.encoding)
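
For context, here is one way the command-line interface for this script could look. The flag names simply mirror the `args` attributes used above (`path`, `output_dir`, `encoding`, `lower`); they are an assumption, not the repository's actual parser.

import argparse

if __name__ == '__main__':
    # Hypothetical CLI wiring; flag names and defaults are illustrative only.
    parser = argparse.ArgumentParser(description='Write NeuralSum oracle files from a JSONL dataset.')
    parser.add_argument('path', help='path to the input JSONL file')
    parser.add_argument('-o', '--output-dir', required=True, help='directory to write the oracle files to')
    parser.add_argument('--encoding', default='utf-8', help='file encoding')
    parser.add_argument('--no-lower', dest='lower', action='store_false', help='do not lowercase tokens')
    main(parser.parse_args())
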
# Assumes `import json` plus the project's `Document`, `read_stopwords` and
# `SAVE_FILES` names at module level, as in the reader above.
def read_jsonl(path, _log, _run, name='test', encoding='utf-8', lower=True,
               remove_puncts=True, replace_digits=True, stopwords_path=None):
    """Read a JSONL dataset file and yield preprocessed `Document` objects."""
    _log.info('Reading %s JSONL file from %s', name, path)
    if SAVE_FILES:
        _run.add_resource(path)
    # Load the stopword list once, before streaming the documents.
    stopwords = None if stopwords_path is None else read_stopwords(stopwords_path)
    with open(path, encoding=encoding) as f:
        for line in f:
            yield Document.from_mapping(
                json.loads(line.strip()), lower=lower, remove_puncts=remove_puncts,
                replace_digits=replace_digits, stopwords=stopwords)
import json
import sys

# `Document` and `label_sentences` come from the project; `Executor` is expected
# to be a concurrent.futures executor class (e.g. ProcessPoolExecutor) bound
# elsewhere in the module.

def main(args):
    """Label each document's sentences with an extractive oracle and print the results as JSONL."""
    docs = []
    with open(args.path, encoding=args.encoding) as f:
        for linum, line in enumerate(f):
            try:
                obj = json.loads(line.strip())
                docs.append(Document.from_mapping(obj))
            except Exception as e:
                # Report the offending (1-based) line number before bailing out.
                message = f'line {linum+1}: {e}'
                raise RuntimeError(message)
    with Executor(max_workers=args.max_workers) as ex:
        results = ex.map(label_sentences, docs)
        for best_rouge, doc in results:
            print(json.dumps(doc.to_dict(), sort_keys=True))
            if args.verbose:
                print(f'ROUGE-1-F: {best_rouge:.2f}', file=sys.stderr)
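
`label_sentences` itself is defined elsewhere in the project; judging by the ROUGE-1-F score it returns alongside each labelled document, it computes an extractive oracle. The sketch below shows the general greedy ROUGE-1 oracle technique on plain token lists; `rouge1_f` and `greedy_oracle` are made-up helpers for illustration, not the repository's implementation.

from collections import Counter

def rouge1_f(candidate, reference):
    """Unigram-overlap F1 between two token lists."""
    if not candidate or not reference:
        return 0.0
    overlap = sum((Counter(candidate) & Counter(reference)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(candidate)
    recall = overlap / len(reference)
    return 2 * precision * recall / (precision + recall)

def greedy_oracle(sentences, reference, max_sents=3):
    """Greedily pick sentence indices that maximise ROUGE-1 F1 against the reference summary."""
    selected, picked_tokens, best = [], [], 0.0
    while len(selected) < max_sents:
        candidates = [(rouge1_f(picked_tokens + sent, reference), i)
                      for i, sent in enumerate(sentences) if i not in selected]
        if not candidates:
            break
        score, i = max(candidates)
        if score <= best:
            break  # no remaining sentence improves the score
        best, picked_tokens = score, picked_tokens + sentences[i]
        selected.append(i)
    return best, sorted(selected)

# Example with made-up tokens: pick up to 2 sentences that best cover the reference.
# best, idxs = greedy_oracle([['a', 'b', 'c'], ['d', 'e']], ['a', 'd'], max_sents=2)
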
import json
from concurrent.futures import ProcessPoolExecutor
from functools import partial

import spacy

# `Document`, `tokenize_obj` and `has_long_summary` are helpers from the project.

def main(args):
    """Tokenize every JSON object in the input file in parallel and print the results as JSONL."""
    objs = []
    with open(args.path, encoding=args.encoding) as f:
        for linum, line in enumerate(f):
            try:
                objs.append(json.loads(line.strip()))
            except Exception as e:
                message = f'line {linum+1}: {e}'
                raise RuntimeError(message)
    # A blank Indonesian pipeline provides just the tokenizer, which is all we need here.
    nlp = spacy.blank('id')
    with ProcessPoolExecutor(max_workers=args.max_workers) as exc:
        tok_objs = exc.map(partial(tokenize_obj, nlp), objs, chunksize=args.chunk_size)
        docs = [Document.from_mapping(obj) for obj in tok_objs]
    if args.discard_long_summary:
        docs = [doc for doc in docs if not has_long_summary(doc)]
    print('\n'.join(json.dumps(doc.to_dict(), sort_keys=True) for doc in docs))
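
A quick aside on the `spacy.blank('id')` call above: a blank pipeline contains only the language-specific tokenization rules, with no tagger or parser, which is what makes it cheap enough to ship to worker processes. A tiny illustration, with a made-up example sentence:

import spacy

nlp = spacy.blank('id')          # blank Indonesian pipeline: tokenizer only
doc = nlp('Ini sebuah contoh kalimat.')
print([token.text for token in doc])
# -> ['Ini', 'sebuah', 'contoh', 'kalimat', '.']
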