예제 #1
0
def read_jsonl(path, _log, _run, name='test', encoding='utf-8', lower=True):
    """Lazily yield one ``Document`` per line of a JSONL file.

    Args:
        path: Path of the JSONL file to read.
        _log: Logger-like object; only ``info`` is called on it.
        _run: Run-like object; ``add_resource(path)`` is called on it when the
            module-level ``SAVE_FILES`` flag is truthy (presumably a Sacred
            run -- confirm against the caller).
        name: Label for the dataset, used only in the log message.
        encoding: Text encoding used to open the file.
        lower: Forwarded to ``Document.from_mapping`` as ``lower``.

    Yields:
        The result of ``Document.from_mapping`` for each line, in file order.

    Note:
        This is a generator, so nothing (including the log call) happens
        until the first item is requested.
    """
    _log.info('Reading %s JSONL file from %s', name, path)
    # Register the resource before reading rather than after the loop:
    # code placed after the last ``yield`` only runs when the consumer
    # exhausts the generator, so an early ``break`` or a parse error would
    # silently skip the registration.
    if SAVE_FILES:
        _run.add_resource(path)
    with open(path, encoding=encoding) as f:
        for line in f:
            yield Document.from_mapping(json.loads(line.strip()), lower=lower)
예제 #2
0
def main(args):
    """Convert every record of a JSONL file via ``write_neuralsum_oracle``.

    Args:
        args: Namespace-like object providing ``output_dir``, ``path``,
            ``encoding`` and ``lower``.

    The output directory is created if missing. Each line of ``args.path``
    is parsed as JSON, turned into a ``Document`` and handed to
    ``write_neuralsum_oracle`` together with the output directory.
    """
    os.makedirs(args.output_dir, exist_ok=True)
    encoding = args.encoding
    with open(args.path, encoding=encoding) as jsonl_file:
        for raw_line in jsonl_file:
            record = json.loads(raw_line.strip())
            document = Document.from_mapping(record, lower=args.lower)
            write_neuralsum_oracle(document, args.output_dir, encoding=encoding)
예제 #3
0
파일: corpus.py 프로젝트: MinhajulMU/mysum
def read_jsonl(path, _log, _run, name='test', encoding='utf-8', lower=True, remove_puncts=True,
               replace_digits=True, stopwords_path=None):
    """Lazily yield one preprocessed ``Document`` per line of a JSONL file.

    Args:
        path: Path of the JSONL file to read.
        _log: Logger-like object; only ``info`` is called on it.
        _run: Run-like object; ``add_resource(path)`` is called on it when the
            module-level ``SAVE_FILES`` flag is truthy.
        name: Label for the dataset, used only in the log message.
        encoding: Text encoding used to open the file.
        lower: Forwarded to ``Document.from_mapping``.
        remove_puncts: Forwarded to ``Document.from_mapping``.
        replace_digits: Forwarded to ``Document.from_mapping``.
        stopwords_path: Optional path passed to ``read_stopwords``; when
            ``None`` no stopwords are loaded and ``None`` is forwarded.

    Yields:
        The result of ``Document.from_mapping`` for each line, in file order.
    """
    _log.info('Reading %s JSONL file from %s', name, path)
    if SAVE_FILES:
        _run.add_resource(path)

    if stopwords_path is None:
        loaded_stopwords = None
    else:
        loaded_stopwords = read_stopwords(stopwords_path)

    # The same keyword arguments apply to every record, so build them once.
    preprocessing = {
        'lower': lower,
        'remove_puncts': remove_puncts,
        'replace_digits': replace_digits,
        'stopwords': loaded_stopwords,
    }

    with open(path, encoding=encoding) as jsonl_file:
        for raw_line in jsonl_file:
            record = json.loads(raw_line.strip())
            yield Document.from_mapping(record, **preprocessing)
예제 #4
0
def main(args):
    """Label the sentences of every document in a JSONL file and print them.

    Args:
        args: Namespace-like object providing ``path``, ``encoding``,
            ``max_workers`` and ``verbose``.

    All lines are parsed into ``Document`` objects first; ``label_sentences``
    is then mapped over them with an ``Executor``. Each resulting document is
    printed to stdout as JSON with sorted keys. With ``args.verbose`` set,
    the score returned alongside each document is written to stderr.

    Raises:
        RuntimeError: If a line cannot be parsed or converted. The message is
            prefixed with the 1-based line number and the original exception
            is attached as ``__cause__``.
    """
    docs = []
    with open(args.path, encoding=args.encoding) as f:
        for linum, line in enumerate(f, start=1):
            try:
                obj = json.loads(line.strip())
                docs.append(Document.from_mapping(obj))
            except Exception as e:
                # Broad catch is deliberate: any failure on this line is
                # re-raised with its line number. Chain explicitly so the
                # traceback marks the original error as the direct cause.
                message = f'line {linum}: {e}'
                raise RuntimeError(message) from e

    with Executor(max_workers=args.max_workers) as ex:
        results = ex.map(label_sentences, docs)
        for best_rouge, doc in results:
            print(json.dumps(doc.to_dict(), sort_keys=True))
            if args.verbose:
                print(f'ROUGE-1-F: {best_rouge:.2f}', file=sys.stderr)
예제 #5
0
def main(args):
    """Tokenize every record of a JSONL file and print the documents as JSONL.

    Args:
        args: Namespace-like object providing ``path``, ``encoding``,
            ``max_workers``, ``chunk_size`` and ``discard_long_summary``.

    All lines are parsed as JSON first. ``tokenize_obj`` is then mapped over
    the parsed objects in a process pool, with a blank spaCy pipeline for
    language code ``'id'`` bound as its first argument. Each result becomes a
    ``Document``. With ``args.discard_long_summary`` set, documents for which
    ``has_long_summary`` is truthy are dropped. The rest are printed to
    stdout, one JSON object with sorted keys per line.

    Raises:
        RuntimeError: If a line is not valid JSON. The message is prefixed
            with the 1-based line number and the original exception is
            attached as ``__cause__``.
    """
    objs = []
    with open(args.path, encoding=args.encoding) as f:
        for linum, line in enumerate(f, start=1):
            try:
                objs.append(json.loads(line.strip()))
            except Exception as e:
                # Re-raise with the line number; chain explicitly so the
                # traceback marks the parse error as the direct cause.
                message = f'line {linum}: {e}'
                raise RuntimeError(message) from e

    nlp = spacy.blank('id')
    with ProcessPoolExecutor(max_workers=args.max_workers) as exc:
        tok_objs = exc.map(partial(tokenize_obj, nlp), objs, chunksize=args.chunk_size)
        docs = [Document.from_mapping(obj) for obj in tok_objs]
        if args.discard_long_summary:
            docs = [doc for doc in docs if not has_long_summary(doc)]
        # Serialize everything before printing so a failure emits nothing.
        lines = [json.dumps(doc.to_dict(), sort_keys=True) for doc in docs]
        # With no documents, print nothing: joining an empty list would emit
        # a lone blank line, which is not a valid JSONL record.
        if lines:
            print('\n'.join(lines))