Пример #1
0
def load_passage(filename, annotate=False):
    """Load a single passage from ``filename``.

    Sets ``WIKIFIER.enabled`` to False, then reads ``filename`` through
    ``ioutil.read_files_and_dirs`` (one attempt, no delay) using every
    converter in ``FROM_FORMAT`` with ``annotate`` bound as a keyword.

    :param filename: passed unchanged to ``ioutil.read_files_and_dirs``
    :param annotate: forwarded to each converter as ``annotate=...``
    :return: the first item produced by the reader; if it produces nothing,
             the reader's (empty) result object itself
    """
    WIKIFIER.enabled = False
    bound_converters = dict(
        (fmt, partial(convert, annotate=annotate)) for fmt, convert in FROM_FORMAT.items()
    )
    loaded = ioutil.read_files_and_dirs(filename, converters=bound_converters, attempts=1, delay=0)
    for first in loaded:
        # Only the first passage is wanted; the rest are never consumed
        return first
    # Nothing was read: hand back the empty result rather than raising
    return loaded
Пример #2
0
            pass
    return 0


def get_eval_type(scores):
    """Return ``UNLABELED`` if ``Config().is_unlabeled(scores.format)`` holds, else ``LABELED``.

    :param scores: object exposing a ``format`` attribute
    """
    if Config().is_unlabeled(scores.format):
        return UNLABELED
    return LABELED


# Tags passages read from plain text with format "text", so that we don't accidentally train on them
def from_text_format(*args, **kwargs):
    """Yield the passages produced by ``from_text``, each with ``extra["format"]`` set to ``"text"``.

    All positional and keyword arguments are forwarded to ``from_text`` unchanged.
    """
    def mark_as_text(passage):
        passage.extra["format"] = "text"
        return passage

    # map() is lazy, so each passage is tagged only when the consumer pulls it
    yield from map(mark_as_text, from_text(*args, **kwargs))


# Every FROM_FORMAT converter gets annotate=True bound; afterwards the "" and "txt"
# entries are pointed at from_text_format (replacing any converter already under those keys)
CONVERTERS = dict((fmt, partial(convert, annotate=True)) for fmt, convert in FROM_FORMAT.items())
CONVERTERS.update({"": from_text_format, "txt": from_text_format})


def read_passages(args, files):
    """Expand glob patterns in ``files`` and read the results with ``ioutil.read_files_and_dirs``.

    :param args: object providing the ``sentences`` and ``paragraphs`` attributes forwarded to the reader
    :param files: iterable of filenames or glob patterns
    :return: whatever ``ioutil.read_files_and_dirs`` returns for the expanded list
    """
    expanded = []
    for pattern in files:
        matches = sorted(glob(pattern))
        # A pattern that matches nothing is kept verbatim instead of being dropped
        expanded.extend(matches if matches else (pattern,))
    return ioutil.read_files_and_dirs(
        expanded,
        sentences=args.sentences,
        paragraphs=args.paragraphs,
        converters=CONVERTERS,
        lang=Config().args.lang,
    )


def filter_passages_for_bert(passages, args):
    from pytorch_pretrained_bert import BertTokenizer
    is_uncased_model = "uncased" in args.bert_model
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=is_uncased_model)
    for passage in passages:
        text = [node.text for node in passage.layer(layer0.LAYER_ID).all]
Пример #3
0
            pass
    return 0


def get_eval_type(scores):
    """Return ``UNLABELED`` if ``Config().is_unlabeled(scores.format)`` holds, else ``LABELED``.

    :param scores: object exposing a ``format`` attribute
    """
    if Config().is_unlabeled(scores.format):
        return UNLABELED
    return LABELED


# Tags passages read from plain text with format "text", so that we don't accidentally train on them
def from_text_format(*args, **kwargs):
    """Yield the passages produced by ``from_text``, each with ``extra["format"]`` set to ``"text"``.

    All positional and keyword arguments are forwarded to ``from_text`` unchanged.
    """
    def mark_as_text(passage):
        passage.extra["format"] = "text"
        return passage

    # map() is lazy, so each passage is tagged only when the consumer pulls it
    yield from map(mark_as_text, from_text(*args, **kwargs))


# Every FROM_FORMAT converter gets annotate=True bound; afterwards the "" and "txt"
# entries are pointed at from_text_format (replacing any converter already under those keys)
CONVERTERS = dict((fmt, partial(convert, annotate=True)) for fmt, convert in FROM_FORMAT.items())
CONVERTERS.update({"": from_text_format, "txt": from_text_format})


def read_passages(args, files):
    """Expand glob patterns in ``files`` and read the results with ``ioutil.read_files_and_dirs``.

    :param args: object providing the ``sentences`` and ``paragraphs`` attributes forwarded to the reader
    :param files: iterable of filenames or glob patterns
    :return: whatever ``ioutil.read_files_and_dirs`` returns for the expanded list
    """
    expanded = []
    for pattern in files:
        matches = sorted(glob(pattern))
        # A pattern that matches nothing is kept verbatim instead of being dropped
        expanded.extend(matches if matches else (pattern,))
    return ioutil.read_files_and_dirs(
        expanded,
        sentences=args.sentences,
        paragraphs=args.paragraphs,
        converters=CONVERTERS,
        lang=Config().args.lang,
    )


# noinspection PyTypeChecker,PyStringFormat
def main_generator():
    args = Config().args
    assert args.passages or args.train, "Either passages or --train is required (use -h for help)"
    assert args.models or args.train or args.folds, "Either --model or --train or --folds is required"
    assert not (args.train or args.dev) or not args.folds, "--train and --dev are incompatible with --folds"
Пример #4
0
def open_out_file(spec, name=None):
    """Open the UTF-8 ``.conllu`` output file for ``spec`` inside ``spec.out_dir``.

    :param spec: object with ``join`` and ``out_dir`` attributes
    :param name: base name of the output file; only used when ``spec.join`` is falsy
    :return: an open text file; the caller is responsible for closing it
    """
    joined = spec.join
    if not joined:
        # One file per name, truncated on open
        return open(os.path.join(spec.out_dir, name + ".conllu"), "w", encoding="utf-8")
    # Joined output: append to a single file, adding the extension only if it is missing
    target = joined if joined.endswith(".conllu") else joined + ".conllu"
    return open(os.path.join(spec.out_dir, target), "a", encoding="utf-8")


def _conllu_converter(convert):
    """Wrap ``convert`` so each passage it produces is passed through ``to_conllu_native``.

    Taking ``convert`` as a parameter binds it per wrapper. A bare ``lambda`` inside the
    dict comprehension would look the converter up only when called, by which time the
    comprehension variable holds the LAST converter, so every format would use it.

    :param convert: a ``FROM_FORMAT`` converter, called as ``convert(l, passage_id=None)``
    :return: a one-argument callable returning a generator of ``to_conllu_native`` results
    """
    def converter(l):
        return (to_conllu_native(p) for p in convert(l, passage_id=None))
    return converter


CONVERTERS = {f: _conllu_converter(c) for f, c in FROM_FORMAT.items()}
# If getting CoNLL-U as input, don't bother converting just to convert back
CONVERTERS["conllu"] = split_by_empty_lines


def main(args):
    for spec in read_specs(args, converters=CONVERTERS):
        scores = []
        sentences, to_parse = tee(
            (to_conllu_native(p),
             to_conllu_native(p, test=True, enhanced=False)
             ) if isinstance(p, core.Passage) else (p, strip_enhanced(p))
            for p in spec.passages)
        t = tqdm(zip((x for x, _ in sentences),
                     split_by_empty_lines(
Пример #5
0
        "--label-map",
        help=
        "CSV file specifying mapping of input edge labels to output edge labels"
    )
    add_boolean_option(argparser,
                       "node-ids",
                       "print tikz code rather than showing plots",
                       short="i")
    argparser.add_argument("-f",
                           "--format",
                           choices=("png", "svg"),
                           default="png",
                           help="image format")
    args = argparser.parse_args()

    FROM_FORMAT = {k: partial(v, **vars(args)) for k, v in FROM_FORMAT.items()}
    if args.out_dir:
        os.makedirs(args.out_dir, exist_ok=True)
    for passage in get_passages_with_progress_bar(args.passages,
                                                  desc="Visualizing",
                                                  converters=FROM_FORMAT):
        map_labels(passage, args.label_map)
        if args.normalize:
            normalize(passage, extra=args.extra_normalization)
        if args.tikz:
            tikz = visualization.tikz(passage, node_ids=args.node_ids)
            if args.out_dir:
                with open(os.path.join(args.out_dir, passage.ID + ".tikz.txt"),
                          "w") as f:
                    print(tikz, file=f)
            else: