Example #1
def test_line_list():
    line = ["This", "is", "an", "example", "line", "."]
    tokens = tokenize.tokenize(line)
    assert tokens
    assert (isinstance(tokens, list)
            and all(isinstance(tok, Token) for tok in tokens))
    assert [tok.text for tok in tokens] == line

def test_token_title_alnum():
    token = tokenize.tokenize(["Test0"])[0]
    obs_features = parse_utils.get_token_features_base(token)
    exp_features = {
        "idx": 0,
        "len": 5,
        "shape": "Xxxxd",
        "prefix": "T",
        "suffix": "st0",
        "is_alpha": False,
        "is_digit": False,
        "is_lower": False,
        "is_upper": False,
        "is_title": True,
        "is_punct": False,
        "is_left_punct": False,
        "is_right_punct": False,
        "is_bracket": False,
        "is_quote": False,
        "is_space": False,
        "like_num": False,
        "like_url": False,
        "like_email": False,
        "is_stop": False,
        "is_alnum": True,
        "is_newline": False,
        "is_partial_digit": True,
        "is_partial_punct": False,
    }
    assert obs_features == exp_features

def test_token_upper_with_punct():
    token = tokenize.tokenize(["VICE-VERSA"])[0]
    obs_features = parse_utils.get_token_features_base(token)
    exp_features = {
        "idx": 0,
        "len": 10,
        "shape": "XXXX-XXXX",
        "prefix": "V",
        "suffix": "RSA",
        "is_alpha": False,
        "is_digit": False,
        "is_lower": False,
        "is_upper": True,
        "is_title": False,
        "is_punct": False,
        "is_left_punct": False,
        "is_right_punct": False,
        "is_bracket": False,
        "is_quote": False,
        "is_space": False,
        "like_num": False,
        "like_url": False,
        "like_email": False,
        "is_stop": False,
        "is_alnum": False,
        "is_newline": False,
        "is_partial_digit": False,
        "is_partial_punct": True,
    }
    assert obs_features == exp_features
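
The expected "shape" values above ("Xxxxd", "XXXX-XXXX") map uppercase letters to "X", lowercase letters to "x", and digits to "d", leaving other characters unchanged. The real feature extraction lives in parse_utils (and the underlying tokenizer); the snippet below is only a minimal stand-in consistent with the expected values in these tests, and it ignores any run-length truncation the real implementation may apply.

def word_shape(text):
    """Minimal stand-in for the "shape" feature: X=upper, x=lower, d=digit."""
    return "".join(
        "X" if ch.isupper() else "x" if ch.islower() else "d" if ch.isdigit() else ch
        for ch in text
    )

assert word_shape("Test0") == "Xxxxd"
assert word_shape("VICE-VERSA") == "XXXX-XXXX"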
Example #4
def test_date_range_split():
    line = "I worked there 2012–2015."
    exp_texts = ["I", "worked", "there", "2012", "–", "2015", "."]
    tokens = tokenize.tokenize(line)
    assert tokens
    assert (isinstance(tokens, list)
            and all(isinstance(tok, Token) for tok in tokens))
    assert [tok.text for tok in tokens] == exp_texts
Example #5
def test_phone_number_merge():
    line = "Call me at 555-123-4567 ASAP."
    exp_texts = ["Call", "me", "at", "555-123-4567", "ASAP", "."]
    tokens = tokenize.tokenize(line)
    assert tokens
    assert (isinstance(tokens, list)
            and all(isinstance(tok, Token) for tok in tokens))
    assert [tok.text for tok in tokens] == exp_texts
Example #6
def generate_labeled_tokens(templates,
                            fields,
                            *,
                            n=1,
                            fixed_val_field_keys=None):
    """
    Generate one or many fake examples by combining fields arranged as in ``templates``
    with values and default labels specified by ``fields``.

    Args:
        templates (List[str] or List[Callable])
        fields (Dict[str, Tuple[Callable, str]])
        n (int)
        fixed_val_field_keys (str or Set[str])

    Yields:
        List[Tuple[str, str]]
    """
    fixed_val_field_keys = utils.to_collection(fixed_val_field_keys, str, set)
    for template in rnd.choices(templates, k=n):
        if callable(template):
            template = template()
        template_fields = regexes.RE_TEMPLATE_FIELD.findall(template)
        field_keys = []
        field_labels = []
        field_vals = []
        const_field_vals = {}
        for key, label, prob in template_fields:
            if prob and rnd.random() > float(prob):
                continue
            field_key = key if "|" not in key else rnd.choice(key.split("|"))
            field_label = label or fields[field_key][1]
            if fixed_val_field_keys and field_key in fixed_val_field_keys:
                field_value = const_field_vals.setdefault(
                    field_label,
                    fields[field_key][0](),
                )
            else:
                field_value = fields[field_key][0]()
            field_keys.append(field_key)
            field_labels.append(field_label)
            field_vals.append(field_value)
        tok_labels = [(tok.text, label)
                      for val, label in zip(field_vals, field_labels)
                      for tok in tokenize.tokenize(val)]
        yield tok_labels
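
generate_labeled_tokens depends on the project's RE_TEMPLATE_FIELD placeholder regex, field value generators, and tokenize.tokenize, none of which are shown here. Below is a simplified, self-contained sketch of the same idea (pick a template at random, fill each placeholder with a generated value, and emit (token text, label) pairs), using a made-up {key}, {key:label}, {key:label:prob} placeholder syntax and whitespace splitting purely for illustration; it omits the key alternation ("a|b") and fixed-value handling of the real function.

import random
import re

# Hypothetical placeholder syntax: {key}, {key:label}, or {key:label:prob}.
RE_FIELD = re.compile(r"\{(\w+)(?::(\w+))?(?::([\d.]+))?\}")

# Each field maps to a (value generator, default label) pair, as in the real ``fields``.
FIELDS = {
    "name": (lambda: random.choice(["Jane Doe", "John Smith"]), "name"),
    "email": (lambda: random.choice(["jane@example.com", "js@example.net"]), "email"),
}
TEMPLATES = ["{name} {email:email:0.5}"]

def generate(templates, fields, *, n=1):
    for template in random.choices(templates, k=n):
        tok_labels = []
        for key, label, prob in RE_FIELD.findall(template):
            if prob and random.random() > float(prob):
                continue  # probabilistically drop optional fields
            value_gen, default_label = fields[key]
            label = label or default_label
            # whitespace splitting stands in for tokenize.tokenize()
            tok_labels.extend((tok, label) for tok in value_gen().split())
        yield tok_labels

for example in generate(TEMPLATES, FIELDS, n=2):
    print(example)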
Example #7
def main():
    parser = argparse.ArgumentParser(
        description="Script to train a CRF parser on section-specific texts.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    add_arguments(parser)
    args = parser.parse_args()

    LOGGER.setLevel(args.loglevel)

    if args.params_filepath:
        params = next(
            fileio.load_json(args.params_filepath.resolve(), lines=False))
    else:
        params = {}
    trainer = pycrfsuite.Trainer(algorithm=args.algorithm,
                                 params=params,
                                 verbose=args.verbose)

    module = importlib.import_module(args.module_name)

    all_feature_label_pairs = []
    labeled_lines = fileio.load_json(module.FPATH_TRAINING_DATA, lines=True)
    for labeled_line in labeled_lines:
        labels = [label for _, label in labeled_line]
        token_strs = [token for token, _ in labeled_line]
        tokens = tokenize.tokenize(token_strs)
        features = module.parse.featurize(tokens)
        all_feature_label_pairs.append((features, labels))

    if args.test_size == 0.0:
        holdout = -1
        for features, labels in all_feature_label_pairs:
            trainer.append(features, labels, group=0)
    else:
        holdout = 1
        for features, labels in all_feature_label_pairs:
            group = 0 if random.random() >= args.test_size else 1
            trainer.append(features, labels, group=group)

    LOGGER.info("training CRF model with the following params:\n%s",
                trainer.get_params())
    trainer.train(str(module.FPATH_TAGGER), holdout=holdout)
    LOGGER.info("saved trained model settings to %s", module.FPATH_TAGGER)
Example #8
def parse_lines(lines, tagger=None):
    """
    Parse a sequence of text lines belonging to the "basics" section of a résumé
    to produce structured data in the form of :class:`schemas.ResumeBasicsSchema`
    using trained Conditional Random Field (CRF) taggers.

    Args:
        lines (List[str])
        tagger (:class:`pycrfsuite.Tagger`)

    Returns:
        Dict[str, obj]
    """
    if tagger is None:
        tagger = parse_utils.load_tagger(basics.FPATH_TAGGER)

    tokens = tokenize.tokenize("\n".join(lines).strip())
    features = featurize(tokens)
    labeled_tokens = parse_utils.tag(tokens, features, tagger=tagger)
    data = _parse_labeled_tokens(labeled_tokens)
    return data
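
_parse_labeled_tokens is not shown here; conceptually it folds the predicted (token, label) pairs into the fields of the schema. A purely illustrative version of that folding step (the label names, the "other" filler label, and the join-by-space rule are assumptions, not the project's actual logic) might look like:

from collections import defaultdict

def group_labeled_tokens(labeled_tokens):
    """Illustrative only: collect token texts under each predicted label."""
    grouped = defaultdict(list)
    for tok_text, label in labeled_tokens:
        if label != "other":  # hypothetical filler label
            grouped[label].append(tok_text)
    return {label: " ".join(toks) for label, toks in grouped.items()}

print(group_labeled_tokens([
    ("Jane", "name"), ("Doe", "name"), ("jane@example.com", "email"),
]))
# => {'name': 'Jane Doe', 'email': 'jane@example.com'}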
Example #9
def main():
    parser = argparse.ArgumentParser(
        description=(
            "Manually label distinct lines of tokenized text by a specified labeling scheme, "
            "then save the collection of labeled lines in a .jsonl file."
        ),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    add_arguments(parser)
    args = parser.parse_args()

    LOGGER.setLevel(args.loglevel)
    module = importlib.import_module(args.module_name)

    fpath_training_data = module.FPATH_TRAINING_DATA
    if fpath_training_data.is_file():
        labeled_lines = list(fileio.load_json(
            fpath_training_data, lines=True))
        seen_tokenized_lines = {
            tuple(tok_text for tok_text, _ in line)
            for line in labeled_lines
        }
    else:
        labeled_lines = []
        seen_tokenized_lines = set()

    n_labeled_lines = len(labeled_lines)
    LOGGER.info("loaded %s labeled lines from %s", n_labeled_lines, fpath_training_data)

    labels = args.labels or module.LABELS
    print_help(labels)

    unlabeled_lines = list(fileio.load_text(
        args.unlabeled_data.resolve(), lines=True))
    n = len(unlabeled_lines)
    for i, line in enumerate(unlabeled_lines):
        tokens = tuple(
            tok if isinstance(tok, str) else tok.text
            for tok in tokenize.tokenize(line)
        )
        if not tokens:
            LOGGER.debug("line \"%s\" doesn't have any tokens; skipping...", line)
            continue
        elif tokens in seen_tokenized_lines:
            LOGGER.debug("line \"%s\" already labeled; skipping...", line)
            continue
        else:
            try:
                print("\n{}".format("-" * 64))
                print("{} / {}".format(i, n))
                labeled_line = label_line(line, tokens, labels)
            except StopIteration:
                break
        seen_tokenized_lines.add(tokens)
        labeled_lines.append(labeled_line)

    if len(labeled_lines) > n_labeled_lines:
        fileio.save_json(fpath_training_data, labeled_lines, lines=True)
        LOGGER.info(
            "saved %s labeled lines to %s",
            len(labeled_lines), fpath_training_data,
        )
    else:
        LOGGER.info("no additional lines labeled")