def test_line_list(self):
    line = ["This", "is", "an", "example", "line", "."]
    tokens = tokenize.tokenize(line)
    assert tokens
    assert (
        isinstance(tokens, list)
        and all(isinstance(tok, Token) for tok in tokens)
    )
    assert [tok.text for tok in tokens] == line
def test_token_title_alnum(self):
    token = tokenize.tokenize(["Test0"])[0]
    obs_features = parse_utils.get_token_features_base(token)
    exp_features = {
        "idx": 0,
        "len": 5,
        "shape": "Xxxxd",
        "prefix": "T",
        "suffix": "st0",
        "is_alpha": False,
        "is_digit": False,
        "is_lower": False,
        "is_upper": False,
        "is_title": True,
        "is_punct": False,
        "is_left_punct": False,
        "is_right_punct": False,
        "is_bracket": False,
        "is_quote": False,
        "is_space": False,
        "like_num": False,
        "like_url": False,
        "like_email": False,
        "is_stop": False,
        "is_alnum": True,
        "is_newline": False,
        "is_partial_digit": True,
        "is_partial_punct": False,
    }
    assert obs_features == exp_features
def test_token_upper_with_punct(self):
    token = tokenize.tokenize(["VICE-VERSA"])[0]
    obs_features = parse_utils.get_token_features_base(token)
    exp_features = {
        "idx": 0,
        "len": 10,
        "shape": "XXXX-XXXX",
        "prefix": "V",
        "suffix": "RSA",
        "is_alpha": False,
        "is_digit": False,
        "is_lower": False,
        "is_upper": True,
        "is_title": False,
        "is_punct": False,
        "is_left_punct": False,
        "is_right_punct": False,
        "is_bracket": False,
        "is_quote": False,
        "is_space": False,
        "like_num": False,
        "like_url": False,
        "like_email": False,
        "is_stop": False,
        "is_alnum": False,
        "is_newline": False,
        "is_partial_digit": False,
        "is_partial_punct": True,
    }
    assert obs_features == exp_features
def test_date_range_split(self):
    line = "I worked there 2012–2015."
    exp_texts = ["I", "worked", "there", "2012", "–", "2015", "."]
    tokens = tokenize.tokenize(line)
    assert tokens
    assert (
        isinstance(tokens, list)
        and all(isinstance(tok, Token) for tok in tokens)
    )
    assert [tok.text for tok in tokens] == exp_texts
def test_phone_number_merge(self):
    line = "Call me at 555-123-4567 ASAP."
    exp_texts = ["Call", "me", "at", "555-123-4567", "ASAP", "."]
    tokens = tokenize.tokenize(line)
    assert tokens
    assert (
        isinstance(tokens, list)
        and all(isinstance(tok, Token) for tok in tokens)
    )
    assert [tok.text for tok in tokens] == exp_texts
def generate_labeled_tokens(templates, fields, *, n=1, fixed_val_field_keys=None):
    """
    Generate one or many fake examples by combining fields arranged as in
    ``templates`` with values and default labels specified by ``fields``.

    Args:
        templates (List[str] or List[Callable])
        fields (Dict[str, Tuple[Callable, str]])
        n (int)
        fixed_val_field_keys (str or Set[str])

    Yields:
        List[Tuple[str, str]]
    """
    fixed_val_field_keys = utils.to_collection(fixed_val_field_keys, str, set)
    for template in rnd.choices(templates, k=n):
        if callable(template):
            template = template()
        template_fields = regexes.RE_TEMPLATE_FIELD.findall(template)
        field_keys = []
        field_labels = []
        field_vals = []
        const_field_vals = {}
        for key, label, prob in template_fields:
            # randomly drop optional fields according to their inclusion probability
            if prob and rnd.random() > float(prob):
                continue
            field_key = key if "|" not in key else rnd.choice(key.split("|"))
            field_label = label or fields[field_key][1]
            if fixed_val_field_keys and field_key in fixed_val_field_keys:
                # re-use one generated value for every field sharing this label
                field_value = const_field_vals.setdefault(
                    field_label, fields[field_key][0](),
                )
            else:
                field_value = fields[field_key][0]()
            field_keys.append(field_key)
            field_labels.append(field_label)
            field_vals.append(field_value)
        tok_labels = [
            (tok.text, label)
            for val, label in zip(field_vals, field_labels)
            for tok in tokenize.tokenize(val)
        ]
        yield tok_labels
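
# Illustrative sketch, not part of the original module: one way to persist the synthetic
# examples yielded by ``generate_labeled_tokens`` in the (token, label) .jsonl format that
# the CRF training script loads. ``my_templates``, ``my_fields``, and ``fpath_out`` are
# placeholder names, not objects defined in this project.
def _save_fake_training_data(my_templates, my_fields, fpath_out, n=100):
    # each yielded item is a list of (token_text, label) pairs for one fake example
    fake_examples = list(
        generate_labeled_tokens(my_templates, my_fields, n=n)
    )
    # write one labeled example per line, matching the lines=True loading convention
    fileio.save_json(fpath_out, fake_examples, lines=True)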
def main(): parser = argparse.ArgumentParser( description="Script to train a CRF parser on section-specific texts.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) add_arguments(parser) args = parser.parse_args() LOGGER.setLevel(args.loglevel) if args.params_filepath: params = next( fileio.load_json(args.params_filepath.resolve(), lines=False)) else: params = {} trainer = pycrfsuite.Trainer(algorithm=args.algorithm, params=params, verbose=args.verbose) module = importlib.import_module(args.module_name) all_feature_label_pairs = [] labeled_lines = fileio.load_json(module.FPATH_TRAINING_DATA, lines=True) for labeled_line in labeled_lines: labels = [label for _, label in labeled_line] token_strs = [token for token, _ in labeled_line] tokens = tokenize.tokenize(token_strs) features = module.parse.featurize(tokens) all_feature_label_pairs.append((features, labels)) if args.test_size == 0.0: holdout = -1 for features, labels in all_feature_label_pairs: trainer.append(features, labels, group=0) else: holdout = 1 for features, labels in all_feature_label_pairs: group = 0 if random.random() >= args.test_size else 1 trainer.append(features, labels, group=group) LOGGER.info("training CRF model with the following params:\n%s", trainer.get_params()) trainer.train(str(module.FPATH_TAGGER), holdout=holdout) LOGGER.info("saved trained model settings to %s", module.FPATH_TAGGER)
def parse_lines(lines, tagger=None):
    """
    Parse a sequence of text lines belonging to the "basics" section of a résumé
    to produce structured data in the form of :class:`schemas.ResumeBasicsSchema`,
    using a trained Conditional Random Field (CRF) tagger.

    Args:
        lines (List[str])
        tagger (:class:`pycrfsuite.Tagger`)

    Returns:
        Dict[str, obj]
    """
    if tagger is None:
        tagger = parse_utils.load_tagger(basics.FPATH_TAGGER)
    tokens = tokenize.tokenize("\n".join(lines).strip())
    features = featurize(tokens)
    labeled_tokens = parse_utils.tag(tokens, features, tagger=tagger)
    data = _parse_labeled_tokens(labeled_tokens)
    return data
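
# Illustrative usage sketch, not part of the original module: parse a made-up "basics"
# section. The example lines are invented, and the keys of the returned dict depend on
# the label set of the trained tagger loaded from basics.FPATH_TAGGER.
def _example_parse_basics():
    example_lines = [
        "Jane Doe",
        "jane.doe@example.com | 555-123-4567",
    ]
    # with tagger=None, parse_lines loads the default trained CRF tagger from disk
    return parse_lines(example_lines)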
def main(): parser = argparse.ArgumentParser( description=( "Manually label distinct lines of tokenized text by a specified labeling scheme, " "then save the collection of labeled lines in a .jsonl file." ), formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) add_arguments(parser) args = parser.parse_args() LOGGER.setLevel(args.loglevel) module = importlib.import_module(args.module_name) fpath_training_data = module.FPATH_TRAINING_DATA if fpath_training_data.is_file(): labeled_lines = list(fileio.load_json( fpath_training_data, lines=True)) seen_tokenized_lines = { tuple(tok_text for tok_text, _ in line) for line in labeled_lines } else: labeled_lines = [] seen_tokenized_lines = set() n_labeled_lines = len(labeled_lines) LOGGER.info("loaded %s labeled lines from %s", n_labeled_lines, fpath_training_data) labels = args.labels or module.LABELS print_help(labels) unlabeled_lines = list(fileio.load_text( args.unlabeled_data.resolve(), lines=True)) n = len(unlabeled_lines) for i, line in enumerate(unlabeled_lines): tokens = tuple( tok if isinstance(tok, str) else tok.text for tok in tokenize.tokenize(line) ) if not tokens: LOGGER.debug("line \"%s\" doesn't have any tokens; skipping...", line) continue elif tokens in seen_tokenized_lines: LOGGER.debug("line \"%s\" already labeled; skipping...", line) continue else: try: print("\n{}".format("-" * 64)) print("{} / {}".format(i, n)) labeled_line = label_line(line, tokens, labels) except StopIteration: break seen_tokenized_lines.add(tokens) labeled_lines.append(labeled_line) if len(labeled_lines) > n_labeled_lines: fileio.save_json(fpath_training_data, labeled_lines, lines=True) LOGGER.info( "saved %s labeled lines to %s", len(labeled_lines), fpath_training_data, ) else: LOGGER.info("no additional lines labeled")