def eval_mst(model_path, test_path, out_prefix, lowercase=True, tagger=None, force=False, result_strem=None):
    """
    Evaluate an MST parser model against a CONLL test file.

    Rewrites the test data (optionally lowercased, POS-retagged, features
    stripped) into ``<out_prefix>_eval_tagged.txt``, runs the parser over it
    into ``<out_prefix>_out_tagged.txt``, and scores the output against the
    gold test file.

    :param model_path: Path to the trained MST parser model.
    :param test_path: Path to the gold CONLL test file.
    :param out_prefix: Prefix used to derive the rewritten-eval and parser-output paths.
    :param lowercase: If True, lowercase the evaluation corpus before tagging/parsing.
    :param tagger: Optional path/prefix for a Stanford POS tagger model; when given,
                   the evaluation data is re-tagged with it.
    :param force: If True, rewrite the eval file even if it already exists.
    :param result_strem: Unused here; kept for interface compatibility with callers.
                         (NOTE(review): name looks like a typo for "result_stream",
                         but renaming would break keyword callers.)
    """
    mp = MSTParser()

    # -------------------------------------------
    # Use the output prefix to create some new files.
    # -------------------------------------------
    eval_path = out_prefix + '_eval_tagged.txt'
    out_path = out_prefix + '_out_tagged.txt'

    # -------------------------------------------
    # Rewrite the test file; POS tag the data
    # with the POS tags from our tagger,
    # and strip features.
    # -------------------------------------------
    if not os.path.exists(eval_path) or force:
        LOG.log(1000, "")
        # BUGFIX: previously hardcoded lowercase=True here, silently ignoring
        # the caller's `lowercase` argument. Respect the parameter instead.
        cc = ConllCorpus.read(test_path, lowercase=lowercase)
        if lowercase:
            cc.lower()
        cc.strip_tags()
        cc.strip_feats()
        if tagger is not None:
            LOG.log(1000, "POS Tagging evaluation ")
            cc.tag(StanfordPOSTagger(tagger))
        # Ensure the containing directory exists before writing.
        os.makedirs(os.path.dirname(eval_path), exist_ok=True)
        cc.write(eval_path)

    # -------------------------------------------
    # Parse the rewritten eval data and score it
    # against the gold file.
    # -------------------------------------------
    mp.test(model_path, eval_path, out_path)
    eval_conll_paths(test_path, out_path)
def extract_from_xigt(input_filelist, classifier_prefix=None, classifier_feats=CLASS_FEATS_DEFAULT,
                      cfg_path=None, tagger_prefix=None,
                      dep_prefix=None, pos_method=None, aln_method=None,
                      sent_prefix=None, no_alignment_heur=False, sent_type=SENT_TYPE_T_G,
                      **kwargs):
    """
    Extract training data (tagger, dependency parser, classifier, sentence
    alignment, CFG rules) from a list of Xigt files, then train the requested
    models.

    :param input_filelist: Iterable of paths to Xigt files to process.
                           (BUGFIX: the old default was ``= list`` — the builtin
                           *type* object, almost certainly a typo for an
                           annotation — which crashed with a TypeError the
                           moment it was iterated. The argument is now
                           required.)
    :param classifier_prefix: If given, gather gloss-POS stats and train a classifier
                              at ``<prefix>.classifier``.
    :param classifier_feats: Feature set used for classifier extraction/training.
    :param cfg_path: If given, write extracted CFG rules to this path.
    :param tagger_prefix: If given, write tagger training data and train a POS tagger.
    :param dep_prefix: If given, write dependency training data and train an MST parser.
    :param pos_method: Key into ARG_POS_MAP selecting which POS tags to extract.
    :param aln_method: Key into ALN_ARG_MAP selecting which alignment to extract.
    :param sent_prefix: If given, write parallel sentence files ``<prefix>_e.txt`` /
                        ``<prefix>_f.txt``.
    :param no_alignment_heur: Disable the alignment heuristic during sentence extraction.
    :param sent_type: Which sentence pair type to extract (default translation/gloss).
    :param kwargs: May contain 'tagmap', a path to a tagset mapping file.
    """
    # ------- Dictionaries for keeping track of gloss_pos preprocessing. --------
    # This dictionary will first, be a list of "words" (full word-level)
    subword_dict = SubwordDict()

    # -------------------------------------------
    # Map the argument provided for "dep_pos" to
    # the alignment type that will be searched
    # -------------------------------------------
    use_pos = ARG_POS_MAP[pos_method]
    use_aln = ALN_ARG_MAP[aln_method]

    # -------------------------------------------
    # Get the tagset mapping if provided
    # -------------------------------------------
    tagpath = kwargs.get('tagmap')
    tm = None if tagpath is None else TagMap(tagpath)

    # =============================================================================
    # 1) SET UP
    # =============================================================================
    extracted_tagged_snts = 0
    extracted_parsed_snts = 0
    inst_count = 0

    if dep_prefix or tagger_prefix:
        if use_pos == ARG_POS_NONE:
            EXTRACT_LOG.log(NORM_LEVEL, 'Not using POS tags for extraction.')
        elif use_pos is None:
            EXTRACT_LOG.log(NORM_LEVEL, "Using any available POS tags for extraction.")
        else:
            EXTRACT_LOG.log(NORM_LEVEL, 'Using language line tags produced by method "{}"...'.format(use_pos))

    # Set up the classifier....
    if classifier_prefix is not None:
        EXTRACT_LOG.log(NORM_LEVEL, "Gathering statistics on POS tags...")

    # Set up the tagger training file...
    if tagger_prefix is not None:
        tagger_train_path = tagger_prefix + '_tagger_train.txt'
        tagger_model_path = tagger_prefix + '.tagger'
        EXTRACT_LOG.log(NORM_LEVEL, 'Opening tagger training file at "{}"'.format(tagger_train_path))
        fileutils.makedirs(os.path.dirname(tagger_train_path))
        tagger_train_f = open(tagger_train_path, 'w', encoding='utf-8')

    # Set up the dependency parser output if it's specified...
    dep_train_f = None
    dep_train_path = None
    if dep_prefix is not None:
        dep_train_path = dep_prefix + '_dep_train.txt'
        EXTRACT_LOG.log(NORM_LEVEL, 'Writing dependency parser training data to "{}"'.format(dep_train_path))
        # Make the containing directory if it does not exist.
        fileutils.makedirs(os.path.dirname(dep_prefix))
        # Write out the training file.
        dep_train_f = open(dep_train_path, 'w', encoding='utf-8')

    # Set up the files for writing out alignment.
    if sent_prefix is not None:
        fileutils.makedirs(os.path.dirname(sent_prefix))
        e_f = open(sent_prefix + '_e.txt', 'w', encoding='utf-8')
        f_f = open(sent_prefix + '_f.txt', 'w', encoding='utf-8')

    # Set up the CFG path for writing.
    if cfg_path is not None:
        fileutils.makedirs(os.path.dirname(cfg_path))
        cfg_f = open(cfg_path, 'w', encoding='utf-8')

    # -------------------------------------------
    # Iterate over the provided files.
    # -------------------------------------------
    for path in input_filelist:
        xc = xc_load(path, mode=INCREMENTAL)

        # -------------------------------------------
        # Do the appropriate extraction for each
        # -------------------------------------------
        for inst in xc:
            inst_count += 1
            if tagger_prefix is not None:
                extracted_tagged_snts += extract_tagger_from_instance(inst, tagger_train_f, use_pos, tm)
            if dep_prefix is not None:
                extracted_parsed_snts += extract_parser_from_instance(inst, dep_train_f, use_pos, tm)
            if classifier_prefix is not None:
                gather_gloss_pos_stats(inst, subword_dict, classifier_feats)
            if sent_prefix is not None:
                try:
                    extract_sents_from_inst(inst, e_f, f_f,
                                            no_alignment_heur=no_alignment_heur,
                                            sent_type=sent_type,
                                            aln_method=use_aln)
                except NoNormLineException:
                    # Instances without a normalized line simply contribute
                    # no sentence pair; this is expected, not an error.
                    pass
            if cfg_path:
                extract_cfg_rules_from_inst(inst, cfg_f)

    # -------------------------------------------
    # After looping
    # -------------------------------------------
    EXTRACT_LOG.log(NORM_LEVEL, "{} instances processed.".format(inst_count))

    # BUGFIX: the sentence files were previously opened but never closed,
    # leaking file handles (and potentially unflushed buffers).
    if sent_prefix is not None:
        e_f.close()
        f_f.close()

    # Add punctuation marks to the tagger.
    if tagger_prefix is not None:
        if extracted_tagged_snts == 0:
            EXTRACT_LOG.error("No tags were found. Not writing out file.")
            tagger_train_f.close()
            unlink(tagger_train_path)
        else:
            # Append one training token per punctuation mark so the tagger
            # learns to emit the PUNC tag for them.
            for t in ['?', '“', '"', "''", "'", ',', '…', '/', '--', '-', '``', '`', ':', ';', '«', '»']:
                tagger_train_f.write('{}{}{}\n'.format(t, '/', 'PUNC'))
            tagger_train_f.close()

            EXTRACT_LOG.log(NORM_LEVEL, 'Training postagger using "{}"'.format(tagger_train_path))
            # Now, train the POStagger...
            train_postagger(tagger_train_path, tagger_model_path)
            EXTRACT_LOG.log(NORM_LEVEL, "Tagger training complete.")

    # =============================================================================
    # Classifier output...
    # =============================================================================
    if classifier_prefix is not None:
        # The path for the svm-light-based features.
        class_dir = os.path.dirname(classifier_prefix)
        os.makedirs(class_dir, exist_ok=True)

        feat_path = classifier_prefix + '.feats.txt'
        class_path = classifier_prefix + '.classifier'

        write_out_gram_dict(subword_dict, feat_path, classifier_feats)

        EXTRACT_LOG.log(NORM_LEVEL, "Training classifier.")
        train_txt(feat_path, class_path)
        EXTRACT_LOG.log(NORM_LEVEL, "Complete.")

    if cfg_path:
        cfg_f.close()

    # -------------------------------------------
    # Train
    # -------------------------------------------
    if dep_prefix:
        if extracted_parsed_snts == 0:
            EXTRACT_LOG.error("No dependency parses were found. Not training parser.")
            dep_train_f.close()
            unlink(dep_train_path)
        else:
            EXTRACT_LOG.log(NORM_LEVEL, "{} dependency parses found. Training parser...".format(extracted_parsed_snts))
            dep_train_f.close()
            dep_parser_path = dep_prefix + '.depparser'
            mp = MSTParser()
            mp.train(dep_train_path, dep_parser_path)
p.add_argument('-f', '--force', help='Force overwrite of precursor files', default=False, action='store_true')

args = p.parse_args()

# -------------------------------------------
# Sanity check the arguments
# -------------------------------------------
if args.CMD == 'test':
    if args.parser is None or args.test is None:
        print("\nERROR: --model and --test args are required for test CMD.\n")
        p.print_help()
        sys.exit(1)
    elif not os.path.exists(args.parser):
        LOG.error('Error: parser file "{}" does not exist.'.format(args.parser))
        sys.exit(1)
    elif not os.path.exists(args.test):
        # BUGFIX: this message previously formatted args.parser, reporting the
        # wrong path when the eval file was missing.
        LOG.error('Error: eval file "{}" does not exist.'.format(args.test))
        sys.exit(1)
    elif not args.output:
        LOG.error('Error: "-o" argument is required for output.')
        sys.exit(1)

    LOG.log(1000, "Beginning test of parser...")
    # BUGFIX: forward the --force flag; it was parsed above but never used,
    # so stale precursor files could not actually be overwritten.
    eval_mst(args.parser, args.test, args.output, tagger=args.tagger, force=args.force)

elif args.CMD == 'train':
    if args.train is None or args.parser is None:
        print("\nERROR: --train and --model args are required for train CMD.")
        p.print_help()
        sys.exit(1)

    mp = MSTParser()
    mp.train(args.train, args.parser)