Example #1
def eval_mst(model_path, test_path, out_prefix, lowercase=True, tagger=None, force=False, result_stream=None):
    mp = MSTParser()

    # -------------------------------------------
    # Use the output prefix to create some new files.
    # -------------------------------------------
    eval_path = out_prefix + '_eval_tagged.txt'
    out_path  = out_prefix + '_out_tagged.txt'


    # -------------------------------------------
    # Rewrite the test file; POS tag the data
    # with the POS tags from our tagger,
    # and strip features.
    # -------------------------------------------
    if not os.path.exists(eval_path) or force:
        LOG.log(1000, 'Rewriting and tagging eval file "{}"'.format(eval_path))
        cc = ConllCorpus.read(test_path, lowercase=lowercase)
        cc.strip_tags()
        cc.strip_feats()
        if tagger is not None:
            LOG.log(1000, "POS Tagging evaluation ")
            cc.tag(StanfordPOSTagger(tagger))
        os.makedirs(os.path.dirname(eval_path), exist_ok=True)
        cc.write(eval_path)
    # -------------------------------------------


    mp.test(model_path, eval_path, out_path)
    eval_conll_paths(test_path, out_path)
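
A minimal usage sketch for the function above. The paths here are hypothetical placeholders; the tagger argument assumes a trained Stanford POS tagger model, as suggested by the StanfordPOSTagger call in the body:

# Hypothetical paths; substitute real model/data locations.
eval_mst('models/eng.mstparser',      # trained MST parser model
         'data/eng_test.conll',       # gold CoNLL test data
         'results/eng',               # prefix for *_eval_tagged.txt / *_out_tagged.txt
         tagger='models/eng.tagger',  # optional: re-tag the data with this POS model
         force=True)                  # re-create the tagged eval file even if it exists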
Example #2
def extract_from_xigt(input_filelist=(), classifier_prefix=None, classifier_feats=CLASS_FEATS_DEFAULT,
                      cfg_path=None, tagger_prefix=None,
                      dep_prefix=None, pos_method=None, aln_method=None,
                      sent_prefix=None, no_alignment_heur=False, sent_type=SENT_TYPE_T_G, **kwargs):

    # -------------------------------------------
    # Dictionary for keeping track of gloss_pos
    # preprocessing, keyed first by full
    # (word-level) tokens.
    # -------------------------------------------
    subword_dict = SubwordDict()

    # -------------------------------------------
    # Map the pos_method and aln_method arguments
    # to the POS source and alignment type that
    # will be searched.
    # -------------------------------------------
    use_pos = ARG_POS_MAP[pos_method]
    use_aln = ALN_ARG_MAP[aln_method]

    # -------------------------------------------
    # Get the tagset mapping if provided
    # -------------------------------------------
    tagpath = kwargs.get('tagmap')
    tm = None if tagpath is None else TagMap(tagpath)

    # =============================================================================
    # 1) SET UP
    # =============================================================================

    extracted_tagged_snts = 0
    extracted_parsed_snts = 0
    inst_count = 0


    if dep_prefix or tagger_prefix:
        if use_pos == ARG_POS_NONE:
            EXTRACT_LOG.log(NORM_LEVEL, 'Not using POS tags for extraction.')
        elif use_pos is None:
            EXTRACT_LOG.log(NORM_LEVEL, "Using any available POS tags for extraction.")
        else:
            EXTRACT_LOG.log(NORM_LEVEL, 'Using language line tags produced by method "{}"...'.format(use_pos))


    # Set up the classifier....
    if classifier_prefix is not None:
        EXTRACT_LOG.log(NORM_LEVEL, "Gathering statistics on POS tags...")

    # Set up the tagger training file...
    if tagger_prefix is not None:
        tagger_train_path = tagger_prefix+'_tagger_train.txt'
        tagger_model_path = tagger_prefix+'.tagger'


        EXTRACT_LOG.log(NORM_LEVEL, 'Opening tagger training file at "{}"'.format(tagger_train_path))
        fileutils.makedirs(os.path.dirname(tagger_train_path))
        tagger_train_f = open(tagger_train_path, 'w', encoding='utf-8')

    # Set up the dependency parser output if it's specified...
    dep_train_f = None
    dep_train_path = None
    if dep_prefix is not None:
        dep_train_path = dep_prefix+'_dep_train.txt'
        EXTRACT_LOG.log(NORM_LEVEL, 'Writing dependency parser training data to "{}"'.format(dep_train_path))

        # Make the containing directory if it does not exist.
        fileutils.makedirs(os.path.dirname(dep_prefix))

        # Open the training file for writing.
        dep_train_f = open(dep_train_path, 'w', encoding='utf-8')

    # Set up the files for writing out alignment.
    if sent_prefix is not None:
        fileutils.makedirs(os.path.dirname(sent_prefix))
        e_f = open(sent_prefix + '_e.txt', 'w', encoding='utf-8')
        f_f = open(sent_prefix + '_f.txt', 'w', encoding='utf-8')

    # Set up the CFG path for writing.
    if cfg_path is not None:
        fileutils.makedirs(os.path.dirname(cfg_path))
        cfg_f = open(cfg_path, 'w', encoding='utf-8')

    # -------------------------------------------
    # Iterate over the provided files.
    # -------------------------------------------
    for path in input_filelist:
        xc = xc_load(path, mode=INCREMENTAL)

        # -------------------------------------------
        # Do the appropriate extraction for each
        # -------------------------------------------
        for inst in xc:
            inst_count += 1
            if tagger_prefix is not None:
                extracted_tagged_snts += extract_tagger_from_instance(inst, tagger_train_f, use_pos, tm)

            if dep_prefix is not None:
                extracted_parsed_snts += extract_parser_from_instance(inst, dep_train_f, use_pos, tm)

            if classifier_prefix is not None:
                gather_gloss_pos_stats(inst, subword_dict, classifier_feats)

            if sent_prefix is not None:
                try:
                    extract_sents_from_inst(inst, e_f, f_f, no_alignment_heur=no_alignment_heur,
                                            sent_type=sent_type, aln_method=use_aln)
                except NoNormLineException:
                    pass

            if cfg_path:
                extract_cfg_rules_from_inst(inst, cfg_f)

    # -------------------------------------------
    # After looping
    # -------------------------------------------

    EXTRACT_LOG.log(NORM_LEVEL, "{} instances processed.".format(inst_count))

    # Close the sentence-alignment files, if they were opened.
    if sent_prefix is not None:
        e_f.close()
        f_f.close()

    # Add punctuation tokens to the tagger training data.
    if tagger_prefix is not None:
        if extracted_tagged_snts == 0:
            EXTRACT_LOG.error("No tags were found. Not writing out file.")
            tagger_train_f.close()
            unlink(tagger_train_path)
        else:
            for t in ['?','“','"',"''","'",',','…','/','--','-','``','`',':',';','«','»']:
                tagger_train_f.write('{}/PUNC\n'.format(t))
            tagger_train_f.close()
            EXTRACT_LOG.log(NORM_LEVEL, 'Training postagger using "{}"'.format(tagger_train_path))
            # Now, train the POStagger...
            train_postagger(tagger_train_path, tagger_model_path)
            EXTRACT_LOG.log(NORM_LEVEL, "Tagger training complete.")



    # =============================================================================
    # Classifier output...
    # =============================================================================

    if classifier_prefix is not None:

        # Paths for the svm-light-based feature file and the trained classifier.
        class_dir = os.path.dirname(classifier_prefix)
        os.makedirs(class_dir, exist_ok=True)

        feat_path  = classifier_prefix + '.feats.txt'
        class_path = classifier_prefix + '.classifier'

        write_out_gram_dict(subword_dict, feat_path, classifier_feats)

        EXTRACT_LOG.log(NORM_LEVEL, "Training classifier.")
        train_txt(feat_path, class_path)
        EXTRACT_LOG.log(NORM_LEVEL, "Complete.")

    if cfg_path:
        cfg_f.close()

    # -------------------------------------------
    # Train the dependency parser.
    # -------------------------------------------
    if dep_prefix:
        if extracted_parsed_snts == 0:
            EXTRACT_LOG.error("No dependency parses were found. Not training parser.")
            dep_train_f.close()
            unlink(dep_train_path)
        else:
            EXTRACT_LOG.log(NORM_LEVEL, "{} dependency parses found. Training parser...".format(extracted_parsed_snts))
            dep_train_f.close()
            dep_parser_path = dep_prefix+'.depparser'
            mp = MSTParser()
            mp.train(dep_train_path, dep_parser_path)
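
A sketch of how this extraction entry point might be invoked. The corpus paths and output prefixes below are hypothetical, and the tagmap keyword is passed through **kwargs as read at the top of the function; the output-file comments follow the path construction in the body above:

# Hypothetical invocation; all paths are placeholders.
extract_from_xigt(input_filelist=['corpora/lang1.xml', 'corpora/lang2.xml'],
                  classifier_prefix='out/gloss_pos',  # -> out/gloss_pos.feats.txt, out/gloss_pos.classifier
                  tagger_prefix='out/lang',           # -> out/lang_tagger_train.txt, out/lang.tagger
                  dep_prefix='out/lang',              # -> out/lang_dep_train.txt, out/lang.depparser
                  tagmap='maps/tagset.map')           # optional tagset mapping (via **kwargs)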
Example #3
    p.add_argument('-f', '--force', help='Force overwrite of precursor files', default=False, action='store_true')

    args = p.parse_args()

    # -------------------------------------------
    # Sanity check the arguments
    # -------------------------------------------
    if args.CMD == 'test':
        if args.parser is None or args.test is None:
            print("\nERROR: --model and --test args are required for test CMD.\n")
            p.print_help()
            sys.exit(1)
        elif not os.path.exists(args.parser):
            LOG.error('Error: parser file "{}" does not exist.'.format(args.parser))
            sys.exit(1)
        elif not os.path.exists(args.test):
            LOG.error('Error: eval file "{}" does not exist.'.format(args.test))
            sys.exit(1)
        elif not args.output:
            LOG.error('Error: "-o" argument is required for output.')
            sys.exit(1)

        LOG.log(1000, "Beginning test of parser...")
        eval_mst(args.parser, args.test, args.output, tagger=args.tagger)
    elif args.CMD == 'train':
        if args.train is None or args.parser is None:
            print("\nERROR: --train and --model args are required for train CMD.")
            p.print_help()
            sys.exit(1)
        mp = MSTParser()
        mp.train(args.train, args.parser)
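
For context, a reconstruction of the argument parser this fragment assumes. Only the --force definition appears in the source; the remaining add_argument calls are assumptions inferred from the attributes (CMD, parser, train, test, output, tagger) used above:

import argparse

# Hypothetical parser setup; flag names and help strings are assumed.
p = argparse.ArgumentParser(description='Train or test an MST dependency parser.')
p.add_argument('CMD', choices=['train', 'test'], help='Action to perform')
p.add_argument('--parser', help='Path to the parser model file')
p.add_argument('--train', help='CoNLL training data (train CMD)')
p.add_argument('--test', help='CoNLL eval data (test CMD)')
p.add_argument('-o', '--output', help='Output prefix for eval files (test CMD)')
p.add_argument('--tagger', help='Optional POS tagger model (test CMD)')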