def process_dicts(class_path):
    """Load pickled count dictionaries, write features for frequent words, and train a classifier.

    Every word in ``d`` whose total count is at least ``thresh`` gets one
    feature line written per observed tag; the resulting feature file is then
    handed to :func:`train_txt` to produce the classifier.

    :param class_path: path at which the trained classifier is written.
    """
    # SECURITY NOTE: pickle.load will execute arbitrary code if the files are
    # untrusted; these are assumed to be project-internal artifacts.
    # c_path / d_path / m_path / proj_root are module-level names — TODO confirm.
    with open(c_path, "rb") as c_f:
        c = pickle.load(c_f)
    with open(d_path, "rb") as d_f:
        d = pickle.load(d_f)
    with open(m_path, "rb") as m_f:
        m = pickle.load(m_f)
    print(len(c), len(d), len(m))

    # Minimum total occurrences a word needs before we emit features for it.
    thresh = 30

    # Write out every word that we've seen at least `thresh` times.
    out_path = os.path.join(proj_root, "odin_feats.txt")
    with open(out_path, "w", encoding="utf-8") as out_f:
        for w in d.keys():
            if d[w].total() < thresh:
                LOG.debug("Skipping {}".format(w))
            else:
                LOG.debug("Testing {}".format(w))
                for tag in d[w].keys():
                    LOG.debug("Writing out tag for {}-{}".format(w, tag))
                    t = GoldTagPOSToken(w, goldlabel=tag)
                    write_gram(t,
                               output=out_f,
                               feat_next_gram=False,
                               feat_prev_gram=False,
                               lowercase=True)
        out_f.flush()
    train_txt(out_path, class_path)
def instances_to_classifier(instances, class_out_path, tag_method=None, posdict=None, feat_path=None, context_feats=False):
    """
    Given a list of IGT instances, create a gloss-line classifier from them.

    :param instances: instances to extract gloss-line features from
    :type instances: list[RGIgt]
    :param class_out_path: path to write the trained classifier to
    :type class_out_path: str
    :param tag_method: POS-tag source method passed through to feature extraction
    :param posdict: POS dictionary passed through to feature extraction
    :param feat_path: Path to specify where to write out the svmlight-format
                      feature file. If it is None, use a temp file.
    :param context_feats: whether to include context features
    :raises ClassifierException: if no gloss POS tags were found in `instances`.
    """
    # Create the features file. BUGFIX: the temp file must survive its close()
    # so train_txt() can read it back by name afterwards — with delete=True the
    # file was removed on close and training read a nonexistent path.
    if feat_path is None:
        ntf = NamedTemporaryFile("w", delete=False, encoding="utf-8")
    else:
        ntf = open(feat_path, "w", encoding="utf-8")

    counts = CountDict()

    # NOTE: this used to be dispatched through a multiprocessing.Pool via
    # apply_async, but that call was disabled and the work ran synchronously
    # in the callback; the idle pool has been removed. One "instances" count
    # is recorded per processed chunk (preserving the original bookkeeping).
    for chunk in chunkIt(list(instances), cpu_count()):
        out_string, cur_instances = chunk_to_features(chunk,
                                                      tag_method=tag_method,
                                                      posdict=posdict,
                                                      context_feats=context_feats)
        ntf.write(out_string)
        counts.add("instances", 1)

    ntf.close()

    if counts["instances"] == 0:
        raise ClassifierException("No gloss POS tags found!")

    return train_txt(ntf.name, class_out_path)
def extract_from_xigt(input_filelist=(), classifier_prefix=None, classifier_feats=CLASS_FEATS_DEFAULT,
                      cfg_path=None, tagger_prefix=None, dep_prefix=None, pos_method=None,
                      aln_method=None, sent_prefix=None, no_alignment_heur=False,
                      sent_type=SENT_TYPE_T_G, **kwargs):
    """
    Extract training data (POS tagger, dependency parser, gloss classifier,
    parallel sentences, CFG rules) from a list of XIGT files.

    :param input_filelist: iterable of paths to XIGT files to process.
        BUGFIX: the default used to be the `list` *type* (``input_filelist = list``),
        which is not iterable; an empty tuple is a safe immutable default.
    :param classifier_prefix: if given, gather gloss-POS stats and train a
        classifier at ``<prefix>.classifier``.
    :param classifier_feats: feature configuration for the gloss classifier.
    :param cfg_path: if given, write extracted CFG rules to this path.
    :param tagger_prefix: if given, write tagger training data and train a
        POS tagger at ``<prefix>.tagger``.
    :param dep_prefix: if given, write dependency training data and train an
        MST parser at ``<prefix>.depparser``.
    :param pos_method: mapped through ARG_POS_MAP to the POS-tag source to use.
    :param aln_method: mapped through ALN_ARG_MAP to the alignment type searched.
    :param sent_prefix: if given, write parallel sentence files
        ``<prefix>_e.txt`` / ``<prefix>_f.txt``.
    :param no_alignment_heur: disable the alignment heuristic during sentence
        extraction.
    :param sent_type: which sentence-pair type to extract.
    :param kwargs: may contain ``tagmap``, a path to a tagset-mapping file.
    """
    # ------- Dictionaries for keeping track of gloss_pos preprocessing. --------
    # This dictionary will first, be a list of "words" (full word-level)
    subword_dict = SubwordDict()

    # -------------------------------------------
    # Map the argument provided for "dep_pos" to
    # the alignment type that will be searched
    # -------------------------------------------
    use_pos = ARG_POS_MAP[pos_method]
    use_aln = ALN_ARG_MAP[aln_method]

    # -------------------------------------------
    # Get the tagset mapping if provided
    # -------------------------------------------
    tagpath = kwargs.get('tagmap')
    tm = None if tagpath is None else TagMap(tagpath)

    # =============================================================================
    # 1) SET UP
    # =============================================================================
    extracted_tagged_snts = 0
    extracted_parsed_snts = 0
    inst_count = 0

    if dep_prefix or tagger_prefix:
        if use_pos == ARG_POS_NONE:
            EXTRACT_LOG.log(NORM_LEVEL, 'Not using POS tags for extraction.')
        elif use_pos is None:
            EXTRACT_LOG.log(NORM_LEVEL, "Using any available POS tags for extraction.")
        else:
            EXTRACT_LOG.log(NORM_LEVEL, 'Using language line tags produced by method "{}"...'.format(use_pos))

    # Set up the classifier....
    if classifier_prefix is not None:
        EXTRACT_LOG.log(NORM_LEVEL, "Gathering statistics on POS tags...")

    # Set up the tagger training file...
    if tagger_prefix is not None:
        tagger_train_path = tagger_prefix+'_tagger_train.txt'
        tagger_model_path = tagger_prefix+'.tagger'
        EXTRACT_LOG.log(NORM_LEVEL, 'Opening tagger training file at "{}"'.format(tagger_train_path))
        fileutils.makedirs(os.path.dirname(tagger_train_path))
        tagger_train_f = open(tagger_train_path, 'w', encoding='utf-8')

    # Set up the dependency parser output if it's specified...
    dep_train_f = None
    dep_train_path = None
    if dep_prefix is not None:
        dep_train_path = dep_prefix+'_dep_train.txt'
        EXTRACT_LOG.log(NORM_LEVEL, 'Writing dependency parser training data to "{}"'.format(dep_train_path))
        # Make the containing directory if it does not exist.
        fileutils.makedirs(os.path.dirname(dep_prefix))
        # Write out the training file.
        dep_train_f = open(dep_train_path, 'w', encoding='utf-8')

    # Set up the files for writing out alignment.
    if sent_prefix is not None:
        fileutils.makedirs(os.path.dirname(sent_prefix))
        e_f = open(sent_prefix + '_e.txt', 'w', encoding='utf-8')
        f_f = open(sent_prefix + '_f.txt', 'w', encoding='utf-8')

    # Set up the CFG path for writing.
    if cfg_path is not None:
        fileutils.makedirs(os.path.dirname(cfg_path))
        cfg_f = open(cfg_path, 'w', encoding='utf-8')

    # -------------------------------------------
    # Iterate over the provided files.
    # -------------------------------------------
    for path in input_filelist:
        xc = xc_load(path, mode=INCREMENTAL)

        # -------------------------------------------
        # Do the appropriate extraction for each
        # -------------------------------------------
        for inst in xc:
            inst_count += 1
            if tagger_prefix is not None:
                extracted_tagged_snts += extract_tagger_from_instance(inst, tagger_train_f, use_pos, tm)
            if dep_prefix is not None:
                extracted_parsed_snts += extract_parser_from_instance(inst, dep_train_f, use_pos, tm)
            if classifier_prefix is not None:
                gather_gloss_pos_stats(inst, subword_dict, classifier_feats)
            if sent_prefix is not None:
                try:
                    extract_sents_from_inst(inst, e_f, f_f, no_alignment_heur=no_alignment_heur,
                                            sent_type=sent_type, aln_method=use_aln)
                except NoNormLineException:
                    # Instances without a normalized line simply contribute no
                    # sentence pair; this is a deliberate best-effort skip.
                    pass
            if cfg_path:
                extract_cfg_rules_from_inst(inst, cfg_f)

    # -------------------------------------------
    # After looping
    # -------------------------------------------
    EXTRACT_LOG.log(NORM_LEVEL, "{} instances processed.".format(inst_count))

    # BUGFIX: close the alignment output files (previously leaked).
    if sent_prefix is not None:
        e_f.close()
        f_f.close()

    # Add punctuation marks to the tagger.
    if tagger_prefix is not None:
        if extracted_tagged_snts == 0:
            EXTRACT_LOG.error("No tags were found. Not writing out file.")
            tagger_train_f.close()
            unlink(tagger_train_path)
        else:
            for t in ['?','“','"',"''","'",',','…','/','--','-','``','`',':',';','«','»']:
                tagger_train_f.write('{}{}{}\n'.format(t,'/','PUNC'))
            tagger_train_f.close()
            EXTRACT_LOG.log(NORM_LEVEL, 'Training postagger using "{}"'.format(tagger_train_path))
            # Now, train the POStagger...
            train_postagger(tagger_train_path, tagger_model_path)
            EXTRACT_LOG.log(NORM_LEVEL, "Tagger training complete.")

    # =============================================================================
    # Classifier output...
    # =============================================================================
    if classifier_prefix is not None:
        # The path for the svm-light-based features.
        class_dir = os.path.dirname(classifier_prefix)
        os.makedirs(class_dir, exist_ok=True)
        feat_path = classifier_prefix+'.feats.txt'
        class_path = classifier_prefix+'.classifier'
        write_out_gram_dict(subword_dict, feat_path, classifier_feats)
        EXTRACT_LOG.log(NORM_LEVEL, "Training classifier.")
        train_txt(feat_path, class_path)
        EXTRACT_LOG.log(NORM_LEVEL, "Complete.")

    if cfg_path:
        cfg_f.close()

    # -------------------------------------------
    # Train the dependency parser, if requested.
    # -------------------------------------------
    if dep_prefix:
        if extracted_parsed_snts == 0:
            EXTRACT_LOG.error("No dependency parses were found. Not training parser.")
            dep_train_f.close()
            unlink(dep_train_path)
        else:
            EXTRACT_LOG.log(NORM_LEVEL, "{} dependency parses found. Training parser...".format(extracted_parsed_snts))
            dep_train_f.close()
            dep_parser_path = dep_prefix+'.depparser'
            mp = MSTParser()
            mp.train(dep_train_path, dep_parser_path)