def generate_training_data(self, infile, feat_options=None, encoding='utf-8'):
    """Build a feature file for classifier training from a Brown-format corpus.

    Reads sentences from `infile` via BrownReader, turns each (word, tag)
    pair into a Token labelled with its POS tag, extracts features for every
    token position, and writes one Instance per line to a temporary file.

    Parameters:
        infile       -- path to the Brown-format training corpus
        feat_options -- feature-selection options passed to Instance
                        (default: empty dict; None avoids the shared
                        mutable-default pitfall)
        encoding     -- output file encoding (default 'utf-8')

    Returns the name of the temporary data file (caller is responsible
    for deleting it).
    """
    if feat_options is None:
        feat_options = {}
    # mkstemp() creates the file atomically, avoiding the race condition
    # of the deprecated tempfile.mktemp(); we reopen it with codecs to get
    # the requested encoding.
    fd, data_file_name = tempfile.mkstemp()
    os.close(fd)
    data_file = codecs.open(data_file_name, 'w', encoding)
    inst_ct = 0
    try:
        for s in BrownReader(infile):
            # build token list for each sentence (urgh! FIXME)
            tokens = []
            for wd, tag in s:
                token = Token(string=wd, pos=tag)
                token.label = token.pos  # label is POS tag
                tokens.append(token)
            # create training instance for each token
            for i in range(len(tokens)):
                inst_ct += 1
                # in-place progress counter on stderr (fd 2, not fd 0 ==
                # stdin): erase the previous count with backspaces, then
                # print the new count
                os.write(2, "\b" * len(str(inst_ct)) + str(inst_ct))
                inst = Instance(label=tokens[i].label,
                                index=i, tokens=tokens,
                                feat_selection=feat_options,
                                lex_dict=self.lex_dict,
                                tag_dict=self.tag_dict,
                                cache=self.cache)
                inst.get_features()
                # one instance per line (equivalent to the old
                # ``print >> data_file`` but Py2/Py3-portable)
                data_file.write("%s\n" % inst)
        os.write(2, '\n')  # terminate the progress counter line
    finally:
        # close the data file even if feature extraction raises
        data_file.close()
    return data_file_name
def tag_token_sequence(self, tokens, feat_options=None, beam_size=3):
    """N-best breadth-first (beam) search for the best tag sequence.

    Parameters:
        tokens       -- list of Token objects for one sentence
        feat_options -- feature-selection options passed to Instance
                        (default: empty dict; None avoids the shared
                        mutable-default pitfall)
        beam_size    -- number of candidate sequences kept per position

    Returns the highest-probability list of labelled Tokens.
    """
    if feat_options is None:
        feat_options = {}
    # maintain N-best sequences as (labelled_tokens, cumulative_log_prob)
    sequences = [([], 0.0)]
    for i, token in enumerate(tokens):
        n_best_sequences = []
        # cache static features: they do not depend on the candidate
        # label history, so compute them once per token position
        cached_inst = Instance(label=tokens[i].label,
                               index=i, tokens=tokens,
                               feat_selection=feat_options,
                               lex_dict=self.lex_dict,
                               tag_dict=self.tag_dict,
                               cache=self.cache)
        cached_inst.get_static_features()
        # get possible tags: union of tags found in tag_dict and
        # lex_dict
        wd = token.string
        legit_tags1 = self.tag_dict.get(wd, {})
        legit_tags2 = {}  # self.lex_dict.get(wd,{})
        for seq_j, log_pr_j in sequences:
            tokens_j = seq_j + tokens[i:]  # tokens with previous labels
            # classify token given this particular label history
            inst = Instance(label=tokens[i].label,
                            index=i, tokens=tokens_j,
                            feat_selection=feat_options,
                            lex_dict=self.lex_dict,
                            tag_dict=self.tag_dict,
                            cache=self.cache)
            inst.fv = cached_inst.fv[:]  # copy cached static features
            inst.get_sequential_features()
            label_pr_distrib = self.classifier.class_distribution(inst.fv)
            # extend sequence j with each candidate label for the token
            for (cl, pr) in label_pr_distrib:
                # make sure that cl is a legal tag
                if legit_tags1 or legit_tags2:
                    if (cl not in legit_tags1) and (cl not in legit_tags2):
                        continue
                # guard: math.log(0) raises ValueError; a zero-probability
                # label can never win, so skip it
                if pr <= 0.0:
                    continue
                labelled_token = Token(string=token.string, pos=token.pos,
                                       comment=token.comment,
                                       label=cl,
                                       label_pr_distrib=label_pr_distrib)
                n_best_sequences.append((seq_j + [labelled_token],
                                         log_pr_j + math.log(pr)))
        # sort ascending by log prob and keep the N best (tail of list)
        n_best_sequences.sort(key=operator.itemgetter(1))
        sequences = n_best_sequences[-beam_size:]
    # return sequence with highest prob.
    best_sequence = sequences[-1][0]
    # print >> sys.stderr, "Best tok seq:", [(t.string,t.label) for t in best_sequence]
    return best_sequence