def __init__(self, trie, keep_discarded=False, keep_filtered=False, outs=None):
    """
    Sets up the callback's per-run state.

    No output file is opened here: all three output handles start out as
    @c None.

    @param trie the trie that stores the links seen on the page so far.
    @param keep_discarded if @c True, the discarded sentences will be
                          written to a file called
                          <input_file_name>.discarded.
    @param keep_filtered if @c True, a file called
                         <input_file_name>.filtered is opened for each
                         input file as well.
    @param outs the output stream to write the data to.
                NOTE(review): this constructor never reads @p outs; the
                main output handle is initialized to @c None regardless.
    """
    DefaultConllCallback.__init__(self)
    self._sent = SentenceData()

    # Output configuration and the (not yet opened) output handles
    self._keep_discarded = keep_discarded
    self._keep_filtered = keep_filtered
    self._out = None
    self._discarded = None
    self._filtered = None

    # Per-page state
    self._title = []    # title of the current page
    self._cat = None    # category of the current page
    self._trie = trie   # the NERs mentioned on the page
    self._mode = NERTrainingCallback.NO_LINK  # state machine state

    # Lookup tables
    self._gold_map = {}      # page: category
    self._redirect_map = {}  # page: redirects

    # Character classes
    self._punct = set(u'.?!:')         # Sentences should end with one of these
    self._link_punct = set(u'.?!:,;')  # To strip from the end of links
    self._quot = misc.quotationMarks | misc.brackets
def fileStart(self, file_name):
    """
    Opens the output file(s) that belong to @p file_name.

    The main output, <file_name>.ner, is always opened for writing. The
    .discarded and .filtered files are only opened if the corresponding
    keep_* flag is set.

    @param file_name the name of the input file about to be processed.
    """
    DefaultConllCallback.fileStart(self, file_name)
    self._out = open(file_name + '.ner', 'w')

    # (enabled?, attribute that receives the handle, file name suffix)
    optional_outputs = (
        (self._keep_discarded, '_discarded', '.discarded'),
        (self._keep_filtered, '_filtered', '.filtered'),
    )
    for enabled, attr_name, suffix in optional_outputs:
        if enabled:
            setattr(self, attr_name, open(file_name + suffix, 'w'))
def fileEnd(self):
    """
    Reports the statistics of the finished file and closes its outputs.

    If at least one training sentence was written, a one-line summary (the
    number of sentences written, the total number of sentences, the average
    length and the name of the file) is printed to stderr. The sentence
    statistics are then reset, the base class is notified, and every output
    stream that is currently open is closed and set back to @c None.

    Streams that are @c None are skipped, so the method can also be called
    when the main output file was never opened (e.g. fileStart() failed) or
    has already been closed.
    """
    stats = self._sent
    if stats.num_train > 0:
        summary = ("Written {0} sentences out of {1}, with avg length = {2} "
                   "for file {3}.\n")
        # float() keeps the average from being truncated by integer division
        sys.stderr.write(summary.format(
            stats.num_train, stats.num_sentences,
            float(stats.num_words) / stats.num_train, self.cc_file))
    # Reset the counters even if nothing was written, so that they never
    # carry over to the next file
    stats.clear_statistics()

    DefaultConllCallback.fileEnd(self)

    # The main output gets the same None check as the optional ones
    for attr_name in ('_out', '_discarded', '_filtered'):
        stream = getattr(self, attr_name)
        if stream is not None:
            stream.close()
            setattr(self, attr_name, None)
def fieldStart(self, field): # print "NERTrainingCallback:fieldStart" DefaultConllCallback.fieldStart(self, field.lower()) # We get the category from the title, and also need the words it # contains before the last '(' (after which it only contains Wikipedia- # specific information, such as disambiguation, etc.) for the link # inference trie if (self.cc_field.lower() == 'title'): print "PAGE", self.cc_title.encode('utf-8') self._title = [] self._title_pars = self.cc_title.count('(') if self._title_pars == 0: self._title_pars = 1 self._trie.clear() self._cat = self._gold_map.get(self.cc_title.lower(), None) if self._cat is not None: self._trie.add_title(self.cc_title, self._cat) self.__add_redirects(self.cc_title, self._cat) elif self.cc_field.lower() == 'body': self.first = True self._mode = NERTrainingCallback.NO_LINK self.tmp = [] self.sentence_words = []