def __init__(self, trie, keep_discarded=False, keep_filtered=False, outs=None):
        """
        Creates the callback.

        Training sentences go to @c outs when it is given; otherwise every
        input file gets its own output file named <input_file_name>.ner.

        @param trie the trie that stores the links seen on the page so far.
        @param keep_discarded if @c True, discarded sentences are written to
                              a file called <input_file_name>.discarded.
        @param keep_filtered if @c True, filtered sentences are written to
                             a file called <input_file_name>.filtered.
        @param outs the output stream to write the data to.
        """
        DefaultConllCallback.__init__(self)
        self._keep_discarded = keep_discarded
        self._keep_filtered = keep_filtered

        self._sent = SentenceData()
        self._title = []                          # title of the current page
        self._cat = None                          # category of the current page
        self._gold_map = {}                       # page: category
        self._redirect_map = {}                   # page: redirects
        self._trie = trie                         # the NERs mentioned on the page
        self._mode = NERTrainingCallback.NO_LINK  # state machine state

        # Sentences should end with a character in _punct; the characters in
        # _link_punct are stripped from the end of links.
        self._punct = set(u'.?!:')
        self._link_punct = set(u'.?!:,;')
        self._quot = misc.quotationMarks | misc.brackets

        # Per-input-file output streams; opened in fileStart().
        self._out = None
        self._discarded = None
        self._filtered = None
 def fileStart(self, file_name):
     """Opens the training-data output file for @p file_name, plus the
     optional filtered/discarded files when the respective flags are set."""
     DefaultConllCallback.fileStart(self, file_name)
     self._out = open(file_name + '.ner', 'w')
     if self._keep_filtered:
         self._filtered = open(file_name + '.filtered', 'w')
     if self._keep_discarded:
         self._discarded = open(file_name + '.discarded', 'w')
 def fileEnd(self):
     """Prints statistics about the processed file to stderr, then closes
     and clears the output stream(s) opened in fileStart()."""
     if self._sent.num_train > 0:
         sys.stderr.write("Written {0} sentences out of {1}, with avg length = {2} for file {3}.\n".format(
                 self._sent.num_train, self._sent.num_sentences,
                 float(self._sent.num_words) / self._sent.num_train,
                 self.cc_file))
     self._sent.clear_statistics()
     DefaultConllCallback.fileEnd(self)
     # Guard against fileEnd() without a (successful) preceding fileStart();
     # keeps _out handling consistent with the optional streams below.
     if self._out is not None:
         self._out.close()
         self._out = None
     if self._discarded is not None:
         self._discarded.close()
         self._discarded = None
     if self._filtered is not None:
         self._filtered.close()
         self._filtered = None
    def fieldStart(self, field):
#        print "NERTrainingCallback:fieldStart"
        DefaultConllCallback.fieldStart(self, field.lower())
        # We get the category from the title, and also need the words it
        # contains before the last '(' (after which it only contains Wikipedia-
        # specific information, such as disambiguation, etc.) for the link
        # inference trie
        if (self.cc_field.lower() == 'title'):
            print "PAGE", self.cc_title.encode('utf-8')
            self._title = []
            self._title_pars = self.cc_title.count('(')
            if self._title_pars == 0:
                self._title_pars = 1

            self._trie.clear()
            self._cat = self._gold_map.get(self.cc_title.lower(), None)
            if self._cat is not None:
                self._trie.add_title(self.cc_title, self._cat)
                self.__add_redirects(self.cc_title, self._cat)
        elif self.cc_field.lower() == 'body':
            self.first = True
            self._mode = NERTrainingCallback.NO_LINK
            self.tmp = []
            self.sentence_words = []