Пример #1
0
    def generate_y(self, min_char_match=20):
        """
        Label every sentence span of the loaded PDF text against the quotes.

        Each quote in self.quotes is aligned to the PDF text through
        self.sequencematcher. Only the single longest common block of a
        quote is considered, and it is kept when it is strictly longer than
        min_char_match characters.

        returns y: list with one entry per span produced by
                   sent_tokenizer.span_tokenize(self.pdftext):
                   1 when self._overlap reports an overlap between the span
                   and any kept block, -1 otherwise.
                   An empty list when no quote produced a kept block.
        """
        # (start_i, end_i) tuples in PDF character indices, one per quote
        # whose longest match passed the length threshold
        quote_spans = []

        for quote in self.quotes:
            self.sequencematcher.set_seq1(quote)
            longest = self.sequencematcher.find_longest_match(0, len(quote), 0, self.lenpdf)

            # short matches are treated as coincidental and dropped
            if longest.size > min_char_match:
                quote_spans.append((longest.b, longest.b + longest.size))

        # nothing matched well enough: no labels are produced at all
        if not quote_spans:
            return []

        # sentence spans are recomputed from self.pdftext on every call
        return [
            1 if any(self._overlap((start_i, end_i), span) for span in quote_spans) else -1
            for (start_i, end_i) in sent_tokenizer.span_tokenize(self.pdftext)
        ]
Пример #2
0
 def load_pdftext(self, pdftext):
     """
     Store a new PDF text and refresh the state derived from it.

     Sets self.pdftext and self.lenpdf, passes the text to
     self.sequencematcher as its second sequence, and stores the result of
     sent_tokenizer.span_tokenize for the text in self.sent_indices.
     """
     self.pdftext = pdftext
     self.lenpdf = len(pdftext)

     matcher = self.sequencematcher
     matcher.set_seq2(self.pdftext)

     # NOTE(review): if span_tokenize returns a one-shot iterator rather
     # than a list, self.sent_indices can only be walked once - confirm
     sentence_spans = sent_tokenizer.span_tokenize(self.pdftext)
     self.sent_indices = sentence_spans