def generate_y(self, min_char_match=20):
    """
    Label each sentence span of the loaded PDF text against the quotes.

    Every quote in `self.quotes` is aligned to the PDF text via
    `self.sequencematcher.find_longest_match`; a match is kept only when
    its size is strictly greater than `min_char_match` characters.

    Returns a plain list with one entry per sentence span produced by
    `sent_tokenizer.span_tokenize(self.pdftext)`: 1 when `self._overlap`
    reports an overlap with any kept match, -1 otherwise.
    Returns [] if no quote produced a long enough match.
    """
    # (start_i, end_i) PDF index pairs of the quote matches worth keeping
    quote_spans = []
    for quote in self.quotes:
        self.sequencematcher.set_seq1(quote)
        longest = self.sequencematcher.find_longest_match(
            0, len(quote), 0, self.lenpdf)
        if longest.size > min_char_match:
            quote_spans.append((longest.b, longest.b + longest.size))

    # quality criteria not met: bail out before tokenizing any sentences
    if not quote_spans:
        return []

    # work on sentence index spans rather than on the split sentence text
    return [
        1 if any(self._overlap((start_i, end_i), span) for span in quote_spans)
        else -1
        for (start_i, end_i) in sent_tokenizer.span_tokenize(self.pdftext)
    ]
def load_pdftext(self, pdftext):
    """
    Store `pdftext` on the instance and prepare it for quote matching.

    Records the text and its length, installs the text as the second
    sequence of `self.sequencematcher`, and stores the result of
    `sent_tokenizer.span_tokenize` on it as `self.sent_indices`.
    """
    self.pdftext = pdftext
    self.lenpdf = len(pdftext)

    # the PDF text is the fixed side of the comparison (seq2)
    self.sequencematcher.set_seq2(pdftext)

    # sentence boundaries as index spans into the text
    self.sent_indices = sent_tokenizer.span_tokenize(pdftext)