def test_preserve_case(self):
    """``preserve_case`` should flag exactly the case-sensitive tokens of the fixture doc."""
    expected = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 1, 0,
    ]
    observed = [int(spacy_utils.preserve_case(tok)) for tok in self.spacy_doc]
    self.assertEqual(observed, expected)
def test_preserve_case(spacy_doc):
    """``preserve_case`` should flag exactly the case-sensitive tokens of the fixture doc."""
    expected = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 1, 0,
    ]
    observed = [int(spacy_utils.preserve_case(tok)) for tok in spacy_doc]
    assert observed == expected
def direct_quotations(doc):
    """
    Baseline, not-great attempt at direction quotation extraction (no indirect
    or mixed quotations) using rules and patterns. English only.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)

    Yields:
        (``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation in ``doc``
            represented as a (speaker, reporting verb, quotation) 3-tuple

    Notes:
        Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
        Tagging of Reported Speech in Newspaper Articles".

    TODO: Better approach would use ML, but needs a training dataset.
    """
    # accept a textacy.Doc wrapper; unwrap it to the underlying spacy.Doc
    if isinstance(doc, textacy.Doc):
        if doc.lang != 'en':
            raise NotImplementedError('sorry, English-language texts only :(')
        doc = doc.spacy_doc
    quote_end_punct = {',', '.', '?', '!'}
    # character offsets of the opening and closing quote marks of each
    # "..."-, ''...''-, or ``...''-style quotation found in the raw text
    quote_indexes = set(itertoolz.concat(
        (m.start(), m.end() - 1)
        for m in re.finditer(r"(\".*?\")|(''.*?'')|(``.*?'')", doc.string)))
    # (start, end) token-index pairs for each quotation, matched by aligning
    # token character offsets (tok.idx) against the quote-mark offsets above
    quote_positions = list(itertoolz.partition(
        2, sorted(tok.i for tok in doc if tok.idx in quote_indexes)))
    sents = list(doc.sents)
    sent_positions = [(sent.start, sent.end) for sent in sents]
    for q0, q1 in quote_positions:
        quote = doc[q0: q1 + 1]
        # we're only looking for direct quotes, not indirect or mixed
        if not any(char in quote_end_punct for char in quote.text[-4:]):
            continue
        # get adjacent sentences
        candidate_sent_indexes = []
        for i, (s0, s1) in enumerate(sent_positions):
            # sentence overlapping or immediately after the quote
            if s0 <= q1 + 1 and s1 > q1:
                candidate_sent_indexes.append(i)
            # sentence overlapping or immediately before the quote
            elif s0 < q0 and s1 >= q0 - 1:
                candidate_sent_indexes.append(i)
        for si in candidate_sent_indexes:
            sent = sents[si]
            # get any reporting verbs: lowercase-able verb tokens whose lemma is
            # a known reporting verb and which fall outside every quotation span
            rvs = [tok for tok in sent
                   if spacy_utils.preserve_case(tok) is False
                   and tok.lemma_ in REPORTING_VERBS
                   and tok.pos_ == 'VERB'
                   and not any(oq0 <= tok.i <= oq1 for oq0, oq1 in quote_positions)]
            # get target offset against which to measure distances of NEs
            if rvs:
                if len(rvs) == 1:
                    rv = rvs[0]
                else:
                    # pick the candidate verb closest (in token indices) to either
                    # quote boundary
                    min_rv_dist = 1000
                    for rv_candidate in rvs:
                        rv_dist = min(abs(rv_candidate.i - qp) for qp in (q0, q1))
                        if rv_dist < min_rv_dist:
                            rv = rv_candidate
                            min_rv_dist = rv_dist
                        else:
                            # NOTE(review): stops at the first non-improving candidate;
                            # assumes distances are roughly monotonic over rvs — verify
                            break
                    # NOTE(review): if every candidate is >= 1000 tokens from the
                    # quote, `rv` is never bound here and the code below would raise
                    # NameError — presumably unreachable in practice, but confirm
            else:
                # TODO: do we have no other recourse?!
                continue
            try:
                # rv_subj = _find_subjects(rv)[0]
                rv_subj = get_subjects_of_verb(rv)[0]
            except IndexError:
                # reporting verb has no detectable subject; try the next sentence
                continue
            # if rv_subj.text in {'he', 'she'}:
            #     for ne in named_entities(doc, good_ne_types={'PERSON'}):
            #         if ne.start < rv_subj.i:
            #             speaker = ne
            #         else:
            #             break
            # else:
            # expand the subject token to the full compound noun span
            span = get_span_for_compound_noun(rv_subj)
            speaker = doc[span[0]: span[1] + 1]
            yield (speaker, rv, quote)
            # one (speaker, verb, quote) triple per quotation at most
            break
def direct_quotations(doc):
    """
    Baseline, not-great attempt at direction quotation extraction (no indirect
    or mixed quotations) using rules and patterns. English only.

    Args:
        doc (``spacy.Doc``)

    Yields:
        (``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation in ``doc``
            represented as a (speaker, reporting verb, quotation) 3-tuple

    Notes:
        Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
        Tagging of Reported Speech in Newspaper Articles".

    TODO: Better approach would use ML, but needs a training dataset.
    """
    quote_end_punct = {',', '.', '?', '!'}
    # character offsets of the opening and closing quote marks of each
    # "..."-, ''...''-, or ``...''-style quotation found in the raw text
    quote_indexes = set(itertoolz.concat(
        (m.start(), m.end() - 1)
        for m in re.finditer(r"(\".*?\")|(''.*?'')|(``.*?'')", doc.string)))
    # (start, end) token-index pairs for each quotation, matched by aligning
    # token character offsets (tok.idx) against the quote-mark offsets above
    quote_positions = list(itertoolz.partition(
        2, sorted(tok.i for tok in doc if tok.idx in quote_indexes)))
    sents = list(doc.sents)
    sent_positions = [(sent.start, sent.end) for sent in sents]
    for q0, q1 in quote_positions:
        quote = doc[q0: q1 + 1]
        # we're only looking for direct quotes, not indirect or mixed
        if not any(char in quote_end_punct for char in quote.text[-4:]):
            continue
        # get adjacent sentences
        candidate_sent_indexes = []
        for i, (s0, s1) in enumerate(sent_positions):
            # sentence overlapping or immediately after the quote
            if s0 <= q1 + 1 and s1 > q1:
                candidate_sent_indexes.append(i)
            # sentence overlapping or immediately before the quote
            elif s0 < q0 and s1 >= q0 - 1:
                candidate_sent_indexes.append(i)
        for si in candidate_sent_indexes:
            sent = sents[si]
            # get any reporting verbs: lowercase-able verb tokens whose lemma is
            # a known reporting verb and which fall outside every quotation span
            rvs = [tok for tok in sent
                   if spacy_utils.preserve_case(tok) is False
                   and tok.lemma_ in REPORTING_VERBS
                   and tok.pos_ == 'VERB'
                   and not any(oq0 <= tok.i <= oq1 for oq0, oq1 in quote_positions)]
            # get target offset against which to measure distances of NEs
            if rvs:
                if len(rvs) == 1:
                    rv = rvs[0]
                else:
                    # pick the candidate verb closest (in token indices) to either
                    # quote boundary
                    min_rv_dist = 1000
                    for rv_candidate in rvs:
                        rv_dist = min(abs(rv_candidate.i - qp) for qp in (q0, q1))
                        if rv_dist < min_rv_dist:
                            rv = rv_candidate
                            min_rv_dist = rv_dist
                        else:
                            # NOTE(review): stops at the first non-improving candidate;
                            # assumes distances are roughly monotonic over rvs — verify
                            break
                    # NOTE(review): if every candidate is >= 1000 tokens from the
                    # quote, `rv` is never bound here and the code below would raise
                    # NameError — presumably unreachable in practice, but confirm
            else:
                # TODO: do we have no other recourse?!
                continue
            try:
                # rv_subj = _find_subjects(rv)[0]
                rv_subj = get_subjects_of_verb(rv)[0]
            except IndexError:
                # reporting verb has no detectable subject; try the next sentence
                continue
            # if rv_subj.text in {'he', 'she'}:
            #     for ne in named_entities(doc, good_ne_types={'PERSON'}):
            #         if ne.start < rv_subj.i:
            #             speaker = ne
            #         else:
            #             break
            # else:
            # expand the subject token to the full compound noun span
            span = get_span_for_compound_noun(rv_subj)
            speaker = doc[span[0]: span[1] + 1]
            yield (speaker, rv, quote)
            # one (speaker, verb, quote) triple per quotation at most
            break
def test_preserve_case(self):
    """``preserve_case`` should flag exactly the case-sensitive tokens of the fixture doc."""
    expected = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 1, 1, 0,
    ]
    observed = [int(spacy_utils.preserve_case(tok)) for tok in self.spacy_doc]
    self.assertEqual(observed, expected)