def simple_subjects_and_objects(self, verb):
    """
    Extract all simple subjects and objects for a given verb, using textacy's
    ``get_subjects_of_verb`` and ``get_objects_of_verb`` helpers.

    Args:
        verb (``spacy.Token``)

    Returns:
        A list of the verb's subjects and objects (spacy ``Token`` or ``Span`` objects)
    """
    verb_objects = get_objects_of_verb(verb)
    verb_subjects = get_subjects_of_verb(verb)
    verb_objects.extend(verb_subjects)
    return verb_objects
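# Illustrative usage sketch (not from the original source). It assumes the helpers used
# above come from textacy's spacy_utils module (import paths vary across textacy
# versions), that an English spaCy model compatible with these older snippets is
# installed, and that `extractor` is an instance of the (hypothetical) class that
# defines simple_subjects_and_objects.
def _example_simple_subjects_and_objects(extractor, text=u'The committee approved the budget.'):
    import spacy
    from textacy.spacy_utils import get_main_verbs_of_sent

    nlp = spacy.load('en')
    doc = nlp(text)
    for sent in doc.sents:
        for verb in get_main_verbs_of_sent(sent):
            # e.g. for "approved": [budget, committee]
            print(verb, extractor.simple_subjects_and_objects(verb))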
def subject_verb_object_triples(doc):
    """
    Extract an ordered sequence of subject-verb-object (SVO) triples from a
    spacy-parsed doc. Note that this only works for SVO languages.

    Args:
        doc (``spacy.Doc`` or ``spacy.Span``): either a spacy document or a
            sentence thereof

    Yields:
        (``spacy.Span``, ``spacy.Span``, ``spacy.Span``): the next 3-tuple from
        ``doc`` representing a (subject, verb, object) triple, in order of
        appearance
    """
    # TODO: What to do about questions, where it may be VSO instead of SVO?
    # TODO: What about non-adjacent verb negations?
    # TODO: What about object (noun) negations?
    try:
        sents = doc.sents
    except AttributeError:
        sents = [doc]

    for sent in sents:
        start_i = sent[0].i

        verbs = get_main_verbs_of_sent(sent)
        for verb in verbs:
            subjs = get_subjects_of_verb(verb)
            if not subjs:
                continue
            objs = get_objects_of_verb(verb)
            if not objs:
                continue

            # add adjacent auxiliaries to verbs, for context
            # and add compounds to compound nouns
            verb_span = get_span_for_verb_auxiliaries(verb)
            verb = sent[verb_span[0] - start_i: verb_span[1] - start_i + 1]
            for subj in subjs:
                subj = sent[get_span_for_compound_noun(subj)[0] - start_i: subj.i - start_i + 1]
                for obj in objs:
                    if obj.pos == NOUN:
                        span = get_span_for_compound_noun(obj)
                    elif obj.pos == VERB:
                        span = get_span_for_verb_auxiliaries(obj)
                    else:
                        span = (obj.i, obj.i)
                    obj = sent[span[0] - start_i: span[1] - start_i + 1]

                    yield (subj, verb, obj)
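# Illustrative usage sketch (not from the original source): run the extractor above on a
# short parsed text and print its (subject, verb, object) spans. Assumes an English
# spaCy model compatible with the older spaCy API these snippets target.
def _example_subject_verb_object_triples():
    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'Burton founded the company. The company builds robots.')
    for subj, verb, obj in subject_verb_object_triples(doc):
        # expect roughly: ("Burton", "founded", "company"), ("company", "builds", "robots")
        print(subj.text, verb.text, obj.text)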
def direct_quotations(doc):
    """
    Baseline, not-great attempt at direct quotation extraction (no indirect
    or mixed quotations) using rules and patterns. English only.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)

    Yields:
        (``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation in
        ``doc`` represented as a (speaker, reporting verb, quotation) 3-tuple

    Notes:
        Loosely inspired by Krestel, Bergler, Witte. "Minding the Source:
        Automatic Tagging of Reported Speech in Newspaper Articles".

    TODO: Better approach would use ML, but needs a training dataset.
    """
    if isinstance(doc, textacy.Doc):
        if doc.lang != 'en':
            raise NotImplementedError('sorry, English-language texts only :(')
        doc = doc.spacy_doc
    quote_end_punct = {',', '.', '?', '!'}
    quote_indexes = set(itertoolz.concat(
        (m.start(), m.end() - 1)
        for m in re.finditer(r"(\".*?\")|(''.*?'')|(``.*?'')", doc.string)))
    quote_positions = list(itertoolz.partition(
        2, sorted(tok.i for tok in doc if tok.idx in quote_indexes)))
    sents = list(doc.sents)
    sent_positions = [(sent.start, sent.end) for sent in sents]

    for q0, q1 in quote_positions:
        quote = doc[q0: q1 + 1]

        # we're only looking for direct quotes, not indirect or mixed
        if not any(char in quote_end_punct for char in quote.text[-4:]):
            continue

        # get adjacent sentences
        candidate_sent_indexes = []
        for i, (s0, s1) in enumerate(sent_positions):
            if s0 <= q1 + 1 and s1 > q1:
                candidate_sent_indexes.append(i)
            elif s0 < q0 and s1 >= q0 - 1:
                candidate_sent_indexes.append(i)

        for si in candidate_sent_indexes:
            sent = sents[si]

            # get any reporting verbs
            rvs = [tok for tok in sent
                   if spacy_utils.preserve_case(tok) is False
                   and tok.lemma_ in REPORTING_VERBS
                   and tok.pos_ == 'VERB'
                   and not any(oq0 <= tok.i <= oq1 for oq0, oq1 in quote_positions)]

            # get target offset against which to measure distances of NEs
            if rvs:
                if len(rvs) == 1:
                    rv = rvs[0]
                else:
                    min_rv_dist = 1000
                    for rv_candidate in rvs:
                        rv_dist = min(abs(rv_candidate.i - qp) for qp in (q0, q1))
                        if rv_dist < min_rv_dist:
                            rv = rv_candidate
                            min_rv_dist = rv_dist
                        else:
                            break
            else:
                # TODO: do we have no other recourse?!
                continue

            try:
                # rv_subj = _find_subjects(rv)[0]
                rv_subj = get_subjects_of_verb(rv)[0]
            except IndexError:
                continue

            # if rv_subj.text in {'he', 'she'}:
            #     for ne in named_entities(doc, good_ne_types={'PERSON'}):
            #         if ne.start < rv_subj.i:
            #             speaker = ne
            #         else:
            #             break
            # else:
            span = get_span_for_compound_noun(rv_subj)
            speaker = doc[span[0]: span[1] + 1]

            yield (speaker, rv, quote)
            break
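# Illustrative usage sketch (not from the original source): pull (speaker, reporting
# verb, quotation) triples from a news-style sentence. Assumes an English spaCy model
# compatible with these older snippets; a textacy.Doc can be passed instead, since the
# function unwraps it after checking the language.
def _example_direct_quotations():
    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'"We will appeal the ruling," the lawyer said on Tuesday.')
    for speaker, verb, quote in direct_quotations(doc):
        # expect roughly: speaker="lawyer", verb="said", quote='"We will appeal the ruling,"'
        print(speaker.text, verb.text, quote.text)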
def direct_quotations(doc):
    """
    Baseline, not-great attempt at direct quotation extraction (no indirect
    or mixed quotations) using rules and patterns. English only.

    Args:
        doc (``spacy.Doc``)

    Yields:
        (``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation in
        ``doc`` represented as a (speaker, reporting verb, quotation) 3-tuple

    Notes:
        Loosely inspired by Krestel, Bergler, Witte. "Minding the Source:
        Automatic Tagging of Reported Speech in Newspaper Articles".

    TODO: Better approach would use ML, but needs a training dataset.
    """
    quote_end_punct = {',', '.', '?', '!'}
    quote_indexes = set(itertoolz.concat(
        (m.start(), m.end() - 1)
        for m in re.finditer(r"(\".*?\")|(''.*?'')|(``.*?'')", doc.string)))
    quote_positions = list(itertoolz.partition(
        2, sorted(tok.i for tok in doc if tok.idx in quote_indexes)))
    sents = list(doc.sents)
    sent_positions = [(sent.start, sent.end) for sent in sents]

    for q0, q1 in quote_positions:
        quote = doc[q0: q1 + 1]

        # we're only looking for direct quotes, not indirect or mixed
        if not any(char in quote_end_punct for char in quote.text[-4:]):
            continue

        # get adjacent sentences
        candidate_sent_indexes = []
        for i, (s0, s1) in enumerate(sent_positions):
            if s0 <= q1 + 1 and s1 > q1:
                candidate_sent_indexes.append(i)
            elif s0 < q0 and s1 >= q0 - 1:
                candidate_sent_indexes.append(i)

        for si in candidate_sent_indexes:
            sent = sents[si]

            # get any reporting verbs
            rvs = [tok for tok in sent
                   if spacy_utils.preserve_case(tok) is False
                   and tok.lemma_ in REPORTING_VERBS
                   and tok.pos_ == 'VERB'
                   and not any(oq0 <= tok.i <= oq1 for oq0, oq1 in quote_positions)]

            # get target offset against which to measure distances of NEs
            if rvs:
                if len(rvs) == 1:
                    rv = rvs[0]
                else:
                    min_rv_dist = 1000
                    for rv_candidate in rvs:
                        rv_dist = min(abs(rv_candidate.i - qp) for qp in (q0, q1))
                        if rv_dist < min_rv_dist:
                            rv = rv_candidate
                            min_rv_dist = rv_dist
                        else:
                            break
            else:
                # TODO: do we have no other recourse?!
                continue

            try:
                # rv_subj = _find_subjects(rv)[0]
                rv_subj = get_subjects_of_verb(rv)[0]
            except IndexError:
                continue

            # if rv_subj.text in {'he', 'she'}:
            #     for ne in named_entities(doc, good_ne_types={'PERSON'}):
            #         if ne.start < rv_subj.i:
            #             speaker = ne
            #         else:
            #             break
            # else:
            span = get_span_for_compound_noun(rv_subj)
            speaker = doc[span[0]: span[1] + 1]

            yield (speaker, rv, quote)
            break
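# Illustrative note with usage sketch (not from the original source): unlike the variant
# above, this version accepts only a plain spacy.Doc, with no textacy.Doc unwrapping or
# language check, so the caller must parse the text with an English model first; the
# yielded triples have the same shape.
def _example_direct_quotations_spacy_only():
    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'"I never said that," Smith replied.')
    for speaker, verb, quote in direct_quotations(doc):
        print(speaker.text, verb.text, quote.text)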
def subject_verb_object_triples(doc):
    """
    Extract an ordered sequence of subject-verb-object (SVO) triples from a
    spacy-parsed doc. Note that this only works for SVO languages.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc`` or ``spacy.Span``)

    Yields:
        the next (subject, verb, object, similarity score, subject entity type,
        object entity type) tuple from ``doc``, in order of appearance; subject
        and object are ``spacy.Span`` objects
    """
    # TODO: Rewrite rules based on http://www.anthology.aclweb.org/W/W12/W12-0702.pdf
    # TODO: Think about relative clauses (that-of), e.g. "products that include"
    # TODO: What to do about questions, where it may be VSO instead of SVO?
    # TODO: What about non-adjacent verb negations?
    # TODO: What about object (noun) negations?
    if isinstance(doc, SpacySpan):
        sents = [doc]
    else:  # textacy.Doc or spacy.Doc
        sents = doc.sents

    for sent in sents:
        start_i = sent[0].i

        verbs_init = get_main_verbs_of_sent(sent)
        list_candidates = []
        verb_tmp_token = None
        for verb_init in verbs_init:
            if verb_init['token'] != verb_tmp_token:
                verb_tmp_token = verb_init['token']
                subjs = get_subjects_of_verb(verb_init['token'], sent)
                if not subjs:
                    continue
                verbs = get_span_for_verb_auxiliaries(verb_init['token'], start_i, sent)
                list_candidates.append((subjs, verbs))

        for subjs, verbs in list_candidates:
            for verb in verbs:
                objs = get_objects_of_verb(verb['token'])
                if not objs:
                    continue

                # add adjacent auxiliaries to verbs, for context
                # and add compounds to compound nouns
                for subj in subjs:
                    subj_type = subj.ent_type_
                    subj = sent[get_span_for_compound_noun(subj)[0] - start_i: subj.i - start_i + 1]
                    for obj in objs:
                        obj_type = obj.ent_type_
                        if obj.pos != VERB:  # obj.pos == NOUN or obj.pos == PROPN
                            span = get_span_for_compound_noun(obj)
                        else:
                            # span = get_span_for_verb_auxiliaries(obj, start_i, sent)
                            span = (obj.i, obj.i)
                        obj = sent[span[0] - start_i: span[1] - start_i + 1]

                        score = subj.similarity(obj) + obj.similarity(subj)
                        yield (subj, verb, obj, score, subj_type, obj_type)
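# Illustrative usage sketch (not from the original source): this variant yields 6-tuples
# (subject, verb, object, similarity score, subject entity type, object entity type) and
# depends on this fork's modified helpers (get_main_verbs_of_sent returning dicts with a
# 'token' key, a 3-argument get_span_for_verb_auxiliaries), so it will not run against
# stock textacy. A spaCy model with word vectors is assumed so that Span.similarity()
# returns a meaningful score.
def _example_scored_svo_triples():
    import spacy

    nlp = spacy.load('en')  # or a vectors-bearing model such as en_core_web_md
    doc = nlp(u'Apple acquired the startup in 2016.')
    for subj, verb, obj, score, subj_type, obj_type in subject_verb_object_triples(doc):
        # `verb` is whatever this fork's get_span_for_verb_auxiliaries yields (a dict with a 'token' key)
        print(subj.text, obj.text, round(score, 3), subj_type, obj_type)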