def test_direct_quotations(self):
    """Direct quotations extracted from the doc render to the known strings."""
    expected = [
        'he, said, "I heard Donald Trump say we need to close mosques in the United States,"',
        'he, said, "Is that what we want our kids to learn?"',
    ]
    triples = extract.direct_quotations(self.spacy_doc)
    observed = [', '.join(part.text for part in triple) for triple in triples]
    self.assertEqual(observed, expected)
def test_direct_quotations(en_nlp, text, exp):
    """Quotation triples expose speaker/cue/content and match expected texts."""
    quotations = list(extract.direct_quotations(en_nlp(text)))
    # every extracted quotation must carry all three named fields
    for dq in quotations:
        for attr in ["speaker", "cue", "content"]:
            assert hasattr(dq, attr)
    obs_text = [
        ([tok.text for tok in speaker], [tok.text for tok in cue], content.text)
        for speaker, cue, content in quotations
    ]
    assert obs_text == exp
def test_direct_quotations(self):
    """Extracted (speaker, cue, content) triples join to the expected strings."""
    expected = [
        'he, said, "I heard Donald Trump say we need to close mosques in the United States,"',
        'he, said, "Is that what we want our kids to learn?"',
    ]
    observed = []
    for triple in extract.direct_quotations(self.spacy_doc):
        observed.append(', '.join(item.text for item in triple))
    self.assertEqual(observed, expected)
def test_direct_quotations(spacy_doc):
    """Each quotation is a tuple of Span/Token objects with the expected texts."""
    expected = [
        (
            "he",
            "said",
            '"I heard Donald Trump say we need to close mosques in the United States,"',
        ),
        ("he", "said", '"Is that what we want our kids to learn?"'),
    ]
    result = list(extract.direct_quotations(spacy_doc))
    # structural checks: tuples whose elements are spaCy Span or Token objects
    for dq in result:
        assert isinstance(dq, tuple)
        for obj in dq:
            assert isinstance(obj, (Span, Token))
    observed = [tuple(obj.text for obj in dq) for dq in result]
    assert observed == expected
def _get_quotes(self):
    """Compute, for each stance, the fraction of its article body made up of
    direct-quotation text.

    Reads ``self._stances`` (iterable of dicts with a ``'Body ID'`` key) and
    ``self._original_articles`` (mapping of body id -> raw bytes).

    Returns:
        list[float]: one ratio per stance, in the same order as
        ``self._stances``; 0.0 when the article body is empty (the original
        code raised ZeroDivisionError in that case).
    """
    quote_ratios = []
    for stance in tqdm.tqdm(self._stances):
        body = self._original_articles.get(stance['Body ID']).decode(
            'utf-8', 'replace')
        doc = Doc(content=body, lang=u'en')
        # total characters inside quotation content (third element of each
        # (speaker, cue, content) triple), relative to the whole body
        quoted_chars = sum(len(q[2]) for q in direct_quotations(doc))
        # guard against empty bodies to avoid ZeroDivisionError
        quote_ratios.append(quoted_chars / len(body) if body else 0.0)
    return quote_ratios
def direct_quotations(self):
    """
    Baseline, not-great attempt at direct quotation extraction (no indirect
    or mixed quotations) using rules and patterns. English only.

    Yields:
        (``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation
        represented as a (speaker, reporting verb, quotation) 3-tuple

    Raises:
        NotImplementedError: if the document's language is not English

    .. seealso:: :func:`extract.direct_quotations() <textacy.extract.direct_quotations>`
    """
    if self.lang != 'en':
        raise NotImplementedError('sorry, English-language texts only :(')
    # delegate to the functional implementation; `yield from` replaces the
    # manual `for dq in ...: yield dq` loop
    yield from extract.direct_quotations(self.spacy_doc)
def direct_quotations(self):
    """
    Baseline, not-great attempt at direction quotation extraction
    (no indirect or mixed quotations) using rules and patterns.
    English only.

    Returns the generator produced by
    :func:`textacy.extract.direct_quotations` over this doc.
    """
    return extract.direct_quotations(self.spacy_doc)
def grammars( carrel, grammar, query, noun, lemma, sort, count ) :

    """Extract sentence fragments from <carrel> where fragments are one of:

    \b
      nouns - all the nouns and noun chunks
      quotes - things people say
      svo - fragments in the form of subject-verb-object (the default)
      sss - a more advanced version of svo; fragments beginning with
      an entity, are co-occur with a verb, and are followed by a phrase

    This is very useful for the purposes of listing more complete ideas
    from a text.

    Examples:

    \b
      rdr grammars homer
      rdr grammars -g nouns homer
      rdr grammars -g sss -n hector -l be homer"""

    # require; `from os import system` was unused and has been dropped
    from textacy     import extract
    from re          import search
    from collections import Counter

    # sanity check
    checkForCarrel( carrel )

    # initialize; default so an unmatched grammar yields no output
    # rather than a NameError
    doc      = carrel2doc( carrel )
    features = []

    # subject-verb-object triples (the default)
    if grammar == 'svo' :

        for triple in extract.subject_verb_object_triples( doc ) :
            subject = ''.join( token.text_with_ws for token in triple.subject )
            verb    = ''.join( token.text_with_ws for token in triple.verb )
            # `obj` avoids shadowing the builtin `object`
            obj     = ''.join( token.text_with_ws for token in triple.object )
            # NOTE(review): the original separator here is ' \t' (space then
            # tab), unlike the plain '\t' used by the other branches;
            # preserved for backward compatibility — confirm if intentional
            features.append( ' \t'.join( [ subject, verb, obj ] ) )

    # direct quotations
    elif grammar == 'quotes' :

        for quotation in extract.direct_quotations( doc ) :
            speaker = ''.join( token.text_with_ws for token in quotation.speaker )
            cue     = ''.join( token.text_with_ws for token in quotation.cue )
            content = quotation.content.text_with_ws
            features.append( '\t'.join( [ speaker, cue, content ] ) )

    # noun chunks
    elif grammar == 'nouns' :

        features = [ chunk.text for chunk in extract.noun_chunks( doc ) ]

    # semi-structured sentences
    elif grammar == 'sss' :

        # the -n option is mandatory for sss
        if not noun :
            click.echo( "Error: When specifying sss, the -n option is required. \nSee 'rdr grammars --help'.", err=True )
            exit()

        for statement in extract.semistructured_statements( doc, entity=noun, cue=lemma ) :
            entity   = ''.join( token.text_with_ws for token in statement.entity )
            cue      = ''.join( token.text_with_ws for token in statement.cue )
            fragment = ''.join( token.text_with_ws for token in statement.fragment )
            features.append( '\t'.join( [ entity, cue, fragment ] ) )

    # filter, conditionally
    if query : features = [ feature for feature in features if search( query, feature ) ]

    # sort, conditionally
    if sort : features.sort()

    # count, conditionally; Counter.most_common() sorts by descending count
    # with ties in first-seen order, matching the original hand-rolled
    # dict + sorted( ..., reverse=True ) logic (stable sort)
    if count :
        tallies  = Counter( features )
        features = [ str( tally ) + '\t' + feature for feature, tally in tallies.most_common() ]

    # output
    for feature in features : click.echo( feature )
def test_direct_quotations_spanish(es_nlp, text, exp):
    """Spanish quotation triples unpack to the expected token/text tuples."""
    observed = []
    for speaker, cue, content in extract.direct_quotations(es_nlp(text)):
        speaker_toks = [tok.text for tok in speaker]
        cue_toks = [tok.text for tok in cue]
        observed.append((speaker_toks, cue_toks, content.text))
    assert observed == exp