예제 #1
0
 def test_direct_quotations(self):
     """Each extracted quotation triple, comma-joined, must match the expected text."""
     rendered = []
     for triple in extract.direct_quotations(self.spacy_doc):
         # Every element of the triple exposes ``.text``; join them in order.
         rendered.append(', '.join(part.text for part in triple))
     self.assertEqual(
         rendered,
         [
             'he, said, "I heard Donald Trump say we need to close mosques in the United States,"',
             'he, said, "Is that what we want our kids to learn?"',
         ])
예제 #2
0
def test_direct_quotations(en_nlp, text, exp):
    """Extracted quotations expose speaker/cue/content and match ``exp`` as text.

    ``en_nlp`` is called on ``text`` to build the document; ``exp`` is the
    expected list of ``(speaker token texts, cue token texts, content text)``
    tuples.
    """
    quotations = list(extract.direct_quotations(en_nlp(text)))

    # Every quotation must carry all three named fields.
    for quotation in quotations:
        for field_name in ["speaker", "cue", "content"]:
            assert hasattr(quotation, field_name)

    # Reduce each quotation to plain strings so it can be compared with ``exp``.
    flattened = []
    for speaker, cue, content in quotations:
        speaker_words = [tok.text for tok in speaker]
        cue_words = [tok.text for tok in cue]
        flattened.append((speaker_words, cue_words, content.text))
    assert flattened == exp
예제 #3
0
 def test_direct_quotations(self):
     """Compare comma-joined quotation triples from the fixture doc with the known output."""

     def as_text(triple):
         # Render one quotation triple as "<part>, <part>, <part>".
         return ', '.join(part.text for part in triple)

     observed = list(map(as_text, extract.direct_quotations(self.spacy_doc)))
     expected = [
         'he, said, "I heard Donald Trump say we need to close mosques in the United States,"',
         'he, said, "Is that what we want our kids to learn?"',
     ]
     self.assertEqual(observed, expected)
예제 #4
0
def test_direct_quotations(spacy_doc):
    """Quotations come back as tuples of Span/Token objects with the expected text."""
    quotations = list(extract.direct_quotations(spacy_doc))

    # Container type first, for every quotation ...
    for quotation in quotations:
        assert isinstance(quotation, tuple)
    # ... then the type of every element inside each quotation.
    for quotation in quotations:
        for element in quotation:
            assert isinstance(element, (Span, Token))

    as_text = []
    for quotation in quotations:
        as_text.append(tuple(element.text for element in quotation))
    assert as_text == [
        (
            "he",
            "said",
            '"I heard Donald Trump say we need to close mosques in the United States,"',
        ),
        ("he", "said", '"Is that what we want our kids to learn?"'),
    ]
예제 #5
0
    def _get_quotes(self):
        """
        Compute, for every stance, the share of its article body that is quoted.

        For each entry of ``self._stances`` the article body is looked up in
        ``self._original_articles`` by the stance's ``'Body ID'``, decoded as
        UTF-8 (undecodable bytes are replaced), and scanned with
        ``direct_quotations``. The lengths of the third element of every
        quotation are summed and divided by the length of the body.

        Returns:
            list of float: one ratio per stance, in the order of
            ``self._stances``. An empty body yields ``0.0``.
        """
        quote_count = []
        for stance in tqdm.tqdm(self._stances):
            body = self._original_articles.get(stance['Body ID']).decode(
                'utf-8', 'replace')
            if not body:
                # Nothing to quote in an empty article; also avoids a
                # ZeroDivisionError in the ratio below.
                quote_count.append(0.0)
                continue

            doc = Doc(content=body, lang=u'en')
            # NOTE(review): q[2] is presumably a spaCy span, so its length is
            # counted in tokens while len(body) is in characters -- confirm
            # that this mixed-unit ratio is intended.
            quoted_length = sum(len(q[2]) for q in direct_quotations(doc))

            # float() forces true division even where ``/`` on two ints would
            # floor the result (Python 2 without ``__future__.division``).
            quote_count.append(float(quoted_length) / len(body))

        return quote_count
예제 #6
0
    def direct_quotations(self):
        """
        Baseline, rule- and pattern-based extraction of direct quotations
        (indirect and mixed quotations are not handled). English only.

        Yields:
            (``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation
                represented as a (speaker, reporting verb, quotation) 3-tuple

        Raises:
            NotImplementedError: if ``self.lang`` is not ``'en'``; because this
                is a generator, the error surfaces on first iteration.

        .. seealso:: :func:`extract.direct_quotations() <textacy.extract.direct_quotations>`
        """
        if self.lang == 'en':
            # Delegate to the extract module and relay its results one by one.
            for quotation in extract.direct_quotations(self.spacy_doc):
                yield quotation
        else:
            raise NotImplementedError('sorry, English-language texts only :(')
예제 #7
0
파일: texts.py 프로젝트: kevntao/textacy
    def direct_quotations(self):
        """
        Simple baseline extractor for direct quotations, driven by rules and
        patterns; indirect or mixed quotations are not covered. English only.

        Yields:
            (``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation
                represented as a (speaker, reporting verb, quotation) 3-tuple

        Raises:
            NotImplementedError: when ``self.lang`` is anything other than
                ``'en'``; raised lazily, once the generator is first advanced.

        .. seealso:: :func:`extract.direct_quotations() <textacy.extract.direct_quotations>`
        """
        is_english = self.lang == 'en'
        if not is_english:
            raise NotImplementedError('sorry, English-language texts only :(')

        quotations = extract.direct_quotations(self.spacy_doc)
        for quotation in quotations:
            yield quotation
예제 #8
0
파일: texts.py 프로젝트: EricSchles/textacy
 def direct_quotations(self):
     """
     Baseline, rule- and pattern-based extraction of direct quotations
     (indirect and mixed quotations are not handled). English only.

     Returns whatever ``extract.direct_quotations`` produces for this
     object's ``spacy_doc``.
     """
     doc = self.spacy_doc
     return extract.direct_quotations(doc)
예제 #9
0
def grammars( carrel, grammar, query, noun, lemma, sort, count ) :

	"""Extract sentence fragments from <carrel> where fragments are one of:
	
	\b
	  nouns - all the nouns and noun chunks
	  quotes - things people say
	  svo - fragments in the form of subject-verb-object (the default)
	  sss - a more advanced version of svo; fragments beginning
	    with an entity, are co-occur with a verb, and are followed
	    by a phrase
	
	This is very useful for the purposes of listing more complete ideas from a text.
	
	Examples:
	
	\b
	  rdr grammars homer
	  rdr grammars -g nouns homer
	  rdr grammars -g sss -n hector -l be homer"""
	
	# NOTE: the docstring above carries \b markers and reads like command-line
	# help text, so its wording is deliberately left untouched.
	#
	# Parameters, as used below:
	#   carrel  - handed to checkForCarrel() and carrel2doc()
	#   grammar - one of 'svo', 'quotes', 'nouns', or 'sss'
	#   query   - optional regular expression; only matching features are kept
	#   noun    - entity for 'sss'; required when grammar is 'sss'
	#   lemma   - cue for 'sss'
	#   sort    - when truthy, sort the features
	#   count   - when truthy, output "<frequency>\t<feature>", most frequent first
	#
	# Every resulting feature is written with click.echo(); nothing is returned.
	
	# require
	from collections import Counter
	from re          import search
	from textacy     import extract
	
	# sanity check
	checkForCarrel( carrel )

	# initialize
	doc = carrel2doc( carrel )
	
	# concatenate the tokens of a span, keeping their original whitespace
	def stringify( tokens ) : return ''.join( token.text_with_ws for token in tokens )

	# get the features; svo
	if grammar == 'svo' :
	
		# do the work and simplify the result
		# NOTE(review): the delimiter here is space+tab, whereas the other
		# grammars use a bare tab; kept as-is since output consumers may
		# depend on it -- confirm
		features = [ ' \t'.join( [ stringify( triple.subject ), stringify( triple.verb ), stringify( triple.object ) ] )
		             for triple in extract.subject_verb_object_triples( doc ) ]
		
	# quotes
	elif grammar == 'quotes' :
	
		# do the work and simplify the result; content is used whole, not token by token
		features = [ '\t'.join( [ stringify( quotation.speaker ), stringify( quotation.cue ), quotation.content.text_with_ws ] )
		             for quotation in extract.direct_quotations( doc ) ]

	# noun chunks
	elif grammar == 'nouns' :
	
		# do the work and simplify the result
		features = [ chunk.text for chunk in extract.noun_chunks( doc ) ]
		
	# semi-structured sentences
	elif grammar == 'sss' :

		# sanity check
		if not noun :
		
			# raise SystemExit directly rather than calling the site-provided
			# exit() helper, and use a non-zero status because this is an error
			click.echo( "Error: When specifying sss, the -n option is required. See 'rdr grammars --help'.", err=True )
			raise SystemExit( 1 )
			
		# do the work and simplify the result
		features = [ '\t'.join( [ stringify( statement.entity ), stringify( statement.cue ), stringify( statement.fragment ) ] )
		             for statement in extract.semistructured_statements( doc, entity=noun, cue=lemma ) ]

	# unknown grammar; fail clearly instead of tripping over an unbound variable below
	else :
	
		click.echo( "Error: Unknown grammar '{}'. See 'rdr grammars --help'.".format( grammar ), err=True )
		raise SystemExit( 1 )

	# filter, conditionally
	if query : features = [ feature for feature in features if search( query, feature ) ]
	
	# sort, conditionally
	if sort : features.sort()
	
	# count, conditionally
	if count :
	
		# tabulate; most_common() orders by descending frequency and keeps
		# first-seen order among ties, just like a stable reverse sort
		features = [ str( frequency ) + '\t' + feature for feature, frequency in Counter( features ).most_common() ]
	
	# output
	for feature in features : click.echo( feature )
예제 #10
0
def test_direct_quotations_spanish(es_nlp, text, exp):
    """Quotations extracted from the ``es_nlp`` document must match ``exp`` as text.

    ``exp`` is the expected list of
    ``(speaker token texts, cue token texts, content text)`` tuples.
    """
    doc = es_nlp(text)
    flattened = []
    for speaker, cue, content in extract.direct_quotations(doc):
        speaker_words = [tok.text for tok in speaker]
        cue_words = [tok.text for tok in cue]
        flattened.append((speaker_words, cue_words, content.text))
    assert flattened == exp
예제 #11
0
 def direct_quotations(self):
     """
     Simple baseline extractor for direct quotations, driven by rules and
     patterns; indirect or mixed quotations are not covered. English only.

     Hands ``self.spacy_doc`` to ``extract.direct_quotations`` and returns
     its result unchanged.
     """
     quotations = extract.direct_quotations(self.spacy_doc)
     return quotations