Example #1
    def semistructured_statements(self, entity, **kwargs):
        """
        Extract "semi-structured statements" from doc, each as a (entity, cue, fragment)
        triple. This is similar to subject-verb-object triples.

        Args:
            entity (str): a noun or noun phrase of some sort (e.g. "President Obama",
                "global warming", "Python")
            **kwargs:
                cue (str, optional): verb lemma with which `entity` is associated
                    (e.g. "talk about", "have", "write")
                ignore_entity_case (bool, optional): if True, entity matching is
                    case-independent
                min_n_words (int, optional): min number of tokens allowed in a
                    matching fragment
                max_n_words (int, optional): max number of tokens allowed in a
                    matching fragment

        Yields:
            (``spacy.Span`` or ``spacy.Token``, ``spacy.Span`` or ``spacy.Token``, ``spacy.Span``):
                  where each element is a matching (entity, cue, fragment) triple

        .. seealso:: :func:`extract.semistructured_statements() <textacy.extract.semistructured_statements>`
        """
        for sss in extract.semistructured_statements(self.spacy_doc, entity,
                                                     **kwargs):
            yield sss
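A minimal usage sketch for the wrapped function, assuming the older textacy API documented above (a string `entity` plus the `cue`, `min_n_words`, and `max_n_words` keyword arguments); the spaCy model name and sample sentence are illustrative, and whether any triple is yielded depends on the parse.

import spacy
from textacy import extract

nlp = spacy.load("en_core_web_sm")  # assumes an English model is installed
spacy_doc = nlp("President Obama talked about health care reform at the town hall.")

# Each yielded item is an (entity, cue, fragment) triple of spaCy Spans/Tokens.
for entity, cue, fragment in extract.semistructured_statements(
        spacy_doc, "President Obama", cue="talk about", min_n_words=2, max_n_words=20):
    print(entity.text, "|", cue.text, "|", fragment.text)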
Example #2
    def semistructured_statements(self, entity, **kwargs):
        """
        Extract "semi-structured statements" from doc, each as a (entity, cue, fragment)
        triple. This is similar to subject-verb-object triples.

        Args:
            entity (str): a noun or noun phrase of some sort (e.g. "President Obama",
                "global warming", "Python")
            **kwargs:
                cue (str, optional): verb lemma with which `entity` is associated
                    (e.g. "talk about", "have", "write")
                ignore_entity_case (bool, optional): if True, entity matching is
                    case-independent
                min_n_words (int, optional): min number of tokens allowed in a
                    matching fragment
                max_n_words (int, optional): max number of tokens allowed in a
                    matching fragment

        Yields:
            (``spacy.Span`` or ``spacy.Token``, ``spacy.Span`` or ``spacy.Token``, ``spacy.Span``):
                  where each element is a matching (entity, cue, fragment) triple

        .. seealso:: :func:`extract.semistructured_statements() <textacy.extract.semistructured_statements>`
        """
        for sss in extract.semistructured_statements(self.spacy_doc, entity, **kwargs):
            yield sss
Example #3
from typing import List

import spacy
from textacy.extract import semistructured_statements


def get_statements(parsed_doc: spacy.tokens.Doc, possible_subjects: List[str]) -> List[tuple]:

    statements = []

    for subject in possible_subjects:
        statements.extend(list(semistructured_statements(parsed_doc, subject)))

    return statements
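A brief, hedged usage sketch for `get_statements`, relying on the older textacy call signature used above (entity passed positionally, default cue "be"); the model name and sample text are illustrative, and actual matches depend on the parser's output.

nlp = spacy.load("en_core_web_sm")  # assumes an English model is installed
parsed = nlp("Alice is a chemist. Bob is a teacher in Boston.")

# Each statement is an (entity, cue, fragment) triple of spaCy objects.
for entity, cue, fragment in get_statements(parsed, ["Alice", "Bob"]):
    print(entity, "|", cue, "|", fragment)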
Example #4
def test_semistructured_statements(spacy_doc):
    expected = (
        "we", "discussed",
        "the impact of technology trends on education in the Middle East")
    observed = next(
        extract.semistructured_statements(spacy_doc, "we", cue="discuss"))
    assert isinstance(observed, tuple) and len(observed) == 3
    assert all(isinstance(obj, (Span, Token)) for obj in observed)
    assert all(obs.text == exp for obs, exp in compat.zip_(observed, expected))
Example #5
    def semistructured_statements(self, entity, **kwargs):
        """
        Extract "semi-structured statements" from doc, each as a (entity, cue, fragment)
        triple. This is similar to subject-verb-object triples.

        Args:
            entity (str): a noun or noun phrase of some sort (e.g. "President Obama",
                "global warming", "Python")

        .. seealso:: :func:`extract.semistructured_statements() <textacy.extract.semistructured_statements>`
        for all function kwargs.
        """
        return extract.semistructured_statements(self.spacy_doc, entity, **kwargs)
Example #6
    def semistructured_statements(self, entity, **kwargs):
        """
        Extract "semi-structured statements" from doc, each as a (entity, cue, fragment)
        triple. This is similar to subject-verb-object triples.

        Args:
            entity (str): a noun or noun phrase of some sort (e.g. "President Obama",
                "global warming", "Python")

        .. seealso:: :func:`extract.semistructured_statements() <textacy.extract.semistructured_statements>`
        for all function kwargs.
        """
        return extract.semistructured_statements(self.spacy_doc, entity,
                                                 **kwargs)
Example #7
def test_extract_functionality(doc):
    bigrams = list(
        extract.ngrams(doc,
                       2,
                       filter_stops=True,
                       filter_punct=True,
                       filter_nums=False))[:10]
    for bigram in bigrams:
        assert isinstance(bigram, SpacySpan)
        assert len(bigram) == 2

    trigrams = list(
        extract.ngrams(doc,
                       3,
                       filter_stops=True,
                       filter_punct=True,
                       min_freq=2))[:10]
    for trigram in trigrams:
        assert isinstance(trigram, SpacySpan)
        assert len(trigram) == 3

    nes = list(
        extract.named_entities(doc,
                               drop_determiners=False,
                               exclude_types='numeric'))[:10]
    for ne in nes:
        assert isinstance(ne, SpacySpan)
        assert ne.label_
        assert ne.label_ != 'QUANTITY'

    pos_regex_matches = list(
        extract.pos_regex_matches(
            doc, constants.POS_REGEX_PATTERNS['en']['NP']))[:10]
    for match in pos_regex_matches:
        assert isinstance(match, SpacySpan)

    stmts = list(extract.semistructured_statements(doc, 'I', cue='be'))[:10]
    for stmt in stmts:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], compat.unicode_)
        assert len(stmt) == 3

    kts = keyterms.textrank(doc, n_keyterms=10)
    for keyterm in kts:
        assert isinstance(keyterm, tuple)
        assert isinstance(keyterm[0], compat.unicode_)
        assert isinstance(keyterm[1], float)
        assert keyterm[1] > 0.0
Example #8
def test_semistructured_statements(sss_doc, entity, cue, fragment_len_range,
                                   exp):
    obs = list(
        extract.semistructured_statements(
            sss_doc,
            entity=entity,
            cue=cue,
            fragment_len_range=fragment_len_range))
    assert all(
        hasattr(sss, attr) for sss in obs
        for attr in ["entity", "cue", "fragment"])
    obs_text = [
        ([tok.text for tok in e], [tok.text for tok in c], [tok.text for tok in f])
        for e, c, f in obs
    ]
    assert obs_text == exp
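For the newer textacy API exercised by this test (keyword-only `entity` and `cue` plus `fragment_len_range`, with each result exposing `.entity`, `.cue`, and `.fragment` token lists), a hedged sketch might look like the following; the model name and sample text are illustrative.

import spacy
from textacy import extract

nlp = spacy.load("en_core_web_sm")  # assumes an English model is installed
doc = nlp("Hector was a prince of Troy. Hector said the war was already lost.")

for triple in extract.semistructured_statements(
        doc, entity="Hector", cue="be", fragment_len_range=(2, 10)):
    # Join the token lists back into readable strings.
    entity_text = "".join(tok.text_with_ws for tok in triple.entity).strip()
    cue_text = "".join(tok.text_with_ws for tok in triple.cue).strip()
    fragment_text = "".join(tok.text_with_ws for tok in triple.fragment).strip()
    print(entity_text, "|", cue_text, "|", fragment_text)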
Example #9
def test_extract_functionality(doc):
    bigrams = list(
        extract.ngrams(doc,
                       2,
                       filter_stops=True,
                       filter_punct=True,
                       filter_nums=False))[:10]
    for bigram in bigrams:
        assert isinstance(bigram, Span)
        assert len(bigram) == 2

    trigrams = list(
        extract.ngrams(doc,
                       3,
                       filter_stops=True,
                       filter_punct=True,
                       min_freq=2))[:10]
    for trigram in trigrams:
        assert isinstance(trigram, Span)
        assert len(trigram) == 3

    nes = list(
        extract.entities(doc, drop_determiners=False,
                         exclude_types="numeric"))[:10]
    for ne in nes:
        assert isinstance(ne, Span)
        assert ne.label_
        assert ne.label_ != "QUANTITY"

    pos_regex_matches = list(
        extract.pos_regex_matches(
            doc, constants.POS_REGEX_PATTERNS["en"]["NP"]))[:10]
    for match in pos_regex_matches:
        assert isinstance(match, Span)

    stmts = list(extract.semistructured_statements(doc, "I", cue="be"))[:10]
    for stmt in stmts:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], compat.unicode_)
        assert len(stmt) == 3

    kts = textacy.ke.textrank(doc, topn=10)
    for keyterm in kts:
        assert isinstance(keyterm, tuple)
        assert isinstance(keyterm[0], compat.unicode_)
        assert isinstance(keyterm[1], float)
        assert keyterm[1] > 0.0
Example #10
def test_extract_functionality(doc):
    bigrams = list(
        extract.ngrams(doc,
                       2,
                       filter_stops=True,
                       filter_punct=True,
                       filter_nums=False))[:10]
    for bigram in bigrams:
        assert isinstance(bigram, Span)
        assert len(bigram) == 2

    trigrams = list(
        extract.ngrams(doc,
                       3,
                       filter_stops=True,
                       filter_punct=True,
                       min_freq=2))[:10]
    for trigram in trigrams:
        assert isinstance(trigram, Span)
        assert len(trigram) == 3

    nes = list(
        extract.entities(doc, drop_determiners=False,
                         exclude_types="numeric"))[:10]
    for ne in nes:
        assert isinstance(ne, Span)
        assert ne.label_
        assert ne.label_ != "QUANTITY"

    regex_matches = list(extract.regex_matches(doc, r"Mr\. Speaker"))[:10]
    for match in regex_matches:
        assert isinstance(match, Span)

    stmts = list(extract.semistructured_statements(doc, entity="I",
                                                   cue="be"))[:10]
    for stmt in stmts:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], str)
        assert len(stmt) == 3

    kts = kt.textrank(doc, topn=10)
    for keyterm in kts:
        assert isinstance(keyterm, tuple)
        assert isinstance(keyterm[0], str)
        assert isinstance(keyterm[1], float)
        assert keyterm[1] > 0.0
Example #11
def get_semi_structured_statements(doc, entity, cue='be'):
    assert isinstance(doc, (textacy.Doc, spacy.tokens.Doc)), \
        "Only textacy.Doc and spacy.tokens.Doc are supported"
    assert isinstance(entity, basestring)
    assert isinstance(cue, basestring)
    return extract.semistructured_statements(doc, entity, cue)
Example #12
def grammars( carrel, grammar, query, noun, lemma, sort, count ) :

	"""Extract sentence fragments from <carrel> where fragments are one of:
	
	\b
	  nouns - all the nouns and noun chunks
	  quotes - things people say
	  svo - fragments in the form of subject-verb-object (the default)
	  sss - a more advanced version of svo; fragments that begin
	    with an entity, co-occur with a verb, and are followed
	    by a phrase
	
	This is very useful for the purposes of listing more complete ideas from a text.
	
	Examples:
	
	\b
	  rdr grammars homer
	  rdr grammars -g nouns homer
	  rdr grammars -g sss -n hector -l be homer"""
	
	# require
	from textacy import extract
	from os      import system
	from re      import search
	
	# sanity check
	checkForCarrel( carrel )

	# initialize
	doc = carrel2doc( carrel )

	# get the features; svo
	if grammar == 'svo' :
	
		# do the work
		features = list( extract.subject_verb_object_triples( doc ) )
		
		# simplify the result
		items = []
		for feature in features :
		
			subject = [ token.text_with_ws for token in feature.subject ]
			verb    = [ token.text_with_ws for token in feature.verb ]
			object  = [ token.text_with_ws for token in feature.object ]
			items.append( '\t'.join( [ ''.join( subject ), ''.join( verb ), ''.join( object ) ] ) )

		# done
		features = items
		
	# quotes
	elif grammar == 'quotes' :
	
		# do the work
		features = list( extract.direct_quotations( doc ) )
		
		# simplify the result
		items = []
		for feature in features :
		
			# parse and stringify
			speaker = [ token.text_with_ws for token in feature.speaker ]
			cue     = [ token.text_with_ws for token in feature.cue ]
			content = feature.content.text_with_ws
			items.append( '\t'.join( [ ''.join( speaker ), ''.join( cue ), content ] ) )

		# done
		features = items

	# noun chunks
	elif grammar == 'nouns' :
	
		# do the work and simplify the result
		features = list( extract.noun_chunks( doc ) )
		features = [ feature.text for feature in features ]
		
	# semi-structured sentences
	elif grammar == 'sss' :

		# sanity check
		if not noun :
		
			click.echo( "Error: When specifying sss, the -n option is required. See 'rdr grammars --help'.", err=True )
			exit()
			
		# do the work
		features = list( extract.semistructured_statements( doc, entity=noun, cue=lemma ) )

		# simplify the result
		items = []
		for feature in features :
		
			entity   = [ token.text_with_ws for token in feature.entity ]
			cue      = [ token.text_with_ws for token in feature.cue ]
			fragment = [ token.text_with_ws for token in feature.fragment ]
			items.append( '\t'.join( [ ''.join( entity ), ''.join( cue ), ''.join( fragment ) ] ) )

		# done
		features = items

	# filter, conditionally
	if query : features = [ feature for feature in features if ( search( query, feature ) ) ]
	
	# sort, conditionally
	if sort : features.sort()
	
	# count, conditionally
	if count :
	
		# initialize a dictionary and process each feature
		items = {}
		for feature in features :

			# update the dictionary
			if feature in items : items[ feature ] += 1
			else                : items[ feature ]  = 1

		# sort the dictionary; return the features
		features = sorted( items.items(), key=lambda x:x[ 1 ], reverse=True )
		
		# process each feature, again
		items = []
		for feature in features :
			
			# create a record and update
			record = str( feature[ 1 ] ) + '\t' + feature[ 0 ]
			items.append( record )
		
		# done
		features = items
	
	# output
	for feature in features : click.echo( feature )