Example #1
    def concordance(self, *args, **kwargs): 
        """
        A concordance method for Tregex queries, CoreNLP dependencies, 
        tokenised data or plaintext.

           >>> wv = ['want', 'need', 'feel', 'desire']
           >>> corpus.concordance({'l': wv, 'f': 'root'})

        Arguments are the same as :func:`~corpkit.interrogation.Interrogation.interrogate`, plus:

        :param only_format_match: if True, the left and right context windows contain plain words, regardless of what is in 'show'
        :type only_format_match: bool

        :param random: randomise the order of the concordance lines
        :type random: bool

        :param only_unique: return only unique concordance lines
        :type only_unique: bool

        :returns: A :class:`corpkit.interrogation.Concordance` instance

        """

        from corpkit.interrogator import interrogator
        return interrogator(self, conc = True, *args, **kwargs)
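A minimal usage sketch for the method above, assuming a parsed corpus loaded with the corpkit.corpus.Corpus class referenced in these docstrings; the corpus path is hypothetical:

from corpkit.corpus import Corpus

# hypothetical parsed corpus directory
corpus = Corpus('data/sample-parsed')

# lemmas of desire-type verbs functioning as root, as in the docstring example,
# plus the extra keyword arguments documented above
wv = ['want', 'need', 'feel', 'desire']
lines = corpus.concordance({'l': wv, 'f': 'root'}, random=True, only_unique=True)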
Example #2
    def get_stats(self, *args):
        """
        Get some basic stats from the corpus, and store as :py:attr:`~corpkit.corpus.Corpus.features`

           >>> corpus.get_stats()

        :returns: None
        """
        from corpkit import interrogator
        self.features = interrogator(self.path, 's', 'any').results
        print '\nFeatures defined. See .features attribute ...' 
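A short sketch of how the stored table is reached afterwards; the corpus object is assumed to be the same kind as in the other examples, and the path is hypothetical:

from corpkit.corpus import Corpus

corpus = Corpus('data/sample-parsed')   # hypothetical parsed corpus
corpus.get_stats()
# get_stats() returns None; the counts live on the .features attribute
features = corpus.features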
Example #3
def interroplot(path, search, **kwargs):
    """Demo function for interrogator/plotter.

        1. Interrogates path with a query
        2. Gets relative frequencies
        3. Plots the top seven results

    :param path: path to corpus
    :type path: str
    :param search: what to search for: a dict of search criteria, or a Tregex query string (converted to ``{'t': search}``)
    :type search: str/dict
    """
    import corpkit
    from corpkit import interrogator
    if type(search) == str:
        search = {'t': search}
    kwargs['show'] = kwargs.pop('show', 'w')
    quickstart = interrogator(path, search = search, **kwargs)
    edited = quickstart.edit('%', 'self', print_info = False)
    edited.plot(str(path))
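A hedged sketch of calling this version of interroplot, assuming it is importable from the package top level like the other helpers shown here; the corpus path is hypothetical:

from corpkit import interroplot

# a dict search using the 'p' (POS) key documented elsewhere in these examples;
# a plain string would instead be wrapped as {'t': search} by the branch above
interroplot('data/sample-parsed', {'p': r'^JJ'})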
Example #4
def interroplot(path, query):
    """Demo function for interrogator/plotter.

        1. Interrogates path with a Tregex query
        2. Gets relative frequencies
        3. Plots the top seven results

    :param path: path to corpus
    :type path: str
    
    :param query: Tregex query
    :type query: str

    """
    import corpkit
    from corpkit import interrogator, editor, plotter
    quickstart = interrogator(path, 'words', query, show = ['w'])
    edited = editor(quickstart.results, '%', quickstart.totals, print_info = False)
    plotter(str(path), edited.results)
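The same kind of sketch for this older, Tregex-only signature; the path and pattern are illustrative:

from corpkit import interroplot

# plot relative frequencies of the most common nouns
interroplot('data/sample-parsed', r'/NN.?/ < __')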
Example #5
# <markdowncell>
# Next, we perform interrogations with *interrogator()*. Its most important arguments are:
#
# 1. **path to corpus** (the *path* variable)
#
# 2. Tregex **options**:
#   * **'t'**: return only words
#   * **'c'**: return a count of matches
#
# 3. the **Tregex query**

# We only need to count tokens, so we can use the **count** option (it's often faster than getting lists of matching tokens). The cell below will run *interrogator()* over each annual subcorpus and count the number of matches for the query.

# <codecell>
allwords = interrogator(path, 'count', allwords_query) 

# <markdowncell>
# When the interrogation has finished, we can view the total counts by getting the *totals* branch of the *allwords* interrogation:

# <codecell>
# from the allwords results, print the totals
print allwords.totals

# <markdowncell>
# If you want to see the query and options that created the results, you can print the *query* branch.

# <codecell>
print allwords.query

# <headingcell level=3>
Example #6
# 1. **path to corpus**
#
# 2. Tregex **options**:
#   * **'t/w/words'**: return only words
#   * **'c/count'**: return a count of matches
#   * **'p/pos'**: return only the tag
#   * **'b/both'**: return tag and word together
#
# 3. a **Tregex query**

# We only need to count tokens, so we can use the `'count'` option (it's often faster than getting lists of matching tokens). The cell below will run `interrogator()` over each annual subcorpus and count the number of matches for the query.

# Some common Tregex patterns have been predefined. Searching for `'any'` will find any word in the corpus and count it.

# <codecell>
allwords = interrogator(annual_trees, 'count', 'any') 

# <markdowncell>
# When the interrogation has finished, we can view our results:

# <codecell>
# from the allwords results, print the totals
print allwords.totals

# <markdowncell>
# If you want to see the query and options that created the results, you can use:

# <codecell>
print allwords.query

# <markdowncell>
Example #7
    def concordance(self, *args, **kwargs):
        """run a concordance over the corpus, using the same arguments as :func:`~corpkit.corpus.Corpus.concordance`"""
        from corpkit import interrogator
        return interrogator([s for s in self], conc = True, *args, **kwargs)
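A small sketch of calling this wrapper, assuming it is defined on a sliceable collection of subcorpora (for example, a slice of a corpus's subcorpora attribute, as used in the final example below); the search values are illustrative:

# hypothetical: concordance only the first two subcorpora rather than the whole corpus
lines = corpus.subcorpora[:2].concordance({'w': r'^run'}, only_unique=True)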
Example #8
    def interrogate(self, *args, **kwargs):
        """interrogate the corpus using :func:`~corpkit.corpus.Corpus.interrogate`"""
        from corpkit import interrogator
        return interrogator([s for s in self], *args, **kwargs)
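The matching sketch for the interrogate wrapper, under the same assumptions:

# hypothetical: interrogate the same slice, showing plain words
data = corpus.subcorpora[:2].interrogate({'p': r'^N'}, show=['w'])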
Example #9
    def interrogate(self, search, *args, **kwargs):
        """Interrogate a corpus of texts for a lexicogrammatical phenomenon

            >>> # show lemma form of nouns ending in 'ing'
            >>> q = {'w': r'ing$', 'p': r'^N'}
            >>> data = corpus.interrogate(q, show = 'l')
        
        :param search: what the query should be matched against:
           - t/tregex
           - w/word
           - l/lemma
           - f/function
           - g/governor
           - d/dependent
           - p/pos
           - i/index
           - n/ngrams
           - s/general stats
        :type search: str, or, for dependencies, a dict like ``{'w': 'help', 'p': r'^V'}``

        :param searchmode: Return results matching any/all criteria
        :type searchmode: str ('any'/'all')

        :param exclude: The inverse of ``search``: results matching these criteria are removed
        :type exclude: dict -- ``{'l': 'be'}``

        :param excludemode: Exclude results matching any/all criteria
        :type excludemode: str ('any'/'all')
        
        :param query: A search query for the interrogation
        :type query: str -- regex/Tregex pattern; dict -- ``{name: pattern}``; list -- word list to match
        
        :param show: What to output. If multiple strings are passed, results will be colon-separated, in order
           - t/tree
           - w/word
           - l/lemma
           - g/governor
           - d/dependent
           - f/function
           - p/pos
           - i/index
           - a/distance from root
        :type show: list of strings

        :param lemmatise: Force lemmatisation on results
        :type lemmatise: bool
           
        :param lemmatag: Explicitly pass a pos to lemmatiser (generally when data is unparsed)
        :type lemmatag: False/'n'/'v'/'a'/'r'
        
        :param spelling: Convert all to U.S. or U.K. English
        :type spelling: False/'US'/'UK'
           
        :param dep_type: The kind of Stanford CoreNLP dependency parses you want to use
        :type dep_type: str -- 'basic-dependencies'/'a', 'collapsed-dependencies'/'b', 'collapsed-ccprocessed-dependencies'/'c'
        
        :param quicksave: Save result as pickle to ``saved_interrogations/<str>`` on completion
        :type quicksave: str
        
        :param gramsize: size of ngrams (default 2)
        :type gramsize: int

        :param split_contractions: make ``"don't"`` et al into two tokens
        :type split_contractions: bool

        :param multiprocess: how many parallel processes to run
        :type multiprocess: int / bool (to determine automatically)

        :param files_as_subcorpora: treat each file as a subcorpus
        :type files_as_subcorpora: bool

        :returns: A :class:`corpkit.interrogation.Interrogation` object, with ``.query``, ``.results``, ``.totals`` attributes. If multiprocessing is \
        invoked, result may be a :class:`corpkit.interrogation.Interrodict` containing corpus names, queries or speakers as keys.
        """
        from corpkit import interrogator
        par = kwargs.pop('multiprocess', None)
        if par and self.subcorpora:
            if type(par) == int:
                kwargs['multiprocess'] = par
            return interrogator(self.subcorpora, search, *args, **kwargs)
        else:
            return interrogator(self, search, *args, **kwargs)
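A hedged end-to-end sketch tying the documented parameters together; the corpus path is hypothetical, and every key and keyword used below comes from the lists in the docstring:

from corpkit.corpus import Corpus

corpus = Corpus('data/sample-parsed')   # hypothetical parsed corpus

# verbs by POS, excluding forms of 'be', shown as colon-separated lemma and POS,
# spread over two processes
data = corpus.interrogate({'p': r'^V'},
                          exclude={'l': 'be'},
                          show=['l', 'p'],
                          multiprocess=2)

# an Interrogation object exposes .query, .results and .totals
results = data.results
totals = data.totals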