def concordance(self, *args, **kwargs):
    """
    Build a concordance from Tregex queries, CoreNLP dependencies,
    tokenised data or plaintext.

    >>> wv = ['want', 'need', 'feel', 'desire']
    >>> corpus.concordance({'l': wv, 'f': 'root'})

    Takes the same arguments as
    :func:`~corpkit.interrogation.Interrogation.interrogate`, and in
    addition:

    :param only_format_match: if True, left and right window will just be
                              words, regardless of what is in 'show'
    :type only_format_match: bool

    :param random: randomise lines
    :type random: bool

    :param only_unique: only unique lines
    :type only_unique: bool

    :returns: A :class:`corpkit.interrogation.Concordance` instance
    """
    # Imported at call time rather than at module load.
    from corpkit.interrogator import interrogator

    # Everything the caller supplied is passed straight through; the only
    # thing added here is the ``conc`` flag, which is always switched on.
    return interrogator(self, *args, conc=True, **kwargs)
def get_stats(self, *args):
    """
    Get some basic stats from the corpus, and store them as
    :py:attr:`~corpkit.corpus.Corpus.features`

    >>> corpus.get_stats()

    :param args: accepted for call compatibility; not used by this method
    :returns: None
    """
    from corpkit import interrogator

    # Run the 's' search with the 'any' query over the corpus path and keep
    # only the ``.results`` part of what comes back.
    self.features = interrogator(self.path, 's', 'any').results

    # A parenthesised single argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('\nFeatures defined. See .features attribute ...')
def interroplot(path, search, **kwargs):
    """Demo function for interrogator/plotter.

    1. Interrogates path with a query
    2. Gets relative frequencies
    3. Plots the result (described as the top seven results; the cut-off
       is not set in this function)

    :param path: path to corpus
    :type path: str

    :param search: search; a plain string is wrapped as ``{'t': search}``,
                   any other value is handed to ``interrogator`` unchanged
    :type search: str, or whatever ``interrogator`` accepts as ``search``

    :param kwargs: extra keyword arguments for ``interrogator``; ``show``
                   defaults to ``'w'`` when the caller does not supply it
    :returns: None
    """
    from corpkit import interrogator

    # isinstance also accepts str subclasses, which an exact type
    # comparison would have passed through unwrapped.
    if isinstance(search, str):
        search = {'t': search}

    # Only fill in ``show`` when the caller left it out.
    kwargs.setdefault('show', 'w')

    quickstart = interrogator(path, search=search, **kwargs)
    # '%' with 'self' asks the result to compute relative frequencies
    # (step 2 above).
    edited = quickstart.edit('%', 'self', print_info=False)
    edited.plot(str(path))
def interroplot(path, query):
    """Demo function for interrogator/plotter.

    1. Interrogates path with Tregex query,
    2. Gets relative frequencies
    3. Plots the result (described as the top seven results; the cut-off
       is not set in this function)

    :param path: path to corpus
    :type path: str

    :param query: Tregex query
    :type query: str
    :returns: None
    """
    from corpkit import interrogator, editor, plotter

    # Step 1: run the query, showing words.
    found = interrogator(path, 'words', query, show=['w'])

    # Step 2: '%' against the interrogation's own totals.
    relative = editor(found.results, '%', found.totals, print_info=False)

    # Step 3: the path, as a string, is the first argument to the plotter.
    title = str(path)
    plotter(title, relative.results)
def interroplot(path, search, **kwargs):
    """Demo function for interrogator/plotter.

    1. Interrogates path with a query
    2. Gets relative frequencies
    3. Plots the result (described as the top seven results; the cut-off
       is not set in this function)

    :param path: path to corpus
    :type path: str

    :param search: search; a plain string is wrapped as ``{'t': search}``,
                   any other value is handed to ``interrogator`` unchanged
    :type search: str, or whatever ``interrogator`` accepts as ``search``

    :param kwargs: extra keyword arguments for ``interrogator``; ``show``
                   defaults to ``'w'`` when the caller does not supply it
    :returns: None
    """
    from corpkit import interrogator

    # isinstance also accepts str subclasses, which an exact type
    # comparison would have passed through unwrapped.
    if isinstance(search, str):
        search = {'t': search}

    # Only fill in ``show`` when the caller left it out.
    kwargs.setdefault('show', 'w')

    quickstart = interrogator(path, search=search, **kwargs)
    # '%' with 'self' asks the result to compute relative frequencies
    # (step 2 above).
    edited = quickstart.edit('%', 'self', print_info=False)
    edited.plot(str(path))
# <markdowncell>

# Next, we perform interrogations with *interrogator()*. Its most important arguments are:
#
# 1. **path to corpus** (the *path* variable)
#
# 2. Tregex **options**:
#   * **'t'**: return only words
#   * **'c'**: return a count of matches
#
# 3. the **Tregex query**

# We only need to count tokens, so we can use the **count** option (it's often faster than getting lists of matching tokens). The cell below will run *interrogator()* over each annual subcorpus and count the number of matches for the query.

# <codecell>

allwords = interrogator(path, 'count', allwords_query)

# <markdowncell>

# When the interrogation has finished, we can view the total counts by getting the *totals* branch of the *allwords* interrogation:

# <codecell>

# from the allwords results, print the totals
print(allwords.totals)

# <markdowncell>

# If you want to see the query and options that created the results, you can print the *query* branch.

# <codecell>

print(allwords.query)

# <headingcell level=3>
# 1. **path to corpus**
#
# 2. Tregex **options**:
#   * **'t/w/words'**: return only words
#   * **'c/count'**: return a count of matches
#   * **'p/pos'**: return only the tag
#   * **'b/both'**: return tag and word together
#
# 3. a **Tregex query**

# We only need to count tokens, so we can use the `'count'` option (it's often faster than getting lists of matching tokens). The cell below will run `interrogator()` over each annual subcorpus and count the number of matches for the query.

# Some common Tregex patterns have been predefined. Searching for `'any'` will find any word in the corpus and count it.

# <codecell>

allwords = interrogator(annual_trees, 'count', 'any')

# <markdowncell>

# When the interrogation has finished, we can view our results:

# <codecell>

# from the allwords results, print the totals
print(allwords.totals)

# <markdowncell>

# If you want to see the query and options that created the results, you can use:

# <codecell>

print(allwords.query)

# <markdowncell>
def concordance(self, *args, **kwargs):
    """Concordance over every item in this collection.

    The items obtained by iterating over ``self`` are gathered into a list
    and handed to ``interrogator`` with ``conc`` switched on. All other
    arguments are forwarded untouched; see
    :func:`~corpkit.corpus.Corpus.concordance` for what they mean.

    :returns: whatever ``interrogator`` returns for the collected items
    """
    from corpkit import interrogator

    # Materialise the members first so the interrogator receives a plain list.
    members = list(self)
    return interrogator(members, *args, conc=True, **kwargs)
def interrogate(self, *args, **kwargs):
    """Interrogate every item in this collection.

    The items obtained by iterating over ``self`` are gathered into a list
    and handed to ``interrogator``. All arguments are forwarded untouched;
    see :func:`~corpkit.corpus.Corpus.interrogate` for what they mean.

    :returns: whatever ``interrogator`` returns for the collected items
    """
    from corpkit import interrogator

    # Materialise the members first so the interrogator receives a plain list.
    members = list(self)
    return interrogator(members, *args, **kwargs)
def interrogate(self, search, *args, **kwargs):
    """Interrogate a corpus of texts for a lexicogrammatical phenomenon

    >>> # show lemma form of nouns ending in 'ing'
    >>> q = {'w': r'ing$', 'p': r'^N'}
    >>> data = corpus.interrogate(q, show = 'l')

    :param search: What query should be matching
       - t/tregex
       - w/word
       - l/lemma
       - f/function
       - g/governor
       - d/dependent
       - p/pos
       - i/index
       - n/ngrams
       - s/general stats
    :type search: str, or, for dependencies, a dict like ``{'w': 'help', 'p': r'^V'}``

    :param searchmode: Return results matching any/all criteria
    :type searchmode: str ('any'/'all')

    :param exclude: The inverse of `search`, removing results from search
    :type exclude: dict -- ``{'l': 'be'}``

    :param excludemode: Exclude results matching any/all criteria
    :type excludemode: str ('any'/'all')

    :param query: A search query for the interrogation
    :type query: str -- regex/Tregex pattern; dict -- ``{name: pattern}``; list -- word list to match

    :param show: What to output. If multiple strings are passed, results will be colon-separated, in order
       - t/tree
       - w/word
       - l/lemma
       - g/governor
       - d/dependent
       - f/function
       - p/pos
       - i/index
       - a/distance from root
    :type show: list of strings

    :param lemmatise: Force lemmatisation on results
    :type lemmatise: bool

    :param lemmatag: Explicitly pass a pos to lemmatiser (generally when data is unparsed)
    :type lemmatag: False/'n'/'v'/'a'/'r'

    :param spelling: Convert all to U.S. or U.K. English
    :type spelling: False/'US'/'UK'

    :param dep_type: The kind of Stanford CoreNLP dependency parses you want to use
    :type dep_type: str -- 'basic-dependencies'/'a', 'collapsed-dependencies'/'b', 'collapsed-ccprocessed-dependencies'/'c'

    :param quicksave: Save result as pickle to ``saved_interrogations/str`` on completion
    :type quicksave: str

    :param gramsize: size of ngrams (default 2)
    :type gramsize: int

    :param split_contractions: make ``"don't"`` et al into two tokens
    :type split_contractions: bool

    :param multiprocess: how many parallel processes to run. When the
                         corpus has subcorpora, the interrogation is run
                         over ``self.subcorpora``; otherwise the setting is
                         forwarded as given with the corpus itself
    :type multiprocess: int / bool (to determine automatically)

    :param files_as_subcorpora: treat each file as a subcorpus
    :type files_as_subcorpora: bool

    :returns: A :class:`corpkit.interrogation.Interrogation` object, with ``.query``, ``.results``, ``.totals`` attributes. If multiprocessing is invoked, result may be a :class:`corpkit.interrogation.Interrodict` containing corpus names, queries or speakers as keys.
    """
    from corpkit import interrogator

    # Take the option out of kwargs so each branch below decides explicitly
    # whether (and how) it reaches the interrogator.
    par = kwargs.pop('multiprocess', None)

    if par and self.subcorpora:
        # Parallel request on a corpus with subcorpora: interrogate the
        # subcorpora rather than the corpus object itself.
        # NOTE(review): the exact-type check is kept from the original, so
        # only a genuine int is forwarded as ``multiprocess`` here and a
        # bare ``True`` is not -- confirm that is intended.
        if type(par) is int:
            kwargs['multiprocess'] = par
        return interrogator(self.subcorpora, search, *args, **kwargs)

    # No subcorpora to fan out over. A truthy ``multiprocess`` used to be
    # discarded at this point, silently ignoring the caller's request; hand
    # it back so the interrogator sees it. Falsy/absent values are still
    # left out, so the interrogator's own default applies as before.
    if par:
        kwargs['multiprocess'] = par
    return interrogator(self, search, *args, **kwargs)