def concordance(self, *args, **kwargs):
    """
    Produce concordance lines for this corpus.

    Works with Tregex queries, CoreNLP dependencies, tokenised data or
    plaintext.

    :Example:

    >>> wv = ['want', 'need', 'feel', 'desire']
    >>> corpus.concordance({L: wv, F: 'root'})
    0   01  1-01.txt.xml                But , so I  feel     like i do that for w
    1   01  1-01.txt.xml                         I  felt     a little like oh , i
    2   01  1-01.txt.xml   he 's a difficult man I  feel     like his work ethic
    3   01  1-01.txt.xml                      So I  felt     like i recognized li
    ...                                                                       ...

    Takes the same arguments as
    :func:`~corpkit.interrogation.Interrogation.interrogate`, plus:

    :param only_format_match: if True, left and right window will just be
                              words, regardless of what is in 'show'
    :type only_format_match: bool
    :param random: randomise lines
    :type random: bool
    :param only_unique: only unique lines
    :type only_unique: bool

    :returns: A :class:`corpkit.interrogation.Concordance` instance
    """
    from interrogator import interrogator

    # Caller-supplied concordancing switches are thrown away: this method
    # always asks interrogator for concordance-only output.
    for overridden in ('do_concordancing', 'conc'):
        kwargs.pop(overridden, None)

    forwarded = {'do_concordancing': 'only'}
    forwarded.update(kwargs)
    return interrogator(self, *args, **forwarded)
def concordance(self, *args, **kwargs):
    """
    Run a concordance-only interrogation of this corpus.

    Suitable for Tregex queries, CoreNLP dependencies, tokenised data or
    plaintext.

    :Example:

    >>> wv = ['want', 'need', 'feel', 'desire']
    >>> corpus.concordance({L: wv, F: 'root'})
    0   01  1-01.txt.xml                But , so I  feel     like i do that for w
    1   01  1-01.txt.xml                         I  felt     a little like oh , i
    2   01  1-01.txt.xml   he 's a difficult man I  feel     like his work ethic
    3   01  1-01.txt.xml                      So I  felt     like i recognized li
    ...                                                                       ...

    Arguments match
    :func:`~corpkit.interrogation.Interrogation.interrogate`, with these
    additions:

    :param only_format_match: if True, left and right window will just be
                              words, regardless of what is in 'show'
    :type only_format_match: bool
    :param random: randomise lines
    :type random: bool
    :param only_unique: only unique lines
    :type only_unique: bool

    :returns: A :class:`corpkit.interrogation.Concordance` instance
    """
    from interrogator import interrogator

    # 'do_concordancing' and 'conc' coming from the caller are ignored; the
    # mode is fixed to 'only' below.
    ignored = ('do_concordancing', 'conc')
    passthrough = {key: value for key, value in kwargs.items()
                   if key not in ignored}
    return interrogator(self, *args, do_concordancing='only', **passthrough)
def interroplot(path, query):
    """Interrogate ``path`` with a Tregex query and plot the outcome.

    The words matching ``query`` are counted with ``interrogator``, the
    counts are passed through ``editor`` with the ``'%'`` operation against
    the totals (relative frequencies), and the edited results are handed to
    ``plotter`` with ``str(path)`` as its first argument.

    The original note on this function says the top seven results are
    plotted; that limit is not set here, so it is presumably ``plotter``'s
    own default.

    :param path: corpus (or path to one) to interrogate
    :param query: Tregex query
    :type query: str
    :returns: None
    """
    # The docstring above must be the first statement: it previously sat
    # after an import, which left ``interroplot.__doc__`` empty.
    from corpkit import interrogator, editor, plotter

    quickstart = interrogator(path, 'words', query)
    edited = editor(quickstart.results, '%', quickstart.totals,
                    print_info=False)
    plotter(str(path), edited.results)
def concordance(self, *args, **kwargs):
    """
    Interrogate this corpus with concordancing switched on.

    Works with Tregex queries, CoreNLP dependencies, tokenised data or
    plaintext.

    >>> wv = ['want', 'need', 'feel', 'desire']
    >>> corpus.concordance({'l': wv, 'f': 'root'})

    Takes the same arguments as
    :func:`~corpkit.interrogation.Interrogation.interrogate`, plus:

    :param only_format_match: if True, left and right window will just be
                              words, regardless of what is in 'show'
    :type only_format_match: bool
    :param random: randomise lines
    :type random: bool
    :param only_unique: only unique lines
    :type only_unique: bool

    :returns: A :class:`corpkit.interrogation.Concordance` instance
    """
    from interrogator import interrogator

    # ``conc=True`` is always supplied by this method itself.
    concordanced = interrogator(self, *args, conc=True, **kwargs)
    return concordanced
def features(self):
    """
    Compute basic stats for the corpus.

    Runs the general-stats search (``'s'`` with the query ``'any'``) over
    this corpus.

    >>> corpus.features()

    :returns: the ``.results`` attribute of the finished interrogation
    """
    from interrogator import interrogator

    general_stats = interrogator(self, 's', 'any')
    return general_stats.results
def get_stats(self, *args):
    """
    Compute basic stats for the corpus and keep them on the instance as
    :py:attr:`~corpkit.corpus.Corpus.features`.

    Extra positional arguments are accepted but not used.

    >>> corpus.get_stats()

    :returns: None
    """
    from interrogator import interrogator

    # General-stats search ('s') over the corpus path, matching anything.
    general_stats = interrogator(self.path, 's', 'any')
    self.features = general_stats.results
    print('\nFeatures defined. See .features attribute ...')
def features(self):
    """
    Generate basic stats from the corpus, including number of sentences,
    clauses, process types, etc.

    :Example:

    >>> corpus.features
    SB  Characters  Tokens  Words  Closed class words  Open class words  Clauses
    01       26873    8513   7308                4809              3704     2212
    02       25844    7933   6920                4313              3620     2270
    03       18376    5683   4877                3067              2616     1640
    04       20066    6354   5366                3587              2767     1775

    :returns: the ``.results`` of a general-stats (``'s'``) interrogation
    """
    from interrogator import interrogator

    stats_interrogation = interrogator(self, 's', 'any')
    return stats_interrogation.results
def multiquery(corpus, query, sort_by = 'total', quicksave = False):
    """Count a list of named queries and combine the totals into one result.

    Each pattern is counted with ``interrogator(corpus, 'count', pattern)``;
    the per-query totals are renamed to the query's name, concatenated
    column-wise and passed through ``editor``.

    :param corpus: corpus handed unchanged to ``interrogator``
    :param query: iterable of ``[name, pattern]`` pairs, e.g.
                  ``[[u'NPs in corpus', r'NP'], [u'VPs in corpus', r'VP']]``
    :param sort_by: sort order forwarded to ``editor``
    :param quicksave: file name to save the result under (``.p`` is appended
                      when missing), or False to skip saving
    :type quicksave: str/False
    :returns: the object returned by ``editor`` for the combined totals
    """
    import os
    import pandas as pd
    from time import strftime, localtime
    from interrogator import interrogator
    from editor import editor

    if quicksave:
        # raw_input only exists on Python 2; input is the Python 3 spelling.
        try:
            ask = raw_input
        except NameError:
            ask = input
        savedir = 'saved_interrogations'
        if not quicksave.endswith('.p'):
            quicksave = quicksave + '.p'
        fullpath = os.path.join(savedir, quicksave)
        while os.path.isfile(fullpath):
            # The prompt used to reference an undefined name (NameError);
            # it reports the clashing file name.
            selection = ask("\nSave error: %s already exists in %s.\n\nPick a new name: " % (quicksave, savedir))
            if not selection.endswith('.p'):
                selection = selection + '.p'
            # Remember the new name so the save below actually uses it
            # instead of the name that was just rejected.
            quicksave = selection
            fullpath = os.path.join(savedir, quicksave)

    results = []
    for name, pattern in query:
        result = interrogator(corpus, 'count', pattern)
        result.totals.name = name  # rename count
        results.append(result.totals)
    results = pd.concat(results, axis = 1)

    results = editor(results, sort_by = sort_by, print_info = False, keep_stats = False)
    time = strftime("%H:%M:%S", localtime())
    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print('%s: Finished! %d unique results, %d total.' % (time, len(results.results.columns), results.totals.sum()))
    if quicksave:
        from other import save_result
        save_result(results, quicksave)
    return results
def pmultiquery(corpus,
                search,
                show = 'words',
                query = 'any',
                sort_by = 'total',
                quicksave = False,
                multiprocess = 'default',
                function_filter = False,
                just_speakers = False,
                root = False,
                note = False,
                print_info = True,
                **kwargs):
    """Parallel process multiple queries or corpora.

    This function is used by interrogator() if:

        a) path is a list of paths
        b) query is a dict of named queries
        c) just speakers == 'each', or a list of speakers with len(list) > 1

    This function needs joblib 0.8.4 or above in order to run properly.
    There's no reason to call it yourself.

    One keyword dict is built per job (per corpus, named query, named
    search, function filter or speaker) and each is passed to
    ``interrogator``, through joblib when ``multiprocess`` is truthy and no
    ``root`` is given, otherwise one after another.

    :param corpus: a corpus, or an iterable of corpora (one job each)
    :param search: mapping of search criteria; a mapping whose type is not
                   exactly ``dict`` is treated as several named searches
    :param show: forwarded to every job; also checked for ``'c'``
    :param query: a single query, or a dict of named queries (one job each)
    :param sort_by: sort order used when the merged result is edited
    :param quicksave: name to save under; must be a string, not True
    :param multiprocess: int to force the number of processes, False for
                         one process
    :param function_filter: mapping of named filters (one job each), or a
                            single value forwarded to the speaker jobs
    :param just_speakers: ``'each'``, or a list of speaker names
    :param root: forwarded to every job; when truthy, no terminal status
                 lines are drawn and jobs run sequentially
    :param note: forwarded to every job
    :param print_info: accepted for interface compatibility; not consulted
                       in this function
    :param kwargs: forwarded to every job; ``do_concordancing`` and
                   ``df1_always_df`` are also read here
    :returns: a ``Concordance`` when ``do_concordancing == 'only'``; an
              ``Interrodict`` when the jobs' results are not all Series;
              otherwise the edited, merged result. None when no speaker
              names are found.
    :raises ValueError: if ``quicksave`` is True
    """
    import os
    import pandas as pd
    from time import strftime, localtime
    import corpkit
    # NOTE(review): editor and Interrogation are not referenced below, but
    # the project imports are kept in case they have import-time effects
    # that later calls (e.g. ``out.edit``) rely on -- confirm before removing.
    from interrogator import interrogator
    from editor import editor
    from other import save
    from interrogation import Interrogation
    try:
        from joblib import Parallel, delayed
    except ImportError:
        # joblib is optional; without it only the sequential path works.
        pass
    import multiprocessing

    def best_num_parallel(num_cores, num_queries):
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores) if int(num_queries / n) <= num_cores])
                except ValueError:
                    # max() of an empty list: no even split fits
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores:
                        return int(square_root)
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False
    multiple_search = False
    mult_corp_are_subs = False
    denom = 1
    if hasattr(corpus, '__iter__'):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(corpus))
        denom = len(corpus)
        if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus):
            mult_corp_are_subs = True
    elif (type(query) == list or type(query) == dict) and not hasattr(search, '__iter__'):
        multiple_queries = True
        num_cores = best_num_parallel(num_cores, len(query))
        denom = len(query)
    # exact type checks are deliberate here: a mapping that is not a plain
    # dict is what selects the multiple-search mode
    elif hasattr(search, '__iter__') and type(search) != dict:
        multiple_search = True
        num_cores = best_num_parallel(num_cores, len(list(search.keys())))
        denom = len(list(search.keys()))
    elif hasattr(function_filter, '__iter__'):
        multiple_option = True
        num_cores = best_num_parallel(num_cores, len(list(function_filter.keys())))
        denom = len(list(function_filter.keys()))
    elif just_speakers:
        from build import get_speaker_names_from_xml_corpus
        multiple_speakers = True
        if just_speakers == 'each' or just_speakers == ['each']:
            just_speakers = get_speaker_names_from_xml_corpus(corpus.path)
        if len(just_speakers) == 0:
            print('No speaker name data found.')
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)

    if type(multiprocess) == int:
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure quicksaves are right type
    if quicksave is True:
        raise ValueError('quicksave must be string when using pmultiquery.')

    # the options that don't change
    d = {
         'function': 'interrogator',
         'root': root,
         'note': note,
         'denominator': denom}

    # add kwargs to query
    for k, v in list(kwargs.items()):
        d[k] = v

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        for index, p in enumerate(corpus):
            name = p.name
            a_dict = dict(d)
            a_dict['corpus'] = p
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name.replace('-parsed', '')
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_queries:
        for index, (name, q) in enumerate(query.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = q
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_option:
        for index, (name, q) in enumerate(function_filter.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['function_filter'] = q
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = [name]
            a_dict['function_filter'] = function_filter
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_search:
        # Iterate name/value pairs: the loop used to walk the keys only,
        # leaving ``name`` unbound (NameError) and passing the key, not the
        # search itself, to interrogator.
        for index, (name, val) in enumerate(search.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = val
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['function_filter'] = function_filter
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)

    # Banner wording. A missing or unrecognised do_concordancing falls back
    # to plain 'Interrogating' instead of raising on ``None.lower()``.
    conc_mode = kwargs.get('do_concordancing')
    if conc_mode is True:
        message = 'Interrogating and concordancing'
    elif hasattr(conc_mode, 'lower') and conc_mode.lower() == 'only':
        message = 'Concordancing'
    else:
        message = 'Interrogating'

    time = strftime("%H:%M:%S", localtime())

    # With multiple named queries ``search`` is by construction not
    # iterable, so the thing to describe is the query dict.
    described = query if multiple_queries else search
    sformat = ''
    for i, (k, v) in enumerate(list(described.items())):
        if type(v) == list:
            vformat = ', '.join(v[:5])
            if len(v) > 5:
                vformat += ' ...'
        else:
            vformat = v
        sformat += '%s: %s' %(k, vformat)
        if i < len(described.keys()) - 1:
            sformat += '\n '

    if multiple_corpora and not multiple_option:
        corplist = "\n ".join([i.name for i in corpus[:20]])
        if len(corpus) > 20:
            corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
        print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes):\n %s" \
           "\n Query: '%s'\n %s corpus ... \n" % (time, len(corpus), num_cores, corplist, sformat, message)))

    elif multiple_queries:
        print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
           "\n Queries: '%s'\n %s corpus ... \n" % (time, len(query), num_cores, corpus.name, sformat, message) ))

    elif multiple_search:
        print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
           "\n Queries: '%s'\n %s corpus ... \n" % (time, len(list(search.keys())), num_cores, corpus.name, str(list(search.values())), message)))

    elif multiple_option:
        print(("\n%s: Beginning %d parallel corpus interrogations (multiple options): %s" \
           "\n Query: '%s'\n %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message) ))

    elif multiple_speakers:
        print(("\n%s: Beginning %d parallel corpus interrogations: %s" \
           "\n Query: '%s'\n %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message) ))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    terminal = False
    if not root:
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))
            except Exception:
                # status lines are cosmetic; never let them stop the run
                pass

    if not root and multiprocess:
        try:
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
        except BaseException:
            print('Multiprocessing failed.')
            raise
    else:
        res = []
        for index, job in enumerate(ds):
            job['startnum'] = (100 / denom) * index
            res.append(interrogator(**job))
        try:
            res = sorted(res)
        except Exception:
            # results that cannot be ordered are kept in job order
            pass

    from interrogation import Concordance
    if kwargs.get('do_concordancing') == 'only':
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        print('\n\n%s: Finished! %d results.\n\n' % (thetime, len(concs.index)))
        return Concordance(concs)

    from collections import OrderedDict
    if not all(type(i.results) == pd.core.series.Series for i in res):
        out = OrderedDict()
        for interrog, _ in zip(res, ds):
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            out[interrog.query['outname']] = interrog

        if quicksave:
            fullpath = os.path.join('saved_interrogations', quicksave)
            while os.path.isdir(fullpath):
                # NOTE(review): on Python 2 the builtin input() evaluates
                # what is typed; raw_input would be the safe call there.
                selection = input("\nSave error: %s already exists in %s.\n\nType 'o' to overwrite, or enter a new name: " % (quicksave, 'saved_interrogations'))
                if selection == 'o' or selection == 'O':
                    import shutil
                    shutil.rmtree(fullpath)
                else:
                    fullpath = os.path.join('saved_interrogations', selection)

            for k, v in list(out.items()):
                save(v, k, savedir = fullpath, print_info = False)

            time = strftime("%H:%M:%S", localtime())
            print("\n%s: %d files saved to %s" % ( time, len(list(out.keys())), fullpath))

        time = strftime("%H:%M:%S", localtime())
        print("\n\n%s: Finished! Output is a dictionary with keys:\n\n '%s'\n" % (time, "'\n '".join(sorted(out.keys()))))
        from interrogation import Interrodict
        return Interrodict(out)
    # make query and total branch, save, return
    else:
        if multiple_corpora and not mult_corp_are_subs:
            sers = [i.results for i in res]
            out = pd.DataFrame(sers, index = [i.query['outname'] for i in res])
            out = out.reindex_axis(sorted(out.columns), axis=1) # sort cols
            out = out.fillna(0) # nan to zero
            out = out.astype(int) # float to int
            out = out.T
        else:
            out = pd.concat([r.results for r in res], axis = 1)
            # format like normal
            out = out[sorted(list(out.columns))]
            out = out.T
            out = out.fillna(0) # nan to zero
            out = out.astype(int)
            if 'c' in show and mult_corp_are_subs:
                out = out.sum()
                out.index = sorted(list(out.index))

        # sort by total
        if type(out) == pd.core.frame.DataFrame:
            out.ix['Total-tmp'] = out.sum()
            tot = out.ix['Total-tmp']
            out = out[tot.argsort()[::-1]]
            out = out.drop('Total-tmp', axis = 0)

        out = out.edit(sort_by = sort_by, print_info = False, keep_stats = False, \
                       df1_always_df = kwargs.get('df1_always_df'))

        if len(out.results.columns) == 1:
            out.results = out.results.sort_index()
        if kwargs.get('do_concordancing') is True:
            concs = pd.concat([x.concordance for x in res], ignore_index = True)
            concs = concs.sort_values(by='c')
            concs = concs.reset_index(drop=True)
            out.concordance = Concordance(concs)
        thetime = strftime("%H:%M:%S", localtime())
        summary = '\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n')
        if terminal:
            # write the summary on the bottom line, below the status lines
            with terminal.location(0, terminal.height):
                print(summary)
        else:
            print(summary)
        if quicksave:
            from other import save
            save(out, quicksave)
        print('\n')
        return out
def interrogate(self, search, *args, **kwargs):
    """
    Interrogate a corpus of texts for a lexicogrammatical phenomenon.

    When ``multiprocess`` is truthy and the corpus has subcorpora, the
    subcorpora (not the corpus itself) are handed to ``interrogator``; an
    integer ``multiprocess`` is forwarded, any other value is dropped.
    In every other case the corpus is interrogated directly and
    ``multiprocess`` is not forwarded.

    >>> # show lemma form of nouns ending in 'ing'
    >>> q = {'w': r'ing$', 'p': r'^N'}
    >>> data = corpus.interrogate(q, show = 'l')

    :param search: What query should be matching
       - t/tregex
       - w/word
       - l/lemma
       - f/function
       - g/governor
       - d/dependent
       - p/pos
       - i/index
       - n/ngrams
       - s/general stats
    :type search: str, or, for dependencies, a dict like
                  ``{'w': 'help', 'p': r'^V'}``

    :param searchmode: Return results matching any/all criteria
    :type searchmode: str ('any'/'all')

    :param exclude: The inverse of `search`, removing results from search
    :type exclude: dict -- ``{'l': 'be'}``

    :param excludemode: Exclude results matching any/all criteria
    :type excludemode: str ('any'/'all')

    :param query: A search query for the interrogation
    :type query: str -- regex/Tregex pattern; dict -- ``{name: pattern}``;
                 list -- word list to match

    :param show: What to output. If multiple strings are passed, results
                 will be colon-separated, in order
       - t/tree
       - w/word
       - l/lemma
       - g/governor
       - d/dependent
       - f/function
       - p/pos
       - i/index
       - a/distance from root
    :type show: list of strings

    :param lemmatise: Force lemmatisation on results
    :type lemmatise: bool

    :param lemmatag: Explicitly pass a pos to lemmatiser (generally when
                     data is unparsed)
    :type lemmatag: False/'n'/'v'/'a'/'r'

    :param spelling: Convert all to U.S. or U.K. English
    :type spelling: False/'US'/'UK'

    :param dep_type: The kind of Stanford CoreNLP dependency parses you
                     want to use
    :type dep_type: str -- 'basic-dependencies'/'a',
                    'collapsed-dependencies'/'b',
                    'collapsed-ccprocessed-dependencies'/'c'

    :param quicksave: Save result as pickle to
                      ``saved_interrogations/str`` on completion
    :type quicksave: str

    :param gramsize: size of ngrams (default 2)
    :type gramsize: int

    :param split_contractions: make ``"don't"`` et al into two tokens
    :type split_contractions: bool

    :param multiprocess: how many parallel processes to run
    :type multiprocess: int / bool (to determine automatically)

    :param files_as_subcorpora: treat each file as a subcorpus
    :type files_as_subcorpora: bool

    :returns: A :class:`corpkit.interrogation.Interrogation` object, with
              ``.query``, ``.results``, ``.totals`` attributes. If
              multiprocessing is invoked, result may be a
              :class:`corpkit.interrogation.Interrodict` containing corpus
              names, queries or speakers as keys.
    """
    from interrogator import interrogator

    requested_procs = kwargs.pop('multiprocess', None)

    # No parallel request, or nothing to split on: interrogate as a whole.
    if not (requested_procs and self.subcorpora):
        return interrogator(self, search, *args, **kwargs)

    # Only a genuine int (not a bool) is passed on as a process count.
    if type(requested_procs) is int:
        kwargs['multiprocess'] = requested_procs
    return interrogator(self.subcorpora, search, *args, **kwargs)
def interrogate(self, *args, **kwargs):
    """
    Interrogate the corpus using :func:`~corpkit.corpus.Corpus.interrogate`.

    All positional and keyword arguments are handed to ``interrogator``
    unchanged, with this object as the corpus.

    :returns: whatever ``interrogator`` returns
    """
    from interrogator import interrogator

    outcome = interrogator(self, *args, **kwargs)
    return outcome
def concordance(self, *args, **kwargs):
    """
    Concordance the corpus using :func:`~corpkit.corpus.Corpus.concordance`.

    Arguments are forwarded to ``interrogator`` with
    ``do_concordancing='only'``. Any ``do_concordancing`` or ``conc`` value
    supplied by the caller is discarded, as in the corpus-level method.

    :returns: whatever ``interrogator`` returns in concordance-only mode
    """
    from interrogator import interrogator

    # A caller-supplied 'do_concordancing' would collide with the keyword
    # passed below and raise "got multiple values for keyword argument";
    # drop it (and the older 'conc' spelling) first.
    kwargs.pop('do_concordancing', None)
    kwargs.pop('conc', None)
    return interrogator(self, do_concordancing='only', *args, **kwargs)
def concordance(self, *args, **kwargs):
    """
    Concordance the corpus using :func:`~corpkit.corpus.Corpus.concordance`.

    Everything is passed on to ``interrogator`` with the mode fixed to
    ``do_concordancing='only'``. A ``do_concordancing`` or ``conc`` keyword
    from the caller is ignored, matching the corpus-level method.

    :returns: whatever ``interrogator`` returns in concordance-only mode
    """
    from interrogator import interrogator

    # Without this, passing do_concordancing=... here raised TypeError
    # because the same keyword is also given explicitly in the call below.
    for overridden in ('do_concordancing', 'conc'):
        kwargs.pop(overridden, None)
    return interrogator(self, do_concordancing='only', *args, **kwargs)
def interrogate(self, search, *args, **kwargs):
    """
    Interrogate a corpus of texts for a lexicogrammatical phenomenon.

    If ``multiprocess`` is truthy and the corpus has subcorpora, the
    subcorpora are interrogated instead of the corpus object, and an
    integer ``multiprocess`` is passed along. Otherwise the corpus itself
    is interrogated and ``multiprocess`` is not forwarded.

    :Example:

    >>> corpus = Corpus('data/conversations-parsed')
    ### show lemma form of nouns ending in 'ing'
    >>> q = {W: r'ing$', P: r'^N'}
    >>> data = corpus.interrogate(q, show = L)
    >>> data.results
        ..  something  anything  thing  feeling  everything  nothing  morning
        01         14        11     12        1           6        0        1
        02         10        20      4        4           8        3        0
        03         14         5      5        3           1        0        0
        ...                                                               ...

    :param search: What query should be matching
       - t/tregex
       - w/word
       - l/lemma
       - f/function
       - g/governor
       - d/dependent
       - p/pos
       - i/index
       - n/ngrams
       - s/general stats
    :type search: str, or, for dependencies, a dict like
                  `{W: 'help', P: r'^V'}`

    :param searchmode: Return results matching any/all criteria
    :type searchmode: str -- `'any'`/`'all'`

    :param exclude: The inverse of `search`, removing results from search
    :type exclude: dict -- `{L: 'be'}`

    :param excludemode: Exclude results matching any/all criteria
    :type excludemode: str -- `'any'`/`'all'`

    :param query: A search query for the interrogation
    :type query:
       - str -- regex/Tregex pattern
       - dict -- `{name: pattern}`
       - list -- word list to match

    :param show: What to output. If multiple strings are passed, results
                 will be colon-separated, in order
       - t/tree
       - w/word
       - l/lemma
       - g/governor
       - d/dependent
       - f/function
       - p/pos
       - i/index
       - a/distance from root
    :type show: list of strings

    :param lemmatise: Force lemmatisation on results
    :type lemmatise: bool

    :param lemmatag: Explicitly pass a pos to lemmatiser (generally when
                     data is unparsed)
    :type lemmatag: False/'n'/'v'/'a'/'r'

    :param spelling: Convert all to U.S. or U.K. English
    :type spelling: False/'US'/'UK'

    :param dep_type: The kind of Stanford CoreNLP dependency parses you
                     want to use
    :type dep_type: str -- 'basic-dependencies'/'a',
                    'collapsed-dependencies'/'b',
                    'collapsed-ccprocessed-dependencies'/'c'

    :param save: Save result as pickle to `saved_interrogations/<save>`
                 on completion
    :type save: str

    :param gramsize: size of ngrams (default 2)
    :type gramsize: int

    :param split_contractions: make `"don't"` et al into two tokens
    :type split_contractions: bool

    :param multiprocess: how many parallel processes to run
    :type multiprocess: int / bool (to determine automatically)

    :param files_as_subcorpora: treat each file as a subcorpus
    :type files_as_subcorpora: bool

    :param do_concordancing: Concordance while interrogating, store as
                             `.concordance` attribute
    :type do_concordancing: bool/'only'

    :param maxconc: Maximum number of concordance lines
    :type maxconc: int

    :returns: A :class:`corpkit.interrogation.Interrogation` object, with
              `.query`, `.results`, `.totals` attributes. If
              multiprocessing is invoked, result may be a
              :class:`corpkit.interrogation.Interrodict` containing corpus
              names, queries or speakers as keys.
    """
    from interrogator import interrogator

    parallel = kwargs.pop('multiprocess', None)

    # Default: interrogate this corpus object as one unit.
    target = self
    if parallel and self.subcorpora:
        # Parallel run: hand over the subcorpora so each can be a job.
        target = self.subcorpora
        # bools are deliberately excluded; only a real int is a job count
        if type(parallel) is int:
            kwargs['multiprocess'] = parallel
    return interrogator(target, search, *args, **kwargs)
def interrogate(self, *args, **kwargs):
    """
    Interrogate the corpus using :func:`~corpkit.corpus.Corpus.interrogate`.

    The members obtained by iterating over this object are collected into
    a list, and that list is what ``interrogator`` receives as its corpus
    argument.

    :returns: whatever ``interrogator`` returns
    """
    from interrogator import interrogator

    members = []
    for member in self:
        members.append(member)
    return interrogator(members, *args, **kwargs)
def pmultiquery(corpus, search, show='words', query='any', sort_by='total', quicksave=False, multiprocess='default', just_speakers=False, root=False, note=False, print_info=True, **kwargs): """Parallel process multiple queries or corpora. This function is used by interrogator() for multiprocessing. There's no reason to call this function yourself.""" import collections import os import pandas as pd import collections from collections import namedtuple from time import strftime, localtime import corpkit from interrogator import interrogator from editor import editor from other import save from interrogation import Interrogation try: from joblib import Parallel, delayed except: pass #raise ValueError('joblib, the module used for multiprocessing, cannot be found. ' \ # 'Install with:\n\n pip install joblib') import multiprocessing locs = locals() for k, v in kwargs.items(): locs[k] = v def best_num_parallel(num_cores, num_queries): import corpkit """decide how many parallel processes to run the idea, more or less, is to balance the load when possible""" if num_queries <= num_cores: return num_queries if num_queries > num_cores: if (num_queries / num_cores) == num_cores: return int(num_cores) if num_queries % num_cores == 0: try: return max([ int(num_queries / n) for n in range(2, num_cores) if int(num_queries / n) <= num_cores ]) except ValueError: return num_cores else: import math if (float(math.sqrt(num_queries))).is_integer(): square_root = math.sqrt(num_queries) if square_root <= num_queries / num_cores: return int(square_root) return num_cores num_cores = multiprocessing.cpu_count() # what is our iterable? ... 
multiple_option = False multiple_queries = False multiple_speakers = False multiple_corpora = False multiple_search = False mult_corp_are_subs = False denom = 1 if hasattr(corpus, '__iter__'): multiple_corpora = True num_cores = best_num_parallel(num_cores, len(corpus)) denom = len(corpus) if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus): mult_corp_are_subs = True elif (type(query) == list or type(query) == dict) and not hasattr(search, '__iter__'): multiple_queries = True num_cores = best_num_parallel(num_cores, len(query)) denom = len(query) elif hasattr(search, '__iter__') and all( type(i) == dict for i in list(search.values())): multiple_search = True num_cores = best_num_parallel(num_cores, len(list(search.keys()))) denom = len(list(search.keys())) elif just_speakers: from build import get_speaker_names_from_xml_corpus multiple_speakers = True if just_speakers == 'each' or just_speakers == ['each']: just_speakers = get_speaker_names_from_xml_corpus(corpus.path) if len(just_speakers) == 0: print('No speaker name data found.') return num_cores = best_num_parallel(num_cores, len(just_speakers)) denom = len(just_speakers) # if this thing has already come through multiquery, don't multiprocess this time #if kwargs.get('outname'): # multiprocess = False if multiple_corpora and any(x is True for x in [ multiple_speakers, multiple_queries, multiple_search, multiple_option ]): from corpus import Corpus, Corpora if corpus.__class__ == Corpora: multiprocess = False else: corpus = Corpus(corpus) if type(multiprocess) == int: num_cores = multiprocess if multiprocess is False: num_cores = 1 # make sure quicksaves are right type if quicksave is True: raise ValueError('quicksave must be string when using pmultiquery.') # the options that don't change d = { #'paralleling': True, 'function': 'interrogator', 'root': root, 'note': note, 'denominator': denom } # add kwargs to query for k, v in list(kwargs.items()): d[k] = v # make a list of dicts to pass to 
interrogator, # with the iterable unique in every one ds = [] if multiple_corpora: for index, p in enumerate(corpus): name = p.name a_dict = dict(d) a_dict['corpus'] = p a_dict['search'] = search a_dict['query'] = query a_dict['show'] = show a_dict['outname'] = name.replace('-parsed', '') a_dict['just_speakers'] = just_speakers a_dict['paralleling'] = index a_dict['printstatus'] = False ds.append(a_dict) elif multiple_queries: for index, (name, q) in enumerate(query.items()): a_dict = dict(d) a_dict['corpus'] = corpus a_dict['search'] = search a_dict['query'] = q a_dict['show'] = show a_dict['outname'] = name a_dict['just_speakers'] = just_speakers a_dict['paralleling'] = index a_dict['printstatus'] = False ds.append(a_dict) elif multiple_speakers: for index, name in enumerate(just_speakers): a_dict = dict(d) a_dict['corpus'] = corpus a_dict['search'] = search a_dict['query'] = query a_dict['show'] = show a_dict['outname'] = name a_dict['just_speakers'] = [name] a_dict['paralleling'] = index a_dict['printstatus'] = False ds.append(a_dict) elif multiple_search: for index, (name, val) in enumerate(search.items()): a_dict = dict(d) a_dict['corpus'] = corpus a_dict['search'] = val a_dict['query'] = query a_dict['show'] = show a_dict['outname'] = name a_dict['just_speakers'] = just_speakers a_dict['paralleling'] = index a_dict['printstatus'] = False ds.append(a_dict) if kwargs.get('do_concordancing') is False: message = 'Interrogating' elif kwargs.get('do_concordancing') is True: message = 'Interrogating and concordancing' elif kwargs.get('do_concordancing').lower() == 'only': message = 'Concordancing' time = strftime("%H:%M:%S", localtime()) sformat = '' if multiple_queries: to_it_over = query else: to_it_over = search for i, (k, v) in enumerate(list(to_it_over.items())): if type(v) == list: vformat = ', '.join(v[:5]) if len(v) > 5: vformat += ' ...' 
elif type(v) == dict: vformat = '' for kk, vv in v.items(): if type(vv) == list: vv = ', '.join(vv[:5]) vformat += '\n %s: %s' % (kk, vv) if len(vv) > 5: vformat += ' ...' else: vformat = v sformat += '%s: %s' % (k, vformat) if i < len(to_it_over.keys()) - 1: sformat += '\n ' if print_info: if multiple_corpora and not multiple_option: corplist = "\n ".join([i.name for i in corpus[:20]]) if len(corpus) > 20: corplist += '\n ... and %d more ...\n' % (len(corpus) - 20) print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes):\n %s" \ "\n Query: %s\n %s corpus ... \n" % (time, len(corpus), num_cores, corplist, sformat, message))) elif multiple_queries: print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \ "\n Queries: %s\n %s corpus ... \n" % (time, len(query), num_cores, corpus.name, sformat, message) )) elif multiple_search: print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \ "\n Queries: %s\n %s corpus ... \n" % (time, len(list(search.keys())), num_cores, corpus.name, sformat, message))) elif multiple_option: print(("\n%s: Beginning %d parallel corpus interrogations (multiple options): %s" \ "\n Query: %s\n %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message) )) elif multiple_speakers: print(("\n%s: Beginning %d parallel corpus interrogations: %s" \ "\n Query: %s\n %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message) )) # run in parallel, get either a list of tuples (non-c option) # or a dataframe (c option) #import sys #reload(sys) #stdout=sys.stdout failed = False terminal = False used_joblib = False #ds = ds[::-1] if not root and print_info: from blessings import Terminal terminal = Terminal() print('\n' * (len(ds) - 2)) for dobj in ds: linenum = dobj['paralleling'] # this try handles nosetest problems in sublime text try: with terminal.location(0, terminal.height - (linenum + 1)): # this is a really bad idea. 
thetime = strftime("%H:%M:%S", localtime()) num_spaces = 26 - len(dobj['outname']) print('%s: QUEUED: %s' % (thetime, dobj['outname'])) except: pass if not root and multiprocess: #res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds) try: #ds = sorted(ds, key=lambda k: k['paralleling'], reverse = True) res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds) used_joblib = True except: failed = True print('Multiprocessing failed.') raise if not res: failed = True else: res = [] for index, d in enumerate(ds): d['startnum'] = (100 / denom) * index res.append(interrogator(**d)) try: res = sorted(res) except: pass # multiprocessing way #from multiprocessing import Process #from interrogator import interrogator #jobs = [] ##for d in ds: ## p = multiprocessing.Process(target=interrogator, kwargs=(**d,)) ## jobs.append(p) ## p.start() ## while p.is_alive(): ## import time ## time.sleep(2) ## if root: ## root.update() #result_queue = multiprocessing.Queue() # #for d in ds: #funs = [interrogator(result_queue, **kwargs) for kwargs in ds] #jobs = [multiprocessing.Process(mc) for mc in funs] #for job in jobs: job.start() #for job in jobs: job.join() #results = [result_queue.get() for mc in funs] import corpkit from interrogation import Concordance if kwargs.get('do_concordancing') == 'only': concs = pd.concat([x for x in res]) thetime = strftime("%H:%M:%S", localtime()) if print_info: print('\n\n%s: Finished! 
%d results.\n\n' % (thetime, len(concs.index))) return Concordance(concs) from collections import OrderedDict if not all(type(i.results) == pd.core.series.Series for i in res): out = OrderedDict() for interrog, d in zip(res, ds): for unpicklable in ['note', 'root']: interrog.query.pop(unpicklable, None) try: out[interrog.query['outname']] = interrog except KeyError: out[d['outname']] = interrog if quicksave: fullpath = os.path.join('saved_interrogations', quicksave) while os.path.isdir(fullpath): selection = input( "\nSave error: %s already exists in %s.\n\nType 'o' to overwrite, or enter a new name: " % (quicksave, 'saved_interrogations')) if selection == 'o' or selection == 'O': import shutil shutil.rmtree(fullpath) else: import os fullpath = os.path.join('saved_interrogations', selection) for k, v in list(out.items()): save(v, k, savedir=fullpath, print_info=False) time = strftime("%H:%M:%S", localtime()) print("\n%s: %d files saved to %s" % (time, len(list(out.keys())), fullpath)) time = strftime("%H:%M:%S", localtime()) if print_info: print( "\n\n%s: Finished! 
Output is a dictionary with keys:\n\n '%s'\n" % (time, "'\n '".join(sorted(out.keys())))) from interrogation import Interrodict idict = Interrodict(out) # remove unpicklable bits from query from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType locs = {k: v for k, v in locs.items() if not isinstance(v, ModuleType) \ and not isinstance(v, FunctionType) \ and not isinstance(v, BuiltinFunctionType) \ and not isinstance(v, BuiltinMethodType)} idict.query = locs return idict # make query and total branch, save, return else: #print sers #print ds if multiple_corpora and not mult_corp_are_subs: sers = [i.results for i in res] out = pd.DataFrame(sers, index=[i.query['outname'] for i in res]) out = out.reindex_axis(sorted(out.columns), axis=1) # sort cols out = out.fillna(0) # nan to zero out = out.astype(int) # float to int out = out.T else: try: out = pd.concat([r.results for r in res], axis=1) except ValueError: return None # format like normal out = out[sorted(list(out.columns))] out = out.T out = out.fillna(0) # nan to zero out = out.astype(int) if 'c' in show and mult_corp_are_subs: out = out.sum() out.index = sorted(list(out.index)) # sort by total if type(out) == pd.core.frame.DataFrame: out.ix['Total-tmp'] = out.sum() tot = out.ix['Total-tmp'] out = out[tot.argsort()[::-1]] out = out.drop('Total-tmp', axis=0) out = out.edit(sort_by = sort_by, print_info = False, keep_stats = False, \ df1_always_df = kwargs.get('df1_always_df')) if len(out.results.columns) == 1: out.results = out.results.sort_index() if kwargs.get('do_concordancing') is True: concs = pd.concat([x.concordance for x in res], ignore_index=True) concs = concs.sort_values(by='c') concs = concs.reset_index(drop=True) out.concordance = Concordance(concs) thetime = strftime("%H:%M:%S", localtime()) if terminal and print_info: with terminal.location(0, terminal.height): print('\n\n%s: Finished! 
%d unique results, %d total.%s' % (thetime, len( out.results.columns), out.totals.sum(), '\n')) else: if print_info: print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len( out.results.columns), out.totals.sum(), '\n')) #if used_joblib: if quicksave: from other import save save(out, quicksave) return out
def configurations(corpus, search, **kwargs):
    """Get behaviour of a word---see corpkit.corpus.Corpus.configurations() for docs

    Builds a set of named dependency queries around one word or lemma, runs
    them through ``interrogator`` and removes the searched item itself from
    every result.

    :param corpus: passed unchanged to ``interrogator`` as its first argument
    :param search: must contain either ``'w'`` (word) or ``'l'`` (lemma), but
                   not both; an optional ``'f'`` beginning with ``part``,
                   ``proc`` or ``mod`` (case-insensitive) restricts the
                   queries to that role
    :type search: dict
    :param kwargs: forwarded to ``interrogator``; any ``search`` entry is
                   replaced by the generated queries
    :returns: the mapping returned by ``interrogator``; each value has had the
              searched item dropped from ``results`` and ``totals`` recomputed
    :raises ValueError: if *search* has both, or neither, of ``'w'``/``'l'``
    """
    word = search.get('w')
    lemma = search.get('l')

    # validate before importing anything, so bad input fails fast and clearly
    if lemma and word:
        raise ValueError('Search only for a word or a lemma, not both.')
    if not lemma and not word:
        raise ValueError("Search must contain a word ('w') or a lemma ('l').")

    from dictionaries.wordlists import wordlists
    from dictionaries.roles import roles
    from interrogator import interrogator

    if lemma:
        dep_word_or_lemma = 'dl'
        gov_word_or_lemma = 'gl'
        word_or_token = lemma
    else:
        dep_word_or_lemma = 'd'
        gov_word_or_lemma = 'g'
        word_or_token = word

    queries = {
        'participant': {
            'left_participant_in': {dep_word_or_lemma: word_or_token,
                                    'df': r'^.subj.*',
                                    'f': roles.event},
            'right_participant_in': {dep_word_or_lemma: word_or_token,
                                     'df': r'^[di]obj',
                                     'f': roles.event},
            'modified_by': {'f': r'^amod',
                            gov_word_or_lemma: word_or_token},
            'and_or': {'f': 'conj:(and|or)',
                       'gf': roles.participant,
                       gov_word_or_lemma: word_or_token},
        },
        'process': {
            'has_subject': {'f': roles.participant1,
                            gov_word_or_lemma: word_or_token},
            'has_object': {'f': roles.participant2,
                           gov_word_or_lemma: word_or_token},
            'modalised_by': {'f': r'aux',
                             'w': wordlists.modals,
                             gov_word_or_lemma: word_or_token},
            'modulated_by': {'f': 'advmod',
                             'gf': roles.event,
                             gov_word_or_lemma: word_or_token},
            'and_or': {'f': 'conj:(and|or)',
                       'gf': roles.event,
                       gov_word_or_lemma: word_or_token},
        },
        'modifier': {
            'modifies': {'df': roles.modifier,
                         dep_word_or_lemma: word_or_token},
            'modulated_by': {'f': 'advmod',
                             'gf': roles.modifier,
                             gov_word_or_lemma: word_or_token},
            'and_or': {'f': 'conj:(and|or)',
                       'gf': roles.modifier,
                       gov_word_or_lemma: word_or_token},
        },
    }

    role = search.get('f')
    if role:
        role = role.lower()
        if role.startswith('part'):
            queries = queries['participant']
        elif role.startswith('proc'):
            queries = queries['process']
        elif role.startswith('mod'):
            queries = queries['modifier']
        # NOTE(review): an unrecognised role leaves the queries grouped by
        # role, exactly as before -- confirm interrogator copes with that.
    else:
        # no role requested: flatten every role's queries into one mapping;
        # names shared between roles overwrite one another
        flattened = {}
        for role_queries in queries.values():
            flattened.update(role_queries)
        # 'and_or' exists under every role, so use a role-neutral version.
        # This must be a dict: a trailing comma here used to make it a tuple.
        flattened['and_or'] = {'f': 'conj:(and|or)',
                               gov_word_or_lemma: word_or_token}
        queries = flattened

    kwargs['search'] = queries
    data = interrogator(corpus, **kwargs)

    # the searched item itself is not informative, so remove its column
    for interrogation in data.values():
        interrogation.results = interrogation.results.drop(
            word_or_token, axis=1, errors='ignore')
        interrogation.totals = interrogation.results.sum(axis=1)
    return data
def concordance(self, *args, **kwargs):
    """interrogate the corpus using :func:`~corpkit.corpus.Corpus.concordance`

    Every item obtained by iterating over ``self`` is collected into a list
    and handed to ``interrogator`` in a single call, with concordancing
    switched on.

    :param args: positional arguments forwarded to ``interrogator``
    :param kwargs: keyword arguments forwarded to ``interrogator``; a
                   caller-supplied ``conc`` is discarded
    :returns: whatever ``interrogator`` returns
    """
    from interrogator import interrogator
    # conc is always forced on below; drop any caller-supplied value so the
    # call cannot fail with "got multiple values for keyword argument 'conc'"
    kwargs.pop('conc', None)
    return interrogator([s for s in self], conc=True, *args, **kwargs)
def pmultiquery(
    path,
    option="c",
    query="any",
    sort_by="total",
    quicksave=False,
    num_proc="default",
    function_filter=False,
    just_speakers=False,
    root=False,
    note=False,
    print_info=True,
    **kwargs
):
    """Parallel process multiple queries or corpora.

    This function is used by interrogator if:

        a) path is a list of paths
        b) query is a dict of named queries
        c) function_filter is iterable
        d) just_speakers is set ('each' means every speaker found in path)

    One job (a dict of keyword arguments for ``interrogator``) is built per
    corpus, query, option or speaker. Unless *root* is truthy the jobs are
    run through joblib; otherwise they are run one after another.

    This function needs joblib 0.8.4 or above in order to run properly.

    :param path: a corpus path, or a collection of them
    :param option: interrogator option; results are concatenated and edited
                   when it starts with ``c``, otherwise a dict is returned
    :param query: a single query, or a dict of named queries
    :param sort_by: passed to ``editor`` on the ``c`` path
    :param quicksave: name to save the result under; must not be ``True``
    :param num_proc: number of processes, or ``"default"`` to choose one
    :param function_filter: a dict of named filters, or falsy
    :param just_speakers: speaker names, ``"each"``, or falsy
    :param root: when truthy, run serially and skip the terminal status lines
    :param note: stored in every job under ``"note"``
    :param print_info: accepted but not consulted by this function
    :param kwargs: extra entries copied into every job
    :returns: a dict of named ``interrogation`` tuples, the ``editor`` result
              (``c`` options), or ``None`` if no speaker names were found
    :raises ValueError: if *quicksave* is ``True``
    """
    # make sure quicksaves are right type; checked first so that a bad call
    # fails before anything is imported or started
    if quicksave is True:
        raise ValueError("quicksave must be string when using pmultiquery.")

    import collections
    import multiprocessing
    import os
    import pandas as pd
    from time import strftime, localtime
    from interrogator import interrogator
    from editor import editor
    from other import save_result

    try:
        from joblib import Parallel, delayed
    except ImportError:
        # joblib is only used on the parallel (non-root) path; if it is
        # missing, that path fails with a NameError and is reported there
        pass

    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = (str, bytes)

    def is_collection(obj):
        """True for iterables other than strings (which Python 3 can iterate)"""
        return hasattr(obj, "__iter__") and not isinstance(obj, string_types)

    def best_num_parallel(num_cores, num_queries):
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        import math
        if num_queries <= num_cores:
            return num_queries
        # floor division throughout: the counts are ints and the original
        # relied on Python 2 integer division
        if num_queries // num_cores == num_cores:
            return int(num_cores)
        if num_queries % num_cores == 0:
            candidates = [num_queries // n for n in range(2, num_cores)
                          if num_queries // n <= num_cores]
            # candidates can be empty (e.g. 2 cores and 6 queries); max()
            # would raise ValueError, so fall through to num_cores instead
            if candidates:
                return max(candidates)
        else:
            square_root = math.sqrt(num_queries)
            if square_root.is_integer() and square_root <= num_queries // num_cores:
                return int(square_root)
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # are we processing multiple queries or corpora?
    # find out optimal number of cores to use.
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False

    denom = 1
    if is_collection(path):
        multiple_corpora = True
        denom = len(path)
    elif is_collection(query):
        multiple_queries = True
        denom = len(query)
    elif is_collection(function_filter):
        multiple_option = True
        denom = len(function_filter)
    elif just_speakers:
        from corpkit.build import get_speaker_names_from_xml_corpus
        multiple_speakers = True
        if just_speakers == "each":
            just_speakers = get_speaker_names_from_xml_corpus(path)
        if len(just_speakers) == 0:
            print("No speaker name data found.")
            return
        denom = len(just_speakers)

    if multiple_corpora or multiple_queries or multiple_option or multiple_speakers:
        num_cores = best_num_parallel(num_cores, denom)

    if num_proc != "default":
        num_cores = num_proc

    # the options that don't change
    d = {
        "option": option,
        "function": "interrogator",
        "root": root,
        "note": note,
        "denominator": denom,
    }
    # add kwargs to query
    d.update(kwargs)

    def make_job(index, outname, **overrides):
        """copy the shared options and fill in the per-job entries"""
        job = dict(d)
        job.update(
            path=path,
            query=query,
            outname=outname,
            just_speakers=just_speakers,
            paralleling=index,
            printstatus=False,
        )
        job.update(overrides)
        return job

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        path = sorted(path)
        for index, p in enumerate(path):
            ds.append(make_job(index, os.path.basename(p), path=p))
    elif multiple_queries:
        for index, (name, q) in enumerate(query.items()):
            ds.append(make_job(index, name, query=q))
    elif multiple_option:
        for index, (name, q) in enumerate(function_filter.items()):
            ds.append(make_job(index, name, function_filter=q))
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            ds.append(make_job(index, name, just_speakers=[name],
                               function_filter=function_filter))

    time = strftime("%H:%M:%S", localtime())
    if multiple_corpora and not multiple_option:
        print(
            "\n%s: Beginning %d parallel corpus interrogations:\n %s"
            "\n\n Query: '%s'"
            "\n Interrogating corpus ... \n" % (time, num_cores, "\n ".join(path), query)
        )
    elif multiple_queries:
        print(
            "\n%s: Beginning %d parallel corpus interrogations: %s"
            "\n Queries: '%s'"
            "\n Interrogating corpus ... \n"
            % (time, num_cores, os.path.basename(path), "', '".join(query.values()))
        )
    elif multiple_option:
        print(
            "\n%s: Beginning %d parallel corpus interrogations (multiple options): %s"
            "\n\n Query: '%s'"
            "\n Interrogating corpus ... \n" % (time, num_cores, os.path.basename(path), query)
        )
    elif multiple_speakers:
        print(
            "\n%s: Beginning %d parallel corpus interrogations: %s"
            "\n\n Query: '%s'"
            "\n Interrogating corpus ... \n" % (time, num_cores, os.path.basename(path), query)
        )

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    if not root:
        from blessings import Terminal
        terminal = Terminal()
        # make room for one status line per job at the bottom of the screen
        print("\n" * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj["paralleling"]
            with terminal.location(0, terminal.height - (linenum + 1)):
                # this is a really bad idea.
                thetime = strftime("%H:%M:%S", localtime())
                print("%s: [ 0%% (%s) ]" % (thetime, dobj["outname"]))

        try:
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            print("\n\n\n")
        except:
            # report, then let the original error propagate; there is no
            # serial fallback on this path
            print("Multiprocessing failed.")
            raise
    else:
        res = []
        for index, job in enumerate(ds):
            job["startnum"] = (100 // denom) * index
            res.append(interrogator(**job))

    try:
        res = sorted(res)
    except Exception:
        # results that cannot be ordered are kept in job order
        pass

    # turn list into dict of results, make query and total branches,
    # save and return
    if not option.startswith("c"):
        with_totals = not option.startswith("k")
        if with_totals:
            outputnames = collections.namedtuple("interrogation", ["query", "results", "totals"])
        else:
            outputnames = collections.namedtuple("interrogation", ["query", "results"])

        # res may have been re-ordered by the sort above while ds has not, so
        # pair each result with the job whose outname matches its name
        # (assumes interrogator returns the outname as the first element --
        # TODO confirm); otherwise fall back to pairing by position
        jobs_by_outname = dict((job["outname"], job) for job in ds)

        out = {}
        for (name, data), positional_job in zip(res, ds):
            job = jobs_by_outname.get(name, positional_job)
            for unpicklable in ["note", "root"]:
                job.pop(unpicklable, None)
            if with_totals:
                try:
                    stotal = data.sum(axis=1)
                    stotal.name = u"Total"
                except ValueError:
                    stotal = data.sum()
                output = outputnames(job, data, stotal)
            else:
                output = outputnames(job, data)
            out[name] = output

        # could be wrong for unstructured corpora?
        if quicksave:
            try:
                read_line = raw_input  # Python 2
            except NameError:
                read_line = input
            fullpath = os.path.join("saved_interrogations", quicksave)
            while os.path.isdir(fullpath):
                selection = read_line(
                    "\nSave error: %s already exists in %s.\n\nType 'o' to overwrite, or enter a new name: "
                    % (quicksave, "saved_interrogations")
                )
                if selection in ("o", "O"):
                    import shutil
                    shutil.rmtree(fullpath)
                else:
                    fullpath = os.path.join("saved_interrogations", selection)

            for k, v in out.items():
                save_result(v, k, savedir=fullpath, print_info=False)

            time = strftime("%H:%M:%S", localtime())
            print("\n%s: %d files saved to %s" % (time, len(out.keys()), fullpath))

        time = strftime("%H:%M:%S", localtime())
        print(
            "\n\n%s: Finished! Output is a dictionary with keys:\n\n '%s'\n"
            % (time, "'\n '".join(sorted(out.keys())))
        )
        return out

    # make query and total branch, save, return
    out = pd.concat(res, axis=1)
    out = editor(out, sort_by=sort_by, print_info=False, keep_stats=False)
    time = strftime("%H:%M:%S", localtime())
    print(
        "\n\n%s: Finished! %d unique results, %d total."
        % (time, len(out.results.columns), out.totals.sum())
    )
    if quicksave:
        save_result(out, quicksave)
    return out