def features(self):
    """
    Generate and show basic stats from the corpus, including number of
    sentences, clauses, process types, etc.

    The result is cached on disk: if
    ``saved_interrogations/<name>-features.p`` exists, it is loaded
    instead of re-interrogating the corpus; otherwise the corpus is
    interrogated and the result saved for next time.

    :Example:

    >>> corpus.features
        SB  Characters  Tokens  Words  Closed class words  Open class words  Clauses
        01       26873    8513   7308                4809              3704     2212
        02       25844    7933   6920                4313              3620     2270
        ...

    :returns: feature counts (a results DataFrame)
    """
    from os.path import isfile, isdir, join
    from corpkit.interrogator import interrogator
    from corpkit.other import load

    savedir = 'saved_interrogations'
    if isfile(join(savedir, self.name + '-features.p')):
        # load the cached interrogation once; older saves may be a bare
        # DataFrame with no .results attribute, so fall back to the object
        loaded = load(self.name + '-features')
        return getattr(loaded, 'results', loaded)
    # no cache: run the sentence-level stats interrogation and save it
    feat = interrogator(self, 's', 'any').results
    if isdir(savedir):
        feat.save(self.name + '-features')
    return feat
def multiquery(corpus, query, sort_by = 'total', quicksave = False): """Creates a named tuple for a list of named queries to count. Pass in something like: [[u'NPs in corpus', r'NP'], [u'VPs in corpus', r'VP']]""" import collections import os import pandas import pandas as pd from time import strftime, localtime from corpkit.interrogator import interrogator from corpkit.editor import editor if quicksave: savedir = 'data/saved_interrogations' if not quicksave.endswith('.p'): quicksave = quicksave + '.p' fullpath = os.path.join(savedir, quicksave) while os.path.isfile(fullpath): selection = raw_input("\nSave error: %s already exists in %s.\n\nPick a new name: " % (savename, savedir)) if not selection.endswith('.p'): selection = selection + '.p' fullpath = os.path.join(savedir, selection) results = [] for name, pattern in query: result = interrogator(corpus, 'count', pattern) result.totals.name = name # rename count results.append(result.totals) results = pd.concat(results, axis = 1) results = editor(results, sort_by = sort_by, print_info = False, keep_stats = False) time = strftime("%H:%M:%S", localtime()) print '%s: Finished! %d unique results, %d total.' % (time, len(results.results.columns), results.totals.sum()) if quicksave: from corpkit.other import save_result save_result(results, quicksave) return results
def wordclasses(self):
    """
    Generate and show word-class frequencies for the corpus (verbs,
    nouns, prepositions, determiners, etc.).

    Cached on disk: a saved ``-wordclasses`` interrogation is loaded if
    present; failing that, a saved ``-postags`` interrogation is loaded
    and its tags merged into word classes; otherwise the corpus is
    interrogated and the result saved.

    :Example:

    >>> corpus.wordclasses
        SB   Verb  Noun  Preposition  Determiner  ...
        01  26873  8513         7308        5508  ...
        02  25844  7933         6920        3323  ...
        ...

    :returns: word-class counts (a results DataFrame)
    """
    from os.path import isfile, isdir, join
    from corpkit.interrogator import interrogator
    from corpkit.other import load
    from corpkit.dictionaries import mergetags

    savedir = 'saved_interrogations'
    if isfile(join(savedir, self.name + '-wordclasses.p')):
        # load the cached result once; older saves may lack .results
        loaded = load(self.name + '-wordclasses')
        return getattr(loaded, 'results', loaded)
    elif isfile(join(savedir, self.name + '-postags.p')):
        # derive word classes from a cached POS-tag interrogation
        posdata = load(self.name + '-postags')
        posdata = getattr(posdata, 'results', posdata)
        return posdata.edit(
            merge_entries=mergetags, sort_by='total').results
    # no cache at all: interrogate trees for word classes and save
    feat = interrogator(self, 't', 'any', show='pl').results
    if isdir(savedir):
        feat.save(self.name + '-wordclasses')
    return feat
def postags(self):
    """
    Generate and show part-of-speech tag frequencies for the corpus.

    Cached on disk: a saved ``-postags`` interrogation is loaded if
    present; otherwise the corpus is interrogated, and both the POS-tag
    result and a derived word-class result are saved for reuse.

    :Example:

    >>> corpus.postags
        SB     NN    VB    JJ    IN    DT
        01  26873  8513  7308  4809  3704  ...
        02  25844  7933  6920  4313  3620  ...
        ...

    :returns: POS-tag counts (a results DataFrame)
    """
    from os.path import isfile, isdir, join
    from corpkit.interrogator import interrogator
    from corpkit.other import load
    from corpkit.dictionaries import mergetags

    savedir = 'saved_interrogations'
    if isfile(join(savedir, self.name + '-postags.p')):
        # load the cached result once; older saves may lack .results
        loaded = load(self.name + '-postags')
        return getattr(loaded, 'results', loaded)
    # no cache: interrogate trees for POS tags
    feat = interrogator(self, 't', 'any', show='p').results
    if isdir(savedir):
        feat.save(self.name + '-postags')
        # also precompute and save the merged word-class view so
        # corpus.wordclasses can reuse it
        wordclss = feat.edit(
            merge_entries=mergetags, sort_by='total').results
        wordclss.save(self.name + '-wordclasses')
    return feat
def pmultiquery(corpus,
    search,
    show = 'words',
    query = 'any',
    sort_by = 'total',
    quicksave = False,
    multiprocess = 'default',
    function_filter = False,
    just_speakers = False,
    root = False,
    note = False,
    print_info = True,
    **kwargs):
    """Parallel process multiple queries or corpora.

    This function is used by interrogator() if:

        a) path is a list of paths
        b) query is a dict of named queries
        c) just speakers == 'each', or a list of speakers with len(list) > 1

    This function needs joblib 0.8.4 or above in order to run properly.
    There's no reason to call it yourself.

    :param corpus: a corpus, or an iterable of corpora/subcorpora
    :param search: what to search for (dict/str, as for interrogator())
    :param quicksave: name to save results under, or False
    :param multiprocess: number of processes, 'default', or False
    :param function_filter: dict of named function filters, or False
    :param just_speakers: speaker name(s) to restrict to, or 'each'
    :param root: GUI root object (disables parallelism/terminal tricks)
    :returns: an Interrodict of results, or a single edited result
    """
    import collections
    import os
    import pandas as pd
    import collections
    from collections import namedtuple
    from time import strftime, localtime
    import corpkit
    from corpkit.interrogator import interrogator
    from corpkit.editor import editor
    from corpkit.other import save
    from corpkit.interrogation import Interrogation
    # joblib is optional; without it, the non-parallel path below is used
    try:
        from joblib import Parallel, delayed
    except:
        pass
    #raise ValueError('joblib, the module used for multiprocessing, cannot be found. ' \
    #                 'Install with:\n\n        pip install joblib')
    import multiprocessing

    def best_num_parallel(num_cores, num_queries):
        import corpkit
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    # pick the largest even divisor that still fits on the cores
                    return max([int(num_queries / n) for n in range(2, num_cores) if int(num_queries / n) <= num_cores])
                except ValueError:
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores:
                        return int(square_root)
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    # exactly one of these 'multiple_*' modes is selected below
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False
    multiple_search = False
    mult_corp_are_subs = False
    denom = 1

    if hasattr(corpus, '__iter__'):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(corpus))
        denom = len(corpus)
        if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus):
            mult_corp_are_subs = True
    elif hasattr(query, '__iter__'):
        multiple_queries = True
        num_cores = best_num_parallel(num_cores, len(query))
        denom = len(query)
    elif hasattr(search, '__iter__') and type(search) != dict:
        multiple_search = True
        num_cores = best_num_parallel(num_cores, len(search.keys()))
        denom = len(search.keys())
    elif hasattr(function_filter, '__iter__'):
        multiple_option = True
        num_cores = best_num_parallel(num_cores, len(function_filter.keys()))
        denom = len(function_filter.keys())
    elif just_speakers:
        from corpkit.build import get_speaker_names_from_xml_corpus
        multiple_speakers = True
        # 'each' expands to every speaker named in the XML corpus
        if just_speakers == 'each' or just_speakers == ['each']:
            just_speakers = get_speaker_names_from_xml_corpus(corpus.path)
        if len(just_speakers) == 0:
            print 'No speaker name data found.'
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)

    # explicit multiprocess settings override the computed core count
    if type(multiprocess) == int:
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure quicksaves are right type
    if quicksave is True:
        raise ValueError('quicksave must be string when using pmultiquery.')

    # the options that don't change
    d = {
         #'paralleling': True,
         'function': 'interrogator',
         'root': root,
         'note': note,
         'denominator': denom}

    # add kwargs to query
    for k, v in kwargs.items():
        d[k] = v

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        for index, p in enumerate(corpus):
            name = p.name
            a_dict = dict(d)
            a_dict['corpus'] = p
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name.replace('-parsed', '')
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_queries:
        for index, (name, q) in enumerate(query.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = q
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_option:
        for index, (name, q) in enumerate(function_filter.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['function_filter'] = q
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            # each job is restricted to a single speaker
            a_dict['just_speakers'] = [name]
            a_dict['function_filter'] = function_filter
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_search:
        for index, val in enumerate(search):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = val
            a_dict['query'] = query
            a_dict['show'] = show
            # NOTE(review): 'name' is not set in this branch — it is either
            # undefined or stale from an earlier loop. Looks like it should
            # be derived from 'val'; verify before relying on this path.
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['function_filter'] = function_filter
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)

    # announce what is about to happen, per mode
    time = strftime("%H:%M:%S", localtime())
    sformat = '\n '.join(['%s: %s' % (k.rjust(3), v) for k, v in search.items()])
    if multiple_corpora and not multiple_option:
        print ("\n%s: Beginning %d corpus interrogations (in %d parallel processes):\n %s" \
               "\n Query: '%s'\n" % (time, len(corpus), num_cores, "\n ".join([i.name for i in corpus]), sformat))
    elif multiple_queries:
        print ("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
               "\n Queries: '%s'\n" % (time, len(search), num_cores, corpus.name, "', '".join(search.values())))
    elif multiple_search:
        print ("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
               "\n Queries: '%s'\n" % (time, len(search.keys()), num_cores, corpus.name, str(search.values())))
    elif multiple_option:
        print ("\n%s: Beginning %d parallel corpus interrogations (multiple options): %s" \
               "\n Query: '%s'\n" % (time, num_cores, corpus.name, sformat))
    elif multiple_speakers:
        print ("\n%s: Beginning %d parallel corpus interrogations: %s" \
               "\n Query: '%s'\n" % (time, num_cores, corpus.name, sformat))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    #import sys
    #reload(sys)
    #stdout=sys.stdout
    failed = False
    terminal = False
    used_joblib = False
    #ds = ds[::-1]
    if not root:
        # use a blessings Terminal to print a QUEUED line per job at a
        # fixed screen position (jobs later overwrite their own line)
        from blessings import Terminal
        terminal = Terminal()
        print '\n' * (len(ds) - 2)
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    num_spaces = 26 - len(dobj['outname'])
                    print '%s: QUEUED: %s' % (thetime, dobj['outname'])
            except:
                pass

    if not root and multiprocess:
        # parallel path: fan the per-job dicts out over joblib workers
        #res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
        try:
            #ds = sorted(ds, key=lambda k: k['paralleling'], reverse = True)
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            used_joblib = True
        except:
            failed = True
            print 'Multiprocessing failed.'
            raise
        if not res:
            failed = True
    else:
        # serial fallback (GUI root present, or multiprocessing disabled)
        res = []
        for index, d in enumerate(ds):
            # startnum feeds the progress indicator for this job's slice
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        try:
            res = sorted(res)
        except:
            pass

    # multiprocessing way
    #from multiprocessing import Process
    #from corpkit.interrogator import interrogator
    #jobs = []
    ##for d in ds:
    ##    p = multiprocessing.Process(target=interrogator, kwargs=(**d,))
    ##    jobs.append(p)
    ##    p.start()
    ##    while p.is_alive():
    ##        import time
    ##        time.sleep(2)
    ##        if root:
    ##            root.update()
    #result_queue = multiprocessing.Queue()
    #
    #for d in ds:
    #funs = [interrogator(result_queue, **kwargs) for kwargs in ds]
    #jobs = [multiprocessing.Process(mc) for mc in funs]
    #for job in jobs: job.start()
    #for job in jobs: job.join()
    #results = [result_queue.get() for mc in funs]

    # turn list into dict of results, make query and total branches,
    # save and return
    if not all(type(i.results) == pd.core.series.Series for i in res):
        # heterogeneous results: return a dict-like Interrodict keyed by outname
        out = {}
        for interrog, d in zip(res, ds):
            interrog.query = d
            # these two can't be pickled, so strip them before saving
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            out[interrog.query['outname']] = interrog
        # could be wrong for unstructured corpora?
        if quicksave:
            fullpath = os.path.join('saved_interrogations', quicksave)
            # prompt until we have a non-clashing dir name, or overwrite on 'o'
            while os.path.isdir(fullpath):
                selection = raw_input("\nSave error: %s already exists in %s.\n\nType 'o' to overwrite, or enter a new name: " % (quicksave, 'saved_interrogations'))
                if selection == 'o' or selection == 'O':
                    import shutil
                    shutil.rmtree(fullpath)
                else:
                    import os
                    fullpath = os.path.join('saved_interrogations', selection)
            for k, v in out.items():
                save(v, k, savedir = fullpath, print_info = False)
            time = strftime("%H:%M:%S", localtime())
            print "\n%s: %d files saved to %s" % ( time, len(out.keys()), fullpath)
        time = strftime("%H:%M:%S", localtime())
        print "\n%s: Finished! Output is a dictionary with keys:\n\n '%s'\n" % (time, "'\n '".join(sorted(out.keys())))
        from corpkit.interrogation import Interrodict
        return Interrodict(out)
    # make query and total branch, save, return
    else:
        #print sers
        #print ds
        if multiple_corpora and not mult_corp_are_subs:
            # one row per corpus, then transpose to the usual orientation
            sers = [i.results for i in res]
            out = pd.DataFrame(sers, index = [d['outname'] for d in ds])
            out = out.reindex_axis(sorted(out.columns), axis=1) # sort cols
            out = out.fillna(0) # nan to zero
            out = out.astype(int) # float to int
            out = out.T
        else:
            out = pd.concat([r.results for r in res], axis = 1)
            # format like normal
            out = out[sorted(list(out.columns))]
            out = out.T
            out = out.fillna(0) # nan to zero
            out = out.astype(int)

        # sort by total
        if type(out) == pd.core.frame.DataFrame:
            # temporary total row drives the column ordering, then is dropped
            out.ix['Total-tmp'] = out.sum()
            tot = out.ix['Total-tmp']
            out = out[tot.argsort()[::-1]]
            out = out.drop('Total-tmp', axis = 0)
        out = out.edit(sort_by = sort_by, print_info = False, keep_stats = False)

        thetime = strftime("%H:%M:%S", localtime())
        if terminal:
            with terminal.location(0, terminal.height):
                print '\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n')
        else:
            print '\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n')
        if used_joblib:
            # pad past the per-job status lines printed earlier
            print '\n' * (len(ds) - 3)
        if quicksave:
            from corpkit.other import save
            save(out, quicksave)
        return out
def pmultiquery(corpus,
                search,
                show='words',
                query='any',
                sort_by='total',
                save=False,
                multiprocess='default',
                root=False,
                note=False,
                print_info=True,
                subcorpora=False,
                **kwargs
               ):
    """
    - Parallel process multiple queries or corpora.
    - This function is used by corpkit.interrogator.interrogator()
    - for multiprocessing.
    - There's no reason to call this function yourself.

    The 'multiple' kwarg (set by the caller) selects what is iterated
    over: corpora, named queries, named searches, speakers or subcorpora.
    Returns an Interrogation, an Interrodict, a Concordance, or None.
    """
    import os
    from pandas import DataFrame, Series
    import pandas as pd
    import collections
    from collections import namedtuple, OrderedDict
    from time import strftime, localtime
    import corpkit
    from corpkit.interrogator import interrogator
    from corpkit.interrogation import Interrogation, Interrodict
    from corpkit.process import canpickle
    # joblib is optional; without it, the serial path below is used
    try:
        from joblib import Parallel, delayed
    except ImportError:
        pass
    import multiprocessing

    # snapshot all arguments (plus kwargs) — this becomes the template
    # for the per-job dicts and the stored .query metadata
    locs = locals()
    for k, v in kwargs.items():
        locs[k] = v
    in_notebook = locs.get('in_notebook')

    def best_num_parallel(num_cores, num_queries):
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        import corpkit
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    # pick the largest even divisor that still fits on the cores
                    return max([int(num_queries / n) for n in range(2, num_cores) \
                               if int(num_queries / n) <= num_cores])
                except ValueError:
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores:
                        return int(square_root)
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple = kwargs.get('multiple', False)
    mult_corp_are_subs = False
    if hasattr(corpus, '__iter__'):
        # level 's' marks subcorpus objects
        if all(getattr(x, 'level', False) == 's' for x in corpus):
            mult_corp_are_subs = True

    non_first_sub = None
    if subcorpora:
        # when subcorpora is a list, the first field is iterated here and
        # the rest are handed down to each child interrogation
        non_first_sub = subcorpora[1:] if isinstance(subcorpora, list) else None
        subval = subcorpora if not non_first_sub else subcorpora[0]
        #print(subcorpora, non_first_sub, subval)
        if subcorpora is True:
            import re
            subcorpora = re.compile(r'.*')
        else:
            # strange travis error happened here
            subcorpora = corpus.metadata['fields'][subval]
            if len(subcorpora) == 0:
                print('No %s metadata found.' % str(subval))
                return

    # map each 'multiple' mode to (the thing to iterate, its kwarg name)
    mapcores = {'datalist': [corpus, 'corpus'],
                'multiplecorpora': [corpus, 'corpus'],
                'namedqueriessingle': [query, 'query'],
                'namedqueriesmultiple': [search, 'search'],
                'subcorpora': [subcorpora, 'subcorpora']}

    # a is a dummy, just to produce default one
    toiter, itsname = mapcores.get(multiple, [False, False])
    if isinstance(toiter, dict):
        toiter = toiter.items()
    denom = len(toiter)
    num_cores = best_num_parallel(num_cores, denom)

    # todo: code below makes no sense
    # NOTE(review): vals holds strings, so `x is True` is never true and
    # this branch is dead as written — verify intended condition.
    vals = ['eachspeaker', 'multiplespeaker', 'namedqueriesmultiple']
    if multiple == 'multiplecorpora' and any(x is True for x in vals):
        from corpkit.corpus import Corpus, Corpora
        if isinstance(corpus, Corpora):
            multiprocess = False
        else:
            corpus = Corpus(corpus)

    # explicit multiprocess settings override the computed core count
    if isinstance(multiprocess, int):
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure saves are right type
    if save is True:
        raise ValueError('save must be string when multiprocessing.')

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    locs['printstatus'] = False
    locs['multiprocess'] = False
    locs['df1_always_df'] = False
    locs['files_as_subcorpora'] = False
    locs['corpus'] = corpus

    if multiple == 'multiplespeaker':
        locs['multispeaker'] = True

    if isinstance(non_first_sub, list) and len(non_first_sub) == 1:
        non_first_sub = non_first_sub[0]

    # make the default query
    # drop anything that can't be pickled before fanning out to workers
    locs = {k: v for k, v in locs.items() if canpickle(v)}
    # make a new dict for every iteration
    ds = [dict(**locs) for i in range(denom)]
    for index, (d, bit) in enumerate(zip(ds, toiter)):
        d['paralleling'] = index
        if multiple in ['namedqueriessingle', 'namedqueriesmultiple']:
            # bit is a (name, query/search) pair
            d[itsname] = bit[1]
            d['outname'] = bit[0]
        elif multiple in ['multiplecorpora', 'datalist']:
            d['outname'] = bit.name.replace('-parsed', '')
            d[itsname] = bit
        elif multiple in ['subcorpora']:
            d[itsname] = bit
            # restrict each job to one metadata value via just_metadata
            jmd = {subval: bit}
            # put this earlier
            j2 = kwargs.get('just_metadata', False)
            if not j2:
                j2 = {}
            jmd.update(j2)
            d['just_metadata'] = jmd
            d['outname'] = bit
            d['by_metadata'] = False
            # remaining subcorpora fields are handled inside each job
            d['subcorpora'] = non_first_sub
            if non_first_sub:
                d['print_info'] = False

    # message printer should be a function...
    # NOTE(review): if 'conc' is absent/None, .lower() below raises —
    # callers apparently always pass conc; verify.
    if kwargs.get('conc') is False:
        message = 'Interrogating'
    elif kwargs.get('conc') is True:
        message = 'Interrogating and concordancing'
    elif kwargs.get('conc').lower() == 'only':
        message = 'Concordancing'

    time = strftime("%H:%M:%S", localtime())
    from corpkit.process import dictformat

    if print_info:
        # proper printing for plurals
        # in truth this needs to be revised, it's horrible.
        sformat = dictformat(search, query)
        if num_cores == 1:
            add_es = ''
        else:
            add_es = 'es'
        if multiple in ['multiplecorpora', 'datalist']:
            corplist = "\n ".join([i.name for i in list(corpus)[:20]])
            if len(corpus) > 20:
                corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s):\n %s" \
               "\n Query: %s\n %s corpus ... \n" % (time, len(corpus), num_cores, add_es, corplist, sformat, message)))
        elif multiple == 'namedqueriessingle':
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \
               "\n Queries: %s\n %s corpus ... \n" % (time, len(query), num_cores, add_es, corpus.name, sformat, message)))
        elif multiple == 'namedqueriesmultiple':
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \
               "\n Queries: %s\n %s corpus ... \n" % (time, len(list(search.keys())), num_cores, add_es, corpus.name, sformat, message)))
        elif multiple in ['eachspeaker', 'multiplespeaker']:
            print(("\n%s: Beginning %d parallel corpus interrogation%s: %s" \
               "\n Query: %s\n %s corpus ... \n" % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message)))
        elif multiple in ['subcorpora']:
            print(("\n%s: Beginning %d parallel corpus interrogation%s: %s" \
               "\n Query: %s\n %s corpus ... \n" % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message)))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    #import sys
    #reload(sys)
    #stdout=sys.stdout
    failed = False
    terminal = False
    used_joblib = False
    #ds = ds[::-1]
    #todo: the number of blank lines to print can be way wrong
    if not root and print_info:
        # use a blessings Terminal to print a QUEUED line per job at a
        # fixed screen position (jobs later overwrite their own line)
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    num_spaces = 26 - len(dobj['outname'])
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))
            except:
                pass

    if not root and multiprocess:
        # parallel path: fan the per-job dicts out over joblib workers
        try:
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            used_joblib = True
        except:
            failed = True
            print('Multiprocessing failed.')
            raise
        if not res:
            failed = True
    else:
        # serial fallback (GUI root present, or multiprocessing disabled)
        res = []
        for index, d in enumerate(ds):
            # startnum feeds the progress indicator for this job's slice
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        try:
            # drop empty results, then sort
            res = sorted([i for i in res if i])
        except:
            pass

    # remove unpicklable bits from query
    from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType
    badtypes = (ModuleType, FunctionType, BuiltinFunctionType, BuiltinMethodType)
    qlocs = {k: v for k, v in locs.items() if not isinstance(v, badtypes)}

    # store paths rather than corpus objects in the saved query
    if hasattr(qlocs.get('corpus', False), 'name'):
        qlocs['corpus'] = qlocs['corpus'].path
    else:
        qlocs['corpus'] = list([i.path for i in qlocs.get('corpus', [])])

    # return just a concordance
    from corpkit.interrogation import Concordance
    if kwargs.get('conc') == 'only':
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        concs = concs.reset_index(drop=True)
        if kwargs.get('maxconc'):
            concs = concs[:kwargs.get('maxconc')]
        lines = Concordance(concs)
        if save:
            lines.save(save, print_info=print_info)
        if print_info:
            # NOTE(review): format() returns a str but the placeholder is
            # %d — this looks like it raises TypeError; verify.
            print('\n\n%s: Finished! %d results.\n\n' % (thetime, format(len(concs.index), ',')))
        return lines

    # return interrodict (to become multiindex)
    if isinstance(res[0], Interrodict) or not all(isinstance(i.results, Series) for i in res):
        out = OrderedDict()
        for interrog, d in zip(res, ds):
            # these two can't be pickled, so strip them before saving
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            try:
                out[interrog.query['outname']] = interrog
            except KeyError:
                out[d['outname']] = interrog
        idict = Interrodict(out)
        if print_info:
            thetime = strftime("%H:%M:%S", localtime())
            print("\n\n%s: Finished! Output is multi-indexed." % thetime)
        idict.query = qlocs
        if save:
            idict.save(save, print_info=print_info)
        return idict

    # make query and total branch, save, return
    # todo: standardise this so we don't have to guess transposes
    # else:
    if multiple == 'multiplecorpora' and not mult_corp_are_subs:
        # one row per corpus, then transpose to the usual orientation
        sers = [i.results for i in res]
        out = DataFrame(sers, index=[i.query['outname'] for i in res])
        out = out.reindex_axis(sorted(out.columns), axis=1) # sort cols
        out = out.fillna(0) # nan to zero
        out = out.astype(int) # float to int
        out = out.T
    else:
        # make a series from counts
        if all(len(i.results) == 1 for i in res):
            out = pd.concat([r.results for r in res])
            out = out.sort_index()
        else:
            try:
                out = pd.concat([r.results for r in res], axis=1)
                out = out.T
                out.index = [i.query['outname'] for i in res]
            except ValueError:
                # nothing to concatenate: no results at all
                return None
        # format like normal
        # this sorts subcorpora, which are cls
        out = out[sorted(list(out.columns))]
        # puts subcorpora in the right place
        if not mult_corp_are_subs and multiple != 'subcorpora':
            out = out.T
        if multiple == 'subcorpora':
            out = out.sort_index()
        out = out.fillna(0) # nan to zero
        out = out.astype(int)
        if 'c' in show and mult_corp_are_subs:
            out = out.sum()
            out.index = sorted(list(out.index))

    # sort by total
    if isinstance(out, DataFrame):
        out = out[list(out.sum().sort_values(ascending=False).index)]

        # really need to figure out the deal with tranposing!
        if all(x.endswith('.xml') for x in list(out.columns)) \
            or all(x.endswith('.txt') for x in list(out.columns)) \
            or all(x.endswith('.conll') for x in list(out.columns)):
            out = out.T

    if kwargs.get('nosubmode'):
        out = out.sum()

    from corpkit.interrogation import Interrogation
    # totals: row sums for a frame, grand total for a series
    tt = out.sum(axis=1) if isinstance(out, DataFrame) else out.sum()
    out = Interrogation(results=out, totals=tt, query=qlocs)

    if hasattr(out, 'columns') and len(out.columns) == 1:
        out = out.sort_index()

    # attach merged concordance lines when requested
    if kwargs.get('conc') is True:
        try:
            concs = pd.concat([x.concordance for x in res], ignore_index=True)
            concs = concs.sort_values(by='c')
            concs = concs.reset_index(drop=True)
            if kwargs.get('maxconc'):
                concs = concs[:kwargs.get('maxconc')]
            out.concordance = Concordance(concs)
        except ValueError:
            out.concordance = None

    thetime = strftime("%H:%M:%S", localtime())
    if terminal:
        # move the cursor below the per-job status lines
        print(terminal.move(terminal.height-1, 0))
    if print_info:
        if terminal:
            print(terminal.move(terminal.height-1, 0))
        if hasattr(out.results, 'columns'):
            print('%s: Interrogation finished! %s unique results, %s total.' % (thetime, format(len(out.results.columns), ','), format(out.totals.sum(), ',')))
        else:
            print('%s: Interrogation finished! %s matches.' % (thetime, format(tt, ',')))

    if save:
        out.save(save, print_info = print_info)

    # collapse a single dummy-indexed row back to a series unless the
    # caller asked to always keep a DataFrame
    if list(out.results.index) == ['0'] and not kwargs.get('df1_always_df'):
        out.results = out.results.ix[0].sort_index()

    return out
def interroplot(path, query):
    """Interrogate ``path`` with a Tregex ``query``, convert the raw
    counts to relative frequencies, and plot the top results.

    :param path: the corpus (path) to search
    :param query: a Tregex query string
    """
    from corpkit import interrogator, editor, plotter
    # search the corpus for word matches
    interrogated = interrogator(path, 'words', query)
    # turn absolute counts into percentages of the totals
    as_percentage = editor(interrogated.results,
                           '%',
                           interrogated.totals,
                           print_info = False)
    plotter(str(path), as_percentage.results)
def concordance(self, *args, **kwargs):
    """
    Concordance the corpus by delegating to
    :func:`corpkit.interrogator.interrogator` with ``conc='only'``,
    so only concordance lines (not result counts) are produced.

    All positional and keyword arguments are passed through to the
    interrogator unchanged.
    """
    from corpkit.interrogator import interrogator
    return interrogator(self, conc='only', *args, **kwargs)
def interrogate(self, *args, **kwargs):
    """
    Interrogate the corpus by delegating to
    :func:`corpkit.interrogator.interrogator`.

    All positional and keyword arguments are passed through to the
    interrogator unchanged.
    """
    from corpkit.interrogator import interrogator
    return interrogator(self, *args, **kwargs)
def interrogate(self, search, *args, **kwargs):
    """
    Interrogate a corpus of texts for a lexicogrammatical phenomenon.

    This method iterates over the files/folders in a corpus,
    searching the texts, and returning a
    :class:`corpkit.interrogation.Interrogation` object containing
    the results. The main options are `search`, where you specify
    search criteria, and `show`, where you specify what you want to
    appear in the output.

    :Example:

    >>> corpus = Corpus('data/conversations-parsed')
    ### show lemma form of nouns ending in 'ing'
    >>> q = {W: r'ing$', P: r'^N'}
    >>> data = corpus.interrogate(q, show=L)
    >>> data.results
        ..  something  anything  thing  feeling  everything  nothing  morning
        01         14        11     12        1           6        0        1
        02         10        20      4        4           8        3        0
        03         14         5      5        3           1        0        0
        ...                                                               ...

    :param search: What part of the lexicogrammar to search, and what
       criteria to match. The `keys` are the thing to be searched, and
       values are the criteria. To search parse trees, use the `T` key,
       and a Tregex query as the value. When searching dependencies, you
       can use any of:

       +--------------------+-------+----------+-----------+-----------+
       |                    | Match | Governor | Dependent | Head      |
       +====================+=======+==========+===========+===========+
       | Word               | `W`   | `G`      | `D`       | `H`       |
       +--------------------+-------+----------+-----------+-----------+
       | Lemma              | `L`   | `GL`     | `DL`      | `HL`      |
       +--------------------+-------+----------+-----------+-----------+
       | Function           | `F`   | `GF`     | `DF`      | `HF`      |
       +--------------------+-------+----------+-----------+-----------+
       | POS tag            | `P`   | `GP`     | `DP`      | `HP`      |
       +--------------------+-------+----------+-----------+-----------+
       | Word class         | `X`   | `GX`     | `DX`      | `HX`      |
       +--------------------+-------+----------+-----------+-----------+
       | Distance from root | `R`   | `GR`     | `DR`      | `HR`      |
       +--------------------+-------+----------+-----------+-----------+
       | Index              | `I`   | `GI`     | `DI`      | `HI`      |
       +--------------------+-------+----------+-----------+-----------+
       | Sentence index     | `S`   | `SI`     | `SI`      | `SI`      |
       +--------------------+-------+----------+-----------+-----------+

       Values should be regular expressions or wordlists to match.

    :type search: `dict`

    :Example:

    >>> corpus.interrogate({T: r'/NN.?/ < /^t/'}) # t-nouns, via trees
    >>> corpus.interrogate({W: '^t', P: r'^v'}) # t-words which are verbs

    :param searchmode: Return results matching any/all criteria
    :type searchmode: `str` -- `'any'`/`'all'`

    :param exclude: The inverse of `search`, removing results from search
    :type exclude: `dict` -- `{L: 'be'}`

    :param excludemode: Exclude results matching any/all criteria
    :type excludemode: `str` -- `'any'`/`'all'`

    :param query: A search query for the interrogation. This is only used
       when `search` is a `str`, or when multiprocessing. If `search` is
       a str, the search criteria can be passed in as `query`, in order
       to allow the simpler syntax:

          >>> corpus.interrogate(GL, '(think|want|feel)')

       When multiprocessing, the following is possible:

          >>> q = {'Nouns': r'/NN.?/', 'Verbs': r'/VB.?/'}
          ### return an :class:`corpkit.interrogation.Interrodict` object:
          >>> corpus.interrogate(T, q)
          ### return an :class:`corpkit.interrogation.Interrogation` object:
          >>> corpus.interrogate(T, q, show=C)

    :type query: `str`, `dict` or `list`

    :param show: What to output. If multiple strings are passed in as a
       `list`, results will be colon-separated, in the suppled order.
       Possible values are the same as those for `search`, plus options
       for n-gramming and getting collocates:

       +------+-----------------------+------------------------+
       | Show | Gloss                 | Example                |
       +======+=======================+========================+
       | N    | N-gram word           | `The women were`       |
       +------+-----------------------+------------------------+
       | NL   | N-gram lemma          | `The woman be`         |
       +------+-----------------------+------------------------+
       | NF   | N-gram function       | `det nsubj root`       |
       +------+-----------------------+------------------------+
       | NP   | N-gram POS tag        | `DT NNS VBN`           |
       +------+-----------------------+------------------------+
       | NX   | N-gram word class     | `determiner noun verb` |
       +------+-----------------------+------------------------+
       | B    | Collocate word        | `The_were`             |
       +------+-----------------------+------------------------+
       | BL   | Collocate lemma       | `The_be`               |
       +------+-----------------------+------------------------+
       | BF   | Collocate function    | `det_root`             |
       +------+-----------------------+------------------------+
       | BP   | Collocate POS tag     | `DT_VBN`               |
       +------+-----------------------+------------------------+
       | BX   | Collocate word class  | `determiner_verb`      |
       +------+-----------------------+------------------------+

    :type show: `str`/`list` of strings

    :param lemmatise: Force lemmatisation on results. **Deprecated:
       instead, output a lemma form with the `show` argument**
    :type lemmatise: `bool`

    :param lemmatag: Explicitly pass a POS to lemmatiser (generally when
       data is unparsed, or when tag cannot be recovered from Tregex query)
    :type lemmatag: `'n'`/`'v'`/`'a'`/`'r'`/`False`

    :param spelling: Convert all to U.S. or U.K. English
    :type spelling: `False`/`'US'`/`'UK'`

    :param dep_type: The kind of Stanford CoreNLP dependency parses you
       want to use: `'basic-dependencies'`, `'collapsed-dependencies'`,
       or `'collapsed-ccprocessed-dependencies'`.

    :param save: Save result as pickle to
       `saved_interrogations/<save>` on completion
    :type save: `str`

    :param gramsize: Size of n-grams (default 2)
    :type gramsize: `int`

    :param split_contractions: Make `"don't"` et al into two tokens
    :type split_contractions: `bool`

    :param multiprocess: How many parallel processes to run
    :type multiprocess: `int`/`bool` (`bool` determines automatically)

    :param files_as_subcorpora: Treat each file as a subcorpus, ignoring
       actual subcorpora if present
    :type files_as_subcorpora: `bool`

    :param conc: Generate a concordance while interrogating, store as
       `.concordance` attribute
    :type conc: `bool`/`'only'`

    :param coref: Allow counting of pronominal referents
    :type coref: `bool`

    :param representative: Allow copula coreference matching
    :type representative: `bool`

    :param representative: Allow non-copula coreference matching
    :type representative: `bool`

    .. note:: the two `representative` entries above are duplicated in
       the original documentation — one of them presumably documents a
       differently-named parameter; verify against the interrogator.

    :param tgrep: Use `TGrep` for tree querying. TGrep is less
       expressive than Tregex, and is slower, but can work without Java.
    :type tgrep: `bool`

    :param just_speakers: Limit search to paricular speakers. If 'each',
       generate :class:`corpkit.interrogation.Interrodict` for each
       speaker. If a `list` of speaker names, generate
       :class:`corpkit.interrogation.Interrodict` for each named
       speaker. If compiled regular expression, generate
       :class:`corpkit.interrogation.Interrogation` with each speaker
       matching the regex conflated.
    :type just_speakers: `str`/`each`/`list`/`regex`

    :returns: A :class:`corpkit.interrogation.Interrogation` object, with
       `.query`, `.results`, `.totals` attributes. If multiprocessing is
       invoked, result may be a
       :class:`corpkit.interrogation.Interrodict` containing corpus
       names, queries or speakers as keys.
    """
    from corpkit.interrogator import interrogator
    # pull multiprocess out of kwargs so it isn't forwarded twice
    par = kwargs.pop('multiprocess', None)
    # 'corpus' in kwargs would clash with the positional corpus argument
    kwargs.pop('corpus', None)
    if par and self.subcorpora:
        # multiprocessing over subcorpora: an int preserves the
        # requested number of processes
        if isinstance(par, int):
            kwargs['multiprocess'] = par
        return interrogator(self.subcorpora, search, *args, **kwargs)
    else:
        # single-process (or no subcorpora): interrogate the corpus itself
        kwargs['multiprocess'] = par
        return interrogator(self, search, *args, **kwargs)
def pmultiquery(corpus,
                search,
                show='words',
                query='any',
                sort_by='total',
                save=False,
                multiprocess='default',
                just_speakers=False,
                root=False,
                note=False,
                print_info=True,
                **kwargs):
    """
    Parallel-process multiple queries or corpora.

    This function is used by corpkit.interrogator.interrogator() for
    multiprocessing; there's no reason to call this function yourself.

    :param corpus: corpus object, or an iterable of (sub)corpora; an
                   iterable triggers one parallel job per corpus
    :param search: search criteria; a dict of named search dicts triggers
                   one parallel job per search
    :param show: what to show in the output
    :param query: query/queries; a `list`/`dict` triggers one job per query
    :param sort_by: how the final edited result is sorted
    :param save: name to save the result under (must be a `str`, not `True`)
    :param multiprocess: number of processes to use, or `False` to run serially
    :param just_speakers: speaker name(s) to restrict to; `'each'` runs one
                          job per speaker found in the corpus
    :param root: GUI root object, passed through to interrogator()
    :param note: GUI note object, passed through to interrogator()
    :param print_info: whether to print progress information
    :returns: an Interrogation, an Interrodict, or (with ``conc='only'``)
              a Concordance, depending on the options given
    """
    from collections import OrderedDict
    from time import strftime, localtime
    import pandas as pd
    from pandas import DataFrame, Series
    import corpkit
    from corpkit.interrogator import interrogator
    try:
        # joblib is optional: without it, or with multiprocess=False,
        # the jobs are simply run one after the other below
        from joblib import Parallel, delayed
    except ImportError:
        pass
    import multiprocessing

    # snapshot of the arguments, later stored on the result as .query
    locs = locals()
    for k, v in kwargs.items():
        locs[k] = v

    def best_num_parallel(num_cores, num_queries):
        """Decide how many parallel processes to run; the idea, more or less,
        is to balance the load when possible."""
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores)
                                if int(num_queries / n) <= num_cores])
                except ValueError:
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores:
                        return int(square_root)
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? work out which argument provides the
    # "one job each" dimension of the parallelism
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False
    multiple_search = False
    mult_corp_are_subs = False
    denom = 1

    if hasattr(corpus, '__iter__'):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(corpus))
        denom = len(corpus)
        if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus):
            mult_corp_are_subs = True
    elif (isinstance(query, (list, dict)) and not hasattr(search, '__iter__')):
        multiple_queries = True
        num_cores = best_num_parallel(num_cores, len(query))
        denom = len(query)
    # guard with isinstance(dict) so a plain-string search cannot crash
    # on .values() (str has __iter__ but no .values())
    elif isinstance(search, dict) and all(isinstance(i, dict) for i in list(search.values())):
        multiple_search = True
        num_cores = best_num_parallel(num_cores, len(list(search.keys())))
        denom = len(list(search.keys()))
    elif just_speakers:
        # fixed: absolute import, consistent with the other corpkit imports
        from corpkit.build import get_speaker_names_from_xml_corpus
        multiple_speakers = True
        if just_speakers == 'each' or just_speakers == ['each']:
            just_speakers = get_speaker_names_from_xml_corpus(corpus.path)
        if len(just_speakers) == 0:
            print('No speaker name data found.')
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)

    # if the iterable is something other than the corpora, a corpus path
    # may have been passed in: wrap it up as a Corpus object
    if multiple_corpora and any(x is True for x in
                                [multiple_speakers, multiple_queries,
                                 multiple_search, multiple_option]):
        from corpkit.corpus import Corpus, Corpora
        if isinstance(corpus, Corpora):
            multiprocess = False
        else:
            corpus = Corpus(corpus)

    if isinstance(multiprocess, int):
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure saves are right type
    if save is True:
        raise ValueError('save must be string when multiprocessing.')

    # the options that don't change between jobs
    d = {'function': 'interrogator',
         'root': root,
         'note': note,
         'denominator': denom}

    # add kwargs to query
    for k, v in list(kwargs.items()):
        d[k] = v

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        for index, p in enumerate(corpus):
            a_dict = dict(d)
            a_dict['corpus'] = p
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = p.name.replace('-parsed', '')
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_queries:
        for index, (name, q) in enumerate(query.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = q
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = [name]
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_search:
        for index, (name, val) in enumerate(search.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = val
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)

    # fixed: the original called .lower() on kwargs.get('conc'), which
    # raised AttributeError when conc was missing/None and left `message`
    # unbound for unexpected values; default to plain interrogation
    conc = kwargs.get('conc')
    if conc is True:
        message = 'Interrogating and concordancing'
    elif isinstance(conc, str) and conc.lower() == 'only':
        message = 'Concordancing'
    else:
        message = 'Interrogating'
    time = strftime("%H:%M:%S", localtime())

    # build a printable summary of what is being searched for
    sformat = ''
    if multiple_queries:
        to_it_over = query
    else:
        to_it_over = search
    for i, (k, v) in enumerate(list(to_it_over.items())):
        if isinstance(v, list):
            vformat = ', '.join(v[:5])
            if len(v) > 5:
                vformat += ' ...'
        elif isinstance(v, dict):
            vformat = ''
            for kk, vv in v.items():
                if isinstance(vv, list):
                    # fixed: mark truncation from the *list* length before
                    # joining (the original tested len() of the joined
                    # string, so nearly every value got ' ...')
                    if len(vv) > 5:
                        vv = ', '.join(vv[:5]) + ' ...'
                    else:
                        vv = ', '.join(vv)
                vformat += '\n %s: %s' % (kk, vv)
        else:
            try:
                vformat = v.pattern
            except AttributeError:
                vformat = v
        sformat += '%s: %s' % (k, vformat)
        if i < len(to_it_over.keys()) - 1:
            sformat += '\n '

    if print_info:
        # proper printing for plurals
        # in truth this needs to be revised, it's horrible.
        if num_cores == 1:
            add_es = ''
        else:
            add_es = 'es'
        if multiple_corpora and not multiple_option:
            corplist = "\n ".join([i.name for i in corpus[:20]])
            if len(corpus) > 20:
                corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s):\n %s"
                   "\n Query: %s\n %s corpus ... \n"
                   % (time, len(corpus), num_cores, add_es, corplist, sformat, message)))
        elif multiple_queries:
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s"
                   "\n Queries: %s\n %s corpus ... \n"
                   % (time, len(query), num_cores, add_es, corpus.name, sformat, message)))
        elif multiple_search:
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s"
                   "\n Queries: %s\n %s corpus ... \n"
                   % (time, len(list(search.keys())), num_cores, add_es, corpus.name, sformat, message)))
        elif multiple_option:
            print(("\n%s: Beginning %d parallel corpus interrogation%s (multiple options): %s"
                   "\n Query: %s\n %s corpus ... \n"
                   % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message)))
        elif multiple_speakers:
            print(("\n%s: Beginning %d parallel corpus interrogation%s: %s"
                   "\n Query: %s\n %s corpus ... \n"
                   % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message)))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    failed = False
    terminal = False
    used_joblib = False
    if not root and print_info:
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))
            except Exception:
                pass

    if not root and multiprocess:
        try:
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            used_joblib = True
        except Exception:
            failed = True
            print('Multiprocessing failed.')
            raise
        if not res:
            failed = True
    else:
        # serial fallback (GUI root present, or multiprocess disabled)
        res = []
        for index, job in enumerate(ds):
            job['startnum'] = (100 / denom) * index
            res.append(interrogator(**job))
    try:
        res = sorted([i for i in res if i])
    except Exception:
        # results may not define an ordering; keep arrival order
        pass

    # remove unpicklable bits from query
    from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType
    badtypes = (ModuleType, FunctionType, BuiltinFunctionType, BuiltinMethodType)
    qlocs = {k: v for k, v in locs.items() if not isinstance(v, badtypes)}

    if hasattr(qlocs['corpus'], 'name'):
        qlocs['corpus'] = qlocs['corpus'].path
    else:
        qlocs['corpus'] = [i.path for i in qlocs['corpus']]

    from corpkit.interrogation import Concordance
    if kwargs.get('conc') == 'only':
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        concs = concs.reset_index(drop=True)
        lines = Concordance(concs)
        if save:
            lines.save(save, print_info=print_info)
        if print_info:
            print('\n\n%s: Finished! %d results.\n\n' % (thetime, len(concs.index)))
        return lines

    if not all(isinstance(i.results, Series) for i in res):
        # heterogeneous results: return a dict-like Interrodict
        out = OrderedDict()
        for interrog, job in zip(res, ds):
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            try:
                out[interrog.query['outname']] = interrog
            except KeyError:
                out[job['outname']] = interrog
        from corpkit.interrogation import Interrodict
        idict = Interrodict(out)
        if print_info:
            time = strftime("%H:%M:%S", localtime())
            print("\n\n%s: Finished! Output is a dictionary with keys:\n\n '%s'\n" %
                  (time, "'\n '".join(sorted(out.keys()))))
        idict.query = qlocs
        if save:
            idict.save(save, print_info=print_info)
        return idict
    # make query and total branch, save, return
    # todo: standardise this so we don't have to guess transposes
    else:
        if multiple_corpora and not mult_corp_are_subs:
            sers = [i.results for i in res]
            out = DataFrame(sers, index=[i.query['outname'] for i in res])
            # sort cols (reindex_axis was removed from pandas; plain
            # column selection is equivalent here)
            out = out[sorted(out.columns)]
            out = out.fillna(0)     # nan to zero
            out = out.astype(int)   # float to int
            out = out.T
        else:
            try:
                out = pd.concat([r.results for r in res], axis=1)
                out = out.T
                out.index = [i.query['outname'] for i in res]
            except ValueError:
                return None
            # format like normal; this sorts subcorpora, which are cls
            out = out[sorted(list(out.columns))]
            # puts subcorpora in the right place
            if not mult_corp_are_subs:
                out = out.T
            out = out.fillna(0)     # nan to zero
            out = out.astype(int)
            if 'c' in show and mult_corp_are_subs:
                out = out.sum()
                out.index = sorted(list(out.index))

        # sort by total
        if isinstance(out, DataFrame):
            out = out[list(out.sum().sort_values(ascending=False).index)]
            # really need to figure out the deal with transposing!
            if all(x.endswith('.xml') for x in list(out.columns)) \
                    or all(x.endswith('.txt') for x in list(out.columns)):
                out = out.T

        out = out.edit(sort_by=sort_by, print_info=False, keep_stats=False,
                       df1_always_df=kwargs.get('df1_always_df'))
        out.query = qlocs
        if len(out.results.columns) == 1:
            out.results = out.results.sort_index()
        if kwargs.get('conc') is True:
            concs = pd.concat([x.concordance for x in res], ignore_index=True)
            concs = concs.sort_values(by='c')
            concs = concs.reset_index(drop=True)
            out.concordance = Concordance(concs)
        thetime = strftime("%H:%M:%S", localtime())
        if terminal and print_info:
            with terminal.location(0, terminal.height):
                print('\n\n%s: Finished! %d unique results, %d total.%s' %
                      (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        else:
            if print_info:
                print('\n\n%s: Finished! %d unique results, %d total.%s' %
                      (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        if save:
            out.save(save, print_info=print_info)
        return out