def multiquery(corpus, query, sort_by='total', quicksave=False):
    """Count several named queries in one corpus and combine their totals.

    Each (name, pattern) pair in `query` is run through interrogator() with
    the 'count' option; the totals of every run are renamed to `name`,
    concatenated column-wise and passed through editor().

    Args:
        corpus: the corpus handed to interrogator() for every query.
        query: iterable of (name, pattern) pairs, for example
            [[u'NPs in corpus', r'NP'], [u'VPs in corpus', r'VP']]
        sort_by: forwarded to editor() as its sort_by argument.
        quicksave: False to skip saving, or a string name under which the
            result is saved with save_result(). '.p' is appended when
            missing. If a file of that name already exists in
            'data/saved_interrogations', the user is prompted for a new name.

    Returns:
        The object returned by editor() (it exposes `results` and `totals`).

    Raises:
        ValueError: if quicksave is True rather than a string.
    """
    # Validate cheap arguments before importing the heavy dependencies.
    if quicksave is True:
        raise ValueError('quicksave must be string when using multiquery.')

    import os
    import pandas as pd
    from time import strftime, localtime
    from corpkit.interrogator import interrogator
    from corpkit.editor import editor

    # raw_input only exists on Python 2; fall back to input elsewhere.
    try:
        read_input = raw_input
    except NameError:
        read_input = input

    if quicksave:
        savedir = 'data/saved_interrogations'
        if not quicksave.endswith('.p'):
            quicksave = quicksave + '.p'
        # Keep asking until the name is free. The chosen name is stored back
        # in quicksave so that it is the one actually used when saving.
        while os.path.isfile(os.path.join(savedir, quicksave)):
            quicksave = read_input(
                "\nSave error: %s already exists in %s.\n\nPick a new name: "
                % (quicksave, savedir))
            if not quicksave.endswith('.p'):
                quicksave = quicksave + '.p'

    totals = []
    for name, pattern in query:
        result = interrogator(corpus, 'count', pattern)
        result.totals.name = name  # label the count with the query's name
        totals.append(result.totals)

    results = pd.concat(totals, axis=1)
    results = editor(results, sort_by=sort_by, print_info=False,
                     keep_stats=False)

    timestamp = strftime("%H:%M:%S", localtime())
    print('%s: Finished! %d unique results, %d total.'
          % (timestamp, len(results.results.columns), results.totals.sum()))

    if quicksave:
        from corpkit.other import save_result
        save_result(results, quicksave)
    return results
def pmultiquery(path, option='c', query='any', sort_by='total',
                quicksave=False, num_proc='default', function_filter=False,
                **kwargs):
    """Run several corpus interrogations in parallel and combine the results.

    This function is used by interrogator if:

        a) path is a list of paths
        b) query is a dict of named queries
        c) function_filter is a dict of named filters

    The cases are checked in that order and only the first match is
    parallelised over. This function needs joblib 0.8.4 or above in order
    to run properly.

    Args:
        path: a single corpus path (string) or a collection of paths.
        option: interrogator option. Options starting with 'c' produce one
            combined, edited result; options starting with 'k' produce
            results without a totals field.
        query: a single query, or a dict mapping names to queries.
        sort_by: forwarded to editor() for 'c' options.
        quicksave: False, or a string name used when saving the output.
        num_proc: 'default' to pick the number of parallel jobs
            automatically, otherwise the value passed to joblib as n_jobs.
        function_filter: a dict mapping names to filters, or a value without
            an `items` method when unused.
        **kwargs: extra keyword arguments forwarded to every interrogator()
            call.

    Returns:
        For options starting with 'c', the object returned by editor().
        Otherwise a dict mapping each result name to an 'interrogation'
        namedtuple with fields `query` and `results`, plus `totals` unless
        the option starts with 'k'.

    Raises:
        ValueError: if quicksave is True, if none of path, query and
            function_filter holds multiple items, or if joblib is missing.
    """
    # Validate cheap arguments before importing the heavy dependencies.
    if quicksave is True:
        raise ValueError('quicksave must be string when using pmultiquery.')

    # On Python 2, basestring covers both str and unicode.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # Are we processing multiple corpora, queries or filters?
    multiple_corpora = not isinstance(path, string_types)
    multiple_queries = not multiple_corpora and hasattr(query, 'items')
    multiple_option = (not multiple_corpora and not multiple_queries
                       and hasattr(function_filter, 'items'))
    if not (multiple_corpora or multiple_queries or multiple_option):
        raise ValueError('pmultiquery needs a list of paths, a dict of '
                         'queries or a dict of function filters.')

    import collections
    import math
    import multiprocessing
    import os
    import pandas as pd
    from time import strftime, localtime
    from corpkit.interrogator import interrogator
    from corpkit.editor import editor
    from corpkit.other import save_result
    try:
        from joblib import Parallel, delayed
    except ImportError:
        raise ValueError('joblib, the module used for multiprocessing, cannot be found. '
                         'Install with:\n\n pip install joblib')

    def best_num_parallel(num_cores, num_queries):
        """Decide how many parallel processes to run for num_queries jobs."""
        if num_queries <= num_cores:
            return num_queries
        if num_queries // num_cores == num_cores:
            return int(num_cores)
        if num_queries % num_cores == 0:
            candidates = [num_queries // n for n in range(2, num_cores)
                          if num_queries // n <= num_cores]
            # No candidate exists on one- or two-core machines, or when no
            # quotient fits within the core count; the jobs divide evenly
            # among the cores, so use all of them.
            return max(candidates) if candidates else int(num_cores)
        root = math.sqrt(num_queries)
        if root.is_integer() and root <= num_queries // num_cores:
            return int(root)
        return num_queries // ((num_queries // num_cores) + 1)

    # Find out the number of parallel jobs to use.
    if num_proc != 'default':
        num_cores = num_proc
    else:
        if multiple_corpora:
            num_jobs = len(path)
        elif multiple_queries:
            num_jobs = len(query)
        else:
            num_jobs = len(function_filter)
        num_cores = best_num_parallel(multiprocessing.cpu_count(), num_jobs)

    # The options that don't change between jobs, plus the caller's extras.
    shared = {'option': option, 'paralleling': True,
              'function': 'interrogator'}
    shared.update(kwargs)

    def make_job(outname, **overrides):
        """Build the keyword arguments for one interrogator() call."""
        settings = dict(shared)
        settings.update(path=path, query=query, outname=outname,
                        printstatus=False)
        settings.update(overrides)
        return settings

    # Make a list of dicts to pass to interrogator, with the iterable
    # unique in every one.
    if multiple_corpora:
        path = sorted(path)
        ds = [make_job(os.path.basename(p), path=p) for p in path]
    elif multiple_queries:
        ds = [make_job(name, query=q) for name, q in query.items()]
    else:
        ds = [make_job(name, function_filter=f)
              for name, f in function_filter.items()]

    timestamp = strftime("%H:%M:%S", localtime())
    if multiple_corpora:
        print("\n%s: Beginning %d parallel corpus interrogations:\n %s"
              "\n Query: '%s'"
              "\n Interrogating corpus ... \n"
              % (timestamp, num_cores, "\n ".join(path), query))
    elif multiple_queries:
        print("\n%s: Beginning %d parallel corpus interrogations: %s"
              "\n Queries: '%s'"
              "\n Interrogating corpus ... \n"
              % (timestamp, num_cores, path, "', '".join(query.values())))
    else:
        print("\n%s: Beginning %d parallel corpus interrogations (multiple options): %s"
              "\n Query: '%s'"
              "\n Interrogating corpus ... \n"
              % (timestamp, num_cores, path, query))

    # Run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option).
    res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
    res = sorted(res)

    # Make query and total branch, save, return.
    if option.startswith('c'):
        out = pd.concat(res, axis=1)
        out = editor(out, sort_by=sort_by, print_info=False,
                     keep_stats=False)
        timestamp = strftime("%H:%M:%S", localtime())
        print('\n%s: Finished! %d unique results, %d total.'
              % (timestamp, len(out.results.columns), out.totals.sum()))
        if quicksave:
            save_result(out, quicksave)
        return out

    # Turn list into dict of results, make query and total branches,
    # save and return.
    with_totals = collections.namedtuple('interrogation',
                                         ['query', 'results', 'totals'])
    without_totals = collections.namedtuple('interrogation',
                                            ['query', 'results'])
    # The results were sorted above but the settings were not, so look the
    # settings up by name; fall back to position if a name is not found.
    settings_by_name = {s['outname']: s for s in ds}
    out = {}
    print('')
    for (name, data), positional in zip(res, ds):
        settings = settings_by_name.get(name, positional)
        if option.startswith('k'):
            out[name] = without_totals(settings, data)
        else:
            # could be wrong for unstructured corpora?
            stotal = data.sum(axis=1)
            stotal.name = u'Total'
            out[name] = with_totals(settings, data, stotal)

    timestamp = strftime("%H:%M:%S", localtime())
    print("\n%s: Finished! Output is a dictionary with keys:\n\n '%s'\n"
          % (timestamp, "'\n '".join(sorted(out.keys()))))
    if quicksave:
        for k, v in out.items():
            save_result(v, k,
                        savedir='data/saved_interrogations/%s' % quicksave)
    return out