예제 #1
0
파일: corpus.py 프로젝트: maxdesp/corpkit
    def concordance(self, *args, **kwargs):
        """
        Produce concordance lines for this corpus.

        Works with Tregex queries, CoreNLP dependencies, tokenised data
        or plaintext.

        :Example:

        >>> wv = ['want', 'need', 'feel', 'desire']
        >>> corpus.concordance({L: wv, F: 'root'})
           0   01  1-01.txt.xml                But , so I  feel     like i do that for w
           1   01  1-01.txt.xml                         I  felt     a little like oh , i
           2   01  1-01.txt.xml   he 's a difficult man I  feel     like his work ethic 
           3   01  1-01.txt.xml                      So I  felt     like i recognized li
           ...                                                                       ...

        Accepts the same arguments as
        :func:`~corpkit.interrogation.Interrogation.interrogate`, plus:

        :param only_format_match: if True, left and right window will just be
                                  words, regardless of what is in 'show'
        :type only_format_match: bool

        :param random: randomise lines
        :type random: bool

        :param only_unique: only unique lines
        :type only_unique: bool

        Any ``do_concordancing`` or ``conc`` keyword given by the caller is
        discarded; the search is always run with ``do_concordancing='only'``.

        :returns: A :class:`corpkit.interrogation.Concordance` instance

        """
        from interrogator import interrogator

        # Remove caller-supplied concordancing switches so they cannot
        # conflict with the value forced below.
        for overridden in ('do_concordancing', 'conc'):
            kwargs.pop(overridden, None)
        kwargs['do_concordancing'] = 'only'
        return interrogator(self, *args, **kwargs)
예제 #2
0
    def concordance(self, *args, **kwargs):
        """
        Run a concordance-only search over this corpus.

        Suitable for Tregex queries, CoreNLP dependencies, tokenised data
        or plaintext.

        :Example:

        >>> wv = ['want', 'need', 'feel', 'desire']
        >>> corpus.concordance({L: wv, F: 'root'})
           0   01  1-01.txt.xml                But , so I  feel     like i do that for w
           1   01  1-01.txt.xml                         I  felt     a little like oh , i
           2   01  1-01.txt.xml   he 's a difficult man I  feel     like his work ethic 
           3   01  1-01.txt.xml                      So I  felt     like i recognized li
           ...                                                                       ...

        Takes the arguments of
        :func:`~corpkit.interrogation.Interrogation.interrogate`, plus:

        :param only_format_match: if True, left and right window will just be
                                  words, regardless of what is in 'show'
        :type only_format_match: bool

        :param random: randomise lines
        :type random: bool

        :param only_unique: only unique lines
        :type only_unique: bool

        ``do_concordancing`` and ``conc`` keywords from the caller are
        ignored: the call is always made with ``do_concordancing='only'``.

        :returns: A :class:`corpkit.interrogation.Concordance` instance

        """
        from interrogator import interrogator

        # Strip any user-provided concordancing flags before forcing ours.
        forwarded = kwargs
        forwarded.pop('do_concordancing', None)
        forwarded.pop('conc', None)
        forwarded['do_concordancing'] = 'only'
        return interrogator(self, *args, **forwarded)
예제 #3
0
파일: other.py 프로젝트: mphilli/corpkit
def interroplot(path, query):
    """Interrogate ``path`` for ``query``, convert to relative frequencies and plot.

    The work is done in three steps: ``interrogator(path, 'words', query)``,
    then ``editor(results, '%', totals)`` to turn the counts into
    percentages of the totals, then ``plotter`` titled with ``str(path)``.

    :param path: corpus location handed to ``interrogator``; also used
                 (as a string) as the plot title
    :param query: search pattern passed as the third argument to
                  ``interrogator`` (described by the original author as a
                  Tregex query)
    :returns: None
    """
    # The docstring must be the first statement of the function; it used to
    # sit below this import and was therefore never attached as __doc__.
    import corpkit
    from corpkit import interrogator, editor, plotter
    quickstart = interrogator(path, 'words', query)
    edited = editor(quickstart.results, '%', quickstart.totals, print_info=False)
    plotter(str(path), edited.results)
예제 #4
0
파일: corpus.py 프로젝트: xsongx/corpkit
    def concordance(self, *args, **kwargs):
        """
        A concordance method for Tregex queries, CoreNLP dependencies,
        tokenised data or plaintext.

           >>> wv = ['want', 'need', 'feel', 'desire']
           >>> corpus.concordance({'l': wv, 'f': 'root'})

        Arguments are the same as
        :func:`~corpkit.interrogation.Interrogation.interrogate`, plus:

        :param only_format_match: if True, left and right window will just be
                                  words, regardless of what is in 'show'
        :type only_format_match: bool

        :param random: randomise lines
        :type random: bool

        :param only_unique: only unique lines
        :type only_unique: bool

        A ``conc`` keyword supplied by the caller is ignored; the search is
        always run with ``conc=True``.

        :returns: A :class:`corpkit.interrogation.Concordance` instance

        """

        from interrogator import interrogator
        # ``conc`` is forced on below; without this pop a caller passing
        # ``conc=...`` would trigger "got multiple values for keyword argument".
        kwargs.pop('conc', None)
        return interrogator(self, conc=True, *args, **kwargs)
예제 #5
0
    def features(self):
        """
        Compute basic statistics for this corpus.

        Runs ``interrogator`` over the corpus with the ``'s'`` search and the
        ``'any'`` query.

        :returns: the ``results`` attribute of that interrogation
        """
        from interrogator import interrogator
        stats = interrogator(self, 's', 'any')
        return stats.results
예제 #6
0
파일: corpus.py 프로젝트: xsongx/corpkit
    def get_stats(self, *args):
        """
        Compute basic stats for the corpus and keep them on the instance as
        :py:attr:`~corpkit.corpus.Corpus.features`

           >>> corpus.get_stats()

        Positional arguments are accepted but not used.

        :returns: None
        """
        from interrogator import interrogator
        # The interrogation is run against the corpus path, not the object.
        interrogation = interrogator(self.path, 's', 'any')
        self.features = interrogation.results
        print('\nFeatures defined. See .features attribute ...')
예제 #7
0
    def features(self):
        """
        Produce basic stats for the corpus, including number of sentences,
        clauses, process types, etc.

        :Example:

        >>> corpus.features
            SB  Characters  Tokens  Words  Closed class words  Open class words  Clauses
            01       26873    8513   7308                4809              3704     2212   
            02       25844    7933   6920                4313              3620     2270   
            03       18376    5683   4877                3067              2616     1640   
            04       20066    6354   5366                3587              2767     1775

        :returns: the ``results`` of interrogating the corpus with the
                  ``'s'`` search and the ``'any'`` query
        """
        from interrogator import interrogator
        stats = interrogator(self, 's', 'any')
        return stats.results
예제 #8
0
파일: corpus.py 프로젝트: maxdesp/corpkit
    def features(self):
        """
        Build a table of basic corpus statistics (sentences, clauses,
        process types and so on).

        :Example:

        >>> corpus.features
            SB  Characters  Tokens  Words  Closed class words  Open class words  Clauses
            01       26873    8513   7308                4809              3704     2212   
            02       25844    7933   6920                4313              3620     2270   
            03       18376    5683   4877                3067              2616     1640   
            04       20066    6354   5366                3587              2767     1775

        :returns: ``interrogator(self, 's', 'any').results``
        """
        from interrogator import interrogator
        interrogation = interrogator(self, 's', 'any')
        table = interrogation.results
        return table
예제 #9
0
파일: other.py 프로젝트: mphilli/corpkit
def multiquery(corpus, query, sort_by='total', quicksave=False):
    """Count several named queries and combine their totals into one result.

    Each pattern is counted with ``interrogator(corpus, 'count', pattern)``;
    the per-query totals are renamed to the query name, concatenated
    column-wise and passed through ``editor``.

    :param corpus: passed as the first argument to ``interrogator``
    :param query: iterable of ``(name, pattern)`` pairs, e.g.
                  ``[[u'NPs in corpus', r'NP'], [u'VPs in corpus', r'VP']]``
    :param sort_by: forwarded to ``editor``
    :param quicksave: False to skip saving, or the name to save the result
                      under (``'.p'`` is appended when missing). If a file
                      of that name already exists in ``saved_interrogations``
                      the user is prompted for another name.
    :returns: the object returned by ``editor``
    """
    # NOTE: the docstring has to come before this import to be a docstring.
    import corpkit
    import os
    import pandas as pd
    from time import strftime, localtime
    from interrogator import interrogator
    from editor import editor

    if quicksave:
        savedir = 'saved_interrogations'
        if not quicksave.endswith('.p'):
            quicksave = quicksave + '.p'
        # Keep prompting until the chosen name no longer collides. The name
        # is written back to ``quicksave`` so the final save really uses it
        # and the loop condition is re-evaluated against the new path.
        while os.path.isfile(os.path.join(savedir, quicksave)):
            selection = raw_input("\nSave error: %s already exists in %s.\n\nPick a new name: " % (quicksave, savedir))
            if not selection.endswith('.p'):
                selection = selection + '.p'
            quicksave = selection

    totals = []
    for name, pattern in query:
        result = interrogator(corpus, 'count', pattern)
        result.totals.name = name  # label the totals with the query name
        totals.append(result.totals)
    results = pd.concat(totals, axis=1)

    results = editor(results, sort_by=sort_by, print_info=False, keep_stats=False)
    time = strftime("%H:%M:%S", localtime())
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('%s: Finished! %d unique results, %d total.' % (time, len(results.results.columns), results.totals.sum()))
    if quicksave:
        from other import save_result
        save_result(results, quicksave)
    return results
예제 #10
0
def pmultiquery(corpus, 
    search,
    show = 'words',
    query = 'any', 
    sort_by = 'total', 
    quicksave = False,
    multiprocess = 'default', 
    function_filter = False,
    just_speakers = False,
    root = False,
    note = False,
    print_info = True,
    **kwargs):
    """Parallel process multiple queries or corpora.

    This function is used by interrogator() if:

        a) path is a list of paths
        b) query is a dict of named queries
        c) just speakers == 'each', or a list of speakers with len(list) > 1
    
    This function needs joblib 0.8.4 or above in order to run properly.
    There's no reason to call it yourself.

    One keyword-argument dict is built per item of whichever input is
    treated as the iterable (corpora, queries, function filters, speakers
    or searches), and interrogator() is called once per dict, either via
    joblib or sequentially.

    :returns: a Concordance when do_concordancing == 'only'; an Interrodict
              when not every result's .results is a pandas Series; otherwise
              the object produced by out.edit(...). None if speakers were
              requested and no speaker names were found.
    """
    
    import collections
    import os
    import pandas as pd
    import collections
    from collections import namedtuple
    from time import strftime, localtime
    import corpkit
    from interrogator import interrogator
    from editor import editor
    from other import save
    from interrogation import Interrogation
    # joblib is optional at import time: a failed import is swallowed here,
    # so Parallel/delayed are simply undefined if it is missing.
    # NOTE(review): the bare except also hides unrelated import errors.
    try:
        from joblib import Parallel, delayed
    except:
        pass
        #raise ValueError('joblib, the module used for multiprocessing, cannot be found. ' \
        #                 'Install with:\n\n        pip install joblib')
    import multiprocessing

    def best_num_parallel(num_cores, num_queries):
        # NOTE(review): because the import comes first, the string below is a
        # plain expression statement, not this helper's docstring.
        import corpkit
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        # Never start more processes than there are jobs.
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                # Largest num_queries / n (n in 2..num_cores-1) that still
                # fits within num_cores; max() of an empty list raises
                # ValueError, in which case all cores are used.
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores) if int(num_queries / n) <= num_cores])   
                except ValueError:
                    return num_cores
            else:
                import math
                # Perfect-square job counts may use sqrt(num_queries) processes.
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores: 
                        return int(square_root)    
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False
    multiple_search = False
    mult_corp_are_subs = False
    denom = 1

    # Exactly one of the branches below picks the iterable; denom records its
    # length and num_cores is reduced to suit it.
    if hasattr(corpus, '__iter__'):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(corpus))
        denom = len(corpus)
        if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus):
            mult_corp_are_subs = True
    elif (type(query) == list or type(query) == dict) and not hasattr(search, '__iter__'):
            multiple_queries = True
            num_cores = best_num_parallel(num_cores, len(query))
            denom = len(query)
    # NOTE(review): this branch requires search NOT to be a dict, yet calls
    # search.keys() — confirm the intended condition.
    elif hasattr(search, '__iter__') and type(search) != dict:
        multiple_search = True
        num_cores = best_num_parallel(num_cores, len(list(search.keys())))
        denom = len(list(search.keys()))
    elif hasattr(function_filter, '__iter__'):
        multiple_option = True
        num_cores = best_num_parallel(num_cores, len(list(function_filter.keys())))
        denom = len(list(function_filter.keys()))
    elif just_speakers:
        from build import get_speaker_names_from_xml_corpus
        multiple_speakers = True
        # 'each' means: look the speaker names up from the corpus itself.
        if just_speakers == 'each' or just_speakers == ['each']:
            just_speakers = get_speaker_names_from_xml_corpus(corpus.path)
        if len(just_speakers) == 0:
            print('No speaker name data found.')
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)
        
    # An explicit int overrides the computed process count; False forces one.
    if type(multiprocess) == int:
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure quicksaves are right type
    if quicksave is True:
        raise ValueError('quicksave must be string when using pmultiquery.')
    
    # the options that don't change
    d = {
         #'paralleling': True,
         'function': 'interrogator',
         'root': root,
         'note': note,
         'denominator': denom}
    
    # add kwargs to query
    for k, v in list(kwargs.items()):
        d[k] = v

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        for index, p in enumerate(corpus):
            name = p.name
            a_dict = dict(d)
            a_dict['corpus'] = p
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name.replace('-parsed', '')
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_queries:
        # NOTE(review): multiple_queries can be set for a list query, but
        # .items() only exists on dicts — confirm lists are handled upstream.
        for index, (name, q) in enumerate(query.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = q
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_option:
        for index, (name, q) in enumerate(function_filter.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['function_filter'] = q
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = [name]
            a_dict['function_filter'] = function_filter
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_search:
        for index, val in enumerate(search):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = val
            a_dict['query'] = query
            a_dict['show'] = show
            # NOTE(review): `name` is only bound inside the other branches'
            # loops, so this line looks like it would raise here — confirm.
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['function_filter'] = function_filter
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)

    # Pick the verb used in the progress banner.
    # NOTE(review): if 'do_concordancing' is absent, .get() returns None and
    # the .lower() call below would fail; `message` is also left unassigned
    # for any other string value — confirm callers always pass it.
    if kwargs.get('do_concordancing') is False:
        message = 'Interrogating'
    elif kwargs.get('do_concordancing') is True:
        message = 'Interrogating and concordancing'
    elif kwargs.get('do_concordancing').lower() == 'only':
        message = 'Concordancing'
    time = strftime("%H:%M:%S", localtime())
    # Human-readable rendering of the search dict; list values are cut to
    # their first five items.
    sformat = ''
    for i, (k, v) in enumerate(list(search.items())):
        if type(v) == list:
            vformat = ', '.join(v[:5])
            if len(v) > 5:
                vformat += ' ...'
        else:
            vformat = v
        sformat += '%s: %s' %(k, vformat)
        if i < len(search.keys()) - 1:
            sformat += '\n                  '

    if multiple_corpora and not multiple_option:
        # Only the first 20 corpus names are listed.
        corplist = "\n              ".join([i.name for i in corpus[:20]])
        if len(corpus) > 20:
            corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
        print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes):\n              %s" \
           "\n          Query: '%s'\n          %s corpus ... \n"  % (time, len(corpus), num_cores, corplist, sformat, message)))

    elif multiple_queries:
        print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
           "\n          Queries: '%s'\n          %s corpus ... \n" % (time, len(search), num_cores, corpus.name, "', '".join(list(search.values())), message) ))

    elif multiple_search:
        print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
           "\n          Queries: '%s'\n          %s corpus ... \n" % (time, len(list(search.keys())), num_cores, corpus.name, str(list(search.values())), message)))

    elif multiple_option:
        print(("\n%s: Beginning %d parallel corpus interrogations (multiple options): %s" \
           "\n          Query: '%s'\n          %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message) ))

    elif multiple_speakers:
        print(("\n%s: Beginning %d parallel corpus interrogations: %s" \
           "\n          Query: '%s'\n          %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message) ))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    #import sys
    #reload(sys)
    #stdout=sys.stdout
    failed = False
    terminal = False
    used_joblib = False
    #ds = ds[::-1]
    # Without a `root` object, reserve one terminal line per job and print a
    # QUEUED marker on it.
    if not root:
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    num_spaces = 26 - len(dobj['outname'])
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))

            except:
                pass

    if not root and multiprocess:
        #res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
        try:
            #ds = sorted(ds, key=lambda k: k['paralleling'], reverse = True) 
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            used_joblib = True
        except:
            # The failure is reported and then re-raised, so nothing below
            # runs after a joblib error.
            failed = True
            print('Multiprocessing failed.')
            raise
        if not res:
            failed = True
    else:
        # Sequential fallback: run the jobs one after another, giving each a
        # 'startnum' proportional to its position among the jobs.
        res = []
        for index, d in enumerate(ds):
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        # Sorting is best-effort; results that cannot be ordered are kept in
        # run order.
        try:
            res = sorted(res)
        except:
            pass

    # multiprocessing way
    #from multiprocessing import Process
    #from interrogator import interrogator
    #jobs = []
    ##for d in ds:
    ##    p = multiprocessing.Process(target=interrogator, kwargs=(**d,))
    ##    jobs.append(p)
    ##    p.start()
    ##    while p.is_alive():
    ##        import time
    ##        time.sleep(2)
    ##        if root:
    ##            root.update()
    #result_queue = multiprocessing.Queue()
    #
    #for d in ds:
    #funs = [interrogator(result_queue, **kwargs) for kwargs in ds]
    #jobs = [multiprocessing.Process(mc) for mc in funs]
    #for job in jobs: job.start()
    #for job in jobs: job.join()
    #results = [result_queue.get() for mc in funs]

    import corpkit
    from interrogation import Concordance
    # Concordance-only run: every job's result is concatenated with
    # pd.concat and wrapped in a Concordance.
    if kwargs.get('do_concordancing') == 'only':
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        print('\n\n%s: Finished! %d results.\n\n' % (thetime, len(concs.index)))
        return Concordance(concs)

    from collections import OrderedDict
    # If any job's .results is not a Series, the results are not merged;
    # they are returned as an Interrodict keyed by each job's outname.
    if not all(type(i.results) == pd.core.series.Series for i in res):
        out = OrderedDict()
        for interrog, d in zip(res, ds):
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            out[interrog.query['outname']] = interrog
    
        if quicksave:
            fullpath = os.path.join('saved_interrogations', quicksave)
            # Loop until the target directory does not exist: either the user
            # overwrites it ('o'/'O') or supplies another name.
            while os.path.isdir(fullpath):
                selection = input("\nSave error: %s already exists in %s.\n\nType 'o' to overwrite, or enter a new name: " % (quicksave, 'saved_interrogations'))
                if selection == 'o' or selection == 'O':
                    import shutil
                    shutil.rmtree(fullpath)
                else:
                    import os
                    fullpath = os.path.join('saved_interrogations', selection)

            for k, v in list(out.items()):
                save(v, k, savedir = fullpath, print_info = False)
        
            time = strftime("%H:%M:%S", localtime())
            print("\n%s: %d files saved to %s" % ( time, len(list(out.keys())), fullpath))

        time = strftime("%H:%M:%S", localtime())
        print("\n\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n" % (time, "'\n         '".join(sorted(out.keys()))))
        from interrogation import Interrodict
        return Interrodict(out)
    # make query and total branch, save, return
    else:
        #print sers
        #print ds
        if multiple_corpora and not mult_corp_are_subs:
            # One row per corpus (labelled by outname), then transposed.
            sers = [i.results for i in res]
            out = pd.DataFrame(sers, index = [i.query['outname'] for i in res])
            out = out.reindex_axis(sorted(out.columns), axis=1) # sort cols
            out = out.fillna(0) # nan to zero
            out = out.astype(int) # float to int
            out = out.T            
        else:
            out = pd.concat([r.results for r in res], axis = 1)
            # format like normal
            out = out[sorted(list(out.columns))]
            out = out.T
            out = out.fillna(0) # nan to zero
            out = out.astype(int)
            if 'c' in show and mult_corp_are_subs:
                out = out.sum()
                out.index = sorted(list(out.index))

        # sort by total
        # A temporary 'Total-tmp' row is used to order the columns by their
        # sums (descending), then dropped again.
        if type(out) == pd.core.frame.DataFrame:
            out.ix['Total-tmp'] = out.sum()
            tot = out.ix['Total-tmp']
            out = out[tot.argsort()[::-1]]
            out = out.drop('Total-tmp', axis = 0)
        # NOTE(review): .edit is not a stock pandas method — presumably
        # corpkit adds it to DataFrame/Series; verify.
        out = out.edit(sort_by = sort_by, print_info = False, keep_stats = False, \
                      df1_always_df = kwargs.get('df1_always_df'))
        if len(out.results.columns) == 1:
            out.results = out.results.sort_index()   
        if kwargs.get('do_concordancing') is True:
            # Merge the per-job concordances, sort them by column 'c' and
            # renumber the rows.
            concs = pd.concat([x.concordance for x in res], ignore_index = True)
            concs = concs.sort_values(by='c')
            concs = concs.reset_index(drop=True)
            out.concordance = Concordance(concs)
        thetime = strftime("%H:%M:%S", localtime())
        if terminal:
            with terminal.location(0, terminal.height):
                print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        else:
            print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        #if used_joblib:
            
        if quicksave:
            from other import save
            save(out, quicksave)
        print('\n')
        return out
예제 #11
0
파일: corpus.py 프로젝트: xsongx/corpkit
    def interrogate(self, search, *args, **kwargs):
        """Search a corpus of texts for a lexicogrammatical phenomenon

            >>> # show lemma form of nouns ending in 'ing'
            >>> q = {'w': r'ing$', 'p': r'^N'}
            >>> data = corpus.interrogate(q, show = 'l')

        :param search: What the query should match
           - t/tregex
           - w/word
           - l/lemma
           - f/function
           - g/governor
           - d/dependent
           - p/pos
           - i/index
           - n/ngrams
           - s/general stats
        :type search: str, or, for dependencies, a dict like ``{'w': 'help', 'p': r'^V'}``

        :param searchmode: Return results matching any/all criteria
        :type searchmode: str ('any'/'all')

        :param exclude: The inverse of `search`, removing results from search
        :type exclude: dict -- ``{'l': 'be'}``

        :param excludemode: Exclude results matching any/all criteria
        :type excludemode: str ('any'/'all')

        :param query: A search query for the interrogation
        :type query: str -- regex/Tregex pattern; dict -- ``{name: pattern}``; list -- word list to match

        :param show: What to output. Multiple strings give colon-separated results, in order
           - t/tree
           - w/word
           - l/lemma
           - g/governor
           - d/dependent
           - f/function
           - p/pos
           - i/index
           - a/distance from root
        :type show: list of strings

        :param lemmatise: Force lemmatisation on results
        :type lemmatise: bool

        :param lemmatag: Explicitly pass a pos to lemmatiser (generally when data is unparsed)
        :type lemmatag: False/'n'/'v'/'a'/'r'

        :param spelling: Convert all to U.S. or U.K. English
        :type spelling: False/'US'/'UK'

        :param dep_type: The kind of Stanford CoreNLP dependency parses you want to use
        :type dep_type: str -- 'basic-dependencies'/'a', 'collapsed-dependencies'/'b', 'collapsed-ccprocessed-dependencies'/'c'

        :param quicksave: Save result as pickle to ```saved_interrogations/str``` on completion
        :type quicksave: str

        :param gramsize: size of ngrams (default 2)
        :type gramsize: int

        :param split_contractions: make ``"don't"`` et al into two tokens
        :type split_contractions: bool

        :param multiprocess: how many parallel processes to run. When truthy
           and the corpus has subcorpora, the subcorpora are searched instead
           of the corpus itself, and an int value is forwarded to the
           interrogator; in every other case the keyword is not forwarded.
        :type multiprocess: int / bool (to determine automatically)

        :param files_as_subcorpora: treat each file as a subcorpus
        :type files_as_subcorpora: bool

        :returns: A :class:`corpkit.interrogation.Interrogation` object, with
           ``.query``, ``.results``, ``.totals`` attributes. If multiprocessing
           is invoked, result may be a :class:`corpkit.interrogation.Interrodict`
           containing corpus names, queries or speakers as keys.
        """
        from interrogator import interrogator

        parallel = kwargs.pop('multiprocess', None)

        # Plain case: no parallel request, or nothing to parallelise over.
        if not (parallel and self.subcorpora):
            return interrogator(self, search, *args, **kwargs)

        # Only a genuine int (not a bool) is passed on as a process count.
        if type(parallel) is int:
            kwargs['multiprocess'] = parallel
        return interrogator(self.subcorpora, search, *args, **kwargs)
예제 #12
0
 def interrogate(self, *args, **kwargs):
     """Run an interrogation over this object.

     All positional and keyword arguments are forwarded unchanged to the
     interrogator; see :func:`~corpkit.corpus.Corpus.interrogate`.
     """
     from interrogator import interrogator
     outcome = interrogator(self, *args, **kwargs)
     return outcome
예제 #13
0
 def concordance(self, *args, **kwargs):
     """Concordance the corpus using :func:`~corpkit.corpus.Corpus.concordance`

     Arguments are forwarded to the interrogator. Any ``do_concordancing``
     or ``conc`` keyword from the caller is discarded, because the search is
     always run with ``do_concordancing='only'``.
     """
     from interrogator import interrogator
     # Passing do_concordancing both explicitly and via **kwargs would raise
     # a duplicate-keyword TypeError, so drop caller-supplied switches first.
     kwargs.pop('do_concordancing', None)
     kwargs.pop('conc', None)
     return interrogator(self, do_concordancing='only', *args, **kwargs)
예제 #14
0
파일: corpus.py 프로젝트: maxdesp/corpkit
 def concordance(self, *args, **kwargs):
     """Concordance the corpus using :func:`~corpkit.corpus.Corpus.concordance`

     Positional and keyword arguments go straight to the interrogator,
     except ``do_concordancing`` and ``conc``: those are removed and the
     search is forced to ``do_concordancing='only'``.
     """
     from interrogator import interrogator
     # Avoid "multiple values for keyword argument" when a caller supplies
     # one of the switches this method sets itself.
     for overridden in ('do_concordancing', 'conc'):
         kwargs.pop(overridden, None)
     return interrogator(self, do_concordancing='only', *args, **kwargs)
예제 #15
0
    def interrogate(self, search, *args, **kwargs):
        """Search a corpus of texts for a lexicogrammatical phenomenon

        :Example:

        >>> corpus = Corpus('data/conversations-parsed')
        ### show lemma form of nouns ending in 'ing'
        >>> q = {W: r'ing$', P: r'^N'}
        >>> data = corpus.interrogate(q, show = L)
        >>> data.results
            ..  something  anything  thing  feeling  everything  nothing  morning
            01         14        11     12        1           6        0        1   
            02         10        20      4        4           8        3        0   
            03         14         5      5        3           1        0        0
            ...                                                               ...   

        :param search: What the query should match
           - t/tregex
           - w/word
           - l/lemma
           - f/function
           - g/governor
           - d/dependent
           - p/pos
           - i/index
           - n/ngrams
           - s/general stats
        :type search: str, or, for dependencies, a dict like `{W: 'help', P: r'^V'}`

        :param searchmode: Return results matching any/all criteria
        :type searchmode: str -- `'any'`/`'all'`

        :param exclude: The inverse of `search`, removing results from search
        :type exclude: dict -- `{L: 'be'}`

        :param excludemode: Exclude results matching any/all criteria
        :type excludemode: str -- `'any'`/`'all'`

        :param query: A search query for the interrogation
        :type query: 
           - str -- regex/Tregex pattern
           - dict -- `{name: pattern}`
           - list -- word list to match

        :param show: What to output. Multiple strings give colon-separated results, in order
           - t/tree
           - w/word
           - l/lemma
           - g/governor
           - d/dependent
           - f/function
           - p/pos
           - i/index
           - a/distance from root
        :type show: list of strings

        :param lemmatise: Force lemmatisation on results
        :type lemmatise: bool

        :param lemmatag: Explicitly pass a pos to lemmatiser (generally when data is unparsed)
        :type lemmatag: False/'n'/'v'/'a'/'r'

        :param spelling: Convert all to U.S. or U.K. English
        :type spelling: False/'US'/'UK'

        :param dep_type: The kind of Stanford CoreNLP dependency parses you want to use
        :type dep_type: str -- 'basic-dependencies'/'a', 'collapsed-dependencies'/'b', 'collapsed-ccprocessed-dependencies'/'c'

        :param save: Save result as pickle to `saved_interrogations/<save>` on completion
        :type save: str

        :param gramsize: size of ngrams (default 2)
        :type gramsize: int

        :param split_contractions: make `"don't"` et al into two tokens
        :type split_contractions: bool

        :param multiprocess: how many parallel processes to run. When truthy
           and the corpus has subcorpora, the subcorpora are searched instead
           of the corpus itself, and an int value is forwarded to the
           interrogator; in every other case the keyword is not forwarded.
        :type multiprocess: int / bool (to determine automatically)

        :param files_as_subcorpora: treat each file as a subcorpus
        :type files_as_subcorpora: bool

        :param do_concordancing: Concordance while interrogating, store as `.concordance` attribute
        :type do_concordancing: bool/'only'

        :param maxconc: Maximum number of concordance lines
        :type maxconc: int

        :returns: A :class:`corpkit.interrogation.Interrogation` object, with
           `.query`, `.results`, `.totals` attributes. If multiprocessing is
           invoked, result may be a :class:`corpkit.interrogation.Interrodict`
           containing corpus names, queries or speakers as keys.
        """
        from interrogator import interrogator

        requested = kwargs.pop('multiprocess', None)
        use_subcorpora = requested and self.subcorpora

        if use_subcorpora:
            # Forward an explicit process count only; a bare True is dropped.
            if type(requested) is int:
                kwargs['multiprocess'] = requested
            target = self.subcorpora
        else:
            target = self
        return interrogator(target, search, *args, **kwargs)
예제 #16
0
파일: corpus.py 프로젝트: xsongx/corpkit
 def interrogate(self, *args, **kwargs):
     """Interrogate every item of this collection in a single call.

     The object is iterated into a list, which is handed to the
     interrogator together with the caller's arguments; see
     :func:`~corpkit.corpus.Corpus.interrogate`.
     """
     from interrogator import interrogator
     members = [member for member in self]
     return interrogator(members, *args, **kwargs)
예제 #17
0
def pmultiquery(corpus,
                search,
                show='words',
                query='any',
                sort_by='total',
                quicksave=False,
                multiprocess='default',
                just_speakers=False,
                root=False,
                note=False,
                print_info=True,
                **kwargs):
    """Parallel process multiple queries or corpora.

    This function is used by interrogator() for multiprocessing.

    There's no reason to call this function yourself.

    One job is built per item of the first input found to be iterable,
    checked in this order: `corpus` (anything with `__iter__`), `query`
    (a list or dict, when `search` has no `__iter__`), `search` (when all
    of its values are dicts) and finally `just_speakers`. Every job is a
    dict of keyword arguments that is passed to interrogator().

    :param corpus: a single corpus, or an iterable of corpora/subcorpora
    :param search: search criteria; a dict of named search dicts gives one
                   job per name
    :param show: forwarded unchanged to every job
    :param query: forwarded to every job; a dict of named queries gives one
                  job per name
    :param sort_by: sort order handed to `.edit()` when results are merged
    :param quicksave: name to save the result under, or False; True is rejected
    :param multiprocess: False to run the jobs one after another, an int to
                         fix the number of processes, any other truthy value
                         to choose the number automatically
    :param just_speakers: speaker names, or 'each' to read them from
                          `corpus.path`
    :param root: when truthy, jobs run one after another and no terminal
                 status lines are drawn
    :param note: forwarded unchanged to every job
    :param print_info: print progress and summary messages
    :param kwargs: extra options forwarded to every job
                   (`do_concordancing` and `df1_always_df` are also read here)

    :returns: a Concordance when `do_concordancing` is 'only'; an Interrodict
              when any job's `.results` is not a Series; otherwise the merged
              and edited result. None when no speaker names are found or the
              per-job results cannot be concatenated.
    :raises ValueError: if `quicksave` is True rather than a string
    :raises ImportError: if a parallel run is attempted without joblib
    """

    import collections
    import os
    import pandas as pd
    from collections import namedtuple
    from time import strftime, localtime
    import corpkit
    from interrogator import interrogator
    from editor import editor
    from other import save
    from interrogation import Interrogation
    try:
        from joblib import Parallel, delayed
    except ImportError:
        # joblib is only needed for the parallel path; its absence is
        # reported there instead of surfacing later as a NameError
        Parallel = delayed = None
    import multiprocessing

    # snapshot of the names bound so far (arguments and imports), extended
    # with the extra keyword options; stored as `.query` on an Interrodict
    locs = dict(locals())
    locs.update(kwargs)

    def best_num_parallel(num_cores, num_queries):
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        if num_queries <= num_cores:
            return num_queries
        if (num_queries / num_cores) == num_cores:
            return int(num_cores)
        if num_queries % num_cores == 0:
            candidates = [
                int(num_queries / n) for n in range(2, num_cores)
                if int(num_queries / n) <= num_cores
            ]
            # no divisor yields a batch small enough: use every core
            return max(candidates) if candidates else num_cores
        import math
        square_root = math.sqrt(num_queries)
        if float(square_root).is_integer() \
                and square_root <= num_queries / num_cores:
            return int(square_root)
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple_option = False  # never switched on in this function
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False
    multiple_search = False
    mult_corp_are_subs = False
    denom = 1

    if hasattr(corpus, '__iter__'):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(corpus))
        denom = len(corpus)
        if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus):
            mult_corp_are_subs = True
    elif type(query) in (list, dict) and not hasattr(search, '__iter__'):
        # NOTE(review): a list is accepted here, but the job building and
        # the query preview below call `.items()` on it -- confirm whether
        # list queries are meant to be supported
        multiple_queries = True
        num_cores = best_num_parallel(num_cores, len(query))
        denom = len(query)
    elif hasattr(search, '__iter__') and all(
            type(i) == dict for i in list(search.values())):
        multiple_search = True
        num_cores = best_num_parallel(num_cores, len(list(search.keys())))
        denom = len(list(search.keys()))
    elif just_speakers:
        from build import get_speaker_names_from_xml_corpus
        multiple_speakers = True
        if just_speakers == 'each' or just_speakers == ['each']:
            just_speakers = get_speaker_names_from_xml_corpus(corpus.path)
        if len(just_speakers) == 0:
            print('No speaker name data found.')
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)

    # if this thing has already come through multiquery, don't multiprocess this time
    #if kwargs.get('outname'):
    #    multiprocess = False

    # NOTE(review): the flags above are set by one if/elif chain, so this
    # condition cannot currently be true; kept in case the chain changes
    if multiple_corpora and any(x is True for x in [
            multiple_speakers, multiple_queries, multiple_search,
            multiple_option
    ]):
        from corpus import Corpus, Corpora
        if corpus.__class__ == Corpora:
            multiprocess = False
        else:
            corpus = Corpus(corpus)

    if type(multiprocess) == int:
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure quicksaves are right type
    if quicksave is True:
        raise ValueError('quicksave must be string when using pmultiquery.')

    # the options that don't change
    base_options = {
        #'paralleling': True,
        'function': 'interrogator',
        'root': root,
        'note': note,
        'denominator': denom
    }

    # add kwargs to query
    base_options.update(kwargs)

    def make_job(index, outname, **overrides):
        """Build the keyword arguments for one interrogator() call."""
        job = dict(base_options)
        job.update(corpus=corpus, search=search, query=query, show=show,
                   just_speakers=just_speakers)
        # the one value that differs between jobs
        job.update(overrides)
        job.update(outname=outname, paralleling=index, printstatus=False)
        return job

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        ds = [make_job(index, p.name.replace('-parsed', ''), corpus=p)
              for index, p in enumerate(corpus)]
    elif multiple_queries:
        ds = [make_job(index, name, query=q)
              for index, (name, q) in enumerate(query.items())]
    elif multiple_speakers:
        ds = [make_job(index, name, just_speakers=[name])
              for index, name in enumerate(just_speakers)]
    elif multiple_search:
        ds = [make_job(index, name, search=val)
              for index, (name, val) in enumerate(search.items())]

    # a missing or unrecognised do_concordancing value is treated as a plain
    # interrogation instead of raising while building a status message
    conc_mode = kwargs.get('do_concordancing', False)
    if conc_mode is True:
        message = 'Interrogating and concordancing'
    elif hasattr(conc_mode, 'lower') and conc_mode.lower() == 'only':
        message = 'Concordancing'
    else:
        message = 'Interrogating'
    time = strftime("%H:%M:%S", localtime())

    def preview(values):
        """Join at most five list items, marking the text when items were cut."""
        text = ', '.join(values[:5])
        if len(values) > 5:
            text += ' ...'
        return text

    to_it_over = query if multiple_queries else search
    formatted = []
    for k, v in to_it_over.items():
        if isinstance(v, list):
            vformat = preview(v)
        elif isinstance(v, dict):
            vformat = ''
            for kk, vv in v.items():
                # the ellipsis depends on the number of list items, not on
                # the length of the joined text
                if isinstance(vv, list):
                    vv = preview(vv)
                vformat += '\n                     %s: %s' % (kk, vv)
        else:
            vformat = v
        formatted.append('%s: %s' % (k, vformat))
    sformat = '\n                   '.join(formatted)

    if print_info:
        if multiple_corpora and not multiple_option:
            corplist = "\n              ".join([i.name for i in corpus[:20]])
            if len(corpus) > 20:
                corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes):\n              %s" \
               "\n          Query: %s\n          %s corpus ... \n"  % (time, len(corpus), num_cores, corplist, sformat, message)))

        elif multiple_queries:
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
               "\n          Queries: %s\n          %s corpus ... \n" % (time, len(query), num_cores, corpus.name, sformat, message) ))

        elif multiple_search:
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
               "\n          Queries: %s\n          %s corpus ... \n" % (time, len(list(search.keys())), num_cores, corpus.name, sformat, message)))

        elif multiple_option:
            print(("\n%s: Beginning %d parallel corpus interrogations (multiple options): %s" \
               "\n          Query: %s\n          %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message) ))

        elif multiple_speakers:
            print(("\n%s: Beginning %d parallel corpus interrogations: %s" \
               "\n          Query: %s\n          %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message) ))

    # draw one QUEUED status line per job, positioned from the bottom of the
    # terminal by the job's index
    terminal = False
    if not root and print_info:
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))
            except Exception:
                # status lines are cosmetic; never let them stop the run
                pass

    # run the jobs, in parallel when possible
    if not root and multiprocess:
        try:
            if Parallel is None:
                raise ImportError('joblib, the module used for multiprocessing, cannot be found. ' \
                                  'Install with:\n\n        pip install joblib')
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x)
                                             for x in ds)
        except BaseException:
            # report, then let the original error propagate
            print('Multiprocessing failed.')
            raise
    else:
        res = []
        for index, job in enumerate(ds):
            job['startnum'] = (100 / denom) * index
            res.append(interrogator(**job))
        try:
            res = sorted(res)
        except Exception:
            # results that cannot be ordered are kept in job order
            pass

    from interrogation import Concordance
    if kwargs.get('do_concordancing') == 'only':
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        if print_info:
            print('\n\n%s: Finished! %d results.\n\n' %
                  (thetime, len(concs.index)))
        return Concordance(concs)

    from collections import OrderedDict
    if not all(type(i.results) == pd.Series for i in res):
        # at least one job produced something other than a Series:
        # return the jobs individually, keyed by name
        out = OrderedDict()
        for interrog, job in zip(res, ds):
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            try:
                out[interrog.query['outname']] = interrog
            except KeyError:
                out[job['outname']] = interrog

        if quicksave:
            fullpath = os.path.join('saved_interrogations', quicksave)
            while os.path.isdir(fullpath):
                selection = input(
                    "\nSave error: %s already exists in %s.\n\nType 'o' to overwrite, or enter a new name: "
                    % (quicksave, 'saved_interrogations'))
                if selection == 'o' or selection == 'O':
                    import shutil
                    shutil.rmtree(fullpath)
                else:
                    fullpath = os.path.join('saved_interrogations', selection)

            for k, v in list(out.items()):
                save(v, k, savedir=fullpath, print_info=False)

            time = strftime("%H:%M:%S", localtime())
            print("\n%s: %d files saved to %s" %
                  (time, len(list(out.keys())), fullpath))

        time = strftime("%H:%M:%S", localtime())
        if print_info:
            print(
                "\n\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n"
                % (time, "'\n         '".join(sorted(out.keys()))))
        from interrogation import Interrodict
        idict = Interrodict(out)

        # remove unpicklable bits from query
        from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType
        unpicklable_types = (ModuleType, FunctionType,
                             BuiltinFunctionType, BuiltinMethodType)
        locs = {k: v for k, v in locs.items()
                if not isinstance(v, unpicklable_types)}
        idict.query = locs
        return idict

    # make query and total branch, save, return
    if multiple_corpora and not mult_corp_are_subs:
        sers = [i.results for i in res]
        out = pd.DataFrame(sers, index=[i.query['outname'] for i in res])
        out = out[sorted(out.columns)]  # sort cols
        out = out.fillna(0)  # nan to zero
        out = out.astype(int)  # float to int
        out = out.T
    else:
        try:
            out = pd.concat([r.results for r in res], axis=1)
        except ValueError:
            return None
        # format like normal
        out = out[sorted(list(out.columns))]
        out = out.T
        out = out.fillna(0)  # nan to zero
        out = out.astype(int)
        if 'c' in show and mult_corp_are_subs:
            out = out.sum()
            # NOTE(review): this relabels the index without reordering the
            # values -- confirm the index is already sorted at this point
            out.index = sorted(list(out.index))

    # sort by total
    if type(out) == pd.DataFrame:
        # .loc/.iloc replace the removed .ix indexer; columns are reordered
        # by position, from largest total to smallest
        out.loc['Total-tmp'] = out.sum()
        tot = out.loc['Total-tmp']
        out = out.iloc[:, tot.values.argsort()[::-1]]
        out = out.drop('Total-tmp', axis=0)
    out = out.edit(sort_by = sort_by, print_info = False, keep_stats = False, \
                  df1_always_df = kwargs.get('df1_always_df'))
    if len(out.results.columns) == 1:
        out.results = out.results.sort_index()
    if kwargs.get('do_concordancing') is True:
        concs = pd.concat([x.concordance for x in res], ignore_index=True)
        concs = concs.sort_values(by='c')
        concs = concs.reset_index(drop=True)
        out.concordance = Concordance(concs)
    thetime = strftime("%H:%M:%S", localtime())
    if print_info:
        summary = '\n\n%s: Finished! %d unique results, %d total.%s' % (
            thetime, len(out.results.columns), out.totals.sum(), '\n')
        if terminal:
            with terminal.location(0, terminal.height):
                print(summary)
        else:
            print(summary)

    if quicksave:
        save(out, quicksave)
    return out
예제 #18
0
def configurations(corpus, search, **kwargs):
    """Get behaviour of a word---see corpkit.corpus.Corpus.configurations() for docs

    A set of named dependency queries is built around the searched term and
    passed to interrogator() as its `search` argument.

    :param corpus: passed straight through to interrogator()
    :param search: dict holding the term under 'w' (word) or 'l' (lemma);
                   an optional 'f' value starting with 'part', 'proc' or
                   'mod' restricts the queries to that role
    :param kwargs: forwarded to interrogator()
    :returns: the mapping returned by interrogator(), with the searched
              term's column dropped from every result and totals recomputed
    :raises ValueError: if both, or neither, of 'w' and 'l' are given
    """

    import corpkit
    from dictionaries.wordlists import wordlists
    from dictionaries.roles import roles
    from interrogator import interrogator

    lemma = search.get('l')
    word = search.get('w')

    if lemma and word:
        raise ValueError('Search only for a word or a lemma, not both.')

    if lemma:
        dep_word_or_lemma = 'dl'
        gov_word_or_lemma = 'gl'
        word_or_token = lemma
    elif word:
        dep_word_or_lemma = 'd'
        gov_word_or_lemma = 'g'
        word_or_token = word
    else:
        # without a term none of the queries below can be built
        raise ValueError('Search must contain a word ("w") or a lemma ("l").')

    queries = {
        'participant': {
            'left_participant_in': {
                dep_word_or_lemma: word_or_token,
                'df': r'^.subj.*',
                'f': roles.event},
            'right_participant_in': {
                dep_word_or_lemma: word_or_token,
                'df': r'^[di]obj',
                'f': roles.event},
            'modified_by': {
                'f': r'^amod',
                gov_word_or_lemma: word_or_token},
            'and_or': {
                'f': 'conj:(and|or)',
                'gf': roles.participant,
                gov_word_or_lemma: word_or_token},
        },
        'process': {
            'has_subject': {
                'f': roles.participant1,
                gov_word_or_lemma: word_or_token},
            'has_object': {
                'f': roles.participant2,
                gov_word_or_lemma: word_or_token},
            'modalised_by': {
                'f': r'aux',
                'w': wordlists.modals,
                gov_word_or_lemma: word_or_token},
            'modulated_by': {
                'f': 'advmod',
                'gf': roles.event,
                gov_word_or_lemma: word_or_token},
            'and_or': {
                'f': 'conj:(and|or)',
                'gf': roles.event,
                gov_word_or_lemma: word_or_token},
        },
        'modifier': {
            'modifies': {
                'df': roles.modifier,
                dep_word_or_lemma: word_or_token},
            'modulated_by': {
                'f': 'advmod',
                'gf': roles.modifier,
                gov_word_or_lemma: word_or_token},
            'and_or': {
                'f': 'conj:(and|or)',
                'gf': roles.modifier,
                gov_word_or_lemma: word_or_token},
        },
    }

    role = search.get('f')
    if role:
        # NOTE(review): an 'f' value matching none of the prefixes leaves the
        # three role groups nested -- confirm that is what interrogator() expects
        if role.lower().startswith('part'):
            queries = queries['participant']
        elif role.lower().startswith('proc'):
            queries = queries['process']
        elif role.lower().startswith('mod'):
            queries = queries['modifier']
    else:
        # no role given: merge the queries of all roles into one flat dict
        # (same-named queries overwrite one another)
        newqueries = {}
        for group in queries.values():
            for name, pattern in group.items():
                newqueries[name] = pattern
        queries = newqueries
        # role-independent coordination query; this must be a dict like the
        # other entries (a trailing comma used to turn it into a 1-tuple)
        queries['and_or'] = {'f': 'conj:(and|or)',
                             gov_word_or_lemma: word_or_token}

    kwargs['search'] = queries
    data = interrogator(corpus, **kwargs)
    for k, v in data.items():
        # the searched term itself is not an interesting result
        v.results = v.results.drop(word_or_token, axis = 1, errors = 'ignore')
        v.totals = v.results.sum(axis = 1)
        data[k] = v
    return data
예제 #19
0
파일: corpus.py 프로젝트: xsongx/corpkit
 def concordance(self, *args, **kwargs):
     """Concordance every member of this collection.

     The members are gathered into a list by iterating over `self`, and that
     list is handed to interrogator() with `conc=True` plus any positional
     and keyword arguments supplied by the caller. See
     :func:`~corpkit.corpus.Corpus.concordance` for the accepted arguments.

     :returns: whatever interrogator() returns for the list of members
     """
     from interrogator import interrogator
     members = [member for member in self]
     return interrogator(members, *args, conc=True, **kwargs)
예제 #20
0
def pmultiquery(
    path,
    option="c",
    query="any",
    sort_by="total",
    quicksave=False,
    num_proc="default",
    function_filter=False,
    just_speakers=False,
    root=False,
    note=False,
    print_info=True,
    **kwargs
):
    """Parallel process multiple queries or corpora.

    This function is used by interrogator if:

        a) path is a list of paths
        b) query is a dict of named queries
        c) function_filter is iterable
        d) just speakers == 'each'

    This function needs joblib 0.8.4 or above in order to run properly.

    :param path: a corpus path, or an iterable of paths (one job per path)
    :param option: forwarded to every job; when it starts with 'c' the job
                   results are concatenated, otherwise a dict is returned
    :param query: a query, or a dict of named queries (one job per name)
    :param sort_by: sort order handed to editor() for concatenated results
    :param quicksave: name to save the result under, or False; True is rejected
    :param num_proc: number of processes, or 'default' to choose automatically
    :param function_filter: forwarded to jobs; a dict gives one job per key
    :param just_speakers: speaker names, or 'each' to read them from `path`
    :param root: when truthy, jobs run one after another and no terminal
                 status lines are drawn
    :param note: forwarded unchanged to every job
    :param print_info: accepted but not read by this function
    :param kwargs: extra options forwarded to every job

    :returns: a dict of named results, or the edited concatenated result when
              `option` starts with 'c'. None when no speaker names are found.
    :raises ValueError: if `quicksave` is True rather than a string
    :raises ImportError: if a parallel run is attempted without joblib
    """

    import collections
    import os
    import pandas as pd
    from collections import namedtuple
    from time import strftime, localtime
    from interrogator import interrogator
    from editor import editor
    from other import save_result

    try:
        from joblib import Parallel, delayed
    except ImportError:
        # reported where the parallel run is attempted, instead of
        # surfacing there as a NameError
        Parallel = delayed = None
    import multiprocessing

    def best_num_parallel(num_cores, num_queries):
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        if num_queries <= num_cores:
            return num_queries
        if (num_queries / num_cores) == num_cores:
            return int(num_cores)
        if num_queries % num_cores == 0:
            candidates = [int(num_queries / n) for n in range(2, num_cores) if int(num_queries / n) <= num_cores]
            # max() of an empty list raises ValueError (e.g. 2 cores and
            # 6 queries); fall back to using every core
            return max(candidates) if candidates else num_cores
        import math

        square_root = math.sqrt(num_queries)
        if float(square_root).is_integer() and square_root <= num_queries / num_cores:
            return int(square_root)
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # are we processing multiple queries or corpora?
    # find out optimal number of cores to use.
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False

    denom = 1
    if hasattr(path, "__iter__"):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(path))
        denom = len(path)
    elif hasattr(query, "__iter__"):
        multiple_queries = True
        num_cores = best_num_parallel(num_cores, len(query))
        denom = len(query)
    elif hasattr(function_filter, "__iter__"):
        multiple_option = True
        num_cores = best_num_parallel(num_cores, len(function_filter.keys()))
        denom = len(function_filter.keys())
    elif just_speakers:
        from corpkit.build import get_speaker_names_from_xml_corpus

        multiple_speakers = True
        if just_speakers == "each":
            just_speakers = get_speaker_names_from_xml_corpus(path)
        if len(just_speakers) == 0:
            print("No speaker name data found.")
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)

    if num_proc != "default":
        num_cores = num_proc

    # make sure quicksaves are right type
    if quicksave is True:
        raise ValueError("quicksave must be string when using pmultiquery.")

    # the options that don't change
    d = {
        "option": option,
        #'paralleling': True,
        "function": "interrogator",
        "root": root,
        "note": note,
        "denominator": denom,
    }
    # add kwargs to query
    d.update(kwargs)

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        path = sorted(path)
        for index, p in enumerate(path):
            name = os.path.basename(p)
            a_dict = dict(d)
            a_dict["path"] = p
            a_dict["query"] = query
            a_dict["outname"] = name
            a_dict["just_speakers"] = just_speakers
            a_dict["paralleling"] = index
            a_dict["printstatus"] = False
            ds.append(a_dict)
    elif multiple_queries:
        for index, (name, q) in enumerate(query.items()):
            a_dict = dict(d)
            a_dict["path"] = path
            a_dict["query"] = q
            a_dict["outname"] = name
            a_dict["just_speakers"] = just_speakers
            a_dict["paralleling"] = index
            a_dict["printstatus"] = False
            ds.append(a_dict)
    elif multiple_option:
        for index, (name, q) in enumerate(function_filter.items()):
            a_dict = dict(d)
            a_dict["path"] = path
            a_dict["query"] = query
            a_dict["outname"] = name
            a_dict["just_speakers"] = just_speakers
            a_dict["paralleling"] = index
            a_dict["function_filter"] = q
            a_dict["printstatus"] = False
            ds.append(a_dict)
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            a_dict = dict(d)
            a_dict["path"] = path
            a_dict["query"] = query
            a_dict["outname"] = name
            a_dict["just_speakers"] = [name]
            a_dict["function_filter"] = function_filter
            a_dict["paralleling"] = index
            a_dict["printstatus"] = False
            ds.append(a_dict)

    # every print below is a single-argument call, which behaves the same as
    # the print statement on Python 2 and also parses on Python 3
    time = strftime("%H:%M:%S", localtime())
    if multiple_corpora and not multiple_option:
        print (
            "\n%s: Beginning %d parallel corpus interrogations:\n              %s"
            "\n\n          Query: '%s'"
            "\n          Interrogating corpus ... \n" % (time, num_cores, "\n              ".join(path), query)
        )

    elif multiple_queries:
        print (
            "\n%s: Beginning %d parallel corpus interrogations: %s"
            "\n          Queries: '%s'"
            "\n          Interrogating corpus ... \n"
            % (time, num_cores, os.path.basename(path), "', '".join(query.values()))
        )

    elif multiple_option:
        print (
            "\n%s: Beginning %d parallel corpus interrogations (multiple options): %s"
            "\n\n          Query: '%s'"
            "\n          Interrogating corpus ... \n" % (time, num_cores, os.path.basename(path), query)
        )

    elif multiple_speakers:
        print (
            "\n%s: Beginning %d parallel corpus interrogations: %s"
            "\n\n          Query: '%s'"
            "\n          Interrogating corpus ... \n" % (time, num_cores, os.path.basename(path), query)
        )

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    if not root:
        from blessings import Terminal

        terminal = Terminal()
        print("\n" * (len(ds) - 2))
        # draw one empty progress bar per job, positioned from the bottom of
        # the terminal by the job's index
        for dobj in ds:
            linenum = dobj["paralleling"]
            with terminal.location(0, terminal.height - (linenum + 1)):
                # this is a really bad idea.
                thetime = strftime("%H:%M:%S", localtime())
                print("%s: [                      0%% (%s)                            ]" % (thetime, dobj["outname"]))

        try:
            if Parallel is None:
                raise ImportError(
                    "joblib, the module used for multiprocessing, cannot be found. "
                    "Install with:\n\n        pip install joblib"
                )
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            print("\n\n\n")
        except BaseException:
            # report, then let the original error propagate
            print("Multiprocessing failed.")
            raise
        try:
            res = sorted(res)
        except Exception:
            # results that cannot be ordered are kept in job order
            pass
    else:
        # a truthy root means the jobs are run one after another
        res = []
        for index, job in enumerate(ds):
            job["startnum"] = (100 / denom) * index
            res.append(interrogator(**job))
        try:
            res = sorted(res)
        except Exception:
            pass

    # turn list into dict of results, make query and total branches,
    # save and return
    if not option.startswith("c"):
        out = {}
        for (name, data), job in zip(res, ds):
            for unpicklable in ["note", "root"]:
                job.pop(unpicklable, None)
            if not option.startswith("k"):
                outputnames = collections.namedtuple("interrogation", ["query", "results", "totals"])
                try:
                    stotal = data.sum(axis=1)
                    stotal.name = u"Total"
                except ValueError:
                    stotal = data.sum()
                output = outputnames(job, data, stotal)
            else:
                outputnames = collections.namedtuple("interrogation", ["query", "results"])
                output = outputnames(job, data)
            out[name] = output

        # could be wrong for unstructured corpora?
        if quicksave:
            fullpath = os.path.join("saved_interrogations", quicksave)
            while os.path.isdir(fullpath):
                selection = raw_input(
                    "\nSave error: %s already exists in %s.\n\nType 'o' to overwrite, or enter a new name: "
                    % (quicksave, "saved_interrogations")
                )
                if selection == "o" or selection == "O":
                    import shutil

                    shutil.rmtree(fullpath)
                else:
                    fullpath = os.path.join("saved_interrogations", selection)

            for k, v in out.items():
                save_result(v, k, savedir=fullpath, print_info=False)

            time = strftime("%H:%M:%S", localtime())
            print("\n%s: %d files saved to %s" % (time, len(out.keys()), fullpath))

        time = strftime("%H:%M:%S", localtime())
        print("\n\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n" % (
            time,
            "'\n         '".join(sorted(out.keys())),
        ))

        return out
    # make query and total branch, save, return
    else:
        out = pd.concat(res, axis=1)
        out = editor(out, sort_by=sort_by, print_info=False, keep_stats=False)
        time = strftime("%H:%M:%S", localtime())
        print("\n\n%s: Finished! %d unique results, %d total." % (time, len(out.results.columns), out.totals.sum()))
        if quicksave:
            save_result(out, quicksave)
        return out