Example #1
    def make_progress_bar():
        """generate a progress bar"""

        if simple_tregex_mode:
            total_files = len(list(to_iterate_over.keys()))
        else:
            total_files = sum(len(x) for x in list(to_iterate_over.values()))

        par_args = {
            'printstatus': kwargs.get('printstatus', True),
            'root': root,
            'note': note,
            'quiet': quiet,
            'length': total_files,
            'startnum': kwargs.get('startnum'),
            'denom': kwargs.get('denominator', 1)
        }

        term = None
        if kwargs.get('paralleling', None) is not None:
            from blessings import Terminal
            term = Terminal()
            par_args['terminal'] = term
            par_args['linenum'] = kwargs.get('paralleling')

        if in_notebook:
            par_args['welcome_message'] = welcome_message

        outn = kwargs.get('outname', '')
        if outn:
            outn = getattr(outn, 'name', outn)
            outn = outn + ': '

        tstr = '%s%d/%d' % (outn, current_iter, total_files)
        p = animator(None, None, init=True, tot_string=tstr, **par_args)
        tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
        animator(p, current_iter, tstr, **par_args)
        return p, outn, total_files, par_args
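This helper is a closure over state in the enclosing interrogation (to_iterate_over, current_iter, root, note, quiet, in_notebook and so on), so it cannot be called on its own. Below is a minimal sketch of the animator pattern it wraps, with hypothetical values, assuming corpkit.process.animator accepts the keyword arguments used above:

    from corpkit.process import animator

    # hypothetical settings mirroring par_args above
    par_args = {'printstatus': True, 'root': None, 'note': None, 'quiet': False,
                'length': 10, 'startnum': None, 'denom': 1}
    p = animator(None, None, init=True, tot_string='0/10', **par_args)
    for i in range(10):
        # advance the bar one step per iteration
        animator(p, i, '%d/10' % (i + 1), **par_args)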
Example #2
def download_large_file(proj_path,
                        url,
                        actually_download=True,
                        root=False,
                        **kwargs):
    """
    Download something to proj_path, unless it's CoreNLP, which goes to ~/corenlp
    """
    import os
    import shutil
    import glob
    import zipfile
    from time import localtime, strftime
    from corpkit.textprogressbar import TextProgressBar
    from corpkit.process import animator

    file_name = url.split('/')[-1]
    home = os.path.expanduser("~")
    customdir = kwargs.get('custom_corenlp_dir', False)
    # if it's corenlp, put it in home/corenlp
    # if that dir exists, check if for a zip file
    # if there's a zipfile and it works, move on
    # if there's a zipfile and it's broken, delete it
    if 'stanford' in url:
        if customdir:
            downloaded_dir = customdir
        else:
            downloaded_dir = os.path.join(home, 'corenlp')
        if not os.path.isdir(downloaded_dir):
            os.makedirs(downloaded_dir)
        else:
            poss_zips = glob.glob(
                os.path.join(downloaded_dir, 'stanford-corenlp-full*.zip'))
            if poss_zips:
                fullfile = poss_zips[-1]
                from zipfile import BadZipfile
                try:
                    the_zip_file = zipfile.ZipFile(fullfile)
                    ret = the_zip_file.testzip()
                    if ret is None:
                        return downloaded_dir, fullfile
                    else:
                        os.remove(fullfile)
                except BadZipfile:
                    os.remove(fullfile)
            #else:
            #    shutil.rmtree(downloaded_dir)
    else:
        downloaded_dir = os.path.join(proj_path, 'temp')
        try:
            os.makedirs(downloaded_dir)
        except OSError:
            pass
    fullfile = os.path.join(downloaded_dir, file_name)

    if actually_download:
        import __main__ as main
        if not root and not hasattr(main, '__file__'):
            txt = 'CoreNLP not found. Download latest version (%s)? (y/n) ' % url

            selection = INPUTFUNC(txt)

            if 'n' in selection.lower():
                return None, None
        try:
            import requests
            # NOTE the stream=True parameter
            r = requests.get(url, stream=True, verify=False)
            file_size = int(r.headers['content-length'])
            file_size_dl = 0
            block_sz = 8192
            showlength = file_size / block_sz
            thetime = strftime("%H:%M:%S", localtime())
            print('\n%s: Downloading ... \n' % thetime)
            par_args = {
                'printstatus': kwargs.get('printstatus', True),
                'length': showlength
            }
            if not root:
                tstr = '%d/%d' % ((file_size_dl + 1) / block_sz, showlength)
                p = animator(None,
                             None,
                             init=True,
                             tot_string=tstr,
                             **par_args)
                animator(p, file_size_dl + 1, tstr)

            with open(fullfile, 'wb') as f:
                for chunk in r.iter_content(chunk_size=block_sz):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
                        file_size_dl += len(chunk)
                        #print file_size_dl * 100.0 / file_size
                        if kwargs.get('note'):
                            kwargs['note'].progvar.set(file_size_dl * 100.0 /
                                                       int(file_size))
                        else:
                            tstr = '%d/%d' % (file_size_dl / block_sz,
                                              showlength)
                            animator(p, file_size_dl / block_sz, tstr,
                                     **par_args)
                        if root:
                            root.update()
        except Exception as err:
            import traceback
            print(traceback.format_exc())
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Download failed' % thetime)
            try:
                f.close()
            except:
                pass
            if root:
                root.update()
            return None, None

        if kwargs.get('note'):
            kwargs['note'].progvar.set(100)
        else:
            p.animate(int(file_size))
        thetime = strftime("%H:%M:%S", localtime())
        print('\n%s: Downloaded successfully.' % thetime)
        try:
            f.close()
        except:
            pass
    return downloaded_dir, fullfile
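A hedged usage sketch: the project path and URL below are hypothetical, and actually_download=False only resolves the target directory and file path without fetching anything.

    url = 'http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip'
    downloaded_dir, fullfile = download_large_file('/path/to/project', url,
                                                   actually_download=False)
    print(downloaded_dir)  # ~/corenlp for CoreNLP URLs, <proj_path>/temp otherwise
    print(fullfile)        # where the archive would be written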
Example #3
def interrogator(corpus, 
    search='w', 
    query='any',
    show='w',
    exclude=False,
    excludemode='any',
    searchmode='all',
    case_sensitive=False,
    save=False,
    subcorpora=False,
    just_metadata=False,
    skip_metadata=False,
    preserve_case=False,
    lemmatag=False,
    files_as_subcorpora=False,
    only_unique=False,
    only_format_match=True,
    multiprocess=False,
    spelling=False,
    regex_nonword_filter=r'[A-Za-z0-9]',
    gramsize=1,
    conc=False,
    maxconc=9999,
    window=None,
    no_closed=False,
    no_punct=True,
    discard=False,
    **kwargs):
    """
    Interrogate corpus, corpora, subcorpus and file objects.
    See corpkit.interrogation.interrogate() for docstring
    """
    
    conc = kwargs.get('do_concordancing', conc)
    quiet = kwargs.get('quiet', False)
    coref = kwargs.pop('coref', False)
    show_conc_metadata = kwargs.pop('show_conc_metadata', False)
    fsi_index = kwargs.pop('fsi_index', True)
    dep_type = kwargs.pop('dep_type', 'collapsed-ccprocessed-dependencies')

    nosubmode = subcorpora is None
    #todo: temporary
    #if getattr(corpus, '_dlist', False):
    #    subcorpora = 'file'

    # store kwargs and locs
    locs = locals().copy()
    locs.update(kwargs)
    locs.pop('kwargs', None)

    import codecs
    import signal
    import os
    from time import localtime, strftime
    from collections import Counter

    import pandas as pd
    from pandas import DataFrame, Series

    from corpkit.interrogation import Interrogation, Interrodict
    from corpkit.corpus import Datalist, Corpora, Corpus, File, Subcorpus
    from corpkit.process import (tregex_engine, get_deps, unsplitter, sanitise_dict, 
                                 animator, filtermaker, fix_search,
                                 pat_format, auto_usecols, format_tregex,
                                 make_conc_lines_from_whole_mid)
    from corpkit.other import as_regex
    from corpkit.dictionaries.process_types import Wordlist
    from corpkit.build import check_jdk
    from corpkit.conll import pipeline
    from corpkit.process import delete_files_and_subcorpora
    
    have_java = check_jdk()

    # remake corpus without bad files and folders 
    corpus, skip_metadata, just_metadata = delete_files_and_subcorpora(corpus, skip_metadata, just_metadata)

    # so you can do corpus.interrogate('features/postags/wordclasses/lexicon')
    if search == 'features':
        search = 'v'
        query = 'any'
    if search in ['postags', 'wordclasses']:
        query = 'any'
        preserve_case = True
        show = 'p' if search == 'postags' else 'x'
        # use tregex if simple because it's faster
        # but use dependencies otherwise
        search = 't' if not subcorpora and not just_metadata and not skip_metadata and have_java else {'w': 'any'}
    if search == 'lexicon':
        search = 't' if not subcorpora and not just_metadata and not skip_metadata and have_java else {'w': 'any'}
        query = 'any'
        show = ['w']

    if not kwargs.get('cql') and isinstance(search, STRINGTYPE) and len(search) > 3:
        raise ValueError('search argument not recognised.')

    import re
    if regex_nonword_filter:
        is_a_word = re.compile(regex_nonword_filter)
    else:
        is_a_word = re.compile(r'.*')

    from traitlets import TraitError

    # convert cql-style queries---pop for the sake of multiprocessing
    cql = kwargs.pop('cql', None)
    if cql:
        from corpkit.cql import to_corpkit
        search, exclude = to_corpkit(search)

    def signal_handler(signal, _):
        """
        Allow pausing and restarting when not in GUI
        """
        if root:
            return  
        import signal
        import sys
        from time import localtime, strftime
        signal.signal(signal.SIGINT, original_sigint)
        thetime = strftime("%H:%M:%S", localtime())
        INPUTFUNC('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime)
        time = strftime("%H:%M:%S", localtime())
        print('%s: Interrogation resumed.\n' % time)
        signal.signal(signal.SIGINT, signal_handler)

    def add_adj_for_ngram(show, gramsize):
        """
        If there's a gramsize of more than 1, remake show
        for ngramming
        """
        if gramsize == 1:
            return show
        out = []
        for i in show:
            out.append(i)
        for i in range(1, gramsize):
            for bit in show:
                out.append('+%d%s' % (i, bit))
        return out

    def fix_show_bit(show_bit):
        """
        Take a single search/show_bit type, return match
        """
        ends = ['w', 'l', 'i', 'n', 'f', 'p', 'x', 's', 'a', 'e', 'c']
        starts = ['d', 'g', 'm', 'b', 'h', '+', '-', 'r', 'c']
        show_bit = show_bit.lstrip('n')
        show_bit = show_bit.lstrip('b')
        show_bit = list(show_bit)
        if show_bit[-1] not in ends:
            show_bit.append('w')
        if show_bit[0] not in starts:
            show_bit.insert(0, 'm')
        return ''.join(show_bit)

    def fix_show(show, gramsize):
        """
        Lowercase anything in show and turn into list
        """
        if isinstance(show, list):
            show = [i.lower() for i in show]
        elif isinstance(show, STRINGTYPE):
            show = show.lower()
            show = [show]
        show = [fix_show_bit(i) for i in show]
        return add_adj_for_ngram(show, gramsize)

    def is_multiquery(corpus, search, query, outname):
        """
        Determine if multiprocessing is needed/possible, and
        do some retyping if need be as well
        """
        is_mul = False
        from collections import OrderedDict
        from corpkit.dictionaries.process_types import Wordlist
        
        if isinstance(query, Wordlist):
            query = list(query)

        if subcorpora and multiprocess:
            is_mul = 'subcorpora'

        if isinstance(subcorpora, (list, tuple)):
            is_mul = 'subcorpora'

        if isinstance(query, (dict, OrderedDict)):
            is_mul = 'namedqueriessingle'
        
        if isinstance(search, dict):
            if all(isinstance(i, dict) for i in list(search.values())):
                is_mul = 'namedqueriesmultiple'
        return is_mul, corpus, search, query

    def ispunct(s):
        import string
        return all(c in string.punctuation for c in s)

    def uniquify(conc_lines):
        """get unique concordance lines"""
        from collections import OrderedDict
        unique_lines = []
        checking = []
        for index, (_, speakr, start, middle, end) in enumerate(conc_lines):
            joined = ' '.join([speakr, start, 'MIDDLEHERE:', middle, ':MIDDLEHERE', end])
            if joined not in checking:
                unique_lines.append(conc_lines[index])
            checking.append(joined)
        return unique_lines

    def compiler(pattern):
        """
        Compile regex or fail gracefully
        """
        if hasattr(pattern, 'pattern'):
            return pattern
        import re
        try:
            if case_sensitive:
                comped = re.compile(pattern)
            else:
                comped = re.compile(pattern, re.IGNORECASE)
            return comped
        except:
            import traceback
            import sys
            from time import localtime, strftime
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lst = traceback.format_exception(exc_type, exc_value, exc_traceback)
            error_message = lst[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Query %s' % (thetime, error_message))
            if root:
                return 'Bad query'
            else:
                raise ValueError('%s: Query %s' % (thetime, error_message))

    def determine_search_func(show):
        """Figure out what search function we're using"""

        simple_tregex_mode = False
        statsmode = False
        tree_to_text = False
        search_trees = False
            
        simp_crit = all(not i for i in [kwargs.get('tgrep'),
                                        files_as_subcorpora,
                                        subcorpora,
                                        just_metadata,
                                        skip_metadata])

        if search.get('t') and simp_crit:
            if have_java:
                simple_tregex_mode = True
            else:
                search_trees = 'tgrep'
            optiontext = 'Searching parse trees'

        elif datatype == 'conll':
        
            if any(i.endswith('t') for i in search.keys()):
                if have_java and not kwargs.get('tgrep'):
                    search_trees = 'tregex'
                else:
                    search_trees = 'tgrep'
                optiontext = 'Searching parse trees'
            elif any(i.endswith('v') for i in search.keys()):
                # either of these searchers now seems to work
                #seacher = get_stats_conll
                statsmode = True
                optiontext = 'General statistics'
            elif any(i.endswith('r') for i in search.keys()):
                optiontext = 'Distance from root'
            else:
                optiontext = 'Querying CONLL data'

        return optiontext, simple_tregex_mode, statsmode, tree_to_text, search_trees

    def get_tregex_values(show):
        """If using Tregex, set appropriate values

        - Check for valid query
        - Make 'any' query
        - Make list query
        """

        translated_option = 't'
        if isinstance(search['t'], Wordlist):
            search['t'] = list(search['t'])
        q = tregex_engine(corpus=False,
                          query=search.get('t'),
                          options=['-t'],
                          check_query=True,
                          root=root,
                          preserve_case=preserve_case
                         )

        # so many of these bad fixing loops!
        nshow = []
        for i in show:
            if i == 'm':
                nshow.append('w')
            else:
                nshow.append(i.lstrip('m'))
        show = nshow

        if q is False:
            return 'Bad query', None

        if isinstance(search['t'], list):
            regex = as_regex(search['t'], boundaries='line', case_sensitive=case_sensitive)
        else:
            regex = ''

        # listquery, anyquery, translated_option
        treg_dict = {'p': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     'pl': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     'x': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     't': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'o'],
                     'w': [r'/%s/ !< __' % regex, r'/.?[A-Za-z0-9].?/ !< __', 't'],
                     'c': [r'/%s/ !< __'  % regex, r'/.?[A-Za-z0-9].?/ !< __', 'C'],
                     'l': [r'/%s/ !< __'  % regex, r'/.?[A-Za-z0-9].?/ !< __', 't'],
                     'u': [r'/%s/ !< __'  % regex, r'/.?[A-Za-z0-9].?/ !< __', 'v']
                    }

        newshow = []

        listq, anyq, translated_option = treg_dict.get(show[0][-1].lower())
        newshow.append(translated_option)
        for item in show[1:]:
            _, _, noption = treg_dict.get(item.lower())
            newshow.append(noption)

        if isinstance(search['t'], list):
            search['t'] = listq
        elif search['t'] == 'any':   
            search['t'] = anyq
        return search['t'], newshow

    def correct_spelling(a_string):
        """correct spelling within a string"""
        if not spelling:
            return a_string
        from corpkit.dictionaries.word_transforms import usa_convert
        if spelling.lower() == 'uk':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        bits = a_string.split('/')
        for index, i in enumerate(bits):
            converted = usa_convert.get(i.lower(), i)
            if i.islower() or preserve_case is False:
                converted = converted.lower()
            elif i.isupper() and preserve_case:
                converted = converted.upper()
            elif i.istitle() and preserve_case:
                converted = converted.title()
            bits[index] = converted
        r = '/'.join(bits)
        return r

    def make_search_iterable(corpus):
        """determine how to structure the corpus for interrogation"""
        # skip file definitions if they are not needed
        if getattr(corpus, '_dlist', False):

            return {(i.name, i.path): [i] for i in list(corpus.files)}
            #return {('Sample', 'Sample'): list(corpus.files)}

        if simple_tregex_mode:
            if corpus.level in ['s', 'f', 'd']:
                return {(corpus.name, corpus.path): False}
            else:
                return {(os.path.basename(i), os.path.join(corpus.path, i)): False
                    for i in os.listdir(corpus.path)
                    if os.path.isdir(os.path.join(corpus.path, i))}

        if isinstance(corpus, Datalist):
            to_iterate_over = {}
            # it could be files or subcorpus objects
            if corpus[0].level in ['s', 'd']:
                if files_as_subcorpora:
                    for subc in corpus:
                        for f in subc.files:
                            to_iterate_over[(f.name, f.path)] = [f]
                else:
                    for subc in corpus:
                        to_iterate_over[(subc.name, subc.path)] = subc.files
            elif corpus[0].level == 'f':
                for f in corpus:
                    to_iterate_over[(f.name, f.path)] = [f]
        elif corpus.singlefile:
            to_iterate_over = {(corpus.name, corpus.path): [corpus]}
        elif not hasattr(corpus, 'subcorpora') or not corpus.subcorpora:
            # just files in a directory
            if files_as_subcorpora:
                to_iterate_over = {}
                for f in corpus.files:
                    to_iterate_over[(f.name, f.path)] = [f]
            else:
                to_iterate_over = {(corpus.name, corpus.path): corpus.files}
        else:
            to_iterate_over = {}
            if files_as_subcorpora:
                # don't know if possible: has subcorpora but also .files
                if hasattr(corpus, 'files') and corpus.files is not None:
                    for f in corpus.files:
                        to_iterate_over[(f.name, f.path)] = [f]
                # has subcorpora with files in those
                elif hasattr(corpus, 'files') and corpus.files is None:
                    for subc in corpus.subcorpora:
                        for f in subc.files:
                            to_iterate_over[(f.name, f.path)] = [f]
            else:
                if corpus[0].level == 's':
                    for subcorpus in corpus:
                        to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files
                elif corpus[0].level == 'f':
                    for f in corpus:
                        to_iterate_over[(f.name, f.path)] = [f]
                else:
                    for subcorpus in corpus.subcorpora:
                        to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files
        return to_iterate_over

    def welcome_printer(return_it=False):
        """Print welcome message"""
        if no_conc:
            message = 'Interrogating'
        else:
            message = 'Interrogating and concordancing'
        if only_conc:
            message = 'Concordancing'
        if kwargs.get('printstatus', True):
            thetime = strftime("%H:%M:%S", localtime())
            from corpkit.process import dictformat
            sformat = dictformat(search)
            welcome = ('\n%s: %s %s ...\n          %s\n          ' \
                        'Query: %s\n          %s corpus ... \n' % \
                      (thetime, message, cname, optiontext, sformat, message))
            if return_it:
                return welcome
            else:
                print(welcome)

    def goodbye_printer(return_it=False, only_conc=False):
        """Say goodbye before exiting"""
        if not kwargs.get('printstatus', True):
            return
        thetime = strftime("%H:%M:%S", localtime())
        if only_conc:
            finalstring = '\n\n%s: Concordancing finished! %s results.' % (thetime, format(len(conc_df), ','))
        else:
            finalstring = '\n\n%s: Interrogation finished!' % thetime
            if countmode:
                finalstring += ' %s matches.' % format(tot, ',')
            else:
                finalstring += ' %s unique results, %s total occurrences.' % (format(numentries, ','), format(total_total, ','))
        if return_it:
            return finalstring
        else:
            print(finalstring)

    def get_conc_colnames(corpus,
                          fsi_index=False,
                          simple_tregex_mode=False):
    
        fields = []
        base = 'c f s l m r'
        
        if simple_tregex_mode:
            base = base.replace('f ', '')

        if fsi_index and not simple_tregex_mode:
            base = 'i ' + base
        
        if PYTHON_VERSION == 2:
            base = base.encode('utf-8').split()
        else:
            base = base.split() 

        if show_conc_metadata:
            from corpkit.build import get_all_metadata_fields
            meta = get_all_metadata_fields(corpus.path)

            if isinstance(show_conc_metadata, list):
                meta = [i for i in meta if i in show_conc_metadata]
            #elif show_conc_metadata is True:
            #    pass
            for i in sorted(meta):
                if i in ['speaker', 'sent_id', 'parse']:
                    continue
                if PYTHON_VERSION == 2:
                    base.append(i.encode('utf-8'))
                else:
                    base.append(i)
        return base

    def make_conc_obj_from_conclines(conc_results, fsi_index=False):
        """
        Turn conclines into DataFrame
        """
        from corpkit.interrogation import Concordance
        #fsi_place = 2 if fsi_index else 0

        all_conc_lines = []
        for sc_name, resu in sorted(conc_results.items()):
            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            #make into series
            for lin in unique_results:
                #spkr = str(spkr, errors = 'ignore')
                #if not subcorpora:
                #    lin[fsi_place] = lin[fsi_place]
                #lin.insert(fsi_place, sc_name)

                if len(lin) < len(conc_col_names):
                    diff = len(conc_col_names) - len(lin)
                    lin.extend(['none'] * diff)

                all_conc_lines.append(Series(lin, index=conc_col_names))

        try:
            conc_df = pd.concat(all_conc_lines, axis=1).T
        except ValueError:
            return
        
        if all(x == '' for x in list(conc_df['s'].values)) or \
           all(x == 'none' for x in list(conc_df['s'].values)):
            conc_df.drop('s', axis=1, inplace=True)

        locs['corpus'] = corpus.name

        if maxconc:
            conc_df = Concordance(conc_df[:maxconc])
        else:
            conc_df = Concordance(conc_df)
        try:
            conc_df.query = locs
        except AttributeError:
            pass
        return conc_df

    def lowercase_result(res):
        """      
        Take any result and do spelling/lowercasing if need be

        todo: remove lowercase and change name
        """
        if not res or statsmode:
            return res
        # this is likely broken, but spelling in interrogate is deprecated anyway
        if spelling:
            res = [correct_spelling(r) for r in res]
        return res

    def postprocess_concline(line, fsi_index=False, conc=False):
        # todo: are these right?
        if not conc:
            return line
        subc, star, en = 0, 2, 5
        if fsi_index:
            subc, star, en = 2, 4, 7
        if not preserve_case:
            line[star:en] = [str(x).lower() for x in line[star:en]]
        if spelling:
            line[star:en] = [correct_spelling(str(b)) for b in line[star:en]]
        return line

    def make_progress_bar():
        """generate a progress bar"""

        if simple_tregex_mode:
            total_files = len(list(to_iterate_over.keys()))
        else:
            total_files = sum(len(x) for x in list(to_iterate_over.values()))

        par_args = {'printstatus': kwargs.get('printstatus', True),
                    'root': root, 
                    'note': note,
                    'quiet': quiet,
                    'length': total_files,
                    'startnum': kwargs.get('startnum'),
                    'denom': kwargs.get('denominator', 1)}

        term = None
        if kwargs.get('paralleling', None) is not None:
            from blessings import Terminal
            term = Terminal()
            par_args['terminal'] = term
            par_args['linenum'] = kwargs.get('paralleling')

        if in_notebook:
            par_args['welcome_message'] = welcome_message

        outn = kwargs.get('outname', '')
        if outn:
            outn = getattr(outn, 'name', outn)
            outn = outn + ': '

        tstr = '%s%d/%d' % (outn, current_iter, total_files)
        p = animator(None, None, init=True, tot_string=tstr, **par_args)
        tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
        animator(p, current_iter, tstr, **par_args)
        return p, outn, total_files, par_args

    # find out if using gui
    root = kwargs.get('root')
    note = kwargs.get('note')
    language_model = kwargs.get('language_model')

    # set up pause method
    original_sigint = signal.getsignal(signal.SIGINT)
    if kwargs.get('paralleling', None) is None:
        if not root:
            original_sigint = signal.getsignal(signal.SIGINT)
            signal.signal(signal.SIGINT, signal_handler)

    # find out about concordancing
    only_conc = False
    no_conc = False
    if conc is False:
        no_conc = True
    if isinstance(conc, str) and conc.lower() == 'only':
        only_conc = True
        no_conc = False
    numconc = 0

    # wipe non essential class attributes to not bloat query attrib
    if isinstance(corpus, Corpus):
        import copy
        corpus = copy.copy(corpus)
        for k, v in corpus.__dict__.items():
            if isinstance(v, (Interrogation, Interrodict)):
                corpus.__dict__.pop(k, None)

    # convert path to corpus object
    if not isinstance(corpus, (Corpus, Corpora, Subcorpus, File, Datalist)):
        if not multiprocess and not kwargs.get('outname'):
            corpus = Corpus(corpus, print_info=False)

    # figure out how the user has entered the query and show, and normalise
    from corpkit.process import searchfixer
    search = searchfixer(search, query)
    show = fix_show(show, gramsize)
    locs['show'] = show

    # instantiate lemmatiser if need be
    lem_instance = False
    if any(i.endswith('l') for i in show) and isinstance(search, dict) and search.get('t'):
        from nltk.stem.wordnet import WordNetLemmatizer
        lem_instance = WordNetLemmatizer()

    # do multiprocessing if need be
    im, corpus, search, query = is_multiquery(corpus, search, query,
                                              kwargs.get('outname', False))

    # figure out if we can multiprocess the corpus
    if hasattr(corpus, '__iter__') and im:
        corpus = Corpus(corpus, print_info=False)
    if hasattr(corpus, '__iter__') and not im:
        im = 'datalist'
    if isinstance(corpus, Corpora):
        im = 'multiplecorpora'

    # split corpus if the user wants multiprocessing but no other iterable
    if not im and multiprocess:
        im = 'datalist'
        if getattr(corpus, 'subcorpora', False):
            corpus = corpus[:]
        else:
            corpus = corpus.files

    search = fix_search(search, case_sensitive=case_sensitive, root=root)
    exclude = fix_search(exclude, case_sensitive=case_sensitive, root=root)

    # if it's already been through pmultiquery, don't do it again
    locs['search'] = search
    locs['exclude'] = exclude
    locs['query'] = query
    locs['corpus'] = corpus
    locs['multiprocess'] = multiprocess
    locs['print_info'] = kwargs.get('printstatus', True)
    locs['multiple'] = im
    locs['subcorpora'] = subcorpora
    locs['nosubmode'] = nosubmode

    # send to multiprocess function
    if im:
        signal.signal(signal.SIGINT, original_sigint)
        from corpkit.multiprocess import pmultiquery
        return pmultiquery(**locs)

    # get corpus metadata
    cname = corpus.name
    if isinstance(save, STRINGTYPE):
        savename = corpus.name + '-' + save
    if save is True:
        raise ValueError('save must be str, not bool.')


    datatype = getattr(corpus, 'datatype', 'conll')
    singlefile = getattr(corpus, 'singlefile', False)
    level = getattr(corpus, 'level', 'c')
        
    # store all results in here
    from collections import defaultdict
    results = defaultdict(Counter)
    count_results = defaultdict(list)
    conc_results = defaultdict(list)

    # check if just counting, turn off conc if so
    countmode = 'c' in show or 'mc' in show
    if countmode:
        no_conc = True
        only_conc = False
    # where we are at in interrogation
    current_iter = 0

    # multiprocessing progress bar
    denom = kwargs.get('denominator', 1)
    startnum = kwargs.get('startnum', 0)

    # Determine the search function to be used #
    optiontext, simple_tregex_mode, statsmode, tree_to_text, search_trees = determine_search_func(show)
    
    # no conc for statsmode
    if statsmode:
        no_conc = True
        only_conc = False
        conc = False

    # Set some Tregex-related values
    translated_option = False
    if search.get('t'):
        query, translated_option = get_tregex_values(show)
        if query == 'Bad query' and translated_option is None:
            if root:
                return 'Bad query'
            else:
                return
    # more tregex options
    if tree_to_text:
        treg_q = r'ROOT << __'
        op = ['-o', '-t', '-w', '-f']
    elif simple_tregex_mode:
        treg_q = search['t']
        op = ['-%s' % i for i in translated_option] + ['-o', '-f']

    # make iterable object for corpus interrogation
    to_iterate_over = make_search_iterable(corpus)

    try:
        from ipywidgets import IntProgress
        _ = IntProgress(min=0, max=10, value=1)
        in_notebook = True
    except TraitError:
        in_notebook = False
    except ImportError:
        in_notebook = False
    # caused in newest ipython
    except AttributeError:
        in_notebook = False

    lemtag = False
    if search.get('t'):
        from corpkit.process import gettag
        lemtag = gettag(search.get('t'), lemmatag)

    usecols = auto_usecols(search, exclude, show, kwargs.pop('usecols', None), coref=coref)

    # print welcome message
    welcome_message = welcome_printer(return_it=in_notebook)

    # create a progress bar
    p, outn, total_files, par_args = make_progress_bar()

    if conc:
        conc_col_names = get_conc_colnames(corpus,
                                           fsi_index=fsi_index,
                                           simple_tregex_mode=False)

 

    # Iterate over data, doing interrogations
    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):
        if nosubmode:
            subcorpus_name = 'Total'

        # results for subcorpus go here
        #conc_results[subcorpus_name] = []
        #count_results[subcorpus_name] = []
        #results[subcorpus_name] = Counter()

        # get either everything (tree_to_text) or the search['t'] query
        if tree_to_text or simple_tregex_mode:
            result = tregex_engine(query=treg_q,
                                   options=op,
                                   corpus=subcorpus_path,
                                   root=root,
                                   preserve_case=preserve_case)

            # format search results with slashes etc
            if not countmode and not tree_to_text:
                result = format_tregex(result, show, translated_option=translated_option,
                            exclude=exclude, excludemode=excludemode, lemtag=lemtag,
                            lem_instance=lem_instance, countmode=countmode, speaker_data=False)

            # if concordancing, do the query again with 'whole' sent and fname
            if not no_conc:
                ops = ['-w'] + op
                #ops = [i for i in ops if i != '-n']
                whole_result = tregex_engine(query=search['t'],
                                             options=ops,
                                             corpus=subcorpus_path,
                                             root=root,
                                             preserve_case=preserve_case
                                            )

                # format match too depending on option
                if not only_format_match:
                    whole_result = format_tregex(whole_result, show, translated_option=translated_option,
                                                 exclude=exclude, excludemode=excludemode, lemtag=lemtag,
                                                 lem_instance=lem_instance, countmode=countmode,
                                                 speaker_data=False, whole=True)

                # make conc lines from conc results
                conc_result = make_conc_lines_from_whole_mid(whole_result, result, show=show)
                for lin in conc_result:
                    if maxconc is False or numconc < maxconc:
                        conc_results[subcorpus_name].append(lin)
                    numconc += 1

            # add matches to ongoing counts
            if countmode:
                count_results[subcorpus_name] += [result]            
            else:
                if result:
                    results[subcorpus_name] += Counter([i[-1] for i in result])
                else:
                    results[subcorpus_name] += Counter()

            # update progress bar
            current_iter += 1
            tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)
            continue

        # todo: move this
        kwargs.pop('by_metadata', None)
        
        # conll querying goes by file, not subcorpus
        for f in files:
            slow_treg_speaker_guess = kwargs.get('outname', '') if kwargs.get('multispeaker') else ''
            filepath, corefs = f.path, coref
            res, conc_res = pipeline(filepath, search=search, show=show,
                                     dep_type=dep_type,
                                     exclude=exclude,
                                     excludemode=excludemode,
                                     searchmode=searchmode,
                                     case_sensitive=case_sensitive,
                                     conc=conc,
                                     only_format_match=only_format_match,
                                     speaker=slow_treg_speaker_guess,
                                     gramsize=gramsize,
                                     no_punct=no_punct,
                                     no_closed=no_closed,
                                     window=window,
                                     filename=f.path,
                                     coref=corefs,
                                     countmode=countmode,
                                     maxconc=(maxconc, numconc),
                                     is_a_word=is_a_word,
                                     by_metadata=subcorpora,
                                     show_conc_metadata=show_conc_metadata,
                                     just_metadata=just_metadata,
                                     skip_metadata=skip_metadata,
                                     fsi_index=fsi_index,
                                     category=subcorpus_name,
                                     translated_option=translated_option,
                                     statsmode=statsmode,
                                     preserve_case=preserve_case,
                                     usecols=usecols,
                                     search_trees=search_trees,
                                     lem_instance=lem_instance,
                                     lemtag=lemtag,
                                     **kwargs)

            if res is None and conc_res is None:
                current_iter += 1
                tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                animator(p, current_iter, tstr, **par_args)
                continue

            # deal with symbolic structures---that is, rather than adding
            # results by subcorpora, add them by metadata value
            # todo: sorting?
            if subcorpora:
                for (k, v), concl in zip(res.items(), conc_res.values()):                            
                    v = lowercase_result(v)
                    results[k] += Counter(v)
                    for line in concl:
                        if maxconc is False or numconc < maxconc:
                            line = postprocess_concline(line,
                                fsi_index=fsi_index, conc=conc)
                            conc_results[k].append(line)
                            numconc += 1
                
                current_iter += 1
                tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                animator(p, current_iter, tstr, **par_args)
                continue

            # garbage collection needed?
            sents = None
            corefs = None
                
            if res == 'Bad query':
                return 'Bad query'

            if countmode:
                count_results[subcorpus_name] += [res]

            else:
                # add filename and do lowercasing for conc
                if not no_conc:
                    for line in conc_res:
                        line = postprocess_concline(line,
                            fsi_index=fsi_index, conc=conc)
                        if maxconc is False or numconc < maxconc:
                            conc_results[subcorpus_name].append(line)
                            numconc += 1

                # do lowercasing and spelling
                if not only_conc:
                    res = lowercase_result(res)
                    # discard removes low results, helping with 
                    # curse of dimensionality
                    countres = Counter(res)
                    if isinstance(discard, float):
                        nkeep = len(countres) - len(countres) * discard
                        countres = Counter({k: v for i, (k, v) in enumerate(countres.most_common()) if i <= nkeep})
                    elif isinstance(discard, int):
                        countres = Counter({k: v for k, v in countres.most_common() if v >= discard})
                    results[subcorpus_name] += countres
                    #else:
                    #results[subcorpus_name] += res

            # update progress bar
            current_iter += 1
            tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)

    # Get concordances into DataFrame, return if just conc
    if not no_conc:
        # fail on this line with typeerror if no results?
        conc_df = make_conc_obj_from_conclines(conc_results, fsi_index=fsi_index)
        if only_conc and conc_df is None:
            return
        elif only_conc:
            locs = sanitise_dict(locs)
            try:
                conc_df.query = locs
            except AttributeError:
                return conc_df
            if save and not kwargs.get('outname'):
                if conc_df is not None:
                    conc_df.save(savename)
            goodbye_printer(only_conc=True)
            if not root:
                signal.signal(signal.SIGINT, original_sigint)            
            return conc_df
    else:
        conc_df = None

    # Get interrogation into DataFrame
    if countmode:
        df = Series({k: sum(v) for k, v in sorted(count_results.items())})
        tot = df.sum()
    else:
        the_big_dict = {}
        unique_results = set(item for sublist in list(results.values()) for item in sublist)
        sortres = sorted(results.items(), key=lambda x: x[0])
        for word in unique_results:
            the_big_dict[word] = [subcorp_result[word] for _, subcorp_result in sortres]
        # turn master dict into dataframe, sorted
        df = DataFrame(the_big_dict, index=sorted(results.keys()))

        # for ngrams, remove hapaxes
        #if show_ngram or show_collocates:
        #    if not language_model:
        #        df = df[[i for i in list(df.columns) if df[i].sum() > 1]]

        numentries = len(df.columns)
        tot = df.sum(axis=1)
        total_total = df.sum().sum()

    # turn df into series if all conditions met
    conds = [countmode,
             files_as_subcorpora,
             subcorpora,
             kwargs.get('df1_always_df', False)]
    anyxs = [level == 's',
             singlefile,
             nosubmode]
    if all(not x for x in conds) and any(x for x in anyxs):
        df = Series(df.iloc[0])
        df.sort_values(ascending=False, inplace=True)
        tot = df.sum()
        numentries = len(df.index)
        total_total = tot

    # turn data into DF for GUI if need be
    if isinstance(df, Series) and kwargs.get('df1_always_df', False):
        total_total = df.sum()
        df = DataFrame(df)
        tot = Series(total_total, index=['Total'])

    # if we're doing files as subcorpora,  we can remove the extension etc
    if isinstance(df, DataFrame) and files_as_subcorpora:
        cname = corpus.name.replace('-stripped', '').replace('-parsed', '')
        edits = [(r'(-[0-9][0-9][0-9])?\.txt\.conllu?', ''),
                 (r'-%s(-stripped)?(-parsed)?' % cname, '')]
        from corpkit.editor import editor
        df = editor(df, replace_subcorpus_names=edits).results
        tot = df.sum(axis=1)
        total_total = df.sum().sum()

    if conc_df is not None and conc_df is not False:
        # removed 'f' from here for now
        for col in ['c']:
            for pat in ['.txt', '.conll', '.conllu']:
                conc_df[col] = conc_df[col].str.replace(pat, '')
            conc_df[col] = conc_df[col].str.replace(r'-[0-9][0-9][0-9]$', '')

        #df.index = df.index.str.replace('w', 'this')

    # make interrogation object
    locs['corpus'] = corpus.path
    locs = sanitise_dict(locs)
    if nosubmode and isinstance(df, pd.DataFrame):
        df = df.sum()
    interro = Interrogation(results=df, totals=tot, query=locs, concordance=conc_df)

    # save it
    if save and not kwargs.get('outname'):
        print('\n')
        interro.save(savename)
    
    goodbye = goodbye_printer(return_it=in_notebook)
    if in_notebook:
        try:
            p.children[2].value = goodbye.replace('\n', '')
        except AttributeError:
            pass
    if not root:
        signal.signal(signal.SIGINT, original_sigint)
    return interro
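A hedged usage sketch; the corpus path and query are hypothetical, and the attribute names follow the Interrogation object constructed above (results, totals, concordance).

    from corpkit.corpus import Corpus

    corpus = Corpus('data/mycorpus-parsed', print_info=False)
    intr = interrogator(corpus, search={'w': r'^run'}, show=['l'],
                        conc=True, maxconc=500)
    print(intr.results.head())      # counts per subcorpus
    print(intr.concordance.head())  # concordance lines, since conc=True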
Example #4
File: build.py  Project: javelir/corpkit
def download_large_file(proj_path, url, actually_download=True, root=False, **kwargs):
    """
    Download something to proj_path
    """
    import corpkit
    import os
    import shutil
    import glob
    import sys
    import zipfile
    from time import localtime, strftime
    from corpkit.textprogressbar import TextProgressBar
    from corpkit.process import animator

    file_name = url.split('/')[-1]
    home = os.path.expanduser("~")
    # if it's corenlp, put it in home/corenlp
    # if that dir exists, check if for a zip file
    # if there's a zipfile and it works, move on
    # if there's a zipfile and it's broken, delete it
    if 'stanford' in url:
        downloaded_dir = os.path.join(home, 'corenlp')
        if not os.path.isdir(downloaded_dir):
            os.makedirs(downloaded_dir)
        else:
            poss_zips = glob.glob(os.path.join(downloaded_dir, 'stanford-corenlp-full*.zip'))
            if poss_zips:
                fullfile = poss_zips[-1]   
                the_zip_file = zipfile.ZipFile(fullfile)
                ret = the_zip_file.testzip()
                if ret is None:
                    return downloaded_dir, fullfile
                else:
                    os.remove(fullfile)
            #else:
            #    shutil.rmtree(downloaded_dir)
    else:
        downloaded_dir = os.path.join(proj_path, 'temp')
        try:
            os.makedirs(downloaded_dir)
        except OSError:
            pass
    fullfile = os.path.join(downloaded_dir, file_name)

    if actually_download:
        if not root:
            txt = 'CoreNLP not found. Download latest version (%s)? (y/n) ' % url
            
            selection = INPUTFUNC(txt)

            if 'n' in selection.lower():
                return None, None
        try:
            import requests
            # NOTE the stream=True parameter
            r = requests.get(url, stream=True, verify=False)
            file_size = int(r.headers['content-length'])
            file_size_dl = 0
            block_sz = 8192
            showlength = file_size / block_sz
            thetime = strftime("%H:%M:%S", localtime())
            print('\n%s: Downloading ... \n' % thetime)
            par_args = {'printstatus': kwargs.get('printstatus', True),
                        'length': showlength}
            if not root:
                tstr = '%d/%d' % ((file_size_dl + 1) / block_sz, showlength)
                p = animator(None, None, init=True, tot_string=tstr, **par_args)
                animator(p, file_size_dl + 1, tstr)

            with open(fullfile, 'wb') as f:
                for chunk in r.iter_content(chunk_size=block_sz): 
                    if chunk: # filter out keep-alive new chunks
                        f.write(chunk)
                        file_size_dl += len(chunk)
                        #print file_size_dl * 100.0 / file_size
                        if kwargs.get('note'):
                            kwargs['note'].progvar.set(file_size_dl * 100.0 / int(file_size))
                        else:
                            tstr = '%d/%d' % (file_size_dl / block_sz, showlength)
                            animator(p, file_size_dl / block_sz, tstr, **par_args)
                        if root:
                            root.update()
        except Exception as err:
            import traceback
            print(traceback.format_exc())
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Download failed' % thetime)
            try:
                f.close()
            except:
                pass
            if root:
                root.update()
            return

        if kwargs.get('note'):  
            kwargs['note'].progvar.set(100)
        else:    
            p.animate(int(file_size))
        thetime = strftime("%H:%M:%S", localtime())
        print('\n%s: Downloaded successfully.' % thetime)
        try:
            f.close()
        except:
            pass
    return downloaded_dir, fullfile
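The reuse check above relies on zipfile.ZipFile.testzip(), which returns None for an intact archive and the name of the first corrupt member otherwise. A minimal standalone sketch of that check (the path is hypothetical):

    import zipfile

    def zip_is_intact(path):
        """Return True if the archive opens and every member passes its CRC check."""
        try:
            with zipfile.ZipFile(path) as zf:
                return zf.testzip() is None
        except zipfile.BadZipfile:
            return False

    print(zip_is_intact('/path/to/stanford-corenlp-full.zip'))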
Example #5
def interrogator(
    corpus,
    search,
    query="any",
    show="w",
    exclude=False,
    excludemode="any",
    searchmode="all",
    dep_type="collapsed-ccprocessed-dependencies",
    case_sensitive=False,
    quicksave=False,
    just_speakers=False,
    preserve_case=False,
    lemmatag=False,
    files_as_subcorpora=False,
    conc=False,
    only_unique=False,
    random=False,
    only_format_match=False,
    multiprocess=False,
    spelling=False,
    regex_nonword_filter=r"[A-Za-z0-9:_]",
    gramsize=2,
    split_contractions=False,
    **kwargs
):
    """interrogate corpus, corpora, subcorpus and file objects

    see corpkit.interrogation.interrogate() for docstring"""
    # store kwargs
    locs = locals()

    from corpkit.interrogation import Interrogation
    from corpkit.process import tregex_engine
    import pandas as pd
    from pandas import DataFrame, Series
    from collections import Counter
    from corpkit.other import as_regex
    from corpkit.process import get_deps
    from time import localtime, strftime

    thetime = strftime("%H:%M:%S", localtime())
    from corpkit.textprogressbar import TextProgressBar
    from corpkit.process import animator
    from corpkit.dictionaries.word_transforms import wordlist, taglemma

    # find out if using gui
    root = kwargs.get("root")
    note = kwargs.get("note")

    # convert path to corpus object
    if type(corpus) == str:
        from corpkit.corpus import Corpus

        corpus = Corpus(corpus)

    # figure out how the user has entered the query and normalise
    from corpkit.process import searchfixer

    search, search_iterable = searchfixer(search, query)

    # for better printing of query, esp during multiprocess
    # can remove if multiprocess printing improved
    if len(search.keys()) == 1:
        query = search.values()[0]

    if "l" in show and search.get("t"):
        from nltk.stem.wordnet import WordNetLemmatizer

        lmtzr = WordNetLemmatizer()

    if type(show) == str:
        show = [show]

    def is_multiquery(corpus, search, query, just_speakers):
        """determine if multiprocessing is needed
        do some retyping if need be as well"""
        im = False
        from collections import OrderedDict

        if hasattr(corpus, "__iter__"):
            im = True
        # so we can do search = 't', query = ['NP', 'VP']:
        if type(query) == list:
            if query != search.values()[0] or len(search.keys()) > 1:
                query = {c.title(): c for c in query}
        if type(query) == dict or type(query) == OrderedDict:
            im = True
        if just_speakers:
            if just_speakers == "each":
                im = True
                just_speakers = ["each"]
            if just_speakers == ["each"]:
                im = True
            if type(just_speakers) == str:
                im = False
                just_speakers = [just_speakers]
            if type(just_speakers) == list:
                if len(just_speakers) > 1:
                    im = True
        if type(search) == dict:
            if all(type(i) == dict for i in search.values()):
                im = True
        return im, corpus, search, query, just_speakers

    def slow_tregex(sents, **dummy_args):
        """do the speaker-specific version of tregex queries"""
        import os
        from corpkit.process import tregex_engine

        # first, put the relevant trees into temp file
        if kwargs.get("outname"):
            to_open = "tmp-%s.txt" % kwargs["outname"]
        else:
            to_open = "tmp.txt"
        to_write = "\n".join([sent._parse_string.strip() for sent in sents if sent.parse_string is not None])
        to_write.encode("utf-8", errors="ignore")
        with open(to_open, "w") as fo:
            fo.write(to_write)
        q = search.values()[0]
        res = tregex_engine(
            query=q, options=["-o", "-%s" % translated_option], corpus=to_open, root=root, preserve_case=True
        )
        if root:
            root.update()
        os.remove(to_open)
        if countmode:
            return len(res)
        else:
            return res

    def get_stats(sents, **dummy_args):
        """get a bunch of frequencies on interpersonal phenomena"""
        import os
        import re
        from collections import Counter

        statsmode_results = Counter()
        # first, put the relevant trees into temp file
        if kwargs.get("outname"):
            to_open = "tmp-%s.txt" % kwargs["outname"]
        else:
            to_open = "tmp.txt"
        with open(to_open, "w") as fo:
            for sent in sents:
                statsmode_results["Sentences"] += 1
                sts = sent.parse_string.rstrip()
                encd = sts.encode("utf-8", errors="ignore") + "\n"
                fo.write(encd)
                deps = get_deps(sent, dep_type)
                numpass = len([x for x in deps.links if x.type.endswith("pass")])
                statsmode_results["Passives"] += numpass
                statsmode_results["Tokens"] += len(sent.tokens)
                words = [w.word for w in sent.tokens if w.word.isalnum()]
                statsmode_results["Words"] += len(words)
                statsmode_results["Characters"] += len("".join(words))

        # count moods via trees          (/\?/ !< __)
        from dictionaries.process_types import processes
        from corpkit.other import as_regex

        tregex_qs = {
            "Imperative": r"ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/",
            "Open interrogative": r"ROOT < SBARQ <<- (/\?/ !< __)",
            "Closed interrogative": r"ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))",
            "Unmodalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))",
            "Modalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))",
            "Open class words": r"/^(NN|JJ|VB|RB)/ < __",
            "Closed class words": r"__ !< __ !> /^(NN|JJ|VB|RB)/",
            "Clauses": r"/^S/ < __",
            "Interrogative": r"ROOT << (/\?/ !< __)",
            "Mental processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.mental, boundaries="w"),
            "Verbal processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.verbal, boundaries="w"),
            "Relational processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)"
            % as_regex(processes.relational, boundaries="w"),
        }

        for name, q in sorted(tregex_qs.items()):
            res = tregex_engine(query=q, options=["-o", "-C"], corpus=to_open, root=root)
            statsmode_results[name] += int(res)
            global numdone
            numdone += 1
            if root:
                root.update()
            else:
                tot_string = str(numdone + 1) + "/" + str(total_files)
                if kwargs.get("outname"):
                    tot_string = "%s: %s" % (kwargs["outname"], tot_string)
                animator(p, numdone, tot_string, **par_args)
            if kwargs.get("note", False):
                kwargs["note"].progvar.set((numdone * 100.0 / total_files / denom) + startnum)
        os.remove(to_open)
        return statsmode_results
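
    # Illustrative only (values are hypothetical): get_stats returns a Counter
    # of feature frequencies for the sentences it was given, roughly like
    #     Counter({'Tokens': 2046, 'Words': 1782, 'Clauses': 214,
    #              'Sentences': 98, 'Passives': 12, ...})
    # alongside one entry per tregex query name above.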

    def make_conc_lines_from_whole_mid(wholes, middle_column_result, speakr=False):
        if speakr is False:
            speakr = ""
        conc_lines = []
        # remove duplicates from results
        unique_wholes = []
        unique_middle_column_result = []
        duplicates = []
        for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)):
            if "-join-".join([f, whole, mid]) not in duplicates:
                duplicates.append("-join-".join([f, whole, mid]))
                unique_wholes.append([f, whole])
                unique_middle_column_result.append(mid)

        # split into start, middle and end, dealing with multiple occurrences
        for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)):
            reg = re.compile(r"([^a-zA-Z0-9-]|^)(" + re.escape(mid) + r")([^a-zA-Z0-9-]|$)", re.IGNORECASE | re.UNICODE)
            offsets = [(m.start(), m.end()) for m in re.finditer(reg, whole)]
            for offstart, offend in offsets:
                start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip()
                conc_lines.append([os.path.basename(f), speakr, start, middle, end])
        return conc_lines
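
    # Each concordance line built above has the shape
    #     [filename, speaker, left-context, match, right-context]
    # e.g. (hypothetical) ['chapter1.txt', '', 'she would never', 'admit', 'to being wrong']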

    def uniquify(conc_lines):
        from collections import OrderedDict

        unique_lines = []
        checking = []
        for index, (f, speakr, start, middle, end) in enumerate(conc_lines):
            joined = " ".join([speakr, start, "MIDDLEHERE:", middle, ":MIDDLEHERE", end])
            if joined not in checking:
                unique_lines.append(conc_lines[index])
            checking.append(joined)
        return unique_lines

    def lemmatiser(list_of_words, tag):
        """take a list of unicode words and a tag and return a lemmatised list."""
        output = []
        for word in list_of_words:
            if translated_option.startswith("u"):
                if word.lower() in taglemma.keys():
                    word = taglemma[word.lower()]
                else:
                    if word == "x":
                        word = "Other"
            # only use wordnet lemmatiser when appropriate
            else:
                if word in wordlist:
                    word = wordlist[word]
                word = lmtzr.lemmatize(word, tag)
            output.append(word)
        return output

    def gettag(query, lemmatag=False):
        """
        Find tag for WordNet lemmatisation
        """
        import re

        tagdict = {"N": "n", "A": "a", "V": "v", "R": "r", "None": False, "": False, "Off": False}

        if lemmatag is False:
            tag = "n"  # same default as wordnet
            # attempt to find tag from tregex query
            tagfinder = re.compile(r"^[^A-Za-z]*([A-Za-z]*)")
            tagchecker = re.compile(r"^[A-Z]{1,4}$")
            qr = query.replace(r"\w", "").replace(r"\s", "").replace(r"\b", "")
            treebank_tag = re.findall(tagfinder, qr)
            if re.match(tagchecker, treebank_tag[0]):
                tag = tagdict.get(treebank_tag[0], "n")
        elif lemmatag:
            tag = lemmatag
        return tag

    def format_tregex(results):
        """format tregex by show list"""
        if countmode:
            return results
        import re

        done = []
        if "l" in show or "pl" in show:
            lemmata = lemmatiser(results, gettag(search.get("t"), lemmatag))
        else:
            lemmata = [None for i in results]
        for word, lemma in zip(results, lemmata):
            bits = []
            if exclude and exclude.get("w"):
                if len(exclude.keys()) == 1 or excludemode == "any":
                    if re.search(exclude.get("w"), word):
                        continue
                if len(exclude.keys()) == 1 or excludemode == "any":
                    if re.search(exclude.get("l"), lemma):
                        continue
                if len(exclude.keys()) == 1 or excludemode == "any":
                    if re.search(exclude.get("p"), word):
                        continue
                if len(exclude.keys()) == 1 or excludemode == "any":
                    if re.search(exclude.get("pl"), lemma):
                        continue
            if exclude and excludemode == "all":
                num_to_cause_exclude = len(exclude.keys())
                current_num = 0
                if exclude.get("w"):
                    if re.search(exclude.get("w"), word):
                        current_num += 1
                if exclude.get("l"):
                    if re.search(exclude.get("l"), lemma):
                        current_num += 1
                if exclude.get("p"):
                    if re.search(exclude.get("p"), word):
                        current_num += 1
                if exclude.get("pl"):
                    if re.search(exclude.get("pl"), lemma):
                        current_num += 1
                if current_num == num_to_cause_exclude:
                    continue

            for i in show:
                if i in ("t", "w", "p"):
                    bits.append(word)
                elif i in ("l", "pl"):
                    bits.append(lemma)
            joined = "/".join(bits)
            done.append(joined)
        return done

    def tok_by_list(pattern, list_of_toks, concordancing=False, **kwargs):
        """search for regex in plaintext corpora"""
        import re

        if type(pattern) == str:
            pattern = [pattern]
        if not case_sensitive:
            pattern = [p.lower() for p in pattern]
        if not concordancing:
            if case_sensitive:
                matches = [m for m in list_of_toks if m in pattern]
            else:
                matches = [m for m in list_of_toks if m.lower() in pattern]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if token in pattern:
                    match = [" ".join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(token)
                    match.append(" ".join([t for t in unsplitter(list_of_toks[index + 1 :])])[:140])
                    matches.append(match)
        if countmode:
            return len(matches)
        else:
            return matches
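
    # In concordance mode, each tok_by_list hit is a three-part list:
    #     [left context (last 140 chars), matched token, right context (first 140 chars)]
    # otherwise it is just the flat list (or count) of matching tokens.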

    def unsplitter(lst):
        """unsplit contractions and apostophes from tokenised text"""
        if split_contractions:
            return lst
        unsplit = []
        for index, t in enumerate(lst):
            if index == 0 or index == len(lst) - 1:
                unsplit.append(t)
                continue
            if "'" in t and not t.endswith("'"):
                rejoined = "".join([lst[index - 1], t])
                unsplit.append(rejoined)
            else:
                if not "'" in lst[index + 1]:
                    unsplit.append(t)
        return unsplit
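
    # A quick sketch of unsplitter's behaviour when split_contractions is
    # falsy (tokens are hypothetical):
    #     unsplitter(['I', 'do', "n't", 'know'])  ->  ['I', "don't", 'know']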

    def tok_ngrams(pattern, list_of_toks, concordancing=False, split_contractions=True):
        from collections import Counter
        import re

        ngrams = Counter()
        result = []
        # keep only word-like tokens
        list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)]
        # if it's not a compiled regex, 'any' means match every token
        if pattern.lower() == "any":
            pattern = r".*"

        if not split_contractions:
            list_of_toks = unsplitter(list_of_toks)

            # list_of_toks = [x for x in list_of_toks if "'" not in x]
        for index, w in enumerate(list_of_toks):
            try:
                the_gram = [list_of_toks[index + x] for x in range(gramsize)]
                if not any(re.search(pattern, x) for x in the_gram):
                    continue
                ngrams[" ".join(the_gram)] += 1
            except IndexError:
                pass

        # turn counter into list of results
        for k, v in ngrams.items():
            if v > 1:
                for i in range(v):
                    result.append(k)
        if countmode:
            return len(result)
        else:
            return result
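
    # tok_ngrams tallies every n-gram (window size taken from the enclosing
    # `gramsize`) containing a token that matches `pattern`, then returns each
    # n-gram once per occurrence, keeping only n-grams seen more than once.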

    def compiler(pattern):
        """compile regex or fail gracefully"""
        import re

        try:
            if case_sensitive:
                comped = re.compile(pattern)
            else:
                comped = re.compile(pattern, re.IGNORECASE)
            return comped
        except:
            import traceback
            import sys
            from time import localtime, strftime

            exc_type, exc_value, exc_traceback = sys.exc_info()
            lst = traceback.format_exception(exc_type, exc_value, exc_traceback)
            error_message = lst[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print "%s: Query %s" % (thetime, error_message)
            if root:
                return "Bad query"
            else:
                raise ValueError("%s: Query %s" % (thetime, error_message))

    def tok_by_reg(pattern, list_of_toks, concordancing=False, **kwargs):
        """search for regex in plaintext corpora"""
        import re

        comped = compiler(pattern)
        if comped == "Bad query":
            return "Bad query"
        if not concordancing:
            matches = [m for m in list_of_toks if re.search(comped, m)]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if re.search(comped, token):
                    match = [" ".join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(re.search(comped, token).group(0))
                    match.append(" ".join([t for t in unsplitter(list_of_toks[index + 1 :])])[:140])
                    matches.append(match)
        if countmode:
            return len(matches)
        else:
            return matches

    def plaintext_regex_search(pattern, plaintext_data, concordancing=False, **kwargs):
        """search for regex in plaintext corpora

        it searches over lines, so the user needs to be careful.
        """
        import re

        if concordancing:
            pattern = r"(.{,140})\b(" + pattern + r")\b(.{,140})"
        compiled_pattern = compiler(pattern)
        if compiled_pattern == "Bad query":
            return "Bad query"
        matches = re.findall(compiled_pattern, plaintext_data)
        if concordancing:
            matches = [list(m) for m in matches]
        if not concordancing:
            for index, i in enumerate(matches):
                if type(i) == tuple:
                    matches[index] = i[0]
        if countmode:
            return len(matches)
        else:
            return matches
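
    # When concordancing, each plaintext_regex_search hit is a
    # [left-context, match, right-context] triple built from the 140 characters
    # either side of the match; otherwise the matched strings are returned
    # (for grouped patterns, only their first group is kept).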

    def correct_spelling(a_string):
        if not spelling:
            return a_string
        from dictionaries.word_transforms import usa_convert

        if spelling.lower() == "uk":
            usa_convert = {v: k for k, v in usa_convert.items()}
        spell_out = []
        bits = a_string.split("/")
        for index, i in enumerate(bits):
            converted = usa_convert.get(i.lower(), i)
            if i.islower() or preserve_case is False:
                converted = converted.lower()
            elif i.isupper() and preserve_case:
                converted = converted.upper()
            elif i.istitle() and preserve_case:
                converted = converted.title()
            bits[index] = converted
        r = "/".join(bits)
        return r
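
    # A hedged example of correct_spelling (assuming the usa_convert mapping
    # contains the pair): with spelling='UK' and preserve_case on,
    #     'Color/color'  ->  'Colour/colour'
    # with preserve_case off, everything is also lowercased.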

    def plaintext_simple_search(pattern, plaintext_data, concordancing=False, **kwargs):
        """search for tokens in plaintext corpora"""
        import re

        result = []
        if type(pattern) == str:
            pattern = [pattern]
        for p in pattern:
            if concordancing:
                pat = r"(.{0,140})\b(" + re.escape(p) + r")\b(.{0,140})"
            else:
                # plain token search: match the escaped token on word boundaries
                pat = r"\b" + re.escape(p) + r"\b"
            pat = compiler(pat)
            if pat == "Bad query":
                return "Bad query"
            matches = re.findall(pat, plaintext_data)
            if concordancing:
                matches = [list(m) for m in matches]
                for i in matches:
                    result.append(i)
            else:
                for m in range(len(matches)):
                    result.append(p)
        return result

    # do multiprocessing if need be
    im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers)

    locs["search"] = search
    locs["query"] = query
    locs["just_speakers"] = just_speakers
    locs["corpus"] = corpus
    locs["multiprocess"] = multiprocess

    if im:
        from corpkit.multiprocess import pmultiquery

        return pmultiquery(**locs)

    datatype = corpus.datatype
    singlefile = corpus.singlefile

    # store all results in here
    results = {}
    # check if just counting
    countmode = "c" in show
    # where we are at in interrogation
    current_iter = 0

    # multiprocessing progress bar
    denom = kwargs.get("denominator", 1)
    startnum = kwargs.get("startnum", 0)

    ############################################
    # Determine the search function to be used #
    ############################################

    # simple tregex is tregex over whole dirs
    simple_tregex_mode = False
    statsmode = False
    if not just_speakers and "t" in search.keys():
        simple_tregex_mode = True
    else:
        if corpus.datatype == "plaintext":
            if search.get("n"):
                raise NotImplementedError("Use a tokenised corpus for n-gramming.")
                # unreachable: plaintext n-gramming is not implemented
                # searcher = plaintext_ngram
                # optiontext = "n-grams via plaintext"
            if search.get("w"):
                if kwargs.get("regex", True):
                    searcher = plaintext_regex_search
                else:
                    searcher = plaintext_simple_search
                optiontext = "Searching plaintext"

        elif corpus.datatype == "tokens":
            if search.get("n"):
                searcher = tok_ngrams
                optiontext = "n-grams via tokens"
            elif search.get("w"):
                if kwargs.get("regex", True):
                    searcher = tok_by_reg
                else:
                    searcher = tok_by_list
                if type(search.get("w")) == list:
                    searcher = tok_by_list
                optiontext = "Searching tokens"
        only_parse = ["r", "d", "g", "dl", "gl", "df", "gf", "dp", "gp", "f"]
        if corpus.datatype != "parse" and any(i in only_parse for i in search.keys()):
            raise ValueError(
                'Need parsed corpus to search with "%s" option(s).'
                % ", ".join([i for i in search.keys() if i in only_parse])
            )

        elif corpus.datatype == "parse":
            if search.get("t"):
                searcher = slow_tregex
            elif search.get("s"):
                searcher = get_stats
                statsmode = True
                optiontext = "General statistics"
                global numdone
                numdone = 0
            else:
                from corpkit.depsearch import dep_searcher

                searcher = dep_searcher
                optiontext = "Dependency querying"

    ############################################
    #      Set some Tregex-related values      #
    ############################################

    if search.get("t"):
        query = search.get("t")

        # check the query
        q = tregex_engine(corpus=False, query=search.get("t"), options=["-t"], check_query=True, root=root)
        if q is False:
            if root:
                return "Bad query"
            else:
                return

        optiontext = "Searching parse trees"
        if "p" in show or "pl" in show:
            translated_option = "u"
            if type(search["t"]) == list:
                search["t"] = r"__ < (/%s/ !< __)" % as_regex(
                    search["t"], boundaries="line", case_sensitive=case_sensitive
                )
            if search["t"] == "any":
                search["t"] = r"__ < (/.?[A-Za-z0-9].?/ !< __)"
        elif "t" in show:
            translated_option = "o"
            if type(search["t"]) == list:
                search["t"] = r"__ < (/%s/ !< __)" % as_regex(
                    search["t"], boundaries="line", case_sensitive=case_sensitive
                )
            if search["t"] == "any":
                search["t"] = r"__ < (/.?[A-Za-z0-9].?/ !< __)"
        elif "w" in show:
            translated_option = "t"
            if type(search["t"]) == list:
                search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive)
            if search["t"] == "any":
                search["t"] = r"/.?[A-Za-z0-9].?/ !< __"
        elif "c" in show:
            count_results = {}
            only_count = True
            translated_option = "C"
            if type(search["t"]) == list:
                search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive)
            if search["t"] == "any":
                search["t"] = r"/.?[A-Za-z0-9].?/ !< __"
        elif "l" in show:
            translated_option = "t"
            if type(search["t"]) == list:
                search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive)
            if search["t"] == "any":
                search["t"] = r"/.?[A-Za-z0-9].?/ !< __"

        query = search["t"]

    ############################################
    # Make iterable for corpus/subcorpus/file  #
    ############################################

    if corpus.singlefile:
        to_iterate_over = {(corpus.name, corpus.path): [corpus]}
    elif not corpus.subcorpora:
        to_iterate_over = {(corpus.name, corpus.path): corpus.files}
    else:
        to_iterate_over = {}
        for k, v in sorted(corpus.structure.items()):
            to_iterate_over[(k.name, k.path)] = v
    if files_as_subcorpora:
        to_iterate_over = {}
        for f in corpus.files:
            to_iterate_over[(f.name, f.path)] = [f]

    ############################################
    #           Print welcome message          #
    ############################################

    if conc:
        message = "Concordancing"
    else:
        message = "Interrogating"
    if kwargs.get("printstatus", True):
        thetime = strftime("%H:%M:%S", localtime())

        sformat = "\n                 ".join(["%s: %s" % (k.rjust(3), v) for k, v in search.items()])
        if search == {"s": r".*"}:
            sformat = "features"
        welcome = "\n%s: %s %s ...\n          %s\n          Query: %s\n" % (
            thetime,
            message,
            corpus.name,
            optiontext,
            sformat,
        )
        print welcome

    ############################################
    #           Make progress bar              #
    ############################################

    if simple_tregex_mode:
        total_files = len(to_iterate_over.keys())
    else:
        if search.get("s"):
            total_files = sum([len(x) for x in to_iterate_over.values()]) * 12
        else:
            total_files = sum([len(x) for x in to_iterate_over.values()])

    par_args = {"printstatus": kwargs.get("printstatus", True), "root": root, "note": note, "length": total_files}

    term = None
    if kwargs.get("paralleling", None) is not None:
        from blessings import Terminal

        term = Terminal()
        par_args["terminal"] = term
        par_args["linenum"] = kwargs.get("paralleling")

    outn = kwargs.get("outname", "")
    if outn:
        outn = outn + ": "
    tstr = "%s%d/%d" % (outn, current_iter, total_files)
    p = animator(None, None, init=True, tot_string=tstr, **par_args)
    tstr = "%s%d/%d" % (outn, current_iter + 1, total_files)
    animator(p, current_iter, tstr, **par_args)

    ############################################
    # Iterate over data, doing interrogations  #
    ############################################

    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):

        if countmode or conc:
            results[subcorpus_name] = []
        else:
            results[subcorpus_name] = Counter()

        # tregex over subcorpora, not files
        if simple_tregex_mode:

            op = ["-o", "-" + translated_option]
            result = tregex_engine(
                query=search["t"], options=op, corpus=subcorpus_path, root=root, preserve_case=preserve_case
            )

            if countmode:
                results[subcorpus_name].append(result)
                continue

            result = Counter(format_tregex(result))

            if conc:
                op.append("-w")
                whole_result = tregex_engine(
                    query=search["t"], options=op, corpus=subcorpus_path, root=root, preserve_case=preserve_case
                )

                if not only_format_match:
                    whole_result = format_tregex(whole_result)

                result = make_conc_lines_from_whole_mid(whole_result, result, speakr=False)

                if spelling:
                    for index, line in enumerate(result):
                        result[index] = [correct_spelling(b) for b in line]

            results[subcorpus_name] += result

            current_iter += 1
            if kwargs.get("paralleling", None) is not None:
                tstr = "%s%d/%d" % (outn, current_iter + 2, total_files)
            else:
                tstr = "%s%d/%d" % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)

        # dependencies, plaintext, tokens or slow_tregex
        else:
            for f in files:

                if corpus.datatype == "parse":
                    with open(f.path, "r") as data:
                        data = data.read()
                        from corenlp_xml.document import Document

                        try:
                            corenlp_xml = Document(data)
                        except:
                            print "Could not read file: %s" % f.path
                            continue
                        if just_speakers:
                            sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers]
                            if not sents:
                                continue
                        else:
                            sents = corenlp_xml.sentences

                        res = searcher(
                            sents,
                            search=search,
                            show=show,
                            dep_type=dep_type,
                            exclude=exclude,
                            excludemode=excludemode,
                            searchmode=searchmode,
                            lemmatise=False,
                            case_sensitive=case_sensitive,
                            concordancing=conc,
                            only_format_match=only_format_match,
                        )

                        if res == "Bad query":
                            return "Bad query"

                        if searcher == slow_tregex and not countmode:
                            res = format_tregex(res)

                elif corpus.datatype == "tokens":
                    import pickle

                    with open(f.path, "rb") as fo:
                        data = pickle.load(fo)
                    res = searcher(search.values()[0], data, split_contractions=split_contractions, concordancing=conc)
                    if conc:
                        for index, line in enumerate(res):
                            line.insert(0, "")

                elif corpus.datatype == "plaintext":
                    with open(f.path, "rb") as data:
                        data = data.read()
                        data = unicode(data, errors="ignore")
                        res = searcher(search.values()[0], data, concordancing=conc)
                        if conc:
                            for index, line in enumerate(res):
                                line.insert(0, "")

                if countmode:
                    results[subcorpus_name] += res
                    continue

                # add filename and do lowercasing for conc
                if conc:
                    for index, line in enumerate(res):
                        line.insert(0, f.name)
                        if not preserve_case:
                            line = [b.lower() for b in line]
                        if spelling:
                            line = [correct_spelling(b) for b in line]
                        results[subcorpus_name] += [line]

                # do lowercasing and spelling
                else:
                    if not preserve_case:
                        res = [r.lower() for r in res]
                    if spelling:
                        res = [correct_spelling(r) for r in res]
                    results[subcorpus_name] += Counter(res)

                if not statsmode:
                    current_iter += 1
                    if kwargs.get("paralleling", None) is not None:
                        tstr = "%s%d/%d" % (outn, current_iter + 2, total_files)
                    else:
                        tstr = "%s%d/%d" % (outn, current_iter + 1, total_files)
                    animator(p, current_iter, tstr, **par_args)

    # delete temp file if there
    import os

    if os.path.isfile("tmp.txt"):
        os.remove("tmp.txt")

    ############################################
    #     Get concordances into DataFrame      #
    ############################################

    if conc:
        all_conc_lines = []
        for sc_name, resu in sorted(results.items()):

            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            # make into series
            pindex = "c f s l m r".encode("utf-8").split()
            for fname, spkr, start, word, end in unique_results:
                spkr = unicode(spkr, errors="ignore")
                fname = os.path.basename(fname)

                # the use of ascii here makes sure the string formats ok, but will also screw over
                # anyone doing non-english work. so, change to utf-8, then fix errors as they come
                # in the corpkit-gui "add_conc_lines_to_window" function
                all_conc_lines.append(
                    Series(
                        [
                            sc_name.encode("ascii", errors="ignore"),
                            fname.encode("ascii", errors="ignore"),
                            spkr.encode("ascii", errors="ignore"),
                            start.encode("ascii", errors="ignore"),
                            word.encode("ascii", errors="ignore"),
                            end.encode("ascii", errors="ignore"),
                        ],
                        index=pindex,
                    )
                )

        # randomise results...
        if random:
            from random import shuffle

            shuffle(all_conc_lines)

        df = pd.concat(all_conc_lines, axis=1).T

        # not doing anything yet --- this is for multimodal concordancing
        add_links = False
        if not add_links:
            df.columns = ["c", "f", "s", "l", "m", "r"]
        else:
            df.columns = ["c", "f", "s", "l", "m", "r", "link"]

        if all(x == "" for x in list(df["s"].values)):
            df.drop("s", axis=1, inplace=True)

        if kwargs.get("note"):
            kwargs["note"].progvar.set(100)

        if kwargs.get("printstatus", True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = "\n\n%s: Concordancing finished! %d matches.\n" % (thetime, len(df.index))
            print finalstring

        from corpkit.interrogation import Concordance

        output = Concordance(df)
        output.query = locs
        if quicksave:
            output.save()
        return output

    ############################################
    #     Get interrogation into DataFrame     #
    ############################################

    else:
        if countmode:
            df = Series({k: sum(v) for k, v in sorted(results.items())})
            tot = df.sum()
        else:
            the_big_dict = {}
            unique_results = set([item for sublist in results.values() for item in sublist])
            # keep column values aligned with the sorted subcorpus index used below
            sorted_subcorpus_results = [v for k, v in sorted(results.items())]
            for word in unique_results:
                the_big_dict[word] = [subcorp_result[word] for subcorp_result in sorted_subcorpus_results]
            # turn master dict into dataframe, sorted
            df = DataFrame(the_big_dict, index=sorted(results.keys()))

            numentries = len(df.columns)
            tot = df.sum(axis=1)
            total_total = df.sum().sum()

        ############################################
        # Format, output as Interrogation object   #
        ############################################

        if not countmode:
            if not corpus.subcorpora or singlefile:
                if not files_as_subcorpora:
                    if not kwargs.get("df1_always_df"):
                        df = Series(df.ix[0])
                        df.sort(ascending=False)
                        tot = df.sum()
                        numentries = len(df.index)
                        total_total = tot

        # sort by total
        if type(df) == pd.core.frame.DataFrame:
            if not df.empty:
                df.ix["Total-tmp"] = df.sum()
                the_tot = df.ix["Total-tmp"]
                df = df[the_tot.argsort()[::-1]]
                df = df.drop("Total-tmp", axis=0)

        # format final string
        if kwargs.get("printstatus", True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = "\n\n%s: Interrogation finished!" % thetime
            if countmode:
                finalstring += " %d matches." % tot
            else:
                finalstring += " %d unique results, %d total occurrences." % (numentries, total_total)
            print finalstring

        interro = Interrogation(results=df, totals=tot, query=locs)

        if quicksave:
            interro.save()

        return interro
Example #7
0
def dep_searcher(sents,
                 search,
                 show = 'w',
                 dep_type = 'collapsed-ccprocessed-dependencies',
                 regex_nonword_filter = r'[A-Za-z0-9:_]',
                 concordancing = False,
                 exclude = False,
                 excludemode = 'any',
                 searchmode = 'all',
                 lemmatise = False,
                 case_sensitive = False,
                 progbar = False,
                 only_format_match = False):
    """
    search corenlp dependency parse

    1. search for 'search' keyword arg
       governor
       dependent
       function
       pos
       lemma
       word
       index
       etc

    2. exclude entries if need be, using same method as search

    3. return '/'-sep list of 'show' keyword arg, or conc lines:
       governor
       dependent
       function
       pos
       lemma
       word
       index
       distance
       etc

       ... or just return int count.
    """
    import re
    from corenlp_xml.document import Document
    from collections import Counter
    from corpkit.build import flatten_treestring
    from corpkit.process import filtermaker, animator, get_deps

    def distancer(lks, lk):
        "determine number of jumps to root"      
        c = 0
        # get the gov index, stop when it's zero
        root_found = False
        while not root_found:
            if c == 0:
                try:
                    link_to_check = next(i for i in lks if i.dependent.idx == lk.id)
                except StopIteration:
                    root_found = True
                    break
                #link_to_check = lk
            gov_index = link_to_check.governor.idx
            if gov_index == 0:
                root_found = True
            else:
                if c > 29:
                    root_found = True
                    break
                link_to_check = [l for l in lks if l.dependent.idx == gov_index]
                if len(link_to_check) > 0:
                    link_to_check = link_to_check[0]
                else:
                    break
                c += 1
        if c < 30:
            return c
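
    # distancer counts governor-to-governor hops up to ROOT: the sentence root
    # itself gives 0, one of its direct dependents gives 1, and tokens more
    # than 29 hops away fall through and yield None.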

    def get_matches_from_sent(s, search, deps = False, tokens = False, 
        dep_type = 'basic-dependencies', mode = 'all'):
        """process a sentence object, returning matching tok ids"""
        from corpkit.process import get_deps
        import re
        lks = []
        if not deps:
            deps = get_deps(s, dep_type)
        if not tokens:
            tokens = s.tokens

        for opt, pat in search.items():
            if type(pat) == list:
                if all(type(x) == int for x in pat):
                    pat = [str(x) for x in pat]
                pat = filtermaker(pat, case_sensitive = case_sensitive)
                search[opt] = pat
            if type(pat) == dict:
                del search[opt]
                for k, v in pat.items():
                    if k != 'w':
                        search[opt + k] = v
                    else:
                        search[opt] = v
            if type(pat) == str and pat.lower() == 'any':
                search[opt] = re.compile(r'.*')

        for opt, pat in search.items():
            if opt == 'g':
                got = []
                for l in deps.links:
                    if re.search(pat, l.governor.text):
                        got.append(s.get_token_by_id(l.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gf':
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        gov_index = l.dependent.idx
                        for l2 in deps.links:
                            if l2.governor.idx == gov_index:
                                got.append(s.get_token_by_id(l2.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'df':
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        got.append(s.get_token_by_id(l.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gl':
                got = []
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        for i in deps.links:
                            if i.governor.idx == tok.id:
                                got.append(s.get_token_by_id(i.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gp':
                got = []
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        for i in deps.links:
                            if i.governor.idx == tok.id:
                                got.append(s.get_token_by_id(i.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'dl':
                got = []
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        for i in deps.links:
                            if i.dependent.idx == tok.id:
                                got.append(s.get_token_by_id(i.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'dp':
                got = []
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        for i in deps.links:
                            if i.dependent.idx == tok.id:
                                got.append(s.get_token_by_id(i.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)

            elif opt == 'd':
                got = []
                for l in deps.links:
                    if re.search(pat, l.dependent.text):
                        got.append(s.get_token_by_id(l.governor.idx))

                got = set(got)
                for i in got:
                    lks.append(i)

            elif opt == 'f':
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        got.append(s.get_token_by_id(l.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'p':
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        lks.append(tok)
            elif opt == 'pl':
                for tok in tokens:
                    from dictionaries.word_transforms import taglemma
                    postag = tok.pos
                    if postag.lower() in taglemma.keys():
                        stemmedtag = taglemma[postag.lower()]
                    else:
                        stemmedtag = postag.lower()
                    if re.search(pat, stemmedtag):
                        lks.append(tok)
            elif opt == 'l':
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        lks.append(tok)
            elif opt == 'w':
                for tok in tokens:
                    if re.search(pat, tok.word):
                        lks.append(tok)
            elif opt == 'i':
                for tok in tokens:
                    if re.search(pat, str(tok.id)):
                        lks.append(tok)
            elif opt == 'r':
                got = []
                for tok in tokens:
                    dist = distancer(deps.links, tok)
                    if dist is not None and dist is not False:
                        try:
                            if int(dist) == int(pat):
                                lks.append(tok)

                        except TypeError:
                            if re.search(pat, str(dist)):
                                lks.append(tok)

        if mode == 'all':
            from collections import Counter
            counted = Counter(lks)
            lks = [k for k, v in sorted(counted.items()) if v >= len(search.keys())]
        return lks

    result = []
    numdone = 0

    for s in sents:
        numdone += 1
        deps = get_deps(s, dep_type)
        tokens = s.tokens
        lks = get_matches_from_sent(s, search, deps, tokens, dep_type, mode = searchmode)

        if not concordancing:
            lks = list(set([x for x in lks if x and re.search(regex_nonword_filter, x.word)]))

        if exclude is not False:
            to_remove = get_matches_from_sent(s, exclude, deps, tokens, dep_type, mode = excludemode)

            for i in to_remove:
                try:
                    lks.remove(i)
                except ValueError:
                    pass

        if progbar:
            tstr = '%d/%d' % (numdone, len(sents))
            animator(progbar, numdone, tstr)

        if 'c' in show:
            result.append(len(lks))
            continue

        if concordancing:
            for lk in lks: # for each concordance middle part
                one_result = []
                if not lk:
                    continue
                # get the index of the match
                windex = int(lk.id) - 1
                speakr = s.speakername
                if not speakr:
                    speakr = ''
                # begin building line with speaker first
                conc_line = [speakr]
                # format a single word correctly
                if only_format_match:
                    start = ' '.join([t.word for index, t in enumerate(s.tokens) if index < windex])
                    end = ' '.join([t.word for index, t in enumerate(s.tokens) if index > windex])
                    s.tokens = [s.get_token_by_id(lk.id)]
                for tok in s.tokens:
                    single_wd = {}
                    intermediate_result = []
                    if 'w' in show:
                        single_wd['w'] = tok.word
                    if 'l' in show:
                        from dictionaries.word_transforms import wordlist
                        if tok.lemma in wordlist.keys():
                            lem = wordlist[tok.lemma]
                        else:
                            lem = tok.lemma
                        single_wd['l'] = lem
                    if 'p' in show:
                        single_wd['p'] = tok.pos

                    if 'pl' in show:
                        from dictionaries.word_transforms import taglemma
                        postag = tok.pos
                        if postag.lower() in taglemma.keys():
                            single_wd['pl'] = taglemma[postag.lower()]
                        else:
                            single_wd['pl'] = postag.lower()
                        if not single_wd['pl']:
                            single_wd['pl'] = 'none'

                    if 'r' in show:
                        all_lks = [l for l in deps.links]
                        distance = distancer(all_lks, tok)
                        if distance:
                            single_wd['r'] = str(distance)
                        else:
                            single_wd['r'] = '0'
                    if 'f' in show:
                        for lin in deps.links:
                            single_wd['f'] = '.'
                            if tok.id == lin.dependent.idx:
                                single_wd['f'] = lin.type
                                break
                    if 'i' in show:
                        single_wd['i'] = str(tok.id)

                    for i in show:
                        intermediate_result.append(single_wd[i])
                    intermediate_result = [i.replace('/', '-slash-') for i in intermediate_result]
                    one_result.append('/'.join(intermediate_result))
                # now we have formatted tokens as a list. we need to split
                # it into start, middle and end
                if not only_format_match:
                    start = ' '.join([w for index, w in enumerate(one_result) if index < windex])
                    end = ' '.join([w for index, w in enumerate(one_result) if index > windex])
                    middle = one_result[windex]
                else:
                    middle = one_result[0]

                for bit in start, middle, end:
                    conc_line.append(bit)
                result.append(conc_line)
        else:
            # figure out what to show
            for lk in lks:
                single_result = {}
                if not lk:
                    continue
                if 'w' in show:
                    single_result['w'] = 'none'
                    if lemmatise:
                        single_result['w'] = lk.lemma
                    else:
                        single_result['w'] = lk.word
                if 'l' in show:
                    from dictionaries.word_transforms import wordlist
                    if lk.lemma in wordlist.keys():
                        lem = wordlist[lk.lemma]
                    else:
                        lem = lk.lemma
                    single_result['l'] = lem
                if 'p' in show:
                    single_result['p'] = 'none'
                    postag = lk.pos
                    if lemmatise:
                        from dictionaries.word_transforms import taglemma
                        if postag.lower() in taglemma.keys():
                            single_result['p'] = taglemma[postag.lower()]
                        else:
                            single_result['p'] = postag.lower()
                    else:
                        single_result['p'] = postag
                    if not single_result['p']:
                        single_result['p'] = 'none'

                if 'pl' in show:
                    single_result['pl'] = 'none'
                    postag = lk.pos
                    from dictionaries.word_transforms import taglemma
                    if postag.lower() in taglemma.keys():
                        single_result['pl'] = taglemma[postag.lower()]
                    else:
                        single_result['pl'] = postag.lower()
                    if not single_result['pl']:
                        single_result['pl'] = 'none'

                if 'f' in show:
                    single_result['f'] = 'none'
                    for i in deps.links:
                        if i.dependent.idx == lk.id:
                            single_result['f'] = i.type.rstrip(',')
                            break
                    if single_result['f'] == '':
                        single_result['f'] = 'root'

                if 'g' in show:
                    single_result['g'] = 'none'
                    for i in deps.links:
                        if i.dependent.idx == lk.id:
                            if s.get_token_by_id(i.governor.idx):
                                if lemmatise:
                                    single_result['g'] = s.get_token_by_id(i.governor.idx).lemma
                                else:
                                    single_result['g'] = i.governor.text
                            else:
                                single_result['g'] = 'root'
                            break

                if 'd' in show:
                    single_result['d'] = 'none'
                    for i in deps.links:
                        if i.governor.idx == lk.id:
                            if s.get_token_by_id(i.dependent.idx):       
                                if lemmatise:
                                    single_result['d'] = s.get_token_by_id(i.dependent.idx).lemma
                                else:
                                    single_result['d'] = i.dependent.text
                            break

                if 'gl' in show:
                    single_result['gl'] = 'none'
                    for i in deps.links:
                        if i.dependent.idx == lk.id:
                            if s.get_token_by_id(i.governor.idx):
                                single_result['gl'] = s.get_token_by_id(i.governor.idx).lemma
                            else:
                                single_result['gl'] = 'root'
                            break

                if 'dl' in show:
                    single_result['dl'] = 'none'
                    for i in deps.links:
                        if i.governor.idx == lk.id:
                            if s.get_token_by_id(i.dependent.idx):       
                                single_result['dl'] = s.get_token_by_id(i.dependent.idx).lemma
                            break

                if 'gp' in show:
                    single_result['gp'] = 'none'
                    for i in deps.links:
                        if i.dependent.idx == lk.id:
                            if s.get_token_by_id(i.governor.idx):       
                                single_result['gp'] = s.get_token_by_id(i.governor.idx).pos
                            break

                if 'dp' in show:
                    single_result['dp'] = 'none'
                    for i in deps.links:
                        if i.governor.idx == lk.id:
                            if s.get_token_by_id(i.dependent.idx):       
                                single_result['dp'] = s.get_token_by_id(i.dependent.idx).pos
                            break

                if 'df' in show:
                    single_result['df'] = 'none'
                    for i in deps.links:
                        if i.governor.idx == lk.id:
                            single_result['df'] = i.type
                            break  

                if 'gf' in show:
                    single_result['gf'] = 'none'
                    for i in deps.links:
                        # if the result is the dependent, get the governor, find where
                        # it is a dependent, then get the type
                        if i.dependent.idx == lk.id:
                            gv = next(x for x in deps.links if x.dependent.idx == i.governor.idx)
                            single_result['gf'] = gv.type
                            break                

                if 'r' in show:

                    all_lks = [l for l in deps.links]
                    distance = distancer(all_lks, lk)
                    if distance is not False and distance is not None:
                        single_result['r'] = str(distance)

                if 'i' in show:
                    single_result['i'] = str(lk.id)

                if 'c' not in show:
                    
                    # add them in order
                    out = []
                    for i in show:
                        out.append(single_result[i])

                    out = [i.replace('/', '-slash-') for i in out]
                    result.append('/'.join(out))
    
    if 'c' in show:
        result = sum(result)

    return result
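

# Usage sketch (not part of the original example; the file path and search
# values are hypothetical). It assumes a CoreNLP XML parse readable by
# corenlp_xml, as used elsewhere in this example.
def _dep_searcher_demo(path='data/parsed/chapter1.txt.xml'):
    from corenlp_xml.document import Document
    with open(path) as fo:
        doc = Document(fo.read())
    # words standing in an nsubj or dobj relation, formatted per `show`
    return dep_searcher(doc.sentences, search={'f': r'(nsubj|dobj)'}, show='w')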