def filtermaker(the_filter, case_sensitive = False): import re if type(the_filter) == list: from other import as_regex the_filter = as_regex(the_filter, case_sensitive = case_sensitive) try: output = re.compile(the_filter) is_valid = True except: is_valid = False if root: import traceback import sys exc_type, exc_value, exc_traceback = sys.exc_info() lst = traceback.format_exception(exc_type, exc_value, exc_traceback) error_message = lst[-1] thetime = strftime("%H:%M:%S", localtime()) print('%s: Filter %s' % (thetime, error_message)) return 'Bad query' while not is_valid: if root: time = strftime("%H:%M:%S", localtime()) print(the_filter) print('%s: Invalid the_filter regular expression.' % time) return False time = strftime("%H:%M:%S", localtime()) selection = input('\n%s: filter regular expression " %s " contains an error. You can either:\n\n' \ ' a) rewrite it now\n' \ ' b) exit\n\nYour selection: ' % (time, the_filter)) if 'a' in selection: the_filter = input('\nNew regular expression: ') try: output = re.compile(r'\b' + the_filter + r'\b') is_valid = True except re.error: is_valid = False elif 'b' in selection: print('') return False return output
def filtermaker(the_filter, case_sensitive=False): import re if type(the_filter) == list: from other import as_regex the_filter = as_regex(the_filter, case_sensitive=case_sensitive) try: output = re.compile(the_filter) is_valid = True except: is_valid = False if root: import traceback import sys exc_type, exc_value, exc_traceback = sys.exc_info() lst = traceback.format_exception(exc_type, exc_value, exc_traceback) error_message = lst[-1] thetime = strftime("%H:%M:%S", localtime()) print('%s: Filter %s' % (thetime, error_message)) return 'Bad query' while not is_valid: if root: time = strftime("%H:%M:%S", localtime()) print(the_filter) print('%s: Invalid the_filter regular expression.' % time) return False time = strftime("%H:%M:%S", localtime()) selection = input('\n%s: filter regular expression " %s " contains an error. You can either:\n\n' \ ' a) rewrite it now\n' \ ' b) exit\n\nYour selection: ' % (time, the_filter)) if 'a' in selection: the_filter = input('\nNew regular expression: ') try: output = re.compile(r'\b' + the_filter + r'\b') is_valid = True except re.error: is_valid = False elif 'b' in selection: print('') return False return output
def interrogator(corpus, search, query = 'any', show = 'w', exclude = False, excludemode = 'any', searchmode = 'all', dep_type = 'collapsed-ccprocessed-dependencies', case_sensitive = False, quicksave = False, just_speakers = False, preserve_case = False, lemmatag = False, files_as_subcorpora = False, only_unique = False, random = False, only_format_match = False, multiprocess = False, spelling = False, regex_nonword_filter = r'[A-Za-z0-9:_]', gramsize = 2, split_contractions = False, do_concordancing = False, maxconc = 9999, **kwargs): """interrogate corpus, corpora, subcorpus and file objects see corpkit.interrogation.interrogate() for docstring""" only_conc = False no_conc = False if do_concordancing is False: no_conc = True if type(do_concordancing) == str and do_concordancing.lower() == 'only': only_conc = True no_conc = False # iteratively count conc lines numconc = 0 # store kwargs locs = locals() if kwargs: for k, v in kwargs.items(): locs[k] = v locs.pop('kwargs', None) import corpkit from interrogation import Interrogation from process import tregex_engine import pandas as pd from pandas import DataFrame, Series from collections import Counter from other import as_regex from process import get_deps from time import localtime, strftime from textprogressbar import TextProgressBar from process import animator from dictionaries.word_transforms import wordlist, taglemma import corenlp_xml import codecs import signal original_sigint = signal.getsignal(signal.SIGINT) if kwargs.get('paralleling', None) is None: original_sigint = signal.getsignal(signal.SIGINT) def signal_handler(signal, frame): """pause on ctrl+c, rather than just stop loop""" import signal import sys from time import localtime, strftime signal.signal(signal.SIGINT, original_sigint) thetime = strftime("%H:%M:%S", localtime()) try: sel = raw_input('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime) except NameError: sel = input('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime) time = strftime("%H:%M:%S", localtime()) print('%s: Interrogation resumed.\n' % time) signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGINT, signal_handler) # find out if using gui root = kwargs.get('root') note = kwargs.get('note') # convert path to corpus object if type(corpus) == str: from corpus import Corpus corpus = Corpus(corpus) # figure out how the user has entered the query and normalise from process import searchfixer search, search_iterable = searchfixer(search, query) # for better printing of query, esp during multiprocess # can remove if multiprocess printing improved if len(list(search.keys())) == 1: query = list(search.values())[0] if 'l' in show and search.get('t'): from nltk.stem.wordnet import WordNetLemmatizer lmtzr=WordNetLemmatizer() if type(show) == str: show = [show] def is_multiquery(corpus, search, query, just_speakers): """determine if multiprocessing is needed do some retyping if need be as well""" im = False from collections import OrderedDict if hasattr(corpus, '__iter__'): im = True # so we can do search = 't', query = ['NP', 'VP']: if type(query) == list: if query != list(search.values())[0] or len(list(search.keys())) > 1: query = {c.title(): c for c in query} if type(query) == dict or type(query) == OrderedDict: im = True if just_speakers: if just_speakers == 'each': im = True just_speakers = ['each'] if just_speakers == ['each']: im = True if type(just_speakers) == str: im = False just_speakers = [just_speakers] if type(just_speakers) == list: if len(just_speakers) > 1: im = True if type(search) == dict: if all(type(i) == dict for i in list(search.values())): im = True return im, corpus, search, query, just_speakers def slow_tregex(sents, **dummy_args): """do the speaker-specific version of tregex queries""" speakr = dummy_args.get('speaker', False) import os from process import tregex_engine # first, put the relevant trees into temp file if kwargs.get('outname'): to_open = 'tmp-%s.txt' % kwargs['outname'] else: to_open = 'tmp.txt' to_write = '\n'.join([sent._parse_string.strip() for sent in sents \ if sent.parse_string is not None]) to_write.encode('utf-8', errors = 'ignore') with open(to_open, "w") as fo: encd = to_write.encode('utf-8', errors = 'ignore') + '\n' fo.write(encd) q = list(search.values())[0] ops = ['-o', '-%s' % translated_option] concs = [] res = tregex_engine(query = q, options = ops, corpus = to_open, root = root, preserve_case = True) if not no_conc: ops += ['-w', '-f'] whole_res = tregex_engine(query = q, options = ops, corpus = to_open, root = root, preserve_case = True) res = format_tregex(res) whole_res = format_tregex(whole_res, whole = True) concs = make_conc_lines_from_whole_mid(whole_res, res, speakr) if root: root.update() try: os.remove(to_open) except OSError: pass if countmode: return(len(res)) else: return res, concs def get_stats(sents, **dummy_args): """get a bunch of frequencies on interpersonal phenomena""" import os import re from collections import Counter statsmode_results = Counter() # first, put the relevant trees into temp file if kwargs.get('outname'): to_open = 'tmp-%s.txt' % kwargs['outname'] else: to_open = 'tmp.txt' with open(to_open, "w") as fo: for sent in sents: statsmode_results['Sentences'] += 1 sts = sent.parse_string.rstrip() encd = sts.encode('utf-8', errors = 'ignore') + '\n' fo.write(encd) deps = get_deps(sent, dep_type) numpass = len([x for x in deps.links if x.type.endswith('pass')]) statsmode_results['Passives'] += numpass statsmode_results['Tokens'] += len(sent.tokens) words = [w.word for w in sent.tokens if w.word.isalnum()] statsmode_results['Words'] += len(words) statsmode_results['Characters'] += len(''.join(words)) # count moods via trees (/\?/ !< __) from dictionaries.process_types import processes from other import as_regex tregex_qs = {'Imperative': r'ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/', 'Open interrogative': r'ROOT < SBARQ <<- (/\?/ !< __)', 'Closed interrogative': r'ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))', 'Unmodalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))', 'Modalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))', 'Open class words': r'/^(NN|JJ|VB|RB)/ < __', 'Closed class words': r'__ !< __ !> /^(NN|JJ|VB|RB)/', 'Clauses': r'/^S/ < __', 'Interrogative': r'ROOT << (/\?/ !< __)', 'Mental processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.mental, boundaries = 'w'), 'Verbal processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.verbal, boundaries = 'w'), 'Relational processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.relational, boundaries = 'w') } for name, q in sorted(tregex_qs.items()): res = tregex_engine(query = q, options = ['-o', '-C'], corpus = to_open, root = root) statsmode_results[name] += int(res) global numdone numdone += 1 if root: root.update() else: tot_string = str(numdone + 1) + '/' + str(total_files) if kwargs.get('outname'): tot_string = '%s: %s' % (kwargs['outname'], tot_string) animator(p, numdone, tot_string, **par_args) if kwargs.get('note', False): kwargs['note'].progvar.set((numdone * 100.0 / total_files / denom) + startnum) os.remove(to_open) return statsmode_results, [] def make_conc_lines_from_whole_mid(wholes, middle_column_result, speakr = False): import re, os if speakr is False: speakr = '' conc_lines = [] # remove duplicates from results unique_wholes = [] unique_middle_column_result = [] duplicates = [] for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)): if '-join-'.join([f, whole, mid]) not in duplicates: duplicates.append('-join-'.join([f, whole, mid])) unique_wholes.append([f, whole]) unique_middle_column_result.append(mid) # split into start, middle and end, dealing with multiple occurrences for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)): reg = re.compile(r'([^a-zA-Z0-9-]|^)(' + re.escape(mid) + r')([^a-zA-Z0-9-]|$)', re.IGNORECASE | re.UNICODE) offsets = [(m.start(), m.end()) for m in re.finditer(reg,whole)] for offstart, offend in offsets: start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip() conc_lines.append([os.path.basename(f), speakr, start, middle, end]) return conc_lines def uniquify(conc_lines): from collections import OrderedDict unique_lines = [] checking = [] for index, (f, speakr, start, middle, end) in enumerate(conc_lines): joined = ' '.join([speakr, start, 'MIDDLEHERE:', middle, ':MIDDLEHERE', end]) if joined not in checking: unique_lines.append(conc_lines[index]) checking.append(joined) return unique_lines def lemmatiser(list_of_words, tag): """take a list of unicode words and a tag and return a lemmatised list.""" output = [] for word in list_of_words: if translated_option.startswith('u'): if word.lower() in list(taglemma.keys()): word = taglemma[word.lower()] else: if word == 'x': word = 'Other' # only use wordnet lemmatiser when appropriate else: if word in wordlist: word = wordlist[word] word = lmtzr.lemmatize(word, tag) output.append(word) return output def gettag(query, lemmatag = False): """ Find tag for WordNet lemmatisation """ import re tagdict = {'N': 'n', 'A': 'a', 'V': 'v', 'A': 'r', 'None': False, '': False, 'Off': False} if lemmatag is False: tag = 'n' # same default as wordnet # attempt to find tag from tregex query tagfinder = re.compile(r'^[^A-Za-z]*([A-Za-z]*)') tagchecker = re.compile(r'^[A-Z]{1,4}$') qr = query.replace(r'\w', '').replace(r'\s', '').replace(r'\b', '') treebank_tag = re.findall(tagfinder, qr) if re.match(tagchecker, treebank_tag[0]): tag = tagdict.get(treebank_tag[0], 'n') elif lemmatag: tag = lemmatag return tag def format_tregex(results, whole = False): """format tregex by show list""" if countmode: return results import re done = [] if whole: fnames = [x for x, y in results] results = [y for x, y in results] if 'l' in show or 'pl' in show: lemmata = lemmatiser(results, gettag(search.get('t'), lemmatag)) else: lemmata = [None for i in results] for word, lemma in zip(results, lemmata): bits = [] if exclude and exclude.get('w'): if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('w'), word): continue if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('l'), lemma): continue if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('p'), word): continue if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('pl'), lemma): continue if exclude and excludemode == 'all': num_to_cause_exclude = len(list(exclude.keys())) current_num = 0 if exclude.get('w'): if re.search(exclude.get('w'), word): current_num += 1 if exclude.get('l'): if re.search(exclude.get('l'), lemma): current_num += 1 if exclude.get('p'): if re.search(exclude.get('p'), word): current_num += 1 if exclude.get('pl'): if re.search(exclude.get('pl'), lemma): current_num += 1 if current_num == num_to_cause_exclude: continue for i in show: if i == 't': bits.append(word) if i == 'l': bits.append(lemma) elif i == 'w': bits.append(word) elif i == 'p': bits.append(word) elif i == 'pl': bits.append(lemma) joined = '/'.join(bits) done.append(joined) if whole: done = zip(fnames, done) return done def tok_by_list(pattern, list_of_toks, concordancing = False, **kwargs): """search for regex in plaintext corpora""" import re if type(pattern) == str: pattern = [pattern] if not case_sensitive: pattern = [p.lower() for p in pattern] if not concordancing: if case_sensitive: matches = [m for m in list_of_toks if m in pattern] else: matches = [m for m in list_of_toks if m.lower() in pattern] else: matches = [] for index, token in enumerate(list_of_toks): if token in pattern: match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]] match.append(token) match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140]) matches.append(match) if countmode: return(len(matches)) else: return matches def unsplitter(lst): """unsplit contractions and apostophes from tokenised text""" if split_contractions: return lst unsplit = [] for index, t in enumerate(lst): if index == 0 or index == len(lst) - 1: unsplit.append(t) continue if "'" in t and not t.endswith("'"): rejoined = ''.join([lst[index - 1], t]) unsplit.append(rejoined) else: if not "'" in lst[index + 1]: unsplit.append(t) return unsplit def tok_ngrams(pattern, list_of_toks, concordancing = False, split_contractions = True): from collections import Counter import re ngrams = Counter() result = [] # if it's not a compiled regex list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)] if pattern.lower() == 'any': pattern = r'.*' if not split_contractions: list_of_toks = unsplitter(list_of_toks) #list_of_toks = [x for x in list_of_toks if "'" not in x] for index, w in enumerate(list_of_toks): try: the_gram = [list_of_toks[index+x] for x in range(gramsize)] if not any(re.search(pattern, x) for x in the_gram): continue ngrams[' '.join(the_gram)] += 1 except IndexError: pass # turn counter into list of results for k, v in list(ngrams.items()): if v > 1: for i in range(v): result.append(k) if countmode: return(len(result)) else: return result def compiler(pattern): """compile regex or fail gracefully""" import re try: if case_sensitive: comped = re.compile(pattern) else: comped = re.compile(pattern, re.IGNORECASE) return comped except: import traceback import sys from time import localtime, strftime exc_type, exc_value, exc_traceback = sys.exc_info() lst = traceback.format_exception(exc_type, exc_value, exc_traceback) error_message = lst[-1] thetime = strftime("%H:%M:%S", localtime()) print('%s: Query %s' % (thetime, error_message)) if root: return 'Bad query' else: raise ValueError('%s: Query %s' % (thetime, error_message)) def tok_by_reg(pattern, list_of_toks, concordancing = False, **kwargs): """search for regex in plaintext corpora""" import re comped = compiler(pattern) if comped == 'Bad query': return 'Bad query' if not concordancing: matches = [m for m in list_of_toks if re.search(comped, m)] else: matches = [] for index, token in enumerate(list_of_toks): if re.search(comped, token): match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]] match.append(re.search(comped, token).group(0)) match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140]) matches.append(match) if countmode: return(len(matches)) else: return matches def plaintext_regex_search(pattern, plaintext_data, concordancing = False, **kwargs): """search for regex in plaintext corpora it searches over lines, so the user needs to be careful. """ import re if concordancing: pattern = r'(.{,140})\b(' + pattern + r')\b(.{,140})' compiled_pattern = compiler(pattern) if compiled_pattern == 'Bad query': return 'Bad query' matches = re.findall(compiled_pattern, plaintext_data) if concordancing: matches = [list(m) for m in matches] if not concordancing: for index, i in enumerate(matches): if type(i) == tuple: matches[index] = i[0] if countmode: return(len(matches)) else: return matches def correct_spelling(a_string): if not spelling: return a_string from dictionaries.word_transforms import usa_convert if spelling.lower() == 'uk': usa_convert = {v: k for k, v in list(usa_convert.items())} spell_out = [] bits = a_string.split('/') for index, i in enumerate(bits): converted = usa_convert.get(i.lower(), i) if i.islower() or preserve_case is False: converted = converted.lower() elif i.isupper() and preserve_case: converted = converted.upper() elif i.istitle() and preserve_case: converted = converted.title() bits[index] = converted r = '/'.join(bits) return r def plaintext_simple_search(pattern, plaintext_data, concordancing = False, **kwargs): """search for tokens in plaintext corpora""" import re result = [] if type(pattern) == str: pattern = [pattern] for p in pattern: if concordancing: pat = r'(.{0,140})\b(' + re.escape(p) + r')\b(.{0,140})' pat = compiler(pat) if pat == 'Bad query': return 'Bad query' matches = re.findall(pat, plaintext_data) if concordancing: matches = [list(m) for m in matches] for i in matches: result.append(i) else: for m in range(len(matches)): result.append(p) return result # do multiprocessing if need be im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers) locs['search'] = search locs['query'] = query locs['just_speakers'] = just_speakers locs['corpus'] = corpus locs['multiprocess'] = multiprocess if im: signal.signal(signal.SIGINT, original_sigint) from multiprocess import pmultiquery return pmultiquery(**locs) datatype = corpus.datatype singlefile = corpus.singlefile # store all results in here results = {} count_results = {} conc_results = {} # check if just counting countmode = 'c' in show if countmode: no_conc = True only_conc = False # where we are at in interrogation current_iter = 0 # multiprocessing progress bar denom = kwargs.get('denominator', 1) startnum = kwargs.get('startnum', 0) ############################################ # Determine the search function to be used # ############################################ # simple tregex is tregex over whole dirs simple_tregex_mode = False statsmode = False if not just_speakers and 't' in list(search.keys()): simple_tregex_mode = True else: if corpus.datatype == 'plaintext': if search.get('n'): raise NotImplementedError('Use a tokenised corpus for n-gramming.') #searcher = plaintext_ngram optiontext = 'n-grams via plaintext' if search.get('w'): if kwargs.get('regex', True): searcher = plaintext_regex_search else: searcher = plaintext_simple_search optiontext = 'Searching plaintext' elif corpus.datatype == 'tokens': if search.get('n'): searcher = tok_ngrams optiontext = 'n-grams via tokens' elif search.get('w'): if kwargs.get('regex', True): searcher = tok_by_reg else: searcher = tok_by_list if type(search.get('w')) == list: searcher = tok_by_list optiontext = 'Searching tokens' only_parse = ['r', 'd', 'g', 'dl', 'gl', 'df', 'gf', 'dp', 'gp', 'f', 'd2', 'd2f', 'd2p', 'd2l'] if corpus.datatype != 'parse' and any(i in only_parse for i in list(search.keys())): raise ValueError('Need parsed corpus to search with "%s" option(s).' % ', '.join([i for i in list(search.keys()) if i in only_parse])) elif corpus.datatype == 'parse': if search.get('t'): searcher = slow_tregex elif search.get('s'): searcher = get_stats statsmode = True optiontext = 'General statistics' global numdone numdone = 0 no_conc = True only_conc = False do_concordancing = False else: from depsearch import dep_searcher searcher = dep_searcher optiontext = 'Dependency querying' ############################################ # Set some Tregex-related values # ############################################ if search.get('t'): translated_option = 't' query = search.get('t') # check the query q = tregex_engine(corpus = False, query = search.get('t'), options = ['-t'], check_query = True, root = root) if query is False: if root: return 'Bad query' else: return optiontext = 'Searching parse trees' if 'p' in show or 'pl' in show: translated_option = 'u' if type(search['t']) == list: search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)' elif 't' in show: translated_option = 'o' if type(search['t']) == list: search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)' elif 'w' in show: translated_option = 't' if type(search['t']) == list: search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'/.?[A-Za-z0-9].?/ !< __' elif 'c' in show: only_count = True translated_option = 'C' if type(search['t']) == list: search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'/.?[A-Za-z0-9].?/ !< __' elif 'l' in show: translated_option = 't' if type(search['t']) == list: search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'/.?[A-Za-z0-9].?/ !< __' query = search['t'] ############################################ # Make iterable for corpus/subcorpus/file # ############################################ if corpus.singlefile: to_iterate_over = {(corpus.name, corpus.path): [corpus]} elif not corpus.subcorpora: to_iterate_over = {(corpus.name, corpus.path): corpus.files} else: to_iterate_over = {} for subcorpus in corpus.subcorpora: to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files #for k, v in sorted(corpus.structure.items(), key=lambda obj: obj[0].name): # to_iterate_over[(k.name, k.path)] = v if files_as_subcorpora: to_iterate_over = {} for f in corpus.files: to_iterate_over[(f.name, f.path)] = [f] ############################################ # Print welcome message # ############################################ if no_conc: message = 'Interrogating' else: message = 'Interrogating and concordancing' if kwargs.get('printstatus', True): thetime = strftime("%H:%M:%S", localtime()) sformat = '\n '.join(['%s: %s' % (k.rjust(3), v) for k, v in list(search.items())]) if search == {'s': r'.*'}: sformat = 'features' welcome = '\n%s: %s %s ...\n %s\n Query: %s\n %s corpus ... \n' % \ (thetime, message, corpus.name, optiontext, sformat, message) print(welcome) ############################################ # Make progress bar # ############################################ if simple_tregex_mode: total_files = len(list(to_iterate_over.keys())) else: if search.get('s'): total_files = sum([len(x) for x in list(to_iterate_over.values())]) * 12 else: total_files = sum([len(x) for x in list(to_iterate_over.values())]) par_args = {'printstatus': kwargs.get('printstatus', True), 'root': root, 'note': note, 'length': total_files, 'startnum': kwargs.get('startnum'), 'denom': kwargs.get('denominator', 1)} term = None if kwargs.get('paralleling', None) is not None: from blessings import Terminal term = Terminal() par_args['terminal'] = term par_args['linenum'] = kwargs.get('paralleling') outn = kwargs.get('outname', '') if outn: outn = outn + ': ' tstr = '%s%d/%d' % (outn, current_iter, total_files) p = animator(None, None, init = True, tot_string = tstr, **par_args) tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) ############################################ # Iterate over data, doing interrogations # ############################################ for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()): conc_results[subcorpus_name] = [] count_results[subcorpus_name] = [] results[subcorpus_name] = Counter() # tregex over subcorpora, not files if simple_tregex_mode: op = ['-o', '-' + translated_option] result = tregex_engine(query = search['t'], options = op, corpus = subcorpus_path, root = root, preserve_case = preserve_case) if not countmode: result = format_tregex(result) if not no_conc: op += ['-w', '-f'] whole_result = tregex_engine(query = search['t'], options = op, corpus = subcorpus_path, root = root, preserve_case = preserve_case) if not only_format_match: whole_result = format_tregex(whole_result, whole = True) conc_result = make_conc_lines_from_whole_mid(whole_result, result, speakr = False) if countmode: count_results[subcorpus_name] += [result] else: result = Counter(result) results[subcorpus_name] += result if not no_conc: for lin in conc_result: if numconc < maxconc or not maxconc: conc_results[subcorpus_name].append(lin) numconc += 1 current_iter += 1 if kwargs.get('paralleling', None) is not None: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) else: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) # dependencies, plaintext, tokens or slow_tregex else: for f in files: slow_treg_speaker_guess = kwargs.get('outname', False) if corpus.datatype == 'parse': with open(f.path, 'r') as data: data = data.read() from corenlp_xml.document import Document try: corenlp_xml = Document(data) except: print('Could not read file: %s' % f.path) continue if just_speakers: sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers] if len(just_speakers) == 1: slow_treg_speaker_guess = just_speakers[0] if not sents: continue else: sents = corenlp_xml.sentences res, conc_res = searcher(sents, search = search, show = show, dep_type = dep_type, exclude = exclude, excludemode = excludemode, searchmode = searchmode, lemmatise = False, case_sensitive = case_sensitive, do_concordancing = do_concordancing, only_format_match = only_format_match, speaker = slow_treg_speaker_guess) if res == 'Bad query': return 'Bad query' elif corpus.datatype == 'tokens': import pickle with codecs.open(f.path, "rb") as fo: data = pickle.load(fo) if not only_conc: res = searcher(list(search.values())[0], data, split_contractions = split_contractions, concordancing = False) if not no_conc: conc_res = searcher(list(search.values())[0], data, split_contractions = split_contractions, concordancing = True) if not no_conc: for index, line in enumerate(conc_res): line.insert(0, '') elif corpus.datatype == 'plaintext': with codecs.open(f.path, 'rb', encoding = 'utf-8') as data: data = data.read() if not only_conc: res = searcher(list(search.values())[0], data, concordancing = False) if not no_conc: conc_res = searcher(list(search.values())[0], data, concordancing = True) if not no_conc: for index, line in enumerate(conc_res): line.insert(0, '') if countmode: count_results[subcorpus_name] += [res] else: # add filename and do lowercasing for conc if not no_conc: for index, line in enumerate(conc_res): if searcher != slow_tregex: line.insert(0, f.name) else: line[0] = f.name if not preserve_case: line[3:] = [x.lower() for x in line[3:]] if spelling: line = [correct_spelling(b) for b in line] if numconc < maxconc or not maxconc: conc_results[subcorpus_name].append(line) numconc += 1 # do lowercasing and spelling if not only_conc: if not preserve_case: if not statsmode: res = [i.lower() for i in res] if spelling: if not statsmode: res = [correct_spelling(r) for r in res] #if not statsmode: results[subcorpus_name] += Counter(res) #else: #results[subcorpus_name] += res if not statsmode: current_iter += 1 if kwargs.get('paralleling', None) is not None: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) else: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) # delete temp file if there import os if os.path.isfile('tmp.txt'): os.remove('tmp.txt') ############################################ # Get concordances into DataFrame # ############################################ if not no_conc: all_conc_lines = [] for sc_name, resu in sorted(conc_results.items()): if only_unique: unique_results = uniquify(resu) else: unique_results = resu #make into series pindex = 'c f s l m r'.encode('utf-8').split() for fname, spkr, start, word, end in unique_results: #spkr = str(spkr, errors = 'ignore') fname = os.path.basename(fname) all_conc_lines.append(Series([sc_name, fname, \ spkr, \ start, \ word, \ end], \ index = pindex)) # randomise results... if random: from random import shuffle shuffle(all_conc_lines) conc_df = pd.concat(all_conc_lines, axis = 1).T # not doing anything yet --- this is for multimodal concordancing add_links = False if not add_links: conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r'] else: conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r', 'link'] if all(x == '' for x in list(conc_df['s'].values)): conc_df.drop('s', axis = 1, inplace = True) #if kwargs.get('note'): # kwargs['note'].progvar.set(100) #if kwargs.get('printstatus', True): # thetime = strftime("%H:%M:%S", localtime()) # finalstring = '\n\n%s: Concordancing finished! %d matches.\n' % (thetime, len(conc_df.index)) # print(finalstring) from interrogation import Concordance output = Concordance(conc_df) if only_conc: output.query = locs if quicksave: output.save() if kwargs.get('printstatus', True): thetime = strftime("%H:%M:%S", localtime()) finalstring = '\n\n%s: Concordancing finished! %d results.' % (thetime, len(conc_df)) print(finalstring) return output #output.query = locs #return output ############################################ # Get interrogation into DataFrame # ############################################ if not only_conc: if countmode: df = Series({k: sum(v) for k, v in sorted(count_results.items())}) tot = df.sum() else: the_big_dict = {} unique_results = set([item for sublist in list(results.values()) for item in sublist]) for word in unique_results: the_big_dict[word] = [subcorp_result[word] for name, subcorp_result in sorted(results.items(), key=lambda x: x[0])] # turn master dict into dataframe, sorted df = DataFrame(the_big_dict, index = sorted(results.keys())) numentries = len(df.columns) tot = df.sum(axis = 1) total_total = df.sum().sum() ############################################ # Format, output as Interrogation object # ############################################ if not countmode: if not corpus.subcorpora or singlefile: if not files_as_subcorpora: if not kwargs.get('df1_always_df'): df = Series(df.ix[0]) df.sort_values(ascending = False, inplace = True) tot = df.sum() numentries = len(df.index) total_total = tot # sort by total if type(df) == pd.core.frame.DataFrame: if not df.empty: df.ix['Total-tmp'] = df.sum() the_tot = df.ix['Total-tmp'] df = df[the_tot.argsort()[::-1]] df = df.drop('Total-tmp', axis = 0) # format final string if kwargs.get('printstatus', True): thetime = strftime("%H:%M:%S", localtime()) finalstring = '\n\n%s: Interrogation finished!' % thetime if countmode: finalstring += ' %d matches.' % tot else: finalstring += ' %d unique results, %d total occurrences.' % (numentries, total_total) print(finalstring) if not no_conc: interro = Interrogation(results = df, totals = tot, query = locs, concordance = output) else: interro = Interrogation(results = df, totals = tot, query = locs) if quicksave: interro.save() return interro
def get_stats(sents, **dummy_args): """get a bunch of frequencies on interpersonal phenomena""" import os import re from collections import Counter statsmode_results = Counter() # first, put the relevant trees into temp file if kwargs.get('outname'): to_open = 'tmp-%s.txt' % kwargs['outname'] else: to_open = 'tmp.txt' with open(to_open, "w") as fo: for sent in sents: statsmode_results['Sentences'] += 1 sts = sent.parse_string.rstrip() encd = sts.encode('utf-8', errors = 'ignore') + '\n' fo.write(encd) deps = get_deps(sent, dep_type) numpass = len([x for x in deps.links if x.type.endswith('pass')]) statsmode_results['Passives'] += numpass statsmode_results['Tokens'] += len(sent.tokens) words = [w.word for w in sent.tokens if w.word.isalnum()] statsmode_results['Words'] += len(words) statsmode_results['Characters'] += len(''.join(words)) # count moods via trees (/\?/ !< __) from dictionaries.process_types import processes from other import as_regex tregex_qs = {'Imperative': r'ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/', 'Open interrogative': r'ROOT < SBARQ <<- (/\?/ !< __)', 'Closed interrogative': r'ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))', 'Unmodalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))', 'Modalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))', 'Open class words': r'/^(NN|JJ|VB|RB)/ < __', 'Closed class words': r'__ !< __ !> /^(NN|JJ|VB|RB)/', 'Clauses': r'/^S/ < __', 'Interrogative': r'ROOT << (/\?/ !< __)', 'Mental processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.mental, boundaries = 'w'), 'Verbal processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.verbal, boundaries = 'w'), 'Relational processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.relational, boundaries = 'w') } for name, q in sorted(tregex_qs.items()): res = tregex_engine(query = q, options = ['-o', '-C'], corpus = to_open, root = root) statsmode_results[name] += int(res) global numdone numdone += 1 if root: root.update() else: tot_string = str(numdone + 1) + '/' + str(total_files) if kwargs.get('outname'): tot_string = '%s: %s' % (kwargs['outname'], tot_string) animator(p, numdone, tot_string, **par_args) if kwargs.get('note', False): kwargs['note'].progvar.set((numdone * 100.0 / total_files / denom) + startnum) os.remove(to_open) return statsmode_results, []
def interrogator(corpus, search, query = 'any', show = 'w', exclude = False, excludemode = 'any', searchmode = 'all', dep_type = 'collapsed-ccprocessed-dependencies', case_sensitive = False, save = False, just_speakers = False, preserve_case = False, lemmatag = False, files_as_subcorpora = False, only_unique = False, random = False, only_format_match = False, multiprocess = False, spelling = False, regex_nonword_filter = r'[A-Za-z0-9:_]', gramsize = 2, split_contractions = False, do_concordancing = False, maxconc = 9999, **kwargs): """interrogate corpus, corpora, subcorpus and file objects see corpkit.interrogation.interrogate() for docstring""" only_conc = False no_conc = False if do_concordancing is False: no_conc = True if type(do_concordancing) == str and do_concordancing.lower() == 'only': only_conc = True no_conc = False # iteratively count conc lines numconc = 0 # store kwargs locs = locals() if kwargs: for k, v in kwargs.items(): locs[k] = v locs.pop('kwargs', None) import corpkit from interrogation import Interrogation from corpus import Datalist, Corpora, Corpus, File from process import tregex_engine, get_deps import pandas as pd from pandas import DataFrame, Series from collections import Counter from other import as_regex from time import localtime, strftime from textprogressbar import TextProgressBar from process import animator from dictionaries.word_transforms import wordlist, taglemma import corenlp_xml import codecs import signal original_sigint = signal.getsignal(signal.SIGINT) if kwargs.get('paralleling', None) is None: original_sigint = signal.getsignal(signal.SIGINT) def signal_handler(signal, frame): """pause on ctrl+c, rather than just stop loop""" import signal import sys from time import localtime, strftime signal.signal(signal.SIGINT, original_sigint) thetime = strftime("%H:%M:%S", localtime()) try: sel = raw_input('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime) except NameError: sel = input('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime) time = strftime("%H:%M:%S", localtime()) print('%s: Interrogation resumed.\n' % time) signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGINT, signal_handler) # find out if using gui root = kwargs.get('root') note = kwargs.get('note') # convert path to corpus object if corpus.__class__ not in [Corpus, Corpora, File]: if not multiprocess and not kwargs.get('outname'): corpus = Corpus(corpus, print_info = False) # figure out how the user has entered the query and normalise from process import searchfixer search = searchfixer(search, query) if 'l' in show and search.get('t'): from nltk.stem.wordnet import WordNetLemmatizer lmtzr=WordNetLemmatizer() if type(show) == str: show = [show] def is_multiquery(corpus, search, query, just_speakers): """determine if multiprocessing is needed do some retyping if need be as well""" im = False from collections import OrderedDict #if hasattr(corpus, '__iter__'): # im = True # so we can do search = 't', query = ['NP', 'VP']: if type(query) == list: if query != list(search.values())[0] or len(list(search.keys())) > 1: query = {c.title(): c for c in query} if type(query) == dict or type(query) == OrderedDict: im = True if just_speakers: if just_speakers == 'each': im = True just_speakers = ['each'] if just_speakers == ['each']: im = True if type(just_speakers) == str: im = False just_speakers = [just_speakers] if type(just_speakers) == list: if len(just_speakers) > 1: im = True if type(search) == dict: if all(type(i) == dict for i in list(search.values())): im = True return im, corpus, search, query, just_speakers def slow_tregex(sents, **dummy_args): """do the speaker-specific version of tregex queries""" speakr = dummy_args.get('speaker', False) import os from process import tregex_engine # first, put the relevant trees into temp file if kwargs.get('outname'): to_open = 'tmp-%s.txt' % kwargs['outname'] else: to_open = 'tmp.txt' to_write = '\n'.join([sent._parse_string.strip() for sent in sents \ if sent.parse_string is not None]) to_write.encode('utf-8', errors = 'ignore') with open(to_open, "w") as fo: encd = to_write.encode('utf-8', errors = 'ignore') + '\n' fo.write(encd) q = list(search.values())[0] ops = ['-o', '-%s' % translated_option] concs = [] res = tregex_engine(query = q, options = ops, corpus = to_open, root = root, preserve_case = True) if not no_conc: ops += ['-w', '-f'] whole_res = tregex_engine(query = q, options = ops, corpus = to_open, root = root, preserve_case = True) res = format_tregex(res) whole_res = format_tregex(whole_res, whole = True) concs = make_conc_lines_from_whole_mid(whole_res, res, speakr) if root: root.update() try: os.remove(to_open) except OSError: pass if countmode: return(len(res)) else: return res, concs def get_stats(sents, **dummy_args): """get a bunch of frequencies on interpersonal phenomena""" import os import re from collections import Counter statsmode_results = Counter() # first, put the relevant trees into temp file if kwargs.get('outname'): to_open = 'tmp-%s.txt' % kwargs['outname'] else: to_open = 'tmp.txt' with open(to_open, "w") as fo: for sent in sents: statsmode_results['Sentences'] += 1 sts = sent.parse_string.rstrip() encd = sts.encode('utf-8', errors = 'ignore') + '\n' fo.write(encd) deps = get_deps(sent, dep_type) numpass = len([x for x in deps.links if x.type.endswith('pass')]) statsmode_results['Passives'] += numpass statsmode_results['Tokens'] += len(sent.tokens) words = [w.word for w in sent.tokens if w.word.isalnum()] statsmode_results['Words'] += len(words) statsmode_results['Characters'] += len(''.join(words)) # count moods via trees (/\?/ !< __) from dictionaries.process_types import processes from other import as_regex tregex_qs = {'Imperative': r'ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/', 'Open interrogative': r'ROOT < SBARQ <<- (/\?/ !< __)', 'Closed interrogative': r'ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))', 'Unmodalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))', 'Modalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))', 'Open class words': r'/^(NN|JJ|VB|RB)/ < __', 'Closed class words': r'__ !< __ !> /^(NN|JJ|VB|RB)/', 'Clauses': r'/^S/ < __', 'Interrogative': r'ROOT << (/\?/ !< __)', 'Mental processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.mental, boundaries = 'w'), 'Verbal processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.verbal, boundaries = 'w'), 'Relational processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.relational, boundaries = 'w') } for name, q in sorted(tregex_qs.items()): res = tregex_engine(query = q, options = ['-o', '-C'], corpus = to_open, root = root) statsmode_results[name] += int(res) global numdone numdone += 1 if root: root.update() else: tot_string = str(numdone + 1) + '/' + str(total_files) if kwargs.get('outname'): tot_string = '%s: %s' % (kwargs['outname'], tot_string) animator(p, numdone, tot_string, **par_args) if kwargs.get('note', False): kwargs['note'].progvar.set((numdone * 100.0 / total_files / denom) + startnum) os.remove(to_open) return statsmode_results, [] def make_conc_lines_from_whole_mid(wholes, middle_column_result, speakr = False): import re, os if speakr is False: speakr = '' conc_lines = [] # remove duplicates from results unique_wholes = [] unique_middle_column_result = [] duplicates = [] for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)): if '-join-'.join([f, whole, mid]) not in duplicates: duplicates.append('-join-'.join([f, whole, mid])) unique_wholes.append([f, whole]) unique_middle_column_result.append(mid) # split into start, middle and end, dealing with multiple occurrences for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)): reg = re.compile(r'([^a-zA-Z0-9-]|^)(' + re.escape(mid) + r')([^a-zA-Z0-9-]|$)', re.IGNORECASE | re.UNICODE) offsets = [(m.start(), m.end()) for m in re.finditer(reg,whole)] for offstart, offend in offsets: start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip() conc_lines.append([os.path.basename(f), speakr, start, middle, end]) return conc_lines def uniquify(conc_lines): from collections import OrderedDict unique_lines = [] checking = [] for index, (f, speakr, start, middle, end) in enumerate(conc_lines): joined = ' '.join([speakr, start, 'MIDDLEHERE:', middle, ':MIDDLEHERE', end]) if joined not in checking: unique_lines.append(conc_lines[index]) checking.append(joined) return unique_lines def lemmatiser(list_of_words, tag): """take a list of unicode words and a tag and return a lemmatised list.""" output = [] for word in list_of_words: if translated_option.startswith('u'): if word.lower() in list(taglemma.keys()): word = taglemma[word.lower()] else: if word == 'x': word = 'Other' # only use wordnet lemmatiser when appropriate else: if word in wordlist: word = wordlist[word] word = lmtzr.lemmatize(word, tag) output.append(word) return output def gettag(query, lemmatag = False): """ Find tag for WordNet lemmatisation """ import re tagdict = {'N': 'n', 'A': 'a', 'V': 'v', 'A': 'r', 'None': False, '': False, 'Off': False} if lemmatag is False: tag = 'n' # same default as wordnet # attempt to find tag from tregex query tagfinder = re.compile(r'^[^A-Za-z]*([A-Za-z]*)') tagchecker = re.compile(r'^[A-Z]{1,4}$') qr = query.replace(r'\w', '').replace(r'\s', '').replace(r'\b', '') treebank_tag = re.findall(tagfinder, qr) if re.match(tagchecker, treebank_tag[0]): tag = tagdict.get(treebank_tag[0], 'n') elif lemmatag: tag = lemmatag return tag def format_tregex(results, whole = False): """format tregex by show list""" if countmode: return results import re done = [] if whole: fnames = [x for x, y in results] results = [y for x, y in results] if 'l' in show or 'pl' in show: lemmata = lemmatiser(results, gettag(search.get('t'), lemmatag)) else: lemmata = [None for i in results] for word, lemma in zip(results, lemmata): bits = [] if exclude and exclude.get('w'): if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('w'), word): continue if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('l'), lemma): continue if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('p'), word): continue if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('pl'), lemma): continue if exclude and excludemode == 'all': num_to_cause_exclude = len(list(exclude.keys())) current_num = 0 if exclude.get('w'): if re.search(exclude.get('w'), word): current_num += 1 if exclude.get('l'): if re.search(exclude.get('l'), lemma): current_num += 1 if exclude.get('p'): if re.search(exclude.get('p'), word): current_num += 1 if exclude.get('pl'): if re.search(exclude.get('pl'), lemma): current_num += 1 if current_num == num_to_cause_exclude: continue for i in show: if i == 't': bits.append(word) if i == 'l': bits.append(lemma) elif i == 'w': bits.append(word) elif i == 'p': bits.append(word) elif i == 'pl': bits.append(lemma) joined = '/'.join(bits) done.append(joined) if whole: done = zip(fnames, done) return done def tok_by_list(pattern, list_of_toks, concordancing = False, **kwargs): """search for regex in plaintext corpora""" import re if type(pattern) == str: pattern = [pattern] if not case_sensitive: pattern = [p.lower() for p in pattern] if not concordancing: if case_sensitive: matches = [m for m in list_of_toks if m in pattern] else: matches = [m for m in list_of_toks if m.lower() in pattern] else: matches = [] for index, token in enumerate(list_of_toks): if token in pattern: match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]] match.append(token) match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140]) matches.append(match) if countmode: return(len(matches)) else: return matches def unsplitter(lst): """unsplit contractions and apostophes from tokenised text""" if split_contractions: return lst unsplit = [] for index, t in enumerate(lst): if index == 0 or index == len(lst) - 1: unsplit.append(t) continue if "'" in t and not t.endswith("'"): rejoined = ''.join([lst[index - 1], t]) unsplit.append(rejoined) else: if not "'" in lst[index + 1]: unsplit.append(t) return unsplit def tok_ngrams(pattern, list_of_toks, concordancing = False, split_contractions = True): from collections import Counter import re ngrams = Counter() result = [] # if it's not a compiled regex list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)] if pattern.lower() == 'any': pattern = r'.*' if not split_contractions: list_of_toks = unsplitter(list_of_toks) #list_of_toks = [x for x in list_of_toks if "'" not in x] for index, w in enumerate(list_of_toks): try: the_gram = [list_of_toks[index+x] for x in range(gramsize)] if not any(re.search(pattern, x) for x in the_gram): continue ngrams[' '.join(the_gram)] += 1 except IndexError: pass # turn counter into list of results for k, v in list(ngrams.items()): if v > 1: for i in range(v): result.append(k) if countmode: return(len(result)) else: return result def compiler(pattern): """compile regex or fail gracefully""" import re try: if case_sensitive: comped = re.compile(pattern) else: comped = re.compile(pattern, re.IGNORECASE) return comped except: import traceback import sys from time import localtime, strftime exc_type, exc_value, exc_traceback = sys.exc_info() lst = traceback.format_exception(exc_type, exc_value, exc_traceback) error_message = lst[-1] thetime = strftime("%H:%M:%S", localtime()) print('%s: Query %s' % (thetime, error_message)) if root: return 'Bad query' else: raise ValueError('%s: Query %s' % (thetime, error_message)) def tok_by_reg(pattern, list_of_toks, concordancing = False, **kwargs): """search for regex in plaintext corpora""" import re comped = compiler(pattern) if comped == 'Bad query': return 'Bad query' if not concordancing: matches = [m for m in list_of_toks if re.search(comped, m)] else: matches = [] for index, token in enumerate(list_of_toks): if re.search(comped, token): match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]] match.append(re.search(comped, token).group(0)) match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140]) matches.append(match) if countmode: return(len(matches)) else: return matches def plaintext_regex_search(pattern, plaintext_data, concordancing = False, **kwargs): """search for regex in plaintext corpora it searches over lines, so the user needs to be careful. """ import re if concordancing: pattern = r'(.{,140})\b(' + pattern + r')\b(.{,140})' compiled_pattern = compiler(pattern) if compiled_pattern == 'Bad query': return 'Bad query' matches = re.findall(compiled_pattern, plaintext_data) if concordancing: matches = [list(m) for m in matches] if not concordancing: for index, i in enumerate(matches): if type(i) == tuple: matches[index] = i[0] if countmode: return(len(matches)) else: return matches def correct_spelling(a_string): if not spelling: return a_string from dictionaries.word_transforms import usa_convert if spelling.lower() == 'uk': usa_convert = {v: k for k, v in list(usa_convert.items())} spell_out = [] bits = a_string.split('/') for index, i in enumerate(bits): converted = usa_convert.get(i.lower(), i) if i.islower() or preserve_case is False: converted = converted.lower() elif i.isupper() and preserve_case: converted = converted.upper() elif i.istitle() and preserve_case: converted = converted.title() bits[index] = converted r = '/'.join(bits) return r def plaintext_simple_search(pattern, plaintext_data, concordancing = False, **kwargs): """search for tokens in plaintext corpora""" import re result = [] if type(pattern) == str: pattern = [pattern] for p in pattern: if concordancing: pat = r'(.{0,140})\b(' + re.escape(p) + r')\b(.{0,140})' pat = compiler(pat) if pat == 'Bad query': return 'Bad query' matches = re.findall(pat, plaintext_data) if concordancing: matches = [list(m) for m in matches] for i in matches: result.append(i) else: for m in range(len(matches)): result.append(p) return result # do multiprocessing if need be im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers) if hasattr(corpus, '__iter__') and im: corpus = Corpus(corpus) if hasattr(corpus, '__iter__') and not im: im = True if corpus.__class__ == Corpora: im = True if not im and multiprocess: im = True corpus = corpus[:] # if it's already been through pmultiquery, don't do it again locs['search'] = search locs['query'] = query locs['just_speakers'] = just_speakers locs['corpus'] = corpus locs['multiprocess'] = multiprocess locs['print_info'] = kwargs.get('printstatus', True) if im: signal.signal(signal.SIGINT, original_sigint) from multiprocess import pmultiquery return pmultiquery(**locs) cname = corpus.name subcorpora = corpus.subcorpora try: datatype = corpus.datatype singlefile = corpus.singlefile except AttributeError: datatype = 'parse' singlefile = False # store all results in here results = {} count_results = {} conc_results = {} # check if just counting countmode = 'c' in show if countmode: no_conc = True only_conc = False # where we are at in interrogation current_iter = 0 # multiprocessing progress bar denom = kwargs.get('denominator', 1) startnum = kwargs.get('startnum', 0) ############################################ # Determine the search function to be used # ############################################ # simple tregex is tregex over whole dirs simple_tregex_mode = False statsmode = False if not just_speakers and 't' in list(search.keys()): simple_tregex_mode = True else: if datatype == 'plaintext': if search.get('n'): raise NotImplementedError('Use a tokenised corpus for n-gramming.') #searcher = plaintext_ngram optiontext = 'n-grams via plaintext' if search.get('w'): if kwargs.get('regex', True): searcher = plaintext_regex_search else: searcher = plaintext_simple_search optiontext = 'Searching plaintext' elif datatype == 'tokens': if search.get('n'): searcher = tok_ngrams optiontext = 'n-grams via tokens' elif search.get('w'): if kwargs.get('regex', True): searcher = tok_by_reg else: searcher = tok_by_list if type(search.get('w')) == list: searcher = tok_by_list optiontext = 'Searching tokens' only_parse = ['r', 'd', 'g', 'dl', 'gl', 'df', 'gf', 'dp', 'gp', 'f', 'd2', 'd2f', 'd2p', 'd2l'] if datatype != 'parse' and any(i in only_parse for i in list(search.keys())): raise ValueError('Need parsed corpus to search with "%s" option(s).' % ', '.join([i for i in list(search.keys()) if i in only_parse])) elif datatype == 'parse': if search.get('t'): searcher = slow_tregex elif search.get('s'): searcher = get_stats statsmode = True optiontext = 'General statistics' global numdone numdone = 0 no_conc = True only_conc = False do_concordancing = False else: from depsearch import dep_searcher searcher = dep_searcher optiontext = 'Dependency querying' ############################################ # Set some Tregex-related values # ############################################ if search.get('t'): translated_option = 't' query = search.get('t') # check the query q = tregex_engine(corpus = False, query = search.get('t'), options = ['-t'], check_query = True, root = root) if query is False: if root: return 'Bad query' else: return optiontext = 'Searching parse trees' if 'p' in show or 'pl' in show: translated_option = 'u' if type(search['t']) == list: search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)' elif 't' in show: translated_option = 'o' if type(search['t']) == list: search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)' elif 'w' in show: translated_option = 't' if type(search['t']) == list: search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'/.?[A-Za-z0-9].?/ !< __' elif 'c' in show: only_count = True translated_option = 'C' if type(search['t']) == list: search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'/.?[A-Za-z0-9].?/ !< __' elif 'l' in show: translated_option = 't' if type(search['t']) == list: search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'/.?[A-Za-z0-9].?/ !< __' query = search['t'] ############################################ # Make iterable for corpus/subcorpus/file # ############################################ if corpus.__class__ == Datalist: to_iterate_over = {} for subcorpus in corpus: to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files elif singlefile: to_iterate_over = {(corpus.name, corpus.path): [corpus]} elif not subcorpora: to_iterate_over = {(corpus.name, corpus.path): corpus.files} else: to_iterate_over = {} for subcorpus in subcorpora: to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files #for k, v in sorted(corpus.structure.items(), key=lambda obj: obj[0].name): # to_iterate_over[(k.name, k.path)] = v if files_as_subcorpora: to_iterate_over = {} for f in corpus.files: to_iterate_over[(f.name, f.path)] = [f] ############################################ # Print welcome message # ############################################ if no_conc: message = 'Interrogating' else: message = 'Interrogating and concordancing' if kwargs.get('printstatus', True): thetime = strftime("%H:%M:%S", localtime()) sformat = '\n '.join(['%s: %s' % (k.rjust(3), v) for k, v in list(search.items())]) if search == {'s': r'.*'}: sformat = 'features' welcome = '\n%s: %s %s ...\n %s\n Query: %s\n %s corpus ... \n' % \ (thetime, message, cname, optiontext, sformat, message) print(welcome) ############################################ # Make progress bar # ############################################ if simple_tregex_mode: total_files = len(list(to_iterate_over.keys())) else: if search.get('s'): total_files = sum([len(x) for x in list(to_iterate_over.values())]) * 12 else: total_files = sum([len(x) for x in list(to_iterate_over.values())]) par_args = {'printstatus': kwargs.get('printstatus', True), 'root': root, 'note': note, 'length': total_files, 'startnum': kwargs.get('startnum'), 'denom': kwargs.get('denominator', 1)} term = None if kwargs.get('paralleling', None) is not None: from blessings import Terminal term = Terminal() par_args['terminal'] = term par_args['linenum'] = kwargs.get('paralleling') outn = kwargs.get('outname', '') if outn: outn = outn + ': ' tstr = '%s%d/%d' % (outn, current_iter, total_files) p = animator(None, None, init = True, tot_string = tstr, **par_args) tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) ############################################ # Iterate over data, doing interrogations # ############################################ for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()): conc_results[subcorpus_name] = [] count_results[subcorpus_name] = [] results[subcorpus_name] = Counter() # tregex over subcorpora, not files if simple_tregex_mode: op = ['-o', '-' + translated_option] result = tregex_engine(query = search['t'], options = op, corpus = subcorpus_path, root = root, preserve_case = preserve_case) if not countmode: result = format_tregex(result) if not no_conc: op += ['-w', '-f'] whole_result = tregex_engine(query = search['t'], options = op, corpus = subcorpus_path, root = root, preserve_case = preserve_case) if not only_format_match: whole_result = format_tregex(whole_result, whole = True) conc_result = make_conc_lines_from_whole_mid(whole_result, result, speakr = False) if countmode: count_results[subcorpus_name] += [result] else: result = Counter(result) results[subcorpus_name] += result if not no_conc: for lin in conc_result: if numconc < maxconc or not maxconc: conc_results[subcorpus_name].append(lin) numconc += 1 current_iter += 1 if kwargs.get('paralleling', None) is not None: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) else: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) # dependencies, plaintext, tokens or slow_tregex else: for f in files: slow_treg_speaker_guess = kwargs.get('outname', False) if datatype == 'parse': with open(f.path, 'r') as data: data = data.read() from corenlp_xml.document import Document try: corenlp_xml = Document(data) except: print('Could not read file: %s' % f.path) continue if just_speakers: sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers] if len(just_speakers) == 1: slow_treg_speaker_guess = just_speakers[0] if not sents: continue else: sents = corenlp_xml.sentences res, conc_res = searcher(sents, search = search, show = show, dep_type = dep_type, exclude = exclude, excludemode = excludemode, searchmode = searchmode, lemmatise = False, case_sensitive = case_sensitive, do_concordancing = do_concordancing, only_format_match = only_format_match, speaker = slow_treg_speaker_guess) if res == 'Bad query': return 'Bad query' elif datatype == 'tokens': import pickle with codecs.open(f.path, "rb") as fo: data = pickle.load(fo) if not only_conc: res = searcher(list(search.values())[0], data, split_contractions = split_contractions, concordancing = False) if not no_conc: conc_res = searcher(list(search.values())[0], data, split_contractions = split_contractions, concordancing = True) if not no_conc: for index, line in enumerate(conc_res): line.insert(0, '') elif datatype == 'plaintext': with codecs.open(f.path, 'rb', encoding = 'utf-8') as data: data = data.read() if not only_conc: res = searcher(list(search.values())[0], data, concordancing = False) if not no_conc: conc_res = searcher(list(search.values())[0], data, concordancing = True) if not no_conc: for index, line in enumerate(conc_res): line.insert(0, '') if countmode: count_results[subcorpus_name] += [res] else: # add filename and do lowercasing for conc if not no_conc: for index, line in enumerate(conc_res): if searcher != slow_tregex: line.insert(0, f.name) else: line[0] = f.name if not preserve_case: line[3:] = [x.lower() for x in line[3:]] if spelling: line = [correct_spelling(b) for b in line] if numconc < maxconc or not maxconc: conc_results[subcorpus_name].append(line) numconc += 1 # do lowercasing and spelling if not only_conc: if not preserve_case: if not statsmode: res = [i.lower() for i in res] if spelling: if not statsmode: res = [correct_spelling(r) for r in res] #if not statsmode: results[subcorpus_name] += Counter(res) #else: #results[subcorpus_name] += res if not statsmode: current_iter += 1 if kwargs.get('paralleling', None) is not None: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) else: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) # delete temp file if there import os if os.path.isfile('tmp.txt'): os.remove('tmp.txt') ############################################ # Get concordances into DataFrame # ############################################ if not no_conc: all_conc_lines = [] for sc_name, resu in sorted(conc_results.items()): if only_unique: unique_results = uniquify(resu) else: unique_results = resu #make into series pindex = 'c f s l m r'.encode('utf-8').split() for fname, spkr, start, word, end in unique_results: #spkr = str(spkr, errors = 'ignore') fname = os.path.basename(fname) all_conc_lines.append(Series([sc_name, fname, \ spkr, \ start, \ word, \ end], \ index = pindex)) # randomise results... if random: from random import shuffle shuffle(all_conc_lines) conc_df = pd.concat(all_conc_lines, axis = 1).T # not doing anything yet --- this is for multimodal concordancing add_links = False if not add_links: conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r'] else: conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r', 'link'] if all(x == '' for x in list(conc_df['s'].values)): conc_df.drop('s', axis = 1, inplace = True) #if kwargs.get('note'): # kwargs['note'].progvar.set(100) #if kwargs.get('printstatus', True): # thetime = strftime("%H:%M:%S", localtime()) # finalstring = '\n\n%s: Concordancing finished! %d matches.\n' % (thetime, len(conc_df.index)) # print(finalstring) from interrogation import Concordance output = Concordance(conc_df) if only_conc: output.query = locs if save: output.save(save) if kwargs.get('printstatus', True): thetime = strftime("%H:%M:%S", localtime()) finalstring = '\n\n%s: Concordancing finished! %d results.' % (thetime, len(conc_df)) print(finalstring) signal.signal(signal.SIGINT, original_sigint) return output #output.query = locs #return output ############################################ # Get interrogation into DataFrame # ############################################ if not only_conc: if countmode: df = Series({k: sum(v) for k, v in sorted(count_results.items())}) tot = df.sum() else: the_big_dict = {} unique_results = set([item for sublist in list(results.values()) for item in sublist]) for word in unique_results: the_big_dict[word] = [subcorp_result[word] for name, subcorp_result in sorted(results.items(), key=lambda x: x[0])] # turn master dict into dataframe, sorted df = DataFrame(the_big_dict, index = sorted(results.keys())) numentries = len(df.columns) tot = df.sum(axis = 1) total_total = df.sum().sum() ############################################ # Format, output as Interrogation object # ############################################ if not countmode: if not subcorpora or singlefile: if not files_as_subcorpora: if not kwargs.get('df1_always_df'): df = Series(df.ix[0]) df.sort_values(ascending = False, inplace = True) tot = df.sum() numentries = len(df.index) total_total = tot # sort by total if type(df) == pd.core.frame.DataFrame: if not df.empty: df.ix['Total-tmp'] = df.sum() the_tot = df.ix['Total-tmp'] df = df[the_tot.argsort()[::-1]] df = df.drop('Total-tmp', axis = 0) # format final string if kwargs.get('printstatus', True): thetime = strftime("%H:%M:%S", localtime()) finalstring = '\n\n%s: Interrogation finished!' % thetime if countmode: finalstring += ' %d matches.' % tot else: finalstring += ' %d unique results, %d total occurrences.' % (numentries, total_total) print(finalstring) if not no_conc: interro = Interrogation(results = df, totals = tot, query = locs, concordance = output) else: interro = Interrogation(results = df, totals = tot, query = locs) if save: interro.save(save) signal.signal(signal.SIGINT, original_sigint) return interro
def conc(corpus, query, option = 'tregex', dep_function = 'any', dep_type = 'basic-dependencies', n = 100, random = False, window = 100, trees = False, plaintext = False, #'guess', add_links = False, show_links = False, print_status = True, print_output = True, just_speakers = False, root = False, **kwargs): """A concordancer for Tregex queries and dependencies""" import corpkit import os import re import pandas as pd from pandas import DataFrame from time import localtime, strftime try: from IPython.display import display, clear_output except ImportError: pass from corpkit.other import tregex_engine from corpkit.tests import check_pytex, check_dit try: get_ipython().getoutput() except TypeError: have_ipython = True except NameError: import subprocess have_ipython = False if query == 'any': query = r'.*' # convert list to query if type(query) == list: from other import as_regex if option.startswith('t'): query = r'/%s/ !< __' % as_regex(query, boundaries = 'line') else: query = as_regex(query, boundaries = 'w') can_do_fast = False if option.startswith('t'): if just_speakers is False: can_do_fast = True just_speakers_is_list = False if type(just_speakers) == list: just_speakers_is_list = True if type(just_speakers) == str: if just_speakers.lower() != 'all': just_speakers = [just_speakers] def get_deps(sentence, dep_type): if dep_type == 'basic-dependencies': return sentence.basic_dependencies if dep_type == 'collapsed-dependencies': return sentence.collapsed_dependencies if dep_type == 'collapsed-ccprocessed-dependencies': return sentence.collapsed_ccprocessed_dependencies conc_lines = [] if option.startswith('t'): if trees: options = '-s' else: options = '-t' if can_do_fast: speakr = '' tregex_engine(query = query, check_query = True, root = root) wholes = tregex_engine(query = query, options = ['-o', '-w', '-f', options], corpus = corpus, preserve_case = True, root = root) middle_column_result = tregex_engine(query = query, options = ['-o', options], corpus = corpus, preserve_case = True, root = root) for (f, whole), mid in zip(wholes, middle_column_result): reg = re.compile(r'(' + re.escape(mid) + r')', re.IGNORECASE) start, middle, end = re.split(reg, whole, 1) conc_lines.append([os.path.basename(f), speakr, start, middle, end]) else: fs_to_conc = [] for r, dirs, fs in os.walk(corpus): for f in fs: if not os.path.isfile(os.path.join(r, f)): continue if not f.endswith('.txt') and not f.endswith('.xml'): continue fs_to_conc.append(os.path.join(r, f)) def normalise(concline): import re reg = re.compile(r'\([^ ]+') spaces = re.compile(r'\s+') concline = re.sub(reg, '', concline) concline = re.sub(spaces, ' ', concline) concline = concline.replace(')', '').replace(' ', ' ') return concline.strip() num_fs = len(fs_to_conc) for index, filepath in enumerate(fs_to_conc): f = os.path.basename(filepath) if num_fs > 1: if 'note' in kwargs.keys(): kwargs['note'].progvar.set((index) * 100.0 / num_fs) from time import localtime, strftime thetime = strftime("%H:%M:%S", localtime()) print '%s: Extracting data from %s ...' % (thetime, f) if root: root.update() with open(filepath, "rb") as text: parsetreedict = {} data = text.read() if option.startswith('p') or option.startswith('l'): if option.startswith('l'): lstokens = pickle.load(open(filepath, 'rb')) data = ' '.join(tokens) data = data.split(' . ') else: lines = data.splitlines() for l in lines: m = re.compile(r'^(.*?)(' + query + r')(.*)$', re.IGNORECASE) mat = re.search(m, l) if mat: conc_lines.append([f, '', mat.group(1), mat.group(2), mat.group(3)]) continue from corenlp_xml.document import Document corenlp_xml = Document(data) #corenlp_xml = Beautifulcorenlp_xml(data, parse_only=justsents) if just_speakers: for s in just_speakers: parsetreedict[s] = [] sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers] #sents = [s for s in corenlp_xml.find_all('sentence') \ #if s.speakername.text.strip() in just_speakers] else: sents = corenlp_xml.sentences nsents = len(sents) for i, s in enumerate(sents): if num_fs == 1: if 'note' in kwargs.keys(): kwargs['note'].progvar.set((index) * 100.0 / nsents) if root: root.update() try: speakr = s.speakername.strip() except: speakr = '' parsetree = s.parse_string if option.startswith('t'): parsetreedict[speakr].append(parsetree) continue elif option.startswith('d'): #right_dependency_grammar = s.find_all('dependencies', type=dep_type, limit = 1) deps = get_deps(s, dep_type) if dep_function == 'any' or dep_function is False: wdsmatching = [l.dependent.text.strip() for l in deps.links \ if re.match(query, l.dependent.text.strip())] else: comped = re.compile(dep_function, re.IGNORECASE) #goodsent = any(re.match(query, l.dependent.text.strip()) for l in deps.links if re.match(comped, l.type.strip())) wdsmatching = [l.dependent.text.strip() for l in deps.links \ if re.match(comped, l.type.strip()) and \ re.match(query, l.dependent.text.strip())] # this is shit, needs indexing or something for wd in wdsmatching: line = normalise(parsetree) start, middle, end = re.split(r'(' + wd + r')', line, 1) conc_lines.append([f, speakr, start, middle, end]) if option.startswith('t'): for speakr, dt in parsetreedict.items(): trees_as_string = '\n'.join(dt) if trees: options = '-s' else: options = '-t' with open('tmp.txt', 'w') as fo: fo.write(trees_as_string.encode('utf-8', errors = 'ignore')) tregex_engine(query = query, check_query = True, root = root) wholes = tregex_engine(query = query, options = ['-o', '-w', options], corpus = 'tmp.txt', preserve_case = True, root = root) middle_column_result = tregex_engine(query = query, options = ['-o', options], corpus = 'tmp.txt', preserve_case = True, root = root) for whole, mid in zip(wholes, middle_column_result): reg = re.compile(r'(' + re.escape(mid) + r')', re.IGNORECASE) start, middle, end = re.split(reg, whole, 1) conc_lines.append([f, speakr, start, middle, end]) # does not keep results ordered! try: os.remove('tmp.txt') except: pass unique_results = [list(x) for x in set(tuple(x) for x in conc_lines)] #make into series series = [] pindex = 'f s l m r'.encode('utf-8').split() for fname, spkr, start, word, end in unique_results: import os fname = os.path.basename(fname) start = start.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',').replace(' .', '.').replace("'' ", "''").replace(" n't", "n't").replace(" 're","'re").replace(" 'm","'m").replace(" 's","'s").replace(" 'd","'d").replace(" 'll","'ll").replace(' ', ' ') word = word.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',').replace(' .', '.').replace("'' ", "''").replace(" n't", "n't").replace(" 're","'re").replace(" 'm","'m").replace(" 's","'s").replace(" 'd","'d").replace(" 'll","'ll").replace(' ', ' ') end = end.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',').replace(' .', '.').replace("'' ", "''").replace(" n't", "n't").replace(" 're","'re").replace(" 'm","'m").replace(" 's","'s").replace(" 'd","'d").replace(" 'll","'ll").replace(' ', ' ') #spaces = ' ' * (maximum / 2 - (len(word) / 2)) #new_word = spaces + word + spaces series.append(pd.Series([fname.encode('utf-8', errors = 'ignore'), \ spkr.encode('utf-8', errors = 'ignore'), \ start.encode('utf-8', errors = 'ignore'), \ word.encode('utf-8', errors = 'ignore'), \ end.encode('utf-8', errors = 'ignore')], index = pindex)) # randomise results... if random: from random import shuffle shuffle(series) if series == []: if root: print 'No results found, sorry.' return else: raise ValueError("No results found, I'm afraid. Check your query and path.") df = pd.concat(series, axis = 1).T if not add_links: df.columns = ['f', 's', 'l', 'm', 'r'] else: df.columns = ['f', 's', 'l', 'm', 'r', 'link'] if all(x == '' for x in list(df['s'].values)): df.drop('s', axis = 1, inplace = True) formatl = lambda x: "{0}".format(x[-window:]) formatf = lambda x: "{0}".format(x[-20:]) #formatr = lambda x: formatr = lambda x: "{{:<{}s}}".format(df['r'].str.len().max()).format(x[:window]) st = df.head(n).to_string(header = False, formatters={'l': formatl, 'r': formatr, 'f': formatf}).splitlines() # hack because i can't figure out formatter: rem = '\n'.join([re.sub('\s*\.\.\.\s*$', '', s) for s in st]) if print_output: print rem if 'note' in kwargs.keys(): kwargs['note'].progvar.set(100) return df if add_links: def _add_links(lines, links = False, show = 'thread'): link = "http://www.healthboards.com/boards/bipolar-disorder/695089-labels.html" linktext = '<a href="%s>link</a>' % link import pandas as pd inds = list(df.index) num_objects = len(list(df.index)) ser = pd.Series([link for n in range(num_objects)], index = inds) lines['link'] = ser return lines df = _add_links(df) if add_links: if not show_links: if print_output: print df.drop('link', axis = 1).head(n).to_string(header = False, formatters={rname:'{{:<{}s}}'.format(df[rname].str.len().max()).format}) else: if print_output: print HTML(df.to_html(escape=False)) else: if print_output: print df.head(n).to_string(header = False, formatters={rname:'{{:<{}s}}'.format(df[rname].str.len().max()).format}) if not add_links: df.columns = ['f', 'l', 'm', 'r'] else: df.columns = ['f', 'l', 'm', 'r', 'link'] return df
def conc(corpus, option = 'tregex', query = 'any', dep_function = 'any', dep_type = 'collapsed-ccprocessed-dependencies', n = 100, random = False, split_sents = True, window = 100, trees = False, plaintext = False, add_links = False, show_links = False, print_status = True, print_output = True, just_speakers = False, root = False, **kwargs): """ A concordancer for Tregex queries and dependencies. * Revisions forthcoming to facilitate better dependency querying :returns: a Pandas DataFrame containing concordance lines""" import corpkit import os import re import pandas as pd from pandas import DataFrame from time import localtime, strftime try: from IPython.display import display, clear_output except ImportError: pass from corpkit.other import tregex_engine from corpkit.tests import check_pytex, check_dit try: get_ipython().getoutput() except TypeError: have_ipython = True except NameError: import subprocess have_ipython = False if query == 'any': query = r'.*' # convert list to query if type(query) == list: from other import as_regex if option.startswith('t'): query = r'/%s/ !< __' % as_regex(query, boundaries = 'line') else: query = as_regex(query, boundaries = 'w') can_do_fast = False if option.startswith('t'): if just_speakers is False: can_do_fast = True just_speakers_is_list = False if type(just_speakers) == list: just_speakers_is_list = True if type(just_speakers) == str: if just_speakers.lower() != 'all': just_speakers = [just_speakers] def get_deps(sentence, dep_type): if dep_type == 'basic-dependencies': return sentence.basic_dependencies if dep_type == 'collapsed-dependencies': return sentence.collapsed_dependencies if dep_type == 'collapsed-ccprocessed-dependencies': return sentence.collapsed_ccprocessed_dependencies conc_lines = [] if option.startswith('t'): if trees: options = '-s' else: options = '-t' if can_do_fast: speakr = '' tregex_engine(query = query, check_query = True, root = root) wholes = tregex_engine(query = query, options = ['-o', '-w', '-f', options], corpus = corpus, preserve_case = True, root = root) middle_column_result = tregex_engine(query = query, options = ['-o', options], corpus = corpus, preserve_case = True, root = root) for (f, whole), mid in zip(wholes, middle_column_result): reg = re.compile(r'(' + re.escape(mid) + r')', re.IGNORECASE) start, middle, end = re.split(reg, whole, 1) conc_lines.append([os.path.basename(f), speakr, start, middle, end]) else: if query.startswith(r'\b'): query = query[2:] if query.endswith(r'\b'): query = query[:-2] fs_to_conc = [] for r, dirs, fs in os.walk(corpus): for f in fs: if not os.path.isfile(os.path.join(r, f)): continue if not f.endswith('.txt') and not f.endswith('.xml') and not f.endswith('.p'): continue fs_to_conc.append(os.path.join(r, f)) def normalise(concline): import re reg = re.compile(r'\([^ ]+') spaces = re.compile(r'\s+') concline = re.sub(reg, '', concline) concline = re.sub(spaces, ' ', concline) concline = concline.replace(')', '').replace(' ', ' ') return concline.strip() num_fs = len(fs_to_conc) for index, filepath in enumerate(fs_to_conc): f = os.path.basename(filepath) if num_fs > 1: if 'note' in kwargs.keys(): kwargs['note'].progvar.set((index) * 100.0 / num_fs) if print_status: from time import localtime, strftime thetime = strftime("%H:%M:%S", localtime()) print '%s: Extracting data from %s ...' % (thetime, f) if root: root.update() with open(filepath, "r") as text: parsetreedict = {} data = text.read() if option.startswith('p'): import chardet enc = chardet.detect(data) data = unicode(data, enc['encoding'], errors = 'ignore') if option.startswith('p') or option.startswith('l'): if option.startswith('l'): import pickle try: lstokens = pickle.load(open(filepath, 'rb')) except EOFError: thetime = strftime("%H:%M:%S", localtime()) print '%s: File "%s" could not be opened.' % (thetime, os.path.basename(filepath)) data = ' '.join(lstokens) if split_sents: lines = data.split(' . ') else: lines = [data.replace('\n', '')] else: if split_sents: lines = data.splitlines() else: lines = [data.replace('\n', '')] for l in lines: if split_sents: m = re.compile(r'(?i)^(.*?)(\b' + query + r'\b)(.*)$', re.UNICODE) else: m = re.compile(r'(?i)(.{,%s})(\b' % window + query + r'\b)(.{,%s})' % window, re.UNICODE) if split_sents: mat = re.search(m, l) else: mat = re.findall(m, l) if split_sents: if mat: last_num = len(mat.groups()) conc_lines.append([f, '', mat.group(1), mat.group(2), mat.group(last_num)]) else: if mat: #print len(mat) for ent in mat: #print len(ent) last_num = len(ent) - 1 conc_lines.append([f, '', ent[0], ent[1], ent[last_num]]) if any(f.endswith('.xml') for f in fs_to_conc): from corenlp_xml.document import Document corenlp_xml = Document(data) #corenlp_xml = Beautifulcorenlp_xml(data, parse_only=justsents) if just_speakers: for s in just_speakers: parsetreedict[s] = [] sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers] #sents = [s for s in corenlp_xml.find_all('sentence') \ #if s.speakername.text.strip() in just_speakers] else: sents = corenlp_xml.sentences nsents = len(sents) for i, s in enumerate(sents): if num_fs == 1: if 'note' in kwargs.keys(): kwargs['note'].progvar.set((index) * 100.0 / nsents) if root: root.update() try: speakr = s.speakername.strip() except: speakr = '' parsetree = s.parse_string if option.startswith('t'): parsetreedict[speakr].append(parsetree) continue elif option.startswith('d'): try: compiled_query = re.compile(query) except: import traceback import sys exc_type, exc_value, exc_traceback = sys.exc_info() lst = traceback.format_exception(exc_type, exc_value, exc_traceback) error_message = lst[-1] thetime = strftime("%H:%M:%S", localtime()) print '%s: Query %s' % (thetime, error_message) return #right_dependency_grammar = s.find_all('dependencies', type=dep_type, limit = 1) deps = get_deps(s, dep_type) if dep_function == 'any' or dep_function is False: wdsmatching = [l.dependent.text.strip() for l in deps.links \ if re.match(query, l.dependent.text.strip())] else: comped = re.compile(dep_function, re.IGNORECASE) #goodsent = any(re.match(query, l.dependent.text.strip()) for l in deps.links if re.match(comped, l.type.strip())) wdsmatching = [l.dependent.text.strip() for l in deps.links \ if re.match(comped, l.type.strip()) and \ re.match(query, l.dependent.text.strip())] # this is shit, needs indexing or something for wd in wdsmatching: line = normalise(parsetree) try: start, middle, end = re.split(r'(' + wd + r')', line, 1) except ValueError: continue conc_lines.append([f, speakr, start, middle, end]) if option.startswith('t'): for speakr, dt in parsetreedict.items(): trees_as_string = '\n'.join(dt) if trees: options = '-s' else: options = '-t' with open('tmp.txt', 'w') as fo: fo.write(trees_as_string.encode('utf-8', errors = 'ignore')) tregex_engine(query = query, check_query = True, root = root) wholes = tregex_engine(query = query, options = ['-o', '-w', options], corpus = 'tmp.txt', preserve_case = True, root = root) middle_column_result = tregex_engine(query = query, options = ['-o', options], corpus = 'tmp.txt', preserve_case = True, root = root) for whole, mid in zip(wholes, middle_column_result): reg = re.compile(r'(' + re.escape(mid) + r')', re.IGNORECASE) start, middle, end = re.split(reg, whole, 1) conc_lines.append([f, speakr, start, middle, end]) # does not keep results ordered! try: os.remove('tmp.txt') except: pass unique_results = [list(x) for x in set(tuple(x) for x in conc_lines)] #make into series series = [] pindex = 'f s l m r'.encode('utf-8').split() for fname, spkr, start, word, end in unique_results: spkr = unicode(spkr, errors = 'ignore') fname = os.path.basename(fname) start = start.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',').replace(' .', '.').replace("'' ", "''").replace(" n't", "n't").replace(" 're","'re").replace(" 'm","'m").replace(" 's","'s").replace(" 'd","'d").replace(" 'll","'ll").replace(' ', ' ') word = word.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',').replace(' .', '.').replace("'' ", "''").replace(" n't", "n't").replace(" 're","'re").replace(" 'm","'m").replace(" 's","'s").replace(" 'd","'d").replace(" 'll","'ll").replace(' ', ' ') end = end.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',').replace(' .', '.').replace("'' ", "''").replace(" n't", "n't").replace(" 're","'re").replace(" 'm","'m").replace(" 's","'s").replace(" 'd","'d").replace(" 'll","'ll").replace(' ', ' ') #spaces = ' ' * (maximum / 2 - (len(word) / 2)) #new_word = spaces + word + spaces # the use of ascii here makes sure the string formats ok, but will also screw over # anyone doing non-english work. so, change to utf-8, then fix errors as they come # in the corpkit-gui "add_conc_lines_to_window" function series.append(pd.Series([fname.encode('ascii', errors = 'ignore'), \ spkr.encode('ascii', errors = 'ignore'), \ start.encode('ascii', errors = 'ignore'), \ word.encode('ascii', errors = 'ignore'), \ end.encode('ascii', errors = 'ignore')], index = pindex)) # randomise results... if random: from random import shuffle shuffle(series) if series == []: if root: print 'No results found, sorry.' return else: raise ValueError("No results found, I'm afraid. Check your query and path.") df = pd.concat(series, axis = 1).T if not add_links: df.columns = ['f', 's', 'l', 'm', 'r'] else: df.columns = ['f', 's', 'l', 'm', 'r', 'link'] if all(x == '' for x in list(df['s'].values)): df.drop('s', axis = 1, inplace = True) if 'note' in kwargs.keys(): kwargs['note'].progvar.set(100) if print_output: formatl = lambda x: "{0}".format(x[-window:]) formatf = lambda x: "{0}".format(x[-20:]) #formatr = lambda x: formatr = lambda x: "{{:<{}s}}".format(df['r'].str.len().max()).format(x[:window]) st = df.head(n).to_string(header = False, formatters={'l': formatl, 'r': formatr, 'f': formatf}).splitlines() # hack because i can't figure out formatter: rem = '\n'.join([re.sub('\s*\.\.\.\s*$', '', s) for s in st]) print rem return df