def searchtree(tree, query, options = ['-t', '-o']):
    """Searches a tree with Tregex and returns matching terminals"""
    import os
    from corpkit.other import tregex_engine
    from corpkit.tests import check_dit
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False
    fo = open('tree.tmp', "w")
    fo.write(tree + '\n')
    fo.close()
    result = tregex_engine(query = query, check_query = True)
    result = tregex_engine(query = query, options = options, corpus = "tree.tmp")
    os.remove("tree.tmp")
    return result
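
# Hypothetical usage sketch for searchtree(): assumes corpkit and the Stanford
# Tregex tools are installed and configured; the bracketed tree and query below
# are illustrative only.
#
#     tree = '(ROOT (S (NP (DT The) (NN dog)) (VP (VBZ barks))))'
#     terminals = searchtree(tree, r'/NN.?/ < __')
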
def make_nltk_text(directory,
                   collapse_dirs = True,
                   tagged = False,
                   lemmatise = False,
                   just_content_words = False):
    """turn a lot of trees into an nltk style text"""
    import nltk
    import os
    from corpkit.other import tregex_engine
    if type(directory) == str:
        dirs = [os.path.join(directory, d) for d in os.listdir(directory)
                if os.path.isdir(os.path.join(directory, d))]
        if len(dirs) == 0:
            dirs = [directory]
    elif type(directory) == list:
        dirs = directory

    return_tuples = False
    if tagged:
        return_tuples = True

    if just_content_words:
        lemmatise = True

    query = r'__ < (/.?[A-Za-z0-9].?/ !< __)'
    if not return_tuples and not lemmatise:
        options = ['-o', '-t']
    else:
        options = ['-o']

    # filthy code.
    all_out = []
    for d in dirs:
        print "Flattening %s ... " % str(d)
        res = tregex_engine(corpus = d,
                            query = query,
                            options = options,
                            lemmatise = lemmatise,
                            just_content_words = just_content_words,
                            return_tuples = return_tuples)
        all_out.append(res)

    if collapse_dirs:
        tmp = []
        for res in all_out:
            for w in res:
                tmp.append(w)
        all_out = tmp
        textx = nltk.Text(all_out)
    else:
        textx = {}
        for name, text in zip(dirs, all_out):
            # use only this subcorpus's results, not the whole lot
            t = nltk.Text(text)
            textx[os.path.basename(name)] = t
    return textx
def make_nltk_text(directory):
    """turn a lot of trees into an nltk style text"""
    import nltk
    import os
    from corpkit.other import tregex_engine
    if type(directory) == str:
        dirs = [os.path.join(directory, d) for d in os.listdir(directory)
                if os.path.isdir(os.path.join(directory, d))]
        if len(dirs) == 0:
            dirs = [directory]
    elif type(directory) == list:
        dirs = directory
    out = []
    for d in dirs:
        print d
        res = tregex_engine(corpus = d, query = 'ROOT < __', options = ['-w', '-t'])
        for r in res:
            out.append(r)
    print 'Tokenising ...'
    as_string = '\n'.join(out)
    as_list_of_tokens = nltk.word_tokenize(as_string)
    text = nltk.Text(as_list_of_tokens)
    return text
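
# Hypothetical usage sketch for make_nltk_text(): 'data/postcounts' is an
# illustrative path to a parsed corpus of subcorpora; nltk.Text then provides
# the usual exploration methods.
#
#     text = make_nltk_text('data/postcounts')
#     text.concordance('risk')
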
def conc(corpus, query, n = 100, random = False, window = 40,
         trees = False, plaintext = 'guess', add_links = False, show_links = False):
    """A concordancer for Tregex queries over trees or regexes over plain text"""
    import os
    import re
    import pandas as pd
    from pandas import DataFrame
    from time import localtime, strftime
    try:
        from IPython.display import display, clear_output, HTML
    except ImportError:
        pass
    from corpkit.other import tregex_engine
    from corpkit.tests import check_pytex, check_dit
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False

    # convert list to query
    if type(query) == list:
        from corpkit.other import as_regex
        query = r'/%s/ !< __' % as_regex(query, boundaries = 'line')

    # lazy, daniel!
    if window == 'all':
        window = 9999

    # check query
    good_tregex_query = tregex_engine(query, check_query = True)
    if good_tregex_query is False:
        return

    # make sure there's a corpus
    if not os.path.exists(corpus):
        raise ValueError('Corpus file or folder not found: %s' % corpus)

    # welcome message
    time = strftime("%H:%M:%S", localtime())
    print "\n%s: Getting concordances for %s ... \n Query: %s\n" % (time, corpus, query)
    output = []

    if plaintext == 'guess':
        if not tregex_engine(corpus = corpus, check_for_trees = True):
            plaintext = True
        else:
            plaintext = False

    if trees:
        options = '-s'
    else:
        options = '-t'

    if not plaintext:
        whole_results = tregex_engine(query,
                                      options = ['-o', '-w', options],
                                      corpus = corpus)
        middle_column_result = tregex_engine(query,
                                             options = ['-o', options],
                                             corpus = corpus)

    if plaintext:
        import nltk
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        whole_results = []
        middle_column_result = []
        small_regex = re.compile(query)
        big_regex = re.compile(r'.*' + query + r'.*')
        fs = [os.path.join(corpus, f) for f in os.listdir(corpus)]
        # do recursive if need
        if any(os.path.isdir(f) for f in fs):
            recursive_files = []
            for dirname, dirnames, filenames in os.walk(corpus):
                for filename in filenames:
                    recursive_files.append(os.path.join(dirname, filename))
            fs = recursive_files
        for f in fs:
            raw = open(f).read().replace('\n', ' ')
            # encoding ... ?
            sents = sent_tokenizer.tokenize(raw)
            for sent in sents:
                try:
                    # search within the sentence so each match is paired with it
                    for match in re.findall(small_regex, sent):
                        middle_column_result.append(match)
                        whole_results.append(sent)
                except:
                    continue

    try:
        # get longest middle column result, or discover no results and raise error
        maximum = len(max(middle_column_result, key=len))
    except ValueError:
        time = strftime("%H:%M:%S", localtime())
        print "\n%s: No matches found." % time
        return

    zipped = zip(whole_results, middle_column_result)
    unique_results = []

    for whole_result, middle_part in zipped:
        if not trees:
            regex = re.compile(r"(\b[^\s]{0,1}.{," + re.escape(str(window)) + r"})(\b" +
                               re.escape(middle_part) + r"\b)(.{," + re.escape(str(window)) + r"}[^\s]\b)")
        else:
            regex = re.compile(r"(.{,%s})(%s)(.{,%s})" % (window, re.escape(middle_part), window))
        search = re.findall(regex, whole_result)
        for result in search:
            unique_results.append(result)
    unique_results = set(sorted(unique_results)) # make unique

    # make into series
    series = []
    lname = ' ' * (window / 2 - 1) + 'l'
    # centering middle column
    #mname = ' ' * (maximum / 2 + 1) + 'm'
    mname = ' ' * (maximum / 2 - 1) + 'm'
    rname = ' ' * (window / 2 - 1) + 'r'
    for start, word, end in unique_results:
        #spaces = ' ' * (maximum / 2 - (len(word) / 2))
        #new_word = spaces + word + spaces
        series.append(pd.Series([start.encode('utf-8', errors = 'ignore'),
                                 word.encode('utf-8', errors = 'ignore'),
                                 end.encode('utf-8', errors = 'ignore')],
                                index = [lname.encode('utf-8', errors = 'ignore'),
                                         mname.encode('utf-8', errors = 'ignore'),
                                         rname.encode('utf-8', errors = 'ignore')]))

    # randomise results...
    if random:
        from random import shuffle
        shuffle(series)

    try:
        df = pd.concat(series, axis = 1).T
    except ValueError:
        raise ValueError("No results found, I'm afraid. Check your query and path.")

    if add_links:
        def _add_links(lines, links = False, show = 'thread'):
            link = "http://www.healthboards.com/boards/bipolar-disorder/695089-labels.html"
            linktext = '<a href="%s>link</a>' % link
            import pandas as pd
            inds = list(df.index)
            num_objects = len(list(df.index))
            ser = pd.Series([link for n in range(num_objects)], index = inds)
            lines['link'] = ser
            return lines
        df = _add_links(df)

    # make temporary
    pd.set_option('display.max_columns', 500)
    pd.set_option('max_colwidth', window * 2)
    pd.set_option('display.width', 1000)
    pd.set_option('expand_frame_repr', False)
    pd.set_option('colheader_justify', 'left')

    if add_links:
        if not show_links:
            print df.drop('link', axis = 1).head(n).to_string(header = False,
                  formatters={rname: '{{:<{}s}}'.format(df[rname].str.len().max()).format})
        else:
            print HTML(df.to_html(escape=False))
    else:
        print df.head(n).to_string(header = False,
              formatters={rname: '{{:<{}s}}'.format(df[rname].str.len().max()).format})

    if not add_links:
        df.columns = ['l', 'm', 'r']
    else:
        df.columns = ['l', 'm', 'r', 'link']
    return df

# r'/NN.?/ < /(?i)\brisk/ $ (/NN.?/ < /(?i)factor >># NP)'
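
# Hypothetical usage sketch for this earlier conc(): the corpus path is
# illustrative, and the query is the example noted above with its closing
# slash restored.
#
#     lines = conc('data/postcounts/2005',
#                  r'/NN.?/ < /(?i)\brisk/ $ (/NN.?/ < /(?i)factor/ >># NP)',
#                  n = 50, window = 30, random = True)
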
def dictmaker(path,
              dictname,
              query = 'any',
              dictpath = 'data/dictionaries',
              lemmatise = False,
              just_content_words = False,
              use_dependencies = False):
    """makes a pickle wordlist named dictname in dictpath"""
    import os
    import pickle
    import re
    import nltk
    from time import localtime, strftime
    from StringIO import StringIO
    import shutil
    from collections import Counter
    from corpkit.progressbar import ProgressBar
    from corpkit.other import tregex_engine
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False
    if lemmatise:
        dictname = dictname + '-lemmatised'
    if not dictname.endswith('.p'):
        dictname = dictname + '.p'

    # allow direct passing of dirs
    path_is_list = False
    one_big_corpus = False
    if type(path) == str:
        sorted_dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
        # if no subcorpora, just do the dir passed in
        if len(sorted_dirs) == 0:
            one_big_corpus = True
            sorted_dirs = [path]
    elif type(path) == list:
        path_is_list = True
        sorted_dirs = sorted(path)
        if type(sorted_dirs[0]) == int:
            sorted_dirs = [str(d) for d in sorted_dirs]

    try:
        sorted_dirs.sort(key=int)
    except:
        pass
    try:
        if not os.path.exists(dictpath):
            os.makedirs(dictpath)
    except IOError:
        print "Error making " + dictpath + "/ directory."

    while os.path.isfile(os.path.join(dictpath, dictname)):
        time = strftime("%H:%M:%S", localtime())
        selection = raw_input('\n%s: %s already exists in %s.\n'
                              ' You have the following options:\n\n'
                              ' a) save with a new name\n'
                              ' b) delete %s\n'
                              ' c) exit\n\nYour selection: ' % (time, dictname, dictpath,
                                                                os.path.join(dictpath, dictname)))
        if 'a' in selection:
            sel = raw_input('\nNew save name: ')
            dictname = sel
            if lemmatise:
                dictname = dictname.replace('-lemmatised.p', '')
                dictname = dictname + '-lemmatised'
            if not dictname.endswith('.p'):
                dictname = dictname + '.p'
        elif 'b' in selection:
            os.remove(os.path.join(dictpath, dictname))
        elif 'c' in selection:
            print ''
            return
        else:
            as_str = str(selection)
            print ' Choice "%s" not recognised.' % selection

    time = strftime("%H:%M:%S", localtime())
    print '\n%s: Extracting words from files ... \n' % time

    # all this just to get a list of files and make a better progress bar
    if use_dependencies:
        counts = []
        for d in sorted_dirs:
            if not one_big_corpus:
                subcorpus = os.path.join(path, d)
            else:
                subcorpus = path
            if use_dependencies:
                files = [f for f in os.listdir(subcorpus) if f.endswith('.xml')]
            else:
                files = [f for f in os.listdir(subcorpus)]
            counts.append(len(files))
        num_files = sum(counts)
        c = 0
        p = ProgressBar(num_files)
    else:
        p = ProgressBar(len(sorted_dirs))

    def tokener(xmldata):
        """get each token's word (or lemma) from CoreNLP XML, using good lemmatisation"""
        from bs4 import BeautifulSoup
        import gc
        open_classes = ['N', 'V', 'R', 'J']
        result = []
        just_good_deps = SoupStrainer('tokens')
        soup = BeautifulSoup(xmldata, parse_only=just_good_deps)
        for token in soup.find_all('token'):
            word = token.word.text
            query = re.compile(r'.*')
            if re.search(query, word):
                if lemmatise:
                    word = token.lemma.text
                    if just_content_words:
                        if not token.pos.text[0] in open_classes:
                            continue
                result.append(word)
        # attempt to stop memory problems.
        # not sure if this helps, though:
        soup.decompose()
        soup = None
        data = None
        gc.collect()
        return result

    # translate 'any' query
    if query == 'any':
        if lemmatise:
            query = r'__ <# (__ !< __)'
        else:
            query = r'__ !< __'
    if lemmatise:
        options = ['-o']
    else:
        options = ['-t', '-o']
    if use_dependencies:
        from bs4 import BeautifulSoup, SoupStrainer
        if query == 'any':
            query = r'.*'
        query = re.compile(query)

    allwords = []
    for index, d in enumerate(sorted_dirs):
        if not use_dependencies:
            p.animate(index)
        if not path_is_list:
            if len(sorted_dirs) == 1:
                subcorp = d
            else:
                subcorp = os.path.join(path, d)
        else:
            subcorp = d

        # check query first time through
        if not use_dependencies:
            if index == 0:
                trees_found = tregex_engine(corpus = subcorp, check_for_trees = True)
                if not trees_found:
                    lemmatise = False
                    dictname = dictname.replace('-lemmatised', '')
            if trees_found:
                results = tregex_engine(corpus = subcorp,
                                        options = options,
                                        query = query,
                                        lemmatise = lemmatise,
                                        just_content_words = just_content_words)
                for result in results:
                    allwords.append(result)
        elif use_dependencies:
            regex_nonword_filter = re.compile("[A-Za-z]")
            results = []
            fs = [os.path.join(subcorp, f) for f in os.listdir(subcorp)]
            for f in fs:
                p.animate(c, str(c) + '/' + str(num_files))
                c += 1
                data = open(f).read()
                result_from_a_file = tokener(data)
                for w in result_from_a_file:
                    if re.search(regex_nonword_filter, w):
                        allwords.append(w.lower())
        if not use_dependencies:
            if not trees_found:
                for f in os.listdir(subcorp):
                    raw = unicode(open(os.path.join(subcorp, f)).read(), 'utf-8', errors = 'ignore')
                    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
                    sents = sent_tokenizer.tokenize(raw)
                    tokenized_sents = [nltk.word_tokenize(i) for i in sents]
                    for sent in tokenized_sents:
                        for w in sent:
                            allwords.append(w.lower())

    # 100%
    p.animate(len(sorted_dirs))

    # make a dict
    dictionary = Counter(allwords)
    with open(os.path.join(dictpath, dictname), 'wb') as handle:
        pickle.dump(dictionary, handle)
    time = strftime("%H:%M:%S", localtime())
    print '\n\n' + time + ': Done! ' + dictname + ' created in ' + dictpath + '/'
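
# Hypothetical usage sketch for dictmaker(): the corpus path is illustrative.
# This would write data/dictionaries/postcounts-lemmatised.p, a pickled Counter
# of word frequencies.
#
#     dictmaker('data/postcounts', 'postcounts', query = 'any', lemmatise = True)
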
def datareader(data, plaintext = False, **kwargs):
    """
    Returns a string of plain text from a number of kinds of data.

    The kinds of data currently accepted are:

        path to corpus : all trees are flattened
        path to subcorpus : all trees are flattened
        conc() output (list of concordance lines)
        csv file generated with conc()
        a string of text
    """
    import os
    import pandas
    from corpkit.other import tregex_engine
    from corpkit.tests import check_dit
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False

    tregex_engine_used = False

    # if unicode, make it a string
    if type(data) == unicode:
        if not os.path.isdir(data):
            if not os.path.isfile(data):
                # not a path at all, so treat it as the text itself
                return data

    if type(data) == str:
        # if it's a file, read it
        if os.path.isfile(data):
            good = open(data).read()
        # if it's a dir, flatten all trees
        elif os.path.isdir(data):
            # get all sentences newline separated
            query = r'__ !< __'
            options = ['-o', '-t']

            # if lemmatise, we get each word on a newline
            if 'lemmatise' in kwargs:
                if kwargs['lemmatise'] is True:
                    query = r'__ <# (__ !< __)'
                    options = ['-o']

            # check for trees ...
            #while plaintext is False:
                #for f in first_twenty:
                    #plaintext = tregex_engine(corpus = f, check_for_trees = True)

            if not plaintext:
                tregex_engine_used = True
                results = tregex_engine(corpus = data,
                                        options = options,
                                        query = query,
                                        **kwargs)
            else:
                results = []
                fs = [os.path.join(data, f) for f in os.listdir(data)]
                # do recursive if need
                if any(os.path.isdir(f) for f in fs):
                    recursive_files = []
                    for dirname, dirnames, filenames in os.walk(data):
                        for filename in filenames:
                            recursive_files.append(os.path.join(dirname, filename))
                    fs = recursive_files
                import nltk
                sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
                for f in fs:
                    raw = unicode(open(f).read(), 'utf-8', errors = 'ignore')
                    sents = sent_tokenizer.tokenize(raw)
                    tokenized_sents = [nltk.word_tokenize(i) for i in sents]
                    for sent in tokenized_sents:
                        for w in sent:
                            results.append(w.lower())
            return results
            #good = '\n'.join(results)

        # if a string of text,
        else:
            good = data

    # if conc results, turn into string...
    elif type(data) == pandas.core.frame.DataFrame:
        # if conc lines:
        try:
            if list(data.columns) == ['l', 'm', 'r']:
                conc_lines = True
            else:
                conc_lines = False
        except:
            conc_lines = False
        if conc_lines:
            # may not be unicode!?
            good = [' '.join(list(data.ix[l])) for l in list(data.index)]
        else:
            good = data

    # make unicode
    if not tregex_engine_used:
        try:
            good = unicode(good, 'utf-8', errors = 'ignore')
        except TypeError:
            pass

    return good
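
# Hypothetical usage sketch for datareader(); both paths are illustrative.
#
#     text = datareader('data/postcounts/2005')   # flatten all trees in a subcorpus
#     text = datareader('notes.txt')              # read a plain text file as unicode
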
def eugener(path,
            query,
            depth = 5,
            top = 20,
            lemmatise = False,
            just_content_words = False,
            remove_query_from_output = False,
            remove_zero_depth = False,
            return_tags = False):
    """
    ***This is probably broken now, can fix if there's a use for it.***

    get most frequent words in corpus path to left and right of query regex

    path: path to corpus containing subcorpora
    query: regex to match word to be zero depth
    depth: number of places left and right to look
    top: number of most frequent entries to return
    lemmatise: wordnet lemmatisation
    just_content_words: keep only n, v, a, r tagged words
    remove_query_from_output: remove words matching the query regex from the output
    """
    import os
    import nltk
    import re
    from collections import Counter
    import pandas as pd
    from corpkit.progressbar import ProgressBar
    from corpkit.other import tregex_engine
    # manual lemmatisation here:
    from dictionaries.word_transforms import wordlist
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
        from IPython.display import display, clear_output
    except NameError:
        import subprocess
        have_ipython = False
    from corpkit.tests import check_dit # probably never needed
    if lemmatise:
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr = WordNetLemmatizer()

    regex = re.compile(query)
    wordregex = re.compile('[A-Za-z0-9]')

    print ''

    # get list of subcorpora
    dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
    sorted_dirs = sorted(dirs)
    # define risk word
    # place for our output
    dfs = {}
    p = ProgressBar(len(sorted_dirs))
    for index, corpus in enumerate(sorted_dirs):
        p.animate(index)
        # search the corpus for whole sents containing risk word
        subcorpus = os.path.join(path, corpus)
        if lemmatise:
            query = r'__ <# (__ !< __)'
        else:
            query = r'__ !> __'
        results = tregex_engine(query, ['-o'], subcorpus,
                                lemmatise = lemmatise,
                                just_content_words = just_content_words)

        # lowercase
        processed = [(r.lower(), tag) for r, tag in results]
        # remove punct
        processed = [w for w in processed if re.search(wordregex, w[0])]

        # a place for info about each corpus
        # word list to use later
        all_words = []
        dicts = []

        # go left and right depth times (for 2, makes [-2, -1, 0, 1, 2])
        for i in range(-depth, (depth + 1)):
            newdict = Counter()
            matching = []
            # go through each token
            for index, (token, tag) in enumerate(processed):
                # if token matches risk expression
                if re.search(regex, token):
                    # get the word at depth index
                    # try statement for cases where the target word index isn't there
                    try:
                        if i < 0:
                            num = index - abs(i)
                            if return_tags:
                                matching.append(processed[num][1])
                            else:
                                matching.append(processed[num][0])
                        else:
                            if return_tags:
                                matching.append(processed[index + i][1])
                            else:
                                matching.append(processed[index + i][0])
                    except:
                        pass
            # tally results
            counted = Counter(matching)
            # remove punctuation etc
            for key in counted:
                # commented because this stuff was moved earlier.
                #if key.isalnum():
                    #if key not in stopwords:
                    #if remove_stopwords:
                        #if key not in stopwords:
                            #newdict[key] = counted[key]
                    #else:
                        #newdict[key] = counted[key]
                newdict[key] = counted[key]
            for w in counted.keys():
                all_words.append(w)
            #top_tokens = newdict.most_common(top)
            dicts.append(newdict)

        # make pandas series
        sers = []
        # for each unique word
        for word in list(set(all_words)):
            # get counts for each depth
            series = [dct[word] for dct in dicts]
            # add a total
            series.append(sum([dct[word] for dct in dicts]))
            # make index names for depths plus total
            index_names = range(-depth, (depth + 1))
            index_names.append('Total')
            # turn into pandas data, and name the series the word
            ser = pd.Series(series, index = index_names)
            ser.name = word
            sers.append(ser)

        # concatenate series into dataframe
        df = pd.concat(sers, axis=1)

        # sort by total
        tot = df.ix['Total']
        df = df[tot.argsort()[::-1]]

        # remove words matching the regex if need be
        if remove_query_from_output:
            cols = [c for c in list(df.columns) if not re.search(regex, c)]
            df = pd.DataFrame(df[cols])
        # remove zero depth if need be
        if remove_zero_depth:
            df = df.drop(0, axis = 0)

        # just top entries
        df = pd.DataFrame(df[list(df.columns)[:top]])
        # transpose
        dfs[corpus] = df.T

    # complete animation, then clear
    p.animate(len(sorted_dirs))
    if have_ipython:
        clear_output()

    # some settings for good display
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)
    pd.set_option('expand_frame_repr', False)
    pd.set_option('colheader_justify', 'right')

    # print the start of each frame, then return them all
    for item in sorted(dfs):
        print item, '\n', dfs[item].head(), '\n'
    return dfs
def conc(corpus,
         query,
         option = 'tregex',
         dep_function = 'any',
         dep_type = 'basic-dependencies',
         n = 100,
         random = False,
         window = 100,
         trees = False,
         plaintext = False, #'guess',
         add_links = False,
         show_links = False,
         print_status = True,
         print_output = True,
         just_speakers = False,
         root = False,
         **kwargs):
    """A concordancer for Tregex queries and dependencies"""
    import corpkit
    import os
    import re
    import pandas as pd
    from pandas import DataFrame
    from time import localtime, strftime
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass
    from corpkit.other import tregex_engine
    from corpkit.tests import check_pytex, check_dit
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False

    if query == 'any':
        query = r'.*'

    # convert list to query
    if type(query) == list:
        from other import as_regex
        if option.startswith('t'):
            query = r'/%s/ !< __' % as_regex(query, boundaries = 'line')
        else:
            query = as_regex(query, boundaries = 'w')

    can_do_fast = False
    if option.startswith('t'):
        if just_speakers is False:
            can_do_fast = True

    just_speakers_is_list = False
    if type(just_speakers) == list:
        just_speakers_is_list = True
    if type(just_speakers) == str:
        if just_speakers.lower() != 'all':
            just_speakers = [just_speakers]

    def get_deps(sentence, dep_type):
        if dep_type == 'basic-dependencies':
            return sentence.basic_dependencies
        if dep_type == 'collapsed-dependencies':
            return sentence.collapsed_dependencies
        if dep_type == 'collapsed-ccprocessed-dependencies':
            return sentence.collapsed_ccprocessed_dependencies

    conc_lines = []
    if option.startswith('t'):
        if trees:
            options = '-s'
        else:
            options = '-t'

    if can_do_fast:
        speakr = ''
        tregex_engine(query = query, check_query = True, root = root)
        wholes = tregex_engine(query = query,
                               options = ['-o', '-w', '-f', options],
                               corpus = corpus,
                               preserve_case = True,
                               root = root)
        middle_column_result = tregex_engine(query = query,
                                             options = ['-o', options],
                                             corpus = corpus,
                                             preserve_case = True,
                                             root = root)
        for (f, whole), mid in zip(wholes, middle_column_result):
            reg = re.compile(r'(' + re.escape(mid) + r')', re.IGNORECASE)
            start, middle, end = re.split(reg, whole, 1)
            conc_lines.append([os.path.basename(f), speakr, start, middle, end])
    else:
        fs_to_conc = []
        for r, dirs, fs in os.walk(corpus):
            for f in fs:
                if not os.path.isfile(os.path.join(r, f)):
                    continue
                if not f.endswith('.txt') and not f.endswith('.xml'):
                    continue
                fs_to_conc.append(os.path.join(r, f))

        def normalise(concline):
            import re
            reg = re.compile(r'\([^ ]+')
            spaces = re.compile(r'\s+')
            concline = re.sub(reg, '', concline)
            concline = re.sub(spaces, ' ', concline)
            concline = concline.replace(')', '').replace('  ', ' ')
            return concline.strip()

        num_fs = len(fs_to_conc)
        for index, filepath in enumerate(fs_to_conc):
            f = os.path.basename(filepath)
            if num_fs > 1:
                if 'note' in kwargs.keys():
                    kwargs['note'].progvar.set((index) * 100.0 / num_fs)
            from time import localtime, strftime
            thetime = strftime("%H:%M:%S", localtime())
            print '%s: Extracting data from %s ...' % (thetime, f)
            if root:
                root.update()
            with open(filepath, "rb") as text:
                parsetreedict = {}
                data = text.read()
                if option.startswith('p') or option.startswith('l'):
                    if option.startswith('l'):
                        import pickle
                        lstokens = pickle.load(open(filepath, 'rb'))
                        data = ' '.join(lstokens)
                        lines = data.split(' . ')
                    else:
                        lines = data.splitlines()
                    for l in lines:
                        m = re.compile(r'^(.*?)(' + query + r')(.*)$', re.IGNORECASE)
                        mat = re.search(m, l)
                        if mat:
                            conc_lines.append([f, '', mat.group(1), mat.group(2), mat.group(3)])
                    continue
                from corenlp_xml.document import Document
                corenlp_xml = Document(data)
                #corenlp_xml = Beautifulcorenlp_xml(data, parse_only=justsents)
                if just_speakers:
                    for s in just_speakers:
                        parsetreedict[s] = []
                    sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers]
                    #sents = [s for s in corenlp_xml.find_all('sentence') \
                    #if s.speakername.text.strip() in just_speakers]
                else:
                    sents = corenlp_xml.sentences
                nsents = len(sents)
                for i, s in enumerate(sents):
                    if num_fs == 1:
                        if 'note' in kwargs.keys():
                            kwargs['note'].progvar.set((index) * 100.0 / nsents)
                    if root:
                        root.update()
                    try:
                        speakr = s.speakername.strip()
                    except:
                        speakr = ''
                    parsetree = s.parse_string
                    if option.startswith('t'):
                        parsetreedict[speakr].append(parsetree)
                        continue
                    elif option.startswith('d'):
                        #right_dependency_grammar = s.find_all('dependencies', type=dep_type, limit = 1)
                        deps = get_deps(s, dep_type)
                        if dep_function == 'any' or dep_function is False:
                            wdsmatching = [l.dependent.text.strip() for l in deps.links \
                                           if re.match(query, l.dependent.text.strip())]
                        else:
                            comped = re.compile(dep_function, re.IGNORECASE)
                            #goodsent = any(re.match(query, l.dependent.text.strip()) for l in deps.links if re.match(comped, l.type.strip()))
                            wdsmatching = [l.dependent.text.strip() for l in deps.links \
                                           if re.match(comped, l.type.strip()) and \
                                           re.match(query, l.dependent.text.strip())]
                        # this is shit, needs indexing or something
                        for wd in wdsmatching:
                            line = normalise(parsetree)
                            start, middle, end = re.split(r'(' + wd + r')', line, 1)
                            conc_lines.append([f, speakr, start, middle, end])
                if option.startswith('t'):
                    for speakr, dt in parsetreedict.items():
                        trees_as_string = '\n'.join(dt)
                        if trees:
                            options = '-s'
                        else:
                            options = '-t'
                        with open('tmp.txt', 'w') as fo:
                            fo.write(trees_as_string.encode('utf-8', errors = 'ignore'))
                        tregex_engine(query = query, check_query = True, root = root)
                        wholes = tregex_engine(query = query,
                                               options = ['-o', '-w', options],
                                               corpus = 'tmp.txt',
                                               preserve_case = True,
                                               root = root)
                        middle_column_result = tregex_engine(query = query,
                                                             options = ['-o', options],
                                                             corpus = 'tmp.txt',
                                                             preserve_case = True,
                                                             root = root)
                        for whole, mid in zip(wholes, middle_column_result):
                            reg = re.compile(r'(' + re.escape(mid) + r')', re.IGNORECASE)
                            start, middle, end = re.split(reg, whole, 1)
                            conc_lines.append([f, speakr, start, middle, end])

    # does not keep results ordered!
    try:
        os.remove('tmp.txt')
    except:
        pass

    unique_results = [list(x) for x in set(tuple(x) for x in conc_lines)]

    #make into series
    series = []
    pindex = 'f s l m r'.encode('utf-8').split()

    for fname, spkr, start, word, end in unique_results:
        import os
        fname = os.path.basename(fname)
        start = (start.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',')
                      .replace(' .', '.').replace("'' ", "''").replace(" n't", "n't")
                      .replace(" 're", "'re").replace(" 'm", "'m").replace(" 's", "'s")
                      .replace(" 'd", "'d").replace(" 'll", "'ll").replace('  ', ' '))
        word = (word.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',')
                    .replace(' .', '.').replace("'' ", "''").replace(" n't", "n't")
                    .replace(" 're", "'re").replace(" 'm", "'m").replace(" 's", "'s")
                    .replace(" 'd", "'d").replace(" 'll", "'ll").replace('  ', ' '))
        end = (end.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',')
                  .replace(' .', '.').replace("'' ", "''").replace(" n't", "n't")
                  .replace(" 're", "'re").replace(" 'm", "'m").replace(" 's", "'s")
                  .replace(" 'd", "'d").replace(" 'll", "'ll").replace('  ', ' '))
        #spaces = ' ' * (maximum / 2 - (len(word) / 2))
        #new_word = spaces + word + spaces
        series.append(pd.Series([fname.encode('utf-8', errors = 'ignore'), \
                                 spkr.encode('utf-8', errors = 'ignore'), \
                                 start.encode('utf-8', errors = 'ignore'), \
                                 word.encode('utf-8', errors = 'ignore'), \
                                 end.encode('utf-8', errors = 'ignore')], index = pindex))

    # randomise results...
    if random:
        from random import shuffle
        shuffle(series)

    if series == []:
        if root:
            print 'No results found, sorry.'
            return
        else:
            raise ValueError("No results found, I'm afraid. Check your query and path.")

    df = pd.concat(series, axis = 1).T

    if not add_links:
        df.columns = ['f', 's', 'l', 'm', 'r']
    else:
        df.columns = ['f', 's', 'l', 'm', 'r', 'link']

    if all(x == '' for x in list(df['s'].values)):
        df.drop('s', axis = 1, inplace = True)

    formatl = lambda x: "{0}".format(x[-window:])
    formatf = lambda x: "{0}".format(x[-20:])
    #formatr = lambda x:
    formatr = lambda x: "{{:<{}s}}".format(df['r'].str.len().max()).format(x[:window])
    st = df.head(n).to_string(header = False, formatters={'l': formatl,
                                                          'r': formatr,
                                                          'f': formatf}).splitlines()

    # hack because i can't figure out formatter:
    rem = '\n'.join([re.sub('\s*\.\.\.\s*$', '', s) for s in st])
    if print_output:
        print rem

    if 'note' in kwargs.keys():
        kwargs['note'].progvar.set(100)
    return df

    # everything below here is unreachable: it sits after the return above
    if add_links:
        def _add_links(lines, links = False, show = 'thread'):
            link = "http://www.healthboards.com/boards/bipolar-disorder/695089-labels.html"
            linktext = '<a href="%s>link</a>' % link
            import pandas as pd
            inds = list(df.index)
            num_objects = len(list(df.index))
            ser = pd.Series([link for n in range(num_objects)], index = inds)
            lines['link'] = ser
            return lines
        df = _add_links(df)

    if add_links:
        if not show_links:
            if print_output:
                print df.drop('link', axis = 1).head(n).to_string(header = False,
                      formatters={rname: '{{:<{}s}}'.format(df[rname].str.len().max()).format})
        else:
            if print_output:
                print HTML(df.to_html(escape=False))
    else:
        if print_output:
            print df.head(n).to_string(header = False,
                  formatters={rname: '{{:<{}s}}'.format(df[rname].str.len().max()).format})

    if not add_links:
        df.columns = ['f', 'l', 'm', 'r']
    else:
        df.columns = ['f', 'l', 'm', 'r', 'link']
    return df
def conc(corpus,
         option = 'tregex',
         query = 'any',
         dep_function = 'any',
         dep_type = 'collapsed-ccprocessed-dependencies',
         n = 100,
         random = False,
         split_sents = True,
         window = 100,
         trees = False,
         plaintext = False,
         add_links = False,
         show_links = False,
         print_status = True,
         print_output = True,
         just_speakers = False,
         root = False,
         **kwargs):
    """
    A concordancer for Tregex queries and dependencies.

    * Revisions forthcoming to facilitate better dependency querying

    :returns: a Pandas DataFrame containing concordance lines"""
    import corpkit
    import os
    import re
    import pandas as pd
    from pandas import DataFrame
    from time import localtime, strftime
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass
    from corpkit.other import tregex_engine
    from corpkit.tests import check_pytex, check_dit
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False

    if query == 'any':
        query = r'.*'

    # convert list to query
    if type(query) == list:
        from other import as_regex
        if option.startswith('t'):
            query = r'/%s/ !< __' % as_regex(query, boundaries = 'line')
        else:
            query = as_regex(query, boundaries = 'w')

    can_do_fast = False
    if option.startswith('t'):
        if just_speakers is False:
            can_do_fast = True

    just_speakers_is_list = False
    if type(just_speakers) == list:
        just_speakers_is_list = True
    if type(just_speakers) == str:
        if just_speakers.lower() != 'all':
            just_speakers = [just_speakers]

    def get_deps(sentence, dep_type):
        if dep_type == 'basic-dependencies':
            return sentence.basic_dependencies
        if dep_type == 'collapsed-dependencies':
            return sentence.collapsed_dependencies
        if dep_type == 'collapsed-ccprocessed-dependencies':
            return sentence.collapsed_ccprocessed_dependencies

    conc_lines = []
    if option.startswith('t'):
        if trees:
            options = '-s'
        else:
            options = '-t'

    if can_do_fast:
        speakr = ''
        tregex_engine(query = query, check_query = True, root = root)
        wholes = tregex_engine(query = query,
                               options = ['-o', '-w', '-f', options],
                               corpus = corpus,
                               preserve_case = True,
                               root = root)
        middle_column_result = tregex_engine(query = query,
                                             options = ['-o', options],
                                             corpus = corpus,
                                             preserve_case = True,
                                             root = root)
        for (f, whole), mid in zip(wholes, middle_column_result):
            reg = re.compile(r'(' + re.escape(mid) + r')', re.IGNORECASE)
            start, middle, end = re.split(reg, whole, 1)
            conc_lines.append([os.path.basename(f), speakr, start, middle, end])
    else:
        if query.startswith(r'\b'):
            query = query[2:]
        if query.endswith(r'\b'):
            query = query[:-2]
        fs_to_conc = []
        for r, dirs, fs in os.walk(corpus):
            for f in fs:
                if not os.path.isfile(os.path.join(r, f)):
                    continue
                if not f.endswith('.txt') and not f.endswith('.xml') and not f.endswith('.p'):
                    continue
                fs_to_conc.append(os.path.join(r, f))

        def normalise(concline):
            import re
            reg = re.compile(r'\([^ ]+')
            spaces = re.compile(r'\s+')
            concline = re.sub(reg, '', concline)
            concline = re.sub(spaces, ' ', concline)
            concline = concline.replace(')', '').replace('  ', ' ')
            return concline.strip()

        num_fs = len(fs_to_conc)
        for index, filepath in enumerate(fs_to_conc):
            f = os.path.basename(filepath)
            if num_fs > 1:
                if 'note' in kwargs.keys():
                    kwargs['note'].progvar.set((index) * 100.0 / num_fs)
            if print_status:
                from time import localtime, strftime
                thetime = strftime("%H:%M:%S", localtime())
                print '%s: Extracting data from %s ...' % (thetime, f)
            if root:
                root.update()
            with open(filepath, "r") as text:
                parsetreedict = {}
                data = text.read()
                if option.startswith('p'):
                    import chardet
                    enc = chardet.detect(data)
                    data = unicode(data, enc['encoding'], errors = 'ignore')
                if option.startswith('p') or option.startswith('l'):
                    if option.startswith('l'):
                        import pickle
                        try:
                            lstokens = pickle.load(open(filepath, 'rb'))
                        except EOFError:
                            thetime = strftime("%H:%M:%S", localtime())
                            print '%s: File "%s" could not be opened.' % (thetime, os.path.basename(filepath))
                        data = ' '.join(lstokens)
                        if split_sents:
                            lines = data.split(' . ')
                        else:
                            lines = [data.replace('\n', '')]
                    else:
                        if split_sents:
                            lines = data.splitlines()
                        else:
                            lines = [data.replace('\n', '')]
                    for l in lines:
                        if split_sents:
                            m = re.compile(r'(?i)^(.*?)(\b' + query + r'\b)(.*)$', re.UNICODE)
                        else:
                            m = re.compile(r'(?i)(.{,%s})(\b' % window + query + r'\b)(.{,%s})' % window, re.UNICODE)
                        if split_sents:
                            mat = re.search(m, l)
                        else:
                            mat = re.findall(m, l)
                        if split_sents:
                            if mat:
                                last_num = len(mat.groups())
                                conc_lines.append([f, '', mat.group(1), mat.group(2), mat.group(last_num)])
                        else:
                            if mat:
                                #print len(mat)
                                for ent in mat:
                                    #print len(ent)
                                    last_num = len(ent) - 1
                                    conc_lines.append([f, '', ent[0], ent[1], ent[last_num]])
                if any(f.endswith('.xml') for f in fs_to_conc):
                    from corenlp_xml.document import Document
                    corenlp_xml = Document(data)
                    #corenlp_xml = Beautifulcorenlp_xml(data, parse_only=justsents)
                    if just_speakers:
                        for s in just_speakers:
                            parsetreedict[s] = []
                        sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers]
                        #sents = [s for s in corenlp_xml.find_all('sentence') \
                        #if s.speakername.text.strip() in just_speakers]
                    else:
                        sents = corenlp_xml.sentences
                    nsents = len(sents)
                    for i, s in enumerate(sents):
                        if num_fs == 1:
                            if 'note' in kwargs.keys():
                                kwargs['note'].progvar.set((index) * 100.0 / nsents)
                        if root:
                            root.update()
                        try:
                            speakr = s.speakername.strip()
                        except:
                            speakr = ''
                        parsetree = s.parse_string
                        if option.startswith('t'):
                            parsetreedict[speakr].append(parsetree)
                            continue
                        elif option.startswith('d'):
                            try:
                                compiled_query = re.compile(query)
                            except:
                                import traceback
                                import sys
                                exc_type, exc_value, exc_traceback = sys.exc_info()
                                lst = traceback.format_exception(exc_type, exc_value, exc_traceback)
                                error_message = lst[-1]
                                thetime = strftime("%H:%M:%S", localtime())
                                print '%s: Query %s' % (thetime, error_message)
                                return
                            #right_dependency_grammar = s.find_all('dependencies', type=dep_type, limit = 1)
                            deps = get_deps(s, dep_type)
                            if dep_function == 'any' or dep_function is False:
                                wdsmatching = [l.dependent.text.strip() for l in deps.links \
                                               if re.match(query, l.dependent.text.strip())]
                            else:
                                comped = re.compile(dep_function, re.IGNORECASE)
                                #goodsent = any(re.match(query, l.dependent.text.strip()) for l in deps.links if re.match(comped, l.type.strip()))
                                wdsmatching = [l.dependent.text.strip() for l in deps.links \
                                               if re.match(comped, l.type.strip()) and \
                                               re.match(query, l.dependent.text.strip())]
                            # this is shit, needs indexing or something
                            for wd in wdsmatching:
                                line = normalise(parsetree)
                                try:
                                    start, middle, end = re.split(r'(' + wd + r')', line, 1)
                                except ValueError:
                                    continue
                                conc_lines.append([f, speakr, start, middle, end])
                    if option.startswith('t'):
                        for speakr, dt in parsetreedict.items():
                            trees_as_string = '\n'.join(dt)
                            if trees:
                                options = '-s'
                            else:
                                options = '-t'
                            with open('tmp.txt', 'w') as fo:
                                fo.write(trees_as_string.encode('utf-8', errors = 'ignore'))
                            tregex_engine(query = query, check_query = True, root = root)
                            wholes = tregex_engine(query = query,
                                                   options = ['-o', '-w', options],
                                                   corpus = 'tmp.txt',
                                                   preserve_case = True,
                                                   root = root)
                            middle_column_result = tregex_engine(query = query,
                                                                 options = ['-o', options],
                                                                 corpus = 'tmp.txt',
                                                                 preserve_case = True,
                                                                 root = root)
                            for whole, mid in zip(wholes, middle_column_result):
                                reg = re.compile(r'(' + re.escape(mid) + r')', re.IGNORECASE)
                                start, middle, end = re.split(reg, whole, 1)
                                conc_lines.append([f, speakr, start, middle, end])

    # does not keep results ordered!
    try:
        os.remove('tmp.txt')
    except:
        pass

    unique_results = [list(x) for x in set(tuple(x) for x in conc_lines)]

    #make into series
    series = []
    pindex = 'f s l m r'.encode('utf-8').split()

    for fname, spkr, start, word, end in unique_results:
        spkr = unicode(spkr, errors = 'ignore')
        fname = os.path.basename(fname)
        start = (start.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',')
                      .replace(' .', '.').replace("'' ", "''").replace(" n't", "n't")
                      .replace(" 're", "'re").replace(" 'm", "'m").replace(" 's", "'s")
                      .replace(" 'd", "'d").replace(" 'll", "'ll").replace('  ', ' '))
        word = (word.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',')
                    .replace(' .', '.').replace("'' ", "''").replace(" n't", "n't")
                    .replace(" 're", "'re").replace(" 'm", "'m").replace(" 's", "'s")
                    .replace(" 'd", "'d").replace(" 'll", "'ll").replace('  ', ' '))
        end = (end.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',')
                  .replace(' .', '.').replace("'' ", "''").replace(" n't", "n't")
                  .replace(" 're", "'re").replace(" 'm", "'m").replace(" 's", "'s")
                  .replace(" 'd", "'d").replace(" 'll", "'ll").replace('  ', ' '))
        #spaces = ' ' * (maximum / 2 - (len(word) / 2))
        #new_word = spaces + word + spaces
        # the use of ascii here makes sure the string formats ok, but will also screw over
        # anyone doing non-english work. so, change to utf-8, then fix errors as they come
        # in the corpkit-gui "add_conc_lines_to_window" function
        series.append(pd.Series([fname.encode('ascii', errors = 'ignore'), \
                                 spkr.encode('ascii', errors = 'ignore'), \
                                 start.encode('ascii', errors = 'ignore'), \
                                 word.encode('ascii', errors = 'ignore'), \
                                 end.encode('ascii', errors = 'ignore')], index = pindex))

    # randomise results...
    if random:
        from random import shuffle
        shuffle(series)

    if series == []:
        if root:
            print 'No results found, sorry.'
            return
        else:
            raise ValueError("No results found, I'm afraid. Check your query and path.")

    df = pd.concat(series, axis = 1).T

    if not add_links:
        df.columns = ['f', 's', 'l', 'm', 'r']
    else:
        df.columns = ['f', 's', 'l', 'm', 'r', 'link']

    if all(x == '' for x in list(df['s'].values)):
        df.drop('s', axis = 1, inplace = True)

    if 'note' in kwargs.keys():
        kwargs['note'].progvar.set(100)

    if print_output:
        formatl = lambda x: "{0}".format(x[-window:])
        formatf = lambda x: "{0}".format(x[-20:])
        #formatr = lambda x:
        formatr = lambda x: "{{:<{}s}}".format(df['r'].str.len().max()).format(x[:window])
        st = df.head(n).to_string(header = False, formatters={'l': formatl,
                                                              'r': formatr,
                                                              'f': formatf}).splitlines()
        # hack because i can't figure out formatter:
        rem = '\n'.join([re.sub('\s*\.\.\.\s*$', '', s) for s in st])
        print rem

    return df
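
# Hypothetical usage sketch for the current conc(): the corpus path and queries
# are illustrative, and the Tregex/dependency options assume a CoreNLP-parsed corpus.
#
#     lines = conc('data/postcounts', option = 'tregex', query = r'/NN.?/ << __', n = 50)
#     lines = conc('data/postcounts', option = 'deps', query = r'(?i)risk',
#                  dep_function = 'nsubj', window = 60)
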