def downloader(url_list, new_path = 'html', wait = 5):
    """download a bunch of urls and store in a local folder"""
    import corpkit
    import urllib
    import time
    import os
    from time import localtime, strftime
    from textprogressbar import TextProgressBar
    thetime = strftime("%H:%M:%S", localtime())
    print "\n%s: Attempting to download %d URLs with %d seconds wait-time ... \n" % (thetime, len(url_list), wait)
    # progress bar sized to the list we were actually given
    p = TextProgressBar(len(url_list))
    if not os.path.exists(new_path):
        os.makedirs(new_path)
    paths = []
    for index, url in enumerate(url_list):
        p.animate(index)
        base = os.path.basename(url)
        new_filename = os.path.join(new_path, base)
        paths.append(new_filename)
        urllib.urlretrieve(url, new_filename)
        time.sleep(wait)
    p.animate(len(url_list))
    num_downloaded = len(paths)
    thetime = strftime("%H:%M:%S", localtime())
    print '\n\n%s: Done! %d files downloaded.' % (thetime, num_downloaded)
    return paths
def downloader(url_list, new_path='html', wait=5):
    """download a bunch of urls and store in a local folder"""
    import corpkit
    import urllib.request, urllib.parse, urllib.error
    import time
    import os
    from time import localtime, strftime
    from textprogressbar import TextProgressBar
    thetime = strftime("%H:%M:%S", localtime())
    print("\n%s: Attempting to download %d URLs with %d seconds wait-time ... \n" % (thetime, len(url_list), wait))
    # progress bar sized to the list we were actually given
    p = TextProgressBar(len(url_list))
    if not os.path.exists(new_path):
        os.makedirs(new_path)
    paths = []
    for index, url in enumerate(url_list):
        p.animate(index)
        base = os.path.basename(url)
        new_filename = os.path.join(new_path, base)
        paths.append(new_filename)
        urllib.request.urlretrieve(url, new_filename)
        time.sleep(wait)
    p.animate(len(url_list))
    num_downloaded = len(paths)
    thetime = strftime("%H:%M:%S", localtime())
    print('\n\n%s: Done! %d files downloaded.' % (thetime, num_downloaded))
    return paths
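# Example usage (illustrative sketch, not part of corpkit): the URLs and output
# folder below are placeholders, and the call assumes the downloader() defined
# above is in scope. downloader() just needs an iterable of URLs.
def _example_downloader_usage():
    """Sketch: fetch two hypothetical pages into ./html with a 2 second wait."""
    urls_to_get = ['http://example.com/page1.html',
                   'http://example.com/page2.html']
    saved_paths = downloader(urls_to_get, new_path='html', wait=2)
    # saved_paths is a list of local file paths, one per URL, in input order
    return saved_paths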
def animator(progbar, count, tot_string=False, linenum=False, terminal=False,
             init=False, length=False, **kwargs):
    """
    Animates progress bar in unique position in terminal
    Multiple progress bars not supported in jupyter yet.
    """
    # add startnum
    start_at = kwargs.get('startnum', 0)
    if start_at is None:
        start_at = 0.0
    denominator = kwargs.get('denom', 1)
    if kwargs.get('note'):
        if count is None:
            perc_done = 0.0
        else:
            perc_done = (count * 100.0 / float(length)) / float(denominator)
        kwargs['note'].progvar.set(start_at + perc_done)
        kwargs['root'].update()
        return
    if init:
        from textprogressbar import TextProgressBar
        return TextProgressBar(length, dirname=tot_string)
    if type(linenum) == int:
        # this try is for sublime text nosetests, which don't take terminal object
        try:
            with terminal.location(0, terminal.height - (linenum + 1)):
                if tot_string:
                    progbar.animate(count, tot_string)
                else:
                    progbar.animate(count)
        except:
            pass
    else:
        if tot_string:
            progbar.animate(count, tot_string)
        else:
            progbar.animate(count)
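# Example usage (illustrative sketch, not part of corpkit): animator() is typically
# called once with init=True to build the bar, then repeatedly with the running
# count. The loop body below is a placeholder for real per-item work.
def _example_animator_usage(items):
    """Sketch: drive a single progress bar over a list of items."""
    progbar = animator(None, None, init=True, length=len(items))
    for i, item in enumerate(items):
        animator(progbar, i)       # update bar to position i
        pass                       # ... process item here ...
    animator(progbar, len(items))  # show 100% when done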
def dictmaker(path, dictname, query = 'any', dictpath = 'data/dictionaries',
              lemmatise = False, just_content_words = False, use_dependencies = False):
    """makes a pickle wordlist named dictname in dictpath"""
    import corpkit
    import os
    import pickle
    import re
    import nltk
    from time import localtime, strftime
    from StringIO import StringIO
    import shutil
    from collections import Counter
    from textprogressbar import TextProgressBar
    from corpkit.process import tregex_engine
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False

    if lemmatise:
        dictname = dictname + '-lemmatised'
    if not dictname.endswith('.p'):
        dictname = dictname + '.p'

    # allow direct passing of dirs
    path_is_list = False
    one_big_corpus = False
    if type(path) == str:
        sorted_dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
        # if no subcorpora, just do the dir passed in
        if len(sorted_dirs) == 0:
            one_big_corpus = True
            sorted_dirs = [path]
    elif type(path) == list:
        path_is_list = True
        sorted_dirs = sorted(path)
        if type(sorted_dirs[0]) == int:
            sorted_dirs = [str(d) for d in sorted_dirs]

    try:
        sorted_dirs.sort(key=int)
    except:
        pass

    try:
        if not os.path.exists(dictpath):
            os.makedirs(dictpath)
    except IOError:
        print "Error making " + dictpath + "/ directory."

    while os.path.isfile(os.path.join(dictpath, dictname)):
        time = strftime("%H:%M:%S", localtime())
        selection = raw_input('\n%s: %s already exists in %s.\n'
                              '          You have the following options:\n\n'
                              '              a) save with a new name\n'
                              '              b) delete %s\n'
                              '              c) exit\n\nYour selection: ' % (time, dictname, dictpath, os.path.join(dictpath, dictname)))
        if 'a' in selection:
            sel = raw_input('\nNew save name: ')
            dictname = sel
            if lemmatise:
                dictname = dictname.replace('-lemmatised.p', '')
                dictname = dictname + '-lemmatised'
            if not dictname.endswith('.p'):
                dictname = dictname + '.p'
        elif 'b' in selection:
            os.remove(os.path.join(dictpath, dictname))
        elif 'c' in selection:
            print ''
            return
        else:
            as_str = str(selection)
            print '    Choice "%s" not recognised.' % selection

    time = strftime("%H:%M:%S", localtime())
    print '\n%s: Extracting words from files ... \n' % time

    # all this just to get a list of files and make a better progress bar
    if use_dependencies:
        counts = []
        for d in sorted_dirs:
            if not one_big_corpus:
                subcorpus = os.path.join(path, d)
            else:
                subcorpus = path
            if use_dependencies:
                files = [f for f in os.listdir(subcorpus) if f.endswith('.xml')]
            else:
                files = [f for f in os.listdir(subcorpus)]
            counts.append(len(files))
        num_files = sum(counts)
        c = 0
        p = TextProgressBar(num_files)
    else:
        p = TextProgressBar(len(sorted_dirs))

    def tokener(xmldata):
        """get each word from CoreNLP XML, using good lemmatisation"""
        import corpkit
        from bs4 import BeautifulSoup, SoupStrainer
        import gc
        open_classes = ['N', 'V', 'R', 'J']
        result = []
        just_good_deps = SoupStrainer('tokens')
        soup = BeautifulSoup(xmldata, parse_only=just_good_deps)
        for token in soup.find_all('token'):
            word = token.word.text
            query = re.compile(r'.*')
            if re.search(query, word):
                if lemmatise:
                    word = token.lemma.text
                    if just_content_words:
                        if not token.pos.text[0] in open_classes:
                            continue
                result.append(word)
        # attempt to stop memory problems.
        # not sure if this helps, though:
        soup.decompose()
        soup = None
        data = None
        gc.collect()
        return result

    # translate 'any' query
    if query == 'any':
        if lemmatise:
            query = r'__ <# (__ !< __)'
        else:
            query = r'__ !< __'
    if lemmatise:
        options = ['-o']
    else:
        options = ['-t', '-o']
    if use_dependencies:
        from bs4 import BeautifulSoup, SoupStrainer
        if query == 'any':
            query = r'.*'
        query = re.compile(query)

    allwords = []
    for index, d in enumerate(sorted_dirs):
        if not use_dependencies:
            p.animate(index)
        if not path_is_list:
            if len(sorted_dirs) == 1:
                subcorp = d
            else:
                subcorp = os.path.join(path, d)
        else:
            subcorp = d

        # check query first time through
        if not use_dependencies:
            if index == 0:
                trees_found = tregex_engine(corpus = subcorp, check_for_trees = True)
                if not trees_found:
                    lemmatise = False
                    dictname = dictname.replace('-lemmatised', '')
            if trees_found:
                results = tregex_engine(corpus = subcorp, options = options, query = query,
                                        lemmatise = lemmatise,
                                        just_content_words = just_content_words)
                for result in results:
                    allwords.append(result)
        elif use_dependencies:
            regex_nonword_filter = re.compile("[A-Za-z]")
            results = []
            fs = [os.path.join(subcorp, f) for f in os.listdir(subcorp)]
            for f in fs:
                p.animate(c, str(c) + '/' + str(num_files))
                c += 1
                data = open(f).read()
                result_from_a_file = tokener(data)
                for w in result_from_a_file:
                    if re.search(regex_nonword_filter, w):
                        allwords.append(w.lower())
        if not use_dependencies:
            if not trees_found:
                for f in os.listdir(subcorp):
                    raw = unicode(open(os.path.join(subcorp, f)).read(), 'utf-8', errors = 'ignore')
                    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
                    sents = sent_tokenizer.tokenize(raw)
                    tokenized_sents = [nltk.word_tokenize(i) for i in sents]
                    for sent in tokenized_sents:
                        for w in sent:
                            allwords.append(w.lower())

    # 100%
    p.animate(len(sorted_dirs))

    # make a dict
    dictionary = Counter(allwords)
    with open(os.path.join(dictpath, dictname), 'wb') as handle:
        pickle.dump(dictionary, handle)
    time = strftime("%H:%M:%S", localtime())
    print '\n\n' + time + ': Done! ' + dictname + ' created in ' + dictpath + '/'
def eugener(path, query, depth = 5, top = 20, lemmatise = False,
            just_content_words = False, remove_query_from_output = False,
            remove_zero_depth = False, return_tags = False):
    """
    ***This is probably broken now, can fix if there's a use for it.***

    get most frequent words in corpus path to left and right of query regex

    path: path to corpus containing subcorpora
    query: regex to match word to be zero depth
    depth: number of places left and right to look
    top: number of most frequent entries to return
    lemmatise: wordnet lemmatisation
    just_content_words: keep only n, v, a, r tagged words
    remove_query_from_output: remove words matching the query from the output
    remove_zero_depth: drop the zero-depth (query match) position from the output
    return_tags: return POS tags rather than words
    """
    import os
    import nltk
    import re
    from collections import Counter
    import pandas as pd
    from textprogressbar import TextProgressBar
    from other import tregex_engine
    # manual lemmatisation here:
    from dictionaries.word_transforms import wordlist
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
        from IPython.display import display, clear_output
    except NameError:
        import subprocess
        have_ipython = False
    from tests import check_dit  # probably never needed
    if lemmatise:
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr = WordNetLemmatizer()
    regex = re.compile(query)
    wordregex = re.compile('[A-Za-z0-9]')
    print ''

    # get list of subcorpora
    dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
    sorted_dirs = sorted(dirs)
    # define risk word
    # place for our output
    dfs = {}
    p = TextProgressBar(len(sorted_dirs))
    for index, corpus in enumerate(sorted_dirs):
        p.animate(index)
        # search the corpus for whole sents containing risk word
        subcorpus = os.path.join(path, corpus)
        if lemmatise:
            query = r'__ <# (__ !< __)'
        else:
            query = r'__ !> __'
        results = tregex_engine(query, ['-o'], subcorpus,
                                lemmatise = lemmatise,
                                just_content_words = just_content_words)

        # lowercase (assumes each result is a (word, tag) pair; as the docstring
        # says, this function is probably broken)
        processed = [(r.lower(), tag) for r, tag in results]

        # remove punct
        processed = [w for w in processed if re.search(wordregex, w[0])]

        # a place for info about each corpus
        # word list to use later
        all_words = []
        dicts = []

        # go left and right depth times (for 2, makes [-2, -1, 0, 1, 2])
        for i in range(-depth, (depth + 1)):
            newdict = Counter()
            matching = []
            # go through each token
            for index, (token, tag) in enumerate(processed):
                # if token matches risk expression
                if re.search(regex, token):
                    # get the word at depth index
                    # try statement for cases where the target word index isn't there
                    try:
                        if i < 0:
                            num = index - abs(i)
                            if return_tags:
                                matching.append(processed[num][1])
                            else:
                                matching.append(processed[num][0])
                        else:
                            if return_tags:
                                matching.append(processed[index + i][1])
                            else:
                                matching.append(processed[index + i][0])
                    except:
                        pass
            # tally results
            counted = Counter(matching)
            # remove punctuation etc
            for key in counted:
                # commented because this stuff was moved earlier.
                #if key.isalnum():
                    #if key not in stopwords:
                        #if remove_stopwords:
                            #if key not in stopwords:
                                #newdict[key] = counted[key]
                        #else:
                            #newdict[key] = counted[key]
                newdict[key] = counted[key]
            for w in counted.keys():
                all_words.append(w)
            #top_tokens = newdict.most_common(top)
            dicts.append(newdict)

        # make pandas series
        sers = []
        # for each unique word
        for word in list(set(all_words)):
            # get counts for each depth
            series = [dct[word] for dct in dicts]
            # add a total
            series.append(sum([dct[word] for dct in dicts]))
            # make index names for depths plus total
            index_names = range(-depth, (depth + 1))
            index_names.append('Total')
            # turn into pandas data, and name the series the word
            ser = pd.Series(series, index = index_names)
            ser.name = word
            sers.append(ser)

        # concatenate series into dataframe
        df = pd.concat(sers, axis=1)

        # sort by total
        tot = df.ix['Total']
        df = df[tot.argsort()[::-1]]

        # remove words matching the regex if need be
        if remove_query_from_output:
            cols = [c for c in list(df.columns) if not re.search(regex, c)]
            df = pd.DataFrame(df[cols])
        # remove zero depth if need be
        if remove_zero_depth:
            df = df.drop(0, axis = 0)
        # just top entries
        df = pd.DataFrame(df[list(df.columns)[:top]])
        # transpose
        dfs[corpus] = df.T

    # complete animation, then clear
    p.animate(len(sorted_dirs))
    if have_ipython:
        clear_output()

    # some settings for good display
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)
    pd.set_option('expand_frame_repr', False)
    pd.set_option('colheader_justify', 'right')

    # print the start of each frame, then return them all
    for item in sorted(dfs):
        print item, '\n', dfs[item].head(), '\n'
    return dfs
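# Example usage (illustrative sketch, not part of corpkit): eugener() is flagged
# above as probably broken, so treat this only as a sketch of the intended call.
# The corpus path and regex are placeholders.
def _example_eugener_usage():
    """Sketch: most frequent words within 3 places of tokens matching 'risk'."""
    dataframes_by_subcorpus = eugener('data/mycorpus', r'(?i)\brisk', depth=3, top=10)
    # each value is a DataFrame: rows are matched words, columns are the
    # positions -3 ... 3 plus a 'Total' column
    return dataframes_by_subcorpus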
def download_large_file(proj_path, url, actually_download = True, root = False, **kwargs):
    """download something to proj_path"""
    import corpkit
    import os
    import urllib2
    from time import localtime, strftime
    from textprogressbar import TextProgressBar
    file_name = url.split('/')[-1]
    home = os.path.expanduser("~")
    if 'stanford' in url:
        downloaded_dir = os.path.join(home, 'corenlp')
    else:
        downloaded_dir = os.path.join(proj_path, 'temp')
    fullfile = os.path.join(downloaded_dir, file_name)
    try:
        os.makedirs(downloaded_dir)
    except OSError:
        # dir already exists: if the CoreNLP zip is there and valid, reuse it
        if 'stanford-corenlp-full-2015-04-20.zip' in os.listdir(downloaded_dir):
            import zipfile
            the_zip_file = zipfile.ZipFile(fullfile)
            ret = the_zip_file.testzip()
            if ret is None:
                return downloaded_dir, fullfile
            else:
                os.remove(fullfile)
    if actually_download:
        try:
            u = urllib2.urlopen(url)
            f = open(fullfile, 'wb')
            meta = u.info()
            file_size = int(meta.getheaders("Content-Length")[0])
            if root:
                root.update()
            if 'note' in kwargs.keys():
                kwargs['note'].progvar.set(0)
            else:
                p = TextProgressBar(int(file_size))
            thetime = strftime("%H:%M:%S", localtime())
            print '%s: Downloading ... ' % thetime
            file_size_dl = 0
            block_sz = 8192
            while True:
                buffer = u.read(block_sz)
                if not buffer:
                    break
                file_size_dl += len(buffer)
                f.write(buffer)
                #status = r"%10d  [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
                #status = status + chr(8)*(len(status)+1)
                if 'note' in kwargs.keys():
                    kwargs['note'].progvar.set(file_size_dl * 100.0 / int(file_size))
                else:
                    p.animate(file_size_dl)
                if root:
                    root.update()
            if 'note' in kwargs.keys():
                kwargs['note'].progvar.set(100)
            else:
                p.animate(int(file_size))
        except:
            time = strftime("%H:%M:%S", localtime())
            print '%s: Download failed: bad connection.' % time
            f.close()
            if root:
                root.update()
            return
        time = strftime("%H:%M:%S", localtime())
        print '%s: Downloaded successfully.' % time
        f.close()
    return downloaded_dir, fullfile
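# Example usage (illustrative sketch, not part of corpkit): the project path and
# URL below are placeholders. With a 'stanford' URL the archive lands in
# ~/corenlp; anything else goes to <proj_path>/temp.
def _example_download_large_file_usage():
    """Sketch: fetch an archive and report where it was saved."""
    result = download_large_file('myproject',
                                 'http://example.com/some-large-archive.zip')
    # None is returned on a failed download; otherwise a (dir, file) pair
    if result is not None:
        downloaded_dir, fullfile = result
        print '%s saved in %s' % (fullfile, downloaded_dir)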
def dictmaker(path, dictname, query='any', dictpath='data/dictionaries',
              lemmatise=False, just_content_words=False, use_dependencies=False):
    """makes a pickle wordlist named dictname in dictpath"""
    import corpkit
    import os
    import pickle
    import re
    import nltk
    from time import localtime, strftime
    from io import StringIO
    import shutil
    from collections import Counter
    from textprogressbar import TextProgressBar
    from process import tregex_engine
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False

    if lemmatise:
        dictname = dictname + '-lemmatised'
    if not dictname.endswith('.p'):
        dictname = dictname + '.p'

    # allow direct passing of dirs
    path_is_list = False
    one_big_corpus = False
    if type(path) == str:
        sorted_dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
        # if no subcorpora, just do the dir passed in
        if len(sorted_dirs) == 0:
            one_big_corpus = True
            sorted_dirs = [path]
    elif type(path) == list:
        path_is_list = True
        sorted_dirs = sorted(path)
        if type(sorted_dirs[0]) == int:
            sorted_dirs = [str(d) for d in sorted_dirs]

    try:
        sorted_dirs.sort(key=int)
    except:
        pass

    try:
        if not os.path.exists(dictpath):
            os.makedirs(dictpath)
    except IOError:
        print("Error making " + dictpath + "/ directory.")

    while os.path.isfile(os.path.join(dictpath, dictname)):
        time = strftime("%H:%M:%S", localtime())
        selection = input('\n%s: %s already exists in %s.\n'
                          '          You have the following options:\n\n'
                          '              a) save with a new name\n'
                          '              b) delete %s\n'
                          '              c) exit\n\nYour selection: ' % (time, dictname, dictpath, os.path.join(dictpath, dictname)))
        if 'a' in selection:
            sel = input('\nNew save name: ')
            dictname = sel
            if lemmatise:
                dictname = dictname.replace('-lemmatised.p', '')
                dictname = dictname + '-lemmatised'
            if not dictname.endswith('.p'):
                dictname = dictname + '.p'
        elif 'b' in selection:
            os.remove(os.path.join(dictpath, dictname))
        elif 'c' in selection:
            print('')
            return
        else:
            as_str = str(selection)
            print('    Choice "%s" not recognised.' % selection)

    time = strftime("%H:%M:%S", localtime())
    print('\n%s: Extracting words from files ... \n' % time)

    # all this just to get a list of files and make a better progress bar
    if use_dependencies:
        counts = []
        for d in sorted_dirs:
            if not one_big_corpus:
                subcorpus = os.path.join(path, d)
            else:
                subcorpus = path
            if use_dependencies:
                files = [f for f in os.listdir(subcorpus) if f.endswith('.xml')]
            else:
                files = [f for f in os.listdir(subcorpus)]
            counts.append(len(files))
        num_files = sum(counts)
        c = 0
        p = TextProgressBar(num_files)
    else:
        p = TextProgressBar(len(sorted_dirs))

    def tokener(xmldata):
        """get each word from CoreNLP XML, using good lemmatisation"""
        import corpkit
        from bs4 import BeautifulSoup, SoupStrainer
        import gc
        open_classes = ['N', 'V', 'R', 'J']
        result = []
        just_good_deps = SoupStrainer('tokens')
        soup = BeautifulSoup(xmldata, parse_only=just_good_deps)
        for token in soup.find_all('token'):
            word = token.word.text
            query = re.compile(r'.*')
            if re.search(query, word):
                if lemmatise:
                    word = token.lemma.text
                    if just_content_words:
                        if not token.pos.text[0] in open_classes:
                            continue
                result.append(word)
        # attempt to stop memory problems.
        # not sure if this helps, though:
        soup.decompose()
        soup = None
        data = None
        gc.collect()
        return result

    # translate 'any' query
    if query == 'any':
        if lemmatise:
            query = r'__ <# (__ !< __)'
        else:
            query = r'__ !< __'
    if lemmatise:
        options = ['-o']
    else:
        options = ['-t', '-o']
    if use_dependencies:
        from bs4 import BeautifulSoup, SoupStrainer
        if query == 'any':
            query = r'.*'
        query = re.compile(query)

    allwords = []
    for index, d in enumerate(sorted_dirs):
        if not use_dependencies:
            p.animate(index)
        if not path_is_list:
            if len(sorted_dirs) == 1:
                subcorp = d
            else:
                subcorp = os.path.join(path, d)
        else:
            subcorp = d

        # check query first time through
        if not use_dependencies:
            if index == 0:
                trees_found = tregex_engine(corpus=subcorp, check_for_trees=True)
                if not trees_found:
                    lemmatise = False
                    dictname = dictname.replace('-lemmatised', '')
            if trees_found:
                results = tregex_engine(corpus=subcorp, options=options, query=query,
                                        lemmatise=lemmatise,
                                        just_content_words=just_content_words)
                for result in results:
                    allwords.append(result)
        elif use_dependencies:
            regex_nonword_filter = re.compile("[A-Za-z]")
            results = []
            fs = [os.path.join(subcorp, f) for f in os.listdir(subcorp)]
            for f in fs:
                p.animate(c, str(c) + '/' + str(num_files))
                c += 1
                data = open(f).read()
                result_from_a_file = tokener(data)
                for w in result_from_a_file:
                    if re.search(regex_nonword_filter, w):
                        allwords.append(w.lower())
        if not use_dependencies:
            if not trees_found:
                for f in os.listdir(subcorp):
                    # read as text, ignoring undecodable bytes
                    raw = open(os.path.join(subcorp, f), encoding='utf-8', errors='ignore').read()
                    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
                    sents = sent_tokenizer.tokenize(raw)
                    tokenized_sents = [nltk.word_tokenize(i) for i in sents]
                    for sent in tokenized_sents:
                        for w in sent:
                            allwords.append(w.lower())

    # 100%
    p.animate(len(sorted_dirs))

    # make a dict
    dictionary = Counter(allwords)
    with open(os.path.join(dictpath, dictname), 'wb') as handle:
        pickle.dump(dictionary, handle)
    time = strftime("%H:%M:%S", localtime())
    print('\n\n' + time + ': Done! ' + dictname + ' created in ' + dictpath + '/')
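# Example usage (illustrative sketch, not part of corpkit): the corpus path is a
# placeholder and is assumed to contain subcorpora of parsed or plain-text files.
# dictmaker() writes a pickled collections.Counter to data/dictionaries/<dictname>.p,
# which can be reloaded as shown.
def _example_dictmaker_usage():
    """Sketch: build a word-frequency wordlist and read it back."""
    import os
    import pickle
    dictmaker('data/mycorpus', 'mycorpus-wordlist', lemmatise=False)
    with open(os.path.join('data/dictionaries', 'mycorpus-wordlist.p'), 'rb') as handle:
        wordlist = pickle.load(handle)
    # the pickle holds a Counter, so frequency queries work directly
    return wordlist.most_common(20)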