def ngrams(data,
           reference_corpus = 'bnc.p',
           clear = True,
           printstatus = True,
           n = 'all',
           calc_all = True,
           **kwargs):
    """Feed this function some data and get its ngrams.

    You can use dictmaker() to build a new reference_corpus
    to serve as reference corpus, or use bnc.p

    A list of what counts as data is available in the
    docstring of datareader().
    """
    import re
    from time import localtime, strftime
    from dictionaries.stopwords import stopwords as my_stopwords
    try:
        from IPython.display import display, clear_output
        have_ipython = True
    except ImportError:
        have_ipython = False
    from corpkit.keys import keywords_and_ngrams, turn_input_into_counter
    from corpkit.other import datareader

    # load the reference corpus into a Counter
    loaded_ref_corpus = turn_input_into_counter(reference_corpus)

    if n == 'all':
        n = 99999

    time = strftime("%H:%M:%S", localtime())
    if printstatus:
        print "\n%s: Generating ngrams... \n" % time

    # read the data, then strip non-word tokens and stopwords
    good = datareader(data, **kwargs)
    regex_nonword_filter = re.compile(r"[A-Za-z-']")
    good = [i for i in good if re.search(regex_nonword_filter, i)
            and i not in my_stopwords]

    ngrams = keywords_and_ngrams(good, reference_corpus = loaded_ref_corpus,
                                 calc_all = calc_all, show = 'ngrams', **kwargs)

    # turn the (ngram, score) tuples into a pandas Series
    import pandas as pd
    out = pd.Series([s for k, s in ngrams], index = [k for k, s in ngrams])
    out.name = 'ngrams'

    # print and return
    if clear and have_ipython:
        clear_output()
    if printstatus:
        time = strftime("%H:%M:%S", localtime())
        print '%s: Done! %d results.\n' % (time, len(list(out.index)))

    if n == 'all':
        n = len(out)

    return out[:n]
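
# Example usage (a hedged sketch: 'data/sample' is a hypothetical path to some
# corpus data accepted by datareader(); 'bnc.p' is the default reference corpus):
#
#     top_ngrams = ngrams('data/sample', reference_corpus = 'bnc.p', n = 20)
#     print top_ngrams
#
# The result is a pandas Series named 'ngrams', indexed by the ngram itself,
# with its score as the value.
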
def collocates(data, nbest = 30, window = 5):
    """Feed this data and get its collocations"""
    import nltk
    from nltk import collocations
    from nltk.collocations import BigramCollocationFinder
    from time import localtime, strftime
    from corpkit.other import datareader
    try:
        from IPython.display import display, clear_output
        have_ipython = True
    except ImportError:
        have_ipython = False

    time = strftime("%H:%M:%S", localtime())
    print "\n%s: Generating %d collocates ... \n" % (time, nbest)

    # turn all sentences into one long, lowercased unicode string
    good = datareader(data)
    if type(good) != unicode:
        good = unicode(good.lower(), 'utf-8', errors = 'ignore')
    else:
        good = good.lower()

    # sentence and word tokenise
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sents = sent_tokenizer.tokenize(good)
    tokenized_sents = [nltk.word_tokenize(i) for i in sents]

    # flatten the tokenised sentences into a single list of words
    allwords = []
    for sent in tokenized_sents:
        for word in sent:
            allwords.append(word)

    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(allwords, window_size = window)

    # filter out stopwords, single characters and non-alphanumeric tokens
    # (should be consistent in stopwords)
    ignored_words = nltk.corpus.stopwords.words('english')
    finder.apply_word_filter(lambda w: len(w) < 2 or w.lower() \
        in ignored_words or not w.isalnum())
    # unnecessary?: finder.apply_freq_filter(2)

    results = sorted(finder.nbest(bigram_measures.raw_freq, nbest))

    # return [index, first word, second word] for each collocation
    listversion = []
    for index, thecollocation in enumerate(results):
        aslist = [index, thecollocation[0], thecollocation[1]]
        listversion.append(aslist)

    if have_ipython:
        clear_output()
    return listversion
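
# Example usage (a hedged sketch; 'data/sample' is a hypothetical data path, and
# the NLTK 'punkt' and 'stopwords' resources are assumed to be installed):
#
#     colls = collocates('data/sample', nbest = 10, window = 5)
#     for index, first, second in colls:
#         print index, first, second
#
# Each entry in the returned list is [index, first_word, second_word].
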
def turn_input_into_counter(data, **kwargs):
    """From a string (filepath) or variable, return a Counter."""
    import sys
    import os
    import re
    import collections
    import pickle
    import pandas
    from corpkit.other import datareader

    dict_found = False

    # if given a directory of data, count its words directly
    if type(data) == str:
        if os.path.isdir(data):
            # get list of words
            good = datareader(data, **kwargs)
            # remove bad stuff from result
            regex_nonword_filter = re.compile("[A-Za-z]")
            data = [i for i in good if re.search(regex_nonword_filter, i)]
            return collections.Counter(data)

    while not dict_found:
        # if passed an interrogation, use its .results branch
        if 'interrogation' in str(type(data)):
            try:
                data = data.results
            except:
                raise ValueError("Can't find .results branch of input.")

        # if passing in results, sum them
        if type(data) == pandas.core.frame.DataFrame:
            data = data.sum()

        # count sum
        if type(data) == pandas.core.series.Series:
            data = data[data != 0]
            data = collections.Counter(data.to_dict())
            dict_found = True
            return data

        # turn normal dicts into counters
        if type(data) == dict:
            dict_found = True
            return collections.Counter(data)

        # the best case scenario:
        if type(data) == collections.Counter:
            dict_found = True
            return data

        # filepath stuff: look for a pickled dictionary in a few places
        if type(data) == str:
            if not data.endswith('.p'):
                data = data + '.p'
            try:
                ref_corp_dict = pickle.load(open(data, "rb"))
                dict_found = True
                return ref_corp_dict
            except IOError:
                try:
                    ref_corp_dict = pickle.load(open(os.path.join('data/dictionaries', data), "rb"))
                    dict_found = True
                    return ref_corp_dict
                except IOError:
                    try:
                        # fall back to the dictionaries dir shipped alongside corpkit
                        import corpkit
                        path_to_corpkit = os.path.dirname(corpkit.__file__)
                        thepath, corpkitname = os.path.split(path_to_corpkit)
                        dictionaries_path = os.path.join(thepath, 'dictionaries')
                        ref_corp_dict = pickle.load(open(os.path.join(dictionaries_path, data), "rb"))
                        dict_found = True
                        return ref_corp_dict
                    except:
                        pass

            # nothing found: ask the user to pick an existing reference corpus
            dict_of_dicts = {}
            d_for_print = []
            dicts = [f for f in os.listdir('data/dictionaries') if f.endswith('.p')]
            for index, d in enumerate(dicts):
                dict_of_dicts[index] = d
                d_for_print.append(' % 2d) %s' % (index, d))
            d_for_print = '\n'.join(d_for_print)
            selection = raw_input("\nReference corpus not found. Select an existing reference corpus from the list below, or type 'exit' to quit.\n\n%s\n\nYour selection: " % d_for_print)
            if selection.startswith('e'):
                sys.exit()
            else:
                try:
                    data = dict_of_dicts[int(selection)]
                except:
                    print '\nInput "%s" not recognised.' % selection
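
# Example usage (a hedged sketch; every input shown here is hypothetical):
#
#     turn_input_into_counter({'cat': 5, 'dog': 3})     # plain dict
#     turn_input_into_counter('bnc.p')                  # name of a pickled reference corpus
#     turn_input_into_counter(interrogation)            # object with a .results DataFrame
#
# Each call returns a Counter-like mapping of items to counts; in the filepath
# case, the unpickled dictionary itself is returned, and an interactive prompt
# appears if the named reference corpus cannot be found.
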