# requires: from collections import Counter; from itertools import chain, repeat;
#           from gensim.models import Phrases

def fetch_document_bigrams(self, document_lemmas, number_of_bigrams=100):
    """
    Given the lemmas identifying a document, calculate up to N bigrams
    found in that document, where N=number_of_bigrams.
    """
    if not self.include_bigrams:
        return []

    bigram = Phrases()
    bigram.add_vocab([document_lemmas])

    bigram_counter = Counter()
    for key in bigram.vocab.keys():
        # vocab keys are utf-8 bytes in gensim 3.x, hence the bytes split
        if key not in STOPWORDS_BYTES and len(key.split(b"_")) > 1:
            bigram_counter[key] += bigram.vocab[key]

    # repeat each bigram according to its count, so frequent bigrams
    # carry proportional weight downstream
    bigram_iterators = [
        repeat(bigram_key, bigram_count)
        for bigram_key, bigram_count in bigram_counter.most_common(number_of_bigrams)
    ]
    found_bigrams = list(chain(*bigram_iterators))
    known_bigrams = [b for b in found_bigrams if b in self.top_bigrams]
    return known_bigrams
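# A minimal, self-contained sketch of the vocab-counting pattern used above,
# assuming gensim 3.x (vocab keys are utf-8 bytes joined by the default b'_'
# delimiter); the lemmas and the low min_count/threshold are made-up
# illustration values, not defaults from the method above.
from collections import Counter
from gensim.models import Phrases

document_lemmas = ["machine", "learning", "machine", "learning", "model"]
bigram = Phrases(min_count=1, threshold=0.1)
bigram.add_vocab([document_lemmas])

bigram_counter = Counter()
for key, count in bigram.vocab.items():
    if len(key.split(b"_")) > 1:   # keep only joined pairs, not unigrams
        bigram_counter[key] += count
print(bigram_counter.most_common(5))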
def train_phrasal():
    sentences = load_sentences('/nlp/data/romap/unigram_docs/')
    bigram = Phrases()
    for sentence in sentences:
        bigram.add_vocab([sentence])
    print(len(sentences))
    model = gensim.models.Word2Vec(bigram[sentences], size=200)
    model.wv.save_word2vec_format(
        '/nlp/data/romap/naacl-pattern/w2v/Collocation-w2v.txt', binary=False)
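# The text-format vectors written above can be reloaded later with
# KeyedVectors (same path as in train_phrasal); the query word is an
# arbitrary example, not from the original script:
from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format(
    '/nlp/data/romap/naacl-pattern/w2v/Collocation-w2v.txt', binary=False)
print(wv.most_similar('model', topn=5))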
def write_to_file_chartssa(no_delexi_charts: List[str],
                           all_sents: List[List[str]]) -> None:
    with open(os.path.join('chartssa/original_data/', 'chartssa.box'), 'w') as g, \
         open(os.path.join('chartssa/original_data/', 'train.box'), 'w') as train, \
         open(os.path.join('chartssa/original_data/', 'test.box'), 'w') as test, \
         open(os.path.join('chartssa/original_data/', 'valid.box'), 'w') as valid:
        # all_sents must contain a list of sentences, with each sentence
        # being a list of words. Train the bigram detector once here rather
        # than once per chart, since all_sents does not change in the loop.
        bigram = Phrases(all_sents, min_count=1, threshold=10)
        bigram.add_vocab([["Financial", "Groups"], ["Law", "Firms"],
                          ["Computer", "Science"]])
        for chart in no_delexi_charts:
            chart_descs, reversed_chart_descs = turn_chart_info_into_sentences(chart)
            new_reversed_chart_descs = convert_sentences_to_bigrams(
                reversed_chart_descs, bigram)
            # chart_lines_sa: all the sentences belonging to chart `chart`
            chart_lines_sa = generate_files_sa(new_reversed_chart_descs)
            g.write(''.join(chart_lines_sa))
            # the first five lines go to test, the next five to valid,
            # and the rest to train
            for line_idx, chart_line in enumerate(chart_lines_sa):
                if line_idx < 5:
                    test.write(chart_line)
                elif line_idx < 10:
                    valid.write(chart_line)
                else:
                    train.write(chart_line)
def train_phrases(tokenized_corpus):
    # If no pretrained Phrases model is available, instantiate one:
    if not os.path.isfile(os.path.join(module_path, 'models', 'Phrases_model')):
        bigram = Phrases(
            tokenized_corpus,
            min_count=5,                # ignore all words and bigrams with total collected count lower than this
            common_terms=common_terms)  # stop words that won't affect frequency counts of expressions containing them
                                        # (note: gensim 4.x renames common_terms to connector_words)
        save_phrases(bigram)
    # Otherwise load the pretrained model and update it:
    else:
        bigram = load_phrases()
        bigram.add_vocab(tokenized_corpus)
        save_phrases(bigram)
    return bigram
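# save_phrases/load_phrases are not shown in this excerpt; a plausible minimal
# version (the path is an assumption matching the isfile check above) can lean
# on gensim's built-in SaveLoad persistence:
from gensim.models import Phrases

def save_phrases(model, path=os.path.join(module_path, 'models', 'Phrases_model')):
    model.save(path)

def load_phrases(path=os.path.join(module_path, 'models', 'Phrases_model')):
    return Phrases.load(path)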
class OverkillTokenizer(Tokenizer):
    def __init__(self, lemmatize=True, n_jobs=1, bigram=None, trigram=None,
                 min_count=5, threshold=10.):
        self.lemmatize = lemmatize
        self.n_jobs = n_jobs
        self.bigram = bigram
        self.trigram = trigram
        self.min_count = min_count
        self.threshold = threshold

    def tokenize(self, docs):
        # `lem` must be defined even when lemmatization is disabled,
        # since it is passed to pre_tokenize below
        lem = WordNetLemmatizer() if self.lemmatize else None

        # RAKE tokenization first, then additional per-document tokenization
        pre_tdocs = RAKETokenizer(n_jobs=self.n_jobs).tokenize(docs)

        if self.n_jobs == 1:
            tdocs = [pre_tokenize(doc, tdoc, lem=lem)
                     for doc, tdoc in zip(docs, pre_tdocs)]
        else:
            tdocs = parallel(partial(pre_tokenize, lem=lem),
                             zip(docs, pre_tdocs), self.n_jobs,
                             expand_args=True)

        # train (or update) the bigram model, then the trigram model on top of it
        if self.bigram is None:
            self.bigram = Phrases(tdocs,
                                  min_count=self.min_count,
                                  threshold=self.threshold,
                                  delimiter=b' ')
        else:
            self.bigram.add_vocab(tdocs)

        if self.trigram is None:
            self.trigram = Phrases(self.bigram[tdocs],
                                   min_count=self.min_count,
                                   threshold=self.threshold,
                                   delimiter=b' ')
        else:
            self.trigram.add_vocab(self.bigram[tdocs])

        return [tdoc for tdoc in self.trigram[self.bigram[tdocs]]]
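# The bigram-then-trigram stacking above, in isolation on a toy corpus;
# gensim 3.x assumed (bytes delimiter), with min_count/threshold tuned far
# lower than the defaults above so that this tiny corpus forms phrases at all:
from gensim.models import Phrases

docs = [["new", "york", "stock", "exchange"]] * 10
bigram = Phrases(docs, min_count=1, threshold=0.1, delimiter=b' ')
trigram = Phrases(bigram[docs], min_count=1, threshold=0.1, delimiter=b' ')
print(trigram[bigram[docs[0]]])   # collapses to a single collocation token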
def train_mwe_model_from_json(articles):
    # Optionally load a cached model instead of retraining:
    # if path.exists("./models/model"):
    #     phrases_model = SaveLoad.load("./models/model")
    phrases_model = Phrases(common_terms=accepted_connectors, min_count=5)
    for document in articles:
        if 'text' in document:
            phrases_model.add_vocab([document["text"].split(" ")])
        if 'title' in document:
            phrases_model.add_vocab([document["title"].split(" ")])
    # phrases_model.save("./models/model")
    phraser_model = Phraser(phrases_model)
    return phraser_model
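# Hypothetical usage of the function above; `accepted_connectors` is assumed
# to be defined in the surrounding module (e.g. a set of connector words),
# and the article dicts are made-up illustration data:
articles = [{"title": "machine learning advances",
             "text": "machine learning is everywhere"}] * 20
phraser = train_mwe_model_from_json(articles)
print(phraser["machine learning rocks".split(" ")])
# whether the pair is joined depends on corpus statistics; the default
# threshold of 10 generally needs a realistically sized vocabulary.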
def write_to_file_chartssb(no_delexi_charts: List[str],
                           all_sents: List[List[str]]) -> None:
    with open(os.path.join('chartssb/original_data/', 'chartssb.box'), 'w') as g, \
         open(os.path.join('chartssb/original_data/', 'train.box'), 'w') as train, \
         open(os.path.join('chartssb/original_data/', 'test.box'), 'w') as test, \
         open(os.path.join('chartssb/original_data/', 'valid.box'), 'w') as valid:
        # train the bigram detector once rather than once per chart
        bigram2 = Phrases(all_sents, min_count=1, threshold=2)
        bigram2.add_vocab([["Financial", "Groups"], ["Law", "Firms"],
                           ["Computer", "Science"]])
        for chart in no_delexi_charts:
            chart_descs, _ = turn_chart_info_into_sentences(chart)
            chart_infos_sentb = turn_dict_into_sent_b(chart_descs)
            new_infos = convert_chartssb_to_bigrams(chart_infos_sentb, bigram2)
            chart_lines_sentb = generate_files_sb(new_infos)
            g.write(''.join(chart_lines_sentb))
            # the first five lines go to test, the next five to valid,
            # and the rest to train
            for line_idx, chart_line in enumerate(chart_lines_sentb):
                if line_idx < 5:
                    test.write(chart_line)
                elif line_idx < 10:
                    valid.write(chart_line)
                else:
                    train.write(chart_line)
def word2vec_measure():
    article_names = ["expressen", "aftonbladet", "svd", "dn"]
    sentences = []
    # Build one Phrases model over all sources; the original re-created it
    # inside the loop, so only the last source's bigram statistics survived.
    bigram = Phrases()
    for single_article in article_names:
        print(" \n *** " + single_article + " *****")
        articles = db.get_articles(single_article)
        for row in articles:
            row = IO.filter_text(row.lower())
            sentence = [word for word in row
                        if word not in stopwords.words('swedish')]
            sentences.append(sentence)
            bigram.add_vocab([sentence])
    print(len(sentences))

    num_features = 300   # word vector dimensionality
    min_word_count = 5   # minimum word count
    num_workers = 8      # number of threads to run in parallel
    context = 5          # maximum distance between current and predicted word within a sentence
    downsampling = 1e-3  # downsample setting for frequent words

    bigram_model = Word2Vec(bigram[sentences], workers=num_workers,
                            size=num_features, sg=1, min_count=min_word_count,
                            window=context, sample=downsampling)
    word2vec_result = bigram_model.most_similar(
        positive=['muslimska_brödraskapet'], topn=200)

    # filepath = prop.word2vec_count + single_article + ".tsv"
    filepath = prop.word2vec_count + "all_10.tsv"
    IO.write_tuple(word2vec_result, filepath)
# requires: import nltk, string; from collections import Counter;
#           from nltk.corpus import stopwords

def bigrams_with_gensim(data):
    from gensim.models import Phrases

    bigram = Phrases()
    sentences = []
    for row in data:
        title = row['Headings'].replace('[', '').replace(']', '').replace("'", '')
        title = title + '.'
        #title = title.replace('--', ' -- ')
        sentence = [word for word in nltk.word_tokenize(title.lower())
                    if word not in string.punctuation]
        sentences.append(sentence)
        bigram.add_vocab([sentence])

    bigram_counter = Counter()
    for key in bigram.vocab.keys():
        # gensim 3.x vocab keys are utf-8 bytes; decode before comparing
        # against the (str) stopword list and splitting on '_'
        word = key.decode('utf-8') if isinstance(key, bytes) else key
        if word not in stopwords.words("english"):
            parts = [s for s in word.split('_') if s != '']
            if len(parts) > 1:
                bigram_counter[word] += bigram.vocab[key]

    print('Bigrams with gensim')
    for key, counts in bigram_counter.most_common(50):
        print('{}: {}'.format(key, counts))
    return bigram
def trainWord2Vec(fileName, modelName):
    # train word2vec on the merged dataset
    file = open("../data/mergedDatasets/" + fileName, "r",
                encoding="ascii", errors="ignore")
    bigram = Phrases()
    lines = []
    special_chars = [',', '\'', '.', '-', '_', '!', '|', '@', '#', '$',
                     '%', '^', '*', '~', '(', ')', '{', '}']
    for line in file:
        # replace special characters with spaces so they are not glued to words
        for charec in special_chars:
            line = line.replace(charec, ' ')
        # remove leading and trailing whitespace
        line = line.strip()
        if len(line) > 0:
            wordArray = [word.lower() for word in line.split(" ")]
            if len(wordArray) > 15:
                lines.append(wordArray)
    sentences = lines

    # stack phrase models: bigrams, then trigrams, then four-grams
    bigram.add_vocab(sentences)
    trigram = Phrases(bigram[sentences])
    fourgram = Phrases(trigram[bigram[sentences]])

    mymodel = gensim.models.Word2Vec(fourgram[trigram[bigram[sentences]]],
                                     min_count=15, size=200, workers=4)
    mymodel.save("../word2VecModels/" + modelName)
extra_testing_mat[row - N_TRAINING, cuisine2id[cuisine]] = 1

with open("extra_testing_matrix.pyobject", "wb") as f:
    pickle.dump(extra_testing_mat, f)

finish = time()
print("Complete!")
print("Running time: %.2f seconds" % (finish - start,))
print()

# BIGRAMS & TRIGRAMS
print("Creating n-gram corpus from training corpus...")
start = time()

phrases = Phrases(min_count=3, threshold=10.0)
with open("training_corpus.txt", "rt") as f:
    for line in f:
        phrases.add_vocab([line.rstrip().split()])
    _ = f.seek(0)
    with open("bigram_training_corpus.txt", "wt") as g:
        for line in f:
            word_list = phrases[line.rstrip().split()]
            g.write(" ".join(word_list) + "\n")

phrases = Phrases(min_count=3, threshold=10.0)
with open("bigram_training_corpus.txt", "rt") as f:
    for line in f:
        phrases.add_vocab([line.rstrip().split()])
    _ = f.seek(0)
    with open("trigram_training_corpus.txt", "wt") as g:
        for line in f:
            word_list = phrases[line.rstrip().split()]
            g.write(" ".join(word_list) + "\n")

finish = time()
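# For large corpora, either transform pass above can be sped up by freezing
# the trained Phrases into a Phraser (FrozenPhrases in gensim 4.x), which
# discards counts not needed for the transform; a sketch for the bigram pass,
# assuming the same file layout:
from gensim.models.phrases import Phraser

frozen = Phraser(phrases)
with open("training_corpus.txt", "rt") as f, \
     open("bigram_training_corpus.txt", "wt") as g:
    for line in f:
        g.write(" ".join(frozen[line.rstrip().split()]) + "\n")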
if len(sys.argv) < 4:
    sys.exit(1)

sentence_stream = []
inp, outp1, outp2 = sys.argv[1:4]
filesent = open(inp)

sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
lines = sent_detector.tokenize(filesent.read().strip())

bigram_transformer = Phrases(min_count=1, threshold=2)
for line in lines:
    # nltk.word_tokenize already returns str in Python 3; no decode needed
    sentence = [word for word in nltk.word_tokenize(line.lower())
                if word not in string.punctuation]
    sentence_stream.append(sentence)
    bigram_transformer.add_vocab([sentence])

model = Word2Vec(bigram_transformer[sentence_stream],
                 size=100, window=2, min_count=2,
                 workers=multiprocessing.cpu_count(), sg=1)
# trim unneeded model memory = use (much) less RAM
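# outp1/outp2 are otherwise unused above; the usual continuation of such
# training scripts, matching the trailing comment (an assumption here,
# since the original excerpt is cut off), would be:
model.init_sims(replace=True)   # L2-normalize vectors in place (pre-gensim-4 API)
model.save(outp1)
model.wv.save_word2vec_format(outp2, binary=False)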
from collections import Counter

import numpy as np
from gensim import corpora, models
from gensim.models import Phrases
from nltk.corpus import stopwords

# note: `tokenizer` (used in preProcessCorpus) is assumed to be defined
# elsewhere in the module


class TopicModel(object):
    '''
    This module preprocesses a corpus of documents and runs
    Latent Dirichlet Allocation (LDA) on it.

    Parameters
    ----------
    num_topics: int, default: 100
        input parameter to LDA
    min_word_count: int, default: 20
        if a token has fewer than min_word_count occurrences in the entire
        corpus, then it will be pruned from the processed corpus
    top_most_common_words: int, default: 10
        prune tokens that are within the top_most_common_words
        throughout the entire corpus
    min_doc_length: int, default: 40
        if the number of tokens within a processed document is less than
        min_doc_length, then the document is excluded
    max_doc_length: int, default: 1000
        if the number of tokens within a processed document is greater than
        max_doc_length, then the document is excluded
    random_state: default: None
        the random seed for the Gensim LDA object

    Attributes
    ----------
    bigramizer: the trained Gensim bigramizer
    tokens: list of lists of strings
    dictionary: mapping from id to token
    corpus: bag-of-words vectorization of the tokens
    lda: the Gensim LDA object
    dominant_topic_ids: list of dominant topic ids, in decreasing order of dominance
    '''

    def __init__(self, num_topics=100, min_word_count=20,
                 top_most_common_words=10, min_doc_length=40,
                 max_doc_length=1000, random_state=None):
        self.num_topics = num_topics
        self.min_word_count = min_word_count
        self.top_most_common_words = top_most_common_words

        assert max_doc_length > min_doc_length, \
            "max_doc_length must be greater than min_doc_length"
        self.min_doc_length = min_doc_length
        self.max_doc_length = max_doc_length
        self.random_state = random_state

        # natural language processing
        self.stop_words = self.getEnglishStopWords()
        self.bigramizer = Phrases()

    def fit(self, documents):
        '''
        parameters:
        documents: list of strings, each string represents a document
        '''
        # tokens, dictionary, corpus for LDA
        self.tokens = self.preProcessCorpus(documents)
        self.dictionary = corpora.Dictionary(self.tokens)
        self.corpus = [self.dictionary.doc2bow(text) for text in self.tokens]
        self.lda = self.getLDA(dictionary=self.dictionary,
                               corpus=self.corpus,
                               num_topics=self.num_topics,
                               random_state=self.random_state)
        self.num_dominant_topics = min(10, self.num_topics)
        self.dominant_topic_ids = self.getDominantTopics(
            self.corpus, self.lda, self.num_dominant_topics)

    def __str__(self):
        description = (
            "topic model:\n\ttoken length = {0:,d}\n\tdictionary length = {1:,d}"
            "\n\tnum_topics = {2:,d}\n\tmin_word_count = {3:,d}"
            "\n\ttop_most_common_words = {4:,d}\n\tmin_doc_length = {5:,d}"
            "\n\tmax_doc_length = {6:,d}")
        return description.format(len(self.tokens), len(self.dictionary),
                                  self.num_topics, self.min_word_count,
                                  self.top_most_common_words,
                                  self.min_doc_length, self.max_doc_length)

    @staticmethod
    def getEnglishStopWords():
        '''
        returns a set of stop words for NLP pre-processing from
        nltk.corpus.stopwords(), extended with some words and letters
        such as "please", "sincerely", "u", etc.
        '''
        stop_words = set(stopwords.words("english"))
        stop_words.update(['please', 'would', 'use', 'also', 'thank',
                           'sincerely', 'regards', 'hi', 'hello',
                           'greetings', 'hey', 'attachment', 'attached',
                           'attached_file', 'see', 'file', 'comment'])
        for item in 'abcdefghijklmnopqrstuvwxyz':
            stop_words.add(item)
        return stop_words

    @staticmethod
    def getFrequencies(tokens):
        """
        input: tokens, a list of lists of tokens
        output: a collections.Counter() object that contains token counts
        """
        frequencies = Counter()
        for row in tokens:
            frequencies.update(row)
        return frequencies

    @staticmethod
    def getLowFreqWords(frequencies, countCutOff):
        """
        input:
        frequencies: a collections.Counter() object
        countCutOff: the minimum frequency below which tokens are added
            to the set of low frequency tokens
        """
        lowFreqTokens = set()
        for token, freq in frequencies.items():
            if freq <= countCutOff:
                lowFreqTokens.add(token)
        return lowFreqTokens

    def preProcessCorpus(self, documents, min_word_count=None,
                         top_most_common_words=None, min_doc_length=None,
                         max_doc_length=None):
        '''
        this function pre-processes the documents and converts them into
        a list of lists of tokens

        input:
        documents: a list of strings (each string represents a document)
        min_word_count: if the frequency count of a token in the corpus is
            less than min_word_count, then it is pruned
        top_most_common_words: if the frequency count of a token in the
            corpus exceeds top_most_common_words, then it is pruned
        min_doc_length: if the number of tokens within a processed document
            is less than min_doc_length, then the document is excluded
        max_doc_length: if the number of tokens within a processed document
            is greater than max_doc_length, then the document is excluded

        output: a list of lists of tokens
        '''
        if min_word_count is None:
            min_word_count = self.min_word_count
        if top_most_common_words is None:
            top_most_common_words = self.top_most_common_words
        if min_doc_length is None:
            min_doc_length = self.min_doc_length
        if max_doc_length is None:
            max_doc_length = self.max_doc_length

        tokens = [tokenizer(document) for document in documents]

        # exclude documents that are longer than max_doc_length
        tokens = [tkn for tkn in tokens if len(tkn) < max_doc_length]

        # train Gensim Phrases model for bigrams
        self.bigramizer.add_vocab(tokens)

        # apply Gensim Phrases model to generate bigrams
        tokens = [self.bigramizer[tkn] for tkn in tokens]

        # exclude stop words
        tokens = [[t for t in tkn if t not in self.stop_words]
                  for tkn in tokens]

        # exclude documents that are shorter than min_doc_length
        tokens = [tkn for tkn in tokens if len(tkn) > min_doc_length]

        # calculate token frequencies to exclude low- and high-frequency tokens
        freqs = self.getFrequencies(tokens)
        low_freq_tokens = set(x[0] for x in freqs.items()
                              if x[1] < min_word_count)
        high_freq_tokens = [word[0] for word in
                            freqs.most_common(top_most_common_words)]

        tokens = [[t for t in tkn if t not in low_freq_tokens]
                  for tkn in tokens]
        tokens = [[t for t in tkn if t not in high_freq_tokens]
                  for tkn in tokens]

        print('\nnumber of low frequency tokens pruned = {:,d}'
              .format(len(low_freq_tokens)))
        print('min_word_count = {:d}, top_most_common_words = {:,d}'
              .format(min_word_count, top_most_common_words))
        print('number of high frequency tokens pruned = {:,d}'
              .format(len(high_freq_tokens)))
        print('tokens = {:,d} rows'.format(len(tokens)))
        print('text pre-processing is complete\n')
        return tokens

    def getLDA(self, dictionary=None, corpus=None, num_topics=None,
               random_state=None):
        print('computing LDA...')
        if dictionary is None:
            dictionary = self.dictionary
        if corpus is None:
            corpus = self.corpus
        if num_topics is None:
            num_topics = self.num_topics
        lda = models.ldamodel.LdaModel(corpus=corpus,
                                       alpha='auto',
                                       id2word=dictionary,
                                       num_topics=num_topics,
                                       random_state=random_state)
        return lda

    def getDominantTopics(self, corpus, lda, num_dominant_topics=None):
        print('computing dominant topics...')
        if corpus is None:
            corpus = self.corpus
        if lda is None:
            lda = self.lda
        if num_dominant_topics is None:
            num_dominant_topics = self.num_dominant_topics

        # get the topic weight matrix using lda.inference;
        # the matrix has dimensions (num documents) x (num topics)
        inference = lda.inference(corpus)
        inference = inference[0]  # inference is a tuple; we need the first term
        num_topics = lda.num_topics

        # find dominant topics across documents (vertical sum)
        column_sum_of_weights = np.sum(inference, axis=0)
        sorted_weight_indices = np.argsort(column_sum_of_weights)
        idx = np.arange(num_topics - num_dominant_topics, num_topics)
        dominant_topic_ids = sorted_weight_indices[idx]

        # store the ids in descending order of dominance
        dominant_topic_ids = dominant_topic_ids[::-1]

        # convert from numpy array to list and return
        return dominant_topic_ids.tolist()
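# A hedged usage sketch for the TopicModel class above; the documents are
# placeholders (real, reasonably long texts are required for the length and
# frequency filters to keep anything), and `tokenizer` must be supplied by
# the surrounding module:
documents = ["...", "..."]
tm = TopicModel(num_topics=20, random_state=42)
tm.fit(documents)
print(tm)                     # summary via __str__
print(tm.dominant_topic_ids)  # most dominant topic ids first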
import bz2
import string
from collections import Counter

import nltk
from gensim.models import Phrases
from gensim.models import Word2Vec
from nltk.corpus import stopwords

sentences = []
bigram = Phrases()

with bz2.BZ2File('./2009.csv.bz2') as file_:
    for i, line in enumerate(file_):
        sentence = [word
                    for word in nltk.word_tokenize(line.decode("utf-8").lower())
                    if word not in string.punctuation]
        sentences.append(sentence)
        bigram.add_vocab([sentence])

bigram_model = Word2Vec(bigram[sentences])
bigram_model.save('ok.w2v')

bigram_model_counter = Counter()
for key in bigram_model.wv.vocab.keys():   # model.vocab on very old gensim
    if key not in stopwords.words("english"):
        if len(key.split("_")) > 1:
            bigram_model_counter[key] += bigram_model.wv.vocab[key].count

for key, counts in bigram_model_counter.most_common(50):
    print('{0: <20} {1}'.format(key, counts))
from gensim.models import Phrases

bigrams = Phrases(min_count=3, threshold=6)

with open('C:\\Users\\sreek\\PycharmProjects\\SEC-Edgar-Data\\tokenized_file.txt',
          'r') as tinf, \
     open('C:\\Users\\sreek\\PycharmProjects\\SEC-Edgar-Data\\bi_gram_corpa.txt',
          'a') as toutf:
    for line in tinf:
        tokens = line.split()
        # add_vocab expects a list of token lists, not a raw string
        bigrams.add_vocab([tokens])
        toutf.write(' '.join(bigrams[tokens]) + '\n')
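# Note: the loop above transforms each line using phrase statistics collected
# only from the lines seen so far. A two-pass variant (same files, with the
# long Windows paths shortened here for readability) trains on the whole
# corpus first, then rewrites:
from gensim.models import Phrases

bigrams = Phrases(min_count=3, threshold=6)
with open('tokenized_file.txt', 'r') as tinf:
    for line in tinf:
        bigrams.add_vocab([line.split()])
with open('tokenized_file.txt', 'r') as tinf, \
     open('bi_gram_corpa.txt', 'w') as toutf:
    for line in tinf:
        toutf.write(' '.join(bigrams[line.split()]) + '\n')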
bigram = Phrases(min_count=10)

print("====================================== Start bigram training ======================================")
# train bigram
for city in Cities:
    print("====================================== City %s bigram training ======================================" % city)
    filepath = "CityTextCorpus/%s/*/part-00000" % city
    cityFileList = glob.glob(filepath)
    cityTweets = MySentences(cityFileList)
    bigram.add_vocab(cityTweets)

print("====================================== Start trigram training ======================================")
# create phrase detector for trigram
trigram = Phrases(min_count=5)
# train trigram
for city in Cities:
    print("====================================== City %s trigram training ======================================" % city)
    filepath = "CityTextCorpus/%s/*/part-00000" % city
    cityFileList = glob.glob(filepath)
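    # the excerpt is truncated here; mirroring the bigram pass above, the
    # loop presumably continues as follows (an assumption, not in the original):
    cityTweets = MySentences(cityFileList)
    trigram.add_vocab(bigram[cityTweets])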
abstrct = []
ngram = Phrases()

# creating dataframe
datadf = pd.read_pickle(file_dir)
years = np.array(datadf.Publication_Year)

for i in np.arange(len(datadf.index)):
    texts = [word
             for word in nltk.word_tokenize(datadf.iloc[i]['Abstracts'].lower())
             if word not in string.punctuation and word not in stoplist]
    abstrct.append(texts)
    ngram.add_vocab([texts])

# Removing unimportant words from the bag of words:
# some unimportant words are not in the NLTK stopword list, so we keep them
# in a text file and use it to clean the data.
f = open('new_stop_words.txt', 'r')  # open file in read mode
new_stopwords_list = f.read()        # copy to a string
stoplist += new_stopwords_list.split()

N = 0  # for phrase

# Most frequent words in the collection of abstracts
import csv
from collections import Counter

from gensim.models import Phrases
from gensim.models import Word2Vec
from gensim.utils import lemmatize   # requires the `pattern` package
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords

print("Reading input file 'input/audits_with_content.csv'")
with open('input/audits_with_content.csv', 'r') as f:
    reader = csv.reader(f)
    raw_documents = list(reader)

print("Prepare documents")
documents = [doc[2] for doc in raw_documents if doc[2] != '']

sentences = []
bigram = Phrases()
for document in documents:
    raw_text = document.lower()
    tokens = lemmatize(raw_text, stopwords=STOPWORDS)
    sentences.append(tokens)
    bigram.add_vocab([tokens])

bigram_counter = Counter()
for key in bigram.vocab.keys():
    # gensim 3.x vocab keys are utf-8 bytes; decode before comparing
    word = key.decode('utf-8') if isinstance(key, bytes) else key
    if word not in stopwords.words("english"):
        if len(word.split("_")) > 1:
            bigram_counter[word] += bigram.vocab[key]

for key, counts in bigram_counter.most_common(200):
    print('{0: <20} {1}'.format(key, counts))
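# gensim.utils.lemmatize depends on the `pattern` package and was removed in
# gensim 4.x; a rough NLTK-based stand-in, offered only as a sketch (note the
# different output format: plain str lemmas rather than b'word/POS' tokens):
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer

_wnl = WordNetLemmatizer()

def lemmatize_simple(text):
    return [_wnl.lemmatize(tok) for tok in simple_preprocess(text)
            if tok not in STOPWORDS]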