def get_tfidf(self, filename, file_tfidf):
    with open(r'D:\DS_lab\Project\PreProcessing\data\vocab.txt') as file:
        word_idfs = [(line.split(":")[0], float(line.split(":")[1]))
                     for line in line_tokenize(file.read())]
        idfs = dict(word_idfs)
        IDwords = dict((word, index)
                       for index, (word, idf) in enumerate(word_idfs))
    data = []
    with open(filename) as file:
        documents = [(line.split("_____")[0], line.split("_____")[1])
                     for line in line_tokenize(file.read())]
        for document in documents:
            words = [w for w in document[1].split() if w in idfs]
            set_of_words = list(set(words))
            sum_words = len(words)
            word_tfidfs = []
            sum_squares = 0
            for word in set_of_words:
                tfidf = idfs[word] * words.count(word) / sum_words
                sum_squares += tfidf ** 2
                word_tfidfs.append((IDwords[word], tfidf))
            word_tfidfs_normalize = [
                str(index) + ":" + str(tfidf / np.sqrt(sum_squares))
                for index, tfidf in word_tfidfs
            ]
            sparse_data = " ".join(word_tfidfs_normalize)
            data.append("_____".join([document[0], sparse_data]))
    with open(file_tfidf, 'w') as file:
        file.write("\n".join(data))
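# Hedged note on the method above: it assumes numpy is imported as np and that
# line_tokenize comes from nltk.tokenize. The file layouts it relies on,
# illustrated here with made-up values:
#   vocab.txt line      ->  "word:idf",             e.g. "learning:1.2039"
#   input corpus line   ->  "label_____text",       e.g. "sport_____the match was great"
#   output tf-idf line  ->  "label_____idx:value",  e.g. "sport_____3:0.4472 17:0.8944"
import numpy as np
from nltk.tokenize import line_tokenize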
def get_text(in_file):
    """Return the text included in in_file, all in lowercase."""
    with open(in_file, "r") as l_file:
        raw = l_file.read().decode('utf8').lower()
    text = '\n'.join(nltk.line_tokenize(raw))
    return text
def date_change(str_):
    import datetime
    import langid
    import nltk
    from dateutil import parser
    from translate import Translator

    date_time = ""
    translator = Translator(from_lang='burmese', to_lang="en")

    def lang_identifier_mm(text):
        # Treat anything langid does not classify as English as Burmese.
        return langid.classify(text)[0] != "en"

    current_month = datetime.datetime.now().date().month
    keywords = ["naypyidaw", "naypyitaw", "nay", "daw", "pyi", "taw"]
    for _line in nltk.line_tokenize(str_):
        if lang_identifier_mm(_line):
            _line = translator.translate(_line)
        _line = _line.lower()
        for _key in keywords:
            if _key in nltk.word_tokenize(_line):
                date_time = "2019 " + _line.replace(_key, "").replace(" on ", "")
                date_month = parser.parse(date_time).month
                # A month that has already passed this year belongs to the previous year.
                if date_month < current_month:
                    date_time = "2018 " + _line.replace(_key, "").replace(" on ", "")
    return date_time
def kmeans_clustering(source_path, files_list, clusters):
    """
    Function to perform kmeans clustering on the dataset.

    Args:
        source_path -- string. Path where the data files are located.
        files_list -- list of file names in the source path
        clusters -- number of clusters
    """
    stopwords = nltk.line_tokenize(open('stopwords.txt').read())
    docs = []
    filename = {}
    i = 0

    # To avoid np array issues due to large datasets, perform clustering by
    # splitting the data into smaller subsets
    for file_name in files_list[:2000]:
        resume = open(source_path + '/' + file_name).read()
        docs.append(resume)
        filename[i] = file_name
        i += 1

    vectorizer = TfidfVectorizer(min_df=1)
    tfidf = vectorizer.fit_transform(docs)

    km = KMeans(n_clusters=clusters, init='k-means++', max_iter=10, n_init=1)
    km.fit(tfidf)

    results = []
    # Create a results list with filename and predicted cluster value for each resume.
    for i in range(0, len(tfidf.toarray())):
        try:
            results.append([str(filename[i]), int(km.predict(tfidf.toarray()[i]))])
        except:
            pass

    # Copy the resume file from its source directory and move it to a new
    # directory according to its predicted cluster
    for cluster in range(0, clusters):
        # Create a combined resume text for each cluster for further analysis using word clouds
        word_cloud_text = ""
        for i in range(0, len(results) - 1):
            if results[i][1] == cluster:
                document = open(source_path + '/' + results[i][0], 'r').read()
                docu = re.sub('[^A-Za-z\' ]+', '', str(document).lower())
                unigrams = docu.split()
                word_list = [word.lower() for word in unigrams
                             if word.lower() not in stopwords]
                text = " ".join(word_list)
                word_cloud_text += text
                destination = '/Users/' + user_name + '/Documents/Data/kmeans/pass1/' + str(cluster)
                shutil.copy2(source_path + '/' + results[i][0], destination)
        f = open('/Users/' + user_name + '/Documents/Data/kmeans/wordcloud/pass1/' + str(cluster) + ".txt", 'w')
        f.write(word_cloud_text)
        f.close()
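# Hedged note: the clustering function above assumes these module-level imports
# and a user_name value (set elsewhere in the project from the USER environment
# variable):
import os
import re
import shutil
import nltk
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

user_name = os.environ.get('USER')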
def generate_vocabulary(self):
    self.min_df = 10

    def compute_idf(df, corpus_size):
        assert df > 0
        return np.log(corpus_size / df)

    with open(self.file) as file:
        data = file.read()
    lines = line_tokenize(data)
    corpus_size = len(lines)

    # Count, for each word, the number of documents it appears in.
    doc_count = defaultdict(int)
    for line in lines:
        components = line.split("_____")
        text = components[-1]
        features = list(set(text.split()))
        for w in features:
            doc_count[w] += 1

    # Keep only words seen in more than min_df documents, paired with their idf.
    vocab = [(word, compute_idf(doc_count[word], corpus_size))
             for word in doc_count
             if doc_count[word] > self.min_df]

    feature_idfs = []
    for (feature, idf) in vocab:
        feature_idfs.append(feature + ":" + str(idf))

    with open('data\\vocab.txt', 'w') as file:
        file.write("\n".join(feature_idfs))
def populate(keyfile, textfile):
    #imatrix=copy.deepcopy(amatrix)
    #itext=text.tokens[:]
    print("populate : ", keyfile)
    f = open('Keys/' + keyfile)
    raw = f.read().lower()
    #tokens1 = nltk.word_tokenize(raw)
    tokens1 = nltk.line_tokenize(raw)
    text11 = removelinebreak(tokens1)
    tokens1 = text11
    text1 = Text(raw)
    text1.tokens = tokens1
    #print(tokens1)
    #text.updatetokens(tokens1)
    #text.tokens.append(textfile)
    #print(text.tokens)
    if len(text1.tokens) > 0:
        poslist = text1.update_graph('Corpus/' + textfile)
        text.updatetokens(text1.tokens)
        #print ("after: ",text.tokens)
        amatrix.updateMatrix(text.tokens, poslist, False)
        text.tokens.sort()
        print(amatrix)

    # Merge plural tokens ("words") into their singular form ("word") when both exist.
    rem = []
    for i in text.tokens:
        if i[len(i) - 1] == "s" and i[:-1] in text.tokens:
            t = i[:-1]
            for j in text.tokens:
                if amatrix.gmatrix[j][i].weight != float('inf'):
                    if amatrix.gmatrix[j][t].weight != float('inf'):
                        amatrix.gmatrix[j][t].weight += amatrix.gmatrix[j][i].weight
                        amatrix.gmatrix[j][t].numupdate += amatrix.gmatrix[j][i].numupdate
                    else:
                        amatrix.gmatrix[j][t].weight = amatrix.gmatrix[j][i].weight
                        amatrix.gmatrix[j][t].numupdate = amatrix.gmatrix[j][i].numupdate
                    amatrix.gmatrix[t][j].weight = amatrix.gmatrix[j][t].weight
                    amatrix.gmatrix[t][j].numupdate = amatrix.gmatrix[j][t].numupdate
            print("pop remove: ", i)
            rem.append(i)

    # Remove the merged plural tokens from the adjacency matrix and token list.
    for i in rem:
        for j in text.tokens:
            del amatrix.gmatrix[j][i]
    for i in rem:
        print(i, "hi")
        del amatrix.gmatrix[i]
        text.tokens.remove(i)
def get_cosine(block):
    lines = nltk.line_tokenize(block)
    # Keep only the score portion of lines such as
    # "cosine similarity between vectors: 0.8731".
    prefix = 'cosine similarity between vectors: '
    scores = [line[len(prefix):] for line in lines if line.startswith(prefix)]
    if len(scores) == 0:
        return 0
    return scores[0]
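# Hedged usage sketch (the sample text below is illustrative, not from the
# original source): get_cosine scans a multi-line block for a line starting
# with "cosine similarity between vectors: " and returns the trailing score,
# or 0 when no such line is present.
sample_block = "comparing documents\ncosine similarity between vectors: 0.8731"
print(get_cosine(sample_block))     # prints 0.8731
print(get_cosine("no match here"))  # prints 0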
def extract_from_file(tagger, files, labels, file_id, keep):
    extracted_lines = []
    extracted_labels = []
    extract_from = []
    file = files[file_id]
    if labels:
        label = labels[file_id]
        label = list(label)
    count = 0
    i = j = 0
    lines = nltk.line_tokenize(file)
    start = None
    count = 0
    found_list = [contains_keywords.remote(line, tagger) for line in lines]
    found_list = ray.get(found_list)
    while i < len(lines):
        print(i)
        line = lines[i]
        if found_list[i]:
            count = 0
            if not start:
                start = i
        else:
            count += 1
        if start is not None:
            if count == keep or i == len(lines) - 1:
                start = max(start - keep, 0)
                end = min(i, len(lines) - 1)
                # print(end)
                new_extracted = '\n'.join(lines[start:end + 1])
                extracted_lines.append(new_extracted)
                if labels:
                    extracted_labels.append(label[start:end + 1])
                extract_from.append((file_id, start))
                start = None
                count = 0
        i += 1
    return extracted_lines, extracted_labels, extract_from
def initgraph(keyfile, textfile):
    f = open(keyfile)
    raw = f.read().lower()
    global text
    text = Text(raw)
    #tokens = nltk.word_tokenize(raw)
    tokens = nltk.line_tokenize(raw)
    text.tokens = list(set(tokens))
    text1 = removelinebreak(text.tokens)
    #print text1
    text1.sort()
    #text1.append(textfile)
    text.tokens = text1
    #print text.tokens
    #print len(text.tokens)
    global amatrix
    #print ("before: ",text.tokens)
    amatrix = text.create_graph(textfile)
    #print("init:",len(text.tokens))
    #amatrix.draw_ind_png(text.tokens,text.tokens,textfile)
    #print len(amatrix.gmatrix)
    return text.tokens
from __future__ import division

import os
import re
import nltk
import random
import string
import pickle
from lxml import etree
from nltk import bigrams
from nltk import FreqDist
from collections import Counter
from util import ResumeCorpus

user_name = os.environ.get('USER')
punct = string.punctuation
stopwords = nltk.line_tokenize(open('stopwords.txt').read())
porter = nltk.PorterStemmer()


def create_skills_json(training_data):
    """
    This function will extract all the skills from the training corpus and create a dictionary
    with Job Titles as keys and a list of all the skills for that Job Title as values

    Args:
        training_data -- list of tuples. Eg. [(resume, tag, filename), (resume, tag, filename)...]

    Returns:
        skills_dict -- A dictionary with Job Titles as keys and a list of all the skills for that Job Title as values
    """
def get_line_breaks(text):
    # preserve line breaks
    lines = nltk.line_tokenize(text, blanklines='keep')
    snippets = [len(tokenizer.tokenize(line)) for line in lines[:-1]]
    breaks = np.array(snippets).cumsum() + np.arange(len(snippets))
    return breaks
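# Hedged usage sketch: `tokenizer` above is assumed to be any object with a
# .tokenize(str) method (an NLTK word tokenizer is one option) and np is numpy.
# Each entry of `breaks` is then the position of a line break in the flat token
# stream, counting one break token after every line.
import numpy as np
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
print(get_line_breaks("first line here\n\nsecond line"))  # [3 4]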
import nltk
import os
import json
import pickle

# assign input path and file name
inpath = 'org_texts/'
infile = inpath + 'tasis_faq.txt'

# assign empty dict
d = {}

# open txt file and read it
with open(infile) as f:
    trial = f.read()

# tokenize by line
tokenize = nltk.line_tokenize(trial)

# even-indexed lines (questions) become keys, the following odd-indexed lines (answers) become values
for i in range(0, len(tokenize), 2):
    d[tokenize[i]] = tokenize[i + 1]

# assign output path and filename
outpath = 'dict_texts/'
outname = outpath + 'tasis_faq_dict'

# write outfile
outfile = open(outname, 'wb')
pickle.dump(d, outfile)
outfile.close()
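# Hedged usage sketch: reading the pickled FAQ mapping back in. The path simply
# mirrors the outname used above; nothing else here comes from the original source.
with open('dict_texts/tasis_faq_dict', 'rb') as saved:
    faq = pickle.load(saved)
for question, answer in list(faq.items())[:3]:
    print(question, '->', answer)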
Returns lists of tokens, tags, and items, then writes them to text files.
'''

import nltk

# Load text file
data = open("tpb_sopa.txt")
# OUT: <type 'file'>

# Read file as string
input = data.read()
# OUT: <type 'str'>

# Parse text lines into tokens
lines = nltk.line_tokenize(input)

# Write list of lines to disk
FILE = open("out/lines.txt", "w")
for line in lines:
    FILE.writelines(str(line) + '\n')
FILE.close()

print "lines: " + str(len(lines))

# Parse sentences into tokens
sentences = nltk.sent_tokenize(input)

# Write list of sentences to disk
FILE = open("out/sentences.txt", "w")
for sentence in sentences:
    FILE.writelines(str(sentence))
FILE.close()