def clean_text(s):
    s = preprocess.to_lower_case(s)
    s = preprocess.remove_special_chars(s)
    s = preprocess.remove_accents(s)
    s = preprocess.remove_stopwords(s)
    s = preprocess.remove_punctuation(s)
    s = preprocess.remove_extra_space(s)
    return s
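# A minimal usage sketch of clean_text, assuming the preprocess helpers above
# behave as their names suggest; the sample string is hypothetical.
if __name__ == "__main__":
    sample = "  Él dijo: ¡HOLA,   mundo!  "
    # Expected: lowercased, accent-free text with special characters,
    # stopwords, punctuation, and extra whitespace stripped.
    print(clean_text(sample))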
def index_document(document, inverted_index, movieID):
    tokens = nltk.word_tokenize(document)                        # Tokenize the script/synopsis
    tokens = [x for x in tokens if x not in string.punctuation]  # Drop punctuation
    tokens = remove_stopwords(tokens)                            # Remove the stopwords
    tokens = stem_words(tokens)                                  # Stem words
    tokens = [token.lower() for token in tokens]                 # Make all words lowercase
    create_inverted_index(inverted_index, movieID, tokens)       # Build the inverted index
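# For reference, a toy dict-of-lists illustration of the inverted-index idea
# that create_inverted_index (defined elsewhere) builds; the real posting
# structure is an assumption here.
def _toy_inverted_index_demo():
    toy_index = {}
    for movie_id, toks in [(1, ["alien", "ship"]), (2, ["ship", "crew"])]:
        for tok in toks:
            toy_index.setdefault(tok, []).append(movie_id)
    return toy_index  # {"alien": [1], "ship": [1, 2], "crew": [2]}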
def calculate_query_TFIDF(query_string, inverted_index, num_files, profile):
    # Words that appear often in profile text but have no bearing on the
    # user's likes/dislikes
    words_to_remove = ["birthday", "bday", "facebook", "lol", "thank",
                       "christmas", "hanukkah", "happy"]

    # First preprocess the query (the social media profile)
    m = NameDataset()
    tokens = nltk.word_tokenize(query_string)                    # Tokenize the string with NLTK
    tokens = [x for x in tokens if x not in string.punctuation]  # Drop punctuation
    query_tokens = remove_stopwords(tokens)                      # Remove the stopwords
    # Keep only words that are: 1) in English, 2) not in words_to_remove,
    # and 3) not a first or last name
    query_tokens = [x for x in query_tokens
                    if (wordnet.synsets(x)
                        and x not in words_to_remove
                        and not m.search_first_name(x))
                    and not m.search_last_name(x)]
    query_tokens = stem_words(query_tokens)                      # Stem words
    query_tokens = [x.lower() for x in query_tokens]             # Convert all tokens to lowercase
    query_tokens = [x for x in query_tokens if x != 'birthdai']  # Drop the stemmed form of "birthday"

    query_appearances = collections.Counter()
    query_weights = [0] * len(inverted_index)  # Vector to hold query weights
    query_length = 0.0
    l = list(inverted_index.keys())            # Ordered list of the index terms

    for query_token in query_tokens:           # Count term appearances in the query
        query_appearances[query_token] += 1

    # Assign a nonzero weight to each query term that appears in the inverted index
    for query_term in query_appearances:
        if query_term in inverted_index:
            index_of_word = l.index(query_term)                     # Position of the term in the ordered dict
            num_postings = inverted_index[query_term].length + 0.0  # Document frequency
            idf = math.log10(num_files / num_postings)              # Inverse document frequency
            tf = query_appearances[query_term]                      # Term frequency
            query_weights[index_of_word] = tf * idf                 # Query weight
            query_length += (tf * idf) * (tf * idf)                 # Running total for query length
    query_length = math.sqrt(query_length)                          # Final query length

    # Write the query data to pickles
    with open("data/" + profile + "/query_appearances.pickle", "wb") as pickle_out:
        pickle.dump(query_appearances, pickle_out)
    with open("data/" + profile + "/query_weights.pickle", "wb") as pickle_out2:
        pickle.dump(query_weights, pickle_out2)

    return (query_weights, query_length, query_appearances)  # Tuple of all necessary data
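# A worked sketch of the weighting above, with illustrative numbers:
# for num_files = 1000 and a term posted in 10 documents,
#   idf = log10(1000 / 10) = 2.0;
# if that term occurs twice in the query, tf = 2 and weight = tf * idf = 4.0,
# contributing 4.0 ** 2 = 16.0 to query_length before the final square root.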
def summarize(article):
    # macOS workaround so NLTK can download resources over HTTPS without
    # certificate verification
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context
    # nltk.download('averaged_perceptron_tagger')

    sentences = preprocess.tokenize_sentences(article)
    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)
    clean_sentences = [s.lower() for s in clean_sentences]
    clean_sentences = [preprocess.remove_stopwords(r.split()) for r in clean_sentences]

    # Load pre-trained 100-dimensional GloVe word embeddings
    word_embeddings = {}
    with open('/Users/apple/Downloads/glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            word_embeddings[word] = coefs

    # Represent each sentence as the average of its word vectors
    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0:
            v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()]) / (len(i.split()) + 0.001)
        else:
            v = np.zeros((100,))
        sentence_vectors.append(v)

    # Pairwise cosine similarity between sentence vectors
    sim_mat = np.zeros([len(sentences), len(sentences)])
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1, 100),
                                                  sentence_vectors[j].reshape(1, 100))[0, 0]

    # Rank sentences with PageRank over the similarity graph (TextRank)
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Turn each ranked sentence into a fill-in-the-blank question by masking
    # its first named entity
    ques = []
    for i in range(len(ranked_sentences)):
        article = ranked_sentences[i][1]
        print("Article:", article)
        doc = nlp(article)
        print([(X.text, X.label_) for X in doc.ents])
        for X in doc.ents:
            if X.label_:
                article = article.replace(X.text, "__________")
                ques.append(article)
                break
        print(i + 1, ":", article)
    print(ques)
    return ques
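# For intuition, the pairwise score above is plain cosine similarity; a minimal
# NumPy equivalent of one cell of sim_mat (the vectors here are random stand-ins):
if __name__ == "__main__":
    import numpy as np
    u, v = np.random.rand(100), np.random.rand(100)
    cos = float(u @ v / (np.linalg.norm(u) * np.linalg.norm(v)))
    # Matches cosine_similarity(u.reshape(1, -1), v.reshape(1, -1))[0, 0]
    print(cos)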
# 1.a
lines = get_file_lines(test_file_name)
print("***********Test-1***********")
print(lines)
print("****************************")

# 1.b
for i in range(len(lines)):
    lines[i] = clean_text(lines[i])
print("***********Test-2***********")
print(lines)
print("****************************")

# 1.c
for i in range(len(lines)):
    lines[i] = remove_stopwords(stopwords_file, lines[i], do_clean=True)
print("***********Test-3***********")
print(lines)
print("****************************")

# 1.d
for i in range(len(lines)):
    lines[i] = apply_stemming(lines[i])
    lines[i] = apply_lemmatization(lines[i])
print("***********Test-4***********")
print(lines)
print("****************************")

# 1.e
print("***********Test-5***********")
for line in lines:
# Build bigram and trigram phrase models; a higher threshold yields fewer phrases
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

data_words_nostops = remove_stopwords(data_words)
data_words_bigram = make_bigrams(data_words_nostops)

# Lemmatize, keeping only content-bearing parts of speech
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words_bigram,
                                allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Map each lemmatized document to a bag-of-words corpus
id2word = corpora.dictionary.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=3,
                                            random_state=100,
                                            update_every=1,
def test_remove_stopwords(self):
    t1 = "There is not way in hell I'm gonna wait till 1am for transfer news. :)"
    t2 = "I had a very vivid dream that I was pregnant last night, (like, scary real) and today I've felt off. And something still doesn't feel right."
    self.assertEqual(p.remove_stopwords("not", ["not"]), "not")
import os
import nltk
import xlwt
import preprocess

def intersect(a, b):
    return list(set(a) & set(b))

files = [x for x in os.listdir(os.getcwd() + "/Output") if "_output" in x]
glo = [[0, 0, 0] for _ in range(9)]

for File in files:
    # The reference file name is everything before the first underscore
    targetFile = ""
    for ch in File:
        if ch == "_":
            break
        else:
            targetFile = targetFile + ch
    target = open(os.getcwd() + "/clusters _summarized(2)/" + targetFile)
    # Read the two reference summaries, lowercased and stopword-stripped
    summary1 = nltk.word_tokenize(preprocess.remove_stopwords((target.readline()[:-1]).lower()))
    summary2 = nltk.word_tokenize(preprocess.remove_stopwords((target.readline()[:-1]).lower()))
    print(File)
    length_summary1 = len(summary1)
    print("K: " + str(summary1))
    length_summary2 = len(summary2)
    print("P: " + str(summary2))
    target.close()
    book = xlwt.Workbook(encoding="utf-8")
    sheet1 = book.add_sheet("Sheet 1", cell_overwrite_ok=True)
    counter = 0
    sheet1.write(counter, 0, "Base")
    sheet1.write(counter, 1, "Precision")
import preprocess
import pandas as pd

filename = 'dataset_related.csv'
output = 'data_related_extracted/data_related_extracted_remstop.txt'

df = pd.read_csv(filename)
df_extract = df.loc[:, ['tweet', 'class']]
df_extract = df_extract.dropna()
df_extract = df_extract.drop_duplicates()

tweets = df_extract.tweet.values.tolist()
classes = df_extract['class'].values.tolist()

# Preprocess
for i in range(len(tweets)):
    tweets[i] = tweets[i].replace('\n', ' ')
    tweets[i] = preprocess.preprocess(tweets[i])
    tweets[i] = preprocess.remove_punc(tweets[i])
    tweets[i] = preprocess.lemmatize(tweets[i])
    tweets[i] = preprocess.remove_stopwords(tweets[i])

s = []
for tweet, cl in zip(tweets, classes):
    s.append(tweet + '\t' + str(int(cl)) + '\n')

with open(output, 'wb') as f:
    for x in s:
        f.write(x.encode('utf-8'))
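# Each output line is "<preprocessed tweet>\t<integer class label>"; an
# illustrative (not actual) line:
#   quake reported downtown	1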
from underthesea import pos_tag, word_tokenize
import preprocess
from nltk.tokenize import word_tokenize as word_tokenize1

while True:
    question = input('Query: ')
    print(pos_tag(question))
    print(word_tokenize(question))
    print(preprocess.remove_stopwords(question.lower()))
    print(word_tokenize1(question.lower()))

# 1Q nC --> relevance 1Q-1C-Score --> top kC
# 1Q -> VECTOR, 1C -> VECTOR
# 1 ele vector = 1 word
def third_problem(a, b, c):
    tree = stree.Suffix_tree()
    for i in range(0, len(b)):
        tree.add(b[i])
    rank = [0] * len(c)
    count = 1

    # Pass 1: rank tales by how often the full query string matches
    out = tree.search(a)
    if out != -1:
        out.sort()
        k = 0
        out2 = []
        while k < len(out):
            out3 = [0, out[k][0]]
            j = 1
            while k + j < len(out) and out[k][0] == out[k + j][0]:
                j += 1
            out3[0] += j
            out2.append(out3)
            k += j
        out2.sort()
        out2.reverse()
        for j in range(0, len(out2)):
            rank[out2[j][1]] = count
            count += 1

    # Pass 2: for still-unranked tales, count matches of individual query words
    if 0 in rank:
        d = (r.remove_stopwords([a]))[0].split(" ")
        outing = [[0, j] for j in range(len(rank))]
        for i in d:
            out4 = tree.search(i)
            if out4 != -1:
                for j in range(0, len(out4)):
                    if out4[j][0] < len(outing):
                        outing[out4[j][0]][0] += 1
        outing.sort()
        outing.reverse()
        k = 0
        while k < len(outing) and outing[k][0] != 0:
            if rank[outing[k][1]] == 0:
                rank[outing[k][1]] = count
                count += 1
            k += 1

    # Pass 3: fall back to matches of all query substrings, weighted by length
    if 0 in rank:
        subs2 = get_all_substrings(a)
        i = len(subs2) - 2
        outing = [[0, j] for j in range(len(rank))]
        while i >= 0:
            output = tree.search(subs2[i])
            if output != -1:
                for j in range(0, len(output)):
                    if output[j][0] < len(outing):
                        outing[output[j][0]][0] += len(subs2[i])
            i -= 1
        outing.sort()
        outing.reverse()
        k = 0
        while k < len(outing) and outing[k][0] != 0:
            if rank[outing[k][1]] == 0:
                rank[outing[k][1]] = count
                count += 1
            k += 1

    # Any tale still unranked shares the lowest rank
    if 0 in rank:
        for q in range(0, len(rank)):
            if rank[q] == 0:
                rank[q] = count

    last_out = [[rank[j], j] for j in range(len(rank))]
    last_out.sort()
    print("\t\tTITLES OF THE TALES IN ORDER OF RELEVANCE (FROM HIGHEST TO LOWEST) FOR THE QUERY STRING '",
          a, "' ARE =>\n\n")
    for i in last_out:
        print("\t\t\t\t'", c[i[1]], "'\n")
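# Hedged usage sketch; the argument roles below are inferred from the code
# above, and the sample values are hypothetical:
#   a: the query string
#   b: list of tale texts indexed into the suffix tree
#   c: list of tale titles, parallel to b
# third_problem("brave little fox", tale_texts, tale_titles)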
#!/usr/bin/env python3
import preprocess

A = open('data/FS_FSociety.txt').read()
B = preprocess.normalize(A).lower()
C = preprocess.remove_stopwords(B)

from preprocess import CollocationList
coll2 = CollocationList(C)
coll2.find_collocations()
collocations = coll2.head(40)
D = preprocess.utils.hypenation(C, collocations)
coll3 = CollocationList(D)
coll3.find_collocations()

from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"\s+", gaps=True)
tokens = tokenizer.tokenize(D)

print("Initial word count:", len(B.split()))
print("Count without stopwords:", len(C.split()))
print("Count after collocation hyphenation:", len(D.split()))

tokens_unique = set(tokens)
print("Unique words:", len(tokens_unique))

# Initialize a dictionary to hold the number of appearances of each word
dict = {}
for word in tokens_unique:
    dict[word] = 0
# Dictionary mapping word -> number of appearances
for token in tokens:
    dict[token] += 1

# Working with a tuple may be better: a list of [appearances, word]
tupla = []