def word_vec_matrix(self, model, one_hot):
    training_data = {"article": [], "summaries": []}
    i = 1
    for k in range(len(self.data["articles"])):
        art = []
        summ = []
        for word in wt(self.data["articles"][k].lower()):
            try:
                art.append(model.wv.word_vec(word))
            except Exception as e:
                print(e)
        for word in wt(self.data["summaries"][k].lower()):
            try:
                summ.append(one_hot[word])
                #summ.append(model.wv.word_vec(word))
            except Exception as e:
                print(e)
        training_data["article"].append(art)
        training_data["summaries"].append(summ)
        if i % 100 == 0:
            logger.info("progress: " + str((i * 100) / len(self.data["articles"])))
        i += 1
    print('\007')
    return training_data
def dict_build():
    print("------------Building Dictionary------------")
    # build the dictionary from the UOttawa and Reuters JSON corpora
    # *Full text, Altered Text, Stemmed, stopwords removal, normalized
    # access the corpus files
    uottawa_json = os.path.dirname(os.path.join(os.getcwd())) + "/Json_data/uottawa_corpus.json"
    reuters_json = os.path.dirname(os.path.join(os.getcwd())) + "/Json_data/final_reuters_corpus.json"
    corpus_collection = [uottawa_json, reuters_json]
    #corpus_collection = ["reuters_corpus.json", "uottawa_corpus.json"]

    # dictionary JSON structure
    dict = {
        'fullText': set(),
        'alteredText': set(),
        'stemmedText': set(),
        'stopWord': set(),
        'normalized': set()
    }

    # enumerate allows us to loop over something and have an automatic counter
    # docID -> article number starting with 1
    # find title, body, topic; create snippet
    for id, corpus_data in enumerate(corpus_collection):
        with open(corpus_data) as corpus:
            data = json.load(corpus)
            for values in data:
                tokenized_title = [
                    word.lower() for word in wt(values['title'])
                    if word not in string.punctuation
                    and not any(i.isdigit() for i in word) and word != ""
                ]
                tokenized_description = [
                    word.lower() for word in wt(values['description'])
                    if word not in string.punctuation
                    and not any(i.isdigit() for i in word) and word != ""
                ]
                # full text
                dict['fullText'] |= set(tokenized_title)
                dict['fullText'] |= set(tokenized_description)
                # altered text [normalized, stemmed, stopword-removed title and description]
                dict['alteredText'] |= normalize(stemming(remove_stopwords(tokenized_title)))
                dict['alteredText'] |= normalize(stemming(remove_stopwords(tokenized_description)))
                # stopword removal
                dict['stopWord'] |= remove_stopwords(tokenized_title)
                dict['stopWord'] |= remove_stopwords(tokenized_description)
                # stemming [Porter stemmer]
                dict['stemmedText'] |= stemming(tokenized_title)
                dict['stemmedText'] |= stemming(tokenized_description)
                # normalization
                dict['normalized'] |= normalize(tokenized_title)
                dict['normalized'] |= normalize(tokenized_description)

    with open('dictionary.json', 'w') as outfile:
        my_dict_lists = {k: list(v) for (k, v) in dict.items()}
        json.dump(my_dict_lists, outfile, ensure_ascii=False, indent=4)
    print("------------Done------------")
def preprocess_snli_jsonl(file_path, vocab_idx, out_file, vocab_size=30000):
    X1 = []
    X2 = []
    l1 = []
    l2 = []
    Y = []
    labels = {'neutral': 0, 'entailment': 1, 'contradiction': 2}
    with codecs.open(file_path, 'r', 'utf-8') as f:
        for line in f:
            line = json.loads(line)
            if line['gold_label'] not in labels:
                continue
            sentence1 = [w.lower() for w in wt(line['sentence1'])]
            s1 = []
            for w in sentence1:
                s1.append(vocab_idx[w] if w in vocab_idx else vocab_size - 1)
            sentence2 = [w.lower() for w in wt(line['sentence2'])]
            s2 = []
            for w in sentence2:
                s2.append(vocab_idx[w] if w in vocab_idx else vocab_size - 1)
            X1.append(np.array(s1))
            X2.append(np.array(s2))
            l1.append(len(s1))
            l2.append(len(s2))
            Y.append(labels[line['gold_label']])
    writer = codecs.open(out_file, 'wb')
    data = {'X1': np.array(X1), 'X2': np.array(X2),
            'l1': np.array(l1), 'l2': np.array(l2), 'Y': np.array(Y)}
    pickle.dump(data, writer)
    writer.close()
def gen_signature(word):
    """Generate a signature for each candidate expansion, using contextual
    information from the Brown corpus, as well as WordNet definitions and
    examples (if applicable)."""
    if word in gen_signature.dict:
        return gen_signature.dict[word]
    inds = find_matches(word)
    if len(inds) > 50:
        f = len(inds) / 50
        inds = [inds[int(i * f)] for i in range(50)]
    signature = defaultdict(int)
    for i in inds:
        for w in gen_context(i, brown):
            signature[w] += 1
    sig = {w for w in signature
           if signature[w] > 1 and w not in stopwords.words('english') and w != ','}
    if word in wn.words():
        if wn.synsets(word) and str(wn.synsets(word)[0]).count("'") == 2:
            define = eval("wn.{}.definition()".format(str(wn.synsets(word)[0]).lower()))
            examples = eval("wn.{}.examples()".format(str(wn.synsets(word)[0]).lower()))
            if examples:
                for ex in examples:
                    sig.update([w for w in wt(ex) if w not in stopwords.words('english')])
            if define:
                sig.update([w for w in wt(define) if w not in stopwords.words('english')])
    gen_signature.dict[word] = sig
    return sig
def wordvecmatrix(model, data):
    IO_data = {"article": [], "summaries": []}
    i = 1
    for k in range(len(data["articles"])):
        art = []
        summ = []
        for word in wt(data["articles"][k].lower()):
            try:
                art.append(model.wv.word_vec(word))
            except Exception as e:
                print(e)
        for word in wt(data["summaries"][k].lower()):
            try:
                summ.append(onehot[word])
                #summ.append(model.wv.word_vec(word))
            except Exception as e:
                print(e)
        IO_data["article"].append(art)
        IO_data["summaries"].append(summ)
        if i % 100 == 0:
            print("progress: " + str((i * 100) / len(data["articles"])))
        i += 1
    #announcedone()
    print('\007')
    return IO_data
def create_wordclouds(self, text, name_of_cloud, additional_stop_list, max_words, width, height, bigram=False):
    text_nopunc = self.remove_punctuation(text, "", "")
    text_lower = text_nopunc.lower()
    stop = self.stopwords
    stop.extend(additional_stop_list)
    text_nostop = self.remove_stopword(text_lower, stop)
    tokens = wt(text_nostop)
    text_lem = self.lemmatize(tokens)
    tokens_lem = wt(text_lem)
    my_bigrams = nltk.bigrams(tokens_lem)
    if bigram:
        bigram_merged = list()
        for line in my_bigrams:
            bigram_merged.append(line[0] + ' ' + line[1])
        counts = collections.Counter(bigram_merged)
    else:
        counts = collections.Counter(tokens_lem)
    final = counts.most_common(max_words)
    max_count = max(final, key=operator.itemgetter(1))[1]
    final = [(name, count / float(max_count)) for name, count in final]
    # tags = make_tags(final, maxsize=max_word_size)
    # create_tag_image(tags, name_of_cloud+'.png', size=(width, height), layout=3, fontname='Crimson Text', background=(255, 255, 255))
    # temp_cloud = " ".join(text for text, count in final)
    word_cloud = WordCloud(font_path="fonts/Georgia.ttf", width=width, height=height,
                           max_words=max_words, stopwords=stop)
    word_cloud.fit_words(final)
    word_cloud.to_file(name_of_cloud + ".png")
def inference(x1, x2):
    # tokenize and pad
    x1 = wt(x1.lower().strip())
    x2 = wt(x2.lower().strip())
    if len(x1) >= 16:
        x1 = x1[:16]
    else:
        while len(x1) < 16:
            x1.append("pad")
    if len(x2) >= 16:
        x2 = x2[:16]
    else:
        while len(x2) < 16:
            x2.append("pad")
    q1 = []
    q2 = []
    for word in x1:
        try:
            q1.append(model1.wv.word_vec(word))
        except Exception as e:
            q1.append(model1.wv.word_vec("pad"))
            continue
    for word2 in x2:
        try:
            q2.append(model1.wv.word_vec(word2))
        except Exception as e2:
            q2.append(model1.wv.word_vec("pad"))
            continue
    x1 = np.asarray(q1, dtype='float32').reshape((1, 16, 256))
    x2 = np.asarray(q2, dtype='float32').reshape((1, 16, 256))
    sim_prob = siamese_model.predict([x1, x2])
    return sim_prob[0][0]
def prepareData(CandidateList):
    positiveText = ""
    negativeText = ""
    neutralText = ""
    vectors = []
    labels = []
    for candidate in CandidateList:
        positiveDict = candidate.positive
        for item in positiveDict:
            text = positiveDict[item].tweet
            positiveText += text
            vec = text.split()
            vectors.append(vec)
            labels.append("positive")
    for candidate in CandidateList:
        negativeDict = candidate.negative
        for item in negativeDict:
            text = negativeDict[item].tweet
            negativeText += text
            vec = text.split()
            vectors.append(vec)
            labels.append("negative")
    for candidate in CandidateList:
        neutralDict = candidate.neutral
        for item in neutralDict:
            text = neutralDict[item].tweet
            neutralText += text
            vec = text.split()
            vectors.append(vec)
            labels.append("neutral")
    positiveTokens = wt(positiveText)
    negativeTokens = wt(negativeText)
    neutralTokens = wt(neutralText)
    positiveDist = freq(positiveTokens)
    negativeDist = freq(negativeTokens)
    neutralDist = freq(neutralTokens)
    tempVector = defaultdict()
    mostCount = 30
    mostPositive = positiveDist.most_common(mostCount)
    mostNegative = negativeDist.most_common(mostCount)
    mostNeutral = neutralDist.most_common(mostCount)
    for mytuple in positiveDist.items():
        if mytuple not in mostPositive and mytuple[1] > 1:
            tempVector[len(tempVector)] = mytuple[0]
    for mytuple in negativeDist.items():
        if mytuple not in mostNegative and mytuple[1] > 1:
            tempVector[len(tempVector)] = mytuple[0]
    for mytuple in neutralDist.items():
        if mytuple not in mostNeutral and mytuple[1] > 1:
            tempVector[len(tempVector)] = mytuple[0]
    print(len(tempVector))
    tempvector = {tempVector[w]: w for w in tempVector}
    print(len(tempvector))
    return (vectors, labels, tempvector)
def kmtokens(text, *args):
    if len(args) == 0:
        tokens = wt(text)
        return [stemmer.stem(token.strip()) for token in tokens
                if len(token) > 1 and token not in stopwords]
    if args[0] == 'pos':
        goodpos = ['N', 'J', 'R', 'V']
        tokens = nltk.pos_tag(wt(text))
        return [stemmer.stem(token[0].strip()) for token in tokens
                if len(token[0]) > 1 and token[0] not in stopwords]
def get_details(url_arg):
    url = urlopen(url_arg)
    html = url.read()
    url.close()
    soup = bs(html, "html.parser")
    movie_name = " ".join(
        wt(soup.title.get_text())[:wt(soup.title.get_text()).index('Reviews')])
    total_reviews = int(wt(soup.find(align="right").get_text())[0])
    total_pages = math.ceil(total_reviews / 10)
    return (movie_name, total_reviews, total_pages)
def get_details(movie_url):
    print(movie_url)
    url = urlopen(movie_url)
    html = url.read()
    url.close()
    soup = bs(html, "html.parser")
    movie = " ".join(
        wt(soup.title.get_text())[:wt(soup.title.get_text()).index("Reviews")])
    total_reviews = int(wt(soup.find(align="right").get_text())[0])
    total_pages = int(math.ceil(total_reviews / 10))
    return (movie, total_reviews, total_pages)
def removeStopWords(t):
    stop_words = stopwords.words("english")
    word_feature_vector = []
    for word in wt(t):
        if word not in stop_words:
            word_feature_vector.append(word)
    return word_feature_vector
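# Minimal usage sketch for removeStopWords above -- an illustrative addition, not part
# of the original snippet. It assumes the imports used throughout this collection and
# that the NLTK 'punkt' and 'stopwords' data have been downloaded.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize as wt

print(removeStopWords("this is a simple example of stop word removal"))
# expected: ['simple', 'example', 'stop', 'word', 'removal']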
def analyze(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    print(text)
    token_words = wt(text, 'english')
    print(token_words)
    complete_text = []
    for word in token_words:
        if word not in stopwords.words('english'):
            complete_text.append(word)
    s_count = sexual_count(complete_text)
    p_count = physical_count(complete_text)
    sl_count = slurs_count(complete_text)
    if (s_count == 0) and (p_count == 0) and (sl_count == 0):
        return "Good news! Content warnings are not applicable to this literature!"
    else:
        s = "There are " + str(s_count) + " instances of sexually violent words."
        p = "There are " + str(p_count) + " instances of physically violent words."
        ss = "There are " + str(sl_count) + " instances of slurs."
        output = s + p + ss
        return output
def process_unlabel_lapt_for_bilstmcrf(input_fn, output_fn):
    if os.path.exists(output_fn):
        print('data already exists', output_fn)
        return
    res = []
    for f in os.listdir(input_fn):
        if not f.endswith('json'):
            continue
        f = open(input_fn + '/' + f)
        js = json.load(f)
        f.close()
        reviews = js['Reviews']
        contents = [r['Content'] for r in reviews if r['Content'] is not None]
        res.extend(contents)
    with open(output_fn, 'w') as f:
        for content in res:
            content = content.strip().lower()
            sents = st(content)
            for sent in sents:
                tokens = wt(sent)
                for token in tokens:
                    f.write(token.encode('utf-8'))
                    f.write(' O\n')
                f.write('\n')
def word_overlap_score(inverted_index, docs, vectors, lam, default_dist=1):
    doc_len = len(docs)
    score_mat = np.full((doc_len, doc_len), default_dist, dtype="float32")
    index_array = lambda sents: reduce(
        lambda accu, indx_array: accu + indx_array,
        [inverted_index.get(word, []) for word in sents], [])
    tokens = [wt(sents.replace('.', '')) for sents in docs]
    words_inverded_index = list(map(index_array, tokens))
    count_dict_list = list(map(Counter, words_inverded_index))

    def get_score(i, j, intersect):
        n1 = len(tokens[i])
        n2 = len(tokens[j])
        overlap = 1 - 2 * float(intersect) / max(n1 + n2, 1)
        cosine = cosine_similarity(vectors[i].T, vectors[j].T)
        final_score = max((1 - lam) * overlap + lam * cosine.all(), 0)
        score_mat[i][j] = final_score
        score_mat[j][i] = final_score

    [[get_score(i, j, count_dict_list[i][j])
      for j in count_dict_list[i] if j >= i]
     for i in tqdm(range(doc_len))]
    return score_mat
def stem_words(data):
    def feature_tokens(tokens):
        stemtokens = list()
        for i in range(len(tokens)):
            if tokens[i] == 'not':
                i += 1
                continue
            if tokens[i] not in stop_words and not tokens[i].endswith("i"):
                stemmed = ps.stem(tokens[i])
                if len(stemmed) > 2:
                    stemtokens.append(stemmed)
        return stemtokens

    # initiate list for counting word frequencies in the list of documents
    new_train = list()
    for rawtext in data:
        # remove line breaks, indenting, punctuation, contractions
        text = processText(rawtext)
        # adds all stems that aren't stopwords
        tokens = wt(text)
        stemtokens = feature_tokens(tokens)
        new_train.append(' '.join(stemtokens))
    # print(new_train)
    return new_train
def check_nouns_adverbs_adjectives(sentence):
    tokenized = wt(sentence)
    tags = nltk.pos_tag(tokenized)
    return " ".join([
        i[0] for i in tags
        if (i[1][0] == 'N') or (i[1][0] == 'R') or (i[1][0] == 'J')
    ])
def normalize_query_and(q):
    q = wt(q)
    normalized_q = []
    for word in q:
        if word not in punctuation:
            normalized_q.append(stemmer.stem(word))
    return normalized_q
def preprocessTxt(txtfile, newfile):
    with open(txtfile, 'r') as reader:
        data = reader.readlines()
    new_data = []
    renew_data = []
    pro_data = []
    for line in data:
        sents = wt(line.strip())
        new_data.append(sents)
    with open(newfile, 'w') as etf:
        for line in new_data:
            renew_line = []
            pro_line = []
            for word in line:
                word = str.lower(word)
                renew_line.append(word)
                etf.write('{} '.format(word))
                if word[0].isalpha():
                    pro_line.append(word)
                else:
                    pass
            renew_data.append(renew_line)
            pro_data.append(pro_line)
            etf.write('\n')
    print('txtfile preprocessed.')
    return renew_data, pro_data
def build_vocab(train_file, vocab_file):
    vocab = defaultdict(int)
    with codecs.open(train_file, 'r', 'utf-8') as f:
        for line in f:
            d = json.loads(line)
            sentence1 = [w.lower() for w in wt(d['sentence1'])]
            sentence2 = [w.lower() for w in wt(d['sentence2'])]
            for word in sentence1:
                vocab[word] += 1
            for word in sentence2:
                vocab[word] += 1
    writer = codecs.open(vocab_file, 'wb')
    vocab = sorted(vocab.items(), key=itemgetter(1), reverse=True)
    pickle.dump(vocab, writer)
    writer.close()
def tfidfCounts(data):
    # initiate dict for counting word frequencies in the list of documents
    count = {}
    for rawtext in data:
        # remove line breaks, indenting, punctuation, contractions
        text = processText(rawtext)
        # adds all stems that aren't stopwords
        tokens = wt(text)
        stemtokens = feature_tokens(tokens)
        docwords = {}
        # adds all bigrams in the form [not ___] (excluding stop words)
        # stemtokens += [tokens[i] + ps.stem(tokens[i + 1]) for i in range(len(tokens)-1) if tokens[i] == 'not' and not tokens[i + 1] in stop_words]
        for stem in stemtokens:
            if stem not in count:
                count[stem] = 0
            if stem not in docwords:
                docwords[stem] = 0
        docInfo = 0
        for word in docwords:
            for stem in stemtokens:
                if word == stem:
                    docwords[word] += 1
        for word in docwords:
            count[word] += math.log(len(stemtokens) / docwords[word]) * docwords[word]
    return count
def summonehot(corpus):
    allwords = []
    annotated = {}
    for sent in corpus:
        for word in wt(sent):
            allwords.append(word.lower())
    print(len(set(allwords)), "unique words in corpus")
    #maxcorp=int(input("Enter desired number of vocabulary: "))
    maxcorp = int(len(set(allwords)) / 1.1)
    wordcount = Counter(allwords).most_common(maxcorp)
    allwords = []
    for p in wordcount:
        allwords.append(p[0])
    allwords = list(set(allwords))
    print(len(allwords), "unique words in corpus after max corpus cut")
    # integer encode
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(allwords)
    # one hot
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    # make look-up dict
    for k in range(len(onehot_encoded)):
        inverted = cleantext(
            label_encoder.inverse_transform([argmax(onehot_encoded[k, :])])[0]).strip()
        annotated[inverted] = onehot_encoded[k]
    return label_encoder, onehot_encoded, annotated
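# Hypothetical usage sketch for summonehot above -- an addition for illustration, not
# part of the original source. It assumes the snippet's dependencies are in scope:
# word_tokenize as wt, collections.Counter, sklearn's LabelEncoder/OneHotEncoder,
# numpy's argmax, and the cleantext() helper the snippet calls.
#
# label_encoder, onehot_encoded, annotated = summonehot(list_of_sentences)
# 'annotated' maps each retained (cleaned, lower-cased) word to its one-hot numpy row,
# and each row's length equals the vocabulary size kept after the frequency cut.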
def encode(self, corpus):
    all_words = []
    one_hot = {}
    for sent in corpus:
        for word in wt(' '.join(sent)):
            all_words.append(word.lower())
    #print(len(set(all_words)), "unique words in corpus")
    logger.info(str(len(all_words)) + ' unique words in corpus')
    #maxcorp=int(input("Enter desired number of vocabulary: "))
    maxcorp = int(len(set(all_words)) / 1.1)
    wordcount = Counter(all_words).most_common(maxcorp)
    all_words = []
    for p in wordcount:
        all_words.append(p[0])
    all_words = list(set(all_words))
    #print(len(all_words), "unique words in corpus after max corpus cut")
    #logger.info(str(len(all_words)) + ' unique words in corpus after max corpus cut')
    # integer encode
    #label_encoder = LabelEncoder()
    #integer_encoded = label_encoder.fit_transform(all_words)
    # one hot
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(all_words)
    onehot_encoder = OneHotEncoder(sparse=False)
    #integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(np.array(all_words).reshape(-1, 1))
    for i in range(len(onehot_encoded)):
        word = label_encoder.inverse_transform([argmax(onehot_encoded[i, :])])[0].strip()
        one_hot[word] = onehot_encoded[i]
    #print(len(one_hot.keys()))
    return one_hot
def makeEmbeddingMap(text, corpusCounts, numgram, maxperkey):
    if numgram < 2:
        return -1
    freqs = getNgramCounts(text, numgram)
    topgrams = getNMax(freqs, 2000)
    stemmedWords = {}
    onlyStem = {}
    for entry in topgrams:
        tokens = wt(entry[0])
        tokens = [stemmer.stem(token) for token in tokens]
        stem = " ".join(tokens)
        if stem in stemmedWords:
            stemmedWords[stem][entry[0]] = entry[1]
            onlyStem[stem] += entry[1]
        else:
            stemmedWords[stem] = {}
            stemmedWords[stem][entry[0]] = entry[1]
            onlyStem[stem] = entry[1]
    topgrams = getNMax(onlyStem, 1500)
    vectorMap = {}
    for word in corpusCounts:
        vectorMap[word[0]] = list()
        for gram in topgrams:
            if (len(vectorMap[word[0]]) - 1 < maxperkey
                    and word[0] in gram[0]
                    and gram[0] not in vectorMap[word[0]]):
                vectorMap[word[0]].append(gram[0])
    return vectorMap
def feature_extractor(preprocessed_tweet):
    feature = {}
    word_and_tag = nltk.pos_tag(wt(preprocessed_tweet))
    # add POS as a feature too if the accuracy is low
    all_tags = []
    all_words = []
    for (w, t) in word_and_tag:
        all_tags.append(t)
    for (w, t) in word_and_tag:
        if t in ["VB", "VBD", "VBN", "VBP", "VBZ", "VBG"]:
            all_words.append(w)
        if t in ["JJ", "JJR", "JJS"]:
            all_words.append(w)
        if t in ["RB", "RBS", "RBR"]:
            all_words.append(w)
        if t in ["WRB", "MD", "IN", "RP", "CD", "NN", "NNP"]:
            all_words.append(w)
    for word in feature_bag_of_word:
        feature[word] = (word in all_words)
    for tag in tags:
        feature[tag] = (tag in all_tags)
    return feature
def proc_sentence(sentence):
    raw_text = sentence.find('text').text.lower()
    opinions = sentence.find('Opinions')
    if opinions is None:
        return (wt(raw_text), ['O'] * len(wt(raw_text)))
    # make sure the target words will be separated by inserting spaces
    text_for_tokenization = raw_text[:].replace('/', ' / ')
    for opinion in opinions:
        if 'target' in opinion.attrib and opinion.attrib['target'] != 'NULL':
            text_for_tokenization = text_for_tokenization.replace(
                opinion.attrib['target'], ' ' + opinion.attrib['target'] + ' ')
    tokens = wt(text_for_tokenization)
    spans = get_spans(raw_text, tokens)
    # map original character index to the tokenized word index
    char_idx_to_word_idx = {s[1]: idx for idx, s in enumerate(spans)}
    tags = ['O'] * len(spans)
    #print(char_idx_to_word_idx)
    if opinions is not None:
        for opinion in opinions:
            if 'from' not in opinion.attrib:
                continue
            sidx = int(opinion.attrib['from'])
            eidx = int(opinion.attrib['to'])
            if sidx == eidx == 0:
                continue
            token_sidx, token_eidx = 1000, 0
            tag = 'B'
            for idx in range(sidx, eidx):
                if idx in char_idx_to_word_idx:
                    token_sidx = min(token_sidx, char_idx_to_word_idx[idx])
                    token_eidx = max(token_eidx, char_idx_to_word_idx[idx])
                    tags[char_idx_to_word_idx[idx]] = 'B'
            for idx in range(token_sidx, token_eidx + 1):
                tags[idx] = tag
                tag = 'I'
            if sidx not in char_idx_to_word_idx:
                print('warning', tokens, text_for_tokenization, sidx, spans,
                      zip([s[0] for s in spans], tags))
                #raise Exception('warning', tokens, text_for_tokenization, sidx, spans, zip([s[0] for s in spans], tags))
    return (tokens, tags)
def predict(trained_model, testing_path):
    print("Start predict module")
    try:
        with open('test_featuresets.dmp', 'rb') as fp:
            print("Test featuresets found!")
            documents = pickle.load(fp)
    except:
        print("No existing test featuresets. Make a new one.")
        from os import listdir
        pospath = testing_path + r"\pos"
        negpath = testing_path + r"\neg"
        posfiles = listdir(pospath)
        negfiles = listdir(negpath)
        documents = []
        for fn in posfiles:
            ff = open(pospath + '/' + fn, encoding='windows-1252')
            ftok = [word.lower() for word in wt(ff.read())]
            fset = set(ftok)
            tmpdict = {}
            for key in fset:
                tmpdict[key] = ftok.count(key)
            documents.append((tmpdict, 'pos'))
        for fn in negfiles:
            ff = open(negpath + '/' + fn, encoding='windows-1252')
            ftok = [word.lower() for word in wt(ff.read())]
            fset = set(ftok)
            tmpdict = {}
            for key in fset:
                tmpdict[key] = ftok.count(key)
            documents.append((tmpdict, 'neg'))
        with open('test_featuresets.dmp', 'xb') as fp:
            pickle.dump(documents, fp)
    model_predictions = []
    ground_truth = []
    count = 0
    for body, senti in documents:
        count += 1
        model_predictions.append(trained_model.classify(body))
        ground_truth.append(senti)
        print("Doc " + str(count) + " Done")
    print("Finish predict module")
    return model_predictions, ground_truth
def normalize_and():
    for doc in content:
        doc = wt(doc.lower())
        sentence = []
        for word in doc:
            if word not in punctuation:
                sentence.append(stemmer.stem(word))
        processed_docs.append(sentence)
def parseStopWords(sentence):
    words = wt(sentence)
    english = st.words('english')
    ss = ''
    for word in words:
        if word.lower() not in english:
            ss += word + ' '
    return ss
def removeURLs(unprocessed_dataset_with_label):
    for tweet in unprocessed_dataset_with_label:
        tweet_words_array = wt(tweet["tweet"])
        for word in list(tweet_words_array):
            if (word == 'https' or word == 'https…' or word == 'http'
                    or word == 'http…' or word == ':' or word[:2] == '//'):
                tweet_words_array.remove(word)
        tweet["tweet"] = " ".join(tweet_words_array)
    return unprocessed_dataset_with_label
def get_sents_vector(sents, model):
    """
    Convert a given sentence into a vector representation.
    :param sents: raw sentence string (tokenized internally)
    :param model: embedding model
    :return: sentence vector
    """
    return vectorize_sentence(wt(sents), model)
def get_inverted_index(docs):
    tokesize = lambda x: wt(x)
    tokens_array = list(map(tokesize, docs))
    inverted_index = {}
    _ = [[inverted_index.setdefault(word, []).append(i)
          for word in tokens_array[i]]
         for i in range(len(tokens_array))]
    return inverted_index
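# Minimal usage sketch for get_inverted_index above -- an illustrative addition, not
# part of the original snippet. It assumes nltk's word_tokenize is imported as wt and
# the 'punkt' tokenizer data is available.
from nltk.tokenize import word_tokenize as wt

print(get_inverted_index(["the cat sat", "the dog sat"]))
# expected: {'the': [0, 1], 'cat': [0], 'sat': [0, 1], 'dog': [1]}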
def process_query(self):
    """Q.process_query() -- processes the user query, by tokenizing and
    stemming words.
    """
    self.query = wt(self.query)
    self.processed_query = []
    for word in self.query:
        if word not in self.stop_words and word not in self.punctuation:
            self.processed_query.append(self.stemmer.stem(word))
def normalized_and():
    processed_docs = []
    for doc in content:
        doc = wt(doc)
        sentence = []
        for word in doc:
            if word not in punctuation:
                sentence.append(word)
        processed_docs.append(sentence)
def remove_stop_words_and_stem(soup):
    result = get_soup_text(soup)
    tokenized_text = wt(result)
    filtered_array = []
    for word in tokenized_text:
        # Eliminating the stopwords
        if word not in stop_words and word.isalpha():  # Only considering alphabetical strings
            filtered_array.append(stemmer.stem(word))
    return TextBlob(' '.join(filtered_array))
def tokenize(source):
    words = wt(source.lower().replace("can't", "can").replace("won't", "will").replace("gonna", "go"))
    # keep alphabetic tokens longer than one character
    words = [x for x in words if x.isalpha() and len(x) > 1]
    # lemmatize with WordNet's morphy, falling back to the surface form
    reformed = [morphy(x) or x for x in words]
    made = []
    count = len(reformed)
    for word in reformed:
        if word not in made:
            made.append(word)
    return made, count
def normalize_query(q):
    q = re.sub(r'(,)([0-9]+)', r"\1 \2", q.lower())
    q = re.sub(r'boundary', "four six", q)
    q = wt(q)
    normalized_q = []
    for word in q:
        if word not in stop and word not in punctuation:
            if word in no_replace_query.keys():
                word = no_replace_query[word]
            normalized_q.append(stemmer.stem(word))
    return normalized_q
def normalize():
    for doc in content:
        doc = re.sub(r'(,)([0-9]+)', r"\1 \2", doc.lower())
        doc = wt(doc)
        sentence = []
        for word in doc:
            if word not in stop and word not in punctuation:
                word = stemmer.stem(word)
                if word in no_replace.keys():
                    word = no_replace[word]
                sentence.append(word)
        processed_docs.append(sentence)
def process_corpus(self):
    """Q.process_corpus() -- processes the queries defined by us, by
    tokenizing, stemming, and removing stop words.
    """
    for doc in self.corpus_list:
        doc = wt(doc)
        sentence = []
        for word in doc:
            if word not in self.stop_words and word not in self.punctuation:
                word = self.stemmer.stem(word)
                sentence.append(word)
        self.processed_corpus.append(sentence)
def simHelper(T):
    '''
    Given a text T, return a POS-tagged word list.
    '''
    alphanum = letters + octdigits
    # part of speech word list for the text
    fullList = [word for subl in [pos_tag(wt(s)) for s in st(T)] for word in subl]
    # remove symbols and -NONE- tags from list by checking the first character of the word and tag
    posList = [word for word in fullList if word[1][0] in alphanum and word[0][0] in alphanum]
    return posList
def initial_work():
    with open('C:/Users/Isha/Desktop/AldaSampling/data/StopWords.txt', 'rb') as infile:
        splitwordslist = infile.read().decode('UTF-8')
    myfilehtml = open('C:/Users/Isha/Desktop/AldaSampling/output/Processed/unsponsored/1010023_raw_html.txt')
    soup = BeautifulSoup(myfilehtml, 'html.parser')
    for scriptdata in soup.findAll('script'):
        scriptdata.extract()
    result = soup.get_text().lower()
    output = open("output.txt", "w")
    output.write(result)
    tokenized_text = wt(result)
    stop_words = wt(splitwordslist)
    for words in tokenized_text:
        if words not in stop_words:
            if words.isalpha():
                filtered_array.append(ps.stem(words))
    print(len(filtered_array))
    print(filtered_array)
    filtered_doc = ' '.join(filtered_array)
    print(filtered_doc)
    list_of_docs.append(tblob(filtered_doc))
    list_of_docs.append(tblob('experi unicorn astro table'))
    print(list_of_docs)
def filter_sentences(corpus='rbc_se.txt', dic='names.txt', newfile='rbc_sent_filt_new.txt'):
    name_dict = set([x.strip('\n') for x in open('.\\preprocessing\\' + dic, encoding='utf-8')])
    text = open(corpus, encoding='utf-8')
    new_file = open(newfile, 'w', encoding='utf-8')
    for line in text:
        tokens = wt(line.strip('\n'))
        for word in tokens:
            if word in punkt:
                continue
            word = word.lower()
            if word in name_dict:
                new_file.write(line)
                break
    text.close()
    new_file.close()
def pos_tag(self, data, columns, tag):
    result = pd.DataFrame(columns=columns)
    data = data.dropna()
    result.title = data.title
    for index, row in data.iterrows():
        if not isinstance(row['summary'], int):
            #temp = self.normalize(row['summary'].strip('#'))
            temp = row['summary']
            pos = pos_tag(wt(temp))
            temp = " ".join(noun[0] for noun in pos if noun[1] in tag)
            print(temp)
            result.loc[index, 'summary'] = unicode(temp)
    # for c in columns:
    #     for index, row in data.iterrows():
    #         if not isinstance(row[c], int):
    #             temp = self.normalize(row[c])
    #             pos = pos_tag(wt(temp))
    #             temp = " ".join(noun[0] for noun in pos if noun[1] in tag)
    #             result.loc[index, c] = unicode(temp)
    return result.dropna()
def summarize(self, text, n):
    sents = st(text)
    # assert is a way of making sure a condition holds true;
    # it will throw an error if it is false
    assert n <= len(sents)
    # list of lists of all the sentences
    word_sent = [wt(s.lower()) for s in sents]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    # enumerate creates an (index, element) tuple for each entry in the list,
    # which removes the need for a counter variable and makes indexing easy
    for i, sent in enumerate(word_sent):
        for word in sent:
            if word in self._freq:
                ranking[i] += self._freq[word]
    sents_idx = nlargest(n, ranking, key=ranking.get)
    return [sents[j] for j in sents_idx]  # list comprehension over the top-ranked indices
# average paragraph size
wst = WhitespaceTokenizer()
paraWordCounts = [len(wst.tokenize(p)) for p in paragraphs]
# the approximate number of words in the document
numWords = sum(paraWordCounts)
# the average number of words per paragraph
avgParagraphLen = mean(paraWordCounts)
# rejoin the paragraphs
text = ' '.join(paragraphs)
# part of speech word list for the text
text = [word for subl in [pos_tag(wt(s)) for s in st(text)] for word in subl]
# remove symbols from list by checking the first character of the word
text = [word for word in text if word[0][0] in alphanum]
# convert words to lowercase and convert Penn Tree Bank tags to WordNet tags
text = [(word[0].lower(), convertTag(word[1])) for word in text]
# remove Nones
text = [word for word in text if word[1]]
nouns = [word for word in text if word[1] == 'n']
numNouns = len(nouns)
verbs = [word for word in text if word[1] == 'v']
numVerbs = len(verbs)
# This doesn't make much sense, but oh well. The experiment has been experimented.
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize as wt

ps = PorterStemmer()

# words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
# for w in words:
#     print(ps.stem(w))

new_text = "It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once"
words = wt(new_text)
for w in words:
    print(ps.stem(w))
def __extractSynSets(T):
    '''
    Given a text T (as a string) find all words that have WordNet synsets
    @return a unique list of SynSet objects
    '''
    ''' CONSTANTS '''
    nounTags = ['NN', 'NNP', 'NNS', 'NNPS']
    verbTags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    adjTags = ['JJ', 'JJR', 'JJS']
    advTags = ['RB', 'RBR', 'RBS']
    alphanum = letters + octdigits

    def convertTag(tag):
        '''
        Converts a Penn Tree Bank POS tag to a WordNet tag
        @return the converted tag, otherwise None
        '''
        if tag in nounTags:
            return 'n'
        elif tag in verbTags:
            return 'v'
        elif tag in adjTags:
            return 'as'  # adjectives in WordNet can be head adj 'a' or satellite adj 's'
        elif tag in advTags:
            return 'r'
        else:
            return None

    def getSynSet(w):
        '''
        For a word 'w' with POS tag 'tag' find the corresponding WordNet synset
        @return the best matching synset for 'w', otherwise None
        '''
        tag = w[1]
        word = w[0]
        # get the list of possible synsets for w
        sets = wn.synsets(word)
        if not tag or sets == []:
            return None
        # look through the list of possible synsets for the first one w/ a pos tag that matches 'tag'
        for s in sets:
            if s.pos in tag:
                return s
        return None

    # part of speech word list for the text
    fullList = [word for subl in [pos_tag(wt(s)) for s in st(T)] for word in subl]
    # remove symbols and -NONE- tags from list by checking the first character of the word and tag
    posList = [word for word in fullList if word[1][0] in alphanum and word[0][0] in alphanum]
    # convert words to lowercase and convert Penn Tree Bank tags to WordNet tags
    posList = [(word[0].lower(), convertTag(word[1])) for word in posList]
    # remove words for which there is no WordNet tag (i.e. tag is None) and remove duplicate values
    posList = list(set([word for word in posList if word[1]]))
    # for the words in the POS list create a list of syn sets using their tags (remove None values)
    synSets = [n for n in [getSynSet(w) for w in posList] if n]
    return synSets
'''
Beautifier is a module that does NLP on HTML files from the data for this project
'''
from bs4 import BeautifulSoup as bs
from nltk.tokenize import word_tokenize as wt
from nltk.stem import PorterStemmer
from textblob import TextBlob

stop_words = []
stemmer = PorterStemmer()

with open('res/StopWords.txt', 'rb') as infile:
    stop_words_file = infile.read().decode('UTF-8')
    stop_words = set(wt(stop_words_file))


def get_soup_text(soup):
    for scriptdata in soup.findAll(["script", "style"]):
        scriptdata.extract()
    return soup.text.lower()


def remove_stop_words_and_stem(soup):
    result = get_soup_text(soup)
    tokenized_text = wt(result)
    filtered_array = []
    for word in tokenized_text:
        # Eliminating the stopwords
        if word not in stop_words and word.isalpha():  # Only considering alphabetical strings
            filtered_array.append(stemmer.stem(word))
    return TextBlob(' '.join(filtered_array))
def clean_tokens():
    n_topics = 100
    # get all tweets
    tw = [line.strip('\n') for line in open('corpus_full')]
    # lower case and tokenize
    print('Lower casing')
    tokens = [[word.lower() for word in wt(tt)] for tt in tw]
    # filter punctuation
    print('Filtering punctuations')
    punc = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*',
            '@', '#', '$', '%']
    tokens_unpunc = [[w for w in tk if w not in punc] for tk in tokens]
    # filter stop words
    print('Filtering stopping words')
    english_stopwords = stopwords.words('english')
    tokens_filtered = [[w for w in tk if w not in english_stopwords] for tk in tokens_unpunc]
    # stemming
    print('Stemming words')
    st = LancasterStemmer()
    tokens_stemmed = [[st.stem(w) for w in tk] for tk in tokens_filtered]
    # eliminate words with count == 1
    '''
    print 'Eliminating words appear once'
    all_items = sum(tokens_stemmed, [])
    print len(all_items)
    print 'Building once'
    once = set(t for t in set(all_items) if all_items.count(t) == 1)
    print 'Generating final tokens'
    final_tokens = [[s for s in tk if s not in once] for tk in tokens_stemmed]
    '''
    # eliminate some specific words and words that appear only once
    count = collections.defaultdict(int)
    for t in tokens_stemmed:
        for w in t:
            if w == 'http' or w[0:6] == '//t.co':
                print(w)
            else:
                count[w] += 1
    tokens_stemmed = [[st.stem(w) for w in tk] for tk in tokens_filtered]
    tokens_final = [[w for w in tk if count[w] > 1] for tk in tokens_stemmed]
    # LDA
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    dictionary = gensim.corpora.Dictionary(tokens_final)
    print('Building corpus for LDA')
    corpus = [dictionary.doc2bow(t) for t in tokens_final]
    print('LDA')
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                          num_topics=n_topics)
    print(lda.print_topics())
    # extract topics for tweets
    topic_matrix = []
    for i in range(98900):
        topics = lda[dictionary.doc2bow(tokens_final[i])]
        v = [0.] * n_topics
        for t in topics:
            v[t[0]] = t[1]
        topic_matrix.append(v)
    # write matrix to the disk
    np.save('topic_matrix', topic_matrix)
    # write topics to the disk
    topics = lda.show_topics(-1)
    with open('topics', 'a') as f:
        for i, t in enumerate(topics):
            f.write(str(i) + '-' + t + '\n')
'''
Created on Mar 24, 2011

@author: Blodstone
'''
from nltk import ne_chunk as nc
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize as wt

tokenizeWords = wt('Who is Samuel Pickering ?')
pos = pos_tag(tokenizeWords)
n = nc(pos)
print(n)