def main():
    with open("sentiment.txt", 'r') as _file:
        stemmer = PorterStemmer()
        features = []
        for words in _file:
            feature = []
            is_sentence = True
            # skip the polarity label (first token)
            for word in words.split()[1:]:
                try:
                    word = word.decode("utf-8")
                    if word not in [".", ",", ":", "?", "!"] \
                            and not has_stop_list(word):
                        feature.append(stemmer.stem(word))
                except UnicodeDecodeError:
                    # ignore lines with garbled characters
                    is_sentence = False
                    break
            if is_sentence:
                features.append(feature)
        return features
def make_tags(title_string):
    stemmer = PorterStemmer()
    ret = []
    for word in title_string.split():
        if word not in stop_words:
            # stem() replaces the stem_word() method that is no longer
            # available in recent NLTK releases
            ret.append(stemmer.stem(word.lower()))
    return ret
def stemming(line_list):
    """
    Input: line_list (list of strings (sentences/documents)) - e.g. dataset.data
    Iterates over all terms in the lines and stems them.
    Return: stemmed_list (list of strings containing the stemmed terms)
    """
    stemmed_list = []
    stemmer = PorterStemmer()
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()
        # Remove punctuation. Simply dropping punctuation characters would merge
        # tokens such as amazon.com => amazoncom, so replace them with spaces instead:
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
        # tokenize
        line_token = wt(nopunct_line)
        # stem each term
        stemmed_line = []
        for term in line_token:
            term = stemmer.stem(term)
            stemmed_line.append(term)
        # back to a sentence as a string
        stemmed_sentence = ' '.join(stemmed_line)
        stemmed_list.append(stemmed_sentence)
    return stemmed_list
def stemm(cls, tokens):
    stemmer = PorterStemmer()
    for i, t in enumerate(tokens):
        tokens[i] = stemmer.stem(t)
    return tokens
def process_email(filename):
    f = open(filename, 'r')
    text = f.read()
    f.close()
    text = text.lower()
    # replace html tags with a space
    text = re.sub(r'<[^<>]+>', ' ', text)
    # replace numbers with the word "number"
    text = re.sub(r'[0-9]+', 'number', text)
    # replace URLs with the word "httpaddr"
    text = re.sub(r'(http|https)://[^\s]*', 'httpaddr', text)
    # replace email addresses with the word "emailaddr"
    text = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', text)
    # replace dollar signs with the word "dollar"
    text = re.sub(r'[$]+', 'dollar', text)
    # remove punctuation and non-words and separate words
    words = re.split('[^a-z0-9]| ', text)
    # drop empty strings
    words = filter(lambda x: x != '', words)
    # reduce words to their stems
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    return words
def _log_likelihood(answer_text, stemmed_vocabulary, distrib_matrix):
    LL = 0
    if answer_text != '':
        tokens = word_tokenize(str(answer_text), language='english')
        porter_stemmer = PorterStemmer()
        unique_wordcount = len(stemmed_vocabulary)
        # For every unique word w in the tokens:
        #   Cw = count of w in answer_text
        #   PwM = distrib_matrix[stem(w)]
        #   unique_wordcount = len(tokenize(answer_text))
        for w in tokens:
            _w = w.strip().lower()
            Cw = 0
            for _ in answer_text.split():
                if _w == _.strip().lower():
                    Cw += 1
            try:
                w_stem = porter_stemmer.stem(_w.decode('utf-8', 'replace').encode('ascii', 'replace'))
            except AttributeError:
                # str has no decode() on Python 3; stem the token directly
                w_stem = porter_stemmer.stem(_w)
            try:
                PwM = distrib_matrix[w_stem]
            except KeyError:
                # a key error means the frequency equals the cutoff point of 1
                PwM = 1
            LL += (Cw * log(float(PwM)))
        try:
            LL = "{0:.2f}".format(LL / float(unique_wordcount))
        except ZeroDivisionError:
            LL = 0
    return LL
def openAndProcessingFiles(path, resultDict):
    # Main function: open each file under the path and process it
    for filename in os.listdir(os.getcwd() + path):
        thisFile = open(os.getcwd() + path + '/' + filename, 'r')
        # store the file as a single string so HTML tags can be removed
        currentTextString = " ".join(thisFile.read().split())
        # remove HTML tags (string)
        textAfterHtmlRemovingString = re.sub('<[^>]*>', '', currentTextString)
        # convert the string to a list so the text contains only words
        textAfterHtmlRemovingList = textAfterHtmlRemovingString.split()
        textRemoveingUnnecessaryCharactersList = [removeUnnecessaryCharacters(word) for word in textAfterHtmlRemovingList]
        textRemoveingUnnecessaryCharactersList = [word for word in textRemoveingUnnecessaryCharactersList if word is not None]
        stop_words = set(stopwords.words('english'))
        # new stop words added by analysing the previous result sets
        stop_words.update(['texthtml', 'html', 'server', 'email', 'date', 'gmt', 'www'])
        # remove stopwords
        textAfterStopwordsRemovingList = [word for word in textRemoveingUnnecessaryCharactersList if word not in stop_words]
        # stemming
        stemmer = PorterStemmer()
        for eachWord in textAfterStopwordsRemovingList:
            eachWord = stemmer.stem(eachWord)
            storeToResultDict(eachWord, resultDict)
        thisFile.close()
def review_to_words(raw_review, remove_stopwords=False):
    # BeautifulSoup pulls data out of an html file;
    # here it removes html tags and markup
    text = BeautifulSoup(raw_review).get_text()
    # replace numbers with the word "number"
    text = re.sub(r'[0-9]+', 'number', text)
    # remove punctuation (it could be analysed for better results)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    # make a list of words
    words_list = text.split()
    # download the nltk text data sets, including stop words, if needed
    # nltk.download()
    if remove_stopwords:
        # get stopwords; searching a set is faster than searching a list
        stops = set(stopwords.words('english'))
        # remove stopwords
        words_list = [word for word in words_list if word not in stops]
    # reduce words to their stems
    stemmer = PorterStemmer()
    words_list = [stemmer.stem(word) for word in words_list]
    # return the list of words
    return words_list
def get_ngram_features(self):
    stemmer = PorterStemmer()
    top_features = [(stemmer.stem(token) + "__TOP__", True) for token in self.top_text]
    bottom_features = [(stemmer.stem(token) + "__BOTTOM__", True) for token in self.bottom_text]
    all_features = [(stemmer.stem(token) + "__ALL__", True) for token in self.all_text]
    self.ngram_features = dict(top_features + bottom_features + all_features)
def normalize(word):
    '''
    Normalize the word for querying or indexing.
    :param word: unicode string
    :return: unicode string of the normalized term
    '''
    porter = PorterStemmer()
    return porter.stem(word) if word[0].isalpha() else ''
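# A minimal usage sketch (not part of the original snippet), assuming NLTK is installed;
# it only illustrates the two branches of normalize() above.
print(normalize(u'running'))      # a stemmed form such as 'run'
print(repr(normalize(u'42nd')))   # '' because the first character is not alphabetic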
def __process_email(self, email_contents, vocab):
    '''
    Preprocess the body of an email and return a list of word indices.

    Arguments:
        email_contents (str): Email body.
        vocab (dict): Words dictionary.

    Return:
        (list of int): Indices into vocab for the tokenized email body.
    '''
    # Lower case.
    email_contents = email_contents.lower()

    # Strip all HTML:
    # look for any expression that starts with < and ends with >,
    # does not contain any < or > inside, and replace it with a space.
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle numbers: one or more characters between 0-9.
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLs: strings starting with http:// or https://.
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle email addresses: strings with @ in the middle.
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign.
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # Tokenize and get rid of punctuation (a character class keeps the same
    # delimiter set while avoiding the empty-match groups of the original pattern).
    word_list = re.split(r'[ @$/#.\-:&*+=\[\]?!(){}|,\'">_<;%]', email_contents)

    # Remove empty strings and skip words that are too short.
    word_list = [s for s in word_list if s and len(s) > 1]

    # Remove any non-alphanumeric characters.
    word_list = [re.sub('[^a-zA-Z0-9]', '', s) for s in word_list]

    # Remove empty strings and skip words that are too short.
    word_list = [s for s in word_list if s and len(s) > 1]

    # Stem the words (stem() replaces the stem_word() method that is no
    # longer available in recent NLTK releases).
    ps = PorterStemmer()
    word_list = [ps.stem(s) for s in word_list]

    word_indices = []
    # Find each word's index in the vocab list.
    for w in word_list:
        if w in vocab:
            word_indices.append(vocab[w])

    return word_indices
def processContent(self, content):
    stemmer = PorterStemmer()
    tokens = word_tokenize(content)
    tokens = filter(lambda x: len(x) < 20 and x.isalnum(), tokens)
    tokens = [stemmer.stem(token.lower()) for token in tokens]
    tokens = filter(lambda x: x not in stopwords.words('english'), tokens)
    tokens = [str(token) for token in tokens]
    bow = FreqDist(tokens)
    return bow
def getStemmedWords(self, html):
    stemmed_words = []
    # stemmer = SnowballStemmer("english")
    stemmer = PorterStemmer()
    for token in html:
        # stem() replaces the stem_word() method removed from newer NLTK releases
        stemmed_words.append(stemmer.stem(token))
    return ' '.join(stemmed_words)
def main():
    # Use the file defined by BIOC_IN as default if no other is provided
    bioc_in = BIOC_IN
    if len(sys.argv) >= 2:
        bioc_in = sys.argv[1]

    # A BioCReader object is put in place to hold the example BioC XML document
    bioc_reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE)

    # A BioCWriter object is prepared to write out the annotated data
    bioc_writer = BioCWriter(BIOC_OUT)

    # The NLTK Porter stemmer is used for stemming
    stemmer = PorterStemmer()

    # The example input file given above (by BIOC_IN) is fed into a
    # BioCReader object; validation is done by the BioC DTD
    bioc_reader.read()

    # Pass over basic data
    bioc_writer.collection = bioc_reader.collection

    # Get documents to manipulate
    documents = bioc_writer.collection.documents

    # Go through each document
    annotation_id = 0
    for document in documents:
        # Go through each passage of the document
        for passage in document:
            # Stem all the tokens found
            stems = [stemmer.stem(token) for token in wordpunct_tokenize(passage.text)]
            # Add an annotation showing the stemmed version, in the given order
            for stem in stems:
                annotation_id += 1
                # For each token an annotation is created, providing the
                # surface form of a 'stemmed token'. (The annotations are
                # collectively added following a document passage with a
                # <text> tag.)
                bioc_annotation = BioCAnnotation()
                bioc_annotation.text = stem
                bioc_annotation.id = str(annotation_id)
                bioc_annotation.put_infon('surface form', 'stemmed token')
                passage.add_annotation(bioc_annotation)

    # Print the file to screen w/o trailing newline
    # (can be redirected into a file, e.g. output_bioc.xml)
    sys.stdout.write(str(bioc_writer))

    # Write to disk
    bioc_writer.write()
def stemmingword(word_list, stemtype='porter'):
    if stemtype == 'porter':
        stemengine = PorterStemmer()
    else:
        stemengine = LancasterStemmer()
    try:
        filtered_words = [stemengine.stem(token).encode('latin-1', errors='ignore')
                          for token in word_list]
    except UnicodeDecodeError:
        # "Error in the character encoding, discarding text ..."
        print('Error en el tipo de caracteres descartando texto "{}"'.format(' '.join(word_list)))
        filtered_words = []
    return filtered_words
def getPosWords():
    stemmer = PorterStemmer()
    stemmedPosTokens = []
    pos = open(r'pos.txt').read()
    pos = re.sub("\d", "", pos)
    posWords = nltk.word_tokenize(pos)
    for posWord in posWords:
        stemmedPosWord = stemmer.stem(posWord)
        stemmedPosTokens.append(stemmedPosWord.lower())
    return stemmedPosTokens
def preprocess(result):
    words = removePunct(result.title)
    words += " "
    words += removePunct(result.snippet)
    result.tokens = nltk.word_tokenize(words)
    # Rebuild the token list so the stemmed forms are actually kept
    # (reassigning the loop variable would leave result.tokens unchanged);
    # the Python 2 decode/encode round-trip is unnecessary on Python 3 strings.
    processed = []
    for tok in result.tokens:
        if tok not in STOPS:
            tok = PorterStemmer().stem(tok)
            tok = tok.lower()
        processed.append(tok)
    result.tokens = processed
    return result
def tokenize(self, sentence, do_stopwords, do_stemming, use_bigrams):
    words = word_tokenize(sentence)
    words = [w.lower() for w in words if len(w) > 2]
    if do_stopwords:
        words = [w for w in words if w not in stop_set]
    if do_stemming:
        stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]
    if use_bigrams:
        words = bigrams(words)
    return words
def update_Porter_stemming():
    # We use stems occasionally.
    print("Updating stems from Porter algorithm...")
    from nltk import PorterStemmer
    stemmer = PorterStemmer()
    cursor.execute("""SELECT word FROM words WHERE wordid <= 750000 and stem is null;""")
    words = cursor.fetchall()
    for local in words:
        word = ''.join(local)
        if re.match("^[A-Za-z]+$", word):
            query = """UPDATE words SET stem='""" + stemmer.stem(''.join(local)) + \
                    """' WHERE word='""" + ''.join(local) + """';"""
            z = cursor.execute(query)
def stemmer(self, raw):
    """
    Use the Porter stemmer from the nltk library to stem tokens in raw text.
    """
    tokens = word_tokenize(raw)
    porter = PorterStemmer()
    # lancaster = LancasterStemmer()
    # stem_lancaster = [lancaster.stem(t) for t in tokens]
    stem_porter = [porter.stem(t) for t in tokens]
    return stem_porter
def getNegWords():
    stemmer = PorterStemmer()
    stemmedNegTokens = []
    neg = open(r'neg.txt').read()
    neg = re.sub("\d", "", neg)
    negWords = nltk.word_tokenize(neg)
    for negWord in negWords:
        stemmedNegWord = stemmer.stem(negWord)
        stemmedNegTokens.append(stemmedNegWord.lower())
    return stemmedNegTokens
def getUncertainWords():
    stemmer = PorterStemmer()
    stemmedUnTokens = []
    un = open(r'uncertain.txt').read()
    un = re.sub("\d", "", un)
    unWords = nltk.word_tokenize(un)
    for unWord in unWords:
        stemmedUnWord = stemmer.stem(unWord)
        stemmedUnTokens.append(stemmedUnWord.lower())
    return stemmedUnTokens
def clean_data_to_feed_classifier(tweests):
    st = PorterStemmer()
    stop = stopwords.words('english')
    parsed_tweests = []
    for x in tweests:
        y = x[0]
        # strip @mentions, non-alphanumeric characters and URLs
        y = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", y).split())
        # collapse runs of a repeated character down to two and drop stopwords
        y = ' '.join(re.sub(r'(.)\1+', r'\1\1', i.lower()) for i in y.split() if i not in stop)
        # keep stems of alphabetic words longer than 3 characters that exist in WordNet
        y = ' '.join(st.stem(i) for i in y.split() if len(i) > 3 and i.isalpha() and wordnet.synsets(i))
        # y = punctuations_repl(y)
        parsed_tweests.append(y)
    return parsed_tweests
def buildTrainTokensBigram(self):
    self.trainTokens = []
    with open(self.trainingData, 'r') as reviews:
        for review in reviews:
            data = json.loads(review)
            words = word_tokenize(data['text'])
            words = [norm(word) for word in words if norm(word)]
            words = [word for word in words if word not in stwords]
            stemmer = PorterStemmer()
            words = [stemmer.stem(word) for word in words]
            featureSet = self.buildWordFeatureSetBigram(words)
            self.trainTokens.append((featureSet, data['stars']))
def normalize_data(lines):
    norm_words = []
    punctuation = ['!', '.', ';', ':', '\'', '"', '`', '?']
    exceptions = ['\n', '\'s', '\'t', " "]
    stemmer = PorterStemmer()
    stop = stopwords.words('english')
    mega_stop_list = list(itertools.chain(punctuation, exceptions))
    print(" Now Normalizing.......")
    for sentence in lines:
        words = [stemmer.stem(word.lower())
                 for word in word_tokenize(sentence.rstrip("\\n"))
                 if word not in [stop, "not"]]
        norm_words.extend([word for word in negate_Ngram(words)
                           if not re.match("[0-9]+", word)
                           if word.lower() not in mega_stop_list])
    return norm_words
def tokenize_normalize(raw):
    '''
    Tokenize raw text.
    :param raw: unicode string
    :return: list[unicode]: a list of tokenized unicode strings

    Example:
        words = tokenize_normalize(line)
    '''
    # don't use any token that is too long (like a genetic sequence)
    tokens = [t for t in word_tokenize(raw) if len(t) < 20]
    porter = PorterStemmer()
    # only interested in word tokens
    tokens_n = [porter.stem(t) for t in tokens if t[0].isalpha()]
    # combine all numbers into one token
    tokens_n = ['NUMBER' if all(a.isdigit() for a in t) else t for t in tokens_n]
    return tokens_n
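# A minimal usage sketch (not part of the original snippet), assuming NLTK and its
# 'punkt' tokenizer data are available; the sample sentence is purely illustrative.
print(tokenize_normalize(u"Stemming reduces related words, e.g. runs and running."))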
def formatText(text):
    text = text.lower()
    # replace punctuation and separators with spaces
    for ch in ['.', '\\', '/', '"', '\'', ':', ';', '(', ')']:
        text = text.replace(ch, ' ')
    porter = PorterStemmer()
    return ' '.join([porter.stem(word) for word in text.split(' ')])
def text_preprocessing(text):
    # lowercase everything
    text = text.lower()
    # remove punctuation
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    text = regex.sub(" ", text)
    # remove stopwords
    no_stopwords = [word for word in text.split() if word.lower() not in ext_stopwords]
    text = " ".join(no_stopwords)
    # stem the words
    stemmer = PorterStemmer()
    text = " ".join([stemmer.stem(w) for w in text.split()])
    return text
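# A minimal usage sketch (not part of the original snippet). It assumes the imports the
# function relies on (re, string, PorterStemmer) are in scope; ext_stopwords comes from
# the surrounding module, so a tiny stand-in set is used here for illustration only.
ext_stopwords = {"the", "a", "of"}
print(text_preprocessing("The Stemming of words: a quick demonstration!"))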
def review_mapper(self, _, data):
    review = data['text']
    rating = data['stars']
    business_id = data['business_id']
    category = data['category']
    words = word_tokenize(review)
    words = [norm(word) for word in words if norm(word)]
    words = [word for word in words if word not in stwords]
    tagged_words = tagger.tag(words)
    stemmer = PorterStemmer()
    tagged_words = [(stemmer.stem(tagged_word[0]), tagged_word[1]) for tagged_word in tagged_words]
    for tagged_word in tagged_words:
        yield (category, tagged_word), (business_id, rating, 1)
def update_Porter_stemming(self):
    # We use stems occasionally.
    print("Updating stems from Porter algorithm...")
    from nltk import PorterStemmer
    stemmer = PorterStemmer()
    cursor = db.query("""SELECT word FROM words""")
    words = cursor.fetchall()
    for local in words:
        # Could probably take the first element of the tuple as well?
        word = ''.join(local)
        # Apostrophe forms have the same stem as the plain word, if they're included
        word = word.replace("'s", "")
        if re.match("^[A-Za-z]+$", word):
            query = """UPDATE words SET stem='""" + stemmer.stem(''.join(local)) + \
                    """' WHERE word='""" + ''.join(local) + """';"""
            z = cursor.execute(query)
def run(
    lr=0.001,
    batsize=20,
    epochs=100,
    embdim=64,
    encdim=128,
    numlayers=1,
    dropout=.25,
    wreg=1e-10,
    cuda=False,
    gpu=0,
    minfreq=2,
    gradnorm=3.,
    beamsize=1,
    cosine_restarts=1.,
    seed=456789,
):
    # DONE: Porter stemmer
    # DONE: linear attention
    # DONE: grad norm
    # DONE: beam search
    # DONE: lr scheduler
    print(locals())
    torch.manual_seed(seed)
    np.random.seed(seed)
    tt = q.ticktock("script")
    device = torch.device("cpu") if not cuda else torch.device("cuda", gpu)
    tt.tick("loading data")
    stemmer = PorterStemmer()
    tokenizer = lambda x: [stemmer.stem(xe) for xe in x.split()]
    ds = GeoQueryDatasetFunQL(
        sentence_encoder=SequenceEncoder(tokenizer=tokenizer),
        min_freq=minfreq)

    train_dl = ds.dataloader("train", batsize=batsize)
    test_dl = ds.dataloader("test", batsize=batsize)
    tt.tock("data loaded")

    do_rare_stats(ds)

    # batch = next(iter(train_dl))
    # print(batch)
    # print("input graph")
    # print(batch.batched_states)

    model = create_model(embdim=embdim,
                         hdim=encdim,
                         dropout=dropout,
                         numlayers=numlayers,
                         sentence_encoder=ds.sentence_encoder,
                         query_encoder=ds.query_encoder,
                         feedatt=True)

    # model.apply(initializer)

    tfdecoder = SeqDecoder(
        model,
        tf_ratio=1.,
        eval=[
            CELoss(ignore_index=0, mode="logprobs"),
            SeqAccuracies(),
            TreeAccuracy(
                tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab))
        ])
    losses = make_array_of_metrics("loss", "elem_acc", "seq_acc", "tree_acc")

    # beamdecoder = BeamActionSeqDecoder(tfdecoder.model, beamsize=beamsize, maxsteps=50)
    if beamsize == 1:
        freedecoder = SeqDecoder(
            model,
            maxtime=100,
            tf_ratio=0.,
            eval=[
                SeqAccuracies(),
                TreeAccuracy(
                    tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab))
            ])
        vlosses = make_array_of_metrics("seq_acc", "tree_acc")
    else:
        print("Doing beam search!")
        freedecoder = BeamDecoder(
            model,
            beamsize=beamsize,
            maxtime=60,
            eval=[
                SeqAccuracies(),
                TreeAccuracy(
                    tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab))
            ])
        vlosses = make_array_of_metrics("seq_acc", "tree_acc")

    # # test
    # tt.tick("doing one epoch")
    # for batch in iter(train_dl):
    #     batch = batch.to(device)
    #     ttt.tick("start batch")
    #     # with torch.no_grad():
    #     out = tfdecoder(batch)
    #     ttt.tock("end batch")
    # tt.tock("done one epoch")
    # print(out)
    # sys.exit()

    # beamdecoder(next(iter(train_dl)))

    # print(dict(tfdecoder.named_parameters()).keys())

    # 4. define optim
    optim = torch.optim.Adam(tfdecoder.parameters(), lr=lr, weight_decay=wreg)
    # optim = torch.optim.SGD(tfdecoder.parameters(), lr=lr, weight_decay=wreg)

    # lr schedule
    if cosine_restarts >= 0:
        # t_max = epochs * len(train_dl)
        t_max = epochs
        print(f"Total number of updates: {t_max} ({epochs} * {len(train_dl)})")
        lr_schedule = q.WarmupCosineWithHardRestartsSchedule(
            optim, 0, t_max, cycles=cosine_restarts)
        reduce_lr = [lambda: lr_schedule.step()]
    else:
        reduce_lr = []

    # 6. define training function (using partial)
    clipgradnorm = lambda: torch.nn.utils.clip_grad_norm_(
        tfdecoder.parameters(), gradnorm)
    trainbatch = partial(q.train_batch, on_before_optim_step=[clipgradnorm])
    trainepoch = partial(q.train_epoch,
                         model=tfdecoder,
                         dataloader=train_dl,
                         optim=optim,
                         losses=losses,
                         _train_batch=trainbatch,
                         device=device,
                         on_end=reduce_lr)

    # 7. define validation function (using partial)
    validepoch = partial(q.test_epoch,
                         model=freedecoder,
                         dataloader=test_dl,
                         losses=vlosses,
                         device=device)
    # validepoch = partial(q.test_epoch, model=tfdecoder, dataloader=test_dl, losses=vlosses, device=device)

    # 7. run training
    tt.tick("training")
    q.run_training(run_train_epoch=trainepoch,
                   run_valid_epoch=validepoch,
                   max_epochs=epochs)
    tt.tock("done training")
def __init__(self):
    super().__init__()
    self._stemmer = PorterStemmer()
def tfidf_classifier(fname):
    with open(fname + ".txt", "r") as file:
        paragraph = file.read()

    # clean the extracted content
    paragraph = " ".join(re.findall(r"\b[a-z0-9]+\b", paragraph, flags=re.I)).lower()

    # get the part of speech for every word in the content
    pos_tag_words = pos_tag(paragraph.split())
    porter_stemmer_obj = PorterStemmer()
    stem = porter_stemmer_obj.stem
    # stem only the verbs
    pos_tag_words = [(str(stem(tag[0])), tag[-1]) if tag[-1].startswith("VB") else tag
                     for tag in pos_tag_words]
    paragraph = " ".join([w[0] for w in pos_tag_words])

    # extract all the nouns, adjectives, adverbs and verbs from the paragraph
    temp_noun_adj_list = []
    temp_verb_adv_list = []
    all_words = []
    all_words_count_dict = {}
    for pos_words in pos_tag_words:
        if pos_words[-1].startswith("NN") or pos_words[-1].startswith("JJ"):
            temp_noun_adj_list.append(pos_words[0])
            if len(temp_verb_adv_list) > 1:
                adv_verb_str = " ".join(temp_verb_adv_list)
                if adv_verb_str not in all_words_count_dict:
                    all_words_count_dict[adv_verb_str] = paragraph.count(adv_verb_str)
                temp_verb_adv_list = []
            elif temp_verb_adv_list:
                if temp_verb_adv_list[0] not in all_words_count_dict:
                    all_words_count_dict[temp_verb_adv_list[0]] = paragraph.count(temp_verb_adv_list[0])
                temp_verb_adv_list = []
        elif pos_words[-1].startswith("VB"):
            temp_verb_adv_list.append(pos_words[0])
            if len(temp_noun_adj_list) > 1:
                adj_noun_str = " ".join(temp_noun_adj_list)
                if adj_noun_str not in all_words_count_dict:
                    all_words_count_dict[adj_noun_str] = paragraph.count(adj_noun_str)
                temp_noun_adj_list = []
            elif temp_noun_adj_list:
                if temp_noun_adj_list[0] not in all_words_count_dict:
                    all_words_count_dict[temp_noun_adj_list[0]] = paragraph.count(temp_noun_adj_list[0])
                temp_noun_adj_list = []
        elif pos_words[-1].startswith("RB"):
            temp_verb_adv_list.append(pos_words[0])
            if len(temp_noun_adj_list) > 1:
                adj_noun_str = " ".join(temp_noun_adj_list)
                if adj_noun_str not in all_words_count_dict:
                    all_words_count_dict[adj_noun_str] = paragraph.count(adj_noun_str)
                temp_noun_adj_list = []
            elif temp_noun_adj_list:
                if temp_noun_adj_list[0] not in all_words_count_dict:
                    all_words_count_dict[temp_noun_adj_list[0]] = paragraph.count(temp_noun_adj_list[0])
                temp_noun_adj_list = []
        else:
            if temp_noun_adj_list:
                adj_noun_str = " ".join(temp_noun_adj_list)
                if adj_noun_str not in all_words_count_dict:
                    all_words_count_dict[adj_noun_str] = paragraph.count(adj_noun_str)
                temp_noun_adj_list = []
            if temp_verb_adv_list:
                adv_str = " ".join(temp_verb_adv_list)
                if adv_str not in all_words_count_dict:
                    all_words_count_dict[adv_str] = paragraph.count(adv_str)
                temp_verb_adv_list = []

    # flush whatever is still buffered after the loop
    if len(temp_noun_adj_list) > 0:
        adj_noun_str = " ".join(temp_noun_adj_list)
        if adj_noun_str not in all_words_count_dict:
            all_words_count_dict[adj_noun_str] = paragraph.count(adj_noun_str)
    if len(temp_verb_adv_list) > 0:
        adv_str = " ".join(temp_verb_adv_list)
        if adv_str not in all_words_count_dict:
            all_words_count_dict[adv_str] = paragraph.count(adv_str)

    with open(fname + ".json", "w") as file:
        json.dump(all_words_count_dict, file)
def feature_maker(embed_file, dataframe, embed_signal='n'):
    '''Takes a path to an embeddings file and a dataframe as input.
    The default keyword embed_signal means that embeddings are not encoded by default.

    Returns an expanded dataframe with:
        a column of lemmatised words;
        a column of stemmed words;
        a column indicating capitalisation status;
        a column indicating capitalisation status of the previous token;
        columns indicating shape, previous shape, short shape,
        previous short shape and following token short shape.

    If kwarg embed_signal is 'y', a list of embeddings is also generated.
    '''
    wnl = WordNetLemmatizer()
    prtr = PorterStemmer()

    stringed_list = [str(x) for x in dataframe['token']]
    wn_lemma_list = [wnl.lemmatize(t) for t in stringed_list]
    dataframe['lemma'] = wn_lemma_list
    prtr_stemmer_list = [prtr.stem(t) for t in stringed_list]
    dataframe['stem'] = prtr_stemmer_list

    dataframe['caps'] = 'no caps'
    dataframe.loc[dataframe['token'].str.contains('^[A-Z][a-z]'), ['caps']] = 'begin_cap'
    dataframe.loc[dataframe['token'].str.contains('[A-Z][A-Z]'), ['caps']] = 'all_caps'
    dataframe.loc[dataframe['token'].str.contains('[a-z][A-Z]]'), ['caps']] = 'caps_inside'

    temp_list = dataframe['caps'].to_list()
    temp_list.insert(0, 'no_cap')
    temp_list.pop()
    dataframe['prev_caps'] = temp_list

    dataframe['short_shape'] = 'x'
    dataframe.loc[dataframe['token'].str.contains('^[A-Z][a-z]'), ['short_shape']] = 'Xx'
    dataframe.loc[dataframe['token'].str.contains('[A-Z][A-Z]'), ['short_shape']] = 'XX'
    dataframe.loc[dataframe['token'].str.contains('[a-z][A-Z]]'), ['short_shape']] = 'xXx'
    dataframe.loc[dataframe['token'].str.contains('\W'), ['short_shape']] = '-'

    prev_short_shape_list = dataframe['short_shape'].to_list()
    prev_short_shape_list.insert(0, '-')
    prev_short_shape_list.pop()
    dataframe['prev_short_shape'] = prev_short_shape_list

    next_short_shape_list = dataframe['short_shape'].to_list()
    next_short_shape_list.pop(0)
    next_short_shape_list.append('-')
    dataframe['next_short_shape'] = next_short_shape_list

    shape_list = []
    pre_list = []
    suf_list = []
    for text in dataframe['token']:
        prefix = text[:3]
        suffix = text[-3:]
        pre_list.append(prefix)
        suf_list.append(suffix)
        replace_caps = re.sub('[A-Z]', 'X', text)
        replace_lowers = re.sub('[a-z]', 'x', replace_caps)
        replace_digits = re.sub('\d', 'd', replace_lowers)
        shape_list.append(replace_digits)
    dataframe['shape'] = shape_list

    prev_shape_list = dataframe['shape'].to_list()
    prev_shape_list.insert(0, '-')
    prev_shape_list.pop()
    dataframe['prev_shape'] = prev_shape_list
    dataframe['prefix'] = pre_list
    dataframe['suffix'] = suf_list

    if embed_signal == 'y':
        word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format(
            embed_file, binary=True)
        embeddings = []
        for token in dataframe['token']:
            if token in word_embedding_model:
                vector = word_embedding_model[token]
            else:
                vector = [0] * 300
            embeddings.append(vector)
        return dataframe, embeddings
    else:
        return dataframe
class NGramAligner(Aligner):

    def __init__(self):
        self.stemmer = PorterStemmer()

    def align(
        self,
        source: Doc,
        targets: List[Doc],
    ) -> List[Dict]:
        alignments = []
        source_ngram_spans = self._get_ngram_spans(source)
        for target in targets:
            target_ngram_spans = self._get_ngram_spans(target)
            alignments.append(
                self._align_ngrams(target_ngram_spans, source_ngram_spans))
        return alignments

    def _get_ngram_spans(
        self,
        doc: Doc,
    ):
        ngrams = []
        for sent in doc.sents:
            for n in range(1, len(list(sent))):
                tokens = [t for t in sent if not (t.is_stop or t.is_punct)]
                ngrams.extend(_ngrams(tokens, n))

        def ngram_key(ngram):
            return tuple(
                self.stemmer.stem(token.text).lower() for token in ngram)

        key_to_ngrams = itertoolz.groupby(ngram_key, ngrams)
        key_to_spans = {}
        for k, grouped_ngrams in key_to_ngrams.items():
            key_to_spans[k] = [(ngram[0].i, ngram[-1].i + 1)
                               for ngram in grouped_ngrams]
        return key_to_spans

    def _align_ngrams(
        self, ngram_spans_1: Dict[Tuple[str], List[Tuple[int, int]]],
        ngram_spans_2: Dict[Tuple[str], List[Tuple[int, int]]]
    ) -> Dict[Tuple[int, int], List[Tuple[int, int]]]:
        """Align ngram spans between two documents

        Args:
            ngram_spans_1: Map from (normalized_token1, normalized_token2, ...) n-gram tuple
                to a list of token spans of format (start_pos, end_pos)
            ngram_spans_2: Same format as above, but for the second text

        Returns:
            map from each (start, end) span in text 1 to the list of aligned (start, end) spans in text 2
        """
        if not ngram_spans_1 or not ngram_spans_2:
            return {}
        max_span_end_1 = max(
            span[1] for span in itertools.chain.from_iterable(ngram_spans_1.values()))
        token_is_available_1 = [True] * max_span_end_1
        # Matched normalized ngrams between the two documents
        matched_keys = list(
            set(ngram_spans_1.keys()) & set(ngram_spans_2.keys()))
        matched_keys.sort(
            key=len, reverse=True)  # Process n-grams from longest to shortest

        # Map from each matched span in text 1 to the list of aligned spans in text 2
        alignment = defaultdict(list)
        for key in matched_keys:
            spans_1 = ngram_spans_1[key]
            spans_2 = ngram_spans_2[key]
            available_spans_1 = [
                span for span in spans_1
                if all(token_is_available_1[slice(*span)])
            ]
            matched_spans_1 = []
            if available_spans_1 and spans_2:
                # the ngram can be matched to available spans in both sequences
                for span in available_spans_1:
                    # It's possible that these newly matched spans overlap with one
                    # another, so check that the token positions are still available
                    # (only one span allowed per token in text 1):
                    if all(token_is_available_1[slice(*span)]):
                        matched_spans_1.append(span)
                        token_is_available_1[slice(*span)] = [False] * (span[1] - span[0])
            for span1 in matched_spans_1:
                alignment[span1] = spans_2

        return alignment
from collections import defaultdict
import re
import json
from nltk import PorterStemmer
from nltk.corpus import words
import math
import string

INDEX_DICT = {}
# DOC_ID_DICT = {}
directory = "C:\\Users\\tajun\\PycharmProjects\\ICS-121\\DevlopZip\\DEV"
doc_counter = 0
partial_counter = 0
NumOfDocs = 0
ps = PorterStemmer()
token_count = 0
output_dict = {}  # format: {filenum: (word, [list of postings])}
skip_count = 0


class Postings:
    # each doc id is a posting?
    def __init__(self, docid, positions):
        self.docid = docid
        self.positions = positions
        self.tfidf = 0  # use freq counts for now
        # self.fields = fields

# takes in a file name to tokenize and returns a list of tokens; should it return a list
# of lists where the first element is the token, the second is the count, and so on?
def stem(a):
    a = a.strip('0123456789.,"[]()?!: ')
    # stem() replaces the stem_word() method removed from newer NLTK releases
    a = PorterStemmer().stem(a)
    return a
# Lowercase the corpus
processed = raw_text.str.lower()

# Remove punctuation and extra white space
processed = processed.str.replace(r'[^\w\d\s]', ' ')
processed = processed.str.replace(r'\s+', ' ')
processed = processed.str.replace(r'^\s+|\s+?$', '')

# Remove stop words
stop_words = stopwords.words('english')
processed = processed.apply(lambda x: ' '.join(term for term in x.split()
                                               if term not in set(stop_words)))

# Reduce words to their stems using a Porter stemmer
porter = PorterStemmer()
processed = processed.apply(
    lambda x: ' '.join(porter.stem(term) for term in x.split()))

# Construct a design matrix using an n-gram model and tf-idf statistics
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
features = vectorizer.fit_transform(processed)

# Prepare the training and test sets using an 80/20 split
X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels_enc,
                                                    test_size=0.2,
                                                    random_state=4,
                                                    stratify=labels_enc)

# Train an SVM with a linear kernel on the training set
def preProssess(filename):
    # Open the local text file and read its content
    file = open(filename, "r", encoding='utf-8')
    fullText = file.read()
    sentences = fullText.split('\n')
    spell = Speller(fast=True)
    stop_words = set(stopwords.words('english'))
    Documents = {}
    vocab = {}
    for i in range(len(sentences)):
        # split by tab: [docid, sentence]
        currSentenceTuple = sentences[i].split('\t')
        # the key is the tweet id and the value is
        # (full sentence, dictionary of words and their weights, length)
        Documents[currSentenceTuple[0]] = (currSentenceTuple[1], {}, 0)
        # start the preprocessing here
        currSentenceValue = currSentenceTuple[1]
        # lower case
        currSentenceValue = currSentenceValue.lower()
        # remove URLs
        currSentenceValue = re.sub(r'http\S+', '', currSentenceValue)
        # create a tokenizer that will also remove punctuation
        tokenizer = RegexpTokenizer(r'\w+')
        # turn contractions such as "I'm" and "can't" into "Im" and "cant"
        currSentenceValue = currSentenceValue.replace("'", "")
        # autocorrect spelling mistakes
        currSentenceValue = spell(currSentenceValue)
        # tokenize here
        currSentenceValue = tokenizer.tokenize(currSentenceValue)
        # stem and remove stop words
        porterStemmer = PorterStemmer()
        currSentenceValue = [porterStemmer.stem(w) for w in currSentenceValue if w not in stop_words]
        # finished preprocessing the tweet;
        # send the preprocessed tweet to be indexed
        (vocab, Documents) = indexing(currSentenceValue, currSentenceTuple, Documents, vocab)

    tf_max = 0
    for word in vocab:
        if vocab[word][0] > tf_max:
            tf_max = vocab[word][0]

    numOfDocs = len(Documents)
    for docid in Documents:
        length = 0
        for wordsInDoc in Documents[docid][1]:
            df_i = vocab[wordsInDoc][0]
            idf = math.log((numOfDocs / df_i), 2)
            tf_ij = Documents[docid][1][wordsInDoc] / len(Documents[docid][1])
            w_ij = tf_ij * idf
            Documents[docid][1][wordsInDoc] = w_ij
            length += w_ij ** 2
        (doc, sentence, l) = Documents[docid]
        Documents[docid] = (doc, sentence, math.sqrt(length))

    return (vocab, Documents)
def ordered_stems(self) -> List[str]:
    from nltk import PorterStemmer
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in self.tokens]
def stemmed_labels(self) -> Set[str]:
    from nltk import PorterStemmer
    stemmer = PorterStemmer()
    return {stemmer.stem(label) for label in self.labels}
def stemIt(word_list, stemmer=PorterStemmer(), encoding="utf8"):
    tmp = []
    for w in word_list:
        tmp.append(stemmer.stem(w).encode(encoding))
    return tmp
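# A minimal usage sketch (not part of the original snippet), assuming NLTK is installed;
# note that stemIt() returns bytes because each stem is encoded with the given codec.
print(stemIt(["running", "flies"]))            # e.g. [b'run', b'fli']
print(stemIt(["words"], encoding="latin-1"))   # the codec is applied per stem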
def queryResults(queryString, vocabDict, documents, numberOfRowsForResults):
    stop_words = set(stopwords.words('english'))
    scores = {}
    N = len(documents)
    queryString = queryString.lower()
    # queryStringExpansion = queryExpansionMethod(model_glove_twitter, queryString)
    queryStringExpansion = queryString

    # create a tokenizer that will also remove punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    # turn contractions such as "I'm" and "can't" into "Im" and "cant"
    queryString = queryString.replace("'", "")
    # tokenize here
    queryString = tokenizer.tokenize(queryString)
    # stem and remove stop words
    porterStemmer = PorterStemmer()
    queryString = [porterStemmer.stem(w) for w in queryString if w not in stop_words]

    # collect the weights for the query string and its length
    weightsForQuery = {}
    lengthOfQuery = 0
    for stemword in queryString:
        if stemword.isnumeric():
            continue
        # if the stem word is not in our vocab we can simply skip it
        if stemword not in vocabDict:
            continue
        # docsFoundForStemWord = vocabDict[stemword]
        # calculate the weight for query word i
        df_i = vocabDict[stemword][0]
        tf_iq = queryString.count(stemword) / len(queryString)
        idf = math.log((N / df_i), 2)
        w_iq = (0.5 + 0.5 * tf_iq) * idf
        if stemword not in weightsForQuery:
            weightsForQuery[stemword] = w_iq
            lengthOfQuery += w_iq ** 2

    # we now have the length of the query vector and a dict of weights w_iq
    lengthOfQuery = math.sqrt(lengthOfQuery)
    # print(weightsForQuery)

    for word in weightsForQuery:
        docsFoundForStemWord = vocabDict[word][1]
        for doc in docsFoundForStemWord:
            scores[doc] = cosineCalculator(doc, documents, lengthOfQuery, weightsForQuery)

    arrayOfSortedScoresTuples = sorted(scores.items(), key=lambda x: x[1], reverse=True)

    # a dictionary that stores the documents and their new scores from the query expansion
    arrayOfSortedScoresTuplesExpanded = {}
    for i in range(len(arrayOfSortedScoresTuples)):
        docId = arrayOfSortedScoresTuples[i][0]
        originalScore = arrayOfSortedScoresTuples[i][1]
        docSentence = documents[docId][0]  # get the sentence
        # get the tokens that exist in our twitter embedding model
        tokens_1 = [t for t in docSentence.split() if t in model_glove_twitter]
        tokens_2 = [t for t in queryStringExpansion.split() if t in model_glove_twitter]
        cosine = 0
        if len(tokens_1) > 0 and len(tokens_2) > 0:
            cosine = model_glove_twitter.n_similarity(tokens_1, tokens_2)
        # take the average of both scores
        newScoreAvg = (originalScore + cosine) / 2
        # store the score with the document
        arrayOfSortedScoresTuplesExpanded[docId] = newScoreAvg

    # sort by highest value
    arrayOfSortedScoresTuplesExpanded = sorted(
        arrayOfSortedScoresTuplesExpanded.items(), key=lambda x: x[1], reverse=True)

    return arrayOfSortedScoresTuplesExpanded[:numberOfRowsForResults]
def stemming_by_portter_1(term):
    return PorterStemmer().stem(term)
import nltk
import pandas as pd
import numpy as np
import pickle
import re
from nltk.corpus import stopwords
from nltk import PorterStemmer, WordNetLemmatizer

data = pd.read_csv('spam.csv', sep=',', encoding='latin-1')
data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
data['Type'] = data['Type'].map({'ham': 0, 'spam': 1})
X = data['Message']
y = data['Type']

stem = PorterStemmer()
corpus = []
for i in range(len(data)):
    words = re.sub('[^a-zA-Z]', ' ', data['Message'][i])
    words = words.lower()
    words = words.split()
    words = [
        stem.stem(word) for word in words
        if word not in set(stopwords.words('english'))
    ]
    words = ' '.join(words)
    corpus.append(words)

# creating a bag of words
from sklearn.feature_extraction.text import CountVectorizer
def tokenizer(direc_path):
    # Get the stop-list, change the working directory to the specified one,
    # then process and tokenize all files.

    # read all lines of the stop-list and strip the trailing newline character
    fp = open('stoplist.txt', 'r')
    stoplist = list(fp)
    fp.close()
    for i in range(len(stoplist)):
        stoplist[i] = stoplist[i][:-1]

    # change path to where the corpus is
    current_dir = os.getcwd()
    os.chdir(direc_path)
    flist = []
    flist.extend(os.listdir(direc_path))
    term_dictionary = {}
    doc_dictionary = {}
    doc_id = 1
    term_id = 1

    # main loop
    for fname in os.listdir():
        # read the file and add its name and ID to a dictionary
        fp = open(fname, 'r', errors='ignore')
        content = fp.read()
        fp.close()
        doc_dictionary[doc_id] = fname
        doc_id += 1

        # ignore the initial headers
        substr = "<!DOCTYPE"
        index = content.find(substr)
        htmlcode = content[index:]

        # get the parsed result
        result = parsehtml(htmlcode)

        # tokenize and turn to lower case
        token_list = nltk.regexp_tokenize(result, "[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+")
        for i in range(len(token_list)):
            token_list[i] = token_list[i].lower()
        # print(token_list)
        # print(len(token_list))

        # ignore tokens if they're in the stop list
        i = 0
        deleteflag = False
        while i < len(token_list):
            for s in stoplist:
                if s == token_list[i]:
                    del token_list[i]
                    deleteflag = True
                    break
            if deleteflag:
                deleteflag = False
            else:
                i += 1
        # print(len(token_list))

        # stem the token list
        stemmer = PorterStemmer()
        for i in range(len(token_list)):
            token_list[i] = stemmer.stem(token_list[i])

        # put terms as keys in the dictionary with an incremented term id as the value
        for i in range(len(token_list)):
            if token_list[i] not in term_dictionary:
                term_dictionary[token_list[i]] = term_id
                term_id = term_id + 1

    # write the doc dictionary to file (one name/id pair per line, tab-separated)
    os.chdir(current_dir)
    f = open('docids.txt', 'w')
    for value, key in doc_dictionary.items():
        f.write(str(key) + '\t' + str(value) + '\n')
    f.close()

    # write the term dictionary to file (term id, tab, term)
    f = open('termids.txt', 'w', errors='ignore')
    for key, value in term_dictionary.items():
        f.write(str(value) + '\t' + str(key) + '\n')
    f.close()

    return (term_dictionary, doc_dictionary)
def stem_it(self):
    stemmer = PorterStemmer()
    self.word = stemmer.stem(self.word)
# CliNER - word_features.py
#
# Willie Boag   [email protected]
#
# Purpose: Isolate all word-level features into a single file
######################################################################

import re

from cliner.features_dir.wordshape import getWordShapes
from nltk import LancasterStemmer, PorterStemmer

__author__ = 'Willie Boag'
__date__ = 'Apr 27, 2014'

lancaster_st = LancasterStemmer()
porter_st = PorterStemmer()


def feature_word(word):
    return {('word', word.lower()): 1}


def feature_stem_lancaster(word):
    return {('stem_lancaster', lancaster_st.stem(word.lower())): 1}


def feature_generic(word):
    generic = re.sub('[0-9]', '0', word)
    return {('Generic#', generic): 1}
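# A small illustrative sketch (not part of the original module): it prints the Lancaster
# and Porter stems side by side so the difference in aggressiveness between the two
# stemmers used above is visible. The word list is arbitrary.
if __name__ == '__main__':
    for w in ['running', 'maximum', 'organization']:
        print(w, lancaster_st.stem(w), porter_st.stem(w))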
def __init__(self):
    self.stemmer = PorterStemmer()
@author: nausheenfatma
"""
import sys
import logging
from XMLCustomParser import WikiXmlHandler
import xml.sax
import time
from datetime import datetime
import ast
from nltk import PorterStemmer
import argparse
# from MergeIndices import batch_sort
from MergeIndices import MergeIndices

sno = PorterStemmer()

punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
remove_punctuation_map = dict((ord(char), 32) for char in punctuation)
number = '0123456789'
remove_number_map = dict((ord(char), None) for char in number)


class Document():

    def __init__(self):
        self.doc_id = {}
        self.title = {}
        self.body = {}
        self.infobox = {}
        self.categories = {}
        self.external_links = {}
def stem_words(f):
    stemmer = PorterStemmer()
    processed = tokenize(f)
    for i in range(len(processed)):
        processed[i] = stemmer.stem(processed[i])
    return processed
def try_basic_query_tokenizer():
    stemmer = PorterStemmer()
    x = "answer(cityid('new york', _))"
    y = basic_query_tokenizer(x, strtok=lambda x: [stemmer.stem(xe) for xe in x.split()])
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim import corpora
# from nltk.tokenize import word_tokenize

df = pd.read_json('related_data_rm_duplicacy.json')
QATags = df.content
# print(QATags)
QATags = list(QATags)
# print(QATags[:10])

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
port = PorterStemmer()


def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    # print(stop_free)
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    # print(punc_free)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    stem = " ".join(port.stem(word) for word in normalized.split())
    remove_non_english = stem.encode("ascii", errors="ignore").decode()
    return remove_non_english


Text_clean = [clean(doc).split() for doc in QATags]
for inputCurr in lstFileNames:
    file_object = open(input_dir + "//" + inputCurr, 'rU')
    try:
        for line in file_object:
            try:
                line.decode('ascii')
            except Exception as error:
                # skip lines that cannot be decoded as ASCII
                continue
            # strip punctuation
            line = StripPunc(line)
            for word in line.split():
                count = count + 1
                # make the word lower case and stem it
                word = word.lower()
                word = PorterStemmer().stem(word)
                if word in dictWords:
                    val = dictWords[word]
                    dictWords[word] = val + 1
                else:
                    dictWords[word] = 1
    finally:
        file_object.close()

############################################################################
# Print Summary Statistics                                                 #
############################################################################
print("Completed building index of total words seen:\n", count)
print("Total unique words after stemming in list:\n", len(dictWords))

keys = dictWords.keys()
###########################################################################
quotes_token = nltk.word_tokenize(qt)

quotes_bigrams = list(nltk.bigrams(quotes_token))
print(quotes_bigrams)

quotes_trigrams = list(nltk.trigrams(quotes_token))
print(quotes_trigrams)

quotes_quadgrams = list(nltk.ngrams(quotes_token, 4))
print(quotes_quadgrams)

# stemming
from nltk import PorterStemmer
pst = PorterStemmer()
pst.stem("having")
pst.stem("sudeep")

words_stem = ["give", "giving", "given", "gave"]
for words in words_stem:
    print(words + " :" + pst.stem(words))

from nltk import LancasterStemmer
lnst = LancasterStemmer()
for words in words_stem:
    print(words + " :" + lnst.stem(words))

from nltk import SnowballStemmer
best_seller_group = shampoo.groupby('best_selling')
best_seller_group.agg(['mean', 'std', 'median'])

rating_mask = shampoo['rating'].isnull() == False
rating_group = shampoo.loc[rating_mask, :]
rating_group['rating'] = rating_group['rating'].astype('float')
rating_grouped = rating_group.groupby('best_selling')
rating_grouped.agg(['mean', 'std', 'median'])

# Natural Language Processing
from nltk.corpus import stopwords
stop = stopwords.words('english')
from textblob import TextBlob
from nltk import PorterStemmer
stemmer = PorterStemmer()
import nltk

df = pd.read_csv('description_df')
df['nlp_description'] = df['nlp_description'].astype('string')

# add product-specific stop words
stop.extend([
    'shampoo', 'conditioner', 'soap', 'cleanse', 'hair', 'head', 'shoulders',
    'loréal', 'pari', 'product', 'help', 'use', 'free', 'make', 'type'
])

# Pre-processing: remove stop words
df['nlp_description'] = df['nlp_description'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop))
from nltk import MWETokenizer, sent_tokenize, PorterStemmer
import json
import re
import string
import wiki
import pickle

ingredients = set([])
tokenizer = MWETokenizer()
utensil_tokenizer = MWETokenizer()
method_tokenizer = MWETokenizer()
measurements = set([])
techniques = set([])
ps = PorterStemmer()
mexican = {}
chinese = {}
food = set([])
unnaccounted_methods = [
    "broil", "mix", "grease", "coat", "arrange", "sprinkle"
]
unnaccounted_tools = ["bowl", "dish", "broiler"]

with open('healthy.pickle', 'rb') as handle:
    healthy = pickle.load(handle)


def type(food, foodtypes):
    possibleHits = food.split()
    for h in possibleHits:
        for key in foodtypes.keys():
            if h in key:
                return (food, foodtypes[key])
def s(tokens):
    return [PorterStemmer().stem(t) for t in tokens]
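# A hedged note rather than a fix: the helper above constructs a new PorterStemmer for
# every token in the comprehension. The stemmer keeps no state between calls, so a single
# shared instance (the pattern most other snippets here use) avoids the repeated
# construction cost. A minimal sketch with a hypothetical name, not part of the original:
_STEMMER = PorterStemmer()

def s_shared(tokens):
    return [_STEMMER.stem(t) for t in tokens]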
import re

from nltk import PorterStemmer

from BugSimilarityScoreCalculator import BugSimilarityScoreCalculator
from DataSetFieldEnum import DataSetFieldEnum
from FinalRank import FinalRank
from RVSMCalculator import RVSMCalculator
from VSMSimilarityCalculator import VSMSimilarityCalculator

first_cap_re = re.compile('(.)([A-Z][a-z]+)')
all_cap_re = re.compile('([a-z0-9])([A-Z])')
porter_stemmer = PorterStemmer()
ranks_file = open('ranks_file.txt', 'w')


class BugLocalization:

    def __init__(self, dataset):
        self.dataset = dataset

    def run(self):
        self.dataset.reset_calculation_lists()
        for i in range(self.dataset.get_bug_report_list_lenght()):
            current_bug_report = self.dataset.bug_report_list[i]
            self.dataset.results = {}
            self.localize_bugs(current_bug_report)
            first_file_pos_ranked = self.calculate_rank_first(self.dataset, current_bug_report)
            files_binary_relevance = self.calculate_binary(current_bug_report, self.dataset)
            top_n_rank = self.calculate_tops(current_bug_report, self.dataset)
def stem(array):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in array]
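# A minimal usage sketch (not part of the original snippet), assuming NLTK is installed:
print(stem(["stemming", "reduces", "related", "words"]))  # e.g. ['stem', 'reduc', 'relat', 'word']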