def __init__(self, text, product_name):
    """Collect candidate nouns and noun phrases for every sentence of ``text``.

    ``text`` is lower-cased, split into sentences and POS-tagged.  For each
    sentence a dict with the keys ``'sentence'``, ``'tags'``, ``'nouns'`` and
    ``'noun_phrases'`` is appended to ``self.feature_sentences``.

    :param text: raw text to analyse.
    :param product_name: product identifier; it is lower-cased, cut at the
        first ``'-'`` and the leading part is split on ``'_'``.
    """
    self.candidate_features = []
    self.feature_sentences = []
    self.product_name = product_name.lower().split('-')[0].split('_')

    tokenizer = Tokenizer()
    sentences = tokenizer.sent_tokenize(text.lower())
    tagger = POSTagger()
    lemmatizer = WordNetLemmatizer()

    for sentence in sentences:
        tagged = tagger.nltk_tag(tokenizer.word_tokenize(sentence))
        nouns = []
        noun_phrases = []
        for idx, (word, tag) in enumerate(tagged):
            # Only noun tags are of interest; the tag 'NNP' (proper noun) is skipped.
            if not tag.startswith('N') or tag == 'NNP':
                continue
            # Consecutive nouns might form a feature phrase, e.g. "picture
            # quality".  Meaningless phrases such as "quality digital" are
            # expected to be removed later because they occur rarely.
            follows_last_noun = (
                idx > 0
                and len(nouns) > 0
                and tagged[idx - 1][0] == nouns[-1]
                and sentence.find(nouns[-1] + ' ' + word) > -1
            )
            if follows_last_noun:
                noun_phrases.append(lemmatizer.lemmatize(nouns.pop() + ' ' + word))
            else:
                nouns.append(lemmatizer.lemmatize(word))
        self.feature_sentences.append({
            'sentence': sentence,
            'tags': tagged,
            'nouns': nouns,
            'noun_phrases': noun_phrases,
        })
def returnKeywordFromList(convertpath):
    """Return the keyword n-grams selected from the text file at ``convertpath``.

    The file is read, reduced to ASCII, stripped of punctuation and digits,
    lemmatized word by word and fed to a uni/bi-gram TF-IDF vectorizer.  The
    single resulting row is sorted by ``sortSparseMatrix`` and passed to
    ``getKeywordAlgorithms``; the indices it returns are mapped back to
    feature names.

    NOTE: this is Python 2 code (it relies on ``str.decode``).

    :param convertpath: path of the text file to analyse.
    :return: list of feature names (unigrams/bigrams).
    """
    wnl = WordNetLemmatizer()

    # The context manager releases the handle even when reading fails;
    # previously the file was opened and never closed.
    with open(convertpath, 'r') as source:
        text = source.read()

    # Character by character: resolve escape sequences, drop non-ASCII.
    lowers = "".join(map(lambda l: l.decode('unicode_escape').encode('ascii', 'ignore'), text))
    no_punctuation = re.sub(r'[?|$|.|!0-9()=+-\/\'\"\|]', r'', lowers)

    # Lemmatize whole tokens.  The previous implementation called
    # str.replace(token, lemma) on the full text for every distinct token,
    # which also rewrote matching substrings inside other words and depended
    # on dict iteration order.
    document = " ".join(wnl.lemmatize(token) for token in no_punctuation.split())

    tfidf_vect = TfidfVectorizer(stop_words=stops, ngram_range=(1, 2))
    X_train_counts = tfidf_vect.fit_transform([document])
    sortedMatrix = sortSparseMatrix(X_train_counts.getrow(0), rev=True, only_indices=False)[0]
    result = getKeywordAlgorithms(1, sortedMatrix)

    # Fetch the vocabulary once instead of once per selected key.
    feature_names = tfidf_vect.get_feature_names()
    return [feature_names[key] for key in result]
def text2sents(text, lemmatize=False, stemmer=None):
    """
    Split a text into sentences and normalize every word of each sentence.

    :param text: text to process (it is handed to ``sent_tokenize`` as is)
    :param lemmatize: if true, words are POS-tagged and lemmatized; otherwise they are stemmed
    :param stemmer: stemmer to use when ``lemmatize`` is false; a PorterStemmer is
        created when None. Ignored when ``lemmatize`` is true
    :return: list of sentences, each a list of normalized words
    """
    sentences = sent_tokenize(text)
    word_tokenizer = RegexpTokenizer(r'\w+')

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tagger = PerceptronTagger()

        def normalize(words):
            # Tag first so that every word is lemmatized with its own POS.
            return [lemmatizer.lemmatize(word, get_wordnet_pos(tag))
                    for word, tag in tagger.tag(words)]
    else:
        active_stemmer = PorterStemmer() if stemmer is None else stemmer

        def normalize(words):
            return [active_stemmer.stem(word) for word in words]

    return [normalize(word_tokenizer.tokenize(sentence)) for sentence in sentences]
def lemmatizing(line_list):
    """
    Lemmatize every term of every line with WordNetLemmatizer.

    Input: line_list (list of strings (sentences/documents)) - e.g. dataset.data
    Return: list of strings, one per input line, holding the lemmatized
            terms of that line joined by single spaces
    """
    lemmatizer = WordNetLemmatizer()

    def lemmatize_line(line):
        # Lower-case, then turn every run of non-alphanumeric characters into
        # one space.  Simply deleting punctuation would fuse tokens
        # (amazon.com => amazoncom); replacing it with a space avoids that.
        cleaned = re.sub('[^A-Za-z0-9]+', ' ', line.lower())
        return ' '.join(lemmatizer.lemmatize(term) for term in wt(cleaned))

    return [lemmatize_line(line) for line in line_list]
def preprocess(original_str): # stemmer wnl = WordNetLemmatizer() # pos original_str = unicode(original_str, errors='ignore') print type(original_str) article_tok = pos_tag(word_tokenize(original_str)) print type(article_tok) print "token: " print article_tok # choose Noun str_noun = '' for word, tag in article_tok: if ("NN" in tag) or ("JJ" in tag): # print(word,":",tag) # print(wnl.lemmatize(word)) try: stemming_word = wnl.lemmatize(word) print stemming_word if len(word) > 1: str_noun = str_noun + stemming_word + " " except UnicodeDecodeError as e: print "error: " + word # end if # result # final_doc.append(str_noun) # print "return_preprocess : " + str_noun return str_noun
def lemmstem(sentences):
    '''
    Lemmatize and stem the words of the given sentences.

    Input: a list of trees whose leaves are (word, tag) pairs.
    Output: the same list; every leaf is replaced in place by a
            (word, tag, normalized) triple.  ``normalized`` is the Lancaster
            stem of the WordNet lemma when the first two characters of the
            tag are VB, NN, JJ or RB, and the unchanged word otherwise.
    '''
    lemmatizer = WordNetLemmatizer()
    stemmer = LancasterStemmer()
    wordnet_pos = {
        'VB': wordnet.VERB,
        'NN': wordnet.NOUN,
        'JJ': wordnet.ADJ,
        'RB': wordnet.ADV,
    }

    for tree in sentences:
        for leaf_position in tree.treepositions('leaves'):
            leaf = tree[leaf_position]
            word, tag = leaf[0], leaf[1]
            prefix = tag[0:2]
            if prefix in wordnet_pos:
                normalized = stemmer.stem(lemmatizer.lemmatize(word, wordnet_pos[prefix]))
            else:
                normalized = word
            tree[leaf_position] = (word, tag, normalized)
    return sentences
def write_clean_turian_unigrams():
    """
    Extract unigram embeddings from Socher's binary distribution and write them as TSV.

    These can be used by other composers. There are only 50k embeddings
    (presumably for the most frequent tokens in the corpus). The words have not
    been processed: there are punctuation-only tokens, uppercased words and
    non-lemmatized words. There is no PoS tag filtering either, so words like
    "to", "while" and "there" are present.

    Entries containing punctuation or digits are discarded; the rest are
    lowercased and lemmatized. Multiple entries may map to the same canonical
    form; the shortest original entry is selected (ties are broken in favour of
    entries that are already lowercased). This could have been done better.
    Only vectors for the selected entries are kept. There are 33k canonical
    forms left, many of which are not nouns/adjs/verbs.

    No PoS tag is available for the canonical forms, so each one is expanded
    into three copies, e.g. "cat" becomes cat/N, cat/J and cat/V, which all
    share the same vector.
    """
    logging.info('Writing Turian unigrams to %s', turian_unigram_vectors_file)
    mat = loadmat(socher_unigram_embedding_matlab)
    words = [w[0] for w in mat['words'].ravel()]
    df = pd.DataFrame(mat['We'].T, index=words)

    lmtzr = WordNetLemmatizer()
    # Built once instead of once per word: characters that mark a non-word.
    non_word_chars = set(string.punctuation) | set('0123456789')

    clean_to_dirty = defaultdict(list)  # canonical -> [non-canonical]
    # todo this can be done based on frequency or something
    for w in words:
        if non_word_chars.intersection(w):
            # not a real word - contains digits or punctuation
            continue
        clean_to_dirty[lmtzr.lemmatize(w.lower())].append(w)

    # Decide which of possibly many non-canonical forms with the same lemma to
    # keep: prefer shorter and lowercased forms.  min() over a one-element
    # list returns that element, so no special case is needed.
    to_keep = {
        min(dirty_list, key=lambda w: (len(w), not w.islower()))
        for dirty_list in clean_to_dirty.values()
    }

    # remove non-canonical forms we don't want
    ddf = df.loc[df.index.isin(to_keep)]
    # canonicalize whatever is left
    ddf.index = [lmtzr.lemmatize(w.lower()) for w in ddf.index]

    # we don't know what the PoS tags of the canonical forms are, so make them all of the same tag
    # e.g. expand "cat" to cat/N, cat/J and cat/V, which all share the same vector
    new_index = ['%s/%s' % (w, pos) for pos in 'NJV' for w in ddf.index]
    new_data = np.vstack([ddf.values] * 3)
    ddf = pd.DataFrame(new_data, index=new_index)

    dv = DenseVectors(ddf, allow_lexical_overlap=True)
    dv.to_tsv(turian_unigram_vectors_file)
    logging.info('Done')
def init_feature_sentences(self, total_content):
    """Append one feature-sentence dict per sentence of ``total_content``.

    The content is lower-cased and split into sentences.  Each sentence is
    POS-tagged and stored in ``self.feature_sentences`` as a dict with the
    keys ``'sentence'``, ``'tags'``, ``'nouns'`` and ``'noun_phrases'``.
    """
    tokenizer = Tokenizer()
    tagger = POSTagger()
    lemmatizer = WordNetLemmatizer()

    for sentence in tokenizer.sent_tokenize(total_content.lower()):
        # NOTE(review): the method really is spelled 'ntlk_tag' here - confirm
        # that this matches the POSTagger API.
        tagged_sentence = tagger.ntlk_tag(tokenizer.word_tokenize(sentence))
        nouns = []
        noun_phrases = []
        entry = {
            'sentence': sentence,
            'tags': tagged_sentence,
            'nouns': nouns,
            'noun_phrases': noun_phrases,
        }

        # Chunking: a noun directly following the last collected noun is merged
        # with it into a noun phrase.
        for position, (word, tag) in enumerate(tagged_sentence):
            if tag == 'NNP' or not tag.startswith('N'):
                continue
            merged = False
            if position > 0 and nouns:
                previous_word = tagged_sentence[position - 1][0]
                if previous_word == nouns[-1] and (nouns[-1] + ' ' + word) in sentence:
                    noun_phrases.append(lemmatizer.lemmatize(nouns.pop() + ' ' + word))
                    merged = True
            if not merged:
                nouns.append(lemmatizer.lemmatize(word))

        self.feature_sentences.append(entry)
def feature_extractor_tripadvisor_top_words_weights(data):
    """Return weighted word counts for ``data`` using the TripAdvisor word lists.

    Every lemmatized, lower-cased token that is not an English stop word is
    counted.  An occurrence adds 1.5 when the word appears in
    ``scraper/top_words.txt`` or is part of a place name from
    ``scraper/places.txt``, and 1 otherwise.

    :param data: UTF-8 encoded byte string (e.g. a tweet).
    :return: dict mapping word -> accumulated weight.
    """
    data = data.decode('utf-8')

    # Both files are read inside context managers so the handles are closed;
    # they used to be opened and never released.  A set gives O(1) lookups.
    with open('scraper/top_words.txt', 'r') as top_file:
        top_words = {line.replace('\n', '') for line in top_file}
    with open('scraper/places.txt', 'r') as places_file:
        for place in places_file:
            # A place name contributes each of its words; a lone '-' is a separator.
            top_words.update(
                word for word in place.replace('\n', '').split(' ') if word != '-'
            )

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    features = {}
    for token in word_tokenize(data):
        word = lemmatizer.lemmatize(token.lower())
        if word in stop_words:
            continue
        features[word] = features.get(word, 0) + (1.5 if word in top_words else 1)
    return features
def feature_extractor_top_words_weights(data):
    """Return weighted word counts for ``data``.

    Tokens are lower-cased and lemmatized; English stop words are dropped.
    Each occurrence of a word from the built-in top-word list adds 1.5 to its
    count, any other word adds 1.

    :param data: UTF-8 encoded byte string.
    :return: dict mapping word -> accumulated weight.
    """
    data = data.decode('utf-8')
    top_words = [
        'travel', 'vacation', 'city', 'itsmorefuninthephilippines', 'travel',
        'boracay', 'philippine', 'view', 'day', 'beach', 'morning', 'resort',
        'good', 'cebu', 'island',
    ]
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    features = {}
    for token in word_tokenize(data):
        word = lemmatizer.lemmatize(token.lower())
        if word in stop_words:
            continue
        weight = 1.5 if word in top_words else 1
        features[word] = features.get(word, 0) + weight
    return features
def feature_extractor_top_words_weights(data):
    """
    Extract features using the top-words-with-weights method.

    parameter: data (tweet, UTF-8 encoded)
    returns: dict mapping each non-stop word to its weighted frequency
    """
    data = data.decode('utf-8')

    # frequently occurring words from the tourism-related twitter corpus
    # ('travel' is listed twice)
    top_words = ['travel', 'vacation', 'city', 'itsmorefuninthephilippines',
                 'travel', 'boracay', 'philippine', 'view', 'day', 'beach',
                 'morning', 'resort', 'good', 'cebu', 'island']
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    # preprocessing: tokenize, convert to lowercase and lemmatize
    normalized = [lemmatizer.lemmatize(raw.lower()) for raw in word_tokenize(data)]

    features = {}
    for word in normalized:
        if word in stop_words:
            continue
        # a top word counts 1.5 per occurrence (or preferred weight), others count 1
        increment = 1.5 if word in top_words else 1
        if word in features:
            features[word] += increment
        else:
            features[word] = increment
    return features
def Check(mArray):
    """Return the WordNet lemma of the second element of ``mArray``."""
    target = mArray[1]
    lemma = WordNetLemmatizer().lemmatize(target)
    # Joining the characters of a string yields an equal string.
    return ''.join(lemma)
def word_extractor2(text):
    """Return the lemmatized, lower-cased tokens of ``text`` as one string.

    Runs of a repeated lowercase letter are first shortened to two letters.
    Each token is preceded by a single space, so a non-empty result starts
    with a space.

    :param text: UTF-8 encoded byte string; undecodable bytes are ignored.
    """
    lemmatizer = WordNetLemmatizer()
    # substitute multiple repeated letters by two
    squeezed = re.sub(r'([a-z])\1+', r'\1\1', text)
    tokens = word_tokenize(squeezed.decode('utf-8', 'ignore'))
    return "".join(" " + lemmatizer.lemmatize(token.lower()) for token in tokens)
def Check(mArray):
    """Return the WordNet lemma of the second element of ``mArray``.

    The second item is used because, per the original author's note, the
    first item is the file path.
    """
    target = mArray[1]
    lemmatizer = WordNetLemmatizer()
    # NOTE(review): the word itself, not a POS tag, is handed to
    # get_wordnet_pos - confirm that this is what the helper expects.
    wordnet_pos = get_wordnet_pos(target)
    lemma = lemmatizer.lemmatize(target, wordnet_pos)
    # Joining the characters of a string yields an equal string.
    return ''.join(lemma)
def lemmatize(tokens):
    """Lemmatize ``tokens`` in place and return the same list.

    Each token is first lemmatized with the default POS; when that leaves it
    unchanged, the verb lemmatization is used instead.
    """
    lemmatizer = WordNetLemmatizer()
    for index, token in enumerate(tokens):
        default_lemma = lemmatizer.lemmatize(token)
        if default_lemma == token:
            tokens[index] = lemmatizer.lemmatize(token, 'v')
        else:
            tokens[index] = default_lemma
    return tokens
def add_lemmatizer():
    """Append a lexeme id to every ``word_id:topic_id`` pair of the word-topic file.

    Reads ``word_topic_file`` line by line (at most ``max_line_num`` lines) and
    writes to ``word_topic_lexeme_file``.  Every space-separated item of a line
    is split on ':' into a word id and a topic id and is written back as
    ``word_id:topic_id:lexeme_id``.

    Relies on the module-level names ``id_word_dict``, ``word_id_dict``,
    ``word_lexeme_id_dict`` (updated here as a cache), ``pt`` and
    ``penn_to_wn``.  Python 2 only (``has_key``, print statement).
    """
    in_fp = open(word_topic_file)
    out_fp = open(word_topic_lexeme_file, 'w')
    wnl = WordNetLemmatizer()
    ###
    line = ''
    line_num = 0
    # NOTE(review): readline() returns '' at end of file, which makes
    # word_topic[1] below raise IndexError - presumably the input is known to
    # have at least max_line_num lines; confirm.
    while 1 and line_num < max_line_num:
        line = in_fp.readline()
        line = line.strip()
        line_words = line.split(' ')
        line_write = ''
        for words in line_words:
            word_topic = words.split(':')
            word_id = word_topic[0]
            topic_id = word_topic[1]
            line_write += word_id
            line_write += ':'
            line_write += topic_id
            line_write += ':'
            ##
            # When word_id is unknown nothing is appended after the second
            # colon, not even the separating space.
            if id_word_dict.has_key(word_id):
                word = id_word_dict[word_id]
                if word_lexeme_id_dict.has_key(word):
                    # Lexeme id already resolved for this word.
                    line_write += word_lexeme_id_dict[word]
                    line_write += ' '
                else:
                    # Tag the single word to pick a POS for the lemmatizer.
                    word_list = []
                    word_list.append(word)
                    pos = pt(word_list)
                    tag = pos[0][1]
                    lexeme = wnl.lemmatize(word, penn_to_wn(tag))
                    #print ': ', word, lexeme
                    if word_id_dict.has_key(lexeme):
                        lexeme_id = word_id_dict[lexeme]
                        word_lexeme_id_dict[word] = lexeme_id
                        line_write += lexeme_id
                        line_write += ' '
                    else:
                        # The lemma has no id of its own: the word stands for itself.
                        word_lexeme_id_dict[word] = word_id
                        line_write += word_id
                        line_write += ' '
            ##
        line_write = line_write.strip()
        out_fp.write(line_write)
        # No newline after the last line that will be processed.
        if line_num < max_line_num -1:
            out_fp.write('\n')
        line_num += 1
        if line_num%1000 ==0:
            print 'line: ', line_num
    ###
    in_fp.close()
    out_fp.close()
class Lemmatizer():
    """Lemmatize or stem English tokens with NLTK's WordNet lemmatizer and Snowball stemmer."""

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = SnowballStemmer("english", ignore_stopwords=True)

    def lemmatize(self, sentence):
        '''
        Tokenizes a sentence, lemmatizes every token and joins the lemmas
        with single spaces.
        sentence: str
        '''
        return " ".join(self.lemmatizeTokens(word_tokenize(sentence)))

    def lemmatizeTokens(self, tokens):
        '''
        Turns phrase tokens into lemmatized tokens, which means into some
        standard format as determined by the nltk lemmatizer.
        "Dogs" to "dog", "went" to "go", etc.
        tokens: list of str
        Returns the lemmas encoded as UTF-8.
        '''
        # Universal tags that need an explicit WordNet POS; every other tag
        # falls back to the lemmatizer's default POS.
        wordnet_pos_by_tag = {"VERB": 'v', "ADJ": 'a', "ADV": 'r'}

        lemmas = []
        for token, ptb_tag in pos_tag(tokens):
            universal_tag = map_tag('en-ptb', 'universal', ptb_tag)
            if universal_tag in wordnet_pos_by_tag:
                lemma = self.lemmatizer.lemmatize(token, pos=wordnet_pos_by_tag[universal_tag])
            else:
                lemma = self.lemmatizer.lemmatize(token)
            lemmas.append(lemma.encode("utf-8"))
        return lemmas

    def stem(self, tokens):
        '''
        Reduce each token to its "root" or "stem" by removing suffixes or
        common endings.
        Example: "response," "responsive," and "responsivity" all stem from
        "respons," or something similar.
        Returns the stems encoded as UTF-8.
        '''
        return [self.stemmer.stem(token).encode("utf-8") for token in tokens]
def review_to_words(raw_review, need_to_lemmatize=False):
    """Convert a raw review to a single space-separated string of words.

    The word list comes from ``review_to_wordlist``; when
    ``need_to_lemmatize`` is true each word is lemmatized before joining.
    """
    words = review_to_wordlist(raw_review)
    if not need_to_lemmatize:
        return " ".join(words)

    lemmatizer = WordNetLemmatizer()
    return " ".join([lemmatizer.lemmatize(word) for word in words])
def feature_extractor(data):
    """Return the frequency of every non-stop word of ``data``.

    Tokens are lower-cased and lemmatized before counting.

    :param data: UTF-8 encoded byte string.
    :return: dict mapping word -> number of occurrences.
    """
    data = data.decode('utf-8')
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    features = {}
    for token in word_tokenize(data):
        word = lemmatizer.lemmatize(token.lower())
        if word in stop_words:
            continue
        features[word] = features.get(word, 0) + 1
    return features
def __init__(self, lightweight=False):
    """Set up the WordNet resources used by this object.

    :param lightweight: when true, the gloss-tag data is not built and
        ``self.sscol`` is left unset.
    """
    # NOTE(review): the statement nesting was reconstructed; it assumes that
    # only the sscol assignment depends on `lightweight` - confirm.
    # self.sscol = WNGlossTag.read_all_glosstag(os.path.join(WORDNET_30_GLOSSTAG_PATH, 'merged'), verbose=True)
    if not lightweight:
        self.sscol = WNGlossTag.build_lelesk_data(os.path.join(WORDNET_30_GLOSSTAG_PATH, 'merged'), verbose=False)
    self.wnsql = WordNetSQL.get_default()
    self.wnl = WordNetLemmatizer()
    # Starts empty; presumably memoizes lemmatization results - verify against its users.
    self.lemmatize_cache = dict()
def __init__(self):
    """Create the lemmatizer and the fixed list of 25 weights."""
    self.lemmatizer = WordNetLemmatizer()
    self.weights = [
        2.17985806e-01,
        6.01901694e-02,
        4.28099419e-01,
        0.14174161e-01,
        2.45876460e-01,
        2.19263225e-01,
        1.00816031e-01,
        1.06477027e-01,
        1.60378048e-03,
        5.79940520e-03,
        1.89163517e-02,
        1.68341118e-02,
        1.18885069e-01,
        2.68984406e-02,
        9.30754965e-03,
        1.78371552e-03,
        1.77288605e-03,
        2.37539365e-03,
        5.50162160e-05,
        1.10308137e-04,
        5.51531014e-05,
        5.35273441e-05,
        2.31964872e-01,
        1.68415302e-04,
        2.24946972e-01,
    ]
def __init__(self, lemmatize=True):
    """Create the stemmer and lemmatizers and load the stop words.

    :param lemmatize: stored as ``self.lemmatize``; presumably selects
        lemmatization over stemming - verify against the methods that read it.
    """
    self.debug = False
    self.stemmer = PorterStemmer()
    self.lemmatizer = WordNetLemmatizer()
    # Separate lemmatizer built from the Estonian lemmagen dictionary.
    self.estLemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_ESTONIAN)
    self.lemmatize = lemmatize
    # Kept last: get_stopwords() may rely on the attributes set above.
    self.stopwords = self.get_stopwords()
def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
    """Store the preprocessing options and create the WordNet lemmatizer.

    :param stopwords: accepted but currently not stored (the assignment
        ``stopwords or set(sw.words('english'))`` is disabled).
    :param punct: punctuation collection; defaults to ``string.punctuation``
        as a set when falsy.
    :param lower: stored as ``self.lower``.
    :param strip: stored as ``self.strip``.
    """
    self.lemmatizer = WordNetLemmatizer()
    self.punct = punct if punct else set(string.punctuation)
    self.strip = strip
    self.lower = lower
def get_words(document):
    '''
    Return a frequency distribution (FreqDist) of the lemmatized words in document.

    The document is lower-cased, HTML entities and non-word characters are
    replaced by spaces, and the remaining whitespace-separated words are
    lemmatized.  Words found in STOPWORDS or shorter than three characters
    are dropped before lemmatization.
    '''
    html_entity = re.compile(r'&(#)*(\w)*;')  # match html entities
    non_word = re.compile(r'\W')              # match non-alphanumeric
    multi_space = re.compile(r'( ){2,}')      # match two or more spaces
    lemmatizer = WordNetLemmatizer()
    tokenizer = WhitespaceTokenizer()

    # HTML entities must be removed first: once '&', '#' and ';' have been
    # replaced by the non-word pass, the entity pattern can no longer match
    # and names such as "amp" or "nbsp" would be counted as words.
    cleaned = html_entity.sub(' ', document.lower())
    cleaned = non_word.sub(' ', cleaned)
    cleaned = multi_space.sub(' ', cleaned)

    words = [
        lemmatizer.lemmatize(word)
        for word in tokenizer.tokenize(cleaned)
        if word not in STOPWORDS and len(word) > 2
    ]
    return FreqDist(words)
def feature_extractor_tripadvisor_top_words_weights(data):
    """
    Extract features using the top words with weights method using words from TripAdvisor

    parameter: data (tweet, UTF-8 encoded)
    returns: dict mapping each non-stop word to its weighted frequency
    """
    data = data.decode('utf-8')

    def read_lines(path):
        # Read every line without its newline; the context manager closes the
        # handle, which the previous bare open() calls never did.
        with open(path, 'r') as handle:
            return [line.replace('\n', '') for line in handle]

    # top frequently-occurring words from TripAdvisor comments
    top_words = set(read_lines('classifier/top_words.txt'))
    # every word of every TripAdvisor place name also counts as a top word;
    # a lone '-' is only a separator
    for place in read_lines('classifier/places.txt'):
        top_words.update(word for word in place.split(' ') if word != '-')

    lemmatizer = WordNetLemmatizer()
    # sets make the per-token membership tests O(1)
    stop_words = set(stopwords.words('english'))

    # preprocessing: tokenize, convert to lowercase and lemmatize words
    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]

    # remove stop words and add words and their frequencies as features;
    # a word found in the top words adds 1.5 (or preferred weight), others add 1
    features = {}
    for word in words:
        if word in stop_words:
            continue
        weight = 1.5 if word in top_words else 1
        features[word] = features.get(word, 0) + weight
    return features
def __word_cleaner(self, sentence):
    """
    Removes the unwanted words in the sentence.

    Tokens are lower-cased, lemmatized and UTF-8 encoded; words of one
    character or less and words listed in ``stopwords.stop_words`` are
    dropped.  The remaining words are returned joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    # get individual words from text
    lemmas = [lemmatizer.lemmatize(token.lower()) for token in word_tokenize(sentence)]

    kept = []
    for lemma in lemmas:
        encoded = lemma.encode('utf-8', 'ignore')
        # keep words longer than one character that are not stop words
        if len(encoded) > 1 and encoded not in stopwords.stop_words:
            kept.append(encoded)
    return ' '.join(kept)
def _lemmatize_words(text):
    """Lemmatize all whitespace-separated words in the text."""
    lemmatizer = WordNetLemmatizer()

    # Lemma of every distinct word of the original text.
    lemma_of = {}
    for word in text.split():
        if word not in lemma_of:
            lemma_of[word] = lemmatizer.lemmatize(word)

    # Repeated several times to be safe: a lemma produced by one pass may
    # itself be a word of the original text with its own entry.  Tokens
    # without an entry (lemmas that never occurred verbatim) stay unchanged.
    for _ in xrange(5):
        text = ' '.join([lemma_of.get(token, token) for token in text.split()])
    return text
def word_extractor2(text, sw):
    """Return the lemmatized, lower-cased tokens of ``text`` as one string.

    Runs of a repeated lowercase letter are first shortened to two letters.
    Each kept token is preceded by a single space.

    :param text: UTF-8 encoded byte string; undecodable bytes are ignored.
    :param sw: when equal to True, English stop words are removed.
    """
    lemmatizer = WordNetLemmatizer()
    # English stop words
    commonwords = stopwords.words('english')
    squeezed = re.sub(r'([a-z])\1+', r'\1\1', text)

    # lower-casing and lemmatization
    tokens = [lemmatizer.lemmatize(token.lower())
              for token in word_tokenize(squeezed.decode('utf-8', 'ignore'))]

    # drop the tokens that belong to the stop-word list, if sw == True
    if sw == True:
        tokens = [token for token in tokens if token not in commonwords]
    return "".join(" " + token for token in tokens)
def __init__(self):
    """Create the lemmatizer and the two fixed weight vectors (24 values each)."""
    self.lemmatizer = WordNetLemmatizer()

    sentence_weights = [
        3.48961282e-01, 3.75654800e-01, 4.12711607e-01, -7.24616082e-01,
        3.77362029e-02, 1.15394180e-02, 1.33443409e-02, 1.64232249e-02,
        -3.36975735e-02, -5.02300279e-03, -3.17276960e-02, -2.94709012e-02,
        1.09211720e-03, -1.68436954e-02, 7.09680460e-03, 1.01815575e-03,
        -2.07404857e-02, -3.86330862e-02, 1.66864534e-06, 9.97633950e-04,
        7.88702336e-04, -1.04303582e-02, 6.93624232e-02, 7.89814727e-03,
    ]
    phrase_weights = [
        0.36460685, 0.16974013, 0.32817442, 0.21123618,
        0.44617679, 0.45049947, 0.18118603, 0.16519158,
        0.00473076, 0.00340283, 0.11341166, 0.04393267,
        0.25306257, 0.01741644, 0.0228946, 0.0,
        0.00326796, 0.00490194, 0.0, 0.0,
        0.0, 0.00160063, 0.37955125, 0.0,
    ]
    self.weights_sentences = np.array(sentence_weights)
    self.weights_phrases = np.array(phrase_weights)
def feature_extractor(d):
    """Return the frequency of the meaningful words of ``d``.

    Tokens are lower-cased, lemmatized and UTF-8 encoded.  Words of one
    character or less, words in ``stopwords.stop_words`` and words that start
    like a URL (``http://``) or a mention (``@``) are not counted.
    """
    lemmatizer = WordNetLemmatizer()
    # get individual words from text
    lemmas = [lemmatizer.lemmatize(token.lower()) for token in word_tokenize(d)]

    features = {}
    for lemma in lemmas:
        word = lemma.encode('utf-8', 'ignore')
        if not len(word) > 1:
            continue
        # skip stop words
        if word in stopwords.stop_words:
            continue
        # skip urls and @person
        if re.match('http://.*|@.*', word):
            continue
        features[word] = features.get(word, 0) + 1
    return features
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """
    Transformer that cleans, tokenizes, POS-tags and lemmatizes documents with
    NLTK and turns each one into a padded integer sequence.
    """

    def __init__(self, corpus, max_sentence_len = 300, stopwords=None, punct=None,
                 lower=True, strip=True):
        """
        Store the options and create the WordNet lemmatizer.

        ``stopwords`` defaults to NLTK's English stop words and ``punct`` to
        ``string.punctuation``; both are stored as sets.
        """
        self.lower = lower
        self.strip = strip
        if stopwords:
            self.stopwords = set(stopwords)
        else:
            self.stopwords = set(sw.words('english'))
        if punct:
            self.punct = set(punct)
        else:
            self.punct = set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()
        self.corpus = corpus
        self.max_sentence_len = max_sentence_len

    def fit(self, X, y=None):
        """
        Nothing is learned; returns self.
        """
        return self

    def inverse_transform(self, X):
        """
        There is no inverse transformation; X is returned unchanged.
        """
        return X

    def transform(self, X):
        """
        Run ``tokenize`` on every document of X and stack the results in an array.
        """
        return np.array([(self.tokenize(doc)) for doc in X])

    def tokenize(self, document):
        """
        Clean, tokenize and lemmatize a document, then vectorize it.

        The text is cleaned with a fixed series of regex substitutions
        (mostly contraction expansion), split into sentences and POS-tagged
        tokens.  Tokens are optionally lower-cased and stripped; stop words
        and punctuation-only tokens are dropped; the rest are lemmatized
        using their POS tag.  The lemmas are joined with spaces and passed to
        ``vectorize``, whose result is returned.
        """
        # Applied in this exact order; later patterns rely on earlier ones.
        substitutions = (
            (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
            (r"what's", "what is "),
            (r"\'s", " "),
            (r"\'ve", " have "),
            (r"can't", "cannot "),
            (r"n't", " not "),
            (r"i'm", "i am "),
            (r"\'re", " are "),
            (r"\'d", " would "),
            (r"\'ll", " will "),
            (r"(\d+)(k)", r"\g<1>000"),
        )
        for pattern, replacement in substitutions:
            document = re.sub(pattern, replacement, document)

        lemmas = []
        for sentence in sent_tokenize(document):
            for token, tag in pos_tag(wordpunct_tokenize(sentence)):
                if self.lower:
                    token = token.lower()
                if self.strip:
                    # whitespace first, then surrounding '_', then surrounding '*'
                    token = token.strip().strip('_').strip('*')

                # Skip stop words and tokens made only of punctuation
                # (this includes tokens that became empty).
                is_punctuation = all(char in self.punct for char in token)
                if token in self.stopwords or is_punctuation:
                    continue

                lemmas.append(self.lemmatize(token, tag))

        joined = ' '.join(lemmas)
        return self.vectorize(np.array(joined)[np.newaxis])

    def vectorize(self, doc):
        """
        Convert ``doc`` to integer sequences with the pickled tokenizer and
        pre-pad/pre-truncate them to ``max_sentence_len``.
        """
        # NOTE(review): unpickling executes arbitrary code - make sure this
        # file only ever comes from a trusted source.
        with open("Data/padding.pickle", 'rb') as handle:
            fitted_tokenizer = pickle.load(handle)

        sequences = fitted_tokenizer.texts_to_sequences(doc)
        padded = pad_sequences(sequences, padding='pre', truncating='pre',
                               maxlen=self.max_sentence_len)
        return np.squeeze(padded)

    def lemmatize(self, token, tag):
        """
        Map the first letter of the Penn Treebank tag to a WordNet POS
        (noun when unknown) and lemmatize the token with it.
        """
        wordnet_pos_by_initial = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ,
        }
        wordnet_pos = wordnet_pos_by_initial.get(tag[0], wn.NOUN)
        return self.lemmatizer.lemmatize(token, wordnet_pos)
def lemmatize(word, pos):
    """Lemmatize ``word`` using the WordNet POS derived from ``pos``.

    The module-level ``lemmer`` is created on first use and reused afterwards.
    """
    global lemmer
    if lemmer is None:
        lemmer = WordNetLemmatizer()
    wordnet_pos = get_wordnet_pos(pos)
    return lemmer.lemmatize(word, wordnet_pos)
def __init__(self):
    """Create the WordNet lemmatizer and store it as ``self.wnl``."""
    self.wnl = WordNetLemmatizer()
# Fetch the WordNet data and the POS tagger model used below.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

url = 'https://www.gutenberg.org/cache/epub/16370/pg16370.txt'
# The context manager closes the connection; joining the decoded lines avoids
# rebuilding the whole string once per line.
with urllib.request.urlopen(url) as file:
    text = ''.join(line.decode("utf-8") for line in file)

text = text.replace('\r\n', ' ')
# Collapse every run of spaces into one.  The previous loop replaced a single
# space by a single space ten times, which changed nothing.
while '  ' in text:
    text = text.replace('  ', ' ')

#%% Pos tagging and lemmatization
wnl = WordNetLemmatizer()
sentences = sent_tokenize(text)
current_sentence = sentences[150]
tokens = word_tokenize(current_sentence)
tagged = pos_tag(tokens)
print(current_sentence)
# Show the verb lemma of every token whose tag starts with 'V'.
for word, tag in tagged:
    if tag.startswith('V'):
        result = wnl.lemmatize(word, pos='v')
        print(word, tag, result)

#%% Get all the adjectives in the text
all_adjectives = []
# In[27]: from nltk import WordNetLemmatizer, PorterStemmer, LancasterStemmer # In[28]: # Generate random embedding with same scale as glove np.random.seed(SEED) shape = (VOCAB_SIZE, EMBEDDING_SIZE) scale = glove_embedding_weights.std() * np.sqrt(12) / 2 embedding = np.random.uniform(low=-scale, high=scale, size=shape) # In[29]: wnl = WordNetLemmatizer() porter = PorterStemmer() lancaster = LancasterStemmer() # In[30]: # Copy from glove weights of words that appear in index2word count = 0 for i in range(1, VOCAB_SIZE): w = index2word[i] g = glove_index_dict.get(w) if g is None: w = wnl.lemmatize(w) g = glove_index_dict.get(w) if g is None: w = porter.stem(w)
from nltk.corpus import brown, stopwords from nltk.tokenize import word_tokenize from nltk.stem import PorterStemmer, LancasterStemmer from nltk import WordNetLemmatizer # Name: Jesse Huss # ID: 001209444 # Project: Assignment 1 stopWords = set(stopwords.words('english')) wln = WordNetLemmatizer() lemmatizer = WordNetLemmatizer() porter = PorterStemmer() lancaster = LancasterStemmer() for cat in brown.categories(): words = brown.words(categories=cat) noStopWords = [nsw for nsw in words if nsw not in stopWords] lemmatizedWords = [wln.lemmatize(lw) for lw in words] pstemmedWords = [porter.stem(psw) for psw in words] lstemmedWords = [lancaster.stem(lsw) for lsw in words] print(cat.upper() + ':') print('Word Tokens:\n' + str(len(words)) + ' vanilla.\n' + str(len(noStopWords)) + ' no stop words.\n' + str(len(lemmatizedWords)) + ' lemmatized.\n' + str(len(pstemmedWords)) + ' porter stemmed.\n' + str(len(lstemmedWords)) + ' lancaster scanned.\n') print('Word Types:\n' + str(len(set(words))) + ' vanilla.\n' + str(len(set(noStopWords))) + ' no stop words.\n' + str(len(set(lemmatizedWords))) + ' lemmatized.\n' + str(len(set(pstemmedWords))) + ' porter stemmed.\n' +
# nltk.download() # To make sure all ntlk site packages are upto date and installed to get started with nltk from nltk import PorterStemmer from nltk import WordNetLemmatizer paragraph = """Thank you all so very much. Thank you to the Academy. Thank you to all of you in this room. I have to congratulate the other incredible nominees this year. The Revenant was the product of the tireless efforts of an unbelievable cast and crew. First off, to my brother in this endeavor, Mr. Tom Hardy. Tom, your talent on screen can only be surpassed by your friendship off screen … thank you for creating a transcendent cinematic experience. Thank you to everybody at Fox and New Regency … my entire team. I have to thank everyone from the very onset of my career … To my parents; none of this would be possible without you. And to my friends, I love you dearly; you know who you are. And lastly, I just want to say this: Making The Revenant was about man's relationship to the natural world. A world that we collectively felt in 2015 as the hottest year in recorded history. Our production needed to move to the southern tip of this planet just to be able to find snow. Climate change is real, it is happening right now. It is the most urgent threat facing our entire species, and we need to work collectively together and stop procrastinating. We need to support leaders around the world who do not speak for the big polluters, but who speak for all of humanity, for the indigenous people of the world, for the billions and billions of underprivileged people out there who would be most affected by this. For our children’s children, and for those people out there whose voices have been drowned out by the politics of greed. I thank you all for this amazing award tonight. Let us not take this planet for granted. I do not take tonight for granted. 
Thank you so very much.""" ## Tokenizing sentences sentences = nltk.sent_tokenize(paragraph) # print(sentences) ## Tokenizing words # wordz = nltk.word_tokenize(paragraph) # print(wordz) # stemmer = PorterStemmer() # Creating an object of PorterStemmer class lemmatizer = WordNetLemmatizer() # Creating an object of PorterStemmer class ## Stemming # for i in range(len(sentences)): # words = nltk.word_tokenize(sentences[i]) # Word Tokenization on sentences list. # stemmed_words = [stemmer.stem(word) for word in words] #List Comprehension usage and stemming each word of a single sentence at a time. # sentences[i] = ' '.join(stemmed_words) # Joining all stemmed words back into sentences using space delimiter and join function # print(sentences) ## Lemmatization for j in range(len(sentences)): words = nltk.word_tokenize( sentences[j]) # Word Tokenization on sentences list. lemmatized_words = [ lemmatizer.lemmatize(word) for word in words
list_of_all_pos_tags=['ADJ' , 'ADP' , 'ADV' , 'AUX' , 'CCONJ' , 'DET' , 'INTJ' , 'NOUN' , 'NUM' , 'PART' , 'PRON' , 'PROPN' , 'PUNCT', 'SCONJ' , 'SYM' , 'VERB','X'] ref={} for i in list_of_all_pos_tags: ref[i]=None ref['AUX']='v' ref['ADJ']='a' ref['NOUN']='n' ref['VERB']='v' ref['ADV']='r' import re import nltk from nltk import WordNetLemmatizer wn_lemmatizer=WordNetLemmatizer(); # This functions returns wether ith character of string s is a consonant or not def cons(s,i): if re.match('[aeiou]',s[i]): return False if re.match('y',s[i]): if i==0: return True else: return (not cons(s,i-1)) return True #This function return the measure of word or word part, (C)(VC)^m(C) def m(s): m = 0 for i in range(0, len(s) - 1):
def build_analyzer(self):
    """Return an analyzer that lemmatizes the tokens of the parent analyzer.

    The parent class' analyzer does the preprocessing and tokenization; the
    callable returned here lazily yields the WordNet lemma of every token
    that is not purely numeric and is at least three characters long.

    :returns: a callable ``doc -> generator of lemmas``.
    """
    lemm = WordNetLemmatizer()
    analyzer = super(LemmaCountVectorizer, self).build_analyzer()

    def lemma_analyzer(doc):
        # The filter has to inspect each token ``w``. Testing ``doc`` (as the
        # previous version did) made the condition identical for every token,
        # so a document was kept or dropped wholesale and numeric / short
        # tokens were never filtered.
        return (lemm.lemmatize(w) for w in analyzer(doc)
                if not w.isdigit() and len(w) >= 3)

    return lemma_analyzer
class MatchWordFeatures(QaTextFeautrizer):
    """Binary features marking context words that also occur in the question.

    Each context word gets three feature columns:

    * exact match with a question word sets columns 0, 1 and 2;
    * case-insensitive match sets columns 0 and 1;
    * lemma match sets column 2 only.

    With ``require_unique_match`` a question word only counts when it is
    unique in the question (at the exact, lowercase and lemma level
    respectively); stop words are not consulted in that mode.
    """

    def __init__(self,
                 require_unique_match,
                 lemmatizer="word_net",
                 empty_question_features=False,
                 stop_words=None):
        """
        :param require_unique_match: only match question words that are unique.
        :param lemmatizer: name of the lemmatizer; only ``"word_net"`` is supported.
        :param empty_question_features: if true, question features have three
            (all-zero) columns instead of zero columns.
        :param stop_words: optional object whose ``words`` attribute holds the
            stop words excluded from exact matching.
        :raises ValueError: if ``lemmatizer`` is not ``"word_net"``.
        """
        self.lemmatizer = lemmatizer
        self.stop_words = stop_words
        self.empty_question_features = empty_question_features
        if lemmatizer == "word_net":
            self._lemmatizer = WordNetLemmatizer()
        else:
            raise ValueError("Unsupported lemmatizer: %r" % (lemmatizer,))
        # word -> lemma memo; rebuilt on unpickling (see __getstate__).
        self._cache = {}
        self.require_unique_match = require_unique_match

    def n_context_features(self):
        """Number of feature columns produced per context word."""
        return 3

    def n_question_features(self):
        """Number of feature columns produced per question word."""
        return 3 if self.empty_question_features else 0

    def lemmatize_word(self, word):
        """Return the WordNet lemma of ``word``, memoized per instance."""
        cur = self._cache.get(word)
        if cur is None:
            cur = self._lemmatizer.lemmatize(word)
            self._cache[word] = cur
        return cur

    def get_features(self, question, context):
        """Compute match features for ``context`` against ``question``.

        :param question: sequence of question tokens.
        :param context: sequence of context tokens.
        :returns: ``(question_features, context_features)``; the question
            features are all zeros, the context features are a
            ``(len(context), 3)`` array of 0/1 flags.
        """
        stop = set() if self.stop_words is None else self.stop_words.words
        context_features = np.zeros((len(context), 3))

        if not self.require_unique_match:
            question_words = set(x for x in question if x.lower() not in stop)
            question_words_lower = set(x.lower() for x in question)
            question_words_stem = set(
                self.lemmatize_word(x) for x in question_words_lower)
        else:
            # Keep only entries occurring exactly once at each level.
            question_words = set(
                k for k, v in Counter(question).items() if v == 1)
            question_words_lower = set(
                k for k, v in Counter(
                    x.lower() for x in question_words).items() if v == 1)
            question_words_stem = set(
                k for k, v in Counter(
                    self.lemmatize_word(x)
                    for x in question_words_lower).items() if v == 1)

        for i, word in enumerate(context):
            if word in question_words:
                context_features[i][:3] = 1
            elif word.lower() in question_words_lower:
                context_features[i][:2] = 1
            # Go through the memoizing helper: context words repeat heavily,
            # and calling the raw lemmatizer bypassed the cache.
            elif self.lemmatize_word(word) in question_words_stem:
                context_features[i][2] = 1

        if self.empty_question_features:
            return np.zeros((len(question), 3)), context_features
        else:
            return np.zeros((len(question), 0)), context_features

    def __setstate__(self, state):
        # The state holds exactly the constructor arguments, so re-running
        # __init__ also recreates the lemmatizer and an empty cache.
        self.__init__(**state)

    def __getstate__(self):
        state = dict(self.__dict__)
        # Derived members are dropped and rebuilt in __setstate__.
        del state["_cache"]
        del state["_lemmatizer"]
        return state
def tokenizer(data: DataFrame, rows, columns):
    """Build term statistics and postings for the first ``rows`` documents.

    Every document is the tokens of ``data["Text"][i]`` (tokenized sentence
    by sentence) followed by the tokens of ``data["Title"][i]``. Stop words
    count towards the document length but are not indexed; all other tokens
    are lemmatized before indexing.

    :param data: frame with ``"Title"`` and ``"Text"`` columns, indexed 0..rows-1.
    :param rows: number of documents to process.
    :param columns: unused; kept for interface compatibility.
    :returns: ``(fullTokenDict, tokPostings, avgDoclen)`` where
        ``fullTokenDict`` is the result of ``combineDicts`` over the term
        counts and the per-term document sets, ``tokPostings`` maps
        ``term -> {doc: [tf, max_tf, doclen]}`` and ``avgDoclen`` is the mean
        document length (``0.0`` when ``rows`` is 0). Returns ``None`` when
        ``combineDicts`` reports failure with ``-1``.
    """
    tokenDict = dict()    # term -> number of occurrences over all documents
    tokenDocs = dict()    # term -> set of documents containing it
    tokPostings = dict()  # term -> {doc: [tf in that doc, max_tf, doclen]}
    docInfo = dict()      # doc -> [max_tf, doclen]
    lematizer = WordNetLemmatizer()
    stopWords = set(stopwords.words("english"))

    for docNo in range(rows):
        max_tf = 1
        doclen = 0

        tokens = []
        for sentence in sent_tokenize(data["Text"][docNo]):
            tokens.extend(word_tokenize(sentence))
        tokens.extend(word_tokenize(data["Title"][docNo]))

        for tok in tokens:
            doclen += 1
            if tok in stopWords:
                continue
            word = lematizer.lemmatize(tok)

            tokenDict[word] = tokenDict.get(word, 0) + 1
            tokenDocs.setdefault(word, set()).add(docNo)

            posting = tokPostings.setdefault(word, {}).setdefault(
                docNo, [0, 0, 0])
            posting[0] += 1
            if posting[0] > max_tf:
                max_tf = posting[0]

        docInfo[docNo] = [max_tf, doclen]

    # max_tf and doclen are only known once a document is fully scanned, so
    # they are written into the postings in a second pass.
    for postings in tokPostings.values():
        for doc, entry in postings.items():
            entry[1], entry[2] = docInfo[doc]

    sumOfDoclens = sum(info[1] for info in docInfo.values())
    # Guard against an empty corpus, which used to raise ZeroDivisionError.
    avgDoclen = sumOfDoclens / rows if rows else 0.0

    # combine dictionaries with same key set
    fullTokenDict = combineDicts(tokenDict, tokenDocs)
    if fullTokenDict == -1:
        print("Failed in combining dictionaries")
        return
    return fullTokenDict, tokPostings, avgDoclen
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from nltk import RegexpTokenizer, WordNetLemmatizer
from nltk.corpus import stopwords
from pymongo import MongoClient

# Pull every post's message text out of the local ``pets.posts`` collection.
client = MongoClient("localhost", 27017)
messages = [post['message'] for post in client.pets.posts.find()]
print("\nReceived data from Mongo...")
print(messages)

tokenizer = RegexpTokenizer(r"\w+")
lemmatizer = WordNetLemmatizer()
# Build the stop-word set once: calling ``stopwords.words("english")`` inside
# the comprehension re-read the list and scanned it linearly for every token.
english_stopwords = set(stopwords.words("english"))

print("Processing words...")
processed = []
for message in tqdm(messages):
    tokens = [
        t for t in tokenizer.tokenize(str.lower(message))
        if t not in english_stopwords
    ]
    # Messages that consist only of stop words are dropped.
    if len(tokens) > 0:
        processed.append([lemmatizer.lemmatize(t) for t in tokens])
print("Processed...")
print(processed)

vectorizer = TfidfVectorizer()
# NOTE(review): the flattening below turns every lemma into its own one-word
# "document", so KMeans clusters individual words rather than messages.
# If per-message clustering is intended, feed ``" ".join(tokens)`` per message.
X = vectorizer.fit_transform([y for x in processed for y in x])

clusters = 9
model = KMeans(n_clusters=clusters)
model.fit(X)
def __init__(self):
    """Initialise the parent class, then create the ``lemmer`` helper."""
    super().__init__()
    # NOTE(review): ``WordNet`` is defined outside this excerpt; presumably a
    # WordNet-based lemmatizer wrapper - confirm against its definition.
    self.lemmer = WordNet()
import os
"""Word morphology."""
from nltk import PorterStemmer, LancasterStemmer, RegexpStemmer, SnowballStemmer, WordNetLemmatizer

# (English?) stemmers from nltk.stem. Only the last assignment is used below.
stemmer = PorterStemmer()
stemmer = LancasterStemmer()
stemmer = RegexpStemmer('ing')
out = [stemmer.stem(sample) for sample in ('working', 'happiness', 'pairing')]
print(out)

# Snowball stemmers exist for several languages.
print(SnowballStemmer.languages)
stemmer = SnowballStemmer('spanish')
out = stemmer.stem('comiendo')
print(out)
stemmer = SnowballStemmer('french')
out = stemmer.stem('manager')
print(out)

# Lemmatization: default POS, explicit verb POS, and a plural form.
lemmatizer = WordNetLemmatizer()
out = [
    lemmatizer.lemmatize('working'),
    lemmatizer.lemmatize('working', pos='v'),
    lemmatizer.lemmatize('works'),
]
print(out)

# Morphology for non-English words: requires installing the polyglot dictionaries.
# print(downloader.supported_languages_table('morph2'))
import re
import pandas as pd
import pickle
import string
from utils import flatten_nested_list
from collections import Counter
from nltk import WordNetLemmatizer
from nltk import word_tokenize
from symspellpy import SymSpell, Verbosity

# Module-level WordNet lemmatizer instance.
lemmatizer = WordNetLemmatizer()


class EntityPreprocessing:
    """Entity preprocessing configured from per-article domain predictions."""

    def __init__(self,
                 domain_dict,
                 n_thres=10,
                 ignore_article_counts=True,
                 n_spell_check_thres=5):
        """
        params:
            domain_dict: dictionary mapping article pmcid (key) to the
                predicted domain for that article (value); stored as
                ``self.domain_classifier``
            n_thres: number of instances across the corpus an entity must
                have to be included in the final entity list
            ignore_article_counts: ignore the number of instances of an
                entity within an article
            n_spell_check_thres: any entity with fewer instances across the
                corpus than this number is spell-checked

        NOTE(review): only ``domain_dict`` is stored in the visible part of
        this constructor, and earlier documentation also described an
        ``ent_categories`` argument that is not in the signature - the body
        presumably continues beyond this excerpt; confirm.
        """
        self.domain_classifier = domain_dict
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """
    Scikit-learn style transformer that normalizes raw documents with NLTK:
    sentence and word tokenization, POS tagging, stop word / punctuation
    filtering and WordNet lemmatization.
    """

    def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
        """
        Set up the preprocessor. Falls back to the NLTK English stop words
        and ``string.punctuation`` when no custom collections are given;
        both are stored as sets.
        """
        self.lower = lower
        self.strip = strip
        if stopwords:
            self.stopwords = set(stopwords)
        else:
            self.stopwords = set(sw.words('english'))
        if punct:
            self.punct = set(punct)
        else:
            self.punct = set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """
        Nothing is learned from the data; returns the transformer itself.
        """
        return self

    def inverse_transform(self, X):
        """
        The transformation is not reversible; the input is handed back as is.
        """
        return X

    def transform(self, X):
        """
        Tokenize every document of X into a list of lemmas.
        """
        normalized = []
        for doc in X:
            normalized.append(list(self.tokenize(doc)))
        return normalized

    def tokenize(self, document):
        """
        Yield the normalized lemmas of a document one by one.

        The document is split into sentences, each sentence into
        word/punctuation tokens which are POS tagged. Tokens are optionally
        lowercased and stripped (whitespace, then ``_``, then ``*``); stop
        words and tokens made only of punctuation are skipped; the rest is
        lemmatized using the POS tag.
        """
        for sentence in sent_tokenize(document):
            tagged = pos_tag(wordpunct_tokenize(sentence))
            for token, tag in tagged:
                if self.lower:
                    token = token.lower()
                if self.strip:
                    token = token.strip().strip('_').strip('*')

                # Skip stop words and pure-punctuation tokens.
                if token in self.stopwords:
                    continue
                if all(char in self.punct for char in token):
                    continue

                yield self.lemmatize(token, tag)

    def lemmatize(self, token, tag):
        """
        Lemmatize a token using the WordNet POS derived from the first letter
        of its Penn Treebank tag; unknown tags are treated as nouns.
        """
        treebank_to_wordnet = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }
        wordnet_pos = treebank_to_wordnet.get(tag[0], wn.NOUN)
        return self.lemmatizer.lemmatize(token, wordnet_pos)
def preprocess(sentence):
    """Tokenize a sentence and return the lowercase WordNet lemma of each word.

    The previous implementation relied on the Python 2-only ``unicode``
    builtin and therefore raised ``NameError`` on Python 3. Byte strings are
    still decoded as ASCII with undecodable bytes dropped (what
    ``unicode(sentence, errors='ignore')`` did under the default encoding);
    text that is already decoded is used as is.

    :param sentence: the sentence as ``bytes`` or text.
    :returns: list of lemmatized, lowercased tokens.
    """
    if isinstance(sentence, bytes):
        sentence = sentence.decode('ascii', errors='ignore')
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word.lower())
        for word in word_tokenize(sentence)
    ]
def generate_part2_dict(ibex_data, unique_id):
    """Build the per-subject part-2 stimuli from an ibex results file.

    Reads the CSV results file at path ``ibex_data``, groups consecutive rows
    that share a trial identifier (column 5) into (guess, confidence) pairs
    per (target word, mystery word), and picks for every mystery word the
    highest and the lowest rated guess that is not the target itself.

    Returns a two-element list ``[part_2_dict, response_stats]``:
    ``part_2_dict`` maps mystery word -> [target, highest rated guess,
    lowest rated guess]; ``response_stats`` maps a word -> [was guessed (0/1),
    highest confidence, number of times guessed].

    Raises ValueError unless exactly 12 (target, mystery) pairs are found.

    NOTE(review): this is Python 2 code (``dict.iteritems``, ``str.decode``).
    ``correct_answers``, ``frequent_words``, ``csv``, ``random`` and
    ``defaultdict`` come from outside this function.
    """
    Lemmy = WordNetLemmatizer()
    # The path argument is rebound to the open file and then to the parsed rows.
    with open(ibex_data, 'rb+') as ibex_data:
        # Lines starting with '#' are dropped before CSV parsing.
        ibex_data = csv.reader(
            filter(lambda data_row: data_row[0] != '#', ibex_data))
        ibex_data = list(ibex_data)
    # subject_id / subject_age / subject_sex are read but not used below.
    subject_id = unique_id
    subject_age = ibex_data[1][8]
    subject_sex = ibex_data[2][8]
    # Drop the 'end' and 'intro3' rows, then lowercase every field.
    ibex_data = filter(lambda row: row[5] != 'end', ibex_data)
    ibex_data = filter(lambda row: row[5] != 'intro3', ibex_data)
    ibex_data = [[x.lower() for x in y] for y in ibex_data]
    # (target word, mystery word) -> list of (previous row col 8, current row col 8)
    subj_dict = {}
    guess_and_confidence = []
    previous_line = ['', '', '', '', '', '', '', '', '']
    trial_identifier = 5
    # Positions inside the '_'-separated trial identifier; only the first two are used.
    mystery_word, target_word, guess, confidence = 0, 1, 2, 2
    for current_line in ibex_data:
        # print "Current line:" + str(current_line)
        # Two consecutive rows with the same trial identifier form one pair.
        if current_line[trial_identifier] == previous_line[
                trial_identifier]:
            # print "match"
            current_line_info = current_line[trial_identifier].split("_")
            previous_line_info = previous_line[trial_identifier].split("_")
            current_line_info = [x.lower() for x in current_line_info]
            previous_line_info = [x.lower() for x in previous_line_info]
            if (current_line_info[target_word],
                    current_line_info[mystery_word]) not in subj_dict:
                subj_dict[(current_line_info[target_word],
                           current_line_info[mystery_word])] = [
                               (previous_line[8], current_line[8])
                           ]
            else:
                if (current_line_info[target_word],
                        current_line_info[mystery_word]) in subj_dict:
                    subj_dict[(current_line_info[target_word],
                               current_line_info[mystery_word])] += [
                                   (previous_line[8], current_line[8])
                               ]
        previous_line = current_line
    if len(subj_dict) != 12:
        raise ValueError(
            "ERROR: subj_dict does not equal 12. Check input results file")
    part_2_dict = defaultdict(list)
    # initialize a new dictionary for tracking some stats about the subject responses
    response_stats = defaultdict(list)
    for target_w_mystery_w, g_c_list in subj_dict.iteritems():
        # Process the pairs in reverse order of collection.
        g_c_reversed = reversed(g_c_list)
        g_c_reversed = list(g_c_reversed)
        guesses = []
        # Stays False unless a guess is listed in ``correct_answers``; then
        # holds the lemmatized form that matched.
        correct_answer_alternate_form = False
        for gc in g_c_reversed:
            # Lemmatize the guess as a noun after reducing it to plain ASCII.
            lemmatized_guess = Lemmy.lemmatize(
                gc[0].strip().decode('unicode_escape').encode(
                    'ascii', 'ignore'),
                pos='n')
            lemmatized_guess = lemmatized_guess.encode('utf-8')
            # A guess listed under key k in ``correct_answers`` is replaced by k.
            for k, v in correct_answers.iteritems():
                if lemmatized_guess in v:
                    correct_answer_alternate_form = lemmatized_guess
                    lemmatized_guess = k
            guesses.append((lemmatized_guess, gc[1]))
        guesses = [(x[0], int(x[1])) for x in guesses]
        # find if the target word was guessed during learning
        # and, find the highest confidence for that guess
        # and, find the number of times it was guessed
        target_guessed = 0
        target_highest_confidence = 'NA'
        target_n_times_guessed = 'NA'
        if correct_answer_alternate_form:
            target_guessed = 1
            # NOTE(review): ``lemmatized_guess`` here is whatever the last
            # iteration of the loop above left behind, not necessarily the
            # guess that matched - confirm this is intended.
            target_highest_confidence = max(x[1] for x in guesses
                                            if x[0] == lemmatized_guess)
            target_n_times_guessed = sum(x[0] == lemmatized_guess
                                         for x in guesses)
        elif target_w_mystery_w[0] in [x[0] for x in g_c_reversed]:
            target_guessed = 1
            target_highest_confidence = max(
                x[1] for x in guesses if x[0] == target_w_mystery_w[0])
            target_n_times_guessed = sum(x[0] == target_w_mystery_w[0]
                                         for x in guesses)
        response_stats[target_w_mystery_w[0]] = [
            target_guessed, target_highest_confidence, target_n_times_guessed
        ]
        # The target itself can never be offered as a distractor guess.
        guesses = [gc for gc in guesses if gc[0] != target_w_mystery_w[0]]
        # Without any remaining guess, fall back to three random frequent
        # words with random confidences from 1 to 5.
        if not guesses:
            guesses = [
                (random.choice(frequent_words), random.randint(1, 5)),
                (random.choice(frequent_words), random.randint(1, 5)),
                (random.choice(frequent_words), random.randint(1, 5))
            ]
        highest_confidence = max(x[1] for x in guesses)
        lowest_confidence = min(x[1] for x in guesses)
        # Keep the guesses at the extreme confidences (others become None
        # and are filtered out by the generators below).
        highest_guesses = map(
            lambda x: x if x[1] >= highest_confidence else None,
            guesses)
        lowest_guesses = map(
            lambda x: x if x[1] <= lowest_confidence else None, guesses)
        highest_guesses = (x for x in highest_guesses if x is not None)
        lowest_guesses = (x for x in lowest_guesses if x is not None)
        highest_guess = next(highest_guesses, None)
        lowest_guess = next(lowest_guesses, None)
        highest_guess = highest_guess[0]
        lowest_guess = lowest_guess[0]
        # If both picks are the same word, try the next lowest-rated guess;
        # None when there is no other candidate.
        if highest_guess == lowest_guess:
            # print "high-low match"
            lowest_guess = next(lowest_guesses, None)
            lowest_guess = lowest_guess[0] if type(
                lowest_guess) is tuple else None
        highest_guessed = 0
        highest_guess_highest_confidence = 'NA'
        highest_guess_n_times_guessed = 'NA'
        lowest_guessed = 0
        lowest_guess_highest_confidence = 'NA'
        lowest_guess_n_times_guessed = 'NA'
        if highest_guess in [x[0] for x in guesses]:
            highest_guessed = 1
            highest_guess_highest_confidence = max(
                x[1] for x in guesses if x[0] == highest_guess)
            highest_guess_n_times_guessed = sum(x[0] == highest_guess
                                                for x in guesses)
        if lowest_guess in [x[0] for x in guesses]:
            lowest_guessed = 1
            lowest_guess_highest_confidence = max(x[1] for x in guesses
                                                  if x[0] == lowest_guess)
            lowest_guess_n_times_guessed = sum(x[0] == lowest_guess
                                               for x in guesses)
        response_stats[highest_guess] = [
            highest_guessed, highest_guess_highest_confidence,
            highest_guess_n_times_guessed
        ]
        response_stats[lowest_guess] = [
            lowest_guessed, lowest_guess_highest_confidence,
            lowest_guess_n_times_guessed
        ]
        response_stats['distractor'] = [0, 'NA', 'NA']
        # Prefer the form the subject actually typed when it was accepted
        # as a correct answer.
        target_word = correct_answer_alternate_form if correct_answer_alternate_form else target_w_mystery_w[
            0]
        part_2_dict[target_w_mystery_w[1]] = [
            target_word, highest_guess, lowest_guess
        ]
    # print subject_id
    return [part_2_dict, response_stats]
class DynammicClustering:
    """""" """""" """ Initializing NER model and files input """ """""" """"""
    # Incremental tweet clustering: tweets are tagged through a Java gateway,
    # lemmatized with WordNet and merged into clusters by similarity score.
    # Shared by all instances; created when the class body is executed.
    gateway = JavaGateway()  # connect to the JVM

    def __init__(self,
                 a=float(1 / 11),
                 b=float(5 / 11),
                 c=float(1 / 11),
                 d=float(4 / 11),
                 threshold=0.16,
                 threshold2=0.3,
                 threshold3=.4,
                 inputFile="../Data/mytweet02.csv",
                 Outfilename="../MyOutputs/clustersIds.csv"):
        """""" """""" """""" """ similarity score parameters a,b,c,d """ """""" """""" """"""
        # Weights handed to the ``similarity`` calls below.
        self.a = a  # common noun
        self.b = b  # proper noun
        self.c = c  # verb
        self.d = d  # hashtag
        # Thresholds; only ``threshold2`` is read by the code visible in this class.
        self.threshold = threshold
        self.threshold2 = threshold2
        self.threshold3 = threshold3
        self.inputFile = inputFile
        self.clusterfile = Outfilename
        """"" Inputfile reading """
        # NOTE(review): the separator is passed positionally; newer pandas
        # releases only accept it as ``sep=`` - confirm the pinned version.
        self.Alltweets = pd.read_csv(inputFile, ",")
        # A single hard-coded tweet is prepended to the tweets read from file.
        data = []
        data.insert(
            0, {
                'id': 1234567890,
                'created_at': "Mon Apr 01 02:59:33 +0000 2019",
                'text':
                "@awesome_lucky Congrats to Mo Yan for being the 1st Chinese Nobel Prize of Literature laureate!",
                'user': "******",
                'retweet_count': 5
            })
        self.Alltweets = pd.concat([pd.DataFrame(data), self.Alltweets],
                                   ignore_index=True,
                                   sort=False)
        # Cluster-assignment output (cluster number, tweet id).
        # NOTE(review): neither output file is closed in the visible code.
        output = open(Outfilename, mode='wt', encoding='utf-8')
        fieldnames = ['clusterno', 'tweetd']
        self.writer = csv.DictWriter(output,
                                     fieldnames=fieldnames,
                                     quoting=csv.QUOTE_MINIMAL)
        self.writer.writeheader()
        # Per-tweet list of removable tokens, read back by tweets_to_clusters.
        output1 = open("../MyOutputs/slang2.csv", mode='wt', encoding='utf-8')
        fieldnames = ['id', 'slangs']
        self.slang_writer = csv.DictWriter(output1,
                                           fieldnames=fieldnames,
                                           quoting=csv.QUOTE_MINIMAL)
        self.slang_writer.writeheader()
        self.lmtz = WordNetLemmatizer()
        # ``MergeCluster`` and ``cluster`` are defined outside this excerpt.
        self.MergeCache = defaultdict(MergeCluster)
        self.UntiClusters = defaultdict(cluster)

    def mySimilarityFun(self, x, java_object, r):
        """Return ``x.similarity(java_object, r, a, b, c, d)`` using this instance's weights."""
        return x.similarity(java_object, r, self.a, self.b, self.c, self.d)

    def tweetPrecos(self, textWord):
        """Strip characters outside ``a-zA-Z0-9-_.`` and expand a known abbreviation."""
        _str = re.sub('[^a-zA-Z0-9-_.]', '', textWord)
        # Check whether the cleaned word (uppercased) is a key of ``abbrRemov``.
        if _str.upper() in abbrRemov.keys():
            # If so, replace it with the phrase stored for that key.
            _str = abbrRemov[_str.upper()]
        return _str

    def tweets_to_clusters(self, inputfile, clustersFile):
        """Join tweets with their cluster ids and write two CSV reports.

        Merges ``clustersFile`` (``tweetd``) with ``inputfile`` (``id``) and
        with the slang file, then writes ``clusters.csv`` (cleaned text and
        cluster id, rows with empty text dropped) and ``Cleaned.csv`` (the
        same rows with the original text).
        """
        x = pd.read_csv(inputfile, ',')
        y = pd.read_csv(clustersFile, ',')
        # Column 1 holds a Python list literal written by NERPass.
        z = pd.read_csv("../MyOutputs/slang2.csv",
                        sep=',',
                        quotechar='"',
                        converters={1: ast.literal_eval})
        tweets = {}  # unused
        merged = pd.merge(y, x, left_on='tweetd', right_on='id')
        merged = pd.merge(merged, z, left_on='id', right_on='id')
        col = ['tweets', 'clusterID']
        df = pd.DataFrame(columns=col, index=None)
        # For every text, look up the slang list of the first row with that
        # exact text and clean the lowercased text with it.
        df['tweets'] = merged['text'].apply(lambda x: self.tweet_clean(
            x.lower(), merged.loc[merged['text'] == x, 'slangs'].iloc[0]))
        # Empty strings become NaN so that dropna removes them.
        df['tweets'].replace('', numpy.nan, inplace=True)
        df.dropna(subset=['tweets'], inplace=True)
        df['clusterID'] = merged['clusterno']
        df.to_csv('../MyOutputs/clusters.csv')
        df['tweets'] = merged['text']
        df.to_csv('../MyOutputs/Cleaned.csv')

    # Abbreviation expansion.
    def translator(self, user_string):
        """Return the ``abbrRemov`` expansion of ``user_string`` if there is one, else the input."""
        # Check whether the uppercased word is a key of ``abbrRemov``.
        if user_string.upper() in abbrRemov.keys():
            # If so, replace it with the phrase stored for that key.
            user_string = abbrRemov[user_string.upper()]
        return user_string

    """""" """""" """" preproceessing variable and functions """ ""

    def tweet_clean(self, t, words):
        """Clean tweet text ``t``; returns None when only whitespace is left.

        ``words`` is an iterable of strings to remove (only those of length
        >= 2, and only where followed by a space).
        """
        # Replace line breaks with spaces.
        t = t.replace('\n', " ").replace('\r', " ")
        # Remove each word identified as removable (e.g. emoji).
        for ele in words:
            if len(ele) >= 2:
                t = t.replace(ele + " ", " ")
        # Remove mentions.
        t = re.sub('@[^\s]+', '', t)
        # Remove URLs.
        t = re.sub(
            r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
            " ", t)
        # Remove everything that is neither a word character nor whitespace.
        t = re.sub(r"[^\w\s]", "", t)
        # Note: an empty string is not "space", so '' is returned as ''.
        if (t.isspace()):
            return None
        return t

    """"" performs Part of the speech tagging input: text to parse and tweet id out: Default dict of tags """

    def NERPass(self, text, id):
        """Tag ``text`` via the Java gateway and return lemmatized word bags per tag.

        Also writes the removable tokens of the tweet (if any) to the slang
        file under ``id``.
        NOTE(review): every word is lemmatized with POS ``'v'``, including
        the noun groups - confirm this is intended.
        """
        Preprocessed = defaultdict()
        java_object = DynammicClustering.gateway.entry_point.getStack(
            text.lower())
        # Per the original note the gateway returns a mapping such as
        # {A: "adjective list", N: "nouns list", ...}; values are split on spaces below.
        keysToMatch = {'#', 'V', 'T'}
        nounKeys = {'N', '^', 'Z', 'M', 'S'}
        removeables = {'!', '~', 'G', 'E', '#'}
        slangs = []  # unused
        if 'U' in java_object:
            # Lemmatization normalizes the words so that more of them match.
            Preprocessed['U'] = mset(
                self.lmtz.lemmatize(word, 'v')
                for word in re.split(" ", java_object['U']))
        # Tags stored under their own key: '#', 'V', 'T'.
        for key in keysToMatch:
            if (key in java_object):
                Preprocessed[key] = mset(
                    self.lmtz.lemmatize(word, 'v')
                    for word in re.split(" ", java_object[key]))
        sett = None
        # All noun-like tags are merged under the single key 'N'.
        for key in nounKeys:
            if (key in java_object):
                if sett is None:
                    sett = mset(
                        self.lmtz.lemmatize(word, 'v')
                        for word in re.split(" ", java_object[key]))
                else:
                    sett = sett | mset(
                        self.lmtz.lemmatize(word, 'v')
                        for word in re.split(" ", java_object[key]))
        if sett != None:
            Preprocessed['N'] = sett
        sett = None
        # Slang and emoticons in the tweet, collected into one plain set.
        for key in removeables:
            if (key in java_object):
                if sett is None:
                    sett = set(
                        self.lmtz.lemmatize(word, 'v')
                        for word in re.split(" ", java_object[key]))
                else:
                    sett = sett | set(
                        self.lmtz.lemmatize(word, 'v')
                        for word in re.split(" ", java_object[key]))
        if sett != None:
            self.slang_writer.writerow({'id': id, 'slangs': list(sett)})
        return Preprocessed

    def MergeClusters(self, unitCluster):
        """Merge ``unitCluster`` into the most similar cached cluster, or start a new one.

        The best-scoring cluster is extended when its score exceeds
        ``threshold2``; otherwise a new cluster numbered ``len(MergeCache)``
        is created.
        """
        score = []
        # First cluster ever: nothing to compare against.
        if len(self.MergeCache) == 0:
            cno1 = len(self.MergeCache)
            self.MergeCache[cno1] = MergeCluster(cno1)
            self.MergeCache[cno1].Extend(unitCluster)
            return
        # One similarity result per cached cluster; element [0] is used as
        # the cluster key and element [1] as the score below.
        score = list(
            map(
                lambda x: self.MergeCache[x].similarity(
                    unitCluster, self.a, self.b, self.c, self.d),
                self.MergeCache.keys()))
        score = sorted(score, key=self.takeSecond, reverse=True)
        # print(score)
        # print("score"+str(score[0]))
        if (score[0][1] > self.threshold2):
            self.MergeCache[score[0][0]].Extend(unitCluster)
        else:
            # New event.
            cno1 = len(self.MergeCache)
            self.MergeCache[cno1] = MergeCluster(cno1)
            self.MergeCache[cno1].Extend(unitCluster)

    """""" """ Alltweet is a pd dataframe id|text|username|timestamp | | | """ """"""

    def takeSecond(self, elem):
        """Sort key: the second element of ``elem``."""
        return elem[1]

    def theLastMerge(self):
        """Write one ``(clusterno, tweetd)`` row per tweet id of every cached cluster."""
        # A disabled pass that re-merged similar clusters (score > .8, or the
        # best score above ``threshold3``) before writing used to live here
        # as commented-out code.
        for cluster in self.MergeCache:
            for td in self.MergeCache[cluster].ids:
                self.writer.writerow({'clusterno': cluster,
                                      'tweetd': td})
ptreeTEST1 = ParentedTree.convert(tree)
# Hand-built parse tree for "She was not admired."
ptreeTEST = ParentedTree('S', [
    ParentedTree('NP', [ParentedTree('PRP', ['She'])]),
    ParentedTree('VP', [
        ParentedTree('VBD', ['was']),
        ParentedTree('RB', ['not']),
        ParentedTree('VP', [ParentedTree('VBD', ['admired'])])
    ]),
    ParentedTree('.', ['.'])
])
for i in range(len(sentList[0])):
    print(i, sentList[0][i])
from nltk import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
#wordnet_lemmatizer.lemmatize('creating','v') # u'create'
#tokens = nltk.word_tokenize(test1)


def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Tags starting with ``J``, ``V``, ``N`` and ``R`` map to adjective, verb,
    noun and adverb respectively; any other tag (including an empty one)
    yields ``''``.

    The adverb branch used to compare the whole tag with ``'R'``, so real
    adverb tags (``RB``, ``RBR``, ``RBS``) were never recognised, and an
    empty tag raised ``IndexError``.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''
from nltk.corpus import stopwords
from pymongo import MongoClient

# One client / database handle is enough for all three collections.
_db = MongoClient("mongodb://localhost:27017/")["Dataset_Challenge_Reviews"]
reviews_collection = _db["Reviews"]
business_collection = _db["Business"]
corpus_collection = _db["Corpus"]

stopset = set(stopwords.words('english'))
# NOTE: from here on ``stopwords`` is the custom word table read from
# stopwords.txt and no longer the NLTK corpus reader imported above.
stopwords = {}
# Plain 'r' already gives universal newlines on Python 3; the old 'rU' mode
# flag is rejected by Python 3.11+.
with open('stopwords.txt', 'r') as f:
    for line in f:
        stopwords[line.strip()] = 1
lmtzr = WordNetLemmatizer()

# Register every Phoenix restaurant by its business id.
with open(
        '../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json'
) as dataset:
    for line in dataset:
        data = json.loads(line)
        if 'Restaurants' in data["categories"] and data['city'] == 'Phoenix':
            # NOTE(review): ``Collection.insert`` no longer exists in
            # PyMongo 4 (``insert_one`` replaces it) - confirm the pinned
            # driver version before upgrading.
            business_collection.insert({"_id": data["business_id"]})

n = 0
with open(
        '../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json'
) as dataset:
    for line in dataset:
        data = json.loads(line)
def __init__(self, model):
    """Store the model and load the intent data it was trained with.

    Reads ``intents.json``, ``words.pkl`` and ``classes.pkl`` from the
    current working directory. The files are opened in ``with`` blocks so
    the handles are closed again (they used to be left open).

    :param model: the model object; stored as ``self.model``.
    """
    self.model = model
    self.lemmatizer = WordNetLemmatizer()
    with open('intents.json') as intents_file:
        self.intents = json.loads(intents_file.read())
    # NOTE: unpickling executes arbitrary code; these files must only ever
    # come from a trusted training run.
    with open('words.pkl', 'rb') as words_file:
        self.words = pickle.load(words_file)
    with open('classes.pkl', 'rb') as classes_file:
        self.classes = pickle.load(classes_file)
def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
    """Configure the preprocessor.

    A falsy ``stopwords`` / ``punct`` argument is replaced by the NLTK
    English stop words / ``string.punctuation`` (as sets); a non-empty
    argument is stored as passed, without conversion.
    """
    if not stopwords:
        stopwords = set(sw.words('english'))
    if not punct:
        punct = set(string.punctuation)

    self.lower, self.strip = lower, strip
    self.stopwords, self.punct = stopwords, punct
    self.lemmatizer = WordNetLemmatizer()
class LemmaTokenizer(object):
    """Callable that word-tokenizes a document and lemmatizes each token."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmas for the tokens of ``doc``."""
        lemmas = []
        for token in word_tokenize(doc):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas
def preprocess(text):
    """Return the lemma of every lowercased token of ``str(text)``."""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in word_tokenize(str(text)):
        lowered = word.lower()
        lemmas.append(lemmatizer.lemmatize(lowered))
    return lemmas
stop = stopwords.words('english')
# Set copy for fast membership tests; ``stop`` itself stays a list.
_stop_lookup = set(stop)
# Code from https://www.kaggle.com/pjoshi15/so-many-outfits-so-little-time-word2vec
reviews_df['Review_Tidy'] = reviews_df['Review_Tidy'].apply(
    lambda x: ' '.join(
        [word for word in x.split() if word not in _stop_lookup]))

# Combining title and review
reviews_df["Review_Tidy"] = reviews_df["Title"].map(
    str) + " " + reviews_df["Review_Tidy"]

# Lemmatizing
# https://pythonprogramming.net/lemmatizing-nltk-tutorial/
print("Lemmatizing....")
from nltk import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# The previous loop iterated over the characters of every review and threw
# the result away, so nothing was ever lemmatized. Lemmatize word by word and
# store the result back in the column.
# NOTE(review): lowercasing only happens in the last step, and the WordNet
# lookup is case-sensitive; consider lowercasing first.
reviews_df['Review_Tidy'] = reviews_df['Review_Tidy'].apply(
    lambda review: ' '.join(
        lemmatizer.lemmatize(word) for word in review.split()))

# Remove Repeated Characters
print("Removing repeated Characters....")
import re
# Same defect as above: the substitution result was discarded. Runs of a
# repeated character are shortened to two occurrences (digits included).
reviews_df['Review_Tidy'] = reviews_df['Review_Tidy'].apply(
    lambda review: re.sub(r'(.)\1+', r'\1\1', review))

# Convert to lowercase
print("Converting words to lowercase...")
reviews_df['Review_Tidy'] = reviews_df['Review_Tidy'].str.lower()
class Preprocessor(BaseEstimator, TransformerMixin):
    """Transformer that turns each sentence into a normalized string.

    ``flag`` selects the normalization of every kept token:
    ``0`` keeps the token, ``1`` lemmatizes it with WordNet, ``2`` stems it
    with the supplied stemmer. ``flag1`` selects the stop word handling:
    ``0`` drops stop words, any other value rewrites them through
    :meth:`replace_stop_words`.
    """

    def __init__(self,
                 flag,
                 flag1,
                 stem,
                 stopwords=None,
                 punct=None,
                 lower=True,
                 strip=True):
        """
        :param flag: normalization mode (0, 1 or 2, see class docstring).
        :param flag1: stop word mode (0 = drop, otherwise rewrite).
        :param stem: stemmer object exposing ``stem(token)``.
        :param stopwords: stop word collection; NLTK English list if falsy.
        :param punct: punctuation characters; ``string.punctuation`` if falsy.
        :param lower: lowercase tokens.
        :param strip: strip whitespace, ``_`` and ``*`` from tokens.
        """
        self.flag = flag
        self.flag1 = flag1
        self.lemmatizer = WordNetLemmatizer()
        # ``self.stemmer`` is what ``tokenize`` uses (it shadows the
        # ``stemmer`` method on instances). ``self.stem`` is stored as well
        # because the ``stemmer`` method reads it and it was never set.
        self.stem = stem
        self.stemmer = stem
        self.lower = lower
        self.strip = strip
        self.stopwords = stopwords or set(sw.words('english'))
        self.punct = set(punct) if punct else set(string.punctuation)

    def fit(self, X, y=None):
        """Nothing to fit; returns ``self``."""
        return self

    def inverse_transform(self, X):
        """The transformation cannot be undone; returns ``X`` unchanged."""
        return X

    def transform(self, X):
        """Return the normalized string for every sentence in ``X``."""
        return [self.tokenize(sent) for sent in X]

    def tokenize(self, sentenses):
        """Normalize one sentence and return it as a space-joined string.

        :raises ValueError: if ``self.flag`` is not 0, 1 or 2 and at least
            one token survives filtering (this used to fail with an unbound
            local, or silently reuse the previous token's value).
        """
        pieces = []
        for token, tag in pos_tag(wordpunct_tokenize(sentenses)):
            if self.lower:
                token = token.lower()
            if self.strip:
                token = token.strip().strip('_').strip('*')

            only_punct = all(char in self.punct for char in token)
            if self.flag1 == 0:
                # Drop punctuation and stop words.
                if token in self.stopwords or only_punct:
                    continue
            else:
                # Drop punctuation, rewrite stop words.
                if only_punct:
                    continue
                if token in self.stopwords:
                    token = self.replace_stop_words(token)

            if self.flag == 0:
                lemma = token
            elif self.flag == 1:
                lemma = self.lemmatize(token, tag)
            elif self.flag == 2:
                lemma = self.stemmer.stem(token)
            else:
                raise ValueError(
                    "flag must be 0, 1 or 2, got %r" % (self.flag,))
            pieces.append(lemma)
        return ' '.join(pieces).strip()

    def lemmatize(self, token, tag):
        """
        Lemmatize ``token`` with the WordNet POS derived from the first
        letter of its Penn Treebank ``tag``; unknown tags count as nouns.
        """
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)
        return self.lemmatizer.lemmatize(token, tag)

    def stemmer(self, token):
        """Stem ``token`` with the configured stemmer.

        On instances this method is shadowed by the ``stemmer`` attribute
        set in ``__init__``; it is only reachable through the class.
        """
        return self.stem.stem(token)

    def replace_stop_words(self, token):
        """Rewrite a stop word token.

        The first entry of the module-level ``stop_words`` that occurs inside
        ``token`` decides the result: an entry containing ``n't`` yields its
        prefix up to the first ``n`` plus ``' not'``, any other entry yields
        ``' be'``. Without a match the token is returned unchanged.
        NOTE(review): the prefix is cut at the first ``n`` of the entry, not
        at the position of ``n't`` - confirm this is intended.
        """
        for stop in stop_words:
            if stop not in token:
                continue
            if "n't" in stop:
                return stop[0:stop.find('n')] + ' not'
            return ' be'
        return token
# NOTE(review): the next two statements are the tail of a definition that
# starts above this excerpt; they are reproduced unchanged.
slot.append(None)
return tokens, slot


def collide(l1, l2):
    """
    Detect whether l1 and l2 have at least one element in common.

    :param list l1: List 1.
    :param list l2: List 2.
    :return: True if the two lists share an element.
    :rtype: bool
    """
    return len(set(l1).intersection(l2)) > 0


# Shared lemmatizer used by ``lemmatize`` below.
wnl = WordNetLemmatizer()


def lemmatize(word):
    """
    Helper function of convert: reduce a word to a base form.

    A trailing ``ly`` is cut off first, then the word is passed through the
    WordNet lemmatizer once per part of speech, in the order ``v``, ``n``,
    ``a``, ``s``, each pass working on the result of the previous one.

    :param str word: word to convert.
    :return: the reduced word.
    :rtype: str
    """
    # Crude suffix strip: applied to every word ending in 'ly'.
    if word.endswith('ly'):
        word = word[:-2]
    word = wnl.lemmatize(word, 'v')
    word = wnl.lemmatize(word, 'n')
    word = wnl.lemmatize(word, 'a')
    word = wnl.lemmatize(word, 's')
    return word
def lemmatizing(self, text):
    """Return the WordNet lemma of every item yielded by iterating ``text``.

    NOTE(review): presumably ``text`` is a list of tokens; a plain string
    would be processed character by character - confirm against the caller.
    """
    wl = WordNetLemmatizer()
    lemmas = []
    for word in text:
        lemmas.append(wl.lemmatize(word))
    return lemmas