def get_pos_distro_features(train, test=None, within=False):
    if within:
        X_train, X_test, y_train, y_test = split_dataset_within(
            train, int(args.length), False, False)
    else:
        X_train, X_test, y_train, y_test = split_dataset_cross(
            train, test, False, False)
    X_train, y_train = shuffle(X_train, y_train, random_state=42)
    X_test, y_test = shuffle(X_test, y_test, random_state=42)

    X_train_pos = []
    for item in X_train:
        item_pos = nltk.pos_tag(item.split())
        item_mapped = " ".join([
            nltk.map_tag("en-ptb", "universal", tag) for word, tag in item_pos
        ])
        X_train_pos.append(item_mapped)

    X_test_pos = []
    for item in X_test:
        item_pos = nltk.pos_tag(item.split())
        item_mapped = " ".join([
            nltk.map_tag("en-ptb", "universal", tag) for word, tag in item_pos
        ])
        X_test_pos.append(item_mapped)

    X_pos = X_train_pos + X_test_pos
    y_pos = y_train + y_test
    get_pos_distribution_features(X_pos, y_pos, "pos_tag_distributions.txt",
                                  model_name=args.model_name,
                                  output_type=args.output_type)
def predict_next_word(self, model, base_query_string):
    # Return the same matches as the argument model, with a probability that
    # also takes into account the grammar tags of the words of base_query_string.
    # First, get the tag sequence associated with the word sequence
    text = nltk.word_tokenize(base_query_string)
    tagged = nltk.pos_tag(text)
    # Transform it according to the universal tagset
    simplified = [nltk.map_tag('en-ptb', 'universal', tag) for word, tag in tagged]
    blank = " "
    simple = blank.join(simplified)
    # Predict the possible tags and their probabilities after this tag sequence
    # according to our Grammar Ngrams model
    tag_matches = self.predict_next_tag(simple)
    if len(tag_matches) == 0:
        return []
    # Keep only the predicted tag (the last one) of each match
    tags = [(match[-1], prob) for match, prob in list(set(tag_matches))]
    # Predict the possible words and their probabilities after the word sequence
    # according to the Word Ngrams model passed as argument of this method
    word_matches = list(set(model.predict_next_word(base_query_string)))
    # Keep only the predicted word (the last one) of each match
    words = [(match[-1], prob) for match, prob in word_matches]
    # Fetch the probability of the tag associated with each predicted word
    prior = []
    for word, prob in words:
        # Tag the predicted word
        tag = nltk.pos_tag(nltk.word_tokenize(word))
        universal_tag = nltk.map_tag('en-ptb', 'universal', tag[0][1])
        # Take the probability of the matching tag in our GrammarModel predictions,
        # or 0 if this tag is not possible after our tag sequence
        p = 0
        for candidate_tag, candidate_prob in tags:
            if universal_tag == candidate_tag:
                p = candidate_prob
        prior.append(p)
    # Return the same matches as our Word NgramModel, with a new probability
    better_matches = []
    for i in range(len(prior)):
        match, prob = word_matches[i]
        better_matches.append((match, prob * prior[i]))
    prob_sum = sum(m[-1] for m in better_matches)
    better_matches = [[m[0], m[-1] / prob_sum] for m in better_matches]
    return better_matches
def transform(doc):
    doc = word_tokenize(doc)
    for i in range(0, len(doc)):
        token = doc[i]
        token = token.rstrip("'")
        if token not in expansions and token not in unexpandables:
            token = token.lstrip("'")
        doc[i] = token
    doc = pos_tag(doc)
    doc = [(map_token(token), map_tag('en-ptb', 'universal', tag))
           for token, tag in doc]
    doc_by_process = {}
    for process in processes:
        processed_doc = None
        if process == 'parts_all':
            processed_doc = [tag for token, tag in doc]
        elif process == 'tokens_all':
            processed_doc = [token for token, tag in doc]
        elif process == 'tokens_dense':
            tags = primary_tags - set(['VERB', 'ADV', 'PRON'])
            processed_doc = [wnl.lemmatize(token) for token, tag in doc if tag in tags]
        elif process == 'tokens_other':
            processed_doc = [token for token, tag in doc if tag not in primary_tags]
        else:
            tags = set([process.split('_')[1].upper()])
            processed_doc = [token for token, tag in doc if tag in tags]
        doc_by_process[process] = ' '.join(processed_doc)
    return doc_by_process
def preprocess_text(text, affix, country, pos_tagging=False):
    # make sure you avoid counting URL and NUM as capitalized words (e.g. for German)
    # maybe @username?, maybe check again for RTs?
    with open("fileyouwontneed.txt", "a", encoding="utf-8") as outfile:
        tokenizer = TweetTokenizer(reduce_len=True, preserve_case=True)
        cleanr = re.compile('<.*?>')
        remove_markup = re.sub(cleanr, '', text)
        replace_urls = re.sub(r"http\S+", "URL", remove_markup)
        replace_digits = re.sub(r'\d+', "NUM", replace_urls)
        if affix.endswith("@"):
            replace_digits = re.sub(r'\.([a-zA-Z])', r'. \1', replace_digits)
        text = tokenizer.tokenize(replace_digits)
        if pos_tagging:
            original_text = text
            text = nltk.pos_tag(text)
            text = " ".join([
                nltk.map_tag("en-ptb", "universal", tag) for word, tag in text
            ])
            #text = " ".join([item[1] if item[1].startswith("NN") or item[1].startswith("VB") else item[0] for item in text])
            outfile.write("{0},{1},{2}\n".format(" ".join(original_text), text, country))
        else:
            text = " ".join(text)
    return text
def _get_meaning_set(self, sentence):
    if sentence is None or sentence == u'':
        return None
    tokens = word_tokenize(sentence)
    processed_tags = nltk.pos_tag(tokens)
    simplified_tags = [(word, map_tag('en-ptb', 'universal', tag))
                       for word, tag in processed_tags]
    return simplified_tags
def tag_histogram(text):
    tokenized_text = nltk.word_tokenize(text)
    tagged_text = nltk.pos_tag(tokenized_text)
    simplified_tagged_text = [(word, nltk.map_tag('en-ptb', 'universal', tag))
                              for word, tag in tagged_text]
    tagdict = defaultdict(int)
    for word, tag in simplified_tagged_text:
        tagdict[tag] += 1
    return tagdict
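# Hypothetical usage sketch for tag_histogram above (not part of the original
# code). Assumes the NLTK punkt, averaged_perceptron_tagger and
# universal_tagset resources are installed.
hist = tag_histogram("The quick brown fox jumps over the lazy dog.")
print(dict(hist))  # e.g. counts keyed by universal tags: {'DET': 2, 'NOUN': ..., ...}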
def tag_pos(x):
    sentences = sent_tokenize(x)
    sents = []
    for s in sentences:
        text = word_tokenize(s)
        pos_tagged = pos_tag(text)
        simplified_tags = [(word, map_tag('en-ptb', 'universal', tag))
                           for word, tag in pos_tagged]
        sents.append(simplified_tags)
    return sents
def POSDensitySimple(array):
    tagged = nltk.pos_tag(array)
    simplifiedTags = [(word, nltk.map_tag('en-ptb', 'universal', tag))
                      for word, tag in tagged]
    s = len(array)
    counts = dict(Counter(tag for word, tag in simplifiedTags))
    #counts = collections.UserDict(counts)
    for k in counts.keys():
        counts[k] *= 1.0 / s
        counts[k] = '%.4f' % (counts[k])
    return counts
def removeUnwantedWords(input):
    userInputWithOnlyQuestionAndKeywords = []
    posTagged = nltk.pos_tag(input)
    simplifiedTags = [(word, nltk.map_tag('en-ptb', 'universal', tag))
                      for word, tag in posTagged]
    for key, value in simplifiedTags:
        if (key.lower() in config.questionList) or \
                (value not in ('ADP', 'PRON', 'DET', 'CONJ', 'PRT') and key != 'is'):
            userInputWithOnlyQuestionAndKeywords.append(key)
        else:
            log.writetofile("blacklisted word: " + key)
    return userInputWithOnlyQuestionAndKeywords
def extract_words_plus_pos_tags(texts, lang):
    results = []
    if lang in stanford_lang_models:
        import nltk.tag.stanford as stanford_tagger
        tagger = stanford_tagger.StanfordPOSTagger(
            stanford_res_path + stanford_lang_models[lang],
            path_to_jar=stanford_res_path + "stanford-postagger.jar")
        results = tagger.tag(word_tokenize(texts, language=lang_map[lang]))
    if lang == 'en':
        # convert eng tags to universal tags
        results = [(word, map_tag('en-ptb', 'universal', tag))
                   for word, tag in results]
    return results
def train(self, words, tagged=False):
    if tagged is True:
        tags = []
        for i in range(len(words)):
            tags.append(words[i][1])
        self.ngrams = list(nltk.ngrams(tags, self.n))
    else:
        # text = nltk.word_tokenize(words)
        tagged_words = nltk.pos_tag(words)
        universal_tags = [nltk.map_tag('en-ptb', 'universal', tag)
                          for word, tag in tagged_words]
        self.ngrams = list(nltk.ngrams(universal_tags, self.n))
    self.frequencies = nltk.FreqDist(self.ngrams)
    self.probs_ng = nltk.MLEProbDist(self.frequencies)
    print(self.probs_ng)
def pos_tagging(tweet):
    """
    Finds the POS tag with the nltk pos_tag function, and then maps it to the
    tag required by the lemmatize function.

    :param tweet: list of words (tokens) represented as strings
    :type tweet: list
    :return: list of tuples (word, tag)
    :rtype: list
    """
    dict_tags = {
        'ADJ': 'a',
        'ADJ_SAT': 's',
        'ADV': 'r',
        'NOUN': 'n',
        'VERB': 'v'
    }
    tokens_tags = [
        (tokens[0], dict_tags[map_tag('en-ptb', 'universal', tokens[1])])
        if map_tag('en-ptb', 'universal', tokens[1]) in dict_tags
        else (tokens[0], '')
        for tokens in pos_tag(tweet)
    ]
    return tokens_tags
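# Hypothetical usage sketch (not part of the original code): feeding the
# (word, wordnet-pos) pairs returned by pos_tagging into WordNetLemmatizer,
# falling back to the default noun lemma when no tag was mapped. Assumes the
# NLTK wordnet, averaged_perceptron_tagger and universal_tagset resources are
# installed.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
tweet = ['the', 'dogs', 'were', 'running', 'quickly']
lemmas = [lemmatizer.lemmatize(word, pos=tag) if tag else lemmatizer.lemmatize(word)
          for word, tag in pos_tagging(tweet)]
# lemmas is expected to look like ['the', 'dog', 'be', 'run', 'quickly']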
def sentence_2_pos(self, sent):
    porter = PorterStemmer()
    # without stemming
    # text = nltk.word_tokenize(sent)
    # with stemming
    text = [porter.stem(word) for word in word_tokenize(sent)]
    posTagged = nltk.pos_tag(text)
    words_tags = [(word, nltk.map_tag('en-ptb', 'universal', tag))
                  for word, tag in posTagged]
    words = [item[0] for item in words_tags]
    tags = [item[1] for item in words_tags]
    pos_onehot = [self.one_hot_POS(i) for i in tags]
    pos = list(np.sum(np.array(pos_onehot), axis=0))
    return (words, pos)
def lemmatizeTokens(self, tokens):
    tokens_tagged = pos_tag(tokens)
    tokens_simpleTags = [(word, map_tag('en-ptb', 'universal', tag))
                         for word, tag in tokens_tagged]
    # Actually lemmatize.
    lemmas = []
    for token, tag in tokens_simpleTags:
        lemmatized = ""
        if tag == "VERB":
            lemmatized = self.lemmatizer.lemmatize(token, pos='v')
        elif tag == "ADJ":
            lemmatized = self.lemmatizer.lemmatize(token, pos='a')
        elif tag == "ADV":
            lemmatized = self.lemmatizer.lemmatize(token, pos='r')
        else:
            lemmatized = self.lemmatizer.lemmatize(token)  # pos = 'n'
        lemmas.append(lemmatized)
    return lemmas
def lemmatizeTokens(self, tokens):
    tokens_tagged = pos_tag(tokens)
    # Get simple POS tags.
    tokens_simpleTags = [(word, map_tag('en-ptb', 'universal', tag))
                         for word, tag in tokens_tagged]
    # Actually lemmatize.
    lemmas = []
    for token, tag in tokens_simpleTags:
        lemmatized = ""
        if tag == "VERB":
            lemmatized = self.lemmatizer.lemmatize(token, pos='v')
        elif tag == "ADJ":
            lemmatized = self.lemmatizer.lemmatize(token, pos='a')
        elif tag == "ADV":
            lemmatized = self.lemmatizer.lemmatize(token, pos='r')
        else:
            lemmatized = self.lemmatizer.lemmatize(token)  # pos = 'n'
        lemmas.append(lemmatized.encode("utf-8"))
    return lemmas
def pos_tagger(data):
    """
    Does POS tagging on the input text file.

    Parameters:
        data: str
            text file

    Returns:
        list
            POS tagged list of words
    """
    sentences = sent_tokenize(data)
    sents = []
    for s in sentences:
        text = word_tokenize(s)
        pos_tagged = pos_tag(text)
        sub_tags = [(word, map_tag('en-ptb', 'universal', tag))
                    for word, tag in pos_tagged]
        sents.append(sub_tags)
    return sents
def runSingleWords(self):
    percentage = float(self.getPluginParamValue("Percentage")) / 100.0
    minCharLength = int(self.getPluginParamValue("MinCharLength"))
    posFilter = self.getPluginParamValue("POS")
    inputContent = self.getInputContent().lower()
    punctuation = string.punctuation.replace("-", "")
    puncFilter = dict((ord(char), None) for char in punctuation)
    tokens = nltk.word_tokenize(inputContent.translate(puncFilter))
    tokensCnt = len(tokens)
    if tokensCnt < 1:
        self.raiseException("No words found")
    maxTokensCnt = int(percentage * tokensCnt)
    tags = nltk.pos_tag(tokens)
    pos = [(token, nltk.map_tag('en-ptb', 'universal', tag)) for token, tag in tags]
    filteredTokens1 = []
    for p in pos:
        if len(p[0]) < minCharLength:
            continue
        if p[1] not in posFilter:
            continue
        filteredTokens1.append(p)
    freqTokens = nltk.FreqDist(tokens)
    content = ""
    cnt = 0
    for freqToken in freqTokens.most_common(tokensCnt):
        for token in filteredTokens1:
            if freqToken[0] == token[0]:
                content = "{0}\n{1},{2},{3}".format(
                    content, token[0], token[1], freqToken[1])
                cnt += 1
                break
        if cnt >= maxTokensCnt:
            break
    content = content.strip()
    self.setAnalyzerContent(content)
    return content
def token_parse_amz(categ, path):
    if categ == 'Yelp' or categ == 'Tripadvisor':
        return
    done = 0
    start = time.time()
    # Load stopwords and tokenizer
    stopwds = stopwords.words('english')
    tokenizer = regexp.RegexpTokenizer(r"[\w']+", flags=re.UNICODE)
    with open(path, 'r') as g:
        for l in g:
            u = json.loads(json.dumps(eval(l)))
            if not (u.get('reviewerID') and u.get('asin') and u.get('reviewerName')
                    and u.get('helpful') and u.get('reviewText')):
                continue
            if u['helpful'][1] < 10:
                continue
            sentences = sent_tokenize(u['reviewText'])
            num_sent = len(sentences)
            num_tokens = 0
            num_pos = 0
            num_neg = 0
            sent_len = 0
            words = []
            for sentence in sentences:
                sent_len += len(sentence)
                tokens = tokenizer.tokenize(sentence)
                num_tokens += len(tokens)
                pos_tagged = pos_tag(tokens)
                simplified_tags = [(word, map_tag('en-ptb', 'universal', tag))
                                   for word, tag in pos_tagged]
                for word, tag in simplified_tags:
                    words.append({'word': word, 'pos': tag})
                    tf = tag[0].lower()
                    if tag == 'ADV':
                        tf = 'r'
                    if tag == 'NP' or tag == 'NUM':
                        tf = tag
                    # No need to calculate positive score
                    if tf in ['a', 'v', 'r', 'n']:
                        try:
                            sen_ls = swn.senti_synsets(word, tf)
                            if len(sen_ls) != 0:
                                sen_score = sen_ls[0]
                                pos_score = sen_score.pos_score()
                                neg_score = sen_score.neg_score()
                                # obj_score = sen_score.obj_score()
                                if pos_score > neg_score:
                                    num_pos += 1
                                if pos_score < neg_score:
                                    num_neg += 1
                        except WordNetError:
                            pass
            if num_sent != 0:
                sent_len = sent_len / num_sent
            tag = {}
            tag['num_sent'] = num_sent
            tag['sent_len'] = sent_len
            tag['num_tokens'] = num_tokens
            tag['num_pos'] = num_pos
            tag['num_neg'] = num_neg
            tag['words'] = words
            tag['review_id'] = u['reviewerID']
            tag['user_id'] = u['reviewerName']
            tag['item_id'] = u['asin']
            tag['votes'] = int(u['helpful'][1])
            tag['helpful'] = int(u['helpful'][0])
            done += 1
            if done % 1000 == 0:
                tmp = time.time() - start
                print(categ, 'Tagging reviews, Done ', done, ' in', tmp)
            yield str(tag)
def apply_syntactic_filters(pos_tagged_tokens, syntactic_filters):
    tags = [(word, map_tag('en-ptb', 'universal', tag))
            for word, tag in pos_tagged_tokens]
    return [word.lower() for (word, tag) in tags if tag in syntactic_filters]
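# Hypothetical usage sketch for apply_syntactic_filters (not part of the
# original code): keep only nouns and adjectives as keyword candidates, a
# common TextRank-style filter. Assumes nltk is available with the punkt,
# averaged_perceptron_tagger and universal_tagset resources.
import nltk

tokens = nltk.word_tokenize("Compatibility of systems of linear constraints")
candidates = apply_syntactic_filters(nltk.pos_tag(tokens), {'NOUN', 'ADJ'})
# candidates should be a lowercased list such as
# ['compatibility', 'systems', 'linear', 'constraints']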
def token_parse_yelp(categ, path):
    if categ != "Yelp" and categ != "Tripadvisor":
        return
    done = 0
    start = time.time()
    # Load stopwords and tokenizer
    stopwds = stopwords.words("english")
    tokenizer = regexp.RegexpTokenizer(r"[\w']+", flags=re.UNICODE)
    with open(path, "r") as g:
        for l in g:
            u = json.loads(json.dumps(eval(l)))
            if not (u.get("review_id") and u.get("user_id") and u.get("item_id")
                    and u.get("helpful") and u.get("votes") and u.get("text")):
                continue
            if int(u["votes"]) < 10:
                continue
            sentences = sent_tokenize(u["text"])
            num_sent = len(sentences)
            num_tokens = 0
            num_pos = 0
            num_neg = 0
            sent_len = 0
            words = []
            for sentence in sentences:
                sent_len += len(sentence)
                tokens = tokenizer.tokenize(sentence)
                num_tokens += len(tokens)
                pos_tagged = pos_tag(tokens)
                simplified_tags = [(word, map_tag("en-ptb", "universal", tag))
                                   for word, tag in pos_tagged]
                for word, tag in simplified_tags:
                    words.append({"word": word, "pos": tag})
                    tf = tag[0].lower()
                    if tag == "ADV":
                        tf = "r"
                    if tag == "NP" or tag == "NUM":
                        tf = tag
                    # No need to calculate positive score
                    if tf in ["a", "v", "r", "n"]:
                        try:
                            sen_ls = swn.senti_synsets(word, tf)
                            if len(sen_ls) != 0:
                                sen_score = sen_ls[0]
                                pos_score = sen_score.pos_score()
                                neg_score = sen_score.neg_score()
                                # obj_score = sen_score.obj_score()
                                if pos_score > neg_score:
                                    num_pos += 1
                                if pos_score < neg_score:
                                    num_neg += 1
                        except WordNetError:
                            pass
            if num_sent != 0:
                sent_len = sent_len / num_sent
            tag = {}
            tag["num_sent"] = num_sent
            tag["sent_len"] = sent_len
            tag["num_tokens"] = num_tokens
            tag["num_pos"] = num_pos
            tag["num_neg"] = num_neg
            tag["words"] = words
            tag["review_id"] = u["review_id"]
            tag["user_id"] = u["user_id"]
            tag["item_id"] = u["item_id"]
            tag["votes"] = int(u["votes"])
            tag["helpful"] = int(u["helpful"])
            done += 1
            if done % 100 == 0:
                tmp = time.time() - start
                print(categ, "Tagging reviews, Done ", done, " in", tmp)
                # break
            yield str(tag)
def manualTagNltk(self):
    sentence = 'Marley was dead : to begin with . There is no doubt whatever about that .'
    tokens = nltk.word_tokenize(sentence)
    taggedText = [(word, nltk.map_tag('brown', 'universal', tag))
                  for word, tag in self.manualTagBrown()]
    return list(taggedText)
def pos_senti(df_copy):  # takes
    li_swn = []
    li_swn_pos = []
    li_swn_neg = []
    missing_words = []
    for i in range(len(df_copy.index)):
        text = df_copy.loc[i]['tidy_tweet']
        tokens = nltk.word_tokenize(text)
        tagged_sent = nltk.pos_tag(tokens)
        store_it = [(word, nltk.map_tag('en-ptb', 'universal', tag))
                    for word, tag in tagged_sent]
        #print("Tagged Parts of Speech:", store_it)
        pos_total = 0
        neg_total = 0
        for word, tag in store_it:
            if tag == 'NOUN':
                tag = 'n'
            elif tag == 'VERB':
                tag = 'v'
            elif tag == 'ADJ':
                tag = 'a'
            elif tag == 'ADV':
                tag = 'r'
            else:
                tag = 'nothing'
            if tag != 'nothing':
                concat = word + '.' + tag + '.01'
                try:
                    this_word_pos = swn.senti_synset(concat).pos_score()
                    this_word_neg = swn.senti_synset(concat).neg_score()
                    #print(word, tag, ':', this_word_pos, this_word_neg)
                except Exception as e:
                    wor = lem.lemmatize(word)
                    concat = wor + '.' + tag + '.01'
                    # Check whether the lemmatized word is accepted by the SWN corpus
                    try:
                        this_word_pos = swn.senti_synset(concat).pos_score()
                        this_word_neg = swn.senti_synset(concat).neg_score()
                    except Exception as e:
                        wor = pstem.stem(word)
                        concat = wor + '.' + tag + '.01'
                        # Check whether the stemmed word is accepted
                        try:
                            this_word_pos = swn.senti_synset(concat).pos_score()
                            this_word_neg = swn.senti_synset(concat).neg_score()
                        except:
                            missing_words.append(word)
                            continue
                pos_total += this_word_pos
                neg_total += this_word_neg
        li_swn_pos.append(pos_total)
        li_swn_neg.append(neg_total)
        if pos_total != 0 or neg_total != 0:
            if pos_total > neg_total:
                li_swn.append(1)
            else:
                li_swn.append(-1)
        else:
            li_swn.append(0)
    df_copy.insert(4, "pos_score", li_swn_pos, True)
    df_copy.insert(5, "neg_score", li_swn_neg, True)
    df_copy.insert(6, "sent_score", li_swn, True)
    return df_copy
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

try:
    linker = WN_Linker(w, stopwords)
except LookupError:
    nltk.download('averaged_perceptron_tagger')
    linker = WN_Linker(w, stopwords)

try:
    nltk.word_tokenize('cat')
except LookupError:
    nltk.download('punkt')

try:
    nltk.map_tag('en-ptb', 'universal', 'NNP')
except LookupError:
    nltk.download('universal_tagset')

for dset in ['MSVD', 'MSRVTT']:
    json_fn = f'{dset}_parsed_captions.json'
    with open(json_fn) as f:
        d = json.load(f)
    new_dps = []
    for vidid, dp in d.items():
        atoms_with_synsets = linker.get_synsets_of_rule_parse(dp, convert=False)
        # Discard atoms that have a component that hasn't been linked to WN
        new_dp = dict(
            dp, **{
                'atoms_with_synsets': [
import nltk
import numpy as np
from nltk.corpus import gutenberg
import pickle


def save_object(obj, filename):
    with open(filename, "wb") as output:
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)


sents = gutenberg.sents("blake-poems.txt")

table = []
for i in range(20):
    table.append([])

for s in sents[1:]:
    # TODO prevent ?!, => . and don't count . in a sentence to length
    if len(s) > 2 and len(s) < 20:
        tags = nltk.pos_tag(s)
        simpleTags = [(word, nltk.map_tag("en-ptb", "universal", tag))
                      for word, tag in tags]
        tagsOnly = [t[1] for t in simpleTags]
        # this is to filter out headlines
        if tagsOnly[len(tagsOnly) - 1] == ".":
            wordCount = len(tagsOnly) - tagsOnly.count(".")
            table[wordCount].append(tagsOnly)

save_object(table, "grammar")
def convert_to_uni_tag(self, token):
    return '_'.join([token[0], nltk.map_tag('en-ptb', 'universal', token[1])])
def tag(self, tokens, tagset=None):
    tagged_tokens = APTaggerUtils.tagger.tag(tokens)
    if tagset:
        tagged_tokens = [(token, nltk.map_tag('en-ptb', tagset, tag))
                         for (token, tag) in tagged_tokens]
    return tagged_tokens
def pos_tag_simplified(self, tokenized):
    tagged = self.pos_tag(tokenized)
    simplified = [(word, nltk.map_tag('en-ptb', 'universal', tag))
                  for word, tag in tagged]
    return simplified
def convert_tag_to_universal(tag):
    return nltk.map_tag('en-ptb', 'universal', tag)
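# Hypothetical usage sketch (not part of the original code): mapping a few
# Penn Treebank tags to the universal tagset with convert_tag_to_universal
# above. Assumes nltk and the universal_tagset resource are installed.
for ptb_tag in ['NNP', 'VBZ', 'JJ']:
    print(ptb_tag, '->', convert_tag_to_universal(ptb_tag))
# expected: NNP -> NOUN, VBZ -> VERB, JJ -> ADJ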