def data():
    # get current directory
    path = os.getcwd()
    # get one directory up
    path = os.path.dirname(path)

    WORD_VECTORS = "../embeddings/word2vec.txt"
    WORD_VECTORS_DIMS = 300
    TRAIN_DATA = path + "/datasets/ABSA16_Restaurants_Train_SB1_v2.xml"
    VAL_DATA = path + "/datasets/EN_REST_SB1_TEST.xml"
    max_length = 80

    # load word embeddings
    print("loading word embeddings...")
    word2idx, idx2word, embeddings = load_word_vectors(WORD_VECTORS,
                                                       WORD_VECTORS_DIMS)

    print("loading categories")
    entity_attribute_pairs = createCategories2()

    # load raw data
    print("loading datasets...")
    train_review, train_ent_attrib = \
        read_xml2_train3(entity_attribute_pairs, TRAIN_DATA)
    gold_review, gold_ent_attrib = \
        read_xml2_train3(entity_attribute_pairs, VAL_DATA)

    y_train = train_ent_attrib
    y_test = gold_ent_attrib

    print("Tokenizing...")
    # nltk tokenizer
    X_train = [
        casual_tokenize(x, preserve_case=False, reduce_len=True,
                        strip_handles=False) for x in train_review
    ]
    X_test = [
        casual_tokenize(x, preserve_case=False, reduce_len=True,
                        strip_handles=False) for x in gold_review
    ]

    print("Vectorizing...")
    X_train = numpy.array([vectorize(x, word2idx, max_length) for x in X_train])
    X_test = numpy.array([vectorize(x, word2idx, max_length) for x in X_test])

    # one-hot encode the labels; fit on the training labels only and reuse
    # the same encoding for the test labels (fit_transform on both could
    # produce inconsistent label orderings)
    label_encoder = LabelBinarizer()
    y_train = numpy.array(label_encoder.fit_transform(y_train))
    y_test = numpy.array(label_encoder.transform(y_test))

    return embeddings, X_train, X_test, y_train, y_test, max_length
def emotions_with_decay(self, s, N, M, D):
    # (s)tring, (N)umber of slices, slices in (M)emory, (D)ecay factor
    tokens = casual_tokenize(s)
    len_tokens = len(tokens)
    base_window = dict(zip(self.emotions, [0] * len(self.emotions)))
    summary = [base_window] * N
    # independent copies so a mutated prior cannot alias the others
    priors = [dict(base_window) for _ in range(M)]
    carry = 0.0
    done = 0
    for i in range(N):
        # distribute tokens over N slices, carrying the fractional remainder
        # forward so no trailing tokens are dropped (the original computed
        # the carry with an inverted sign, losing tokens at the end)
        exact = len_tokens / N
        w_size = int(exact + carry)
        carry = exact + carry - w_size
        w = tokens[done:min(done + w_size, len_tokens)]
        done += w_size
        summ = self.do_sentiments(w)
        # add decayed prior windows to this slice's summary
        summ_mem = summ.copy()
        for j in range(M):
            summ_mem = self.add_dict(summ_mem, priors[j])
        summary[i] = summ_mem
        # shift prior windows
        priors.pop(0)
        priors.append(summ)
        # decay prior windows
        priors = self.decay(priors, D)
    return pd.DataFrame(summary)
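# The slice sizing above works by carrying the fractional remainder of
# len(tokens) / N from one window to the next. A standalone sketch of that
# scheme (illustrative only; exact fractions avoid float drift):
from fractions import Fraction

def split_with_carry(tokens, n_slices):
    """Split tokens into n_slices contiguous windows, carrying the
    fractional remainder so the window sizes sum to len(tokens)."""
    exact = Fraction(len(tokens), n_slices)
    windows, carry, done = [], Fraction(0), 0
    for _ in range(n_slices):
        size = int(exact + carry)   # truncate; the remainder carries over
        carry = exact + carry - size
        windows.append(tokens[done:done + size])
        done += size
    return windows

# 10 tokens over 3 slices -> window sizes [3, 3, 4]; no tokens dropped
print([len(w) for w in split_with_carry(list(range(10)), 3)])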
def process_text(text, adds=None, removals=None):
    if adds is None:
        adds = set()
    if removals is None:
        removals = set()
    words = casual_tokenize(text, preserve_case=False)
    filtered = set()
    go_words = set()
    normed_go_words = set()
    # drop stopwords before stemming
    for x in words:
        if x in sw:
            filtered.add(x)
        else:
            go_words.add(x)
    # stem the remaining words, recording which surface forms map to each
    # stem, and drop any stem that is itself a stopword
    for x in go_words:
        nw = stemmer.stem(x)
        stem_record.setdefault(nw, set())
        stem_record[nw].add(x)
        if nw in sw:
            filtered.add(nw)
        else:
            normed_go_words.add(nw)
    normed_go_words = (normed_go_words | adds) - removals
    return normed_go_words
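# Usage sketch for process_text, assuming its module-level globals (sw,
# stemmer, stem_record are not shown in the original snippet) are defined
# roughly like this:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

sw = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
stem_record = {}

print(process_text("The runners were running happily!"))
# -> a set of stems such as {'runner', 'run', 'happili', '!'}
#    (exact output depends on the stopword list and stemmer)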
def getSentencePositivity(sentence):
    """
    Returns positivity of the given sentence from -1.0 (very negative)
    to 1.0 (very positive).

    May return None if no classifier exists to perform sentiment analysis.
    """
    classifier = __getClassifier()
    if classifier is None:
        return None

    # prepare for classifier
    tokenized = list(
        map(lambda x: 'I' if x == 'i' else x, casual_tokenize(sentence)))
    custom_tokens = __remove_noise(tokenized)

    # classify and get probability
    probdist = classifier.prob_classify(
        dict([token, True] for token in custom_tokens))
    pos = probdist.prob('Positive')
    normalized_pos = pos * 2 - 1

    # handle negation: invert with lower magnitude if negation is detected
    # in the sentence
    negation_count = len(
        list(
            filter(lambda x: x[1] == 'RB' and x[0] in ("not", "n't"),
                   pos_tag(tokenized))))
    normalized_pos *= (-0.2) ** negation_count

    return normalized_pos
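# Call pattern sketch; __getClassifier() and __remove_noise() are private
# helpers defined elsewhere in the module, so the score may be None:
for s in ("I love this!", "I do not love this!"):
    score = getSentencePositivity(s)
    if score is None:
        print("no classifier available")
    else:
        # a single negation flips the sign and shrinks the magnitude 5x
        print("%r -> %+.3f" % (s, score))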
def tokenize(self,
             text,
             a_preserve_case=True,
             a_reduce_len=False,
             a_strip_handles=False):
    return casual_tokenize(text,
                           preserve_case=a_preserve_case,
                           reduce_len=a_reduce_len,
                           strip_handles=a_strip_handles)
def tokenize(text):
    """Use the Twitter-aware casual tokenizer followed by
    WordNetLemmatizer on the extracted tokens."""
    # Implementation of casual_tokenize:
    # www.nltk.org/_modules/nltk/tokenize/casual.html
    tokens = casual_tokenize(text.lower())
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)
    return clean_tokens
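# Quick check of the pipeline above. Without POS tags, WordNetLemmatizer
# treats every token as a noun, so only nouns get normalized:
print(tokenize("The cats are running :)"))
# -> ['the', 'cat', 'are', 'running', ':)']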
def prepare_text_data(text_array, word2idx, MAX_LENGTH):
    """All the necessary text preprocessing before it enters the model."""
    print("Tokenizing...")
    X_train = [
        casual_tokenize(x, preserve_case=False, reduce_len=True,
                        strip_handles=False) for x in text_array
    ]
    print("Vectorizing...")
    # this already yields a numpy array; no further conversion needed
    X_train = numpy.array(
        [vectorize(x, word2idx, MAX_LENGTH) for x in X_train])
    return X_train
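# Sketch of how prepare_text_data might be called. vectorize() lives
# elsewhere in this codebase; the stand-in below (hypothetical) maps tokens
# to vocabulary ids and zero-pads to MAX_LENGTH:
import numpy

def vectorize(tokens, word2idx, max_length, unk=0):
    ids = [word2idx.get(t, unk) for t in tokens[:max_length]]
    return numpy.array(ids + [0] * (max_length - len(ids)))

word2idx = {"good": 1, "food": 2, "great": 3}  # toy vocabulary
X = prepare_text_data(["Good food, great service!"], word2idx, 10)
print(X.shape)  # (1, 10)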
def parse_sentence(s):
    '''
    Returns the tagged and tokenized sentence in the form of a
    (token, tag) list. If a (token, tag) list is given, it returns itself.
    This allows for redundant calls to make sure the sentence is tokenized.
    '''
    if isinstance(s, list):
        return s

    global IDENTITY
    if IDENTITY is None:
        import positivity
        IDENTITY = positivity.Sentience.getIdentity()

    s = s.replace('@' + IDENTITY, IDENTITY)
    tokens = list(
        map(lambda x: 'I' if x == 'i' else x,
            casual_tokenize(s, reduce_len=True)))
    # force tokens that match the bot's target to be tagged as nouns
    tagged_tokens = list(
        map(lambda x: (x[0], 'NN') if Understanding.matches_target(x[0]) else x,
            pos_tag(tokens)))
    return tagged_tokens
def check(cell):
    '''Spell check one markdown cell'''
    lines = cell['source']
    for i, line in enumerate(lines):
        # using casual_tokenize to get rid of urls and emojis
        words = casual_tokenize(line)
        words = [word for word in words if not word.startswith('http')]
        newline = ' '.join(words)
        # this handles e.g. contractions: you're, etc
        words = tokenizer.tokenize(newline)
        misspelled = spell.unknown(words)
        if len(misspelled):
            print_highlight(line, misspelled)
            for word in misspelled:
                new_word = handle_word(word)
                # learnt_words.append(new_word)
                if new_word:
                    line = update_line(line, word, new_word)
            cell['source'][i] = line
def preprocess(text):
    if not text or type(text) != str:
        return ''
    text = text.lower()
    text = re.sub(r"https?://[^\s]+", '', text)  # hyperlinks
    text = re.sub(r"\@\w+", '', text)            # mentions
    text = re.sub(r"#", '', text)                # hashtags
    text = re.sub(r"\d+\w*", '', text)           # numbers
    # possessives; note this also removes every "'s", so a later
    # "'s" -> " is" contraction rule would be unreachable (dropped here)
    text = re.sub(r"'s", '', text)
    text = re.sub(r"n't", ' not', text)          # contractions
    text = re.sub(r"'m", ' am', text)
    text = re.sub(r"'re", ' are', text)
    words = [word for word in casual_tokenize(text) if word not in stops]
    words = [
        lemmatizer.lemmatize(word, tag_for_lemmatizer(tag))
        for word, tag in pos_tag(words)
    ]
    text = ' '.join(words)
    return text
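# Smoke test, assuming the module-level stops/lemmatizer and the
# tag_for_lemmatizer helper exist roughly as below (the helper here is a
# guess at its behavior, not the original):
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

stops = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def tag_for_lemmatizer(tag):
    # map Penn Treebank tags to WordNet POS constants
    return {'J': wordnet.ADJ, 'V': wordnet.VERB,
            'R': wordnet.ADV}.get(tag[0], wordnet.NOUN)

print(preprocess("@user check https://t.co/x #wow I'm loving these 2 cats!"))
# -> roughly "check wow love cat !" (stopwords and numbers removed,
#    mention and URL stripped, remaining tokens lemmatized)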
def ingest(self, document, weight=1.0, purge_list=PURGE_LIST):
    doc_lower = UnicodeDammit(document).unicode_markup.lower()
    for word in BLACKLIST:
        if word in doc_lower:
            return

    word_stream = [
        item for item in casual_tokenize(document, reduce_len=True)
        if "/" not in item and item not in purge_list
    ]
    if len(word_stream) < 3:
        return

    # pad with begin/end markers so every word has a full n-gram context
    word_stream = [BEGIN_SYMBOL] * self.n_back + word_stream
    word_stream += [END_SYMBOL] * self.n_back

    for current_position, word in enumerate(word_stream):
        if current_position < self.n_back:
            # advance until we have enough words in memory to consider
            # (even if only begin symbols)
            continue
        prior_ngram = word_stream[(current_position -
                                   self.n_back):current_position]
        result_word = word_stream[current_position]
        self.chain = record_chain_link(prior_ngram, result_word,
                                       self.chain, weight)
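# record_chain_link() is defined elsewhere; a plausible minimal version the
# loop above would work with (hypothetical, for illustration only):
def record_chain_link(prior_ngram, result_word, chain, weight):
    # chain maps an n-gram (as a tuple) to {next_word: accumulated weight}
    key = tuple(prior_ngram)
    chain.setdefault(key, {})
    chain[key][result_word] = chain[key].get(result_word, 0.0) + weight
    return chain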
from nltk.tokenize.casual import casual_tokenize
from nltk.util import ngrams
import re

message = "RT @TJMonticello Best day everrrrrrr at Monticello.... Awesommmmmmeeeeeeee day :*) "
cas_tok = casual_tokenize(message)
print(cas_tok)
cas_tok2 = casual_tokenize(message, reduce_len=True, strip_handles=True)
print(cas_tok2)
print('------------------')

# Albanian: "Albiona Hoti started working as a software engineer at age 22."
sentence = "Albiona Hoti filloi punen si software engineer ne moshen 22 vjecare."
pattern = re.compile(r"([-\s.,;!?])+")
tokens = pattern.split(sentence)
tokens = [x for x in tokens if x and x not in '- \t\n.,;!?']
print(tokens)
print('---------------')

two_grams = list(ngrams(tokens, 2))
print(two_grams)
print('---------------')
eric.babble_and_evaluate_one()
print results["acc"]
eric.babble_and_evaluate_one()

vec, features = eric.fit_tfidf()
# eric.amnesia(3)

from nltk.tokenize.casual import casual_tokenize

for k, v in eric.observational_memory.items():
    try:
        print "{}".format(unicode(v["text"]))
    except:
        print "***"
    print casual_tokenize(v["text"])

print unicode(u'\U0001f98b')

for k, v in eric.observational_memory.items():
    eric.observational_memory[k]["stemmed_text"] = stem_and_tokenize(
        v["text"])[3]

eric.pickle_me()

for item in vec.get_feature_names():
    print item

from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=100)
reduced = svd.fit_transform(features)
def annotate_string(string):
    return liwc_inst.annotate_doc(casual_tokenize(string, reduce_len=True))
def summarize_string(string):
    return liwc_inst.summarize_doc(casual_tokenize(string, reduce_len=True))
def get_string_tokens(text: str) -> list:
    """Returns word tokens of input string."""
    return casual_tokenize(text)
from nltk.tokenize.casual import casual_tokenize

message = "RT @TJMonticello Best day everrrrrrr at Monticello. " \
          "Awesommmmmmeeeeeeee day :*)"
print(casual_tokenize(message))
print(casual_tokenize(message, reduce_len=True, strip_handles=True))
def generate_response(s, autoanswer_level=2):
    '''
    Generates a response for the given message with the given
    autoanswer_level (default 2).

    autoanswer_level:
    - 0: Do not answer except for debug calls
    - 1: Only respond rarely
    - 2: Respond sometimes if confident
    - 3: Respond whenever possible
    - 4: Always respond
    '''
    debug_out = Responder.process_debug_output(s)
    if debug_out:
        return debug_out

    #
    # SENTENCE PARSING
    #
    words = casual_tokenize(s.lower(), reduce_len=True)
    parsed_result = Understanding.parse_queries(s, merge_results=True)
    subject_call = parsed_result["subject_call"]
    queries = parsed_result["queries"]
    statements = parsed_result["statements"]
    tofu_tagged = Understanding.is_target_tagged(s)
    tofu_targeted = parsed_result["target_summoned"] or autoanswer_level >= 3
    someone_else_targeted = not tofu_targeted and subject_call

    query_types = set(map(lambda x: x[1], queries))
    too_complicated = len(query_types) > 1 or len(queries) > 4

    Sentience.exposeToMessage(s)
    mood = Sentience.getPrimaryMood()

    if autoanswer_level == 0:
        return None

    #
    # Greetings
    #
    if queries == [] and statements == [] and tofu_targeted:
        # greeting likely
        now = datetime.datetime.now()
        if mood > 0.3:
            if (6 <= now.hour <= 11) and 'morning' in words:
                return random.choice(['good morning', 'morning', 'おはよう']) \
                    + ('!' if mood > 0.75 else ('.' if mood < 0.5 else ''))
            if (19 <= now.hour <= 23 or now.hour <= 2) \
                    and ('night' in words or 'gn' in words):
                return random.choice(['good night', 'gn', 'おやすみ']) \
                    + ('.' if mood < 0.5 else '')
            if 'hello' in words or 'hi' in words:
                if mood > 0.7 or autoanswer_level >= 4:
                    return random.choice(['hello!', 'hi!', 'こんにちは!'])
        if mood <= 0.3:
            return random.choice(['bleh', 'o', 'meh', 'hmph'])

    #
    # Query answering
    #
    if too_complicated:
        if tofu_targeted:
            return random.choice([
                "i'm confused",
                "interesting question",
                "uh.. i am confused",
                "i don't understand what you mean",
                "this sentence is too complicated for me to understand",
                "this question is too confusing for me",
                "hmm",
            ])
        return None

    if 'STD_QN' in query_types and tofu_targeted:
        return random.choice([
            "sorry, this question is not within my capabilities to answer",
            "i can't answer that yet oops",
            "sorry, the question is too open-ended for me",
            "i don't know how to answer that, am weak in FRQs sry",
            "i'm not smart enough to know how to answer that",
            "that sounds like an interesting question",
            "hmm",
        ])

    if 'YN_QN' in query_types and (
            (autoanswer_level >= 2 and not someone_else_targeted)
            or tofu_targeted):
        filtered_queries = list(filter(lambda x: x[1] == 'YN_QN', queries))
        if len(filtered_queries) == 1:
            yes_opt = random.choice([
                "perhaps", "i believe yes", "yeah", "yes",
                "my deductions indicate yes", "maybe", "i think so",
                "very likely", "most definitely", "yes indeed", "i'd say yes"
            ])
            no_opt = random.choice([
                "maybe not", "my sources say no", "no", "nah",
                "i don't think so", "doubt it", "probably not",
                "most definitely not", "i think no", "not at all"
            ])
            rnd_opt = random.choice([
                "i'm not sure about that", "bleh", "interesting question",
                "i don't wanna tell you right now", "i don't have a clue",
                "hmmm", "my sources cannot be trusted"
            ])
            chosen = Sentience.decideResponseAgree(filtered_queries[0][0])
            if chosen is None:
                return rnd_opt
            return yes_opt if chosen else no_opt
        if len(filtered_queries) == 2:
            opt_1 = random.choice([
                "first option", "go with the first", "the former"
            ])
            opt_2 = random.choice([
                "second option", "on second thought, your second option",
                "the latter"
            ])
            opt_nil = random.choice([
                "why not both", "i can't find the answer to that",
                "i think neither", "can't decide, so i'll say yes"
            ])
            subj, pred1 = Understanding.parse_sentence_subject_predicate(
                filtered_queries[0][0])
            _, pred2 = Understanding.parse_sentence_subject_predicate(
                filtered_queries[1][0])
            chosen = Sentience.decideResponseOptionsIndex(subj, [pred1, pred2])
            if chosen == 0:
                return opt_1
            if chosen == 1:
                return opt_2
            return opt_nil
        if len(filtered_queries) > 2 and tofu_targeted:
            subject = None
            options = []
            for query, _ in filtered_queries:
                res = Understanding.parse_sentence_subject_predicate(query)
                if subject is None:
                    subject = res[0]
                options.append(res[1])
            chosen = Sentience.decideResponseOptionsIndex(subject, options)
            if chosen is None:
                return random.choice([
                    "i can't decide", "am a little confused here",
                    "not sure which one"
                ])
            return random.choice([
                "option %d it is", "i'll pick option %d",
                "i think option %d", "option %d"
            ]) % (chosen + 1)

    #
    # Misc responses
    #
    if mood > 0.5 and Sentience.getExposedPositivity() >= 0 \
            and autoanswer_level >= 1:
        if not tofu_targeted \
                and (IDENTITY.lower() in words or IDENTITY.lower() == s.lower()) \
                and random.random() <= 0.1:
            return random.choice(
                ['hmm i heard my name', 'hmmmm', 'interesting', 'hm'])
        if len(words) <= 5:
            combos = _get_message_combos()
            words_copy = words.copy()
            random.shuffle(words_copy)
            for word in words_copy:
                for w in [word, Understanding.remove_repeated_chars_word(word)]:
                    if w in combos:
                        w_response, w_response_chance = combos[w]
                        # increase chances at higher autoanswer levels
                        if autoanswer_level >= 3:
                            w_response_chance **= 0.25
                        # lower chances at lower autoanswer levels
                        if autoanswer_level <= 1:
                            w_response_chance **= 3
                        if tofu_tagged or random.random() <= w_response_chance:
                            return random.choice(w_response)
                        break

    roll = random.random()
    if Sentience.isExposedPositivityOverloaded():
        roll **= 2
    if autoanswer_level >= 4 or (autoanswer_level >= 2 and roll > 0.95) \
            or (tofu_targeted and roll > 0.75):
        if mood >= 0.3:
            x = Sentience.determineMessagePositivity(s)
            if x >= 0.6:
                return random.choice([
                    'ay', 'nice', ':D', 'yay', 'heh', 'haha', 'lol',
                ])
            if x < 0:
                return random.choice([
                    'oof', 'ono', 'uh', 'oops', 'sad', ':(', '.-.',
                ])
            return random.choice([
                'hmm', 'ah', 'hm', 'oof', 'interesting',
            ])
        return random.choice(['o', 'meh', 'm', '.'])
    return None
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
tokenized_sen = tokenizer.tokenize(sentence)
# print("TreebankWordTokenizer")
# print(tokenized_sen)

# Tokenize informal text from social networks such as Twitter and Facebook
from nltk.tokenize.casual import casual_tokenize

message = """RT @TJMonticello Best day everrrrrrr at Monticello.
Awesommmmmmeeeeeeee day :*)"""
tokenized_sen = casual_tokenize(message)
# print("casual_tokenize")
# print(tokenized_sen)

tokenized_sen = casual_tokenize(message, reduce_len=True, strip_handles=True)
# print("casual_tokenize reduce_len")
# print(tokenized_sen)

# Stop words
import nltk
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
import numpy as np
import pandas as pd
import seaborn as sns
import re
import nltk
from nltk.tokenize.casual import casual_tokenize
from nltk.util import ngrams
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sen = ("I know he gets knocked on a lot.But I like being un-sober "
       "and watching robots beat each other up. :)")
tokens = casual_tokenize(sen, reduce_len=True, strip_handles=True)
# ngrams() returns a generator; list() materializes it
print(list(ngrams(tokens, 2)))

analyzer = SentimentIntensityAnalyzer()
# print(analyzer.lexicon)
print(analyzer.polarity_scores(text=sen))
def tokenize_text(text):
    tokens = casual_tokenize(text)
    tokens = [word for word in tokens if word.isalpha() or word.isdigit()]
    # build the stopword set once instead of re-scanning the corpus list
    # for every token
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]
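# Usage sketch; note the stopword match is case-sensitive, so a
# capitalized 'The' survives while lowercase 'were' is dropped:
print(tokenize_text("The 3 cats were AMAZING!!! :)"))
# -> ['The', '3', 'cats', 'AMAZING']  ('!' and ':)' fail isalpha/isdigit)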
        res = []
        for doc in X:
            res.append(self._doc_transform(doc))
        return res


def avg_glove(df):
    vectors = []
    for t in tqdm(df.content.values):
        vectors.append(np.average(GloveStruct.glove.query(word_tokenize(t)),
                                  axis=0))
    return np.array(vectors)


def tfidf_glove(df, idf_dict):
    vectors = []
    for title in tqdm(df.content.values):
        glove_vectors = GloveStruct.glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis=0, weights=weights))
    return np.array(vectors)
"""

if __name__ == '__main__':
    __nltk_corpus_data_downloader()
    snowball = SnowballTokenizer()
    casual_tokenize('to be fair.aaa aaa aaa')
    print(snowball('to be fair. $3.11 beeb. aaa aaa aaa 80% 3453'))
    lemma = LemmaTokenizer()
    lemma('test')
    stem = StemTokenizer()
    stem('test')
def casualTokenize(raw_sentence, preserve_case=False):
    return casual_tokenize(raw_sentence,
                           preserve_case=preserve_case,
                           reduce_len=True)
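# reduce_len=True caps runs of three or more repeated characters at three,
# and preserve_case=False lowercases everything except emoticons, e.g.:
print(casualTokenize("Soooooooo HAPPY!!! @you :-D"))
# -> ['sooo', 'happy', '!', '!', '!', '@you', ':-D']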
from nltk.tokenize import RegexpTokenizer

# the dollar sign must be escaped; an unescaped "$" anchors to end-of-string,
# so "$[0-9.]+" could never match
tokenizer = RegexpTokenizer(r"\w+|\$[0-9.]+|\S+")
print(tokenizer.tokenize(sentence))
print("-" * 80)

from nltk.tokenize import TreebankWordTokenizer

sentence = """Monticello wasn't designated as UNESCO World Heritage Site until 1987."""
tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize(sentence))
print("-" * 80)

from nltk.tokenize.casual import casual_tokenize

message = """RT @TJMonticello Best day everrrrrrrr at Monticello.
Awesommmmmmmmmeeeeeeeee day :*)"""
print(casual_tokenize(message))
print(casual_tokenize(message, reduce_len=True, strip_handles=True))
print("-" * 80)

sentence = "Thomas Jefferson began building Monticello at the age of 26."
from nltk.util import ngrams

print(list(ngrams(tokenizer.tokenize(sentence), 2)))
print(list(ngrams(tokenizer.tokenize(sentence), 3)))
two_grams = list(ngrams(tokenizer.tokenize(sentence), 2))
print([" ".join(x) for x in two_grams])

stop_words = ["a", "an", "the", "on", "of", "off", "this", "is"]
tokens = ["the", "house", "is", "on", "fire"]
tokens_without_stopwords = [x for x in tokens if x not in stop_words]
print(tokens_without_stopwords)
print(
from three_step_decoding import *
from nltk.tokenize.casual import casual_tokenize

tsd = ThreeStepDecoding('lid_models/hinglish',
                        htrans='nmt_models/rom2hin.pt',
                        etrans='nmt_models/eng2eng.pt')

dataset = []
dataset_t = []
with open('/home/devanshg27/cm_parallel_data/en-hi-codemixed-corpus/s-enhi.txt'
          ) as f:
    for line in f:
        line = line.rstrip()
        line = casual_tokenize(line,
                               preserve_case=True,
                               reduce_len=True,
                               strip_handles=False)
        dataset.append(line)

with open('/home/devanshg27/cm_parallel_data/en-hi-codemixed-corpus/t-en.txt'
          ) as f:
    for line in f:
        line = line.rstrip()
        line = casual_tokenize(line,
                               preserve_case=True,
                               reduce_len=True,
                               strip_handles=False)
        dataset_t.append(line)

# keep only the sentence pairs where both sides tokenized to something
valid_idx = [
    i for i in range(len(dataset))
    if dataset[i] != [] and dataset_t[i] != []
]
def __init__(self,
             csv_path,
             tokenizer_name,
             use_stopwords=True,
             use_preprocessor=False,
             min_df=10,
             max_df=0.75,
             max_ngram=3):
    # Where data is stored
    self.csv_path = csv_path
    # Read data directly
    self.dataframe = pd.read_csv(self.csv_path)

    # Choose tokenizer
    if tokenizer_name == 'casual_std':
        self.tokenizer = lambda x: casual_tokenize(
            x, preserve_case=True, reduce_len=False, strip_handles=False)
    elif tokenizer_name == 'casual_reduce':
        self.tokenizer = lambda x: casual_tokenize(
            x, preserve_case=False, reduce_len=True, strip_handles=True)
    elif tokenizer_name == 'words':
        self.tokenizer = tokenize_words
    elif tokenizer_name == 'orig':
        self.tokenizer = tokenize
    else:
        raise NotImplementedError('Unknown tokenizer')

    # Stopwords; note that list.extend() returns None, so the extra tokens
    # have to be concatenated rather than extended in place
    if use_stopwords:
        self.stopwords = nltk.corpus.stopwords.words("english") + [
            "#ff", "ff", "rt"
        ]
    else:
        self.stopwords = None

    # Preprocessor
    if use_preprocessor:
        self.preprocessor = preprocess
    else:
        self.preprocessor = None

    # Some hyperparameters
    self.min_df = min_df
    self.max_df = max_df
    self.max_ngram = max_ngram

    # Vectorizer
    self.vectorizer = TfidfVectorizer(
        tokenizer=self.tokenizer,
        preprocessor=self.preprocessor,
        ngram_range=(1, self.max_ngram),
        stop_words=self.stopwords,
        use_idf=True,
        smooth_idf=False,
        norm=None,
        decode_error='replace',
        max_features=10000,
        min_df=self.min_df,
        max_df=self.max_df)

    # PosVectorizer
    self.pos_vectorizer = TfidfVectorizer(
        tokenizer=None,
        lowercase=False,
        preprocessor=None,
        ngram_range=(1, self.max_ngram),
        stop_words=None,
        use_idf=False,
        smooth_idf=False,
        norm=None,
        decode_error='replace',
        max_features=5000,
        min_df=5,
        max_df=0.75,
    )

    # Construct tfidf matrix and get relevant scores
    self.tfidf = self.vectorizer.fit_transform(
        self.dataframe['tweet']).toarray()
    self.vocab = {
        v: i for i, v in enumerate(self.vectorizer.get_feature_names())
    }
    self.idf_vals = self.vectorizer.idf_
    self.idf_dict = {i: self.idf_vals[i] for i in self.vocab.values()}
    print(f'A vocab was created. It consists of {len(self.vocab)} entries')

    # POS-tagging
    self.tweet_tags = [
        pos_tag_tweet(tweet, self.tokenizer, print_tweet=False)
        for tweet in self.dataframe['tweet']
    ]
    self.pos = self.pos_vectorizer.fit_transform(
        pd.Series(self.tweet_tags)).toarray()
    self.pos_vocab = {
        v: i for i, v in enumerate(self.pos_vectorizer.get_feature_names())
    }

    # Other features: this is untouched
    self.feats = get_feature_array(self.dataframe['tweet'])

    # Now join them all up
    self.features = np.concatenate([self.tfidf, self.pos, self.feats],
                                   axis=1)
    self.feature_names = [k for k, _ in self.vocab.items()] + [
        k for k, _ in self.pos_vocab.items()
    ] + [
        "FKRA", "FRE", "num_syllables", "avg_syl_per_word", "num_chars",
        "num_chars_total", "num_terms", "num_words", "num_unique_words",
        "vader neg", "vader pos", "vader neu", "vader compound",
        "num_hashtags", "num_mentions", "num_urls", "is_retweet"
    ]
    self.labels = self.dataframe['class']
    print(f'\n Data has been processed and is now available. '
          f'Feature dim: {self.features.shape}')
from nltk.tokenize.casual import casual_tokenize

message = """RT TJMonticello Best day everrrrrrr at Monticello.
Awesommmmmmeeeeeeee day"""
tokens = casual_tokenize(message)
print(tokens)
tokens = casual_tokenize(message,
                         preserve_case=False,
                         reduce_len=True,
                         strip_handles=True)
print(tokens)