def levenshtein_dist(message1, message2, stop_words=None, thr=0.5, substitution_cost=2):
    """
    Counts the number of words in the message whose Levenshtein distance from any
    word in the "question" is smaller than a threshold.
    - tokens1 and tokens2 are the tokenized message/question; order should not matter
    - thr is the threshold relative to the word length (default: half the length)
    - stop_words is an array of stop words; if given, words in this array are ignored
    - substitution_cost is the cost of a substitution in the Levenshtein distance
    """
    tokens1 = tokenize.casual_tokenize(message1)
    tokens2 = tokenize.casual_tokenize(message2)
    dist = 0
    for t1 in tokens1:
        if stop_words is not None and t1 in stop_words:
            continue
        for t2 in tokens2:
            if stop_words is not None and t2 in stop_words:
                continue
            n = max(len(t2), len(t1))
            l = Lev(t2, t1, substitution_cost=substitution_cost)
            if l <= (n * thr):
                dist += 1
    return dist
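# A minimal usage sketch. The snippet above relies on module-level imports that are
# not shown; the assumption here is that `tokenize` is nltk.tokenize and `Lev` is an
# edit-distance function accepting substitution_cost, e.g. nltk.edit_distance.
from nltk import tokenize
from nltk import edit_distance as Lev

# counts message words that approximately match some question word
print(levenshtein_dist("How do I reset my password?",
                       "pasword reset help",
                       stop_words=["how", "do", "i", "my"]))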
def _get_tags(self, string, tagset='universal'):
    word_list = casual_tokenize(string)
    tag_list = pos_tag(word_list, tagset=tagset)
    out_list = []
    for _, item in tag_list:
        out_list.append(self.TAG_DICT[item])
    return out_list
def word_hashtag_extraction_noun_based(tweet_dictionary):
    text = tweet_dictionary['text']
    # skip over retweets (return empty results of the same types as the normal path)
    if text[0:4] == 'RT @':
        return {}, set()
    text = casual_tokenize(tweet_dictionary['text'])
    result = nltk.pos_tag(text)
    hashtags = ['#' + s for s in tweet_dictionary['hashtags']]
    urls = tweet_dictionary['urls']
    user_mentions = ['@' + s for s in tweet_dictionary['user_mentions']]
    ignore = hashtags + urls + user_mentions
    all_noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
    proper_noun_tags = ['NNP', 'NNPS']
    cleaned_text = {}
    for word, tag in result:
        if (tag in all_noun_tags) and (word not in ignore) and isEnglish(word):
            if tag in proper_noun_tags:
                cleaned_text[word.lower()] = PROPER_NOUN_WORD_TYPE
            else:
                cleaned_text[word.lower()] = REGULAR_NOUN_WORD_TYPE
    return cleaned_text, set(h.lower() for h in hashtags)
def tokenize(text):
    text = text.lower()
    tokens = casual_tokenize(text)
    stemmer = PorterStemmer()
    # filter stop words before stemming, then once more because a stem can
    # itself collide with a stop word
    stems = [stemmer.stem(w) for w in tokens if w not in STOPWORDS]
    stems = [w for w in stems if w not in STOPWORDS]
    return stems
def pray_with_simplification(ineed, homog=True, threshold=.67, min_tokens=4, max_deletions=3):
    """
    Tries to find a match, dropping a trailing token until there is one, e.g.
        I need to dance the rhumba at the Stork Club
        I need to dance the rhumba at the Stork
        I need to dance the rhumba at
        I need to dance the rhumba
        I need to dance the
    until there are not enough tokens left (after the first instance of "need")
    or max_deletions is reached.
    Returns a prayer dict if successful, None if unsuccessful.
    """
    for i in range(max_deletions):
        ineed = homogenize(ineed)
        prayer = pray(ineed, homog=False)  ## already homogenized above
        if prayer["score"] >= threshold:
            return prayer
        ## delete the last word-ish token and make sure enough remains
        deeni = ineed[::-1]  ## reverse
        ineed_new = deeni.split(" ", 1)[1][::-1]
        if ineed == ineed_new:
            return None
        ineed = ineed_new
        if len(tokenize.casual_tokenize(ineed.split("need", 1)[1])) < min_tokens:
            ## must have at least n tokens after "I need"
            return None
def spell_tokenizer(text):
    """
    Perform word tokenization using casual_tokenize after spelling correction
    :param text: string without punctuation
    :return: list of tokens
    """
    tokens = []
    corrector = jamspell.TSpellCorrector()
    corrector.LoadLangModel('model_en.bin')
    for word in casual_tokenize(rm_punctuation(text), preserve_case=False,
                                reduce_len=True, strip_handles=True):
        if not bool(re.search(r'\d', word)):
            corr_word = corrector.GetCandidates([word], 0)
            if (len(corr_word) > 0) and (word != corr_word[0]):
                for candidate in corr_word[:1]:
                    tokens.append(candidate)
            else:
                tokens.append(word)
    wordnet_lemmatizer = WordNetLemmatizer()
    stems = [wordnet_lemmatizer.lemmatize(item) for item in tokens]
    # stemmer = PorterStemmer()
    # stems = [stemmer.stem(item) for item in tokens]
    return stems
def word_hashtag_extraction_noun_based(tweet_dictionary):
    text = tweet_dictionary['tweet']
    text = casual_tokenize(text)
    result = nltk.pos_tag(text)
    all_noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
    proper_noun_tags = ['NNP', 'NNPS']
    cleaned_text = {}
    for word, tag in result:
        # words
        if (tag in all_noun_tags) and isEnglish(word) and (len(word) >= MIN_WORD_LEN):
            if tag in proper_noun_tags:
                cleaned_text[word.lower()] = PROPER_NOUN_WORD_TYPE
            else:
                cleaned_text[word.lower()] = REGULAR_NOUN_WORD_TYPE
    hashtags = tweet_dictionary['hashtags']
    cleaned_tags = set()
    for h in hashtags:
        if isEnglish(h[1:]):
            cleaned_tags.add(h.lower())
    return cleaned_text, cleaned_tags
def add_new_template(self, text):
    sent = random.choice(tokenize.sent_tokenize(text))
    print(sent)
    template = [tag for token, tag in pos_tag(tokenize.casual_tokenize(sent))]
    if template not in self.gen.templates:
        self.gen.templates.append(template)
    return template
def parse_doc(doc):
    stemmer = SnowballStemmer("english")
    stems = []
    for word in casual_tokenize(doc):
        stem = stemmer.stem(word.lower())
        if stem.isalpha():
            stems.append(stem)
    return ' '.join(stems)
def _simple_chunking(self, astring, minimum=3, maximum=10):
    tagged = pos_tag(tokenize.casual_tokenize(astring))
    allpossible = []
    for n in range(minimum, maximum):  # len(tagged)-maxminus):
        allpossible += list(ngrams(tagged, n))
    ok = [c for c in allpossible if self._test_chunk(c)]
    return ok
def top_tokens(text):
    freq_dict = defaultdict(int)
    tokens = casual_tokenize(text)
    for token in tokens:
        freq_dict[token] += 1
    return sorted(freq_dict, key=freq_dict.get, reverse=True)
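# A quick usage check for top_tokens (assumes defaultdict and casual_tokenize are
# imported at module level, as the function requires).
print(top_tokens("Fast is fine, but accuracy is everything. :) :) #quotes"))
# casual_tokenize keeps emoticons and hashtags as single tokens, so the repeated
# "is" and ":)" sort to the front of the returned ranking.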
def get_stem(lang, sentence):
    stemmer = SnowballStemmer(lang)
    stemmed = ''
    for word in casual_tokenize(sentence):
        word = stemmer.stem(word)
        stemmed = stemmed + word + ' '
    return stemmed
def lemmatize(sentence):
    res = []
    lmtzr = WordNetLemmatizer()
    words = casual_tokenize(sentence, preserve_case=False, reduce_len=True,
                            strip_handles=True)  # TweetTokenizer
    for word, pos in pos_tag(words):  # POS tagging
        wordnet_pos = get_wordnet_pos(pos) or wordnet.NOUN
        res.append(lmtzr.lemmatize(word, pos=wordnet_pos))  # lemmatize based on tags
    return res
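# get_wordnet_pos is not defined in the snippet above. A common sketch (an
# assumption, not the original helper) maps Penn Treebank tags to WordNet POS
# constants; unknown tags fall through to None, which the caller replaces with
# wordnet.NOUN.
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return None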
def add_new_text_to_generator(self, text):
    tokenized_and_tagged = pos_tag(tokenize.casual_tokenize(text))
    print(tokenized_and_tagged)
    for token, tag in tokenized_and_tagged:
        self.gen.vocab[tag].append(token)
    print("vocablength:")
    print(len(list(itertools.chain(*self.gen.vocab.values()))))
    return len(list(set(self.gen.vocab)))
def get_compression_data(origin_data, output_labels, flag_data, name=''):
    data_split = casual_tokenize(origin_data)
    out_1 = []
    for data, pred_labels, oov_flag in zip(data_split, output_labels[1:], flag_data):
        if pred_labels == 1:
            # out_1.append(data if oov_flag != 3 else data + '(OOV)')
            out_1.append(data)
    print(name, end='\t\t\t')
    print(' '.join(out_1))
def normalize_corpus_words(corpus, stemmer=stemmer, synonyms=SYNONYMS, stopwords=STOPWORDS):
    docs = [doc.lower() for doc in corpus]
    docs = [casual_tokenize(doc) for doc in docs]
    docs = [[synonyms.get(w, w) for w in words if w not in stopwords] for words in docs]
    if stemmer:
        docs = [[stemmer.stem(w) for w in words if w not in stopwords] for words in docs]
    docs = [[synonyms.get(w, w) for w in words if w not in stopwords] for words in docs]
    docs = [' '.join(w for w in words if w not in stopwords) for words in docs]
    return docs
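# A minimal usage sketch. The module-level stemmer, SYNONYMS, and STOPWORDS that the
# signature defaults refer to are not shown above (and must exist before the def is
# executed, since defaults are evaluated at definition time); hypothetical stand-ins
# are passed explicitly here.
from nltk.stem.porter import PorterStemmer

SYNONYMS = {'best': 'greatest'}              # hypothetical synonym map
STOPWORDS = {'the', 'a', 'an', 'is', 'are'}  # hypothetical stop list

corpus = ["NLP is the best!", "Tokenizers are the first step."]
print(normalize_corpus_words(corpus,
                             stemmer=PorterStemmer(),
                             synonyms=SYNONYMS,
                             stopwords=STOPWORDS))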
def run(self, text, val):
    """
    TODO Improvements:
    1. casual_tokenize can't handle 'words-with-hyphens-like-this' and reduces coverage
    """
    # Remove new lines and convert to lower case
    # TODO what if only wanting to read the first x lines, but that should only be for ML purposes
    self.val = val
    text = re.sub('\n', ' ', text).lower()
    # Extract keyphrases using Rake
    # TODO also possible to extract keywords from sentences
    rake = Rake()
    if val == 'article':
        rake.extract_keywords_from_text(text)
    elif val == 'social':
        rake.extract_keywords_from_sentences(text)
    all_phrases = rake.get_ranked_phrases_with_scores()
    word_freq_dist = rake.get_word_frequency_distribution()
    # Tokenize text
    article_text_tokenized = casual_tokenize(text)
    # Tokenize phrases
    all_phrases_tokenized = self.tokenize_phrases(all_phrases)
    # Tag all phrases and remove all but noun words
    all_phrases_tagged = self.pos_tag_phrase_pairs(all_phrases)
    all_phrases_tagged_nouns = self.filter_nouns(all_phrases_tagged)
    # Convert the list of tagged nouns back to a string phrase
    string_phrases_nouns = self.tuple_list_to_string_list(all_phrases_tagged_nouns)
    # Get the indexes of the non-filtered suggested phrases in the original text
    all_surrounding_tokens, all_context_tokens = self.get_all_surrounding_tokens(
        all_phrases_tokenized, article_text_tokenized)
    # Get Wikipedia URLs for the top 10 phrases
    mapping_list = self.get_wiki_urls_top_n_phrases(
        string_phrases_nouns, all_surrounding_tokens, 10)
    # Write the suggestion mapping to JSON
    wiki_mapping = self.write_suggestions_to_json(mapping_list)
    # print(json.dumps(wiki_mapping))
    # Get page links on Medium by phrase
    medium_mapping = self.get_n_listed_medium_posts(string_phrases_nouns, 2)
    # print(json.dumps(medium_mapping))
    # Combine the mappings and return them to the console
    mapping = self.combine_mappings(wiki_mapping, medium_mapping)
    print(json.dumps(mapping))
def tokenize(text, corpus=tfidf_dense):
    docs = [text.lower()]
    docs = [casual_tokenize(doc) for doc in docs]
    docs = [[SYNONYMS.get(w, w) for w in words if w not in STOPWORDS] for words in docs]
    stemmer = PorterStemmer()
    docs = [[stemmer.stem(w) for w in words if w not in STOPWORDS] for words in docs]
    docs = [[SYNONYMS.get(w, w) for w in words if w not in STOPWORDS] for words in docs]
    docs = [' '.join(w for w in words if w not in STOPWORDS) for words in docs]
    stems = [w for w in docs[0].split() if w in corpus.columns]
    return stems
def clean_tweet(tweet):
    tweet = re.sub(r"https?://\S+", "", tweet)
    # tweet = re.sub(URLS, "", tweet)
    toks = casual_tokenize(tweet, preserve_case=False, reduce_len=True, strip_handles=True)
    # toks = [''.join(c for c in s if c not in punctuation) for s in toks]
    toks = [s for s in toks if s]
    return toks
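# A small usage sketch: strip_handles drops @mentions, reduce_len trims runs of
# repeated characters to three, and the regex above removes the URL first, so the
# call below should yield roughly ['this', 'is', 'sooo', 'cool', '!', '!', '!', '#nlp'].
print(clean_tweet("@nltk_org this is sooooooo cool!!! https://example.com #nlp"))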
def prop_name_count(message, names):
    """
    Counts the appearances of proper names (names is an array of proper names)
    in the tokenized message.
    """
    tokens = tokenize.casual_tokenize(message)
    d = 0
    for token in tokens:
        if token.capitalize() in names:
            d += 1
    return d
def preprocess(text, name, speaker='U', first_name=None):
    """
    Normalize and tokenize raw text.
    args:
        text: input raw text (str)
        name: user name (str)
        speaker: 'S' if this is a system turn, 'U' for a user turn (str)
        first_name: user's first name (str)
    return:
        normalized text (str)
    """
    # normalize apostrophe and quote characters
    text = re.sub(u'’', "'", text)
    text = re.sub(u'(“|”)', '', text)
    # remove handle names at the beginning
    text = re.sub(r'^(@[A-Za-z0-9_]+[\.;, ])+', '', text)
    # remove connected-tweet indicators, e.g. (1/2) (2/2)
    text = re.sub(r'(^|[\(\[ ])[1234]\/[2345]([\)\] ]|$)', ' ', text)
    # replace long numbers
    text = re.sub(r'(?<=[ A-Z])(\+\d|\d\-|\d\d\d+|\(\d\d+\))[\d\- ]+\d\d\d', '<NUMBERS>', text)
    # replace the user name in system responses
    if speaker == 'S':
        if name:
            text = re.sub('@' + name, '<USER>', text)
        if first_name:
            text = re.sub('(^|[^A-Za-z0-9])' + first_name + '($|[^A-Za-z0-9])',
                          '\\1<USER>\\2', text)
    # tokenize and replace entities
    words = casual_tokenize(text, preserve_case=False, reduce_len=True)
    for n in six.moves.range(len(words)):
        token = words[n]
        # replace entities with tags (E-MAIL, URL, NUMBERS, USER, etc.)
        token = re.sub(r'^([a-z0-9_\.\-]+@[a-z0-9_\.\-]+\.[a-z]+)$', '<E-MAIL>', token)
        token = re.sub(r'^https?:\S+$', '<URL>', token)
        token = re.sub('^<numbers>$', '<NUMBERS>', token)
        token = re.sub('^<user>$', '<USER>', token)
        # make spaces around apostrophes and periods
        token = re.sub(r'^([a-z]+)\'([a-z]+)$', '\\1 \'\\2', token)
        token = re.sub(r'^([a-z]+)\.([a-z]+)$', '\\1 . \\2', token)
        words[n] = token
    # join
    text = ' '.join(words)
    # remove tweet signatures (e.g. ... ^TH, - John, etc.)
    if speaker == 'S':
        text = re.sub(
            u'[\\^\\-~–][\\-– ]*([a-z]+\\s*|[a-z ]{2,8})(<URL>\\s*$|\\.\\s*$|$)',
            '\\2', text)
        if not re.search(r' (thanks|thnks|thx)\s*$', text):
            text = re.sub(u'(?<= [\\-,!?.–])\\s*[a-z]+\\s*$', '', text)
    return text
def parse_file(filename):
    tree = etree.parse(filename)
    # iterate over documents
    for doc in tree.getroot():
        # iterate over paragraphs (<P> elements)
        for paragraph in doc.find('TEXT'):
            text = paragraph.text
            # tokenize and split into sentences
            for s in sent_tokenize(text):
                tok = casual_tokenize(s.replace("\n", " "))
                if len(tok) < LEN_LIMIT:
                    print(" ".join(tok))
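# A minimal input sketch for parse_file. The TEXT and P element names come from the
# code itself; the surrounding root and DOC elements are assumptions (roughly the
# Gigaword/TIPSTER-style XML shape this parser expects).
#
# <CORPUS>
#   <DOC>
#     <TEXT>
#       <P>First paragraph of the document.</P>
#       <P>Second paragraph, which is sentence-split, tokenized, and printed.</P>
#     </TEXT>
#   </DOC>
# </CORPUS>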
def get_sentiment(classifier, tweets, keep_status=True):
    '''
    Takes tweets as a list of dictionaries.
    Returns tweets as a list of dictionaries with sentiment labels.

    Arguments:
        classifier: NLTK Naive Bayes Classifier object.
        tweets: Tweets as a list of dictionaries.
        keep_status: Set to False to remove the tweet status and reduce data size,
            keeping the sentiment distribution and label.

    Returns:
        tweets: Tweets as a list of dictionaries.
    '''
    print('Starting text analysis...')
    print('Scoring tweets...')
    # get positive and negative probabilities for each tweet
    for tweet in tweets:
        custom_tokens = helpers.remove_noise(casual_tokenize(tweet['status']))
        dist = classifier.prob_classify(dict([token, True] for token in custom_tokens))
        pos_probability = dist.prob('Positive')
        neg_probability = dist.prob('Negative')
        # add sentiment probabilities to tweet dictionary
        try:
            tweet['positive'] = pos_probability
            tweet['negative'] = neg_probability
        except Exception as e:
            print(e)
        # add sentiment label to tweet dictionary
        if pos_probability >= 0.9:
            tweet['label'] = 'Very Positive'
        elif pos_probability >= 0.7:
            tweet['label'] = 'Positive'
        elif pos_probability > 0.3 and neg_probability > 0.3:
            tweet['label'] = 'Neutral'
        elif neg_probability >= 0.9:
            tweet['label'] = 'Very Negative'
        elif neg_probability >= 0.7:
            tweet['label'] = 'Negative'
        else:
            tweet['label'] = 'None'
        # optional: remove tweet status to reduce data size (keeping sentiment distribution and label)
        if tweet['status'] and keep_status == False:
            del tweet['status']
    print('Text analysis complete!\n')
    return tweets
def get_word_based_on_pos_and_vector(pos, keysetters, ineed, threshold=.42, probability=.8, top_n=7):
    """
    Tries to find a word of a specific POS that is similar to one or more words,
    called keysetters. Similarity is cosine distance on word vectors, with a
    threshold applied. The top_n similar-enough words are then ranked by a second
    comparison: maximum similarity to a word in the ineed statement.
    If this fails, the list is simply ordered by proximity of the words to the
    keysetters. If that also fails, the first keysetter is returned, assumed to be
    a valid input. Returns one option, using randomness to add variation.
    For instance, given the POS "NN", the keysetters ["food"], and the statement
    "I need some sushi", this function should build a list of words like "bread",
    "meat", and "fish", and return "fish" because it is close to "sushi".
    """
    default_word = keysetters[0]
    options = myvocabulary_filtered[pos]
    # options = [w.split(">")[0] for w in options]  ## not necessary; already filtered
    options = list(set(options))
    ## sometimes narrow the keysetters down to a single random one
    if random.choice([True, True, False]):
        keysetters = [random.choice(keysetters)]
    try:
        options_sorted_according_to_keysetters = tune_one_list_according_to_max_similarity_to_another_list(
            options, keysetters, threshold)
        try:
            ### must have at least n options
            options_tuned_according_to_need = tune_one_list_according_to_max_similarity_to_another_list(
                options_sorted_according_to_keysetters[:top_n],
                [w.lower() for w in tokenize.casual_tokenize(ineed) if w.isalpha()])
            options_sorted = options_tuned_according_to_need
        except:
            options_sorted = options_sorted_according_to_keysetters
        while True:
            if random.random() <= probability:
                return options_sorted[0]
            else:
                ## cycle
                first = options_sorted.pop(0)
                options_sorted.append(first)
    except:
        return default_word
def _get_ok_words(self, astring, justnouns=False):
    if justnouns:
        oktags = ["NN", "NNS"]
    else:
        oktags = ['NN', "VB", "JJ", "AD", "NNS"]
    okwords = [
        token for token, tag in pos_tag(tokenize.casual_tokenize(astring))
        if tag[:2] in oktags
    ]
    okwords = [token for token in okwords if token not in stops]
    okwordsstrict = [token for token in okwords if spell.check(token)]
    if okwordsstrict != []:
        return okwordsstrict
    else:
        return okwords
def casual_tokenizer(text):
    """
    Perform word tokenization using casual_tokenize
    :param text: string without punctuation
    :return: list of lemmatized tokens
    """
    tokens = [word for word in casual_tokenize(rm_punctuation(text),
                                               preserve_case=False,
                                               reduce_len=True,
                                               strip_handles=True)]
    wordnet_lemmatizer = WordNetLemmatizer()
    stems = [wordnet_lemmatizer.lemmatize(item) for item in tokens]
    # stemmer = PorterStemmer()  # mild stemmer compared to other types
    # stems = [stemmer.stem(item) for item in tokens]
    return stems
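# rm_punctuation is not defined in the snippet above; a minimal stand-in (an
# assumption, not the original helper) that strips ASCII punctuation before the
# casual_tokenize call:
import string

def rm_punctuation(text):
    # remove every character listed in string.punctuation
    return text.translate(str.maketrans('', '', string.punctuation))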
def get_similar_suggestions(self, suggestions, original_tokens):
    # TODO add counter to prioritize similarity
    ret = []
    suggestions_tokens = []
    for suggestion in suggestions:
        suggestions_tokens.append(casual_tokenize(suggestion))
    for token in original_tokens:
        for suggestion in suggestions_tokens:
            if token in suggestion:
                if suggestion not in ret:
                    ret.append(suggestion)
    # out = self.order_list_by_similarity(original_tokens, ret)
    return ret
def get_nltk_tokenized_corpus(data: pd.DataFrame, y_data: pd.Series, tokenize_format='casual'):
    x_data: pd.Series = data[comment_text]
    bags_of_words: List = []
    if tokenize_format == 'casual':
        for text in x_data.iteritems():
            bags_of_words.append(Counter(casual_tokenize(text[1])))
    elif tokenize_format == 'word':
        for text in x_data.iteritems():
            bags_of_words.append(Counter(word_tokenize(text[1])))
    else:
        raise Exception('Please select a valid tokenizer')
    df = pd.DataFrame.from_records(bags_of_words)
    df = df.fillna(0).astype(int)
    x_train, x_test, y_train, y_test = train_test_split(df, y_data, random_state=random_state_0)
    return x_train, x_test, y_train, y_test
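# A minimal usage sketch. comment_text and random_state_0 are module-level names the
# function expects (hypothetical values below), and Series.iteritems() was removed in
# pandas 2.0, so this assumes an older pandas plus scikit-learn's train_test_split.
import pandas as pd

comment_text = 'comment_text'
random_state_0 = 0

frame = pd.DataFrame({'comment_text': ["Great post, thanks!",
                                       "I didn't like it :(",
                                       "Meh.",
                                       "Absolutely loved it!!!"]})
labels = pd.Series([0, 1, 1, 0])
x_train, x_test, y_train, y_test = get_nltk_tokenized_corpus(frame, labels)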
def __init__(self, ds, name="train"):
    fn = "tokens_" + name + ".dat"
    if os.path.exists(fn):
        self.tokens, self.cases = load(fn)
        return
    data = []
    cases = []
    print("Tokenizing:" + name)
    for f in ds.texts():
        ts = casual_tokenize(f, preserve_case=True)
        case = []
        tw = []
        for x in ts:
            tw.append(x)
            if x.upper() == x:
                case.append(2)   # no lower-case letters (all caps or non-alphabetic)
            elif x.lower() != x:
                case.append(1)   # mixed or title case
            else:
                case.append(0)   # all lower case
def stemming_message_snowball(message, stemmings_to_words=dict()):
    # note: the mutable default keeps accumulating mappings across calls
    # unless a dict is passed in explicitly
    from nltk.stem.snowball import SnowballStemmer
    from nltk.tokenize import casual_tokenize

    stemmer = SnowballStemmer('finnish')
    if message is None:
        return '', stemmings_to_words
    message = message.replace('#', '')
    stemmed_message = []
    for word in casual_tokenize(message):
        stemmed_word = stemmer.stem(word.lower())
        stemmed_message.append(stemmed_word)
        stemmings_to_words[stemmed_word] = word
    stemmed_message = ' '.join(stemmed_message)
    return stemmed_message, stemmings_to_words
# The start of this snippet is truncated in the source; the sample text below is the
# tail of a long troubleshooting answer assigned to `document`.
document = """this. Instructions on how to disable it here: http://technet.microsoft.com/en-us/library/cc778248(v=ws.10).aspx Next, if that doesn't resolve it, I would try running curl/wget on the server, and requesting the displayed URL. curl can be run with curl example.com and wget wget -qO- example.com, both will displayed the returned data (if any) on the terminal. If one returns a command not found, try the other. If that doesn't work, something is going on with your server. If it's returning something that looks like an error (e.g. a sever generated error page), I'd look into that too at this step. If you appear to have connectivity issues, you can see if there's any iptables rules in place by running iptables -L on the server. A DROP all under Chain INPUT would cause this. You can read more about iptables here, and how to set it up for your needs here: https://help.ubuntu.com/community/IptablesHowTo (Even if you're not using ubuntu, this will still work for you, look in the "Allowing Incoming Traffic on Specific Ports" section, there's an example there you would need to adopt slightly for the non-standard port the server is running on). If there's an external firewall preventing access, you would need to talk with whoever is managing the sever. """

sentences = nltk.sent_tokenize(document)
for sentence in sentences:
    tokens = casual_tokenize(sentence, preserve_case=False, reduce_len=True)
    # print(tokens)
    stoppath = "SmartStoplist.txt"
    with open("SmartStoplist.txt", 'r') as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    content = [x.strip() for x in content]
    stop_words = set(content)
    clean_tokens = [words for words in tokens if words not in stop_words]
    print(clean_tokens)
    tagged = nltk.pos_tag(clean_tokens)
    print(tagged)