def levenshtein_dist(message1, message2, stop_words=None, thr=0.5, substitution_cost=2):
    """
    Counts the number of words in the message, whose levenshtein_dist distance from any word in the "question" is smaller than threshold
      - tokens1 and tokens2 are tokenized messages/questions.. order should not matter
      - Threshold (thr) is relative to the word length (default 1/2 the length)
      - SW is an array of stop-words: if given the words present in this array are ignored
      - sub_cost is the cost for substitution in the calculation of levenshtein distance
    """

    tokens1 = tokenize.casual_tokenize(message1)
    tokens2 = tokenize.casual_tokenize(message2)

    dist = 0
    for t1 in tokens1:
        if stop_words is not None and t1 in stop_words:
            continue

        for t2 in tokens2:
            if stop_words is not None and t2 in stop_words:
                continue

            n = max(len(t2), len(t1))
            l = Lev(t2, t1, substitution_cost=substitution_cost)
            if l <= (n * thr):
                dist += 1

    return dist
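The snippet relies on two names it never defines; a minimal usage sketch, assuming `tokenize` is nltk.tokenize and `Lev` is nltk.edit_distance (which accepts a substitution_cost keyword). The example strings are mine.

from nltk import tokenize
from nltk.metrics.distance import edit_distance as Lev

# 'Where'~'where' and 'librery'~'library' survive the stop-word filter and
# fall under the relative threshold, so two pairs are counted
print(levenshtein_dist("Where is the librery?", "where is the library",
                       stop_words={"is", "the"}))  # -> 2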
Example #2
 def _get_tags(self, string, tagset='universal'):
     word_list = casual_tokenize(string)
     tag_list = pos_tag(word_list, tagset=tagset)
     out_list = []
     for _, item in tag_list:
         out_list.append(self.TAG_DICT[item])
     return out_list
Example #3
def word_hashtag_extraction_noun_based(tweet_dictionary):
    text = tweet_dictionary['text']
    # skip over retweets
    if text[0:4] == 'RT @':
        return [], []
    text = casual_tokenize(tweet_dictionary['text'])
    result = nltk.pos_tag(text)

    hashtags = ['#' + s for s in tweet_dictionary['hashtags']]
    urls = tweet_dictionary['urls']
    user_mentions = ['@' + s for s in tweet_dictionary['user_mentions']]
    ignore = hashtags + urls + user_mentions

    all_noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
    proper_noun_tags = ['NNP', 'NNPS']

    cleaned_text = {}
    for t in result:
        word = t[0]
        tag = t[1]
        if (tag in all_noun_tags) and (word not in ignore) and isEnglish(word):
            if tag in proper_noun_tags:
                cleaned_text[word.lower()] = PROPER_NOUN_WORD_TYPE
            else:
                cleaned_text[word.lower()] = REGULAR_NOUN_WORD_TYPE

    return cleaned_text, {h.lower() for h in hashtags}
Example #4
def tokenize(text):
    text = text.lower()
    tokens = casual_tokenize(text)
    stemmer = PorterStemmer()
    stems = [stemmer.stem(w) for w in tokens if w not in STOPWORDS]
    stems = [w for w in stems if w not in STOPWORDS]
    return stems
Example #5
def pray_with_simplification(ineed,
                             homog=True,
                             threshold=.67,
                             min_tokens=4,
                             max_deletions=3):
    """
    will try to find match, taking off a token until there is a match, i.e.

    I need to dance the rhumba at the Stork Club
    I need to dance the rhumba at the Stork
    I need to dance the rhumba at 
    I need to dance the rhumba 
    I need to dance the

    or not enough tokens left (after the first instance of "need")
    or max_deletions reached
    returns a prayer dict if successful
    returns None if unsuccessful
    """
    for i in range(max_deletions):
        ineed = homogenize(ineed)
        prayer = pray(ineed, homog=False)  ## ineed is already homogenized above
        if prayer["score"] >= threshold:
            return prayer
        ## delete last word-ish and make sure long enough
        deeni = ineed[::-1]  ## reverse
        ineed_new = deeni.split(" ", 1)[1][::-1]
        if ineed == ineed_new:
            return None
        ineed = ineed_new
        if len(tokenize.casual_tokenize(ineed.split("need", 1)[1])
               ) < min_tokens:  ## must have at least n tokens after "I need"
            return None
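The last-word deletion buried in the loop above (reverse the string, split off one chunk, reverse back) is easy to miss; a standalone sketch of the same trick, with a helper name of my own:

def drop_last_word(s):
    reversed_s = s[::-1]
    parts = reversed_s.split(" ", 1)   # split off the (reversed) last word
    if len(parts) < 2:                 # single word: nothing left to drop
        return s
    return parts[1][::-1]

print(drop_last_word("I need to dance the rhumba"))  # -> "I need to dance the"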
Example #6
def spell_tokenizer(text):
    """
    Perform word tokenization using casual_tokenize after spelling correction
    :param text: string without punctuation
    :return: list of tokens
    """
    tokens = []
    corrector = jamspell.TSpellCorrector()
    corrector.LoadLangModel('model_en.bin')

    for word in casual_tokenize(rm_punctuation(text), preserve_case=False, reduce_len=True, strip_handles=True):
        if not (bool(re.search(r'\d', word))):
            corr_word = corrector.GetCandidates([word], 0)
            if (len(corr_word) > 0) and (word != corr_word[0]):
                for candidate in corr_word[:1]:
                    tokens.append(candidate)
            else:
                tokens.append(word)

    wordnet_lemmatizer = WordNetLemmatizer()
    stems = [wordnet_lemmatizer.lemmatize(item) for item in tokens]
    # stemmer = PorterStemmer()
    # stems = [stemmer.stem(item) for item in tokens]

    return stems
Example #7
def word_hashtag_extraction_noun_based(tweet_dictionary):
    text = casual_tokenize(tweet_dictionary['tweet'])
    result = nltk.pos_tag(text)

    all_noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
    proper_noun_tags = ['NNP', 'NNPS']

    cleaned_text = {}
    for t in result:
        word = t[0]
        tag = t[1]

        # words
        if (tag in all_noun_tags) and isEnglish(word) and (len(word) >=
                                                           MIN_WORD_LEN):
            if tag in proper_noun_tags:
                cleaned_text[word.lower()] = PROPER_NOUN_WORD_TYPE
            else:
                cleaned_text[word.lower()] = REGULAR_NOUN_WORD_TYPE

    hashtags = tweet_dictionary['hashtags']
    cleaned_tags = set()
    for h in hashtags:
        if isEnglish(h[1:]):
            cleaned_tags.add(h.lower())

    return cleaned_text, cleaned_tags
Example #8
 def add_new_template(self,text):
     sent = random.choice(tokenize.sent_tokenize(text))
     print(sent)
     template = [tag for token,tag in pos_tag(tokenize.casual_tokenize(sent))]
     if template not in self.gen.templates:
         self.gen.templates.append(template)
     return template
Example #9
def parse_doc(doc):
    stemmer = SnowballStemmer("english")
    stems = []
    for word in casual_tokenize(doc):
        stem = stemmer.stem(word.lower())
        if stem.isalpha():
            stems.append(stem)
    return ' '.join(stems)
Example #10
    def _simple_chunking(self, astring, minimum=3, maximum=10):
        tagged = pos_tag(tokenize.casual_tokenize(astring))
        allpossible = []
        for n in range(minimum, maximum):
            allpossible += list(ngrams(tagged, n))

        ok = [c for c in allpossible if self._test_chunk(c)]
        return ok
Example #11
def top_tokens(text):
    freq_dict = defaultdict(int)
    tokens = casual_tokenize(text)

    for token in tokens:
        freq_dict[token] += 1

    return sorted(freq_dict, key=freq_dict.get, reverse=True)
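For reference, the defaultdict frequency count above does the same job as collections.Counter; a minimal alternative sketch:

from collections import Counter
from nltk.tokenize import casual_tokenize

def top_tokens_counter(text):
    # Counter.most_common() already returns tokens in descending frequency order
    return [token for token, _ in Counter(casual_tokenize(text)).most_common()]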
Example #12
def get_stem(lang, sentence):
    stemmer = SnowballStemmer(lang)
    stemmed = ''
    for word in casual_tokenize(sentence):
        word = stemmer.stem(word)
        stemmed = stemmed + word + ' '

    return stemmed
Example #13
def lemmatize(sentence):
    res = []
    lmtzr = WordNetLemmatizer()
    words = casual_tokenize(sentence, preserve_case=False, reduce_len=True, strip_handles=True)         # TweetTokenizer
    for word, pos in pos_tag(words):                                                                       # POS tagging
        wordnet_pos = get_wordnet_pos(pos) or wordnet.NOUN
        res.append(lmtzr.lemmatize(word, pos=wordnet_pos))                                     # lemmatize based on tags
    return res
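get_wordnet_pos is not shown in this snippet; a typical implementation (an assumption on my part, not the original author's code) maps Penn Treebank tag prefixes to WordNet POS constants:

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return None  # caller falls back to wordnet.NOUN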
Example #14
 def add_new_text_to_generator(self,text):
     tokenized_and_tagged = pos_tag(tokenize.casual_tokenize(text))
     print(tokenized_and_tagged)
     for token,tag in tokenized_and_tagged:
         self.gen.vocab[tag].append(token)
     print("vocablength:")
     print(len(list(itertools.chain(*self.gen.vocab.values()))))
     return len(list(set(self.gen.vocab)))
Example #15
 def get_compression_data(origin_data, output_labels, flag_data, name=''):
     data_split = casual_tokenize(origin_data)
     out_1 = []
     for data, pred_labels, oov_flag in zip(data_split, output_labels[1:], flag_data):
         if pred_labels == 1:
             # out_1.append(data if oov_flag != 3 else data+'(OOV)')
             out_1.append(data)
     print(name, end='\t\t\t')
     print(' '.join(out_1))
Example #16
def normalize_corpus_words(corpus, stemmer=stemmer, synonyms=SYNONYMS, stopwords=STOPWORDS):
    docs = [doc.lower() for doc in corpus]
    docs = [casual_tokenize(doc) for doc in docs]
    docs = [[synonyms.get(w, w) for w in words if w not in stopwords] for words in docs]
    if stemmer:
        docs = [[stemmer.stem(w) for w in words if w not in stopwords] for words in docs]
    docs = [[synonyms.get(w, w) for w in words if w not in stopwords] for words in docs]
    docs = [' '.join(w for w in words if w not in stopwords) for words in docs]
    return docs
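stemmer, SYNONYMS and STOPWORDS are module-level globals in the original and must exist before the def above runs (they are bound as keyword defaults); a usage sketch with placeholder values of my own:

from nltk.stem import PorterStemmer
from nltk.tokenize import casual_tokenize

STOPWORDS = {'a', 'an', 'the', 'of', 'to'}   # placeholder stop-word set
SYNONYMS = {'hi': 'hello'}                   # placeholder synonym map
stemmer = PorterStemmer()

corpus = ["Hi, the chatbot answered the question.",
          "Hello! A question was answered."]
print(normalize_corpus_words(corpus))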
Example #17
    def run(self, text, val):
        """
        TODO Improvements:
        1. casual_tokenize can't handle 'words-with-hyphens-like-this' & reduces coverage
        """

        # Remove new lines and turn to lower case
        # TODO what if only wanting to read first x lines, but that should only be for purposes of ML
        self.val = val

        text = re.sub('\n', ' ', text).lower()

        # Extract keyphrases using Rake
        # TODO also possible to extract keywords from sentence
        rake = Rake()
        if val == 'article':
            rake.extract_keywords_from_text(text)
        elif val == 'social':
            rake.extract_keywords_from_sentences(text)
        all_phrases = rake.get_ranked_phrases_with_scores()
        word_freq_dist = rake.get_word_frequency_distribution()

        # Tokenize text
        article_text_tokenized = casual_tokenize(text)

        # Tokenize phrases
        all_phrases_tokenized = self.tokenize_phrases(all_phrases)

        # Tag all phrases and remove all but noun words
        all_phrases_tagged = self.pos_tag_phrase_pairs(all_phrases)
        all_phrases_tagged_nouns = self.filter_nouns(all_phrases_tagged)

        # Convert list of tagged nouns back to a string phrase
        string_phrases_nouns = self.tuple_list_to_string_list(
            all_phrases_tagged_nouns)

        # Get the indexes from the non-filtered suggested phrases in the original text
        all_surrounding_tokens, all_context_tokens = self.get_all_surrounding_tokens(
            all_phrases_tokenized, article_text_tokenized)

        # Get wikipedia urls for top 5 phrases
        mapping_list = self.get_wiki_urls_top_n_phrases(
            string_phrases_nouns, all_surrounding_tokens, 10)

        # Return mapping to console
        wiki_mapping = self.write_suggestions_to_json(mapping_list)
        # print(json.dumps(wiki_mapping))

        # Get page links on medium by phrase
        medium_mapping = self.get_n_listed_medium_posts(
            string_phrases_nouns, 2)
        # print(json.dumps(medium_mapping))

        # Combine jsons
        mapping = self.combine_mappings(wiki_mapping, medium_mapping)
        print(json.dumps(mapping))
Example #18
def tokenize(text, corpus=tfidf_dense):
    docs = [text.lower()]
    docs = [casual_tokenize(doc) for doc in docs]
    docs = [[SYNONYMS.get(w, w) for w in words if w not in STOPWORDS] for words in docs]
    stemmer = PorterStemmer()
    docs = [[stemmer.stem(w) for w in words if w not in STOPWORDS] for words in docs]
    docs = [[SYNONYMS.get(w, w) for w in words if w not in STOPWORDS] for words in docs]
    docs = [' '.join(w for w in words if w not in STOPWORDS) for words in docs]
    stems = [w for w in docs[0].split() if w in corpus.columns]
    return stems
Example #19
def clean_tweet(tweet):
    tweet = re.sub(r"https?://\S+", "", tweet)
    # tweet = re.sub(URLS, "", tweet)
    toks = casual_tokenize(tweet,
                           preserve_case=False,
                           reduce_len=True,
                           strip_handles=True)
    # toks = [''.join(c for c in s if c not in punctuation) for s in toks]
    toks = [s for s in toks if s]
    return toks
Example #20
def prop_name_count(message, names):
    """
    Count how many tokens in the tokenized message appear in `names`, a list of proper names.
    """
    tokens = tokenize.casual_tokenize(message)
    d = 0
    for token in tokens:
        if token.capitalize() in names:
            d += 1
    return d
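A quick usage sketch (the example strings are mine); the capitalize() call is what lets lower-cased tokens match the proper names:

from nltk import tokenize

print(prop_name_count("alice emailed bob about the meeting",
                      ["Alice", "Bob"]))  # -> 2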
Example #21
def preprocess(text, name, speaker='U', first_name=None):
    """ normalize and tokenize raw text 
    args:
        text: input raw text (str)
        name: user name (str)
        first_name: user's first name (str)
        sys_utt: flag if this is a sysmtem's turn (bool)
    return:
        normalized text (str)
    """
    # modify apostrophe character
    text = re.sub(u'’', "'", text)
    text = re.sub(u'(“|”)', '', text)
    # remove handle names in the beginning
    text = re.sub(r'^(@[A-Za-z0-9_]+[\.;, ])+', '', text)
    # remove connected tweets indicator e.g. (1/2) (2/2)
    text = re.sub(r'(^|[\(\[ ])[1234]\/[2345]([\)\] ]|$)', ' ', text)
    # replace long numbers
    text = re.sub(r'(?<=[ A-Z])(\+\d|\d\-|\d\d\d+|\(\d\d+\))[\d\- ]+\d\d\d',
                  '<NUMBERS>', text)
    # replace user name in system response
    if speaker == 'S':
        if name:
            text = re.sub('@' + name, '<USER>', text)
        if first_name:
            text = re.sub('(^|[^A-Za-z0-9])' + first_name + '($|[^A-Za-z0-9])',
                          '\\1<USER>\\2', text)

    # tokenize and replace entities
    words = casual_tokenize(text, preserve_case=False, reduce_len=True)
    for n in six.moves.range(len(words)):
        token = words[n]
        # replace entities with tags (E-MAIL, URL, NUMBERS, USER, etc)
        token = re.sub(r'^([a-z0-9_\.\-]+@[a-z0-9_\.\-]+\.[a-z]+)$',
                       '<E-MAIL>', token)
        token = re.sub(r'^https?:\S+$', '<URL>', token)
        token = re.sub('^<numbers>$', '<NUMBERS>', token)
        token = re.sub('^<user>$', '<USER>', token)
        # make spaces for apostrophe and period
        token = re.sub(r'^([a-z]+)\'([a-z]+)$', '\\1 \'\\2', token)
        token = re.sub(r'^([a-z]+)\.([a-z]+)$', '\\1 . \\2', token)
        words[n] = token
    # join
    text = ' '.join(words)
    # remove signature of tweets (e.g. ... ^TH, - John, etc.)
    if speaker == 'S':
        text = re.sub(
            u'[\\^\\-~–][\\-– ]*([a-z]+\\s*|[a-z ]{2,8})(<URL>\\s*$|\\.\\s*$|$)',
            '\\2', text)
        if not re.search(r' (thanks|thnks|thx)\s*$', text):
            text = re.sub(u'(?<= [\\-,!?.–])\\s*[a-z]+\\s*$', '', text)

    return text
Example #22
def parse_file(filename):
    tree = etree.parse(filename)
    # iterate over documents
    for doc in tree.getroot():
        # iterate over paragraphs (<P> elements)
        for paragraph in doc.find('TEXT'):
            text = paragraph.text
            # Tokenize and split into sentences
            for s in sent_tokenize(text):
                tok = casual_tokenize(s.replace("\n", " "))
                if len(tok) < LEN_LIMIT:
                    print(" ".join(tok))
Example #23
def get_sentiment(classifier, tweets, keep_status=True):
	'''
	Takes tweets as a list of dictionaries.
	Returns tweets as a list of dictionaries with sentiment labels.

	Arguments:
		classifier: NLTK Naive Bayes Classifier object.
		tweets: Tweets as a list of dictionaries.
		keep_status: Set as False to remove tweet status and reduce data size, 
			keeping sentiment distribution and label.
	Returns:
		tweets: Tweets as a list of dictionaries.
	'''

	print('Starting text analysis...')
	print('Scoring tweets...')
	# get positive and negative probabilities for each tweet
	for tweet in tweets:
		custom_tokens = helpers.remove_noise(casual_tokenize(tweet['status']))
		dist = classifier.prob_classify(dict([token, True] for token in custom_tokens))

		# append probabilities to list
		pos_probability = dist.prob('Positive')
		neg_probability = dist.prob('Negative')

		# add sentiment probabilities to tweet dictionary
		try:
			tweet['positive'] = pos_probability
			tweet['negative'] = neg_probability
		except Exception as e:
			print(e)

		# add sentiment label to tweet dictionary
		if pos_probability >= 0.9:
			tweet['label'] = 'Very Positive'
		elif pos_probability >= 0.7:
			tweet['label'] = 'Positive'
		elif pos_probability > 0.3 and neg_probability > 0.3:
			tweet['label'] = 'Neutral'
		elif neg_probability >= 0.9:
			tweet['label'] = 'Very Negative'
		elif neg_probability >= 0.7:
			tweet['label'] = 'Negative'
		else:
			tweet['label'] = 'None'

		# optional: remove tweet status to reduce data size (keeping sentiment distribution and label)
		if tweet['status'] and keep_status == False:
			del tweet['status']

	print('Text analysis complete!\n')

	return tweets
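The function expects an NLTK Naive Bayes classifier trained on token-presence feature dicts, plus the project's helpers.remove_noise; a toy sketch of a compatible classifier (my own data, not the author's training pipeline):

from nltk import NaiveBayesClassifier
from nltk.tokenize import casual_tokenize

train = [
    ({t: True for t in casual_tokenize("love this, great service")}, 'Positive'),
    ({t: True for t in casual_tokenize("awful experience, never again")}, 'Negative'),
]
classifier = NaiveBayesClassifier.train(train)
dist = classifier.prob_classify({t: True for t in casual_tokenize("great support today")})
print(dist.prob('Positive'), dist.prob('Negative'))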
Example #24
def get_word_based_on_pos_and_vector(pos,
                                     keysetters,
                                     ineed,
                                     threshold=.42,
                                     probability=.8,
                                     top_n=7):
    """
    a confusing function
    tries to find a word of a specific pos that is similar to one or more words, called keysetters
    this similarity is cosine distance on word vectors, with a threshold applied 
    ranks the top_n of these similar-enough words according to second comparison: max similarity to word in ineed statement
    if this fails, simply orders list by proximity of words to keysetters
    if this fails, simply return the first keysetter, assumed to be a valid input
    returns one option, using randomness to add variation

    so, for instance, given the pos "NN", the keysetter ["food"], and the statement: "I need some sushi"
    this function should try to create a list of words like "bread," "meat," and "fish"
    and return "fish" because it is close to "sushi"
    """
    default_word = keysetters[0]
    options = myvocabulary_filtered[pos]
    #options = [w.split(">")[0] for w in options]## this is not necessary; already filtered
    options = list(set(options))

    ## weird thing to maybe just get one keysetter, some of the time
    if random.choice([True, True, False]):
        keysetters = [random.choice(keysetters)]

    try:
        options_sorted_according_to_keysetters = tune_one_list_according_to_max_similarity_to_another_list(
            options, keysetters, threshold)
        try:
            ### must have at least n
            options_tuned_according_to_need = tune_one_list_according_to_max_similarity_to_another_list(
                options_sorted_according_to_keysetters[:top_n], [
                    w.lower()
                    for w in tokenize.casual_tokenize(ineed) if w.isalpha()
                ])
            options_sorted = options_tuned_according_to_need
        except:
            options_sorted = options_sorted_according_to_keysetters
        while True:
            if random.random() <= probability:
                return options_sorted[0]
            else:
                ## cycle
                first = options_sorted.pop(0)
                options_sorted.append(first)
    except:
        return default_word
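The selection at the end of the try block (return the front option with some probability, otherwise rotate it to the back and retry) can be isolated; a standalone sketch with a helper name of my own:

import random

def pick_with_bias(options, probability=.8):
    options = list(options)
    while True:
        if random.random() <= probability:
            return options[0]
        options.append(options.pop(0))  # cycle the front item to the back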
Example #25
 def _get_ok_words(self, astring, justnouns=False):
     if justnouns:
         oktags = ["NN", "NNS"]
     else:
         oktags = ['NN', "VB", "JJ", "AD", "NNS"]
     okwords = [
         token for token, tag in pos_tag(tokenize.casual_tokenize(astring))
         if tag[:2] in oktags
     ]
     okwords = [token for token in okwords if token not in stops]
     okwordsstrict = [token for token in okwords if spell.check(token)]
     if okwordsstrict != []:
         return okwordsstrict
     else:
         return okwords
Example #26
def casual_tokenizer(text):
    """
    Perform word tokenization using casual_tokenize
    :param text: string without punctuation
    :param if_cap: remove strings with numbers when appearance rate is not capped
    :return: list of tokens
    """
    tokens = casual_tokenize(rm_punctuation(text), preserve_case=False, reduce_len=True, strip_handles=True)

    wordnet_lemmatizer = WordNetLemmatizer()
    stems = [wordnet_lemmatizer.lemmatize(item) for item in tokens]

    # stemmer = PorterStemmer()    # Mild stemmer comparing to other type
    # stems = [stemmer.stem(item) for item in tokens]

    return stems
Example #27
    def get_similar_suggestions(
            self, suggestions,
            original_tokens):  # TODO add counter to prioritize similarity
        ret = []
        suggestions_tokens = []
        for suggestion in suggestions:
            suggestions_tokens.append(casual_tokenize(suggestion))

        for token in original_tokens:
            for suggestion in suggestions_tokens:
                if token in suggestion:
                    if suggestion not in ret:
                        ret.append(suggestion)

        # out = self.order_list_by_similarity(original_tokens, ret)
        return ret
Example #28
def get_nltk_tokenized_corpus(data: pd.DataFrame,
                              y_data: pd.Series,
                              tokenize_format='casual'):
    x_data: pd.Series = data[comment_text]
    bags_of_words: List = []
    if tokenize_format == 'casual':
        for text in x_data.iteritems():
            bags_of_words.append(Counter(casual_tokenize(text[1])))
    elif tokenize_format == 'word':
        for text in x_data.iteritems():
            bags_of_words.append(Counter(word_tokenize(text[1])))
    else:
        raise Exception('Please select valid Tokenizer')

    df = pd.DataFrame.from_records(bags_of_words)
    df = df.fillna(0).astype(int)
    x_train, x_test, y_train, y_test = train_test_split(
        df, y_data, random_state=random_state_0)
    return x_train, x_test, y_train, y_test
Example #29
 def __init__(self, ds, name="train"):
     fn = "tokens_" + name + ".dat"
     if os.path.exists(fn):
         self.tokens, self.cases = load(fn)
         return
     data = []
     cases = []
     print("Tokenizing:" + name)
     for f in ds.texts():
         ts = casual_tokenize(f, preserve_case=True)
         case = []
         tw = []
         for x in ts:
             tw.append(x)
             if x.upper() == x:
                 case.append(2)
             elif x.lower() != x:
                 case.append(1)
             else:
                 case.append(0)
         data.append(tw)        # (reconstructed; the snippet is truncated in the source)
         cases.append(case)
     # remainder truncated in the source; presumably self.tokens and self.cases
     # are built from data/cases here and cached to `fn`
Example #30
def stemming_message_snowball(message, stemmings_to_words=dict()):
    from nltk.stem.snowball import SnowballStemmer
    from nltk.tokenize import casual_tokenize
    stemmer = SnowballStemmer('finnish')

    if message is None:
        return '', stemmings_to_words

    message = message.replace('#', '')

    stemmed_message = []

    for word in casual_tokenize(message):

        stemmed_word = stemmer.stem(word.lower())
        stemmed_message.append(stemmed_word)
        stemmings_to_words[stemmed_word] = word

    stemmed_message = ' '.join(stemmed_message)

    return stemmed_message, stemmings_to_words
# the opening of this sample document string is truncated in the source
document = """ ... this. Instructions on how to disable it here: http://technet.microsoft.com/en-us/library/cc778248(v=ws.10).aspx Next, 
if that doesn't resolve it, I would try running curl/wget on the server, and requesting the displayed URL. curl can 
be run with curl example.com and wget wget -qO- example.com, both will displayed the returned data (if any) on the 
terminal. If one returns a command not found, try the other. If that doesn't work, something is going on with your 
server. If it's returning something that looks like an error (e.g. a sever generated error page), I'd look into that 
too at this step. If you appear to have connectivity issues, you can see if there's any iptables rules in place by 
running iptables -L on the server. A DROP all under Chain INPUT would cause this. You can read more about iptables 
here, and how to set it up for your needs here: https://help.ubuntu.com/community/IptablesHowTo (Even if you're not 
using ubuntu, this will still work for you, look in the "Allowing Incoming Traffic on Specific Ports" section, 
there's an example there you would need to adopt slightly for the non-standard port the server is running on). If 
there's an external firewall preventing access, you would need to talk with whoever is managing the sever. """

sentences = nltk.sent_tokenize(document)

for sentence in sentences:
    tokens = casual_tokenize(sentence, preserve_case=False, reduce_len=True)
    # print tokens
    stoppath = "SmartStoplist.txt"

    with open("SmartStoplist.txt", 'r') as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    content = [x.strip() for x in content]

    stop_words = set(content)

    clean_tokens = [words for words in tokens if words not in stop_words]
    print(clean_tokens)

    tagged = nltk.pos_tag(clean_tokens)
    print(tagged)