Example #1
def bag_of_words(tr_tweets,
                 te_tweets,
                 tr_targets=pd.Series(),
                 te_targets=pd.Series(),
                 per_target=False,
                 max_feats=None,
                 normalise_counts=False,
                 **kwargs):
    """
    Calculate bag-of-words representations of train and test tweets
    :param tr_tweets: pandas Series of strings, raw texts to convert (from train set)
    :param te_tweets: pandas Series of strings, raw texts to convert (from test set)
    :param tr_targets: pandas Series of strings, target classes (from train set)
    :param te_targets: pandas Series of strings, target classes (from test set)
    :param per_target: bool, whether to find separate BoW repr for each target class
    :param max_feats: int, maximum number of words/ngrams to keep, number of dimensions
    in returned feature matrices
    :param normalise_counts: bool, whether to divide the counts within each tweet by the
    number of tokens (not for Multinomial NB)
    :param kwargs: to be passed onto sklearn CountVectorizer
    :return: tuple, training feature matrix, test feature matrix, list of feature names
    (with '_bow' appended to each)
    """

    if per_target and not tr_targets.empty and not te_targets.empty:
        # Create different BoW for each target
        # Only useful when using max_features (needed here to preallocate the arrays),
        # as the most common words/n-grams may come from only one or two of the targets
        x_tr = np.zeros((tr_tweets.shape[0], max_feats), dtype=np.int64)
        x_te = np.zeros((te_tweets.shape[0], max_feats), dtype=np.int64)
        for _targ in tr_targets.unique():
            word_bagger = text_sk.CountVectorizer(max_features=max_feats,
                                                  **kwargs)
            x_tr[(tr_targets == _targ).values] = \
                word_bagger.fit_transform(tr_tweets[(tr_targets == _targ).values].values).toarray()
            x_te[(te_targets == _targ).values] = \
                word_bagger.transform(te_tweets[(te_targets == _targ).values].values).toarray()
    else:
        word_bagger = text_sk.CountVectorizer(max_features=max_feats, **kwargs)
        x_tr = word_bagger.fit_transform(tr_tweets).toarray()
        x_te = word_bagger.transform(te_tweets).toarray()

    if normalise_counts:
        # Normalise counts by the number of tokens in each tweet
        tr_tweet_lens = tr_tweets.apply(
            tokenize.TweetTokenizer().tokenize).apply(len)
        te_tweet_lens = te_tweets.apply(
            tokenize.TweetTokenizer().tokenize).apply(len)
        x_tr = np.divide(x_tr, tr_tweet_lens.values[:, np.newaxis])
        x_te = np.divide(x_te, te_tweet_lens.values[:, np.newaxis])
    return x_tr, x_te, [
        _fn + '_bow' for _fn in word_bagger.get_feature_names()
    ]
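
A minimal call on hypothetical toy data might look like the sketch below (it assumes the module-level imports the function already relies on: pandas as pd, numpy as np, sklearn.feature_extraction.text as text_sk, and nltk's tokenize):

tr = pd.Series(["great phone , love it", "terrible battery life"])
te = pd.Series(["love the battery"])
# shared vocabulary, raw counts; ngram_range is forwarded to CountVectorizer via **kwargs
x_tr, x_te, names = bag_of_words(tr, te, max_feats=20, ngram_range=(1, 2))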
Example #2
def tokenizer(tweets):
    tokens = list()
    tk = tokenize.TweetTokenizer(strip_handles=True,
                                 reduce_len=True,
                                 preserve_case=False)
    for tweet in tweets:
        try:
            element = tk.tokenize(tweet)
        except UnicodeDecodeError:
            element = []
        tokens.append(element)
    return tokens
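
A toy call illustrating what the chosen tokenizer options do: handles are stripped, character elongations are reduced, words are lowercased while emoticons keep their case (output approximate):

tokenizer(["@nltk_org I loooooove this!!! :-D"])
# → [['i', 'looove', 'this', '!', '!', '!', ':-D']]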
Example #3
def _get_word2vec():
    '''
    Loads the word2vec model from the {WORD2VEC_MODEL_FILE} filepath. If the file cannot
    be found, creates a new model by going through the top posts of all ccsr subreddits.

    :return: A gensim word2vec model centered on ccsr subreddits
    '''
    if os.path.isfile(WORD2VEC_MODEL_FILE):
        logging.info('Loading word2vec model from file {0} ...'.format(
            WORD2VEC_MODEL_FILE))
        return Word2Vec.load(WORD2VEC_MODEL_FILE)

    word_tokenizer = tokenize.TweetTokenizer()
    sentences = []

    def parse_comment(comment, subreddit_sentences):
        subreddit_sentences.append(word_tokenizer.tokenize(comment.body))
        for reply in comment.replies:
            parse_comment(reply, subreddit_sentences)

    for ccsr in CRYPTOCURRENCY_SUBREDDITS:
        logging.info(
            "Compiling submission titles and comments from subreddit '{0}' ..."
            .format(ccsr))
        subreddit = reddit.subreddit(ccsr)
        subreddit_sentences = []
        for submission in subreddit.top(limit=None):  # no cap; the break below stops early

            logging.info("Looking at submission '{0}'".format(
                submission.title))
            subreddit_sentences.append(
                word_tokenizer.tokenize(submission.title))
            submission.comments.replace_more(limit=0)
            for comment in submission.comments:
                parse_comment(comment, subreddit_sentences)

            logging.info(
                "Collected {0} sentences ... {1} in training set.".format(
                    len(subreddit_sentences),
                    len(subreddit_sentences) + len(sentences)))
            if len(subreddit_sentences
                   ) > NUM_TRAINING_SUBMISSIONS_PER_SUBREDDIT:
                sentences.extend(subreddit_sentences)
                break

    logging.info('Training model on {0} sentences ...'.format(len(sentences)))
    model = Word2Vec(sentences, size=200, window=5, min_count=5, workers=4)
    model.save(WORD2VEC_MODEL_FILE)
    return model
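
Once loaded or trained, the model can be queried with the usual gensim API, e.g. (a sketch that assumes 'bitcoin' made it into the min_count-filtered vocabulary):

w2v = _get_word2vec()
print(w2v.wv.most_similar('bitcoin', topn=5))  # nearest neighbours in the subreddit corpus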
Example #4
    def tokenize(self, sentences, task_ids):
        # nltk TweetTokenizer for stance
        tweet_tokenizer = tokenize.TweetTokenizer()

        # nltk WordPunctTokenizer for NLI
        punct_tokenizer = tokenize.WordPunctTokenizer()

        all_sentence = []
        for sentence, task_id in zip(sentences, task_ids):
            if task_id == 0:  # stance
                tokenize_sent = tweet_tokenizer.tokenize(sentence)
            elif task_id == 1:  # NLI
                tokenize_sent = punct_tokenizer.tokenize(sentence)

            all_sentence.append(tokenize_sent)

        return all_sentence
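
The two tokenizers used by the method above split the same string differently, which is why it dispatches on task_id; a standalone sketch (outputs approximate):

from nltk import tokenize
tokenize.TweetTokenizer().tokenize("@user can't wait!! :-)")
# → ['@user', "can't", 'wait', '!', '!', ':-)']
tokenize.WordPunctTokenizer().tokenize("@user can't wait!! :-)")
# → ['@', 'user', 'can', "'", 't', 'wait', '!!', ':-)']
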
def get_tweets_with_emoji(tweets, emojis_ours, emojis_theirs, emojis_popular):
    """Get all tweets with emoji in the sets

    Args:
        tweets: List of Tweets
        emojis_ours: Emoji vectors trained on our model
        emojis_theirs: Emoji vectors trained on an external model
        emojis_popular: List of popular emojis

    Returns:
        All tweets containing emoji

    """
    tokenizer = tk.TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    ems = list()
    for tweet in tweets:
        if get_emojis_in_tweet(tweet, emojis_ours, emojis_theirs, emojis_popular, tokenizer):
            ems.append(tweet)
    return ems
Example #6
def sentenceTransform(sentenceList):
    #Given a list of sentences, return a cleaned, tokenized sentence list with placeholder tokens for users, URLs, numbers and hashtags
    token_sentence_list = []
    tknzr = tokenize.TweetTokenizer()

    for sentence in sentenceList:
        sentence = sentence.lower()  # to lowercase
        sentence = tknzr.tokenize(sentence)  #tokenize
        token_sentence_list.append(sentence)

    for sentence in token_sentence_list:
        for idx, word in enumerate(sentence):
            if word == '@user':
                sentence[idx] = '<user>'
            if word == 'url':
                sentence[idx] = '<url>'
            if word.isdigit():
                sentence[idx] = '<number>'
            if word[0] == '#':
                sentence[idx] = '<hashtag>'

    return token_sentence_list
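
A toy call to sentenceTransform showing the placeholder substitution (output approximate):

sentenceTransform(["@USER shared a URL about #Crypto in 2021"])
# → [['<user>', 'shared', 'a', '<url>', 'about', '<hashtag>', 'in', '<number>']]
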
def prepare_tweet_vector_averages(tweets, p2v):
    """Take the vector sum of all tokens in each tweet

    Args:
        tweets: All tweets
        p2v: Phrase2Vec model

    Returns:
        Average vector for each tweet
        Ground-truth label for each tweet
    """
    tokenizer = tk.TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

    avg_vecs = list()
    y = list()

    for tweet in tweets:
        tokens = tokenizer.tokenize(tweet.text)
        avg_vecs.append(np.sum([p2v[x] for x in tokens], axis=0) / len(tokens))
        y.append(tweet.label)

    return avg_vecs, y
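
One way to exercise prepare_tweet_vector_averages on toy inputs; the FakeTweet namedtuple and the plain dict standing in for the Phrase2Vec model are hypothetical, purely for illustration:

from collections import namedtuple

FakeTweet = namedtuple('FakeTweet', ['text', 'label'])  # hypothetical stand-in for the Tweet type
toy_p2v = {w: np.random.rand(300) for w in ['good', 'coffee', ':-)']}  # hypothetical word vectors
vecs, labels = prepare_tweet_vector_averages([FakeTweet('good coffee :-)', 'positive')], toy_p2v)
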
def predict_nltk_twitter_tokenizer(sentences: list) -> list:
    """
    Predict all sentences sequentially.

    Parameters
    ----------
    sentences: list
        List of strings with pre-processed sentences.

    Returns
    -------
    list:
        List of predicted tokens for each sentence.
    """
    pred_tokens = []

    tokenizer = tokenize.TweetTokenizer()

    for sentence in sentences:
        pred_tokens.append(tokenizer.tokenize(sentence))

    return pred_tokens
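
A toy call (output approximate):

predict_nltk_twitter_tokenizer(["This is a cooool #dummysmiley: :-)"])
# → [['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)']]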
Example #9
import gensim.models as gsm
from parse_recipes import load_recipes
from config import cfg
import phrase2vec as p2v
import numpy as np
import json
import pickle

import nltk.tokenize as tk
tokenizer = tk.TweetTokenizer(preserve_case=False,
                              reduce_len=True,
                              strip_handles=True)


def create_recipe_vectors(p2v_our_emoji):
    """
		Create recipe vectors by averaging embeddings of inredients and saving the result
	"""
    recipes = load_recipes()
    recipes_clean = {}

    for key, value in recipes.items():
        ingredients_tokens = [
            tokenizer.tokenize(ingredient)
            for ingredient in value['ingredients_clean']
        ]
        ingredients_tokens_flat = [
            ingredient for sublist in ingredients_tokens
            for ingredient in sublist
        ]
Example #10
class TweetTokenizer(BaseTokenizer):
    """ Pre-trained tokenizer for tweets. """
    tokenizer = tokenize.TweetTokenizer()
    name = 'Tweet'
Example #11
File: data.py  Project: Ali-Omrani/NTAP
import copy, inspect
from scipy.spatial.distance import cosine

stem = SnowballStemmer("english").stem

link_re = re.compile(r"(http(s)?[^\s]*)|(pic\.[^\s]*)")
hashtag_re = re.compile(r"#[a-zA-Z0-9_]+")
mention_re = re.compile(r"@[a-zA-Z0-9_]+")

pat_type = {'links': link_re,
            'hashtags': hashtag_re,
            'mentions': mention_re}

tokenizers = {'treebank': nltk_token.TreebankWordTokenizer().tokenize,
              'wordpunct': nltk_token.WordPunctTokenizer().tokenize,
              'tweettokenize': nltk_token.TweetTokenizer().tokenize}
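
These patterns and tokenizers are presumably applied along the following lines elsewhere in data.py (a sketch, not the project's actual pipeline):

raw = "Check this out https://t.co/abc #nlp @someone"
for _name, pat in pat_type.items():
    raw = pat.sub("", raw)                    # strip links, hashtags, mentions
tokens = tokenizers['tweettokenize'](raw)     # → ['Check', 'this', 'out']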

def read_file(path):
    if not os.path.exists(path):
        raise ValueError("Path does not point to existing file: {}".format(path))
    ending = path.split('.')[-1]
    if ending == 'csv':
        return pd.read_csv(path)
    elif ending == 'tsv':
        return pd.read_csv(path, delimiter='\t')
    elif ending == 'pkl':
        return pd.read_pickle(path)
    elif ending == 'json':
        return pd.read_json(path)
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering

DATA_PATH = config.get_data_path()
STORIES_FILE = DATA_PATH / 'stories_processed.csv'

stories = pd.read_csv(STORIES_FILE,
                      index_col=0,
                      converters={
                          'all_text': eval,
                          'font_size': eval,
                          'guids': eval
                      })

ft_model_path = DATA_PATH / '../fasttext' / 'cc.ru.300.bin'

model = fasttext.load_model(str(ft_model_path))
tokenizer = tokenize.TweetTokenizer()

vectors = []
for text in stories['all_text']:
    text = ' '.join(text).lower()
    text = tokenizer.tokenize(text)
    vector = np.array([model[word] for word in text]).mean(axis=0)
    vectors.append(vector)

cluster_model = AgglomerativeClustering(n_clusters=8)
clusters = cluster_model.fit_predict(vectors)
stories['clusters'] = clusters
stories.to_csv(STORIES_FILE)
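
A quick, purely illustrative way to eyeball the grouping afterwards:

print(stories['clusters'].value_counts())  # size of each of the 8 clusters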
Example #13
class TweetTokenizer(BaseTokenizer):
    """ 预训练的推特分词器.保留表情符号. This example. :-) #simple → (This), (example), (.), (:-)), (#simple) """
    tokenizer = tokenize.TweetTokenizer()
    name = '推特分词'
Example #14
VERSION_STR = 'v1.0.0'

import db
import time
from error import Error
from flask import Blueprint, request, jsonify, json, g

blueprint = Blueprint(VERSION_STR, __name__)

import nltk.data
from nltk import tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

VADER_SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
WORD_TOKENIZER = tokenize.TweetTokenizer()
SENT_TOKENIZER = nltk.data.load('tokenizers/punkt/english.pickle')
PARA_TOKENIZER = tokenize.BlanklineTokenizer()


def score_word(word):
    return VADER_SENTIMENT_ANALYZER.lexicon.get(word, 0.0)


def compute_sentiment_record(text):
    sentiment_record = {'text': text}
    sentiment_record.update(VADER_SENTIMENT_ANALYZER.polarity_scores(text))
    return sentiment_record
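
A hedged sketch of how these helpers can be combined (illustrative only, not necessarily how the endpoints below use them):

text = 'The food was great but the service was awful.'
record = compute_sentiment_record(text)  # 'text' plus VADER neg/neu/pos/compound scores
word_scores = [(w, score_word(w)) for w in WORD_TOKENIZER.tokenize(text.lower())]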


def r_remove_key(o, keys_to_remove):
    if hasattr(o, 'iteritems'):
Example #15
# -*- coding: utf-8 -*-
"""
Created on Mon Apr  9 17:25:19 2018

@author: miaoji
"""
import nltk.tokenize as nt
from textblob import TextBlob
import time

start_time = time.time()

in_file = open("/data/zhangbin/caozhaojun/true_procress_data/daodao_en.txt",
               'r')
out_file = open("handle_daodao_en.txt", 'a+')
tokenizer = nt.TweetTokenizer()

line_id = 0
for line in in_file.readlines():
    line_id += 1
    if line_id % 1000 == 0:
        print(line_id)
    correct_line = TextBlob(line.lower().replace('...',
                                                 ' ').strip())  #.correct()
    token_line = correct_line.tokenize(tokenizer)
    final_line = ' '.join([word for word in token_line])
    out_file.write(final_line + '\n')
in_file.close()
out_file.close()

end_time = time.time()
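
The timing bookkeeping above can then be reported, for example:

print('Processed {0} lines in {1:.1f} seconds.'.format(line_id, end_time - start_time))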