Example #1
def tokenizer_features(features_tweets):
    import sys
    sys.path.append('src/ark-twokenize-py')

    from twokenize import tokenizeRawTweetText

    for row, tweet in features_tweets:
        row['features']['tokenizer'] = {
            'tokens':
            tokenizeRawTweetText(tweet.text.lower()),
            'tokens_without_entities':
            tokenizeRawTweetText(tweet.text_without_entities.lower()),
        }

        yield row, tweet
Example #2
def perprocessing(tdic):
    new_dic = {}
    POS_feature = []
    for line in tdic:
        id = line
        gt = tdic[line][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[line][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        # print(text_tk)
        print(text_tk)
        telist = []
        for word in text_tk:
            word = word.lower()
            # ps = nltk.stem.PorterStemmer()
            # word = ps.stem(word)
            telist.append(word)
        # print(telist)
        afterlemma = lemma(telist)
        telist = afterlemma[0]
        POS_feature.append(afterlemma[1])
        # print(telist)
        newtext = ' '.join(telist)
        # print(newtext)
        newtext = textPreprocessor01.replaceall(newtext)  # preprocess: map URLs, emoticons, etc. to placeholder tokens (e.g. URLINK, SADFACE)
        print(newtext)
        new_dic[id] = gt, newtext
    return new_dic, np.array(POS_feature)
Example #3
def get_twokens_rt_fave(filename):
    with open(filename) as fi:
        for line in fi:
            t = json.loads(line)
            lang = t["twitter_lang"]
            if lang == "en":
                try:
                    print "english"
                    text = t['body'].decode('utf8').lower()
                    #print text
                    text = URL_RE.sub(" ", text)
                    text = USERNAME_RE.sub(" ", text)
                    #print 'text', text
                    #words = re.findall(' (\w{3,})', text)
                    toks = twokenize.tokenizeRawTweetText(text)
                    print "toks", toks
                    if toks:
                        #print "has toks"
                        isReply = 1 if "inReplyTo" in t.keys() else 0
                        #import pdb; pdb.set_trace()

                        print(toks, t['retweetCount'], t['favoritesCount'],
                              isReply)
                        yield (toks, t['retweetCount'], t['favoritesCount'],
                               isReply)
                    else:
                        print "NO TOKS"

                except:
                    continue
Example #4
def get_clean_tokens(tweet):
    tokens = []  
    ctweet = _remove_tags(clean_tweet(tweet))
    #Replace this with TweetNLP tokenizer
    #tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(tweet.lower());
    tokens = twk.tokenizeRawTweetText(ctweet)
    return tokens
Example #5
    def next(self):
        tweet = self.tw_stream.next()

        if tweet is stream.End_Of_Stream:
            return stream.End_Of_Stream

        if tweet is None:
            return None

        t = tweet.timestamp
        uid = tweet.uid
        txt = tweet.str

        # lower case
        txt = txt.lower()

        # tokenize
        try:
            tokens = twokenize.tokenizeRawTweetText(txt)
        except:
            return None

        # filter
        tokens = filter(
            lambda x: (not stop_words.contains(x)) and
            (not _PUN_PATTERN.match(x)) and (len(x) <= 32) and (len(x) > 1),
            tokens)

        return stream.PreprocessedTweetItem(t, uid, tokens)
Example #6
def download():
    global twitter, arguments, tid_list
    with open(arguments.inputfile + "." + arguments.outputtype, "w") as fw:
        tid_number = len(tid_list)
        max_round = tid_number / MAX_LOOKUP_NUMBER + 1
        for i in range(max_round):
            tids = tid_list[i * MAX_LOOKUP_NUMBER:(i + 1) * MAX_LOOKUP_NUMBER]
            time.sleep(SLEEP_TIME)
            jobjs = twitter.lookup_status(id=tids)
            for jobj in jobjs:
                if arguments.outputtype == "json":
                    fw.write(json.dumps(jobj))
                elif arguments.outputtype == "IdTweet":
                    tweet = jobj["text"]
                    tid = jobj["id_str"]
                    fw.write(json.dumps({"id_str": tid, "text": tweet}))
                else:
                    tweet = jobj["text"]
                    tokens = twokenize.tokenizeRawTweetText(tweet)
                    tid = jobj["id_str"]
                    fw.write(
                        json.dumps({
                            "id_str": tid,
                            "text": " ".join(tokens)
                        }))
                fw.write("\n")
Example #7
def tokenize_tweet(tweet):
    tokens = []  
    ctweet = clean_tweet(tweet)
    #Replace this with TweetNLP tokenizer
    #tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(tweet.lower());
    tokens = twk.tokenizeRawTweetText(ctweet)
    return tokens
Example #8
    def extract(self, corpus, *args, **kwargs):
        """
        Extract all the named entities from the corpus.
        The output is a list of lists.
        Each outer list represents a document.
        Each inner list is the candidates in that document.

        :param corpus: The corpus of documents where to extract candidate participants.
        :type corpus: list

        :return: A list of candidates separated by the document in which they were found.
        :rtype: list of list of str
        """

        candidates = []

        for document in corpus:
            document_entities = []
            tokens = tokenizeRawTweetText(document.text)

            entities = TwitterNEREntityExtractor.ner.get_entities(tokens)
            candidates.append([
                " ".join(tokens[start:end]).lower()
                for (start, end, type) in entities
            ])

        return candidates
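For reference, a minimal stand-alone sketch of the span-joining step above; the tokens and entity spans are made up here, whereas a real run would obtain them from TwitterNEREntityExtractor.ner.get_entities.

# Hypothetical tokens/entities illustrating how (start, end, type) spans become
# lower-cased candidate strings, exactly as in the list comprehension above.
tokens = ["Obama", "visits", "New", "York", "today"]
entities = [(0, 1, "PERSON"), (2, 4, "LOCATION")]
candidates = [" ".join(tokens[start:end]).lower() for (start, end, _type) in entities]
print(candidates)  # ['obama', 'new york']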
Example #9
def perprocessing(tdic):
    new_dic = {}
    for line in tdic:
        id = line
        gt = tdic[line][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[line][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        telist = []
        for word in text_tk:
            word = word.lower()
            ps = nltk.stem.PorterStemmer()
            word = ps.stem(word)
            # word = nltk.stem.SnowballStemmer(word)
            telist.append(word)
        # 	return ''.join(ans)
        # newtext = ?telist
        # newtext = ' '.join(text_tk)
        newtext = ' '.join(telist)
        # print(newtext)
        newtext = textPreprocessor01.replaceall(newtext)
        new_dic[id] = gt, newtext
        # print(type(tdic[line][1]))
        # print(line)
        # print(type(line))
        # print(type(newtext))
        # print(newtext)
    return new_dic
Example #10
def process(tweet=None, configuration=None):
    if tweet is None or configuration is None:
        return (False, [], [], [])
    else:
        #TwitterNER extraction
        tokens = tokenizeRawTweetText(tweet)
        ner_processed = configuration.get_entities(tokens)


        data = {
            'PERSON': [],
            'ORGANIZATION': [],
            'LOCATION': []
        }
        # print 'NER Tweet: \n' + ner_tweet

        '''Run extraction parser'''

        #TwitterNER Extraction
        for ner_token in ner_processed:
            (from_index, to_index, ner_key) = ner_token 
            entity = " ".join(tokens[from_index:to_index])
            data[ner_key].append(entity)

        # print 'People: {0}'.format(extracted['people'])
        # print 'Organizations: {0}'.format(extracted['organizations'])
        # print 'Locations: {0}\n'.format(extracted['locations'])
        return (True, data['PERSON'], data['ORGANIZATION'], data['LOCATION'])
Example #11
def cleanTweet(tweet):
    tokens = twokenize.tokenizeRawTweetText(tweet)
    cleanedTweet = "";
    for token in tokens:
        if (not token.startswith('http')) and (not token.startswith('www')) :
            token += ' '
            cleanedTweet += token
    return cleanedTweet
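A minimal usage sketch of cleanTweet, assuming twokenize (ark-twokenize-py) is importable and the function above is in scope; the exact token boundaries depend on twokenize's rules.

# Tokens starting with "http" or "www" are dropped; the rest are re-joined
# with single spaces (note the trailing space the loop leaves behind).
print(cleanTweet("great talk, slides at http://example.com/slides"))
# e.g. "great talk , slides at "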
Example #12
    def transform(self, documents):
        user_mentions = re.compile(ur"@\w\w+:?")
        text_number = re.compile(ur"\b\w*\d+\w*\b")
        url = re.compile(ur"((www\.[^\s]+)|(https?:\/\/[^\s]+))")
        mentioning = [len(user_mentions.findall(doc.content)) for doc in documents]
        # print mentioning
        exclamation = [doc.content.count("!") for doc in documents]
        # print exclamation
        question = [doc.content.count("?") for doc in documents]
        # print question
        hashtag = [doc.content.count("#") for doc in documents]
        # print hashtag
        n_words = [len(tokenizeRawTweetText(doc.content)) for doc in documents]
        # print n_words
        n_chars = [len(doc.content) for doc in documents]

        avetweetlength = [np.mean([len(tweet) for tweet in nltk.sent_tokenize(doc.content)]) for doc in documents]
        # print avetweetlength
        avgwordlenght = [np.mean([len(word) for word in tokenizeRawTweetText(doc.content)]) for doc in documents]
        # print avgwordlenght
        allcaps = [np.sum([word.isupper() for word in tokenizeRawTweetText(doc.content)]) for doc in documents]
        # print allcaps
        numtexttoken = [len(text_number.findall(doc.content)) for doc in documents]
        # print numtexttoken
        url_count = [len(url.findall(doc.content)) for doc in documents]
        # print url_count

        X = np.array(
            [
                n_words,
                n_chars,
                numtexttoken,
                allcaps,
                exclamation,
                question,
                hashtag,
                mentioning,
                url_count,
                avgwordlenght,
                avetweetlength,
            ]
        ).T
        if not hasattr(self, "scalar"):
            self.scalar = preprocessing.StandardScaler().fit(X)
        return self.scalar.transform(X)
Example #13
    def tokenize(sentence):
        """
        Uses twokenize to tokenize tweets.

        :param sentence: String of the tweet
        :return: list containing the tokens
        """
        toks = twokenize.tokenizeRawTweetText(sentence.lower())
        return toks
Example #14
def tokenize_tweets(tweets):
    sys.path.append('.')
    import twokenize
    decoded = [x.replace("\\n", "\n") for x in tweets]
    ttweets = [twokenize.tokenizeRawTweetText(x) for x in decoded]
    uncased = []
    for tokens in ttweets:
        uncased.append([x.lower() for x in tokens])
    return uncased
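A usage sketch under the function's own assumptions: a local twokenize module on the path and tweets stored with escaped newlines (a literal backslash followed by "n").

# Hypothetical input list; each escaped newline is restored before tokenising,
# and every token is lower-cased.
tweets = ["Good morning!\\nCoffee time", "RT @user: LOVE this http://t.co/abc"]
for toks in tokenize_tweets(tweets):
    print(toks)
# e.g. ['good', 'morning', '!', 'coffee', 'time']
#      ['rt', '@user', ':', 'love', 'this', 'http://t.co/abc']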
Example #15
def ner_tweet(tweet):
    global ner
    l = []
    print(tweet)
    tokens = tokenizeRawTweetText(tweet)
    list = ner.get_entities(tokens)
    print(list)
    for x in list:
        l.append(str(" ".join(tokens[int(x[0]):int(x[1])]).encode('utf-8')))
    return l
Example #16
def get_twokens(filename):
    """CMU Twokenizer."""
    tweets = []
    with open(filename) as fi:
        for line in fi:
            t = json.loads(line)    
            text = t['body'].encode('utf8')
            tokens = twokenize.tokenizeRawTweetText(text)
            tweets.append(tokens)
    return tweets
Example #17
def get_twokens(filename):
    """CMU Twokenizer."""
    tweets = []
    with open(filename) as fi:
        for line in fi:
            t = json.loads(line)    
            text = t['body'].encode('utf8')
            tokens = twokenize.tokenizeRawTweetText(text)
            tweets.append(tokens)
    return tweets
Example #18
def relative_score1(keywords, txt):
    tokens = twokenize.tokenizeRawTweetText(txt)

    score = 0.0

    for token in tokens:
        if token in keywords:
            score += 1

    score /= 3
    return score
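An illustrative call, assuming twokenize is importable and keywords holds lower-cased terms; the constant 3 is the original author's normaliser.

# Two tokens ("earthquake", "rescue") match the hypothetical keyword set here,
# so the returned score is roughly 2 / 3.
keywords = {"earthquake", "damage", "rescue"}
print(relative_score1(keywords, "major earthquake reported, rescue teams en route"))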
Example #19
 def __next__(self):
     _tweet = next(self.stream)
     if _tweet is stream.End_Of_Stream:
         return stream.End_Of_Stream
     _t     = _tweet.timestamp
     _tid   = _tweet.tid
     _text  = _tweet.tweet
     _tokens = tokenize.tokenizeRawTweetText(_text)
     # clean each token, then drop empty strings, stopwords, punctuation tokens and URLs
     _tokens = [re.sub(r'[^A-Za-z0-9\':/.&$|@%\\]', '', tokenize.deRepeatWords(i.lower())) for i in _tokens]
     _tokens = list(set(x for x in _tokens
                        if x != "" and x not in stopwords
                        and x not in tokenize.e_punc
                        and not x.startswith('http')))
     return stream.PreprocessedTweetItem(_t,_tid,_tokens)
     
Example #20
def relative_score2(keywords, txt):
    tokens = twokenize.tokenizeRawTweetText(txt)

    words = set()

    for token in tokens:
        if token in keywords:
            words.add(token)

    score = len(words)
    score /= 2
    return score
Example #21
    def next(self):
        tweet = self.tw_stream.next()

        if tweet is stream.End_Of_Stream:
            return stream.End_Of_Stream

        if tweet is None:
            return None

        if tweet.is_retweet():
            user = tweet.who_is_retweeted()
            if user:
                if user in _FILTERED_USERS:
                    return None

        t = tweet.timestamp
        uid = tweet.uid
        txt = tweet.str

        # clean txt
        try:
            txt = self.wb_cleaner.clean_wb(txt)
        except:
            return None

        # remove all urls
        urls = re.findall(_HTTP_PATTERN, txt)
        for url in urls:
            txt = txt.replace(url, ' ')

        # lower case
        txt = txt.lower()

        # tokenize
        try:
            tokens = twokenize.tokenizeRawTweetText(txt)
        except:
            return None

        # filter
        tokens = filter(
            lambda x: (not stop_words.contains(x)) and
            (not _PUN_PATTERN.match(x)) and (len(x) <= 32), tokens)
        # space filter
        tokens = filter(lambda x: not _SPACE_PATTERN.match(x), tokens)

        # to ascii
        #tokens = map(lambda x: x.encode('ascii','ignore'), tokens)
        #tokens = filter(lambda x: len(x) > 0, tokens)
        ret = stream.PreprocessedTweetItem(t, uid, tokens, tweet)
        active_term_maintainer.add(ret)

        return ret
Example #22
def tokenize(line):
    text = line.txt
    if text is not None:
        text = text.lower()
        text = re.sub(URL_PATTERN, 'URL', text)
        text = re.sub(USER_PATTERN, 'USER', text)
        text = re.sub(PUNCT_PATTERN, ' ', text)
        tokens = twokenize.tokenizeRawTweetText(text)
        line.tokens = tokens
        return tokens
    else:
        return None
Example #23
def tokenize(line):
    text = line.txt
    if text is not None:
        text = text.lower()
        text = re.sub(URL_PATTERN, 'URL', text)
        text = re.sub(USER_PATTERN, 'USER', text)
        text = re.sub(PUNCT_PATTERN, ' ', text)
        tokens = twokenize.tokenizeRawTweetText(text)
        line.tokens = tokens
        return tokens
    else:
        return None
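The three regex patterns are not shown in the example; a plausible sketch of how they might be defined and applied is below (the definitions are assumptions, and line can be any object with a txt attribute).

import re
import types

# Hypothetical patterns consistent with the substitutions above.
URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
USER_PATTERN = re.compile(r'@\w+')
PUNCT_PATTERN = re.compile(r'[!?.,;:]+')

line = types.SimpleNamespace(txt="@bob check https://example.com now!!!")
# tokenize(line) would lower-case the text, replace the URL and the handle with
# the 'URL'/'USER' placeholders, blank out punctuation, then run twokenize.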
Example #24
def message_to_words(message, word_del):
    message_clean = re.sub(
        "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "URL", message
    )
    word_tokens = tokenizeRawTweetText(message_clean)  # tokenize
    word_tokens = replace_at(word_tokens)  # replace @ using predefined char
    word_tokens = [s for s in word_tokens if s not in word_del]  # remove a few specific words, e.g. "[" and "]"
    # word_list1 = [w.lower() for w in word_tokens if not w.lower() in nltk.corpus.stopwords.words('english') and not w in string.punctuation] #lowering the case and stop words removal
    word_list1 = [w.lower() for w in word_tokens if not w in string.punctuation]
    word_list2 = [x for x in word_list1 if not (x[0].isdigit() or x[0] == "-" and x[1:].isdigit())]  # remove digit no.
    # word_stemmed=[porter.stem(t) for t in word_list2]   # stemming to get normalized words
    # word_stemmed1= ' '.join(str(v).encode('ascii', 'ignore') for v in word_stemmed)   # s.endcode('ascii','ignore') avoid the utf8 decode errors
    return word_list2
Example #25
def ark_twokenize(text: str) -> List[str]:
    '''
    A Twitter tokeniser from
    `CMU Ark <https://github.com/brendano/ark-tweet-nlp>`_

    This is a wrapper of
    `this <https://github.com/Sentimentron/ark-twokenize-py>`_

    :param text: A string to be tokenised.
    :returns: A list of tokens where each token is a String.
    '''

    if isinstance(text, str):
        return twokenize.tokenizeRawTweetText(text)
    raise ValueError(f'The parameter must be of type str not {type(text)}')
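A usage sketch, assuming the ark-twokenize-py package is importable as twokenize, which is what the wrapper above expects.

# URLs, hashtags, emoticons and @mentions are kept as single tokens by the
# Ark tokenizer; passing anything other than a str raises ValueError.
print(ark_twokenize("@user loving this :-) http://example.com #nlp"))
# e.g. ['@user', 'loving', 'this', ':-)', 'http://example.com', '#nlp']

try:
    ark_twokenize(42)
except ValueError as err:
    print(err)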
Example #26
def preprocess(documents, custom=[]):
    # Remove urls: http? bug fixed (SV)
    documents = [re.sub(r"(?:\@|https?\://)\S+", "", doc) for doc in documents]
    # remove duplicate tweets
    documents = set(documents)
    # Remove documents with fewer than 100 characters (some tweets contain only URLs)
    # documents = [doc for doc in documents if len(doc) > 100]

    # Tokenize
    documents = [set(tokenizeRawTweetText(doc.lower())) for doc in documents]
    # print documents

    # Remove stop words
    unigrams = [w for doc in documents for w in doc if len(w) == 1]
    bigrams = [w for doc in documents for w in doc if len(w) == 2]
    # print bigrams
    # print custom  + STOPLIST
    stoplist = set(
        nltk.corpus.stopwords.words("english") + STOPLIST_TW + unigrams +
        bigrams + custom)
    # print stoplist
    # and strip #
    documents = [[
        token.lstrip('#') for token in doc if token.lstrip('#') not in stoplist
    ] for doc in documents]

    # remove punctuation tokens
    documents = [[token for token in doc if not re.match(punctSeq, token)]
                 for doc in documents]

    # Remove words that only occur once
    token_frequency = defaultdict(int)

    lmtzr = WordNetLemmatizer()
    documents = [[lmtzr.lemmatize(token) for token in doc]
                 for doc in documents]

    # count all token
    for doc in documents:
        for token in doc:
            token_frequency[token] += 1

    # keep words that occur more than once
    documents = [[token for token in doc if token_frequency[token] > 1]
                 for doc in documents]

    # print documents
    return documents
Example #27
def twokenize(text, no_duplicates=True, stem=False):
    text = text.lower()
    clean_text = re.sub(r"(?:\@|https?\://)\S+", "", text)

    # remove non alpha chars
    # clean_text = filter(str.isalnum, clean_text)
    # regex = re.compile('[^a-zA-Z ]')
    # clean_text = regex.sub('', clean_text)
    clean_text = re.sub(r'[^\x00-\x7F]+', ' ', clean_text)
    # strip punctuation
    # clean_text = clean_text.decode('unicode_escape').encode('ascii','ignore')
    translator = string.maketrans(
        string.punctuation,
        ' ' * len(string.punctuation))  #map punctuation to space
    clean_text = str(clean_text).translate(translator)

    # Remove documents with fewer than 100 characters (some tweets contain only URLs)
    # documents = [doc for doc in documents if len(doc) > 100]

    # Tokenize
    # tokens = tokenizeRawTweetText(clean_text.lower())
    tokens = tokenizeRawTweetText(clean_text)

    # Remove stop words
    # unigrams = [w for doc in documents for w in doc if len(w) == 1]
    # bigrams = [w for doc in documents for w in doc if len(w) == 2]
    # print bigrams
    # + STOPLIST_TW + STOPLIST + unigrams + bigrams)
    # and strip #
    tokens = [token for token in tokens if token not in STOPLIST]
    # tokens = [token for token in tokens if token not in string.punctuation]
    # print tokens
    # remove punctuation tokens
    # tokens = [token for token in tokens if not re.match(punctSeq, token)]

    # lmtzr = WordNetLemmatizer()
    # tokens = [lmtzr.lemmatize(token) for token in tokens]
    if stem:
        stemmer = SnowballStemmer("english")
        # print tokens
        tokens = [stemmer.stem(token) for token in tokens]
    if no_duplicates:
        # tokens = set(tokens)
        tokens = f7(tokens)
    clean_text = ' '.join(tokens)
    return clean_text
Example #28
def ark_twokenize(text):
    '''
    A Twitter tokeniser from `CMU Ark <https://github.com/brendano/ark-tweet-nlp>`_
    returns a list of tokens.

    This is just a wrapper of `this <https://github.com/Sentimentron/ark-twokenize-py>`_

    :param text: A string to be tokenised.
    :type text: String
    :returns: A list of tokens where each token is a String.
    :rtype: list
    '''

    if isinstance(text, str):
        return twokenize.tokenizeRawTweetText(text)
    raise ValueError('The parameter must be of type str not {}'.format(
        type(text)))
Example #29
def perprocessing(tdic):
    new_dic = {}
    for line in tdic:
        id = line
        gt = tdic[line][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[line][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        # print(text_tk)
        newtext = ' '.join(text_tk)
        newtext = textPreprocessor01.replaceall(newtext)
        new_dic[id] = gt, newtext
        # print(type(tdic[line][1]))
        # print(line)
        # print(type(line))
        # print(type(newtext))
        # print(newtext)
    return new_dic
Example #30
def perprocessing(tdic):
    new_dic = {}
    for line in tdic:
        id = line
        gt = tdic[line][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[line][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        telist = []
        for word in text_tk:
            word = word.lower()
            ps = nltk.stem.PorterStemmer()
            word = ps.stem(word)
            telist.append(word)
        newtext = ' '.join(telist)
        newtext = textPreprocessor02.replaceall(newtext)
        new_dic[id] = gt, newtext
    return new_dic
Example #31
def get_positives_and_negatives(corpus, lexicon):
    num_words = {}
    num_positive_words = {}
    num_negative_words = {}
    for user, l in corpus.items():
        for tweet in l:
            for token in twokenize.tokenizeRawTweetText(tweet):
                num_words[user] = num_words[user] if user in num_words else 0
                num_words[user] = num_words[user] + 1
                if token in lexicon['positive']:
                    num_positive_words[user] = num_positive_words[
                        user] if user in num_positive_words else 0
                    num_positive_words[user] = num_positive_words[user] + 1
                elif token in lexicon['negative']:
                    num_negative_words[user] = num_negative_words[
                        user] if user in num_negative_words else 0
                    num_negative_words[user] = num_negative_words[user] + 1
    return num_words, num_positive_words, num_negative_words
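A sketch of the expected call shape, using a hypothetical two-tweet corpus and a tiny lexicon; twokenize must be importable for the tokeniser call inside the function.

# corpus maps user -> list of tweets; lexicon maps 'positive'/'negative' to
# word collections. The three returned dicts are all keyed by user.
corpus = {"alice": ["I love this great day", "bad traffic again"]}
lexicon = {"positive": {"love", "great"}, "negative": {"bad"}}
num_words, num_pos, num_neg = get_positives_and_negatives(corpus, lexicon)
print(num_words["alice"], num_pos.get("alice"), num_neg.get("alice"))
# total tokens, positive hits, and negative hits for this user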
Example #32
def crawler(file):

    #Dictionary initilization.
    tweets = {}
    emoticons = {}
    hashtags = {}
    capitals = {}
    longs = {}
    sentiment = {}

    #Read the file and store it in tweets dictionary
    with open(file) as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter="\t")
        #diction = enchant.request_dict("en_US")
        for line in tsvreader:

            tweet_id = str(line[1])+'$'+str(line[2])+'$'+str(line[3])
            splitted_text=str(line[5]).split(" ")

            start = int(line[2])
            end= int(line[3])
            isolated_phrase= isolatePhrase(splitted_text,start,end)

            tokenized_text = twokenize.tokenizeRawTweetText(isolated_phrase)
            tokenized_text = replaceUnicode(tokenized_text)

            capitals[tweet_id] = storeCapitals(tokenized_text)
            emoticons[tweet_id] = storeEmoticons(tokenized_text)
            hashtags[tweet_id] = storeHashtags(tokenized_text)

            new_text = replaceCapitals(tokenized_text)
            new_text = replaceURLs(new_text)  # chain on new_text so the capitals replacement is not discarded
            new_text = replaceUserMentions(new_text)
            new_text = replaceEmoticons(new_text)           
            new_text = replaceHashtags(new_text)

            longs[tweet_id] = storeLongWords(new_text)
            sentiment[tweet_id] = str(line[4])
            #print(sentiment)
            tweets[tweet_id] = new_text
    tsvfile.close()


    return (tweets, emoticons, hashtags, capitals,longs, sentiment)
Example #33
def twokenize(text, no_duplicates=True, stem=False):
    text = text.lower()
    clean_text = re.sub(r"(?:\@|https?\://)\S+", "", text)

    # remove non alpha chars
    # clean_text = filter(str.isalnum, clean_text)
    # regex = re.compile('[^a-zA-Z ]')
    # clean_text = regex.sub('', clean_text)
    clean_text = re.sub(r'[^\x00-\x7F]+',' ', clean_text)
    # strip punctuation
    # clean_text = clean_text.decode('unicode_escape').encode('ascii','ignore')
    translator = string.maketrans(string.punctuation, ' '*len(string.punctuation)) #map punctuation to space
    clean_text = str(clean_text).translate(translator)

    # Remove documents with fewer than 100 characters (some tweets contain only URLs)
    # documents = [doc for doc in documents if len(doc) > 100]

    # Tokenize
    # tokens = tokenizeRawTweetText(clean_text.lower())
    tokens = tokenizeRawTweetText(clean_text)

    # Remove stop words
    # unigrams = [w for doc in documents for w in doc if len(w) == 1]
    # bigrams = [w for doc in documents for w in doc if len(w) == 2]
    # print bigrams
        # + STOPLIST_TW + STOPLIST + unigrams + bigrams)
    # and strip #
    tokens = [token for token in tokens if token not in STOPLIST]
    # tokens = [token for token in tokens if token not in string.punctuation]
    # print tokens
    # remove punctuation tokens
    # tokens = [token for token in tokens if not re.match(punctSeq, token)]

    # lmtzr = WordNetLemmatizer()
    # tokens = [lmtzr.lemmatize(token) for token in tokens]
    if stem:
        stemmer = SnowballStemmer("english")
        # print tokens
        tokens = [stemmer.stem(token) for token in tokens]
    if no_duplicates:
        # tokens = set(tokens)
        tokens = f7(tokens)
    clean_text = ' '.join(tokens)
    return clean_text
Example #34
def makeCorpus(rawTweets, rawLabels, saveLocal=False, corpusFormat='mm'):

    # Tokenize
    # Buidling stoplist
    #stoplist = nltk.corpus.stopwords.words('english')
    word_stoplist = "1,2,3,4,5,6,7,8,9,0,http,https,today,great,day,amp,u,w,a,s,able,about,above,according,accordingly,across,actually,after,afterwards,again,against,ain,t,all,allow,allows,almost,alone,along,already,also,although,always,am,among,amongst,an,and,another,any,anybody,anyhow,anyone,anything,anyway,anyways,anywhere,apart,appear,appreciate,appropriate,are,aren,t,around,as,aside,ask,asking,associated,at,available,away,awfully,be,became,because,become,becomes,becoming,been,before,beforehand,behind,being,believe,below,beside,besides,best,better,between,beyond,both,brief,but,by,c,mon,c,s,came,can,can,t,cannot,cant,cause,causes,certain,certainly,changes,clearly,co,com,come,comes,concerning,consequently,consider,considering,contain,containing,contains,corresponding,could,couldn,t,course,currently,definitely,described,despite,did,didn,t,different,do,does,doesn,t,doing,don,t,done,down,downwards,during,each,edu,eg,eight,either,else,elsewhere,enough,entirely,especially,et,etc,even,ever,every,everybody,everyone,everything,everywhere,ex,exactly,example,except,far,few,fifth,first,five,followed,following,follows,for,former,formerly,forth,four,from,further,furthermore,get,gets,getting,given,gives,go,goes,going,gone,got,gotten,greetings,had,hadn,t,happens,hardly,has,hasn,t,have,haven,t,having,he,he,s,hello,help,hence,her,here,here,s,hereafter,hereby,herein,hereupon,hers,herself,hi,him,himself,his,hither,hopefully,how,howbeit,however,i,d,i,ll,i,m,i,ve,ie,if,ignored,immediate,in,inasmuch,inc,indeed,indicate,indicated,indicates,inner,insofar,instead,into,inward,is,isn,t,it,it,d,it,ll,it,s,its,itself,just,keep,keeps,kept,know,knows,known,last,lately,later,latter,latterly,least,less,lest,let,let,s,like,liked,likely,little,look,looking,looks,ltd,mainly,many,may,maybe,me,mean,meanwhile,merely,might,more,moreover,most,mostly,much,must,my,myself,name,namely,nd,near,nearly,necessary,need,needs,neither,never,nevertheless,new,next,nine,no,nobody,non,none,noone,nor,normally,not,nothing,novel,now,nowhere,obviously,of,off,often,oh,ok,okay,old,on,once,one,ones,only,onto,or,other,others,otherwise,ought,our,ours,ourselves,out,outside,over,overall,own,particular,particularly,per,perhaps,placed,please,plus,possible,presumably,probably,provides,que,quite,qv,rather,rd,re,really,reasonably,regarding,regardless,regards,relatively,respectively,right,said,same,saw,say,saying,says,second,secondly,see,seeing,seem,seemed,seeming,seems,seen,self,selves,sensible,sent,serious,seriously,seven,several,shall,she,should,shouldn,t,since,six,so,some,somebody,somehow,someone,something,sometime,sometimes,somewhat,somewhere,soon,sorry,specified,specify,specifying,still,sub,such,sup,sure,t,s,take,taken,tell,tends,th,than,thank,thanks,thanx,that,that,s,thats,the,their,theirs,them,themselves,then,thence,there,there,s,thereafter,thereby,therefore,therein,theres,thereupon,these,they,they,d,they,ll,they,re,they,ve,think,third,this,thorough,thoroughly,those,though,three,through,throughout,thru,thus,to,together,too,took,toward,towards,tried,tries,truly,try,trying,twice,two,un,under,unfortunately,unless,unlikely,until,unto,up,upon,us,use,used,useful,uses,using,usually,value,various,very,via,viz,vs,want,wants,was,wasn,t,way,we,we,d,we,ll,we,re,we,ve,welcome,well,went,were,weren,t,what,what,s,whatever,when,whence,whenever,where,where,s,whereafter,whereas,whereby,wherein,whereupon,wherever,whether,which,while,whither,who,who,s,whoever,whole,whom,whose,why,will,willing,wish,with,within,without,won,t,wonder,would,would,wouldn,t,yes,yet,you,you,d,you,
ll,you,re,you,ve,your,yours,yourself,yourselves,zero".split(
        ',')
    punctuation = [
        '.', '?', '!', ':', ';', '...', '"', "'", '/', ',', '&', '|'
    ]
    stoplist = word_stoplist + punctuation
    #stoplist = ['for','a','of','the','and','to',',',';','.','"','in']

    tweets = []
    train = open("text.txt")
    train_tweets = train.readlines()
    #print np.shape(train_tweets)

    for train_tweet in train_tweets:
        tokenizetweet = tokenizeRawTweetText(train_tweet)
        tweet = [word for word in tokenizetweet if word not in stoplist]
        #print "tweet:", tweet
        #[tweet.remove(stopword) for stopword in stoplist if tweet.count(stopword)>0]
        tweets.append(tweet)

    # Build gensim dictionary
    # Will want to consider options for doing this in streaming form
    dictionary = corpora.Dictionary(tweets)
    if saveLocal:
        dictionary.save('congress.dict')

    # Build gensim corpus
    # Need to consider serialization options
    corpus = [dictionary.doc2bow(tweet) for tweet in tweets]
    if saveLocal:
        if corpusFormat == 'mm':
            corpora.MmCorpus.serialize('corpus.mm', corpus)
        elif corpusFormat == 'blei':
            corpora.BleiCorpus.serialize('corpus.lda-c', corpus)
        elif corpusFormat == 'low':
            corpora.LowCorpus.serialize('corpus.low', corpus)
    #print corpus
    return dictionary, corpus
Example #35
def clean_dataframe(streamer):
    df = emotes.streamer_df(streamer)

    # Tokenize and clean data. Removes hyperlinks, non-ASCII characters, and rows with less than 5 tokens. Then lowercases all tokens.
    df['body'] = df['body'].apply(lambda x: twokenize.tokenizeRawTweetText(x))
    df['body'] = df['body'].apply(
        lambda x: [re.sub(r"http\S+", "", i) for i in x])
    df['body'] = df['body'].apply(
        lambda x: [re.sub(r'[^\x00-\x7F]', '', i) for i in x])
    indexNames = df[df['body'].str.len() <= 4].index
    df.drop(indexNames, inplace=True)
    df['body'] = df['body'].apply(lambda x: [i.lower() for i in x])

    # Grab global and streamer emotes and then lowercase them.
    X = emotes.global_streamer_emotes(streamer)
    X = [i.lower() for i in X]

    # Drop rows containing at least 2 distinct emotes.
    df['unique emotes'] = df['body'].apply(set)
    df['unique emotes'] = df['unique emotes'].apply(
        lambda x: x.intersection(set(X)))
    df['number of unique emotes'] = df['unique emotes'].apply(len)
    indexNames1 = df[df['number of unique emotes'] != 1].index
    df.drop(indexNames1, inplace=True)

    # Drop unnecessary (for now) columns.
    df = df.drop([
        'channel_id', 'commenter_id', 'commenter_type', 'created_at',
        'fragments', 'offset', 'updated_at', 'video_id',
        'number of unique emotes'
    ],
                 axis=1)

    # Turn unique emotes column into column of strings instead of set.
    df['unique emotes'] = df['unique emotes'].apply(lambda x: list(x)[0])

    # Remove identical messages (spam).
    df['body_str'] = df['body'].apply(lambda x: ' '.join(x))
    df = df.drop_duplicates(subset=['body_str'])
    df = df.drop(columns=['body_str'])

    return df
Example #36
def tokenize_str(string):
    
    stopwords = corpus.stopwords.words('english')
    stopwords.extend([u"---", u"...", u"n't"])
    lemmatizer = WordNetLemmatizer()
    
    tokens = tokenizeRawTweetText(string)     
    
    tokens = [token.lower() for token in tokens if len(token) > 2]
    tokens = [token for token in tokens if token[1:] not in hashtags]
    tokens = [token for token in tokens if token not in urls]
    tokens = [token for token in tokens if token[1:] not in user_mentions]
    tokens = [token for token in tokens if token not in stopwords]
    tokens = [token for token in tokens if not token.isdigit()]

    stemmed_tokens = []
    for w in tokens:
        w = re.sub(r'(.)\1+', r'\1\1', w)
        stemmed_tokens.append(lemmatizer.lemmatize(w)) #.encode("ascii","ignore")
    return stemmed_tokens
Example #37
def tweet_tokenizer(text: str) -> List[str]:
    '''
    A Twitter tokenizer from
    `CMU Ark <https://github.com/brendano/ark-tweet-nlp>`_
    This is a wrapper of
    `this <https://github.com/Sentimentron/ark-twokenize-py>`_. Furthermore,
    this is an adapted version, as it also splits hashtags,
    e.g. `#anything` becomes [`#`, `anything`]. This follows the tokenization
    of `Yu et al. 2018 <https://www.aclweb.org/anthology/D18-1137>`_. 

    :param text: A string to be tokenized.
    :returns: A list of tokens where each token is a String.
    :raises AssertionError: If the tokenized text is not character preserving.
    :raises ValueError: If the given text is not a String
    '''

    hashtag_pattern = re.compile('^#.+')
    if isinstance(text, str):
        tokenized_text = twokenize.tokenizeRawTweetText(text)
        hashtag_tokenized_text = []
        for token in tokenized_text:
            if hashtag_pattern.search(token):
                hashtag = token[0]
                hashtag_tokenized_text.append(hashtag)

                other_token_text = token[1:].strip()
                if other_token_text:
                    hashtag_tokenized_text.append(other_token_text)
            else:
                hashtag_tokenized_text.append(token)

        assert_err = (
            'The tokenizer has not been character preserving. Original'
            f' text: {text}\nHashtag tokenized tokens '
            f'{hashtag_tokenized_text}')
        assert is_character_preserving(text,
                                       hashtag_tokenized_text), assert_err

        return hashtag_tokenized_text

    raise ValueError(f'The parameter must be of type str not {type(text)}')
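A stand-alone sketch of just the hashtag-splitting step (is_character_preserving, used above, is assumed to be defined elsewhere); the token list stands in for output of twokenize.tokenizeRawTweetText.

import re

# '#nlp' -> ['#', 'nlp'], following the tokenisation of Yu et al. 2018.
hashtag_pattern = re.compile('^#.+')
tokens = ['I', 'love', '#nlp', 'and', '#machinelearning', '!']
split_tokens = []
for token in tokens:
    if hashtag_pattern.search(token):
        split_tokens.append(token[0])            # the '#' itself
        split_tokens.append(token[1:].strip())   # the remaining text
    else:
        split_tokens.append(token)
print(split_tokens)
# ['I', 'love', '#', 'nlp', 'and', '#', 'machinelearning', '!']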
Example #38
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    # string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # string = re.sub(r"\'s", " \'s", string)
    # string = re.sub(r"\'ve", " \'ve", string)
    # string = re.sub(r"n\'t", " n\'t", string)
    # string = re.sub(r"\'re", " \'re", string)
    # string = re.sub(r"\'d", " \'d", string)
    # string = re.sub(r"\'ll", " \'ll", string)
    # string = re.sub(r",", " , ", string)
    # string = re.sub(r"!", " ! ", string)
    # string = re.sub(r"\(", " \( ", string)
    # string = re.sub(r"\)", " \) ", string)
    # string = re.sub(r"\?", " \? ", string)
    # string = re.sub(r"\s{2,}", " ", string)
    string = twokenize.tokenizeRawTweetText(string)
    string = " ".join(str(x) for x in string)
    return string.strip().lower()
Example #39
File: cleanse.py  Project: mac389/lovasi
#-- Command line parsing
op = OptionParser()
op.add_option('--i', dest='source', type='str', help='Filename of unprocessed text.')
op.add_option('--o', dest='destination', type='str', help='Filename for processed text.')
op.add_option('--stopwords', dest='stopwords_filename',type='str',help='Source of stopwords, default is local file called stopwords')
op.print_help()
#

opts,args = op.parse_args()
if len(args) > 0:
	op.error('This script only takes arguments preceded by command line options.')
	sys.exit(1)

custom_stopwords = set(stopwords.words('english') if not opts.stopwords_filename else open(opts.stopwords_filename,'rb').read().splitlines() + stopwords.words('english'))

text = open(opts.source,'rb').read().splitlines()

bar = Bar('Cleansing %s'%opts.source,max=len(text))
for i,item in enumerate(text):
	text[i] = [word for word in Tokenizer.tokenizeRawTweetText(item) if word not in custom_stopwords and not word.isdigit()]
	bar.next()
bar.finish()

#Remove words that only appear once
all_tokens = sum(text,[])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word)==1)
text = [[word for word in item if word not in tokens_once] for item in text]

with open(opts.destination,'wb') as f:
	for item in text:
		print>>f, ' '.join(item)
Example #40
File: get-age.py  Project: mac389/lovasi
import re
import twokenize as Tokenizer

test = "Happy Bday @ImTheFrancescaC You're 10 years old! I wish you have amazing day LYSM pretty #birthday #of #a #princess pic.twitter.com/1s4DiF7TOn"

test = Tokenizer.tokenizeRawTweetText(test)

print test
Example #41
 def to_words(self, in_text):
     return tokenizeRawTweetText(in_text)
Example #42
files = listdir('../data/us-to-russia/')

russia_tally = 0
putin_tally = 0
russia_list = []
putin_list = []
done_dict = defaultdict(bool)
for batch in files:
    with open('../data/us-to-russia/'+batch) as f:
        tweets = json.load(f)
        for t in tweets:
            if done_dict[t[u'id']] or langid.classify(t[u'text'])[0] != 'en' or len(t[u'text']) == 0:  # classify() returns a (lang, score) tuple
                continue
            else:
                text = t[u'text']
                tokens = tokenizeRawTweetText(text)
                if 'russia' in tokens:
                    print 'found russia'
                    russia_list.append(('0',text))# += tokens
                    russia_tally += 1
                if 'putin' in tokens:
                    print 'found putin'
                    putin_list.append(('0',text))# += tokens
                    putin_tally += 1
            done_dict[t[u'id']] = True

russia_labeled = th.predict(russia_list)
putin_labeled = th.predict(putin_list)
print compute_polarity(Counter(russia_labeled))
print compute_polarity(Counter(putin_labeled))
Example #43
def tokenize(text):
	return tokenizeRawTweetText(text)