Example #1
def relevant_days(dates, query, n_days):
    tknzr = tokenizer.TweetTokenizer(
        preserve_handles=False, preserve_hashes=False, preserve_case=False, preserve_url=False)
    args = {"tknzr": tknzr, "lemmatize": True}
    query_tokens = tokenizer_bow(query, tknzr, lemmatize=True)
    chosen_dates = []
    for date in dates:
        date_tweets = df.loc[df['created_at'] == date, 'text'].values
        score = 0
        if len(date_tweets) > 0:
            vectorizer = CountVectorizer(stop_words="english",
                                         binary=False, tokenizer=lambda text: tokenizer_bow(text, **args))
            tf = vectorizer.fit_transform(date_tweets)  # term-frequency matrix for this date's tweets
            # NOTE: newer scikit-learn releases rename this to get_feature_names_out()
            freqs = dict(zip(vectorizer.get_feature_names(),
                             np.ravel(tf.sum(axis=0))))
            # total frequency of the query tokens among this date's tweets
            score = sum(freqs.get(t, 0) for t in query_tokens)
        # record a score for every date (0 when it has no tweets) so the argsort
        # indices below stay aligned with `dates`
        chosen_dates.append(score)
    order = np.flip(np.argsort(chosen_dates))
    return dates[order][:n_days]
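The tokenizer_bow helper used above is not defined in these examples. A minimal hypothetical sketch, assuming it simply tokenizes with the passed TweetTokenizer and optionally lemmatizes with NLTK's WordNetLemmatizer (which appears in Example #6), could look like this:

from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()

def tokenizer_bow(text, tknzr, lemmatize=False):
    # hypothetical helper: tokenize with the configured TweetTokenizer
    tokens = tknzr.tokenize(text)
    if lemmatize:
        # optional lemmatization step (assumption, not shown in the original)
        tokens = [_lemmatizer.lemmatize(tok) for tok in tokens]
    return tokens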
Example #2
    def __init__(self):
        self.T = tokenizer.TweetTokenizer(preserve_handles=False,
                                          preserve_url=False,
                                          preserve_len=False,
                                          preserve_hashes=False,
                                          preserve_emoji=False,
                                          preserve_case=True,
                                          regularize=True)
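A minimal usage sketch for a tokenizer configured this way, assuming the same tokenizer package that the later examples import with "from tokenizer import tokenizer" and its TweetTokenizer.tokenize() method:

from tokenizer import tokenizer

T = tokenizer.TweetTokenizer(preserve_handles=False,
                             preserve_url=False,
                             preserve_case=True,
                             regularize=True)
# tokenize a raw tweet; the exact normalization depends on the preserve_*/regularize flags
print(T.tokenize("@someone check this out at http://example.com!!!"))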
Example #3
def train(input_text):
    T = tokenizer.TweetTokenizer()
    mc = collections.defaultdict(dict)  # forward model, filled by add_to_model
    mr = collections.defaultdict(dict)  # reverse model, filled by add_to_reverse_model
    for i in input_text:
        tokens = T.tokenize(i)
        tokens.insert(0, 'START')  # mark the start and end of each text
        tokens.append('END')
        mc = add_to_model(tokens, mc)
        mr = add_to_reverse_model(tokens, mr)
    return mc, mr
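add_to_model and add_to_reverse_model are not shown in this example. A hypothetical minimal version that records bigram transition counts, forward and reversed, compatible with the defaultdict(dict) models above:

def add_to_model(tokens, model):
    # hypothetical: count how often each token is followed by the next one
    for current, nxt in zip(tokens, tokens[1:]):
        model[current][nxt] = model[current].get(nxt, 0) + 1
    return model


def add_to_reverse_model(tokens, model):
    # hypothetical: the same counts over the reversed token sequence
    return add_to_model(list(reversed(tokens)), model)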
Example #4
def features_bow(X_BOW,
                 _dataTexts,
                 _lemmatize=False,
                 _mdf=3,
                 _metric="cosine",
                 _k=10,
                 _handles=False,
                 _hashes=False,
                 _case=False,
                 _url=False):
    X_BOW_VEC = None  # keep the vectorizer defined even when X_BOW is already provided
    if X_BOW is None:
        tknzr = tokenizer.TweetTokenizer(preserve_handles=_handles,
                                         preserve_hashes=_hashes,
                                         preserve_case=_case,
                                         preserve_url=_url)
        X_BOW_VEC, X_BOW = init_bow(_dataTexts, {
            "tknzr": tknzr,
            "lemmatize": _lemmatize
        }, _mdf)
    return X_BOW, X_BOW_VEC
Example #5
def tokenize(tweet):
    # remove e-mail addresses
    tweet = re.sub(r'\S*@\S*\s?', '', tweet)
    # remove URLs
    tweet = re.sub(r'http\S+', '', tweet)

    tweet = tokenizer.TweetTokenizer(
        preserve_case=False, preserve_handles=False, preserve_hashes=False,
        regularize=True, preserve_emoji=True
    ).tokenize(tweet)

    # emoji/emoticon processing
    tweet = list(map(str2emoji, tweet))
    tweet = ' '.join(tweet)

    # remove contractions
    tweet = contraction_removal(tweet)

    # remove punctuation (re.escape keeps the character class well-formed)
    tweet = re.sub('[' + re.escape(punctuation) + ']', '', tweet).split(' ')
    tweet = list(filter(lambda x: x != u'', tweet))

    return tweet
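str2emoji and contraction_removal are not shown either. Hypothetical minimal stand-ins (real mappings would be much larger) so the pipeline above can be run end to end:

EMOTICON_MAP = {':)': 'smile', ':(': 'sad'}                   # hypothetical mapping
CONTRACTION_MAP = {"can't": 'can not', "won't": 'will not'}   # hypothetical mapping


def str2emoji(token):
    # replace a known emoticon token with a word, otherwise return it unchanged
    return EMOTICON_MAP.get(token, token)


def contraction_removal(text):
    # expand the few contractions listed above
    for contraction, expansion in CONTRACTION_MAP.items():
        text = text.replace(contraction, expansion)
    return text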
Example #6
import re
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
from tokenizer import tokenizer
import pandas as pd
import nltk

notstopwords = set(('not', 'no', 'mustn', "mustn't"))
stopwords = set(stopwords.words('english')) - notstopwords

lemmatizer = WordNetLemmatizer()
T = tokenizer.TweetTokenizer(preserve_handles=False,
                             preserve_hashes=False,
                             preserve_case=False,
                             preserve_url=False,
                             regularize=True)


def data_preprocessing(path_tweets):
    tweets = pd.read_csv(path_tweets, encoding='utf-8', sep=',')
    tweets['text'] = tweets['text'].apply(lambda x: standardization(x))
    tweets['sentiment'] = tweets['airline_sentiment'].apply(
        lambda x: 0 if x == 'negative' else (1 if x == 'neutral' else 2))
    return tweets['text'], tweets['sentiment']


def data_preprocessing(path_tweets, corpora):
    data = pd.read_csv(path_tweets,
                       encoding='utf-8',
Example #7
import flask
import torch
from flask import Flask, render_template, request
from utils import label_full_decoder
import sys
import config
import dataset
import engine
from model import BERTBaseUncased
from tokenizer import tokenizer
T = tokenizer.TweetTokenizer(preserve_handles=True,
                             preserve_hashes=True,
                             preserve_case=False,
                             preserve_url=False)

app = Flask(__name__,
            static_url_path='',
            static_folder='app/static',
            template_folder='app/templates/public')

MODEL = None
DEVICE = config.device


def preprocess(text):
    tokens = T.tokenize(text)
    print(tokens, file=sys.stderr)
    ptokens = []
    for index, token in enumerate(tokens):
        if "@" in token:
            if index > 0:
Example #8
#!/usr/bin/python
# -*- coding: utf-8 -*-
from nltk.corpus import stopwords
from tokenizer import tokenizer
import nltk

import os
import json
import re

word_freq = dict()
#nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
T = tokenizer.TweetTokenizer()


# filter characters: strip URLs, skip @-mentions, lowercase, and count word frequencies
def Filter(text):
    pattern1 = re.compile(r'http[a-zA-Z0-9.?/&=:]*')
    text = pattern1.sub("", text)
    # pattern2 = re.compile(r'[-,$()#+&*!?.":;/–:,。“”‘’=+]')
    # text = pattern2.sub(" ", text)
    r = ""
    words = text.strip().split()
    for word in words:
        word = word.lower()
        if '@' not in word:
            r += (word + ' ')
            if word in word_freq:
                word_freq[word] += 1
            else:
Example #9
        dataset.append(datapoint)
        label_counter[indexer.get_object(label)] += 1
        count += 1
        if count % 500000 == 0:
            print("created", count, "datapoints")

    return dataset

indexer = Indexer()
label_counter = Counter()
dataset = create_dataset(tweets, indexer, label_counter)

print("length of dataset: ", len(dataset))

from tokenizer import tokenizer as vinay
v = vinay.TweetTokenizer(regularize=True, preserve_len=False)


word_cnts = Counter()
def count_words(text):
    words = v.tokenize(text)
    for word in words:
        word_cnts[word] += 1

for dp in dataset:
    count_words(dp.text)

new_dataset = []
count_of_bad = 0
for i in range(0, len(dataset)):
    data = dataset[i]