Example #1
File: utils.py Project: hangyav/biadapt
def to_iob(text, target, label, tokenizer=TweetTokenizer()):
    text = tokenizer.tokenize(text)
    target = target.lower().split()
    found = False

    res = list()
    i = 0
    while i < len(text):
        if found or target[0] != text[i]:
            res.append((text[i], 'o'))
        else:
            if len(target) == 1:
                res.append((text[i], 'b-{}'.format(label)))
                found = True
            else:
                good = True
                for j in range(1, len(target)):
                    if i + j == len(text):
                        good = False
                        break
                    if target[j] != text[i + j]:
                        good = False
                        break
                if good:
                    res.append((text[i], 'b-{}'.format(label)))
                    for j in range(1, len(target)):
                        res.append((text[i + 1], 'i-{}'.format(label)))
                        i += 1
                    found = True

        i += 1

    if found:
        return res

    res = list()
    i = 0
    while i < len(text):
        if found or (target[0] not in text[i]):
            res.append((text[i], 'o'))
        else:
            for t in target:
                if t in text[i]:
                    res.append((text[i], '{}-{}'.format(
                        ('b' if t == target[0] else 'i'), label)))
                    i += 1
                    found = True
                    if i == len(text):
                        break
                else:
                    break
            i -= 1

        i += 1

    if found:
        return res

    res.append(('</s>', 'o'))
    for t in target:
        res.append((t, '{}-{}'.format(('b' if t == target[0] else 'i'),
                                      label)))

    return res
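
A minimal usage sketch (not part of the original file), assuming to_iob above is importable; a lowercasing tokenizer is passed so the first exact-match pass can fire:

from nltk.tokenize import TweetTokenizer

pairs = to_iob("I love New York pizza", "New York", "loc",
               tokenizer=TweetTokenizer(preserve_case=False))
# roughly: [('i', 'o'), ('love', 'o'), ('new', 'b-loc'), ('york', 'i-loc'), ('pizza', 'o')]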
Example #2
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words("english"))
TKNZR = TweetTokenizer()


def remove_url(txt):
    """Replace URLs found in a text string with nothing
    (i.e. remove the URL from the string).

    Parameters
    ----------
    txt : string
        A text string that you want to parse and remove URLs from.

    Returns
    -------
    The same txt string with URLs removed.
    """

    return " ".join(re.sub(r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt).split())


def tweet_toks(tweet):
    tweet = remove_url(tweet.lower())
    tweet_words = TKNZR.tokenize(tweet)
    tweet_words = [x for x in tweet_words if x not in STOPWORDS]

    return tweet_words
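
A quick example call (assumes the NLTK stopwords corpus is downloaded); the URL is stripped by remove_url, then stopwords are filtered out:

print(tweet_toks("Check this out https://example.com #nlp @friend!!"))
# roughly: ['check', 'nlp', 'friend']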
Example #3
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not",
"tryin'":"trying"
}

## Tokenizers 
tokenizer = TweetTokenizer()
lem = WordNetLemmatizer()
engstopwords = set(stopwords.words('english'))
punc = string.punctuation  
## emoticons written using brackets and colons; remove these emoticons
emo = {'):','(:',':',':-)',':))'}
    
## Cleaning the data
def data_cleaning(data):
    #Remove email
    re_email = re.compile(r'[\w.-]+@[\w.-]+')
    data = re_email.sub(r'', data)
    #Remove Emoji
    data = demoji.replace(data, repl='')
    #Remove website URLs
    reg_website = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([\w+-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
Example #4
def processtweets(dirPath, limitStr):
    # convert the limit argument from a string to an int
    limit = int(limitStr)
    # initialize NLTK built-in tweet tokenizer
    twtokenizer = TweetTokenizer()

    os.chdir(dirPath)

    f = open('./downloaded-tweeti-b-dist.tsv', 'r')
    # loop over lines in the file and use the first limit of them
    #    assuming that the tweets are sufficiently randomized
    tweetdata = []
    for line in f:
        if (len(tweetdata) < limit):
            # remove final end of line character
            line = line.strip()
            # each line has 4 items separated by tabs
            # ignore the tweet and user ids, and keep the sentiment and tweet text
            tweetdata.append(line.split('\t')[2:4])

    # create list of tweet documents as (list of words, label)
    # where the labels are condensed to just 3:  'pos', 'neg', 'neu'
    tweetdocs = []
    # add all the tweets except the ones whose text is Not Available
    for tweet in tweetdata:
        if (tweet[1] != 'Not Available'):
            processedtweet = _processtweets(tweet[1])
            # run the tweet tokenizer on the text string - returns unicode tokens, so convert to utf8
            tokens = twtokenizer.tokenize(processedtweet)

            if tweet[0] == '"positive"':
                label = 'pos'
            elif tweet[0] == '"negative"':
                label = 'neg'
            elif (tweet[0] == '"neutral"') or (tweet[0] == '"objective"') or (
                    tweet[0] == '"objective-OR-neutral"'):
                label = 'neu'
            else:
                label = ''
            tweetdocs.append((tokens, label))

    ## shuffle dataset and generate feature set with most common 2000 words
    random.shuffle(tweetdocs)
    all_words_list = [word for (sent, cat) in tweetdocs for word in sent]
    all_words = nltk.FreqDist(all_words_list)
    word_items = all_words.most_common(2000)
    word_features = [word for (word, count) in word_items]

    ## Base Accuracy with all the words
    featuresets = getfeatureset(tweetdocs, word_features)
    setsize = int(limit / 5)
    train_set, test_set = featuresets[setsize:], featuresets[:setsize]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("Base accuracy with all the words: " +
          str(nltk.classify.accuracy(classifier, test_set)))
    print("****** Most informative 20 features *********")
    classifier.show_most_informative_features(20)
    print("**********************************************")

    ## Cross validation with k fold
    num_folds = 5
    print("Cross validation with 5 folds: ")
    cross_validation_accuracy(num_folds, featuresets)
    cross_validation_metrics(num_folds, featuresets)

    ## Stop words removal
    dirname = os.path.realpath('..')
    stopwordspath = dirname + '/stopwords_twitter.txt'
    stopwords = [line.strip() for line in open(stopwordspath)]
    negationwords = [
        'no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather',
        'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor'
    ]
    negationwords.extend([
        'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven',
        'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn',
        'weren', 'won', 'wouldn'
    ])
    newstopwords = [word for word in stopwords if word not in negationwords]
    new_all_words_list = [
        word for (sent, cat) in tweetdocs for word in sent
        if word not in newstopwords
    ]
    new_all_words = nltk.FreqDist(new_all_words_list)
    new_word_items = new_all_words.most_common(2000)
    new_word_features = [word for (word, count) in new_word_items]

    ## Accuracy with stop word removal
    new_featuresets = getfeatureset(tweetdocs, new_word_features)
    setsize = int(limit / 5)
    train_set, test_set = new_featuresets[setsize:], new_featuresets[:setsize]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("Accuracy with stop words removal: " +
          str(nltk.classify.accuracy(classifier, test_set)))

    ## Subjectivity lexicon
    SLfeaturesets = getSLfeatureset(tweetdocs, word_features)
    setsize = int(limit / 5)
    train_set, test_set = SLfeaturesets[setsize:], SLfeaturesets[:setsize]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("Accuracy with Subjectivity lexicon: " +
          str(nltk.classify.accuracy(classifier, test_set)))

    ## Negation words
    notfeaturesets = getNOTfeatureset(tweetdocs, word_features, negationwords)
    setsize = int(limit / 5)
    train_set, test_set = notfeaturesets[setsize:], notfeaturesets[:setsize]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("Accuracy with negation words features: " +
          str(nltk.classify.accuracy(classifier, test_set)))

    ## Negation words and stop words removal
    newnotfeaturesets = getNOTfeatureset(tweetdocs, new_word_features,
                                         negationwords)
    setsize = int(limit / 5)
    train_set, test_set = newnotfeaturesets[
        setsize:], newnotfeaturesets[:setsize]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("Accuracy with negation words features and stop words removal: " +
          str(nltk.classify.accuracy(classifier, test_set)))

    ## Opinion lexicon
    opinionlexfeaturesets = getopinionfeatureset(tweetdocs, word_features)
    train_set, test_set = opinionlexfeaturesets[
        1000:], opinionlexfeaturesets[:1000]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("Accuracy with Opinion lexicon: " +
          str(nltk.classify.accuracy(classifier, test_set)))

    ## Advanced experiments
    print("*************** Advanced experiments **************")

    ## sklearn SGDClassifier
    skfeatureset = [tweet_features(d, word_features) for (d, c) in tweetdocs]
    X = pd.DataFrame.from_dict(skfeatureset)
    y = [c for (d, c) in tweetdocs]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)
    clf = SGDClassifier()
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    print("Accuracy with SGDClassifier - default parameters: " +
          str(np.mean(predicted == y_test)))

    text_clf_svm = SGDClassifier(loss='hinge',
                                 penalty='l2',
                                 alpha=1e-3,
                                 max_iter=6,
                                 random_state=42)
    text_clf_svm.fit(X_train, y_train)
    predicted_svm = text_clf_svm.predict(X_test)
    print("Accuracy with SGDClassifier: " +
          str(np.mean(predicted_svm == y_test)))

    ## Different classifiers from sklearn - ref https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
    skfeatureset = [tweet_features(d, word_features) for (d, c) in tweetdocs]
    X = pd.DataFrame.from_dict(skfeatureset)
    y = [c for (d, c) in tweetdocs]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)
    h = .02  # step size in the mesh
    names = [
        "Nearest Neighbors", "Linear SVM", "Decision Tree", "Random Forest",
        "Naive Bayes"
    ]
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        GaussianNB()
    ]
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        predicted = clf.predict(X_test)
        print(name + " : " + str(score))

    ## Different classifiers from sklearn with stop words removal - ref https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
    sknewfeatureset = [
        tweet_features(d, new_word_features) for (d, c) in tweetdocs
    ]
    print("Classifiers with stop word removal")
    X = pd.DataFrame.from_dict(sknewfeatureset)
    y = [c for (d, c) in tweetdocs]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)
    h = .02  # step size in the mesh
    names = [
        "Nearest Neighbors", "Linear SVM", "Decision Tree", "Random Forest",
        "Naive Bayes"
    ]
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        GaussianNB()
    ]
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        predicted = clf.predict(X_test)
        print(name + " : " + str(score))
Example #5
def my_tokenize(s):
    tknzr = TweetTokenizer()
    return tknzr.tokenize(s)
Example #6
def build_word_statistics(text_file_path, label_file_path, no_sentiment_words=True):
    if no_sentiment_words:
        logger.info("Also excluding sentiment words!")

    text_tokenizer = TweetTokenizer()
    vocab = set()
    with open(text_file_path) as text_file:
        for line in text_file:
            vocab.update(text_tokenizer.tokenize(line.lower()))
    stopwords = lexicon_helper.get_stopwords()
    vocab -= stopwords
    if no_sentiment_words:
        sentiment_words = lexicon_helper.get_sentiment_words()
        vocab -= sentiment_words
    logger.debug(vocab)
    logger.info("Vocab size after filtering: " + str(len(vocab)))

    labels = set()
    with open(label_file_path) as label_file:
        for label_line in label_file:
            label = label_line.strip()
            labels.add(label)
    logger.debug(labels)

    empty_template = dict()
    for label in labels:
        empty_template[label] = 0
    logger.debug(empty_template)

    word_occurrences = dict()
    with open(text_file_path) as text_file, open(label_file_path) as label_file:
        for text_line, label_line in zip(text_file, label_file):
            words = text_tokenizer.tokenize(text_line.strip())
            label = label_line.strip()
            for word in words:
                if len(word) > 3 and word in vocab:
                    if word not in word_occurrences:
                        word_occurrences[word] = empty_template.copy()
                    word_occurrences[word][label] += 1
    logger.debug(word_occurrences)

    label_word_scores = dict()
    for label in labels:
        word_scores = list()
        for word in word_occurrences:
            try:
                occurrence = word_occurrences[word]
                positive_count = occurrence[label]
                negative_count = sum(occurrence.values()) - positive_count
                kld = positive_count * (math.log(positive_count) / math.log(negative_count))
                word_scores.append((kld, word))
            except Exception:
                logger.debug("error while processing word '{}' for label '{}'".format(word, label))
        label_word_scores[label] = word_scores

    logger.debug(label_word_scores)

    for label in label_word_scores:
        word_scores = label_word_scores[label]
        word_scores.sort(key=lambda x: x[0], reverse=True)
        most_correlated = [x[1] for x in word_scores[:100]]
        logger.info("For label '{}'".format(label))
        logger.info("Most correlated words: {}".format(most_correlated))
def represent_tweet(tweets):
    tokens = TweetTokenizer().tokenize(tweets)
    frequency = defaultdict(int)
    for token in tokens:
        frequency[token] += 1
    return frequency
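
For example (not in the original snippet, and assuming the surrounding module's imports of TweetTokenizer and defaultdict), repeated tokens are simply counted:

print(represent_tweet("good good vibes only"))
# -> defaultdict(<class 'int'>, {'good': 2, 'vibes': 1, 'only': 1})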
def extract_features(tweet):
    features = []
    tknzr = TweetTokenizer()
    ps = SnowballStemmer("english")

    # HASHTAGS COUNT
    list_hashtag = re.findall(r'\B(\#[a-zA-Z]+\b)(?!;)', tweet)
    features.append(len(list_hashtag))

    # PUNCTUATION COUNT & PRESENCE
    count = lambda l1, l2: sum([1 for x in l1 if x in l2])
    count_punct = count(tweet, set(punctuation))
    features.append(count_punct)
    features.append(count_punct != 0)

    # EXCLAMATION MARK
    list_exclamation = re.findall(r'\!', tweet)
    features.append(len(list_exclamation))
    features.append(len(list_exclamation) != 0)

    # QUESTION MARK
    list_question = re.findall(r'\?', tweet)
    features.append(len(list_question))
    features.append(len(list_question) != 0)

    # REMOVE PUNCTUATION
    tweet = tweet.translate(str.maketrans('', '', punctuation))

    # EMOJIS COUNT & PRESENCE
    demojized_tweet = emoji.demojize(tweet)
    emoji_list = re.findall(r'(:[^:]*:)', demojized_tweet)
    list_emoji = [emoji.emojize(x) for x in emoji_list]
    features.append(len(emoji_list))
    features.append(len(emoji_list) != 0)

    # REMOVE EMOJIS
    for emo in list_emoji:
        if emo in tweet:
            tweet = tweet.replace(emo, "")

    # WORD COUNT & LENGTH: number of words, number of words without stopwords, average word length
    tokens = [x for x in tknzr.tokenize(tweet)]
    bag_of_words = [x for x in tokens if x.lower() not in stop_words]
    stemmed_text = [
        ps.stem(x) for x in bag_of_words if x.lower() not in stop_words
    ]
    features.append(len(bag_of_words))
    features.append(float(sum(map(len, bag_of_words))) / len(bag_of_words))

    # SPECIFIC WORD COUNT
    favor_words = ['blm', 'blacklivesmatter']
    against_words = ['alllivesmatter', 'bluelivesmatter']
    favor = [x for x in bag_of_words if x.lower() in favor_words]
    against = [x for x in bag_of_words if x.lower() in against_words]
    features.append(len(favor))
    features.append(len(favor) / len(bag_of_words))
    features.append(len(favor) != 0)
    features.append(len(against))
    features.append(len(against) / len(bag_of_words))
    features.append(len(against) != 0)

    # WORD COUNT UPPER
    upper_words = [x for x in bag_of_words if x.isupper()]
    features.append(len(upper_words) != 0)

    # CHECK FOR ELONGATED WORDS
    regex = re.compile(r"(.)\1{2}")
    elongated_words = [word for word in bag_of_words if regex.search(word)]
    features.append(len(elongated_words))
    features.append(len(elongated_words) != 0)

    # POS TAGS: frequency of each POS tag
    pos_tags = nltk.pos_tag(stemmed_text)
    counts = Counter(tag for word, tag in pos_tags)
    features.append(counts['JJ'])
    features.append(counts['RB'])
    features.append(counts['VB'])
    features.append(counts['NN'])

    #Letter features: number of letters, frequency of each letter, frequency of each capital letter
    features.append(len(tweet))
    for letter in list(string.ascii_lowercase):
        features.append(len([x for x in tweet if x.lower() == letter]))
    for letter in list(string.ascii_uppercase):
        features.append(len([x for x in tweet if x == letter]))

    total_upper = sum(1 for x in tweet if x.isupper())
    total_lower = sum(1 for x in tweet if x.islower())
    features.append(total_upper / len(tweet))
    features.append(total_lower / len(tweet))

    return features
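
An illustrative call (hypothetical input; stop_words, punctuation, emoji and the other module-level names, plus the NLTK tagger data, come from the surrounding project):

vec = extract_features("#BlackLivesMatter this is SOOO important!!!")
print(len(vec), vec[:6])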
Example #9
import argparse
import math
import codecs
import os
import sys

from flask import Flask, jsonify, request
from nltk.tokenize import TweetTokenizer
#from nltk.tokenize.moses import MosesDetokenizer

app = Flask(__name__)
args = None
translator = None

#dt = MosesDetokenizer()

tt = TweetTokenizer()


@app.route("/api/health")
def health():
    return jsonify({"status": "ok"}), 200


@app.route("/api/translate", methods=['POST'])
def score():
    line = request.get_json()['text']
    #srcTokens = tt.tokenize(line.lower())
    #srcBatch = [srcTokens]
    #predBatch, predScore, goldScore, attn, src = translator.translate(srcBatch, None)
    #responses = [dt.detokenize(i, return_str=True) for i in predBatch[0]]
    #scores = list(predScore[0])
Example #10
def preprocess_tweet(tweet):
    tokenizer = TweetTokenizer(preserve_case=False)
    return [token for token in tokenizer.tokenize(tweet) if token_is_allowed(token)]
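
token_is_allowed is defined elsewhere in the project; a hypothetical stand-in to make the snippet runnable could look like this:

import string
from nltk.corpus import stopwords

_BLOCKED = set(stopwords.words("english")) | set(string.punctuation)

def token_is_allowed(token):  # hypothetical stand-in, not the project's filter
    return token not in _BLOCKED

print(preprocess_tweet("Loving the new NLTK release!!! #nlp"))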
Example #11
import csv
import tweepy
from nltk.tokenize import TweetTokenizer
from tweepy import RateLimitError

from DataExtraction.Scripts import twitter_credentials
import time
import datetime

date_now = datetime.datetime.now()

tknzr = TweetTokenizer(strip_handles=True)

access_token = twitter_credentials.ACCESS_TOKEN
access_token_secret = twitter_credentials.ACCESS_TOKEN_SECRET
consumer_key = twitter_credentials.CONSUMER_KEY
consumer_secret = twitter_credentials.CONSUMER_SECRET

auth = tweepy.auth.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth, wait_on_rate_limit=True)

startDate = datetime.datetime(2020, 1, 1, 0, 0, 0)
endDate = datetime.datetime.now()

csvFile = open('OldTimeLineTweets.csv', 'a',
               encoding="utf-16")  # ../../../CollectedData/NDC/Pres
csvWriter = csv.writer(csvFile)
csvWriter.writerow(["Date", "Id", "Text", "Likes", "Re-Tweets", "Location"])
Example #12
from nltk import word_tokenize
from nltk.tokenize import TweetTokenizer

tweet = '@Hari: This sounds good! :D'
print(word_tokenize(tweet))
print(TweetTokenizer().tokenize(tweet))
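
Approximate output (exact tokens can vary by NLTK version): word_tokenize splits the handle and the emoticon apart, while TweetTokenizer keeps them intact.

# word_tokenize:   ['@', 'Hari', ':', 'This', 'sounds', 'good', '!', ':', 'D']
# TweetTokenizer:  ['@Hari', ':', 'This', 'sounds', 'good', '!', ':D']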

    def __init__(self, max_dict_size=MM_MAX_DICT_SIZE, device="cpu"):
        self.max_dict_size = max_dict_size
        self.token_to_id = {TOKEN_UNK: 0}
        self.next_id = 1
        self.tokenizer = TweetTokenizer(preserve_case=True)
        self.device = device
from nltk.tokenize import TweetTokenizer
import pymorphy2

NUMB_OF_FEATURES = 6

twtk = TweetTokenizer()
morph = pymorphy2.MorphAnalyzer(lang='uk')

POS_l = [
    "Unk",
    "NOUN",
    "ADJF",
    "ADJS",
    "COMP",
    "VERB",
    "INFN",
    "PRTS",
    "GRND",
    "NUMR",
    "ADVB",
    "NPRO",
    "PRED",
    "PREP",
    "CONJ",
    "PRCL",
    "INTJ",
]
cases_l = [
    'Unk', 'nomn', 'gent', 'datv', 'accs', 'ablt', 'loct', 'voct', 'gen2',
    'acc2', 'loc2'
]
def process_tweet(
    tweet,
    remove_USER_URL=True,
    remove_punctuation=True,
    remove_stopwords=True,
    remove_HTMLentities=True,
    remove_hashtags=True,
    appostrophe_handling=True,
    lemmatize=True,
    trial=False,
    sym_spell=None,
    reduce_lengthenings=True,
    segment_words=True,
    correct_spelling=True,
):
    """
    This function receives tweets and returns clean word-list
    """
    ### Handle USERS and URLS ################################################
    if remove_USER_URL:
        if trial:
            tweet = re.sub(r'@\w+ ?', '', tweet)
            tweet = re.sub(r'http\S+', '', tweet)
        else:
            tweet = re.sub(r"@USER", "<>", tweet)
            tweet = re.sub(r"URL", "", tweet)
    else:
        if trial:
            tweet = re.sub(r'@\w+ ?', '<usertoken> ', tweet)
            tweet = re.sub(r'http\S+', '<urltoken> ', tweet)
        else:
            tweet = re.sub(r"@USER", "<usertoken>", tweet)
            tweet = re.sub(r"URL", "<urltoken>", tweet)

    ### Remove HTML entities ################################################
    if remove_HTMLentities:
        tweet = html.unescape(tweet)

    ### REMOVE HASHTAGS? #####################################################
    if remove_hashtags:
        tweet = re.sub(r'#\w+ ?', '', tweet)

    ### Convert to lower case: Hi->hi, MAGA -> maga ##########################
    tweet = tweet.lower()

    ### Cleaning: non-ASCII filtering, some appostrophes, separation #########
    tweet = re.sub(r"’", r"'", tweet)
    tweet = re.sub(r"[^A-Za-z0-9'^,!.\/+-=@]", " ", tweet)
    tweet = re.sub(r"what's", "what is ", tweet)
    tweet = re.sub(r"\'s", " ", tweet)
    tweet = re.sub(r"\'ve", " have ", tweet)
    tweet = re.sub(r"n't", " not ", tweet)
    #     tweet = re.sub(r"i'm", "i am ", tweet)
    tweet = re.sub(r"\'re", " are ", tweet)
    tweet = re.sub(r"\'d", " would ", tweet)
    tweet = re.sub(r"\'ll", " will ", tweet)
    tweet = re.sub(r",", " ", tweet)
    tweet = re.sub(r"\.", " ", tweet)
    tweet = re.sub(r"!", " ! ", tweet)
    tweet = re.sub(r"\/", " ", tweet)
    tweet = re.sub(r"\^", " ^ ", tweet)
    tweet = re.sub(r"\+", " + ", tweet)
    tweet = re.sub(r"\-", " - ", tweet)
    tweet = re.sub(r"\=", " = ", tweet)
    tweet = re.sub(r"(\d+)(k)", r"\g<1>000", tweet)
    tweet = re.sub(r":", " : ", tweet)
    tweet = re.sub(r" e g ", " eg ", tweet)
    tweet = re.sub(r" b g ", " bg ", tweet)
    tweet = re.sub(r" u s ", " american ", tweet)
    tweet = re.sub(r"\0s", "0", tweet)
    tweet = re.sub(r" 9 11 ", "911", tweet)
    tweet = re.sub(r"e - mail", "email", tweet)
    tweet = re.sub(r"j k", "jk", tweet)
    tweet = re.sub(r"\s{2,}", " ", tweet)

    ### Remove Punctuation ###################################################
    if remove_punctuation:
        translator = str.maketrans(
            '', '', ''.join(list(set(string.punctuation) - set("'"))))
        tweet = tweet.translate(translator)

    # Tokenize sentence for further word-level processing
    tokenizer = TweetTokenizer()
    words = tokenizer.tokenize(tweet)

    ### Apostrophe handling:    you're   -> you are  ########################
    APPO = {
        "aren't": "are not",
        "can't": "cannot",
        "couldn't": "could not",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'll": "he will",
        "he's": "he is",
        "i'd": "I would",
        "i'll": "I will",
        "i'm": "I am",
        "isn't": "is not",
        "it's": "it is",
        "it'll": "it will",
        "i've": "I have",
        "let's": "let us",
        "mightn't": "might not",
        "mustn't": "must not",
        "shan't": "shall not",
        "she'd": "she would",
        "she'll": "she will",
        "she's": "she is",
        "shouldn't": "should not",
        "that's": "that is",
        "there's": "there is",
        "they'd": "they would",
        "they'll": "they will",
        "they're": "they are",
        "they've": "they have",
        "we'd": "we would",
        "we're": "we are",
        "weren't": "were not",
        "we've": "we have",
        "what'll": "what will",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "where's": "where is",
        "who'd": "who would",
        "who'll": "who will",
        "who're": "who are",
        "who's": "who is",
        "who've": "who have",
        "won't": "will not",
        "wouldn't": "would not",
        "you'd": "you would",
        "you'll": "you will",
        "you're": "you are",
        "you've": "you have",
        "'re": " are",
        "wasn't": "was not",
        "we'll": " will"
    }
    if appostrophe_handling:
        words = [APPO[word] if word in APPO else word for word in words]

    tweet = ' '.join(words)
    words = tokenizer.tokenize(tweet)

    ### Lemmatisation:          drinking -> drink ###########################
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word, "v") for word in words]

    ### Remove stop words:      is, that, the, ... ##########################
    if remove_stopwords:
        eng_stopwords = set(stopwords.words("english"))
        words = [w for w in words if not w in eng_stopwords]

    ### Reduce lengthening:    aaaaaaaaah -> aah, bleeeeerh -> bleerh #################
    if reduce_lengthenings:
        pattern = re.compile(r"(.)\1{2,}")
        words = [pattern.sub(r"\1\1", w) for w in words]

    if sym_spell and (segment_words or correct_spelling):

        ### Segment words:    thecatonthemat -> the cat on the mat ####################
        if segment_words:
            words = [
                sym_spell.word_segmentation(word).corrected_string
                for word in words
            ]

        ### Correct spelling: birberals -> liberals ######################
        if correct_spelling:

            def correct_spelling_for_word(word):
                suggestions = sym_spell.lookup(word, Verbosity.TOP, 2)

                if len(suggestions) > 0:
                    return suggestions[0].term
                return word

            words = [correct_spelling_for_word(word) for word in words]

    clean_tweet = " ".join(words)
    clean_tweet = re.sub("  ", " ", clean_tweet)
    clean_tweet = clean_tweet.lower()

    return clean_tweet
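
A sketch of a call with the default flags and no SymSpell dictionary (so word segmentation and spelling correction are skipped); WordNetLemmatizer and stopwords are imported by the surrounding project:

cleaned = process_tweet("@USER I'm sooooo haaappy about this!!! URL", sym_spell=None)
print(cleaned)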
Example #16
def dummy_fun(doc):
    return TweetTokenizer().tokenize(doc)
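
A function like dummy_fun is typically handed to a scikit-learn vectorizer so documents are tokenized with TweetTokenizer instead of the default regexp; a sketch under that assumption (TfidfVectorizer is not part of the original snippet, and get_feature_names_out needs scikit-learn >= 1.0):

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(tokenizer=dummy_fun, lowercase=False)
X = tfidf.fit_transform(["Great game!!! :)", "so bad :("])
print(tfidf.get_feature_names_out())  # tweet-aware vocabulary, e.g. ':)' kept as one token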
Example #17
def runFitting(params, objects):

    TASK = 'binary'
    #TASK = 'multi'
    '''
    Preparing data
    '''

    featureList = []

    if params["sentenceComplexityCheck"]:
        featureList.append("posTag")
    if params["embeddingsTermFreqFiltering"]:
        objects["freqFilter"].embeddingsEnabled = True
    if params["oneHotTermFreqFiltering"]:
        objects["freqFilter"].oneHotEnabled = True

    objects["liguisticFeatureExtractor"].setFeatureList(featureList)
    offenseval_train = './Data/train/offenseval-training-v1.tsv'

    #print('Reading in offenseval training data...')
    if TASK == 'binary':
        IDsTrain, Xtrain, Ytrain = helperFunctions.read_corpus(
            offenseval_train)
    else:
        IDsTrain, Xtrain, Ytrain = helperFunctions.read_corpus(
            offenseval_train, binary=False)

    Xtrain = helperFunctions.clean_samples(Xtrain)
    '''
    Preparing vectorizer and classifier
    '''

    # Vectorizing data / Extracting features
    #print('Preparing tools (vectorizer, classifier) ...')
    if params["tweetTokenization"]:
        count_word = transformers.CountVectorizer(
            ngram_range=(1, 2),
            stop_words=stop_words.get_stop_words('de'),
            tokenizer=TweetTokenizer().tokenize)
    else:
        count_word = transformers.CountVectorizer(
            ngram_range=(1, 2), stop_words=stop_words.get_stop_words('de'))
    count_char = transformers.CountVectorizer(analyzer='char',
                                              ngram_range=(3, 7))

    embedder = features.Embeddings(objects["embeddings"], pool='max')

    vectorizer = FeatureUnion([('word', count_word), ('char', count_char),
                               ('word_embeds', embedder)])

    if len(featureList) > 0:
        vectorizer.transformer_list.append(
            ('lingFeats', objects["liguisticFeatureExtractor"]))

    if params["oneHotTermFreqFiltering"] or params[
            "embeddingsTermFreqFiltering"]:
        vectorizer.transformer_list.append(
            ('freqFilter', objects["freqFilter"]))

    if params["charNgramFreqFiltering"]:
        objects["charFreqFilter"].oneHotEnabled = True
        objects["charFreqFilter"].embeddingsEnabled = False
        vectorizer.transformer_list.append(
            ('charfreqFilter', objects["charFreqFilter"]))

    if params["POStagCheck"]:
        vectorizer.transformer_list.append(
            ('posTagger', transformers.posTagExtractor(Xtrain, Ytrain)))

    # Set up SVM classifier with unbalanced class weights
    """     if TASK == 'binary':
        # cl_weights_binary = None
        cl_weights_binary = {'OTHER':1, 'OFFENSE':10}
        clf = LinearSVC(class_weight=cl_weights_binary)
    else:
        # cl_weights_multi = None
        cl_weights_multi = {'OTHER':0.5,
                            'ABUSE':3,
                            'INSULT':3,
                            'PROFANITY':4}
        clf = LinearSVC(class_weight=cl_weights_multi) """
    clf = LinearSVC()
    #scaler = StandardScaler(with_mean=False)

    classifier = Pipeline([
        ('vectorize', vectorizer),
        #('scale', scaler),
        ('classify', clf)
    ])
    '''
    Actual training and predicting:
    '''

    print('Fitting on training data...')
    classifier.fit(Xtrain, Ytrain)
    print("storing")
    dump(classifier, "./Models/RUG_Offense_concatModel.joblib")
    ##print("cross validating")
    ### predicting on set aside training data
    #print('Predicting on set aside data...')
    #Yguess = classifier.predict(XcustomTest)
    #result = cross_validate(classifier, Xtrain, Ytrain,cv=10)
    #print(result)
    ########

    #print('Predicting...')
    #Yguess = classifier.predict(Xtest)
    """     '''
    Outputting in format required
    '''

    print('Outputting predictions...')

    outdir = '/Users/balinthompot/RUG/Honours/HateSpeech/offenseval-rug-master/Submission'
    fname = 'rug_fine_2.txt'

    with open(outdir + '/' + fname, 'w', encoding='utf-8') as fo:
        assert len(Yguess) == len(Xtest_raw), 'Unequal length between samples and predictions!'
        for idx in range(len(Yguess)):
            # print(Xtest_raw[idx] + '\t' + Yguess[idx] + '\t' + 'XXX', file=fo) # binary task (coarse)
            print(Xtest_raw[idx] + '\t' + 'XXX' + '\t' + Yguess[idx], file=fo) # multi task (fine)

    print('Done.')
    """
    return classifier
    def __init__(self):
        self.tknzr = TweetTokenizer()
import json
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import pyqtgraph as pg
import numpy as np
import re

name_cand1 = "Donald"
name_cand2 = "Hillary"

#Tags that correspond to each Candidate
tags_cand1 = {"trumpforpresident":None,"lasttimetrumppaidtaxes":None,"trump4president":None,"trumptaxes":None,"trump":None,"donaldtrump":None,"makeamericagreatagain":None,"trumptales":None}
tags_cand2 = {"hillaryclinton":None,"sheswithus":None,"hillaryforpresident":None,"hillary4president":None,"imwithher":None,"hillary":None,"clinton":None}


tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

#Adding stopwords to the pre-existing stopwords
stop = set(stopwords.words('english')+["rt",":","!","-","…","?","url",".","’","\n"])
        
tweetNum_cand1=0
tweetNum_cand2=0

ptweet_cand1=[]
ntweet_cand1=[]
ptweet_cand2=[]
ntweet_cand2=[]

#Load Positive Words
p = {}
with open("positive_words.txt") as f:
Example #20
def tknzrTweet(txt):
    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    return (tknzr.tokenize(txt))
Example #21
File: util.py Project: youikim/nltk
def demo_tweets(trainer, n_instances=None, output=None):
    """
    Train and test Naive Bayes classifier on 10000 tweets, tokenized using
    TweetTokenizer.
    Features are composed of:
        - 1000 most frequent unigrams
        - 100 top bigrams (using BigramAssocMeasures.pmi)

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total tweets that have to be used for
        training and testing. Tweets will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.tokenize import TweetTokenizer
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import twitter_samples, stopwords

    # Different customizations for the TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=False)
    # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
    # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ["id", "text"]
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = "positive_tweets.csv"
    json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = "negative_tweets.csv"
    json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances)

    neg_docs = parse_tweets_set(negative_csv,
                                label="neg",
                                word_tokenizer=tokenizer)
    pos_docs = parse_tweets_set(positive_csv,
                                label="pos",
                                word_tokenizer=tokenizer)

    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs + train_neg_docs
    testing_tweets = test_pos_docs + test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    # stopwords = stopwords.words('english')
    # all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords]
    all_words = [word for word in sentim_analyzer.all_words(training_tweets)]

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)

    # Add bigram collocation features
    bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats(
        [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12)
    sentim_analyzer.add_feat_extractor(extract_bigram_feats,
                                       bigrams=bigram_collocs_feats)

    training_set = sentim_analyzer.apply_features(training_tweets)
    test_set = sentim_analyzer.apply_features(testing_tweets)

    classifier = sentim_analyzer.train(trainer, training_set)
    # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print(
            "Your classifier does not provide a show_most_informative_features() method."
        )
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(
            output,
            Dataset="labeled_tweets",
            Classifier=type(classifier).__name__,
            Tokenizer=tokenizer.__class__.__name__,
            Feats=extr,
            Results=results,
            Instances=n_instances,
        )
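
A typical way to drive this demo (assumes the twitter_samples corpus is downloaded; the trainer argument is a classifier's train method):

from nltk.classify import NaiveBayesClassifier
demo_tweets(NaiveBayesClassifier.train, n_instances=8000)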
#from os import path
#from PIL import Image
import sys  # to handle emojis
import csv
import nltk
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk import FreqDist
#import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Instantiate a TweetTokenizer object
# strip_handles=True -> removes user mentions from the tweet
# reduce_len=True -> shortens repeated characters
# preserve_case=False -> lowercases all text
tweet_tokenizer = TweetTokenizer(strip_handles=True,
                                 reduce_len=True,
                                 preserve_case=False)

# to handle emojis
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

# DESIRED TOPIC
arquivo_csv = 'twitterData.csv'

# load the dataset and turn it into a DataFrame
dt_frame = pd.read_csv(arquivo_csv, encoding='UTF-8')

# put all tweets into a single text variable
text = ''
for val in dt_frame.tweet:
    text = text + val + ' '
Example #23
import nltk
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 

import re
import string

nltk.download('stopwords')
nltk.download('wordnet')

#Global variables

lemmatizer = WordNetLemmatizer() 

tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

#stemmer = PorterStemmer()
stemmer = SnowballStemmer("english") #snowball is faster, improvement of Porter stemmer

stop_words_en = set(stopwords.words('english')) 
stop_words_es = set(stopwords.words('spanish')) 
#excluded = set(string.punctuation.replace('#','').replace('@',''))

def readDataset(filename):
  df = pd.read_csv(filename)
  return df

def stopWordsTextEnglish(word_tokens):
  words_filtered_en = [w for w in word_tokens if not w in stop_words_en]
  return words_filtered_en
    parser.add_argument("-year",
                        type=int,
                        default=2014,
                        help="VQA dataset year (2014/2017)")

    args = parser.parse_args()

    print("Loading dataset...")
    trainset = VQADataset(args.data_dir, year=args.year, which_set="train")
    validset = VQADataset(args.data_dir, year=args.year, which_set="val")
    testdevset = VQADataset(args.data_dir,
                            year=args.year,
                            which_set="test-dev")
    testset = VQADataset(args.data_dir, year=args.year, which_set="test")

    tokenizer = TweetTokenizer(preserve_case=False)

    print("Loading glove...")
    with io.open(args.glove_in, 'r', encoding="utf-8") as f:
        vectors = {}
        for line in f:
            vals = line.rstrip().split(' ')
            vectors[vals[0]] = [float(x) for x in vals[1:]]

    print("Mapping glove...")
    glove_dict = {}
    not_in_dict = {}
    for _set in [trainset, validset, testdevset, testset]:
        for g in _set.games:
            words = tokenizer.tokenize(g.question)
            for w in words:
Example #25
        self.load_state_dict(torch.load(
            path, map_location=lambda storage, loc: storage)[args.model_name])
        print('Model Loaded')

    def num_all_params(self,):
        return sum([param.nelement() for param in self.parameters()])


df_noSPammer = pd.read_html('textMaliciousMark_Small_NotSpammer.html')
df_noSPammer = df_noSPammer[0][1:]
df_Spammer = pd.read_html('textMaliciousMark_Small_Spammer.html')
df_Spammer = df_Spammer[0][1:]
df = pd.concat([df_Spammer, df_noSPammer])
del df_noSPammer, df_Spammer

tknzr = TweetTokenizer()
ps = nltk.stem.PorterStemmer()


def preprocessingInputData(input):
    allText = [i for i in input]
    preprocessedText = [[ps.stem(word) for word in tknzr.tokenize(re.sub(r'\d+', '', re.sub(r"http\S+|www.\S+", lambda match: urlparse(match.group(
    )).netloc or match.group().split("/")[0], sentence)).lower()) if word not in nltk.corpus.stopwords.words('english') and len(word) >= 3] for sentence in allText]
    return preprocessedText


def mapFromWordToIdx(input):

    wholeText = []

    for s in input:
def clean_text(text,
               remove_punt_number_special_chars=False,
               remove_stopwords=False,
               apply_stemming=False):
    """Clean text
    Args:
        text: (str) Text
        remove_punt_number_special_chars: (bool) Remove punctuations, numbers and special characters
        remove_stopwords: (bool) Remove stopwords
        apply_stemming: (bool) Apply stemming on the words on the text
    """
    #Remove emojis
    text = re.sub(r":[a-zA-Z\-\_]*:", "",
                  emoji.demojize(text))  #:hear-no-evil_monkey:
    text = re.sub(r":\w+:", "", emoji.demojize(text))
    text = re.sub(r":\w+\’\w+:", "", emoji.demojize(text))  #:woman's_boot:

    #Remove mentions, usernames (@USER)
    text = re.sub(r"\s*@USER\s*", '', text)

    #Remove URL
    text = re.sub(r"\s*URL\s*", '', text)

    #And
    text = re.sub("&amp;", "and", text)
    text = re.sub("&lt;", "<", text)
    text = re.sub("&gt;", ">", text)
    text = re.sub("&", "and", text)

    #Replace contractions and slang of word
    text = re.sub("i'm", "I'm", text)
    text = contractions.fix(text, slang=True)

    #Lowercase
    text = text.lower()

    #Remove Hashtags + Words
    text = re.sub(r"#\s*\w+\s*", '', text)

    #Remove repeating whitespaces
    text = re.sub(r"\s{2,}", " ", text)

    #Remove non ascii characters
    text = text.encode("ascii", errors="ignore").decode()

    #Remove punctuations, numbers and special characters (remove emoticons)
    if remove_punt_number_special_chars:
        text = re.sub('[^a-zA-Z]', ' ', text)

    #Tokenize text
    tt = TweetTokenizer(preserve_case=False,
                        strip_handles=True,
                        reduce_len=True)

    text_tokens = tt.tokenize(text)

    #Remove stopwords
    if remove_stopwords:
        stopwords = set(STOPWORDS)
        text_tokens = [
            token for token in text_tokens if token not in stopwords
        ]

    #Stemming
    if apply_stemming:
        text_tokens = [stemmer.stem(token) for token in text_tokens]

    clean = " ".join(text_tokens)

    return clean
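
An example call under the default flags (STOPWORDS and stemmer are module-level names in the surrounding project, so the optional branches are left off):

print(clean_text("@USER I can't believe this!!! 😂 #wow URL"))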
def tokenize(s):
    return TweetTokenizer(preserve_case=False).tokenize(s)
Example #28
    options_file = 'data/elmo_2x4096_512_2048cnn_2xhighway_options.json'
    weight_file = 'data/elmo_2x4096_512_2048cnn_2xhighway_weights'
    encoding_type_map = {'lstm': 'lstm', 'linear': 'linear'}

    model_name = 'PathBasedGCN'
    if evaluation_mode:
        logger = config_logger('evaluation/' + model_name)
    else:
        logger = config_logger('PathBasedGCN')

    model_path = os.getcwd() + '/models/' + model_name + '/'
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    tokenize = TweetTokenizer().tokenize
    logger.info('Hop number is %s', hops)
    logger.info('Learning rate is %s', learning_rate)
    logger.info('Training epoch is %s', epochs)
    logger.info('Batch size is %s', batch_size)
    logger.info('Dropout rate is %f', dropout)
    logger.info('Encoding size for nodes and query feature is %s',
                encoding_size)
    query_feature_type = encoding_type_map['lstm']
    logger.info('Encoding type for query feature is %s', query_feature_type)
    dynamic_change_learning_rate = True
    logger.info('Is learning rate changing along with epoch count: %s',
                dynamic_change_learning_rate)

    # tf.reset_default_graph()
    tf.reset_default_graph()
Example #29
def tokenize(data):
    toked = []
    tknzr = TweetTokenizer()
    for d in data:
        toked.append(tknzr.tokenize(d))
    return toked
Example #30
#nltk.download('stopwords')
#nltk.download('wordnet')
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

vectorizer = CountVectorizer(min_df=0, stop_words=None, strip_accents='ascii')
wordnet_lemmatizer = WordNetLemmatizer()                            #Lemmatizer Object
stop_words = set(stopwords.words('english'))                        #stop words from english
porter_stemmer = PorterStemmer()                                    #Stemming Object
df=pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
corpus,real,rating,product=[],[],[],[]
tk = TweetTokenizer()                                               #Tokenising Object
dictionary = {}

#building corpus, tokenising the reviews, removing symbols
for i in df.index:
    if df['Department Name'][i]=='Jackets' and str(df['Recommended IND'][i])=='1':
        review = str(df['Review Text'][i])
        real.append(review)
        rating.append(str(df['Rating'][i]))
        product.append(str(df['Clothing ID'][i]))
        review = re.sub("[^A-Za-z']+",' ',review)                   #removing all symbols
        review = ' '.join(tk.tokenize(review))                      #tokenising
        corpus.append(review) 

print('The corpus for Jackets consists of '+str(len(corpus))+' reviews.')