Example #1
    def tokenize(self, tweet):
        tweet = remove_handles(tweet)

        # Replace markup and punctuation with spaces; '&' becomes the German 'und'.
        tweet = tweet.replace('&', ' und ')
        for mark in ('#', '<', '>', '|LBR|', '-', '_', "'s", ',', ';', ':', '/', '+'):
            tweet = tweet.replace(mark, ' ')

        tknzr = Tokenizer_NLTK(preserve_case=self.preserve_case,
                               reduce_len=True)

        if self.join:
            return " ".join(tknzr.tokenize(tweet))
        elif self.use_stemmer:
            stmmr = Stemmer_NLTK()
            return [stmmr.stem(token) for token in tknzr.tokenize(tweet)]
        else:
            return tknzr.tokenize(tweet)
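
The module-level names Tokenizer_NLTK, Stemmer_NLTK and remove_handles are not shown here; given the German ' und ' substitution, a plausible (assumed) import block for this snippet would be:

# Assumed imports for the snippet above; the aliases are guesses, not the source.
from nltk.tokenize.casual import remove_handles
from nltk.tokenize import TweetTokenizer as Tokenizer_NLTK
# The ' und ' replacement suggests German tweets, so a German Snowball stemmer
# is a reasonable guess for Stemmer_NLTK.
from nltk.stem.snowball import GermanStemmer as Stemmer_NLTK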
Example #2
def read_files(filemap, categories, stopwordsPunctuationList):
    """Reads every line in the files from get_filenames_in_folder(),
	tokenizes the lines and creates a bag of words from it with bag_of_words()"""
    feats = list()
    print("\n##### Reading files...")
    for category in categories:
        files = get_filenames_in_folder(filemap + '/' + category)
        random.shuffle(files)
        num_files = 0
        for f in files:
            if not f.startswith('.'):
                with open(filemap + '/' + category + '/' + f,
                          'r', encoding='UTF-8') as infile:
                    data = infile.read()
                for line in data.split('\n'):
                    line = remove_handles(line)
                    line = re.sub(r"http\S+", "", line)
                    line = tokenizer.tokenize(line)
                    line = ' '.join(line)
                    line = line.lower()
                    line = tknzr.tokenize(line)
                    line = ' '.join(line)

                    line = line.split()
                    # skip lines that are empty after splitting
                    if line:
                        filtered_tokens = [
                            w for w in line if not w.isdigit()
                            and w not in stopwordsPunctuationList
                            and w not in ['bitcoin', 'Bitcoin', 'btc', 'BTC']
                        ]
                        #print(filtered_tokens)
                        bag = bag_of_words(filtered_tokens)

                        feats.append((bag, category))
                        #print(feats)
                        #print len(filtered_tokens)
                        num_files += 1
                        if num_files >= 1000:  # roughly caps the entries per category so test runs finish faster; remove to load the whole collection
                            break

        print("  Category %s, %i files read" % (category, num_files))

    print("  Total, %i files read" % (len(feats)))
    return feats
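
The helpers get_filenames_in_folder and bag_of_words, and the module-level tokenizer/tknzr objects, are defined elsewhere in that project; a minimal sketch of what they plausibly look like (assumptions, not the original code):

import os
import random
import re
from nltk.tokenize.casual import TweetTokenizer, remove_handles

# Assumed module-level tokenizers used by read_files().
tokenizer = TweetTokenizer(strip_handles=True)
tknzr = TweetTokenizer(reduce_len=True)

def get_filenames_in_folder(folder):
    # List the file names inside one category folder.
    return os.listdir(folder)

def bag_of_words(words):
    # Classic NLTK-style feature dict: every token maps to True.
    return dict((word, True) for word in words)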
Example #3
    def tokenize(self, text):
        """
        :param text: str
        :rtype: list(str)
        :return: a tokenized list of strings;

        Normalizes URLs, usernames and word lengthening depending on the
        attributes of the instance.

        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)

        # Remove or replace username handles
        if self.strip_handles:
            text = remove_handles(text)
        elif self.normalize_usernames:
            text = normalize_mentions(text)

        if self.normalize_urls:
            # Shorten problematic sequences of characters
            text = normalize_urls(text)

        # Normalize word lengthening
        if self.reduce_len:
            text = HANG_RE.sub(r'\1\1\1', text)
            text = reduce_lengthening(text)

        # Tokenize:
        safe_text = HANG_RE.sub(r'\1\1\1', text)
        words = WORD_RE.findall(safe_text)

        # Possibly alter the case, but avoid changing emoticons like :D into :d:
        # lower words but keep words that are all upper cases
        if not self.preserve_case:
            words = [_lowerize(w, self.keep_allupper) for w in words]

        words = [_stock_code(w) for w in words]

        return words
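
normalize_mentions, normalize_urls, _lowerize and _stock_code are project-specific helpers rather than NLTK functions. Judging from the comments above, _lowerize presumably behaves roughly like the following sketch (an assumption, not the source code):

def _lowerize(word, keep_allupper):
    # Hypothetical reconstruction: leave emoticons untouched, optionally keep
    # all-uppercase tokens, and lowercase everything else.
    if EMOTICON_RE.search(word):
        return word
    if keep_allupper and word.isupper():
        return word
    return word.lower()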
Example #4
    def tokenize(self, text):
        """
        :param text: str
        :rtype: list(str)
        :return: a tokenized list of strings; concatenating this list returns\
        the original string if `preserve_case=False`
        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)
        # Remove username handles
        if self.strip_handles:
            text = remove_handles(text)
        # Normalize word lengthening
        if self.reduce_len:
            text = reduce_lengthening(text)
        # Shorten problematic sequences of characters
        safe_text = HANG_RE.sub(r"\1\1\1", text)
        # Tokenize, adding patterns for emote/mention-style tokens
        # (:name:, <:name:id>, <a:name:id>, <@id>) ahead of the standard REGEXPS:
        custom_Re = regex.compile(
            r"""(%s)"""
            % "|".join(
                (
                    r":[^:\s]+:",
                    r"<:[^:\s]+:[0-9]+>",
                    r"<a:[^:\s]+:[0-9]+>",
                    r"<(?:[^\d>]+|:[A-Za-z0-9]+:)\w+>",
                )
                + REGEXPS
            ),
            regex.VERBOSE | regex.I | regex.UNICODE,
        )
        words = custom_Re.findall(safe_text)
        # Possibly alter the case, but avoid changing emoticons like :D into :d:
        if not self.preserve_case:
            words = list(
                map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words)
            )
        return words
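
The extra alternatives placed ahead of NLTK's standard REGEXPS keep emote- and mention-style tokens such as :name:, <:name:id> and <a:name:id> intact instead of splitting them. A condensed stand-in illustrates the effect:

import regex

# Condensed stand-in for the added emote patterns (illustration only).
emote_re = regex.compile(r"<a?:[^:\s]+:[0-9]+>|:[^:\s]+:")
print(emote_re.findall("nice :thumbsup: <:kappa:1234> <a:party:5678>"))
# [':thumbsup:', '<:kappa:1234>', '<a:party:5678>']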
Example #5
def text_cleaner(text):
    # Hard-coded options: by default the cleaned text is returned as a plain string
    use_GermanStemmer = False
    tokens = False

    # Remove username handles (open question: are the user names needed at all?)
    text = remove_handles(text)

    # Remove punctuation marks
    text_blob = TextBlob(text)
    text = ' '.join(text_blob.words)

    # replace the umlauts
    # =============================================================================
    #         text = re.sub('ä', 'ae', text)
    #         text = re.sub('ö', 'oe', text)
    #         text = re.sub('ü', 'ue', text)
    #         text = re.sub('Ä', 'Ae', text)
    #         text = re.sub('Ö', 'Oe', text)
    #         text = re.sub('Ü', 'Ue', text)
    #         text = re.sub('ß', 'ss', text)
    # =============================================================================

    # remove the numbers
    text = re.sub(r'[0-9]+', '', text)

    # Keep only German letters and spaces (drops emojis, remaining punctuation, etc.)
    german_char = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZäöüÄÖÜ"
    text = ''.join(c for c in text if c in german_char)

    tokenizer = TweetTokenizer(preserve_case=True, reduce_len=True)
    if tokens:
        return tokenizer.tokenize(text)
    elif use_GermanStemmer:
        stemmer = GermanStemmer()
        return [stemmer.stem(token) for token in tokenizer.tokenize(text)]
    else:
        return text
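
The names this function relies on are not shown in the snippet; the imports below are an assumption about the surrounding module:

# Assumed imports for the snippet above (not shown in the source).
import re
from textblob import TextBlob
from nltk.tokenize.casual import TweetTokenizer, remove_handles
from nltk.stem.snowball import GermanStemmer

# Illustrative call: the handle, digits and non-German characters are stripped.
# text_cleaner("@nutzer Schöner Tag heute, 100% Sonne!")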
Example #6
def word_bagger(tlist):
    # Take out twitter handles
    newdoc = []
    for doc in tlist:
        newdoc.append(casual.remove_handles(doc))

    # Set up the count vectorizer: unigrams and bigrams, English stop words,
    # and only terms that appear in at least 4 documents
    counter = CountVectorizer(ngram_range=(1, 2), stop_words=en_stop, min_df=4)

    # Fit the model
    counts = counter.fit_transform(newdoc).toarray()

    # Summarize counts
    vocab = counter.get_feature_names()
    dist = np.sum(counts, axis=0)

    word_counts = []
    for tag, count in zip(vocab, dist):
        word_counts += [{'count': count, 'word': tag}]

    word_counts = pd.DataFrame.from_dict(word_counts)

    # Return a Pandas Dataframe
    return word_counts
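
A hypothetical setup and call for this function, under the assumption that en_stop is an English stop-word list defined at module level:

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import casual
from sklearn.feature_extraction.text import CountVectorizer

en_stop = stopwords.words('english')  # assumed stop-word list

tweets = [
    "@a bitcoin keeps climbing today",
    "@b bitcoin dipped but recovered",
    "@c still holding some bitcoin here",
    "@d sold all the bitcoin yesterday",
]
print(word_bagger(tweets))  # with min_df=4 only terms present in all four tweets survive

Note that CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out(), so the snippet assumes an older scikit-learn release.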
Example #7
def preprocess(tweet_text):
    return URL_RE.sub(' ', remove_handles(_replace_html_entities(tweet_text)))
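
URL_RE is a module-level pattern from the snippet's own project, not part of NLTK; with a stand-in definition the one-liner can be exercised like this:

import re
from nltk.tokenize.casual import remove_handles, _replace_html_entities

# Stand-in for the module-level URL_RE used above (an assumption, not the source).
URL_RE = re.compile(r"http\S+")

print(URL_RE.sub(' ', remove_handles(_replace_html_entities("@user see https://t.co/x &amp; more"))))
# handle and URL replaced with whitespace, '&amp;' decoded to '&'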
Example #8
def main():
    stopwordsPunctuationList = stopwords.words('english')
    stopwordsPunctuationList.extend(string.punctuation)
    stopwordsPunctuationList.append('not')
    categories = ['positive', 'negative']
    trainmap = 'trainset'
    feats = read_files(trainmap, categories, stopwordsPunctuationList)

    #print(feats)

    high_info_words = high_information(feats, categories)

    high_info_feats = filter_high_information_words(feats, high_info_words)

    train_feats, test_feats = split_train_test(high_info_feats)

    all_words = []
    for f in feats:
        for word in f[0].keys():
            all_words.append(word)

    frequency = nltk.FreqDist(all_words)
    word_features = frequency.keys()

    #print(word_features)
    # https://www.youtube.com/watch?annotation_id=annotation_3385405775&feature=iv&index=12&list=PLQVvvaa0QuDf2JswnfiGkliBInZnIC4HL&src_vid=zi16nl82AMA&v=-vVskDsHcVc
    def find_features(document):
        words = set(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)
        return features

    train_set = nltk.classify.apply_features(find_features, high_info_feats)
    nb_classifier = nltk.NaiveBayesClassifier.train(train_set)

    testmap = 'testset'
    #test_feats = read_files(testmap,categories,stopwordsPunctuationList)
    test_set = nltk.classify.apply_features(find_features, test_feats)

    accuracy_score = evaluation(nb_classifier, test_set, categories)
    analysis(nb_classifier)

    research_data = [
        'research_data/07-06-2018.csv', 'research_data/08-06-2018.csv',
        'research_data/09-06-2018.csv', 'research_data/10-06-2018.csv',
        'research_data/11-06-2018.csv', 'research_data/12-06-2018.csv',
        'research_data/13-06-2018.csv', 'research_data/14-06-2018.csv',
        'research_data/15-06-2018.csv', 'research_data/16-06-2018.csv',
        'research_data/17-06-2018.csv', 'research_data/18-06-2018.csv',
        'research_data/19-06-2018.csv', 'research_data/20-06-2018.csv',
        'research_data/21-06-2018.csv', 'research_data/22-06-2018.csv',
        'research_data/23-06-2018.csv'
    ]

    for file in research_data:
        positives, negatives = 0, 0
        with open(file, 'r', encoding='UTF-8') as f:
            f_list = list(f)
            for line in f_list:
                line = line.strip()
                #str_list = list(filter(None,line))
                line = line.strip('\n')
                line = remove_handles(line)
                line = re.sub(r"http\S+", "", line)
                line = re.sub(r'\d+', '', line)
                line = tokenizer.tokenize(line)
                line = ' '.join(line)
                line = line.lower()
                line = tknzr.tokenize(line)
                line = [
                    w for w in line if w not in stopwordsPunctuationList
                    and w not in ['rt', 'btc', 'bitcoin']
                ]
                line = ' '.join(line)

                result = nb_classifier.prob_classify(
                    find_features(line.split()))
                if result.max() == 'positive' and result.prob('positive') >= .7:
                    positives += 1
                elif result.max() == 'negative' and result.prob('negative') >= .7:
                    negatives += 1
                #break
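        # note: `result` below holds the probabilities of the last tweet processed in this file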
        print(
            'Data: {} Positives: {} Negatives: {} Confidence Positive: {} Confidence Negative: {}'
            .format(file, positives, negatives, result.prob('positive'),
                    result.prob('negative')))
Example #9
def preprocess(lines,
               dataset,
               stem=True,
               tokenize=True,
               removeHandles=True,
               removeLinks=True,
               removeEmojis=True,
               removeSymbols=True,
               replaceSlang=True,
               verbose=True):

    """Preprocess raw lines from the dataset file.

    Returns a list with one preprocessed tweet per input line
    (a cleaned string, or a list of tokens if tokenize=True).
    """

    tweets = []

    for i, line in enumerate(lines):

        if dataset == 'DisasterTweet':
            tweetElements = line.strip().split(",")[:-4]

            # Strip whitespace from each field and drop empty fields
            # (filtering avoids mutating the list while iterating over it).
            tweetElements = [e.strip() for e in tweetElements]
            tweetElements = [e for e in tweetElements if e != '']

            tweet = ",".join(tweetElements)

        elif dataset == 'cf10k':
            tweet = line

        elif dataset == 'cr26':
            tweet = line

        else:
            raise Exception('No dataset selected')

        if verbose:
            print('original:', tweet)

        if removeHandles:
            tweet = remove_handles(tweet)

            if verbose:
                print('removeHandles:', tweet)

        if removeLinks:
            tweet = remove_links(tweet)

            if verbose:
                print('removeLinks:', tweet)

        if stem:
            tweet = [
                stemmer.stem(word) for word in tweet.split()
                if (word not in symbols)
            ]
            tweet = " ".join(tweet)

            if verbose:
                print('stem:', tweet)

        if tokenize:
            tweet = tk.tokenize(tweet)

            if verbose:
                print('tokenize', tweet)

        if removeEmojis:
            tweet = remove_emojis(tweet)

            if verbose:
                print('removeEmojis:', tweet)

        if removeSymbols:
            # Filter instead of removing in place (removing while iterating
            # skips adjacent symbols); assumes `tweet` is a token list here.
            tweet = [w for w in tweet if w not in symbols]

            if verbose:
                print('removeSymbols:', tweet)

        if replaceSlang:

            if type(tweet) == str:
                tweet = tk.tokenize(tweet)

            tweet = replace_slang(tweet)

            if verbose:
                print('replaceSlang:', tweet)

        # ---
        if verbose:
            print('\n')
        # ---

        tweets.append(tweet)

    return tweets
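
The module-level objects this function expects (stemmer, tk, symbols) and the helpers remove_links, remove_emojis and replace_slang are defined elsewhere in that project; a hypothetical setup for the standard pieces:

import string
from nltk.stem.porter import PorterStemmer
from nltk.tokenize.casual import TweetTokenizer, remove_handles

stemmer = PorterStemmer()             # assumed stemmer used in the stem step
tk = TweetTokenizer(reduce_len=True)  # assumed tokenizer used in the tokenize step
symbols = set(string.punctuation)     # assumed symbol set filtered from tokens
# remove_links, remove_emojis and replace_slang are project-specific helpers
# that are not reproduced here.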