Example #1
 def __init__(
     self,
     token_mapping: Mapping[str, int],
     preserve_case: bool = False,
 ):
     self._token_mapping = token_mapping
     self._tokenizer = TweetTokenizer(preserve_case=preserve_case)
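A minimal usage sketch for a wrapper like this; the class name TweetEncoder and the encode method are hypothetical, only token_mapping and preserve_case come from the snippet above:

from typing import Mapping
from nltk.tokenize import TweetTokenizer

class TweetEncoder:
    # Hypothetical host class for the __init__ shown above.
    def __init__(self, token_mapping: Mapping[str, int], preserve_case: bool = False):
        self._token_mapping = token_mapping
        self._tokenizer = TweetTokenizer(preserve_case=preserve_case)

    def encode(self, text: str, unk_id: int = 0):
        # Map each token to its integer id, falling back to unk_id.
        return [self._token_mapping.get(tok, unk_id)
                for tok in self._tokenizer.tokenize(text)]

encoder = TweetEncoder({"hello": 1, "world": 2})
print(encoder.encode("Hello world"))  # -> [1, 2] (lowercased because preserve_case=False)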
Example #2
def preprocess_tweet(tweet):
    """
    Preprocess the input tweet.

    Preprocessing steps:
        1. Lowercase the letters
        2. Reduce characters repeated more than three times in a word to three
        3. Replace URLs with the tag <URLURL>
        4. Replace username mentions with the tag <UsernameMention>


    @TODO:
        1. Look for better preprocessing methods on the web
        2. Apply here
    """
    clean_tweet = tp.clean(tweet)

    # perform lemmatization
    tokenizer = TweetTokenizer()
    tweet_tokens = tokenizer.tokenize(clean_tweet)

    lemmatized_tweet = lemmatize_tweet(tweet_tokens)

    # remove stopwords
    preprocessed_tweet = remove_stopwords(lemmatized_tweet)
    return preprocessed_tweet
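The function above relies on a tp module (presumably the tweet-preprocessor package) and two helpers that are not shown; a plausible sketch of those helpers, as an assumption rather than the original code:

import preprocessor as tp                      # tweet-preprocessor package, imported as tp
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()
_stopword_set = set(stopwords.words('english'))

def lemmatize_tweet(tweet_tokens):
    # Lemmatize each token (noun POS by default).
    return [_lemmatizer.lemmatize(tok) for tok in tweet_tokens]

def remove_stopwords(tokens):
    # Drop English stopwords and re-join the remaining tokens.
    return ' '.join(tok for tok in tokens if tok.lower() not in _stopword_set)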
def tokenize_comments(base_dir, comments_file, hashh=None):
    tkd_data = None

    if hashh:
        tkd_data = load_cached_data(hashh)

    if tkd_data is None:
        hash_f = get_cache_path(hashh)
        with open(hash_f, 'wb') as pkl_f:
            tkd_data = defaultdict(dict)
            tk = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)
            for i, (root, dirs, files) in enumerate(os.walk(base_dir)):
                if comments_file in files:
                    project = root.split('/')[-1]
                    print('Processing %s, number %d' % (project, i))
                    posts = []
                    with open(os.path.join(root, comments_file), 'r') as inf:
                        r = csv.DictReader(inf)
                        for row in r:
                            p = post(' '.join(list(tk.tokenize(row['body']))),
                                     row['login'],
                                     row['mention_login'],
                                     row['issue_num'],
                                     row['datetime'],
                                     project)
                            posts.append(p)

                    tkd_data[project] = posts
            pickle.dump(tkd_data, pkl_f)

    return tkd_data
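post, get_cache_path, and load_cached_data are not shown above; one plausible shape for them (the field names mirror the constructor call, the cache layout is an assumption):

import os
import pickle
from collections import namedtuple

post = namedtuple('post', ['body', 'login', 'mention_login',
                           'issue_num', 'datetime', 'project'])

def get_cache_path(hashh):
    # Cache file keyed by the hash of the input data.
    return os.path.join('cache', '%s.pkl' % hashh)

def load_cached_data(hashh):
    path = get_cache_path(hashh)
    if os.path.exists(path):
        with open(path, 'rb') as pkl_f:
            return pickle.load(pkl_f)
    return None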
 def __init__(self, source_vocabulary, target_vocabulary, max_source_length,
              max_target_length):
     self.source_vocabulary = source_vocabulary
     self.target_vocabulary = target_vocabulary
     self.max_source_length = max_source_length
     self.max_target_length = max_target_length
     self.tokenizer = TweetTokenizer()
Example #6
def data_processing(df):
    t = TweetTokenizer()
    emotions = [
        'anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative',
        'positive', 'sadness', 'surprise', 'trust'
    ]

    df['translated_full_text'] = df['translated_full_text'].astype(str).apply(
        remove_links)
    df['cleaned_text'] = df['translated_full_text'].astype(str).apply(
        style_text)
    df['cleaned_text'] = df['cleaned_text'].astype(str).apply(
        lambda x: remove_words(x.split(), stopcorpus))
    df['cleaned_text'] = df['cleaned_text'].apply(collapse_list_to_string)
    df['cleaned_text'] = df['cleaned_text'].astype(str).apply(
        remove_apostrophes)
    df['tokenized_sents'] = df.apply(
        lambda row: t.tokenize(row['cleaned_text']), axis=1)
    df['word_count'] = df.apply(lambda row: len(row['tokenized_sents']),
                                axis=1)
    df = df[df.word_count > 0]

    df = text_emotion(df)

    for emotion in emotions:
        df[emotion] = df[emotion] / df['word_count']

    date = datetime.datetime.strptime(df['created_at'].min(),
                                      '%Y-%m-%d %H:%M:%S').date()

    df.to_pickle(str(date) + ".pickle")
    df.to_excel(str(date) + ".xlsx")

    return
Example #7
def nltk_tweet_tokenizer(s, **tokenizer_kwargs):
    """NTLK TweetTokenizer"""
    kwargs = dict(strip_handles=False, reduce_len=True)
    kwargs.update(**tokenizer_kwargs)
    tokenizer = TweetTokenizer(**kwargs)
    token_list = tokenizer.tokenize(s)
    return token_list
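A quick usage sketch for the wrapper above (the example strings are made up; outputs are approximate):

tokens = nltk_tweet_tokenizer("@user this is soooooo coool")
print(tokens)   # e.g. ['@user', 'this', 'is', 'sooo', 'coool'] (reduce_len squashes long runs)

tokens = nltk_tweet_tokenizer("@user hi", strip_handles=True)
print(tokens)   # e.g. ['hi'] (the default strip_handles=False was overridden)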
def parse_XML(file):
    text = ''
    continuation = False
    for line in file:
        #Searches if it's a line with a tweet
        if not continuation:
            groups = re.search(r'\[CDATA\[(.*)\]\]>', line)

            if groups is not None:
                #Extracts the tweet and lowercases it
                tweet = groups.group(1)
                tweet = tweet.lower()

                #Tokenizes the tweet
                tokens = TweetTokenizer().tokenize(tweet)
                tokenized = ' '.join(s.encode('ascii', 'ignore').decode('ascii') for s in tokens)
                text = text + '\n' + tokenized
            else:
                groups = re.search(r'\[CDATA\[(.*)', line)
                if groups is not None:
                    temp = groups.group(1)
                    continuation = True
        else:
            groups = re.search(r'(.*)\]\]>', line)
            if groups is not None:
                tweet = temp + ' ' + groups.group(1)
                tweet = tweet.lower()
                tokens = TweetTokenizer().tokenize(tweet)
                tokenized = ' '.join(s.encode('ascii', 'ignore').decode('ascii') for s in tokens)
                text = text + '\n' + tokenized
                continuation = False
            else:
                temp = temp + ' ' + line.rstrip()

    return text[1:]
Example #9
    def tokenize(self, text):
        #Make a list where each word is an element, text_list = text.split(' ')
        #Lemmatize each word. Exception: We want "better" to become its lemma "good" but "best" should stay "best".
            #There are nltk methods for this. Look at https://www.youtube.com/watch?v=uoHVztKY6S4
        #Remove the articles 'a', 'an', 'the'
        #Also split on punctuation marks so that "I like, fish" becomes ['I', 'like', ',', 'fish'] = token_list

        tweettokenizer = TweetTokenizer()
        lemmatizer = WordNetLemmatizer()
        token_list = tweettokenizer.tokenize(text)

        # Drop the articles 'a', 'an', 'the'.
        token_list = [tok for tok in token_list if tok not in ('a', 'an', 'the')]

        pos_list = pos_tag(token_list)
        pos_listwordnet = [(word[0], self.get_wordnet_pos(word[1])) for word in pos_list]

        for i in range(len(token_list)):
            token_list[i] = lemmatizer.lemmatize(token_list[i], pos=pos_listwordnet[i][1])
        if len(token_list) == 1:
            token_list.append('.')

        return token_list
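The method above calls self.get_wordnet_pos, which is not shown; a common implementation maps Penn Treebank tags from pos_tag to WordNet POS constants, defaulting to nouns (this version is an assumption):

from nltk.corpus import wordnet

def get_wordnet_pos(self, treebank_tag):
    # Map Penn Treebank tags to the POS constants WordNetLemmatizer expects.
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # noun is also WordNetLemmatizer's default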
Example #10
def parse_data_iterator(vocab, filename, delimiter=",", steps=10):
    vocab.add_word('</s>')
    file = open(filename, 'r')
    reader = csv.reader(
        file,
        delimiter=delimiter,
    )
    headers = next(reader)
    list_of_train = []
    tokenizer = TweetTokenizer(preserve_case=False,
                               strip_handles=True,
                               reduce_len=False)
    for row in reader:
        curr = []
        encoded = []
        label = [row[1]]
        if int(row[1]) == 0:
            label.append(1)
        else:
            label.append(0)
        words = tokenizer.tokenize(" ".join(row[3:]))
        for i in range(steps):
            if i < len(words):
                # Canonicalize the token and register it in the vocabulary.
                words[i] = canon_word(str(words[i]))
                vocab.add_word(words[i])
                curr.append(words[i])
            else:
                curr.append('</s>')
        for word in curr:
            encoded.append(vocab.encode(word))
        yield label, curr
Example #11
def tokenize(sents: list):
    tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
    sents_tok = []
    for sent in sents:
        tokens = [token for token in tokenizer.tokenize(sent) if not token.startswith('http')]
        sents_tok.append(' '.join(tokens))
    return sents_tok
Example #12
    def __init__(self):
        with open('model/tokenizer.pickle', 'rb') as handle:
            self.tokenizer = pickle.load(handle)

        with open('model/label_encoder', 'rb') as handle:
            self.y_enc = pickle.load(handle)

        self.tweeter = TweetTokenizer()
        self.lemma = WordNetLemmatizer()
        self.vocab_size = len(self.tokenizer.word_index) + 1

        self.model = tf.keras.Sequential([
            tf.keras.layers.Embedding(self.vocab_size, 50, mask_zero=True),
            tf.keras.layers.Dropout(0.4),
            tf.keras.layers.Bidirectional(
                tf.keras.layers.LSTM(1024, return_sequences=True)),
            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1024)),
            tf.keras.layers.Dropout(0.4),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(len(self.y_enc.classes_),
                                  activation='softmax')
        ])

        self.model.load_weights('model/chatbot')

        self.responses = self._load_responses()
Example #13
 def tweet_clean(self, tweet):
     # Remove tickers
     sent_no_tickers = re.sub(r'\$\w*', '', tweet)
     tw_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
     temp_tw_list = tw_tokenizer.tokenize(sent_no_tickers)
     # Remove stopwords
     list_no_stopwords = [
         i for i in temp_tw_list
         if i.lower() not in self._cache_english_stopwords
     ]
     # Remove hyperlinks
     list_no_hyperlinks = [
         re.sub(r'https?:\/\/.*\/\w*', '', i) for i in list_no_stopwords
     ]
     # Remove hashtags
     list_no_hashtags = [re.sub(r'#', '', i) for i in list_no_hyperlinks]
     # Remove Punctuation and split 's, 't, 've with a space for filter
     list_no_punctuation = [
         re.sub(r'[' + string.punctuation + ']+', ' ', i)
         for i in list_no_hashtags
     ]
     # Remove multiple whitespace
     new_sent = ' '.join(list_no_punctuation)
     # Remove any words with 2 or fewer letters
     filtered_list = tw_tokenizer.tokenize(new_sent)
     list_filtered = [re.sub(r'^\w\w?$', '', i) for i in filtered_list]
     filtered_sent = ' '.join(list_filtered)
     cleaned_tweet = re.sub(r'\s\s+', ' ', filtered_sent)
     # Remove any whitespace at the front of the sentence
     cleaned_tweet = cleaned_tweet.lstrip(' ')
     return cleaned_tweet
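The only instance state tweet_clean touches is self._cache_english_stopwords; a hedged setup sketch assuming it is NLTK's English stopword list cached on the instance:

from nltk.corpus import stopwords

class TweetCleaner:
    # Hypothetical host class; tweet_clean as defined above would live here.
    def __init__(self):
        self._cache_english_stopwords = set(stopwords.words('english'))

# cleaner = TweetCleaner()
# cleaner.tweet_clean("$TSLA Check https://t.co/abc123 #markets with @someone")
# -> roughly "Check markets"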
Example #14
def get_train_test_data(find_and_concatenate_expressions=False):
    def remove_url(tokens):
        tokens = filter(lambda x: "http" not in x, tokens)
        return list(tokens)

    def remove_hashtags(tokens):
        tokens = map(lambda x: x.replace('#', ''), tokens)
        return list(tokens)

    db = pd.read_excel("Classeur1.xlsx", encoding="utf-8")
    dict_values = {'Not Relevant': -1, 'Relevant': 1, "Can't Decide": 0}
    db["to_predict"] = db.choose_one.map(dict_values)
    db = db[["text", "to_predict"]]
    twtk = TweetTokenizer(preserve_case=False,
                          reduce_len=True,
                          strip_handles=True)
    db["token_retreated_text"] = db["text"].apply(
        lambda x: remove_hashtags(remove_url(twtk.tokenize(x))))
    db["retreated_text"] = db["token_retreated_text"].apply(
        lambda x: " ".join(x))

    if find_and_concatenate_expressions:
        db["token_retreated_text"] = clean_corpus(db["retreated_text"])
        db["retreated_text"] = db["token_retreated_text"].apply(
            lambda x: " ".join(x))

    msk = np.random.rand(len(db)) < 0.8
    train = db[msk]
    test = db[~msk]

    return train, test
Example #15
 def set_params(self, **parameters):
     """Set the params"""
     for parameter, value in parameters.items():
         setattr(self, '_{}'.format(parameter), value)
     self._tokenizer = TweetTokenizer(preserve_case=False,
                                      reduce_len=True,
                                      strip_handles=True)
def tokenize(s):
    sentence_splitter = TweetTokenizer()
    tokens = sentence_splitter.tokenize(s)
    result = []
    for word in tokens:
        result.append(unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('ascii'))

    return result
Example #17
 def __init__(self, data_loader):
     self.data = data_loader
     self.tokenizer = TweetTokenizer()
     self.stemmer = PorterStemmer()
     self.stopwords = stopwords.words('english')
     self.re_url = r'http\S+'
     self.punctuation = string.punctuation
     self.vocab = defaultdict(set)
Example #18
 def preprocess(comments, preprocessors):
     tokenizer = TweetTokenizer()
     html_cleaner = re.compile('<.+?>')
     for comment in comments:
         comment = html_cleaner.sub('', comment)
         tokenized_comment = tokenizer.tokenize(comment)
         for preprocessor in preprocessors:
             tokenized_comment = preprocessor.optimize(tokenized_comment)
         yield tokenized_comment
Example #20
 def __init__(self, tokenizer="tweet", punctuation=True, verbose=1):
     self.contextualizer = Contextualizer()
     self.corrector = Corrector(word2index=self.contextualizer.word2index,
                                index2word=self.contextualizer.index2word)
     self.tokenizer_type = tokenizer
     self.keep_punctuation = punctuation
     if self.tokenizer_type == "tweet":
         self.tokenizer = TweetTokenizer()
     self.verbose = verbose
Example #21
def tokeniza(chars, keyword=None):
    """
    Tokenize a string (duplicates keywords if any)
    """
    tokenizer = TweetTokenizer(preserve_case=False,
                               strip_handles=True,
                               reduce_len=True)
    # NOTE: the keyword argument is accepted but not used in this snippet.
    tokens = tokenizer.tokenize(chars)
    return tokens
Example #22
def modify_abbrev(tweet):
    tokenizer = TweetTokenizer(preserve_case=False,
                               strip_handles=True,
                               reduce_len=True)
    tokens = tokenizer.tokenize(tweet)
    # Expand known abbreviations, leaving unknown tokens unchanged.
    tokens = [abbreviations.get(w.lower(), w) for w in tokens]
    text = ' '.join(tokens)
    return text
Example #23
def clear_data(tweet):
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(tweet)
    clean_tweet = tokens \
            | remove_urls \
            | process_hashtags \
            | remove_stopwords \
            | remove_numbers \
            | remove_multiple_occurrence
    return ' '.join(clean_tweet)
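The | chaining above implies that remove_urls, process_hashtags, and the other steps are pipe-style callables (for example built with the pipe library or a tiny helper); the sketch below shows one way such steps could be defined, as an assumption rather than the original code:

class Pipe:
    # Minimal wrapper so a plain function can be used on the right of `|`.
    def __init__(self, func):
        self.func = func
    def __ror__(self, iterable):
        return self.func(iterable)

@Pipe
def remove_urls(tokens):
    # Drop tokens that look like URLs.
    return [t for t in tokens if not t.startswith(('http://', 'https://'))]

@Pipe
def remove_numbers(tokens):
    return [t for t in tokens if not t.isdigit()]

print(['hello', 'http://x.io', '42'] | remove_urls | remove_numbers)  # ['hello']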
Example #24
def preprocess(text):
    tokenizer = TweetTokenizer()

    # Remove stopwords.
    tokens = tokenizer.tokenize(text)
    tokens = [
        token for token in tokens
        if token not in ENGLISH_STOPWORDS and token.isalpha()
    ]
    return tokens
Example #25
def tokenize(text):
    tweet_tokenizer = TweetTokenizer()
    # 1. Tokenize
    text = tweet_tokenizer.tokenize(text)
    # 2. Cleaning
    # Punctuation
    text = [t for t in text if t not in string.punctuation]
    # Normalize (lowercase)
    text = [t.lower() for t in text]
    return text
Example #26
def process_tweet_text(tweet):
    if tweet.startswith('@null'):
        return "[Tweet not available]"
    tweet = re.sub(r'\$\w*', '', tweet)  # Remove tickers
    tweet = re.sub(r'https?:\/\/.*\/\w*', '', tweet)  # Remove hyperlinks
    tweet = re.sub(r'[' + string.punctuation + ']+', ' ', tweet)  # Remove punctuation like 's
    twtok = TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = twtok.tokenize(tweet)
    tokens = [i.lower() for i in tokens
              if i not in stopwords and len(i) > 2 and i in english_vocab]
    return tokens
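The snippet above assumes module-level stopwords and english_vocab collections; a plausible setup using NLTK's corpora (an assumption, not the original source):

from nltk.corpus import stopwords as nltk_stopwords
from nltk.corpus import words as nltk_words

stopwords = set(nltk_stopwords.words('english'))
english_vocab = set(w.lower() for w in nltk_words.words())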
Example #27
def normalize_messages(messages):
    tokenizer = TweetTokenizer(preserve_case=False)
    normalized_messages = []
    for message in messages:
        try:
            tokens = tokenizer.tokenize(message)
            text = [word.lower() for word in Text(tokens)]
            if text:
                normalized_messages.append(text)
        except TypeError:
            pass
    return normalized_messages
Example #28
 def __init__(self, preserve_case: Boolean(), reduce_len: Boolean(),
              strip_handles: Boolean()):
     self.preserve_case = preserve_case
     self.reduce_len = reduce_len
     self.strip_handles = strip_handles
     NltkTokenizer.__init__(self)
     _TweetTokenizer.__init__(
         self,
         preserve_case=preserve_case,
         reduce_len=reduce_len,
         strip_handles=strip_handles,
     )
Example #29
def create_matrix(tweets: List, name: str = 'oscar pistorius') -> csr_matrix:
    matrix_loc = Path('data', name, 'tf_idf_matrix.pickle')

    if matrix_loc.exists():
        logger.info("Matrix exists! loading...")
        with matrix_loc.open('rb') as f:
            matrix = pickle.loads(f.read())
            return matrix

    stemmer = PorterStemmer()
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)

    texts = []
    for tweet in tqdm(tweets, desc="(create_matrix) iterating over tweets..."):
        text = tweet.text

        tokens = tokenizer.tokenize(text)
        text_proc = []
        for token in tokens:
            token = token.strip()
            if len(token) < 3:
                continue
            elif token in stopwords.words('english'):
                continue
            elif nlp_utils.match_url(token):
                continue
            elif token in string.punctuation:
                continue
            # elif token.startswith(("#", "$")):
            #     continue

            token = token.translate({ord(k): "" for k in string.punctuation})
            token = stemmer.stem(token)

            token = token.strip()
            if token == "":
                continue

            text_proc.append(token)

        texts.append(text_proc)

    vectorizer = TfidfVectorizer(analyzer="word",
                                 tokenizer=lambda x: x,
                                 lowercase=False)
    m = vectorizer.fit_transform(texts)

    logger.info("Saving computed matrix...")
    with matrix_loc.open('wb') as f:
        f.write(pickle.dumps(m))

    return m
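The vectorizer call above passes already-tokenized documents straight through with an identity tokenizer; a minimal standalone illustration of that trick (sample documents made up):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = [['run', 'fast'], ['run', 'slow', 'run']]  # pre-tokenized documents
vec = TfidfVectorizer(analyzer="word", tokenizer=lambda x: x, lowercase=False)
m = vec.fit_transform(docs)
print(m.shape)  # (2, 3) -- one row per document, one column per token type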
Example #30
def tokenize_text(text):
    """
    Transforms the given text into tokens using the Twitter tokenizer.
    @params: str
        Input text to tokenize
    @returns: list(str)
        Returns the tokens as a list of strings.
    """
    tokenizer = TweetTokenizer()
    # tokenizing the text
    tokens = tokenizer.tokenize(text)
    words = [w.lower() for w in tokens]
    return words
Example #31
def compare(topicsFileName, headlinesFileName):
    """
    This function compares a set of detected trending topics to a list of headlines in the JSON format provided by NewsAPI.

    A detected trending topic is considered to match a headline if their term overlap covers at least 40% of the smaller of the two term sets.

    It returns the list of trending topics that are included in the provided headlines, as well as:
    recall: number of matching topics divided by number of headlines
    precision: number of matching topics divided by number of detected topics
    """

    # load topics from file
    topics = []
    with open(topicsFileName, 'r', encoding='utf-8') as tf:
        topics = [json.loads(line) for line in tf]

    # load headlines from file
    headlines = []
    with open(headlinesFileName, 'r', encoding='utf-8') as hf:
        headlines = [json.loads(line) for line in hf]

    # prepare stemmer and tokenizer
    stemmer = PorterStemmer(mode=PorterStemmer.MARTIN_EXTENSIONS)
    tokenizer = TweetTokenizer()

    # compare every topic with every headline
    matchingTopics = []
    for tIter, topic in enumerate(topics):
        print('\r', tIter + 1, len(topics), end='', file=sys.stderr)
        for headline in headlines:
            # split headline title (rather than description) into stemmed terms
            if 'title' not in headline or headline['title'] is None or len(
                    headline['title']) == 0:
                continue
            usedText = headline['title']
            headlineTerms = [
                stemmer.stem(term) for term in tokenizer.tokenize(usedText)
                if term not in stopwords.stopwords + stopwords.moreStopwords
            ]

            # check for inclusion of topic in headline
            if len(set(topic['terms'].keys()) & set(headlineTerms)
                   ) >= 0.4 * min(len(set(headlineTerms)),
                                  len(set(topic['terms'].keys()))):
                matchingTopics.append(topic)
                break
    print(file=sys.stderr)

    precision = len(matchingTopics) / len(topics)
    recall = len(matchingTopics) / len(headlines)
    return matchingTopics, recall, precision
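A small worked illustration of the 40% inclusion rule used above (toy term sets, not from the original data):

topic_terms = {'wildfir', 'evacu', 'california'}                         # stemmed topic terms
headline_terms = {'california', 'wildfir', 'forc', 'evacu', 'thousand'}  # stemmed headline terms

overlap = topic_terms & headline_terms                        # 3 shared terms
threshold = 0.4 * min(len(headline_terms), len(topic_terms))  # 0.4 * 3 = 1.2
print(len(overlap) >= threshold)                              # True -> the topic matches the headline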
Example #32
def preprocess(text, sentiments, w2i, maxlen, shuffle=True):
    tokenizer = TweetTokenizer()

    reviews = []
    for t in text:
        tokens = list(tokenizer.tokenize(t))
        token_idx = convert_str_to_idx(tokens, w2i, maxlen)
        reviews.append(token_idx)

    txt, sents = torch.LongTensor(reviews), torch.FloatTensor(sentiments)
    if shuffle:
        txt, sents = _shuffle(txt, sents)

    return txt, sents.unsqueeze(1)
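convert_str_to_idx and _shuffle are not shown; a plausible version of the former that truncates or pads to maxlen (the '<pad>'/'<unk>' keys and the zero fallback are assumptions):

def convert_str_to_idx(tokens, w2i, maxlen, pad_token='<pad>', unk_token='<unk>'):
    # Map tokens to vocabulary indices, truncating to maxlen and padding the rest.
    idx = [w2i.get(tok, w2i.get(unk_token, 0)) for tok in tokens[:maxlen]]
    idx += [w2i.get(pad_token, 0)] * (maxlen - len(idx))
    return idx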
Example #33
 def __init__(self, preserve_case=True,
             strip_handles=True, reduce_len=True):
     # Forward by keyword so strip_handles and reduce_len keep their meaning.
     TweetTokenizer.__init__(self, preserve_case=preserve_case,
             strip_handles=strip_handles, reduce_len=reduce_len)