def parse_XML(file):
    text = ''
    temp = ''
    continuation = False
    tokenizer = TweetTokenizer()
    for line in file:
        # Check whether this line contains a complete tweet
        if not continuation:
            groups = re.search(r'\[CDATA\[(.*)\]\]>', line)

            if groups is not None:
                # Extract the tweet and lowercase it
                tweet = groups.group(1)
                tweet = tweet.lower()

                # Tokenize the tweet
                tokens = tokenizer.tokenize(tweet)
                tokenized = ' '.join(s.encode('ascii', 'ignore').decode('ascii') for s in tokens)
                text = text + '\n' + tokenized
            else:
                # The tweet starts on this line and continues on the next one(s)
                groups = re.search(r'\[CDATA\[(.*)', line)
                if groups is not None:
                    temp = groups.group(1)
                    continuation = True
        else:
            # Look for the closing ]]> of a multi-line tweet
            groups = re.search(r'(.*)\]\]>', line)
            if groups is not None:
                tweet = temp + ' ' + groups.group(1)
                tweet = tweet.lower()
                tokens = tokenizer.tokenize(tweet)
                tokenized = ' '.join(s.encode('ascii', 'ignore').decode('ascii') for s in tokens)
                text = text + '\n' + tokenized
                continuation = False
            else:
                temp = temp + ' ' + line.rstrip()

    return text[1:]
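A minimal usage sketch for parse_XML above (the file name is hypothetical; the function expects each tweet wrapped in a <![CDATA[...]]> section and relies on re and TweetTokenizer being imported):

import re
from nltk.tokenize import TweetTokenizer

# Hypothetical input file: an XML document with one <![CDATA[...]]> section per tweet
with open('tweets.xml', encoding='utf-8') as xml_file:
    tokenized_text = parse_XML(xml_file)

# parse_XML returns one lowercased, tokenized tweet per line
print(tokenized_text.splitlines()[:3])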
Example #2
def tokenize(sents: list):
    tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
    sents_tok = []
    for sent in sents:
        tokens = [token for token in tokenizer.tokenize(sent) if not token.startswith('http')]
        sents_tok.append(' '.join(tokens))
    return sents_tok
Example #3
    def __init__(self):
        with open('model/tokenizer.pickle', 'rb') as handle:
            self.tokenizer = pickle.load(handle)

        with open('model/label_encoder', 'rb') as handle:
            self.y_enc = pickle.load(handle)

        self.tweeter = TweetTokenizer()
        self.lemma = WordNetLemmatizer()
        self.vocab_size = len(self.tokenizer.word_index) + 1

        self.model = tf.keras.Sequential([
            tf.keras.layers.Embedding(self.vocab_size, 50, mask_zero=True),
            tf.keras.layers.Dropout(0.4),
            tf.keras.layers.Bidirectional(
                tf.keras.layers.LSTM(1024, return_sequences=True)),
            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1024)),
            tf.keras.layers.Dropout(0.4),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(len(self.y_enc.classes_),
                                  activation='softmax')
        ])

        self.model.load_weights('model/chatbot')

        self.responses = self._load_responses()
Example #4
def tweet_clean(self, tweet):
    # Remove tickers
    sent_no_tickers = re.sub(r'\$\w*', '', tweet)
    tw_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    temp_tw_list = tw_tokenizer.tokenize(sent_no_tickers)
    # Remove stopwords
    list_no_stopwords = [
        i for i in temp_tw_list
        if i.lower() not in self._cache_english_stopwords
    ]
    # Remove hyperlinks
    list_no_hyperlinks = [
        re.sub(r'https?:\/\/.*\/\w*', '', i) for i in list_no_stopwords
    ]
    # Remove hashtags
    list_no_hashtags = [re.sub(r'#', '', i) for i in list_no_hyperlinks]
    # Remove punctuation; this also splits off 's, 't, 've so they can be filtered
    list_no_punctuation = [
        re.sub(r'[' + string.punctuation + ']+', ' ', i)
        for i in list_no_hashtags
    ]
    # Rejoin the tokens into a sentence
    new_sent = ' '.join(list_no_punctuation)
    # Remove any words with 2 or fewer letters
    filtered_list = tw_tokenizer.tokenize(new_sent)
    list_filtered = [re.sub(r'^\w\w?$', '', i) for i in filtered_list]
    filtered_sent = ' '.join(list_filtered)
    # Collapse multiple whitespace into a single space
    cleaned_tweet = re.sub(r'\s\s+', ' ', filtered_sent)
    # Remove any whitespace at the front of the sentence
    cleaned_tweet = cleaned_tweet.lstrip(' ')
    return cleaned_tweet
Example #5
    def tokenize(self, text):
        # Make a list where each word is an element, e.g. text_list = text.split(' ')
        # Lemmatize each word. Exception: we want "better" to become its lemma "good", but "best" should stay "best".
        #   There are nltk methods for this. See https://www.youtube.com/watch?v=uoHVztKY6S4
        # Remove the articles 'a', 'an', 'the'
        # Also split on punctuation marks so that "I like, fish" becomes ['I', 'like', ',', 'fish'] = token_list

        tweettokenizer = TweetTokenizer()
        lemmatizer = WordNetLemmatizer()
        token_list = tweettokenizer.tokenize(text)

        # Remove the articles
        token_list = [token for token in token_list if token not in ('a', 'an', 'the')]

        # Map each token's Penn Treebank tag to a WordNet POS tag for lemmatization
        pos_list = pos_tag(token_list)
        pos_listwordnet = [(word[0], self.get_wordnet_pos(word[1])) for word in pos_list]

        for i in range(len(token_list)):
            token_list[i] = lemmatizer.lemmatize(token_list[i], pos=pos_listwordnet[i][1])
        if len(token_list) == 1:
            token_list.append('.')

        return token_list
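The method above calls self.get_wordnet_pos, which is not shown in this example. A minimal sketch of such a helper (written here as a standalone function), assuming the usual mapping from Penn Treebank tags to the WordNet POS constants expected by WordNetLemmatizer:

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # Map a Penn Treebank tag (as returned by nltk.pos_tag) to a WordNet POS constant
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    # WordNetLemmatizer treats everything else as a noun
    return wordnet.NOUN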
Example #6
 def __init__(
     self,
     token_mapping: Mapping[str, int],
     preserve_case: bool = False,
 ):
     self._token_mapping = token_mapping
     self._tokenizer = TweetTokenizer(preserve_case=preserve_case)
Example #7
def preprocess_tweet(tweet):
    """
    This function will preprocess the input tweet

    Steps for preprocessing:
        1. Lowercase the letters
        2. Replace the characters with frequency greater than 3 with 3 in a word
        3. Replace a url with Tag: <URLURL>
        4. Replace a tag mention: <UsernameMention>


    @TODO:
        1. Look for better preprocessing methods on the web
        2. Apply here
    """
    clean_tweet = tp.clean(tweet)

    # perform lemmatization
    tokenizer = TweetTokenizer()
    tweet_tokens = tokenizer.tokenize(clean_tweet)

    lemmatized_tweet = lemmatize_tweet(tweet_tokens)

    # remove stopwords
    preprocessed_tweet = remove_stopwords(lemmatized_tweet)
    return preprocessed_tweet
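The lemmatize_tweet and remove_stopwords helpers called above are not part of this example; a minimal sketch of what they could look like (the bodies below are assumptions based on the call sites, not the original helpers):

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()
_stopword_set = set(stopwords.words('english'))

def lemmatize_tweet(tokens):
    # Lemmatize each token; noun lemmas are used when no POS tag is supplied
    return [_lemmatizer.lemmatize(token.lower()) for token in tokens]

def remove_stopwords(tokens):
    # Drop common English stopwords and rejoin the remaining tokens
    return ' '.join(token for token in tokens if token not in _stopword_set)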
Example #8
def nltk_tweet_tokenizer(s, **tokenizer_kwargs):
    """NTLK TweetTokenizer"""
    kwargs = dict(strip_handles=False, reduce_len=True)
    kwargs.update(**tokenizer_kwargs)
    tokenizer = TweetTokenizer(**kwargs)
    token_list = tokenizer.tokenize(s)
    return token_list
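A short usage sketch of the wrapper above, overriding one of its defaults (the sample text is made up):

# strip_handles=True overrides the default; reduce_len caps repeated characters
# at three, so "sooooo" becomes "sooo"
tokens = nltk_tweet_tokenizer("@user that was sooooo good!!! http://t.co/xyz",
                              strip_handles=True)
print(tokens)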
Example #9
 def __init__(self, source_vocabulary, target_vocabulary, max_source_length,
              max_target_length):
     self.source_vocabulary = source_vocabulary
     self.target_vocabulary = target_vocabulary
     self.max_source_length = max_source_length
     self.max_target_length = max_target_length
     self.tokenizer = TweetTokenizer()
Example #10
def parse_data(data):
    """
    Parse all unique sentences in data.
    
    :param data: pandas.DataFrame with text data
    :returns parsed_data: pandas.DataFrame with one parse tree per sentence
    """
    parser_en = spacy.load('en_core_web_md', disable=['ner', 'textcat'])
    parser_es = spacy.load('es_core_news_sm', disable=['ner', 'textcat'])
    # use custom NLTK-based tokenizers instead of spaCy's defaults
    parser_en.tokenizer = NLTKTokenizerSpacy(parser_en.vocab, TweetTokenizer())
    parser_es.tokenizer = NLTKTokenizerSpacy(parser_es.vocab, ToktokTokenizer())
    data.loc[:, 'lang'] = data.loc[:, 'txt'].apply(lambda x: langid.classify(x)[0])
    parsed_data = []
    for i, data_i in data.iterrows():
        txt = data_i.loc['txt']
        txt = clean_data_for_spacy(txt)
        sents = sent_tokenize(txt)
        parsed_data_i = []
        for sent in sents:
            if(data_i.loc['lang'] == 'es'):
                parse_i = parser_es(sent)
            else:
                parse_i = parser_en(sent)
            # extract tree
            tree_i = build_parse(parse_i, parse_type='spacy')
            parsed_data_i.append(tree_i)
        parsed_data_i = pd.DataFrame(pd.Series(parsed_data_i), columns=['parse'])
#         logging.debug('processing id %s/%s'%(data_i.loc['id'], int(data_i.loc['id'])))
        parsed_data_i = parsed_data_i.assign(**{'id' : int(data_i.loc['id'])})
        parsed_data.append(parsed_data_i)
    parsed_data = pd.concat(parsed_data, axis=0)
#     parsed_data.loc[:, 'id'] = parsed_data.loc[:, 'id'].astype(np.int64)
    return parsed_data
Example #11
def tokenize_comments(base_dir, comments_file, hashh=None):
    tkd_data = None

    if hashh:
        tkd_data = load_cached_data(hashh)

    if tkd_data is None:
        hash_f = get_cache_path(hashh)
        with open(hash_f, 'wb') as pkl_f:
            tkd_data = defaultdict(dict)
            tk = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)
            for i, (root, dirs, files) in enumerate(os.walk(base_dir)):
                if comments_file in files:
                    project = root.split('/')[-1]
                    print('Processing %s, number %d' % (project, i))
                    posts = []
                    with open(os.path.join(root, comments_file), 'r') as inf:
                        r = csv.DictReader(inf)
                        for row in r:
                            p = post(' '.join(list(tk.tokenize(row['body']))),
                                     row['login'],
                                     row['mention_login'],
                                     row['issue_num'],
                                     row['datetime'],
                                     project)
                            posts.append(p)

                    tkd_data[project] = posts
            pickle.dump(tkd_data, pkl_f)

    return tkd_data
Example #12
def tokenize_sentences(corpus):
    twtk = TweetTokenizer(preserve_case=False,
                          reduce_len=True,
                          strip_handles=True)
    tokenized_sentences = corpus.apply(twtk.tokenize)

    return tokenized_sentences
Example #13
def data_processing(df):
    t = TweetTokenizer()
    emotions = [
        'anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative',
        'positive', 'sadness', 'surprise', 'trust'
    ]

    df['translated_full_text'] = df['translated_full_text'].astype(str).apply(
        remove_links)
    df['cleaned_text'] = df['translated_full_text'].astype(str).apply(
        style_text)
    df['cleaned_text'] = df['cleaned_text'].astype(str).apply(
        lambda x: remove_words(x.split(), stopcorpus))
    df['cleaned_text'] = df['cleaned_text'].apply(collapse_list_to_string)
    df['cleaned_text'] = df['cleaned_text'].astype(str).apply(
        remove_apostrophes)
    df['tokenized_sents'] = df.apply(
        lambda row: t.tokenize(row['cleaned_text']), axis=1)
    df['word_count'] = df.apply(lambda row: len(row['tokenized_sents']),
                                axis=1)
    df = df[df.word_count > 0]

    df = text_emotion(df)

    for emotion in emotions:
        df[emotion] = df[emotion] / df['word_count']

    date = datetime.datetime.strptime(df['created_at'].min(),
                                      '%Y-%m-%d %H:%M:%S').date()

    df.to_pickle(str(date) + ".pickle")
    df.to_excel(str(date) + ".xlsx")

    return
Example #14
def get_train_test_data(find_and_concatenate_expressions=False):
    def remove_url(tokens):
        tokens = filter(lambda x: "http" not in x, tokens)
        return list(tokens)

    def remove_hashtags(tokens):
        tokens = map(lambda x: x.replace('#', ''), tokens)
        return list(tokens)

    db = pd.read_excel("Classeur1.xlsx", encoding="utf-8")
    dict_values = {'Not Relevant': -1, 'Relevant': 1, "Can't Decide": 0}
    db["to_predict"] = db.choose_one.map(dict_values)
    db = db[["text", "to_predict"]]
    twtk = TweetTokenizer(preserve_case=False,
                          reduce_len=True,
                          strip_handles=True)
    db["token_retreated_text"] = db["text"].apply(
        lambda x: remove_hashtags(remove_url(twtk.tokenize(x))))
    db["retreated_text"] = db["token_retreated_text"].apply(
        lambda x: " ".join(x))

    if find_and_concatenate_expressions:
        db["token_retreated_text"] = clean_corpus(db["retreated_text"])
        db["retreated_text"] = db["token_retreated_text"].apply(
            lambda x: " ".join(x))

    msk = np.random.rand(len(db)) < 0.8
    train = db[msk]
    test = db[~msk]

    return train, test
Example #15
    def __init__(self, phase, kwargs):
        self.mode        = Mode[kwargs['mode']]
        self.image_size  = kwargs['image_size']
        self.hidden_size = kwargs['hidden_size']

        self.debug_use_dataset  = kwargs['debug_use_dataset']
        self.debug_one_sentence = kwargs['debug_one_sentence']
        self.__use_densenet     = kwargs['__use_densenet']

        self.sent_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = TweetTokenizer()

        if phase == Phase.train:
            jitter = [ColorJitter(brightness=0.5, contrast=0.5)]
        else:
            jitter = []

        if self.__use_densenet:
            self.transform = Compose((
                [Lambda(lambda img: img.convert('RGB'))] +
                [Resize((256, 256))] +
                jitter +
                [ToTensor()] +
                [Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]
            ))
        else:
            self.transform = Compose((
                [Resize((256, 256))] +
                jitter +
                [ToTensor()]
            ))
Example #16
def parse_data_iterator(vocab, filename, delimiter=",", steps=10):
    vocab.add_word('</s>')
    file = open(filename, 'r')
    reader = csv.reader(
        file,
        delimiter=delimiter,
    )
    headers = next(reader)
    list_of_train = []
    tokenizer = TweetTokenizer(preserve_case=False,
                               strip_handles=True,
                               reduce_len=False)
    for row in reader:
        curr = []
        encoded = []
        label = [row[1]]
        # csv.reader yields strings, so compare against the string '0', not the int 0
        if row[1] == '0':
            label.append(1)
        else:
            label.append(0)
        words = tokenizer.tokenize(" ".join(row[3:]))
        for i in range(steps):
            if i < len(words):
                words[i] = str(words[i])
                words[i] = canon_word(words[i])
                vocab.add_word(words[i])
                curr.append(words[i])
            else:
                curr.append('</s>')
        for word in curr:
            encoded.append(vocab.encode(word))
        yield label, curr
Example #17
 def set_params(self, **parameters):
     """Set the params"""
     for parameter, value in parameters.items():
         setattr(self, '_{}'.format(parameter), value)
     self._tokenizer = TweetTokenizer(preserve_case=False,
                                      reduce_len=True,
                                      strip_handles=True)
Example #18
def get_vocabulary_tokenizer(samples):
    texts = [sample.text for sample in samples]
    vocabulary = Vocabulary()
    tokenizer = WordTokenizer(texts=texts, tokenizer=TweetTokenizer())
    tokenized_samples = [tokenizer.tokenize(sample.text) for sample in samples]
    vocabulary.fit((token for tokens in tokenized_samples for token in tokens))
    print(tokenized_samples[0:1])
    return vocabulary, tokenizer
Example #19
 def __init__(self, data_loader):
     self.data = data_loader
     self.tokenizer = TweetTokenizer()
     self.stemmer = PorterStemmer()
     self.stopwords = stopwords.words('english')
     self.re_url = r'http\S+'
     self.punctuation = string.punctuation
     self.vocab = defaultdict(set)
Example #20
def tokeniza(chars, keyword=None):
    """
    Tokenize a string (intended to duplicate keywords, though the keyword argument is currently unused).
    """
    tokenizer = TweetTokenizer(preserve_case=False,
                               strip_handles=True,
                               reduce_len=True)
    tokens = tokenizer.tokenize(chars)
    return tokens
Example #21
def tokenize(s):
    sentence_splitter = TweetTokenizer()
    tokens = sentence_splitter.tokenize(s)
    result = []
    for word in tokens:
        result.append(
            unicodedata.normalize('NFKD', word).encode('ascii', 'ignore'))

    return result
Example #22
 def preprocess(comments, preprocessors):
     tokenizer = TweetTokenizer()
     html_cleaner = re.compile('<.+?>')
     for comment in comments:
         comment = html_cleaner.sub('', comment)
         tokenized_comment = tokenizer.tokenize(comment)
         for preprocessor in preprocessors:
             tokenized_comment = preprocessor.optimize(tokenized_comment)
         yield tokenized_comment
Example #23
 def __init__(self, tokenizer="tweet", punctuation=True, verbose=1):
     self.contextualizer = Contextualizer()
     self.corrector = Corrector(word2index=self.contextualizer.word2index,
                                index2word=self.contextualizer.index2word)
     self.tokenizer_type = tokenizer
     self.keep_punctuation = punctuation
     if self.tokenizer_type == "tweet":
         self.tokenizer = TweetTokenizer()
     self.verbose = verbose
Example #24
def preprocess(text):
    tokenizer = TweetTokenizer()

    # Remove stopwords.
    tokens = tokenizer.tokenize(text)
    tokens = [
        token for token in tokens
        if token not in ENGLISH_STOPWORDS and token.isalpha()
    ]
    return tokens
Example #25
def tokenize(text):
    tweet_tokenizer = TweetTokenizer()
    # 1. Tokenize
    text = tweet_tokenizer.tokenize(text)
    # 2. Cleaning
    # Punctuation
    text = [t for t in text if t not in string.punctuation]
    # Normalize to lowercase
    text = [t.lower() for t in text]
    return text
Example #26
def modify_abbrev(tweet):
    tokenizer = TweetTokenizer(preserve_case=False,
                               strip_handles=True,
                               reduce_len=True)
    tokens = tokenizer.tokenize(tweet)
    # Rebinding the loop variable does not update the list, so build a new one
    tokens = [abbreviations[w.lower()] if w.lower() in abbreviations else w
              for w in tokens]
    text = ' '.join(tokens)
    return text
Example #27
def clear_data(tweet):
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(tweet)
    clean_tweet = tokens \
            | remove_urls \
            | process_hashtags \
            | remove_stopwords \
            | remove_numbers \
            | remove_multiple_occurrence
    return ' '.join(clean_tweet)
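The remove_urls, process_hashtags, remove_stopwords, remove_numbers, and remove_multiple_occurrence stages used above are not shown; they are composed with the | operator, pipeline-style. A self-contained sketch of how such stages could be defined (the Pipe wrapper and the stage bodies below are assumptions, not the original helpers):

class Pipe:
    # Minimal wrapper so that `iterable | stage` calls stage(iterable)
    def __init__(self, func):
        self.func = func

    def __ror__(self, iterable):
        return self.func(iterable)

@Pipe
def remove_urls(tokens):
    # Drop tokens that look like hyperlinks
    return [t for t in tokens if not t.startswith(('http://', 'https://', 'www.'))]

@Pipe
def remove_numbers(tokens):
    # Drop purely numeric tokens
    return [t for t in tokens if not t.isdigit()]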
Example #28
def process_tweet_text(tweet):
    if tweet.startswith('@null'):
        return "[Tweet not available]"
    tweet = re.sub(r'\$\w*', '', tweet)  # Remove tickers
    tweet = re.sub(r'https?:\/\/.*\/\w*', '', tweet)  # Remove hyperlinks
    tweet = re.sub(r'[' + string.punctuation + ']+', ' ', tweet)  # Remove punctuation; also splits off 's and similar
    twtok = TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = twtok.tokenize(tweet)
    tokens = [i.lower() for i in tokens if i not in stopwords and len(i) > 2 and
              i in english_vocab]
    return tokens
Example #29
def normalize_messages(messages):
    tokenizer = TweetTokenizer(preserve_case=False)
    normalized_messages = []
    for message in messages:
        try:
            tokens = tokenizer.tokenize(message)
            text = [word.lower() for word in Text(tokens)]
            if text:
                normalized_messages.append(text)
        except TypeError:
            pass
    return normalized_messages
Example #30
def create_matrix(tweets: List, name: str = 'oscar pistorius') -> csr_matrix:
    matrix_loc = Path('data', name, 'tf_idf_matrix.pickle')

    if matrix_loc.exists():
        logger.info("Matrix exists! loading...")
        with matrix_loc.open('rb') as f:
            matrix = pickle.loads(f.read())
            return matrix

    stemmer = PorterStemmer()
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)
    # Build the stopword set once rather than on every token
    english_stopwords = set(stopwords.words('english'))

    texts = []
    for tweet in tqdm(tweets, desc="(create_matrix) iterating over tweets..."):
        text = tweet.text

        tokens = tokenizer.tokenize(text)
        text_proc = []
        for token in tokens:
            token = token.strip()
            if len(token) < 3:
                continue
            elif token in english_stopwords:
                continue
            elif nlp_utils.match_url(token):
                continue
            elif token in string.punctuation:
                continue
            # elif token.startswith(("#", "$")):
            #     continue

            token = token.translate({ord(k): "" for k in string.punctuation})
            token = stemmer.stem(token)

            token = token.strip()
            if token == "":
                continue

            text_proc.append(token)

        texts.append(text_proc)

    vectorizer = TfidfVectorizer(analyzer="word",
                                 tokenizer=lambda x: x,
                                 lowercase=False)
    m = vectorizer.fit_transform(texts)

    logger.info("Saving computed matrix...")
    with matrix_loc.open('wb') as f:
        f.write(pickle.dumps(m))

    return m