def parse_XML(file):
    """Extract tweets from a PAN-style XML file.

    Returns one tweet per line: lowercased, tokenized with the NLTK
    TweetTokenizer, and with non-ASCII characters dropped.  Tweets whose
    CDATA section spans several physical lines are buffered until the
    closing ``]]>`` marker is found.
    """
    tokenizer = TweetTokenizer()  # hoisted: one instance for all tweets

    def _clean(tweet):
        # Lowercase, tokenize, strip non-ASCII.  BUGFIX (Py3): encode()
        # returns bytes, which cannot be joined with a str separator --
        # decode back to str after dropping non-ASCII bytes.
        tokens = tokenizer.tokenize(tweet.lower())
        return ' '.join(
            tok.encode('ascii', 'ignore').decode('ascii') for tok in tokens
        )

    text = ''
    temp = ''  # buffer for a tweet spanning multiple lines
    continuation = False
    for line in file:
        if not continuation:
            # Complete tweet on a single line.
            groups = re.search(r'\[CDATA\[(.*)\]\]>', line)
            if groups is not None:
                text = text + '\n' + _clean(groups.group(1))
            else:
                # Tweet opens here but closes on a later line.
                groups = re.search(r'\[CDATA\[(.*)', line)
                if groups is not None:
                    temp = groups.group(1)
                    continuation = True
        else:
            groups = re.search(r'(.*)\]\]>', line)
            if groups is not None:
                # BUGFIX: use group(1) (the text before "]]>"), not
                # group(0), which includes the "]]>" terminator itself.
                text = text + '\n' + _clean(temp + ' ' + groups.group(1))
                continuation = False
            else:
                temp = temp + ' ' + line.rstrip()
    # Drop the leading '\n' added before the first tweet.
    return text[1:]
def tokenize(sents: list):
    """Tokenize each sentence with the TweetTokenizer (handles stripped,
    case preserved), drop URL tokens, and return the sentences re-joined
    with single spaces."""
    tok = TweetTokenizer(preserve_case=True, strip_handles=True)
    return [
        ' '.join(t for t in tok.tokenize(sent) if not t.startswith('http'))
        for sent in sents
    ]
def __init__(self):
    """Load the fitted tokenizer, label encoder, and trained chatbot model."""
    # Keras tokenizer fitted at training time (word -> index mapping).
    with open('model/tokenizer.pickle', 'rb') as handle:
        self.tokenizer = pickle.load(handle)
    # Label encoder mapping predicted class indices back to labels.
    with open('model/label_encoder', 'rb') as handle:
        self.y_enc = pickle.load(handle)
    self.tweeter = TweetTokenizer()
    self.lemma = WordNetLemmatizer()
    # +1 because Keras reserves index 0 for padding.
    self.vocab_size = len(self.tokenizer.word_index) + 1
    # NOTE(review): this architecture must exactly match the checkpoint
    # saved at 'model/chatbot' for load_weights to succeed.
    self.model = tf.keras.Sequential([
        tf.keras.layers.Embedding(self.vocab_size, 50, mask_zero=True),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(1024, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1024)),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(len(self.y_enc.classes_), activation='softmax')
    ])
    self.model.load_weights('model/chatbot')
    self.responses = self._load_responses()
def tweet_clean(self, tweet):
    """Clean a raw tweet.

    Removes tickers, handles, stopwords, hyperlinks, hashtag markers,
    punctuation, and words of two letters or fewer; collapses repeated
    whitespace and strips leading spaces.
    """
    # Remove tickers ($AAPL etc.) before tokenizing.
    no_tickers = re.sub(r'\$\w*', '', tweet)
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(no_tickers)
    # Drop stopwords (case-insensitive).
    tokens = [t for t in tokens
              if t.lower() not in self._cache_english_stopwords]
    # Strip hyperlinks, then hashtag markers.
    tokens = [re.sub(r'https?:\/\/.*\/\w*', '', t) for t in tokens]
    tokens = [re.sub(r'#', '', t) for t in tokens]
    # Replace punctuation runs with a space (also splits 's, 't, 've).
    tokens = [re.sub(r'[' + string.punctuation + ']+', ' ', t)
              for t in tokens]
    # Re-tokenize and blank out any 1-2 letter words.
    retokenized = tokenizer.tokenize(' '.join(tokens))
    kept = [re.sub(r'^\w\w?$', '', t) for t in retokenized]
    # Collapse repeated whitespace and drop any leading spaces.
    cleaned = re.sub(r'\s\s+', ' ', ' '.join(kept))
    return cleaned.lstrip(' ')
def tokenize(self, text):
    """Tokenize *text*, drop the articles 'a'/'an'/'the', and lemmatize
    each remaining token according to its POS tag.

    A single-token result gets a trailing '.' appended.
    Note on lemmatization intent: "better" should lemmatize to "good"
    while "best" stays "best" (handled via POS-aware lemmatization).
    """
    tokenizer = TweetTokenizer()
    lemmatizer = WordNetLemmatizer()
    tokens = tokenizer.tokenize(text)
    # BUGFIX: the original wrapped three .remove() calls in a single
    # try/except ValueError, so if 'a' was absent the removal of 'an'
    # and 'the' was silently skipped -- and .remove() only ever dropped
    # the FIRST occurrence.  Filter out every article instead.
    tokens = [t for t in tokens if t not in ('a', 'an', 'the')]
    # POS-tag, then map Penn Treebank tags to WordNet tags.
    wordnet_tags = [self.get_wordnet_pos(tag) for _, tag in pos_tag(tokens)]
    tokens = [
        lemmatizer.lemmatize(tok, pos=pos)
        for tok, pos in zip(tokens, wordnet_tags)
    ]
    if len(tokens) == 1:
        tokens.append('.')
    return tokens
def __init__(
    self,
    token_mapping: Mapping[str, int],
    preserve_case: bool = False,
):
    """Store the token -> id mapping and build the underlying tokenizer."""
    self._tokenizer = TweetTokenizer(preserve_case=preserve_case)
    self._token_mapping = token_mapping
def preprocess_tweet(tweet):
    """Preprocess a raw tweet for downstream use.

    Pipeline: clean with the ``tp`` preprocessor, tokenize with the
    TweetTokenizer, lemmatize the tokens, then remove stopwords.

    NOTE(review): the original docstring listed lowercasing, character
    squeezing, and URL/mention tagging steps that this function does not
    perform itself -- presumably ``tp.clean`` covers some of them
    (verify).  TODO: look for better preprocessing methods on the web
    and apply here.
    """
    cleaned = tp.clean(tweet)
    # Lemmatize the tokenized, cleaned tweet.
    tokens = TweetTokenizer().tokenize(cleaned)
    lemmatized = lemmatize_tweet(tokens)
    # Remove stopwords last.
    return remove_stopwords(lemmatized)
def nltk_tweet_tokenizer(s, **tokenizer_kwargs):
    """NTLK TweetTokenizer.

    Defaults: strip_handles=False, reduce_len=True; any keyword
    arguments supplied by the caller override these defaults.
    """
    options = {'strip_handles': False, 'reduce_len': True}
    options.update(tokenizer_kwargs)
    return TweetTokenizer(**options).tokenize(s)
def __init__(self, source_vocabulary, target_vocabulary,
             max_source_length, max_target_length):
    """Keep the source/target vocabularies and length limits; create the
    tokenizer used to split raw text."""
    self.tokenizer = TweetTokenizer()
    self.source_vocabulary = source_vocabulary
    self.target_vocabulary = target_vocabulary
    self.max_source_length = max_source_length
    self.max_target_length = max_target_length
def parse_data(data):
    """
    Parse all unique sentences in data.

    :param data: pandas.DataFrame with text data
    :returns parsed_data:: pandas.DataFrame with text data
    """
    # Dependency parsing only -- NER and textcat pipes disabled for speed.
    parser_en = spacy.load('en_core_web_md', disable=['ner', 'textcat'])
    parser_es = spacy.load('es_core_news_sm', disable=['ner', 'textcat'])
    # custom tokenizers because duh
    parser_en.tokenizer = NLTKTokenizerSpacy(parser_en.vocab, TweetTokenizer())
    parser_es.tokenizer = NLTKTokenizerSpacy(parser_es.vocab, ToktokTokenizer())
    # Language-ID each text so the matching parser can be chosen below.
    data.loc[:, 'lang'] = data.loc[:, 'txt'].apply(lambda x: langid.classify(x)[0])
    parsed_data = []
    for i, data_i in data.iterrows():
        txt = data_i.loc['txt']
        txt = clean_data_for_spacy(txt)
        sents = sent_tokenize(txt)
        parsed_data_i = []
        for sent in sents:
            # Spanish goes to the Spanish parser; everything else falls
            # back to the English parser.
            if(data_i.loc['lang'] == 'es'):
                parse_i = parser_es(sent)
            else:
                parse_i = parser_en(sent)
            # extract tree
            tree_i = build_parse(parse_i, parse_type='spacy')
            parsed_data_i.append(tree_i)
        # One row per sentence parse, all tagged with the source doc id.
        parsed_data_i = pd.DataFrame(pd.Series(parsed_data_i), columns=['parse'])
        # logging.debug('processing id %s/%s'%(data_i.loc['id'], int(data_i.loc['id'])))
        parsed_data_i = parsed_data_i.assign(**{'id' : int(data_i.loc['id'])})
        parsed_data.append(parsed_data_i)
    parsed_data = pd.concat(parsed_data, axis=0)
    # parsed_data.loc[:, 'id'] = parsed_data.loc[:, 'id'].astype(np.int64)
    return parsed_data
def tokenize_comments(base_dir, comments_file, hashh=None):
    """Walk *base_dir*, tokenize each project's comments CSV into `post`
    records, and cache the result (keyed by *hashh*) as a pickle.

    Returns a mapping {project_name: [post, ...]}.
    """
    tkd_data = None
    # Try the on-disk cache first.
    if hashh:
        tkd_data = load_cached_data(hashh)
    if tkd_data is None:
        hash_f = get_cache_path(hashh)
        # NOTE(review): the cache file is opened (truncated) before the
        # walk below, so an exception mid-walk leaves a partial cache.
        with open(hash_f, 'wb') as pkl_f:
            tkd_data = defaultdict(dict)
            tk = TweetTokenizer(preserve_case=True, reduce_len=False,
                                strip_handles=False)
            for i, (root, dirs, files) in enumerate(os.walk(base_dir)):
                if comments_file in files:
                    # Convention: project name is the last path component.
                    project = root.split('/')[-1]
                    print('Processing %s, number %d' % (project, i))
                    posts = []
                    with open(os.path.join(root, comments_file), 'r') as inf:
                        r = csv.DictReader(inf)
                        for row in r:
                            # Tokenize the comment body and re-join with
                            # spaces before building the post record.
                            p = post(' '.join(list(tk.tokenize(row['body']))),
                                     row['login'],
                                     row['mention_login'],
                                     row['issue_num'],
                                     row['datetime'],
                                     project)
                            posts.append(p)
                    tkd_data[project] = posts
            pickle.dump(tkd_data, pkl_f)
    return tkd_data
def tokenize_sentences(corpus):
    """Tokenize every entry of *corpus* (a pandas Series of texts) with a
    lowercasing, handle-stripping, length-reducing TweetTokenizer."""
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True,
                               strip_handles=True)
    return corpus.apply(tokenizer.tokenize)
def data_processing(df):
    """Clean and tokenize translated tweets, score NRC-style emotions per
    tweet (normalized by word count), and dump the frame to
    <min-date>.pickle and <min-date>.xlsx.  Returns None.
    """
    t = TweetTokenizer()
    # Emotion/sentiment columns produced by text_emotion() below.
    emotions = [
        'anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative',
        'positive', 'sadness', 'surprise', 'trust'
    ]
    # Cleaning pipeline: strip links, normalize style, drop stopwords,
    # collapse the token list back to a string, remove apostrophes.
    df['translated_full_text'] = df['translated_full_text'].astype(str).apply(
        remove_links)
    df['cleaned_text'] = df['translated_full_text'].astype(str).apply(
        style_text)
    df['cleaned_text'] = df['cleaned_text'].astype(str).apply(
        lambda x: remove_words(x.split(), stopcorpus))
    df['cleaned_text'] = df['cleaned_text'].apply(collapse_list_to_string)
    df['cleaned_text'] = df['cleaned_text'].astype(str).apply(
        remove_apostrophes)
    df['tokenized_sents'] = df.apply(
        lambda row: t.tokenize(row['cleaned_text']), axis=1)
    df['word_count'] = df.apply(lambda row: len(row['tokenized_sents']),
                                axis=1)
    # Drop tweets that cleaned down to nothing (also avoids divide-by-zero
    # in the normalization below).
    df = df[df.word_count > 0]
    df = text_emotion(df)
    # Normalize raw emotion counts by tweet length.
    for emotion in emotions:
        df[emotion] = df[emotion] / df['word_count']
    # Output files are named after the earliest tweet timestamp's date.
    date = datetime.datetime.strptime(df['created_at'].min(),
                                      '%Y-%m-%d %H:%M:%S').date()
    df.to_pickle(str(date) + ".pickle")
    df.to_excel(str(date) + ".xlsx")
    return
def get_train_test_data(find_and_concatenate_expressions=False):
    """Load the labelled tweets spreadsheet, clean the text, and return a
    random 80/20 (train, test) DataFrame split.

    Label mapping: 'Relevant' -> 1, 'Not Relevant' -> -1,
    "Can't Decide" -> 0.

    :param find_and_concatenate_expressions: when True, additionally run
        clean_corpus() to merge multi-word expressions.
    """
    def remove_url(tokens):
        # Drop any token containing "http" (URLs).
        tokens = filter(lambda x: "http" not in x, tokens)
        return list(tokens)

    def remove_hashtags(tokens):
        # Keep hashtag words but strip the '#' marker.
        tokens = map(lambda x: x.replace('#', ''), tokens)
        return list(tokens)

    # NOTE(review): `encoding=` was removed from read_excel in pandas 1.2;
    # this call presumably targets an older pandas -- confirm.
    db = pd.read_excel("Classeur1.xlsx", encoding="utf-8")
    dict_values = {'Not Relevant': -1, 'Relevant': 1, "Can't Decide": 0}
    db["to_predict"] = db.choose_one.map(dict_values)
    db = db[["text", "to_predict"]]
    twtk = TweetTokenizer(preserve_case=False, reduce_len=True,
                          strip_handles=True)
    db["token_retreated_text"] = db["text"].apply(
        lambda x: remove_hashtags(remove_url(twtk.tokenize(x))))
    db["retreated_text"] = db["token_retreated_text"].apply(
        lambda x: " ".join(x))
    if find_and_concatenate_expressions:
        db["token_retreated_text"] = clean_corpus(db["retreated_text"])
        db["retreated_text"] = db["token_retreated_text"].apply(
            lambda x: " ".join(x))
    # Random 80/20 split -- not seeded, so it differs between runs.
    msk = np.random.rand(len(db)) < 0.8
    train = db[msk]
    test = db[~msk]
    return train, test
def __init__(self, phase, kwargs):
    """Configure text tokenizers and the image transform pipeline.

    :param phase: Phase.train enables color-jitter augmentation.
    :param kwargs: settings dict (mode, image_size, hidden_size, debug
        flags, '__use_densenet').
    """
    self.mode = Mode[kwargs['mode']]
    self.image_size = kwargs['image_size']
    self.hidden_size = kwargs['hidden_size']
    self.debug_use_dataset = kwargs['debug_use_dataset']
    self.debug_one_sentence = kwargs['debug_one_sentence']
    # NOTE(review): the literal key '__use_densenet' (leading double
    # underscore) is what callers must supply -- confirm intentional.
    self.__use_densenet = kwargs['__use_densenet']
    self.sent_tokenizer = PunktSentenceTokenizer()
    self.word_tokenizer = TweetTokenizer()
    # Augmentation only during training.
    if phase == Phase.train:
        jitter = [ColorJitter(brightness=0.5, contrast=0.5)]
    else:
        jitter = []
    if self.__use_densenet:
        # DenseNet path: force RGB and normalize with ImageNet statistics.
        self.transform = Compose((
            [Lambda(lambda img: img.convert('RGB'))]
            + [Resize((256, 256))]
            + jitter
            + [ToTensor()]
            + [Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])]
        ))
    else:
        self.transform = Compose((
            [Resize((256, 256))]
            + jitter
            + [ToTensor()]
        ))
def parse_data_iterator(vocab, filename, delimiter=",", steps=10):
    """Yield (label, tokens) pairs from a CSV of labelled tweets.

    Each row's text (columns 3 onward) is tokenized, canonicalized with
    canon_word(), added to *vocab*, and padded/truncated to *steps*
    tokens with the '</s>' sentinel.

    label is [raw_label, flag] where flag is 1 when the raw label is
    "0", else 0.
    """
    vocab.add_word('</s>')
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=False)
    # Close the file deterministically instead of leaking the handle.
    with open(filename, 'r') as csv_file:
        reader = csv.reader(csv_file, delimiter=delimiter)
        next(reader)  # skip the header row
        for row in reader:
            label = [row[1]]
            # BUGFIX: csv.reader yields strings, so the original
            # `row[1] == 0` was always False and the flag was always 0.
            if row[1] == '0':
                label.append(1)
            else:
                label.append(0)
            words = tokenizer.tokenize(" ".join(row[3:]))
            curr = []
            for i in range(steps):
                if i < len(words):
                    # str() is a no-op for str tokens; the original's
                    # try/except around it was dead code.
                    word = canon_word(str(words[i]))
                    vocab.add_word(word)
                    curr.append(word)
                else:
                    curr.append('</s>')
            yield label, curr
def set_params(self, **parameters):
    """Set the params"""
    # Each parameter is stored under a leading-underscore attribute.
    for name, value in parameters.items():
        setattr(self, '_' + name, value)
    # Rebuild the tokenizer after (re)configuration.
    self._tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True,
                                     strip_handles=True)
def get_vocabulary_tokenizer(samples):
    """Build a Vocabulary fitted on all tokens of *samples*, plus the
    WordTokenizer (backed by TweetTokenizer) used to produce them."""
    vocabulary = Vocabulary()
    tokenizer = WordTokenizer(texts=[s.text for s in samples],
                              tokenizer=TweetTokenizer())
    tokenized_samples = [tokenizer.tokenize(s.text) for s in samples]
    # Fit on a flat stream of every token from every sample.
    vocabulary.fit(token for tokens in tokenized_samples for token in tokens)
    print(tokenized_samples[0:1])
    return vocabulary, tokenizer
def __init__(self, data_loader):
    """Hold the data loader plus the NLP helpers used for preprocessing."""
    self.data = data_loader
    # Tokenization / stemming / stopword machinery.
    self.tokenizer = TweetTokenizer()
    self.stemmer = PorterStemmer()
    self.stopwords = stopwords.words('english')
    # URL pattern and punctuation table used during cleaning.
    self.re_url = r'http\S+'
    self.punctuation = string.punctuation
    # word -> set of associated items, filled in later.
    self.vocab = defaultdict(set)
def tokeniza(chars, keyword=None):
    """ Tokenize a string (duplicates keywords if any) """
    # NOTE(review): `keyword` is currently unused by this implementation.
    tweet_tok = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    return tweet_tok.tokenize(chars)
def tokenize(s):
    """Tokenize *s* and strip non-ASCII characters from each token.

    Each token is NFKD-normalized (decomposing accented characters) and
    any remaining non-ASCII bytes are dropped.  Returns a list of str.
    """
    tokens = TweetTokenizer().tokenize(s)
    # BUGFIX (Py3): .encode() returns bytes, so the original returned a
    # list of bytes objects; decode back so callers get str tokens.
    return [
        unicodedata.normalize('NFKD', tok)
        .encode('ascii', 'ignore')
        .decode('ascii')
        for tok in tokens
    ]
def preprocess(comments, preprocessors):
    """Yield each comment with HTML tags stripped, tokenized, and passed
    through every preprocessor's ``optimize`` step in order."""
    tokenizer = TweetTokenizer()
    tag_pattern = re.compile('<.+?>')
    for raw_comment in comments:
        tokens = tokenizer.tokenize(tag_pattern.sub('', raw_comment))
        for preprocessor in preprocessors:
            tokens = preprocessor.optimize(tokens)
        yield tokens
def __init__(self, tokenizer="tweet", punctuation=True, verbose=1):
    """Wire up the contextualizer, a corrector sharing its vocabulary,
    and (for the "tweet" tokenizer type) the NLTK TweetTokenizer."""
    self.contextualizer = Contextualizer()
    # The corrector reuses the contextualizer's word/index maps.
    self.corrector = Corrector(
        word2index=self.contextualizer.word2index,
        index2word=self.contextualizer.index2word,
    )
    self.tokenizer_type = tokenizer
    self.keep_punctuation = punctuation
    self.verbose = verbose
    # Only the "tweet" tokenizer type gets a concrete tokenizer here.
    if self.tokenizer_type == "tweet":
        self.tokenizer = TweetTokenizer()
def preprocess(text):
    """Tokenize *text* and keep only alphabetic, non-stopword tokens."""
    tokens = TweetTokenizer().tokenize(text)
    return [
        tok for tok in tokens
        if tok not in ENGLISH_STOPWORDS and tok.isalpha()
    ]
def tokenize(text):
    """Tokenize *text*, drop pure-punctuation tokens, and lowercase the
    rest (the punctuation check runs on the original-case token)."""
    tokens = TweetTokenizer().tokenize(text)
    return [tok.lower() for tok in tokens if tok not in string.punctuation]
def modify_abbrev(tweet):
    """Expand known abbreviations in *tweet* and return the rebuilt text.

    Tokens are matched case-insensitively against the module-level
    ``abbreviations`` dict; unknown tokens pass through unchanged.
    """
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tokens = tokenizer.tokenize(tweet)
    # BUGFIX: the original did `for w in tokens: w = abbreviations[...]`,
    # which only rebinds the loop variable and never modifies `tokens`,
    # so the returned text was always unexpanded.
    expanded = [abbreviations.get(w.lower(), w) for w in tokens]
    return ' '.join(expanded)
def clear_data(tweet):
    """Clean a tweet and return it as a single space-joined string.

    Tokenizes with handle-stripping and length-reduction, then runs the
    token list through the cleaning stages.  NOTE: the `|` here is the
    pipeline-composition operator defined by the project's stage objects
    (remove_urls etc.), not integer OR.
    """
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(tweet)
    # Stages run left to right: URLs, hashtags, stopwords, numbers,
    # repeated occurrences.
    clean_tweet = tokens \
        | remove_urls \
        | process_hashtags \
        | remove_stopwords \
        | remove_numbers \
        | remove_multiple_occurrence
    return ' '.join(clean_tweet)
def process_tweet_text(tweet):
    """Normalize a tweet into lowercase dictionary-word tokens.

    Returns the placeholder string "[Tweet not available]" for deleted
    (@null) tweets; otherwise a list of lowercased tokens longer than
    two characters that are in english_vocab and not stopwords.
    """
    if tweet.startswith('@null'):
        return "[Tweet not available]"
    # Strip tickers, hyperlinks, then punctuation runs (splits 's etc.).
    stripped = re.sub(r'\$\w*', '', tweet)
    stripped = re.sub(r'https?:\/\/.*\/\w*', '', stripped)
    stripped = re.sub(r'[' + string.punctuation + ']+', ' ', stripped)
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    return [
        tok.lower()
        for tok in tokenizer.tokenize(stripped)
        if tok not in stopwords and len(tok) > 2 and tok in english_vocab
    ]
def normalize_messages(messages):
    """Tokenize and lowercase each message, skipping empty results and
    any message whose processing raises TypeError (e.g. non-string
    input)."""
    tokenizer = TweetTokenizer(preserve_case=False)
    normalized = []
    for message in messages:
        try:
            words = [w.lower() for w in Text(tokenizer.tokenize(message))]
        except TypeError:
            continue
        if words:
            normalized.append(words)
    return normalized
def create_matrix(tweets: List, name: str = 'oscar pistorius') -> csr_matrix:
    """Build (or load from cache) a TF-IDF matrix over the given tweets.

    Tokens are lowercased by the tokenizer, then short tokens (<3 chars),
    stopwords, URLs, and punctuation are dropped; the survivors are
    punctuation-stripped and Porter-stemmed before vectorizing.
    The resulting matrix is cached at data/<name>/tf_idf_matrix.pickle.
    """
    matrix_loc = Path('data', name, 'tf_idf_matrix.pickle')
    if matrix_loc.exists():
        logger.info("Matrix exists! loading...")
        with matrix_loc.open('rb') as f:
            # NOTE: pickle.loads on a local cache file -- never point this
            # at untrusted data.
            return pickle.loads(f.read())
    stemmer = PorterStemmer()
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)
    # PERF: hoist the stopword lookup out of the per-token loop -- the
    # original rebuilt stopwords.words('english') (a fresh list, scanned
    # linearly) for EVERY token.  A set gives O(1) membership.
    english_stopwords = set(stopwords.words('english'))
    # Translation table deleting all punctuation characters, built once.
    punct_table = {ord(ch): "" for ch in string.punctuation}
    texts = []
    for tweet in tqdm(tweets, desc="(create_matrix) iterating over tweets..."):
        text_proc = []
        for token in tokenizer.tokenize(tweet.text):
            token = token.strip()
            if len(token) < 3:
                continue
            elif token in english_stopwords:
                continue
            elif nlp_utils.match_url(token):
                continue
            elif token in string.punctuation:
                continue
            # Strip embedded punctuation, stem, and drop empties.
            token = stemmer.stem(token.translate(punct_table)).strip()
            if token == "":
                continue
            text_proc.append(token)
        texts.append(text_proc)
    # Texts are pre-tokenized, so the vectorizer gets identity tokenization
    # and no lowercasing.
    vectorizer = TfidfVectorizer(analyzer="word",
                                 tokenizer=lambda x: x,
                                 lowercase=False)
    m = vectorizer.fit_transform(texts)
    logger.info("Saving computed matrix...")
    with matrix_loc.open('wb') as f:
        f.write(pickle.dumps(m))
    return m