def get_raw_data(path_g, full):
    """
    Generate raw data without punctuation
    :param path_g: Path on the Google Drive
    :param full: string, 'f' for full dataset and 'nf' for non-full dataset
    """
    if full == 'f':
        path_pos = path_g + 'data/twitter-datasets/train_pos_full.txt'
        path_neg = path_g + 'data/twitter-datasets/train_neg_full.txt'
    elif full == 'nf':
        path_pos = path_g + 'data/twitter-datasets/train_pos.txt'
        path_neg = path_g + 'data/twitter-datasets/train_neg.txt'
    else:
        raise ValueError("Not valid full, should be 'f' or 'nf'")
    path_test = path_g + 'data/twitter-datasets/test_data.txt'

    # Read all files
    data_neg = read_file(path_neg)
    data_pos = read_file(path_pos)
    data_test = read_file(path_test)

    df_neg = pd.DataFrame(data_neg)
    df_pos = pd.DataFrame(data_pos)
    df_test = pd.DataFrame(data_test, columns=['tweet'])

    df_neg = pd.DataFrame(pd.unique(df_neg[0]).T, columns=['tweet'])
    df_neg['sentiment'] = 0
    print(df_neg.shape)
    df_pos = pd.DataFrame(pd.unique(df_pos[0]).T, columns=['tweet'])
    df_pos['sentiment'] = 1
    print(df_pos.shape)
    df = pd.concat([df_neg, df_pos])

    text_data = df['tweet'].values
    text_data_test = df_test['tweet'].values
    for idx, tweet in enumerate(text_data):
        text_data[idx] = text_to_word_sequence(
            tweet,
            filters='#"$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n0123456789',
            lower=True)
    for idx, tweet in enumerate(text_data_test):
        text_data_test[idx] = text_to_word_sequence(
            tweet,
            filters='#"$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n123456789',
            lower=True)
    labels = df['sentiment'].values
    return text_data, labels, text_data_test
def preprocess_news_data(filename):
    print('Preprocessing news...')
    all_texts = []
    category_map = {}
    titles = []
    abstracts = []
    categories = []
    with open(filename, 'r') as f:
        for l in f:
            id, category, subcategory, title, abstract, url, entity = l.strip('\n').split('\t')
            title = title.lower()
            abstract = abstract.lower()
            all_texts.append(title + ". " + abstract)
            # map every category to a number
            if category not in category_map:
                category_map[category] = len(category_map)
            # map every subcategory to a number
            titles.append(title)
            abstracts.append(abstract)
            categories.append(category)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_texts)
    word_index = tokenizer.word_index  # a dict: word_index[word]=index
    print('Found %s unique tokens.' % len(word_index))
    # print(word_index)

    # title
    news_title = np.zeros((len(titles), MAX_TITLE_LENGTH), dtype='int32')
    for i, title in enumerate(titles):
        wordTokens = text_to_word_sequence(title)
        k = 0
        for _, word in enumerate(wordTokens):
            if k < MAX_TITLE_LENGTH:
                news_title[i, k] = word_index[word]
                k = k + 1

    # abstract
    news_abstract = np.zeros((len(abstracts), MAX_ABSTRACT_LENGTH), dtype='int32')
    for i, abstract in enumerate(abstracts):
        wordTokens = text_to_word_sequence(abstract)
        k = 0
        for _, word in enumerate(wordTokens):
            if k < MAX_ABSTRACT_LENGTH:
                news_abstract[i, k] = word_index[word]
                k = k + 1

    # category & subcategory
    news_category = []
    k = 0
    for category in categories:
        news_category.append(category_map[category])
        k += 1
    news_category = to_categorical(np.asarray(news_category))
    return word_index, category_map, news_category, news_abstract, news_title
def preprocess_news_data(filename, filename_2):
    # only use news title
    print('Preprocessing news...')
    titles = []
    news_index = {}
    with open(filename, 'r') as f:
        for l in f:
            id, category, subcategory, title, abstract, url, entity = l.strip('\n').split('\t')
            if id not in news_index:
                news_index[id] = len(news_index)
                title = title.lower()
                titles.append(title)

    news_index_test = {}
    titles_test = []
    with open(filename_2, 'r') as f:
        for l in f:
            id, category, subcategory, title, abstract, url, entity = l.strip('\n').split('\t')
            if id not in news_index:
                news_index[id] = len(news_index)
                title = title.lower()
                titles.append(title)
            if id not in news_index_test:
                news_index_test[id] = len(news_index_test)
                title = title.lower()
                titles_test.append(title)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(titles)
    word_index = tokenizer.word_index  # a dict: word_index[word]=index
    print('Found %s unique news.' % len(news_index))
    print('Found %s unique tokens.' % len(word_index))

    news_title = np.zeros((len(titles), MAX_TITLE_LENGTH), dtype='int32')
    news_title_test = np.zeros((len(titles_test), MAX_TITLE_LENGTH), dtype='int32')
    for i, title in enumerate(titles):
        wordTokens = text_to_word_sequence(title)
        k = 0
        for _, word in enumerate(wordTokens):
            if k < MAX_TITLE_LENGTH:
                news_title[i, k] = word_index[word]
                k = k + 1
    for i, title in enumerate(titles_test):
        wordTokens = text_to_word_sequence(title)
        k = 0
        for _, word in enumerate(wordTokens):
            if k < MAX_TITLE_LENGTH:
                news_title_test[i, k] = word_index[word]
                k = k + 1
    return news_index, word_index, news_title, news_index_test, news_title_test
def transform_sentence_complete(sentence):
    def camel_case_split(identifier):
        matches = re.finditer(
            '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
        return [m.group(0) for m in matches]

    if FLAGS.cs_use_clef_data and 'task1' in FLAGS.cs_raw_clef_train_loc:
        sentence = emoji.get_emoji_regexp().sub(r'', sentence)
        sentence = re.sub('(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)',
                          'url', sentence)
        # sentence = re.sub('^@?(\w){1,15}$', 'user', sentence)
        sentence = ' '.join([' '.join(camel_case_split(x[1:]))
                             if x[0] == '#' or x[0] == '@' else x
                             for x in sentence.split()])
        sentence = ' '.join(['ebola' if any([y in x.lower() for y in ['covid', 'corona']]) else x
                             for x in sentence.split()])
        print(sentence)

    sentence = correct_mistakes(sentence)
    if not FLAGS.cs_custom_preprc:
        return sentence.strip()

    sentence = (process_sentence_ner_spacy(sentence)
                if FLAGS.cs_ner_spacy else sentence)
    sentence = ' '.join(text_to_word_sequence(sentence))
    sentence = expand_contractions(sentence)
    sentence = remove_possessives(sentence)
    sentence = remove_kill_words(sentence)
    return sentence.strip()
def preprocessing(sentence):
    # split the sentence into words with text_to_word_sequence and
    # use a set to drop duplicate words
    words = set(text_to_word_sequence(sentence))
    vocab_size = len(words)
    # encode the words as integer values (a vector of indices)
    results = one_hot(sentence, round(vocab_size * 1.3))
    return results
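# Usage sketch (my own sample sentence, not from the original project): because
# `one_hot` hashes words into the range [1, round(vocab_size * 1.3)), two
# different words can occasionally receive the same index.
sample = "the quick brown fox jumps over the lazy dog"
print(text_to_word_sequence(sample))  # ['the', 'quick', 'brown', 'fox', ...]
print(preprocessing(sample))          # e.g. [7, 3, 9, 2, 5, 8, 7, 1, 4] -- one index per word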
def preprocess(w2v, text, language):
    embeddings = []
    if language == 'english':
        text = text.replace("'", '')
        words = text_to_word_sequence(text)
    elif language == 'hindi':
        text = text.replace(",", '')
        text = text.replace("|", ' ')
        words = text.split()
    else:
        raise Exception("Choose lang as 'hindi' or 'english'")
    for word in words:
        if word in w2v:
            embeddings.append(w2v[word])
    cur_seq_len = len(embeddings)
    print(words)
    print(cur_seq_len, language)
    # print(text)
    if cur_seq_len < max_len:
        embeddings = np.pad(embeddings, [(0, max_len - cur_seq_len), (0, 0)])
    else:
        embeddings = embeddings[cur_seq_len - max_len:]
    return embeddings
def preprocess(labels, titles, abstracts, texts):
    news = []
    labels = to_categorical(np.asarray(labels))
    for i in texts:
        sentences = tokenize.sent_tokenize(i)
        news.append(sentences)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    data = np.zeros((len(texts), MAX_SENTS, MAX_SEQUENCE_LENGTH), dtype='int32')
    for i, sentences in enumerate(news):
        for j, sent in enumerate(sentences):
            if j < MAX_SENTS:
                wordTokens = text_to_word_sequence(sent)
                k = 0
                for _, word in enumerate(wordTokens):
                    if k < MAX_SEQUENCE_LENGTH and tokenizer.word_index[word] < max_features:
                        data[i, j, k] = tokenizer.word_index[word]
                        k = k + 1

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)
    return word_index, data, labels
def load_data(file, max_fatures, max_sequence_length):
    data = pd.read_excel(file)
    data = data[['text', 'sentiment']]
    data['text'] = data['text'].apply(lambda x: x.lower())
    data['text'] = data['text'].apply(lambda x: clean_str(x))
    data['text'] = data['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
    stop_words = set(stopwords.words('english'))
    text = []
    for row in data['text'].values:
        word_list = text_to_word_sequence(row)
        no_stop_words = [w for w in word_list if w not in stop_words]
        no_stop_words = " ".join(no_stop_words)
        text.append(no_stop_words)

    tokenizer = Tokenizer(num_words=max_fatures, split=' ')
    tokenizer.fit_on_texts(text)
    X = tokenizer.texts_to_sequences(text)
    X = pad_sequences(X, maxlen=max_sequence_length)
    word_index = tokenizer.word_index
    Y = pd.get_dummies(data['sentiment']).values
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20,
                                                        random_state=42)
    return X_train, X_test, Y_train, Y_test, word_index, tokenizer
def get_collisions(self):
    for i in range(len(self.quotes)):
        words = text_to_word_sequence(self.quotes[i])
        for j in range(len(words)):
            word = words[j]
            num = self.encoded_quotes[i][j]
            if num in self.collisions:
                if word not in self.collisions[num]:
                    # new word, same hash = collision!
                    self.collisions_count += 1
                    l = self.collisions[num] + [word]
                    self.collisions[num] = l
            else:
                # new hash id
                self.collisions[num] = [word]

    collision_words = 0
    self.max_hashKey = 0
    for num in self.collisions:
        print(str(num) + ": " + str(self.collisions[num]))
        if len(self.collisions[num]) > 1:
            collision_words += 1
        if num > self.max_hashKey:
            self.max_hashKey = num

    print("vocab size: " + str(self.vocab_size))
    print("dictionary size: " + str(len(self.collisions)))
    print("number of hash ids with collisions: " + str(collision_words))
def hashing_method(self):
    # get vocab size
    motiv = self.flatten(self.motiv_quotes)
    demotiv = self.flatten(self.demotiv_quotes)
    self.vocab = set(text_to_word_sequence(motiv + " " + demotiv))
    self.vocab_size = len(self.vocab)

    # perform hash encoding
    self.quotes = self.motiv_quotes + self.demotiv_quotes
    before = time.time()
    for quote in self.quotes:
        self.encoded_quotes.append(
            hashing_trick(quote, round(self.vocab_size * 1.5), hash_function='md5'))
    after = time.time()
    diff = (after - before) * 1000
    print("hashing trick time: " + str(diff) + " ms")

    # PADDED HASH DATA FOR TRAINING
    self.padded_encoded_quotes = pad_sequences(self.encoded_quotes, maxlen=280)
    # print(self.encoded_quotes)
    # print("----------------------------------------")
    print(self.padded_encoded_quotes)
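# Standalone sketch (invented quotes) of the two Keras helpers used above:
# `hashing_trick` buckets each word with an md5 hash, and `pad_sequences`
# left-pads every encoded quote to a fixed length.
from tensorflow.keras.preprocessing.text import hashing_trick
from tensorflow.keras.preprocessing.sequence import pad_sequences

quotes = ["push yourself because no one else will", "just give up"]
encoded = [hashing_trick(q, 20, hash_function='md5') for q in quotes]
padded = pad_sequences(encoded, maxlen=10)
print(encoded)       # e.g. [[13, 5, 9, 2, 17, 4, 11], [7, 2, 19]] -- collisions are possible
print(padded.shape)  # (2, 10)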
def prepare_data(self, data):
    data = data[['text', 'sentiment']]
    data['text'] = data['text'].apply(lambda x: x.lower())
    data['text'] = data['text'].apply(lambda x: self.clean_str(x))
    data['text'] = data['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
    stop_words = set(stopwords.words('english'))
    text = []
    for row in data['text'].values:
        word_list = text_to_word_sequence(row)
        no_stop_words = [w for w in word_list if w not in stop_words]
        no_stop_words = " ".join(no_stop_words)
        text.append(no_stop_words)

    tokenizer = Tokenizer(num_words=self.MAX_FEATURES, split=' ')
    tokenizer.fit_on_texts(text)
    X = tokenizer.texts_to_sequences(text)
    X = pad_sequences(X, maxlen=self.MAXLEN)
    word_index = tokenizer.word_index
    Y = pd.get_dummies(data['sentiment']).values
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=self.TEST_DIM, random_state=42)
    return x_train, x_test, y_train, y_test, word_index, tokenizer
def keras_tokenize_wrapper(texts,
                           filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                           lower=False,
                           split=' ',
                           **kwargs):
    """
    Examples
    >>> texts = get_test_data()
    >>> dfs = keras_tokenize_wrapper(texts)
    >>> isinstance(dfs[0], pd.DataFrame)
    True
    """
    # get valid kwargs - only pass on valid arguments
    ttws_kwargs = {k: kwargs[k] for k in kwargs
                   if k in signature(text_to_word_sequence).parameters}
    dfs = [pd.DataFrame(text_to_word_sequence(text, filters=filters, lower=lower,
                                              split=split, **ttws_kwargs),
                        columns=['token'])
           for text in texts]
    return dfs
def text_preprocessing(text):
    # split every sentence into a word sequence
    texts = [text_to_word_sequence(word) for texts in text for word in texts]
    # lemmatize
    n = WordNetLemmatizer()
    words = [n.lemmatize(word, 'v') for text in texts for word in text]
    # replace the `text` parameter with texts_lemmatized, built from the lemmas
    texts_lemmatized = [[n.lemmatize(word, 'v') for word in text] for text in texts]
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # build tokens from the preprocessed words
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(words)
    tokens = tokenizer.index_word
    cap_vector = tokenizer.texts_to_sequences(texts_lemmatized)
    pad_sequences = sequence.pad_sequences(cap_vector, padding='post')
    result = np.array(pad_sequences)
    vocab_size = len(tokenizer.index_word) + 1
    return result, vocab_size
def database_title_token(data_list):
    """
    Load each DB row fetched via sqlite, unpack the tuple-stored fields one by
    one in a for loop, split the title into tokens, and return the rebuilt rows
    as a tuple of lists.
    """
    user_history_update_data_list_titletoken = []
    for i in data_list:
        # each row is a tuple, e.g.:
        # (13567, 'https://www.youtube.com/', 'YouTube', 36, 1420192312848192)
        Id, url, title, visit_count, last_visit_time = i
        # split the title text into word tokens
        title = text_to_word_sequence(title)
        # skip rows whose title is empty
        if len(title) == 0:
            continue
        user_history_update_data_list_titletoken.append(
            (Id, url, title, visit_count, last_visit_time))
    return tuple(user_history_update_data_list_titletoken)
def process_data(self):
    pad = self.tokenizer.texts_to_sequences(['<PAD>'])[0]
    id_list = []
    cap_list = []
    cap_length_list = []
    self.feat_data = {}
    with open(self.label_dir) as f:
        raw_data = json.load(f)
    for vid in raw_data:
        vid_id = vid['id']
        self.feat_data[vid_id] = np.load(self.feat_dir + vid_id + '.npy')
        for caption in vid['caption']:
            words = text_to_word_sequence(caption)
            for i in range(len(words)):
                if words[i] not in self.tokenizer.word_index:
                    words[i] = '<UNK>'
            words.append('<EOS>')
            one_hot = self.tokenizer.texts_to_sequences([words])[0]
            cap_length = len(one_hot)
            one_hot += pad * (self.max_caption_len - cap_length)
            id_list.append(vid_id)
            cap_list.append(one_hot)
            cap_length_list.append(cap_length)
    self.id_list = np.array(id_list)
    self.cap_list = np.array(cap_list)
    self.cap_length_list = np.array(cap_length_list)
    self.data_size = len(self.cap_list)
    self.data_idx = np.arange(self.data_size, dtype=int)
def generate_text_sequences(lines, pastWords, vocab):
    X_line = list()
    Y_line = list()
    for line in lines:
        # Tokenize line
        lineTokenized = text_to_word_sequence(
            line.item(),
            filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r0123456789' + "'")
        # Get line length
        lengthLine = len(lineTokenized)
        lineBatch = lengthLine - pastWords
        # Substitute words outside vocab with <Unknown>
        for idx in range(0, len(lineTokenized)):
            if lineTokenized[idx] in vocab:
                continue
            else:
                lineTokenized[idx] = '<Unknown>'
        # Create sequences of text
        for i in range(0, lineBatch):
            X_sequence = lineTokenized[i:i + pastWords]
            X_line.append(X_sequence)
            Y_sequence = lineTokenized[i + pastWords]
            Y_line.append(Y_sequence)
    return (X_line, Y_line)
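# Hedged usage sketch: `lines` is assumed to be a NumPy array of strings (which
# is why the function calls .item() on each line); the vocabulary below is invented.
import numpy as np

lines = np.array(["the cat sat on the mat"])
vocab = {'the', 'cat', 'sat', 'on', 'mat'}
X, Y = generate_text_sequences(lines, pastWords=2, vocab=vocab)
print(X)  # [['the', 'cat'], ['cat', 'sat'], ['sat', 'on'], ['on', 'the']]
print(Y)  # ['sat', 'on', 'the', 'mat']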
def tokenizer(data_frame, data_frame_element):
    # Clean the text so that it is properly normalized.
    # Note: turn these steps into dedicated functions.
    #data_frame[data_frame_element] = data_frame[data_frame_element].apply(lambda x: x.lower())
    #data_frame[data_frame_element] = data_frame[data_frame_element].apply(lambda x: clean_str(x))
    #data_frame[data_frame_element] = data_frame[data_frame_element].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
    nltk.download('stopwords')

    # Build the set of (English) stopwords provided by NLTK:
    stop_words = set(stopwords.words('english'))

    # Create an empty list that will hold the cleaned texts:
    text = []

    # Remove the stopwords from each text:
    for row in data_frame[data_frame_element].values:
        words_clean = sentence_clean(row)
        word_list = text_to_word_sequence(words_clean)
        no_stop_words = [w for w in word_list if w not in stop_words]
        no_stop_words = " ".join(no_stop_words)
        text.append(no_stop_words)

    # Use Keras' own Tokenizer to turn the words into tokens:
    tokenizer = Tokenizer(split=' ')

    # A token is created for every word across the texts:
    tokenizer.fit_on_texts(text)

    # Each text is converted into a list of tokens, i.e. its words are
    # converted from strings to numbers:
    text_tokenized = tokenizer.texts_to_sequences(text)

    return text_tokenized
def Audio_file_Read(filename):
    universal_dict = {}
    cnt = {}
    gantu = [0, 0, 0, 0]  # counts of the Korean filler words checked below
    analysis = {}
    token = Tokenizer()
    recog = Recognizer()
    try:
        audioFile = sr.AudioFile(filename)
        with audioFile as source:
            audio = recog.record(source)
        recognized = recog.recognize_google(audio, language="ko-KR")
        res = text_to_word_sequence(recognized)
        cnt = collections.Counter(res)
        universal_dict = dict(cnt)
        if "어" in universal_dict:    # "uh"
            gantu[0] = universal_dict["어"]
        if "아니" in universal_dict:  # "no / well"
            gantu[1] = universal_dict["아니"]
        if "근데" in universal_dict:  # "but"
            gantu[2] = universal_dict["근데"]
        if "이제" in universal_dict:  # "now"
            gantu[3] = universal_dict["이제"]
        text = recognized
        analysis['text'] = text
        analysis['data'] = gantu
        return analysis
    except UnknownValueError:
        # "No spoken sentence was found."
        analysis['text'] = "당신이 말한 문장이 없습니다."
        analysis['data'] = [0, 0, 0, 0]
        return analysis
def createData(self, data, maxLength, embedding):
    with open(os.path.join(os.getcwd(), 'synonyms.json'), "rb") as f:
        synonyms = json.load(f)
    self.tokenizer = Tokenizer(num_words=None)
    with open(os.path.join(os.getcwd(), 'bureau/models/maxlength.pkl'), "rb") as f:
        self.max_len = pickle.load(f)
    self.x_train = data.train_data_frame['query'].tolist()
    self.y_train = data.train_data_frame['category'].tolist()
    self.tokenizer.fit_on_texts(list(self.x_train))
    for key in synonyms:
        if key in self.tokenizer.word_index:
            for synonym in synonyms[key]:
                if synonym not in self.tokenizer.word_index:
                    self.tokenizer.word_index[synonym] = self.tokenizer.word_index[key]
    if embedding != "custom":
        for i in range(len(self.x_train)):
            self.x_train[i] = text_to_word_sequence(self.x_train[i])
        self.y_train = to_categorical(self.y_train)
    if embedding == "custom":
        self.x_train = self.tokenizer.texts_to_sequences(self.x_train)
        self.x_train = pad_sequences(self.x_train, maxlen=self.max_len)
        self.y_train = to_categorical(self.y_train)
    self.word_index = self.tokenizer.word_index
    print("Setting Maximum Length to : ")
    print(self.max_len)
def keras_tokenizer(txt):
    from tensorflow.keras.preprocessing.text import text_to_word_sequence
    return text_to_word_sequence(
        txt,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=False,
        split=" ")
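# Hypothetical call to the wrapper above: characters listed in `filters` are
# stripped, but the original casing survives because lower=False.
print(keras_tokenizer("Hello, World! Keras keeps Case here"))
# -> ['Hello', 'World', 'Keras', 'keeps', 'Case', 'here']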
def convert_text_to_indices(text, tokenized_dictionary):
    # text_to_word_sequence only lowercases, strips punctuation and splits the
    # text into words; it does NOT pad texts to a common length, so any padding
    # has to be applied to the returned index lists separately.
    word_indices = []
    for word in kpt.text_to_word_sequence(text):
        if word in tokenized_dictionary:
            word_indices.append(tokenized_dictionary[word])
    return word_indices
def convert_text_to_index_array(text, dictionary):
    words = kpt.text_to_word_sequence(text)
    wordIndices = []
    for word in words:
        if word in dictionary:
            wordIndices.append(dictionary[word])
        else:
            print("'%s' not in training corpus; ignoring." % word)
    return wordIndices
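# Usage sketch: `dictionary` is typically a Tokenizer's word_index and `kpt` is
# assumed to be keras.preprocessing.text; the tiny dictionary below is invented.
dictionary = {'i': 1, 'love': 2, 'keras': 3}
print(convert_text_to_index_array("I love Keras and TensorFlow", dictionary))
# prints a warning for 'and' and 'tensorflow', then -> [1, 2, 3]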
def _get_freq_dict(all_text):
    text_dicts = []
    for text in all_text:
        cur_dict = {}
        word_sequence = text_to_word_sequence(text)
        for word in word_sequence:
            if word in cur_dict:
                cur_dict[word] += 1
            else:
                cur_dict[word] = 1
        text_dicts.append(cur_dict)
    return text_dicts
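# Usage sketch with made-up documents: the helper returns one word-frequency
# dict per input text.
docs = ["to be or not to be", "let it be"]
print(_get_freq_dict(docs))
# -> [{'to': 2, 'be': 2, 'or': 1, 'not': 1}, {'let': 1, 'it': 1, 'be': 1}]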
def get_transcriptions_align(path_to_transcriptions, filename, path_to_transcriptions_to_align):
    f = open(os.path.join(path_to_transcriptions, filename), 'r').read()
    f = np.array(f.split('\n'))
    transcription = {}
    print("path_to_transcriptions:", path_to_transcriptions)
    print("path_to_transcriptions_to_align:", path_to_transcriptions_to_align)
    print("filename:", filename)
    for i in range(len(f) - 1):
        g = f[i]
        i1 = g.find(': ')
        i0 = g.find(' [')
        ind_id = g[:i0]
        ind_ts = g[i1 + 2:]
        ind_ts = text_to_word_sequence(
            ind_ts,
            filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n',
            lower=True,
            split=" ")
        align_t = []
        # print("ind_id_text:", g, flush=True)
        # print("ind_id:", ind_id, flush=True)
        align_file_name = (path_to_transcriptions_to_align + filename[:-4]
                           + "/" + str(ind_id) + ".wdseg")
        if os.path.exists(align_file_name):
            f_align = open(align_file_name, 'r').read()
            f_align = np.array(f_align.split('\n'))
            # print("f_align", f_align, flush=True)
            for i in range(2, len(f_align) - 3):
                # print(f'f_align{i}', f_align[i])
                w = f_align[i].split()[3].split("(")[0]
                w = text_to_word_sequence(
                    w,
                    filters='!"#$%&()*+,-./:;=>?@[\\]^`{|}~\t\n',
                    lower=True,
                    split=" ")[0]
                if w in ind_ts:
                    align_t.append({'word': w,
                                    'SFrm': f_align[i].split()[0],
                                    'Efrm': f_align[i].split()[1]})
            # print("align_t", align_t, flush=True)
            # print("w_list", ind_ts, flush=True)
            assert len(align_t) == len(ind_ts)
            transcription[ind_id] = {'ind_ts': ind_ts, 'align_t': align_t}
    return transcription
def get_array_from_directory(path):
    array = os.listdir(path)
    m = []
    for n in range(len(array)):
        with open(os.path.join(path, array[n]), encoding='utf8') as f:
            data = f.read()
        words = set(text_to_word_sequence(data))
        result = one_hot(data, round(len(words) * 1.3))
        m.append(result)
    m = pad_sequences(m, maxlen=2000)
    return m
def preprocess(text):
    embeddings = []
    words = text_to_word_sequence(text)
    for word in words:
        if word in w2v:
            embeddings.append(w2v[word])
    cur_seq_len = len(embeddings)
    if cur_seq_len < max_len:
        embeddings = np.pad(embeddings, [(0, max_len - cur_seq_len), (0, 0)])
    else:
        embeddings = embeddings[cur_seq_len - max_len:]
    return embeddings
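# Hedged sketch: `w2v` and `max_len` are module-level globals in the original
# code; the toy embedding table and max_len below are invented for illustration.
import numpy as np

w2v = {'hello': np.ones(4), 'world': np.zeros(4)}
max_len = 6
out = preprocess("hello world hello")
print(np.asarray(out).shape)  # (6, 4): three known words padded up to max_len rows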
def transform_sentence_complete(sentence):
    sentence = correct_mistakes(sentence)
    if not FLAGS.cs_custom_preprc:
        return sentence

    sentence = (process_sentence_ner_spacy(sentence)
                if FLAGS.cs_ner_spacy else sentence)
    sentence = ' '.join(text_to_word_sequence(sentence))
    sentence = expand_contractions(sentence)
    sentence = remove_possessives(sentence)
    sentence = remove_kill_words(sentence)
    return sentence
def create_L():
    L = []
    f = open('reviews.txt', 'r')
    for review in f.readlines():
        words = text_to_word_sequence(review)
        L += list(words)
    W = []
    for word in L:
        if word not in stop_words and word.isalpha() and len(word) > 2:
            W.append(word)
    word_counts = Counter(W)
    return L, W, word_counts
def token_string(string: str) -> str:
    from tensorflow.keras.preprocessing.text import text_to_word_sequence
    token_list = text_to_word_sequence(string)
    token_except_stop_list = []

    import nltk
    # nltk.download('stopwords')
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))

    for token in token_list:
        if token not in stop_words:
            token_except_stop_list.append(token)

    # return " ".join(token_list)
    return " ".join(token_except_stop_list)
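# Usage sketch (invented sentence), assuming the NLTK stopwords corpus has
# already been downloaded: the text is lowercased, punctuation is stripped and
# English stopwords are removed.
print(token_string("This is a simple example of stopword removal!"))
# -> 'simple example stopword removal'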
def tokenize(text, filters='\t\n', add_whitespace_op=False, lower=False, **kwargs):
    """
    add_whitespace_op (bool): add whitespace around operators
    """
    from tensorflow.keras.preprocessing.text import text_to_word_sequence
    tokenlist = text_to_word_sequence(text, filters=filters, lower=lower)
    if add_whitespace_op:  # the flag controls the call to the add_whitespace() helper
        tokenlist = add_whitespace(tokenlist)
    return tokenlist
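# Usage sketch: with the near-empty default filters only tabs and newlines are
# stripped, and lower=False preserves casing; add_whitespace_op stays False here
# because the add_whitespace() helper is assumed to live elsewhere in the module.
print(tokenize("if (x>1):\n\treturn x+1"))
# -> ['if', '(x>1):', 'return', 'x+1']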