def embed_words(df, embed_dim, embedding_index):
    tokenizer = RegexpTokenizer(r'\w+')
    processed_comments = []
    cachedStopWords = stopwords.words("english")
    for comment in df['commenttext']:
        tokens = tokenizer.tokenize(comment)
        text = ' '.join([word for word in tokens if word not in cachedStopWords])
        processed_comments.append(text)

    tokenizer = Tokenizer(num_words=None, filters='!##$%&()*+', lower=True, split=' ')
    tokenizer.fit_on_texts(processed_comments)
    word_index = tokenizer.word_index

    # Prepare embedding matrix
    words_not_found = []
    nb_words = len(word_index) + 1
    embedding_matrix = np.zeros((nb_words, embed_dim))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embedding_index.get(word)
        if (embedding_vector is not None) and len(embedding_vector) > 0:
            embedding_matrix[i] = embedding_vector[:embed_dim]
        else:
            words_not_found.append(word)
    # print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    return (embedding_matrix, nb_words, tokenizer)
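# Hedged usage sketch for embed_words (not from the original source): the CSV name,
# the GloVe file, and the column layout are assumptions for illustration only.
import numpy as np
import pandas as pd

df = pd.read_csv('comments.csv')  # assumed to contain a 'commenttext' column
embedding_index = {}
with open('glove.6B.300d.txt', encoding='utf-8') as f:  # any word -> vector text file
    for line in f:
        parts = line.rstrip().split(' ')
        embedding_index[parts[0]] = np.asarray(parts[1:], dtype='float32')

embedding_matrix, nb_words, keras_tokenizer = embed_words(df, 300, embedding_index)
print(embedding_matrix.shape)  # (nb_words, 300)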
def english_tokenizer(docs, MAX_NB_WORDS, max_seq_len):
    # set stop words
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

    # pre-processing train data
    print("pre-processing train data...")
    processed_docs = []
    for doc in docs:
        tokens = tokenizer.tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        processed_docs.append(" ".join(filtered))

    # tokenizing input data
    print("tokenizing input data...")
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_docs)
    word_seq = tokenizer.texts_to_sequences(processed_docs)
    word_index = tokenizer.word_index
    print("dictionary size: ", len(word_index))

    word_seq = sequence.pad_sequences(word_seq, maxlen=max_seq_len)
    return word_seq, word_index
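# Hedged usage sketch for english_tokenizer: the document list is illustrative, and the
# MAX_NB_WORDS / max_seq_len values are assumptions, not the original configuration.
docs = ["This is the first comment.", "Another short comment, with stopwords."]
word_seq, word_index = english_tokenizer(docs, MAX_NB_WORDS=20000, max_seq_len=50)
print(word_seq.shape)  # (2, 50): one padded integer sequence per document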
def tokenize_input_data(processed_docs_train, processed_docs_test):
    print("tokenizing input data...")
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_docs_train + processed_docs_test)  # leaky
    word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)
    word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)
    word_index = tokenizer.word_index
    print("dictionary size: ", len(word_index))
    return word_index, word_seq_train, word_seq_test
def preprocess(dataset, args, train=True):
    """
    :param dataset: pandas dataframe
    :param args: config vars dict
    :param train: if True serializes the tokenizer to use in test data, else deserializes
    :return: tuple of (x, y) array | x = (text, set)
    """
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

    X = dataset['text']
    X_set = dataset['set']
    y = dataset['score']

    # preprocess - filter stopwords
    print("pre-processing input data...")
    raw_docs = X.tolist()
    processed_docs = []
    for doc in tqdm(raw_docs):
        tokens = tokenizer.tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        processed_docs.append(" ".join(filtered))

    # tokenize for keras
    print("Tokenizing input data...")
    path = args['save_folder'] + 'tokenizer.joblib'
    if train:
        tokenizer = Tokenizer(num_words=args['nb_words'], lower=True, char_level=False)
        tokenizer.fit_on_texts(processed_docs)
        word_index = tokenizer.word_index
        print("vocabulary size: ", len(word_index))
        # save tokenizer
        joblib.dump(tokenizer, path)
        print('Saved tokenizer')
    else:
        tokenizer = joblib.load(path)
        print('Restored tokenizer')

    word_seq = tokenizer.texts_to_sequences(processed_docs)
    word_seq = sequence.pad_sequences(word_seq, maxlen=args['max_seq_len'],
                                      padding='post', truncating='post')
    return [word_seq, X_set], y
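# Hedged usage sketch for preprocess: the args dict keys mirror the ones the function
# reads ('save_folder', 'nb_words', 'max_seq_len'); the values and CSV files are assumptions.
import pandas as pd

args = {'save_folder': './artifacts/', 'nb_words': 50000, 'max_seq_len': 200}
train_df = pd.read_csv('train.csv')  # assumed to have 'text', 'set' and 'score' columns
(x_text, x_set), y = preprocess(train_df, args, train=True)                            # fits and saves the tokenizer
(x_text_te, x_set_te), y_te = preprocess(pd.read_csv('test.csv'), args, train=False)   # reuses the saved tokenizer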
def preprocess_data(docs):
    tokenizer = RegexpTokenizer(r'\w+')
    MAX_NB_WORDS = 100000
    max_seq_len = 40

    print("pre-processing train data...")
    processed_docs = []
    for doc in docs:
        # tokenize each document and rejoin into a whitespace-separated string
        tokens = tokenizer.tokenize(doc)
        processed_docs.append(" ".join(tokens))

    print("tokenizing input data...")
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_docs)
    word_seq = tokenizer.texts_to_sequences(processed_docs)
    word_index = tokenizer.word_index
    print("dictionary size: ", len(word_index))

    # pad sequences
    word_seq = sequence.pad_sequences(word_seq, maxlen=max_seq_len)
    return word_seq, word_index
def tweetAnalysis(tweets, stop_words, tokenizer, embeddings_index, MAX_NB_WORDS):
    test_df = tweets
    test_df = test_df.fillna('_NA_')
    label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    raw_docs_test = test_df['cleaned_tweet'].tolist()
    # raw_docs_test = [tweets,]
    num_classes = len(label_names)
    embed_dim = 300  # assumed to match the dimensionality of the embeddings_index vectors

    tokenizer = RegexpTokenizer(r'\w+')
    processed_docs_test = []
    for doc in tqdm(raw_docs_test):
        tokens = tokenizer.tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        processed_docs_test.append(" ".join(filtered))

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_docs_test)  # leaky
    word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)
    word_index = tokenizer.word_index

    # pad sequences
    word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=168)

    # embedding matrix
    words_not_found = []
    nb_words = min(MAX_NB_WORDS, len(word_index))
    embedding_matrix = np.zeros((nb_words, embed_dim))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if (embedding_vector is not None) and len(embedding_vector) > 0:
            # words not found in embedding index will be all-zeros
            embedding_matrix[i] = embedding_vector
        else:
            words_not_found.append(word)
    print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    return word_seq_test
def tokenize_input(df):
    # We feed a comment into a trained CNN model to obtain the feature vector X in R^(m x 1)
    # of this comment in the fully connected layer. Each feature x_i in the feature vector X
    # corresponds to a filter.
    tokenizer = RegexpTokenizer(r'\w+')
    processed_comments = []
    cachedStopWords = stopwords.words("english")
    for comment in df['commenttext']:
        tokens = tokenizer.tokenize(comment)
        text = ' '.join([word for word in tokens if word not in cachedStopWords])
        processed_comments.append(text)

    tokenizer = Tokenizer(num_words=None, filters='!##$%&()*+', lower=True, split=' ')
    tokenizer.fit_on_texts(processed_comments)
    X = tokenizer.texts_to_sequences(processed_comments)
    X = pad_sequences(X, 1000)

    # Create reverse word map (index -> word)
    word_index = tokenizer.word_index
    reverse_word_map = dict(map(reversed, word_index.items()))
    return X, reverse_word_map
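# Hedged sketch of the idea described in the comment above: read the fully connected
# layer activations of an already trained CNN for each comment. `cnn_model` and the
# layer name 'fc' are assumptions, not part of the original code.
from tensorflow.keras.models import Model

X, reverse_word_map = tokenize_input(df)
feature_extractor = Model(inputs=cnn_model.input, outputs=cnn_model.get_layer('fc').output)
features = feature_extractor.predict(X)  # shape (num_comments, m); one feature per filter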
def my_form_post():
    text = request.form['text']
    processed_text = text.upper()

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_docs_train + processed_docs_test)

    arr = [processed_text]
    s = tokenizer.texts_to_sequences(arr)
    s = sequence.pad_sequences(s, maxlen=max_seq_len)
    res = model.predict(s)

    final = []
    for i in range(len(res)):
        for j in range(len(res[i])):
            final.append(str(int(res[i][j] * 100)) + "%")

    label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    ans = pandas.DataFrame(final, index=label_names)
    return str(ans)
processed_train_data = []
for doc in tqdm(raw_docs_train):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_train_data.append(" ".join(filtered))

print('pre-processing test data')
processed_test_data = []
for doc in tqdm(raw_docs_test):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_test_data.append(" ".join(filtered))

print("Tokenizing input data....")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_train_data + processed_test_data)
word_seq_train = tokenizer.texts_to_sequences(processed_train_data)
word_seq_test = tokenizer.texts_to_sequences(processed_test_data)
word_index = tokenizer.word_index
print("Dictionary Size: " + str(len(word_index)))

# Pad sequences
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)
word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len)

# training params
batch_size = 256
num_epochs = 8
num_filters = 64
embed_dim = 300
weight_decay = 1e-4
words_not_found = []
# generate the training and testing data
embeddings = get_word2vec_embeddings(word2vec, clean_text)
list_corpus = clean_text["text"].tolist()
list_labels = clean_text["term_selected"].tolist()
X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(
    embeddings, list_labels, test_size=0.2, random_state=40)

EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 46
VOCAB_SIZE = len(VOCAB)
VALIDATION_SPLIT = 0.2

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(clean_text["text"].tolist())
sequences = tokenizer.texts_to_sequences(clean_text["text"].tolist())
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

cnn_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.asarray(clean_text["term_selected"])

indices = np.arange(cnn_data.shape[0])
np.random.shuffle(indices)
cnn_data = cnn_data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * cnn_data.shape[0])

embedding_weights = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
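# The block above allocates embedding_weights but stops before filling it; a common
# completion (sketch) copies the pretrained vector when available. `word2vec` is the
# gensim KeyedVectors object already used earlier in this snippet.
for word, index in word_index.items():
    if word in word2vec:
        embedding_weights[index] = word2vec[word]
    else:
        embedding_weights[index] = np.random.rand(EMBEDDING_DIM)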
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    return string.strip()


myTexts = []
for each in X:
    myEach = clean_string(each)
    myTexts += [myEach]

# tokenize texts into tokens
tokenizer = Tokenizer(num_words=800)
tokenizer.fit_on_texts(myTexts)
sequences = tokenizer.texts_to_sequences(myTexts)
word_index = tokenizer.word_index

# trim the length of each sequence to the same length, I set 300.
data = pad_sequences(sequences, maxlen=300)

y = np.zeros((len(myTexts), 1))
for i in range(len(myTexts)):
    if i < 1000:
        y[i] = [True]   # positive
    else:
        y[i] = [False]  # negative

embedding_matrix = np.zeros((len(word_index) + 1, 50))
for word, i in word_index.items():
    embedding_vector = myDictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
maxlen = 150  # max number of words in a comment to use
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')

####################################################
# DATA PREPARATION
####################################################
X_train = train["comment_text"].fillna("fillna").values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("fillna").values

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

'''
# TRAIN
train["comment_text"].fillna('_NA_')
train = standardize_text(train, "comment_text")
train["tokens"] = train["comment_text"].apply(tokenizer.tokenize)
# delete stop words
train["tokens"] = train["tokens"].apply(lambda vec: [word for word in vec if word not in stop_words])
# normalize bad words
train["tokens"] = train["tokens"].apply(lambda vec: normalize_bad_word(vec, bad_words))
# train.to_csv(base_path_output + 'train_normalized.csv', index=False)
print("Max sentence length is %s" % max(test_sentence_lengths)) print("Min sentence length is %s" % min(test_sentence_lengths)) print("Mean sentence length is %s" % mean(test_sentence_lengths)) test["tokens"] = test["tokens"].apply(lambda vec :' '.join(vec)) print("num test: ", test.shape[0]) print(test.head()) # Turn each comment into a list of word indexes of equal length (with truncation or padding as needed) list_sentences_train = train["tokens"].values list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"] y = train[list_classes].values list_sentences_test = test["tokens"].values tokenizer = Tokenizer(num_words=max_features) tokenizer.fit_on_texts(list(list_sentences_train) + list(list_sentences_test)) list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train) list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test) X_t = pad_sequences(list_tokenized_train, maxlen=maxlen) X_te = pad_sequences(list_tokenized_test, maxlen=maxlen) print(X_t.shape) #(159571, 150) # BUILD EMBEDDING MATRIX print('Preparing embedding matrix...') # Read the FastText word vectors (space delimited strings) into a dictionary from word->vector embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE)) print("embeddings_index size: ", len(embeddings_index))
bottom_scores = sorted_contributions['Good']['detractors'][:10].tolist()
plot_important_words(top_scores, top_words, bottom_scores, bottom_words,
                     "Most important words for relevance")

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 150
VOCAB_SIZE = len(VOCAB)
VALIDATION_SPLIT = .3

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(data2["review-text"].tolist())
sequences = tokenizer.texts_to_sequences(data2["review-text"].tolist())
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

cnn_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(data2["IsGood"]))

indices = np.arange(cnn_data.shape[0])
np.random.shuffle(indices)
cnn_data = cnn_data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * cnn_data.shape[0])

embedding_weights = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
print("pre-processing statuses...") processed_data_train = [] processed_data_test = [] for data in tqdm(raw_data_train): tokens = tokenizer.tokenize(data) filtered = [word for word in tokens if word not in stop_words] processed_data_train.append(" ".join(filtered)) for data in tqdm(raw_data_test): tokens = tokenizer.tokenize(data) filtered = [word for word in tokens if word not in stop_words] processed_data_test.append(" ".join(filtered)) print("tokenizing input data...") tokenizer = Tokenizer(num_words=Max_No_Words, lower=True, char_level=False) tokenizer.fit_on_texts(processed_data_train + processed_data_test) word_seq_train = tokenizer.texts_to_sequences(processed_data_train) word_seq_test = tokenizer.texts_to_sequences(processed_data_test) word_index = tokenizer.word_index print("Dictionary size = ", len(word_index)) #pad sequences word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=Max_Sent_Len) word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=Max_Sent_Len) print("Done !!") ''' Embedding Words ''' embed_dim = 300 print('loading and processing word embeddings...') EMBEDDING_FILE = 'wiki-news-300d-1M.vec'
# word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = max(sentence_lengths) + 1
VOCAB_SIZE = len(VOCAB)
NUM_CLASSES = 2
label2emotion = {0: 'humor', 1: 'NotHumor'}
emotion2label = {'humor': 0, 'NotHumor': 1}
# label2emotion = {0: "sad", 1: "disgust", 2: "fear", 3: "angry", 4: "surprise", 5: "joy"}
# emotion2label = {"sad": 0, "disgust": 1, "fear": 2, "angry": 3, "surprise": 4, "joy": 5}
VALIDATION_SPLIT = .2

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(clean_questions["text"].astype(str).tolist())
# tokenizer.fit_on_texts(trial_data["comment_text"].astype(str).tolist())
# tokenizer.fit_on_texts(test_data["comment_text"].astype(str).tolist())

# convert the texts into lists of integer indices
sequences_train = tokenizer.texts_to_sequences(clean_questions["text"].astype(str).tolist())
# sequences_trial = tokenizer.texts_to_sequences(trial_data["comment_text"].astype(str).tolist())
# sequences_test = tokenizer.texts_to_sequences(test_data["comment_text"].astype(str).tolist())

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))  # total number of unique tokens in the training data

# Keras only accepts input sequences of equal length, so sequences of uneven length
# are padded/truncated to MAX_SEQUENCE_LENGTH with pad_sequences().
cnn_data = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
# cnn_data_trial = pad_sequences(sequences_trial, maxlen=MAX_SEQUENCE_LENGTH)
def mymain():
    """CONFIGURATION"""
    # consts
    DATA_DIR = "C:\\Users\\T149900\\ml_mercari\\"
    WORD_COUNT_MEAN_PLUSS_STD = 1
    WORD_COUNT_MEAN_THIRD = 2

    num_words = 100000
    batch_size = 256
    num_epochs = 8
    num_splits = 5
    embed_dim = 300
    word_count_strategy = WORD_COUNT_MEAN_PLUSS_STD
    word_database = "toxic\\wiki.simple.vec"

    # CV accuracy is 0.987488975854 +/- 0.00889961690531
    # CV accuracy is 0.988607054794 +/- 0.00992068287633

    sns.set_style("whitegrid")
    np.random.seed(0)

    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

    embeddings_index = get_word_embeddings(DATA_DIR + word_database)

    train_df = pd.read_csv(DATA_DIR + "toxic\\train.csv")
    print("num train: ", train_df.shape[0])

    label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    y_train = train_df[label_names].values
    print(y_train.shape)

    train_df['doc_len'] = train_df['comment_text'].apply(lambda words: len(words.split(" ")))

    max_seq_len = 0
    if word_count_strategy == WORD_COUNT_MEAN_PLUSS_STD:
        max_seq_len = np.round(train_df['doc_len'].mean() + train_df['doc_len'].std()).astype(int)
    elif word_count_strategy == WORD_COUNT_MEAN_THIRD:
        max_seq_len = np.round(train_df['doc_len'].mean() / 3.0).astype(int)
    assert (max_seq_len > 0)

    raw_docs_train = train_df['comment_text'].tolist()
    num_classes = len(label_names)

    print("pre-processing train data...")
    processed_docs_train = []
    for doc in tqdm(raw_docs_train):
        tokens = tokenizer.tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        processed_docs_train.append(" ".join(filtered))

    tokenizer = keras.preprocessing.text.Tokenizer(num_words=num_words, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_docs_train)  # non-leaky
    word_index = tokenizer.word_index
    print("dictionary size: ", len(word_index))

    word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)
    word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)

    embedding_matrix = get_embedding_matrix(embeddings_index, num_words)
    model = get_model(num_words, max_seq_len, embedding_matrix, embed_dim, num_classes)
    d = keras_CV(model, word_seq_train, y_train, num_splits, num_epochs, batch_size)
    print("CV accuracy is " + str(d['score']) + " +/- " + str(d['std']))
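# `get_embedding_matrix` is called above but not shown in this excerpt. A minimal sketch
# consistent with the call site; the optional word_index/embed_dim parameters are
# assumptions about how the real helper obtains those values.
import numpy as np

def get_embedding_matrix(embeddings_index, num_words, word_index=None, embed_dim=300):
    embedding_matrix = np.zeros((num_words, embed_dim))
    for word, i in (word_index or {}).items():
        if i >= num_words:
            continue
        vector = embeddings_index.get(word)
        if vector is not None:
            # rows for words without a pretrained vector stay all-zero
            embedding_matrix[i] = vector[:embed_dim]
    return embedding_matrix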
    embeddings = clean_comments['tokens'].apply(
        lambda x: get_average_word2vec(x, vectors, generate_missing=generate_missing))
    return list(embeddings)


training_embeddings = get_word2vec_embeddings(word2vec, clean_train_comments, generate_missing=True)
test_embeddings = get_word2vec_embeddings(word2vec, clean_test_comments, generate_missing=True)

EMBEDDING_DIM = 300        # how big is each word vector
MAX_VOCAB_SIZE = 175303    # how many unique words to use (i.e. num rows in embedding matrix)
MAX_SEQUENCE_LENGTH = 200  # max number of words in a comment to use

# training params
batch_size = 256
num_epochs = 2

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(clean_train_comments["comment_text"].tolist())
training_sequences = tokenizer.texts_to_sequences(clean_train_comments["comment_text"].tolist())
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

train_embedding_weights = np.zeros((len(train_word_index) + 1, EMBEDDING_DIM))
for word, index in train_word_index.items():
    train_embedding_weights[index, :] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

test_sequences = tokenizer.texts_to_sequences(clean_test_comments["comment_text"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
def load_data(train_file, test_file):
    clean_questions = pd.read_csv(train_file)
    clean_test = pd.read_csv(test_file)

    tokenizer = RegexpTokenizer(r'\w+')
    clean_questions["tokens"] = clean_questions['comment_text'].astype(str).apply(tokenizer.tokenize)
    clean_test["tokens"] = clean_test['comment_text'].astype(str).apply(tokenizer.tokenize)

    all_words = [word for tokens in clean_questions["tokens"] for word in tokens]
    print(all_words[-1])
    sentence_lengths = [len(tokens) for tokens in clean_questions["tokens"]]
    VOCAB = sorted(list(set(all_words)))
    print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))
    print("Max sentence length is %s" % max(sentence_lengths))

    max_sequence_length = max(sentence_lengths) + 1
    num_words = len(VOCAB)
    VALIDATION_SPLIT = .2

    tokenizer = Tokenizer(num_words)
    tokenizer.fit_on_texts(clean_questions['comment_text'].astype(str).tolist())
    tokenizer.fit_on_texts(clean_test['comment_text'].astype(str).tolist())
    sequences_train = tokenizer.texts_to_sequences(clean_questions['comment_text'].astype(str).tolist())
    sequences_test = tokenizer.texts_to_sequences(clean_test['comment_text'].astype(str).tolist())
    train_data = pad_sequences(sequences_train, maxlen=max_sequence_length)
    test_data = pad_sequences(sequences_test, maxlen=max_sequence_length)

    clean_questions['to_task1'] = [t1[w] for w in clean_questions['task_1']]
    clean_questions['to_task2'] = [t2[w] for w in clean_questions['task_2']]
    train_labelsA = to_categorical(np.array(clean_questions['to_task1']))
    train_labelsB = to_categorical(np.array(clean_questions['to_task2']), 4)

    from collections import Counter
    print('train_labelsA', Counter(clean_questions['to_task1']))
    print('train_labelsB', Counter(clean_questions['to_task2']))

    clean_test['to_task1'] = [t1[w] for w in clean_test['task_1']]
    clean_test['to_task2'] = [t2[w] for w in clean_test['task_2']]
    test_labelsA = to_categorical(np.array(clean_test['to_task1']))
    test_labelsB = to_categorical(np.array(clean_test['to_task2']), 4)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    indices = np.arange(train_data.shape[0])
    np.random.shuffle(indices)
    train_data = train_data[indices]
    train_labelsA = train_labelsA[indices]
    train_labelsB = train_labelsB[indices]

    # num_validation_samples = int(VALIDATION_SPLIT * cnn_data.shape[0])
    train_x, train_ya, train_yb = [], [], []
    val_x, val_ya, val_yb = [], [], []
    for i in range(len(train_data)):
        # `split` is expected to be defined at module level (every split-th sample goes to validation)
        if i % split == 0:
            val_x.append(train_data[i])
            val_ya.append(train_labelsA[i])
            val_yb.append(train_labelsB[i])
        else:
            train_x.append(train_data[i])
            train_ya.append(train_labelsA[i])
            train_yb.append(train_labelsB[i])

    data = [[train_data, train_labelsA, train_labelsB],
            [test_data, test_labelsA, test_labelsB, clean_test['task_3'], clean_test['id']]]
    # data = [[train_x, train_ya, train_yb], [val_x, val_ya, val_yb],
    #         [test_data, test_labelsA, test_labelsB, clean_test['task_3'], clean_test['id']]]
    return data, word_index, num_words, max_sequence_length
print("%s words total,with a vocabulary size of %s" % (len(all_words), len(VOCAB))) print("Max sentence length is %s" % max(sentence_lengths)) word2vec_path = "/home/baiyang/baiyang/code/crawl-300d-2M.vec" word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=False) EMBEDDING_DIM = 300 MAX_SEQUENCE_LENGTH = max(sentence_lengths) + 1 VOCAB_SIZE = len(VOCAB) VALIDATION_SPLIT = 0.2 ##############分词开始 tokenizer = Tokenizer(num_words=VOCAB_SIZE) tokenizer.fit_on_texts( (clean_questions["sent0"] + '<eos>' + clean_questions["sent1"]).tolist()) tokenizer.fit_on_texts( (test_data["sent0"] + '<eos>' + test_data["sent1"]).tolist()) sequences_train = tokenizer.texts_to_sequences( (clean_questions["sent0"] + '<eos>' + clean_questions["sent1"]).tolist()) sequences_test = tokenizer.texts_to_sequences( (test_data["sent0"] + '<eos>' + test_data["sent1"]).tolist()) word_index = tokenizer.word_index print('Found %s unique tokens.' % len(word_index)) cnn_data = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH) cnn_data_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH) labels = to_categorical(np.asarray(clean_questions["label"])) ## test_labels = test_data["label"]
"""## 1. Convert NLTK tokens into Keras Tokenizer sequences"""

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# join the NLTK token lists back into whitespace-separated strings
X = X.apply(lambda el: " ".join(el))
X = X.values
X = np.array(X, dtype='O')

tokenizer = Tokenizer(num_words=5000, oov_token='oov')
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=600, padding='post', truncating='post')
print(X[0])

# y = pd.get_dummies(y)
y = np.array(pd.get_dummies(y).values, dtype='O')

# Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=63)
print(X[0])
df['doc_len'] = df['tweet'].apply(lambda words: len(words.split(" ")))  # length of each tweet
max_seq_len = np.round(df['doc_len'].mean() + df['doc_len'].std()).astype(int)
raw_docs = df['tweet'].tolist()

# build the processed docs
processed_docs = []
for doc in tqdm(raw_docs):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_docs.append(" ".join(filtered))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_docs)
# each tweet gets tokenized -- word_seq is a list of tokenized tweets (length = 3059)
word_seq = tokenizer.texts_to_sequences(processed_docs)
word_seq = sequence.pad_sequences(word_seq, maxlen=max_seq_len)

# dictionary of words in tweets and their associated id
word_index = tokenizer.word_index
# used for deriving a word from an index (helps when dealing with padded sequences)
inverted_word_index = dict((v, k) for k, v in word_index.items())

embed_dim = 300
model_bin = "cc.en.300.bin"             # fastText model
model = fasttext.load_model(model_bin)  # loading fastText model
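# Hedged usage sketch: decode one padded tweet back to words with inverted_word_index
# (index 0 is padding and never appears in word_index, hence the fallback).
decoded = [inverted_word_index.get(idx, '<pad>') for idx in word_seq[0]]
print(' '.join(decoded))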
for doc in tqdm(raw_docs_train):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_docs_train.append(" ".join(filtered))

processed_docs_test = []
for doc in tqdm(raw_docs_test):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_docs_test.append(" ".join(filtered))

print("tokenizing input data...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_docs_train + processed_docs_test)  # leaky
word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)
word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)
word_index = tokenizer.word_index
print("dictionary size: ", len(word_index))

# pad sequences
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)
word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len)

# training params
batch_size = 128
num_epochs = 2
embed_dim = 300

# embedding matrix
    lambda words: len(words.split(" ")))
max_seq_len = np.round(train_df['doc_len'].mean() + train_df['doc_len'].std()).astype(int)

processed_comments_train = preprocess_df(train_df, tokenizer=tokenizer, stop_words=stop_words)
processed_comments_val = preprocess_df(val_df, tokenizer=tokenizer, stop_words=stop_words)
processed_comments_test = preprocess_df(test_df, tokenizer=tokenizer, stop_words=stop_words)

tokenizer = Tokenizer(num_words=max_nb_words, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_comments_train + processed_comments_val + processed_comments_test)
X_train = tokenizer.texts_to_sequences(processed_comments_train)
X_val = tokenizer.texts_to_sequences(processed_comments_val)
X_test = tokenizer.texts_to_sequences(processed_comments_test)
word_index = tokenizer.word_index

# pad sequences
X_train = sequence.pad_sequences(X_train, maxlen=max_seq_len)
print(X_train)
print(X_train.shape)
X_val = sequence.pad_sequences(X_val, maxlen=max_seq_len)
X_test = sequence.pad_sequences(X_test, maxlen=max_seq_len)

# TODO move everything to config file
# training params
batch_size = 256
def load_data_f(train_file, test_file):
    clean_questions = pd.read_csv(train_file)
    clean_test = pd.read_csv(test_file)

    tokenizer = RegexpTokenizer(r'\w+')
    clean_questions["tokens"] = clean_questions['comment_text'].astype(str).apply(tokenizer.tokenize)
    clean_test["tokens"] = clean_test['comment_text'].astype(str).apply(tokenizer.tokenize)

    all_words = [word for tokens in clean_questions["tokens"] for word in tokens]
    print(all_words[-1])
    sentence_lengths = [len(tokens) for tokens in clean_questions["tokens"]]
    VOCAB = sorted(list(set(all_words)))
    print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))
    print("Max sentence length is %s" % max(sentence_lengths))

    max_sequence_length = max(sentence_lengths) + 1
    num_words = len(VOCAB)

    tokenizer = Tokenizer(num_words)
    tokenizer.fit_on_texts(clean_questions['comment_text'].astype(str).tolist())
    tokenizer.fit_on_texts(clean_test['comment_text'].astype(str).tolist())
    sequences_train = tokenizer.texts_to_sequences(clean_questions['comment_text'].astype(str).tolist())
    sequences_test = tokenizer.texts_to_sequences(clean_test['comment_text'].astype(str).tolist())
    train_data = pad_sequences(sequences_train, maxlen=max_sequence_length)
    test_data = pad_sequences(sequences_test, maxlen=max_sequence_length)

    clean_questions['to_task1'] = [t1[w] for w in clean_questions['task_1']]
    clean_questions['to_task2'] = [t2[w] for w in clean_questions['task_2']]
    train_labelsA = to_categorical(np.array(clean_questions['to_task1']))
    train_labelsB = to_categorical(np.array(clean_questions['to_task2']), 4)

    clean_test['to_task1'] = [t1[w] for w in clean_test['task_1']]
    clean_test['to_task2'] = [t2[w] for w in clean_test['task_2']]
    test_labelsA = to_categorical(np.array(clean_test['to_task1']))
    test_labelsB = to_categorical(np.array(clean_test['to_task2']), 4)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    indices = np.arange(train_data.shape[0])
    np.random.shuffle(indices)
    train_data = train_data[indices]
    train_labelsA = train_labelsA[indices]
    train_labelsB = train_labelsB[indices]

    data = [[train_data, train_labelsA, train_labelsB],
            [test_data, test_labelsA, test_labelsB, clean_test['task_3'], clean_test['id']]]
    return data, word_index, max_sequence_length
"""## Neural Networks""" # tokenize to sequence, padding # get list of all words total_word = [] max_len = 0 for i in X_train["words_s"].tolist(): max_len = max(max_len, len(i)) for j in i: total_word.append(j) total_word = list(set(total_word)) # tokenize to sequences tokenizer = Tokenizer(num_words = len(total_word)) tokenizer.fit_on_texts(X_train["processed_s"]) train_sequences = tokenizer.texts_to_sequences(X_train["processed_s"]) test_sequences = tokenizer.texts_to_sequences(X_test["processed_s"]) # padding padded_train_s = pad_sequences(train_sequences, maxlen = max_len, padding = 'post', truncating = 'post') padded_test_s = pad_sequences(test_sequences, maxlen = max_len, padding = 'post', truncating = 'post') print("The padded encoding for document\n",X_train["processed_s"][0],"\n is : ",padded_train_s[0]) """RNN""" # Sequential Model model = Sequential() model.add(Embedding(len(total_word), output_dim = 150)) model.add(Bidirectional(SimpleRNN(150)))
EMBEDDINGS_PATH = './Models/SBW-vectors-300-min5.bin'
embedding_model = KeyedVectors.load_word2vec_format(EMBEDDINGS_PATH, binary=True)

# Build the training and test data
# kf = StratifiedKFold(n_splits=5, random_state=42)
fold = 1
max_len = 300
for train_index, test_index in kf.split(texts, labels):
    print("{}-fold".format(fold))
    texts_train = [texts[index] for index in train_index]
    texts_test = [texts[index] for index in test_index]
    y = pd.get_dummies(labels).values
    y_train, y_test = y[train_index], y[test_index]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts_train)
    x_train = tokenizer.texts_to_sequences(texts_train)
    x_test = tokenizer.texts_to_sequences(texts_test)
    x_train = pad_sequences(x_train, maxlen=max_len)
    x_test = pad_sequences(x_test, maxlen=max_len)

    # Save the word2vec weights
    input_dim = len(tokenizer.word_index)
    embedding_matrix = np.zeros((input_dim + 1, 300))
    for word, i in tokenizer.word_index.items():
        if word in embedding_model:
            embedding_matrix[i] = embedding_model[word]
    np.savez_compressed(
        './Data/K-Folds/{}-fold/embedding_matrix_baseline.npz'.format(fold),
        embedding_matrix,
    )
        x, vectors, generate_missing=generate_missing))
    return list(embeddings)


training_embeddings = get_word2vec_embeddings(word2vec, train_comments, generate_missing=True)
test_embeddings = get_word2vec_embeddings(word2vec, test_comments, generate_missing=True)

MAX_VOCAB_SIZE = 175303
embedding_vector_length = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(train_comments["comment_message"].tolist())

# Transform each sentence into a sequence of integers
training_sequences = tokenizer.texts_to_sequences(train_comments["comment_message"].tolist())
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

from keras.preprocessing.sequence import pad_sequences

# Pad or truncate every sequence to the same length
train_cnn_data = pad_sequences(training_sequences, maxlen=max_review_length)

train_embedding_weights = np.zeros((len(train_word_index) + 1, embedding_vector_length))
for word, index in train_word_index.items():
    # copy the pretrained vector when available (assumes the word2vec vectors have
    # length embedding_vector_length), otherwise use a random vector
    train_embedding_weights[index, :] = word2vec[word] if word in word2vec else np.random.rand(embedding_vector_length)
def train(address):
    # DATA_PATH = '/Users/wangergou/Downloads/kaggle/Toxic_Comment_Classification/CNN_Crawl/data/'
    # EMBEDDING_DIR = '/Users/wangergou/Downloads/kaggle/Toxic_Comment_Classification/CNN_Crawl/data/'
    DATA_PATH = address
    EMBEDDING_DIR = address
    MAX_NB_WORDS = 100000

    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

    print('loading word embeddings...')
    embeddings_index = {}
    f = codecs.open(EMBEDDING_DIR + 'crawl-300d-2M.vec', encoding='utf-8')
    for line in tqdm(f):
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('found %s word vectors' % len(embeddings_index))

    print("loading data...")
    train_df = pd.read_csv(DATA_PATH + 'train.csv', sep=',', header=0)
    test_df = pd.read_csv(DATA_PATH + 'test.csv', sep=',', header=0)
    test_df = test_df.fillna('_NA_')
    print("num train: ", train_df.shape[0])
    print("num test: ", test_df.shape[0])

    label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    y_train = train_df[label_names].values
    raw_docs_train = train_df['comment_text'].tolist()
    raw_docs_test = test_df['comment_text'].tolist()
    num_classes = len(label_names)

    train_df['doc_len'] = train_df['comment_text'].apply(lambda words: len(words.split(" ")))
    max_seq_len = np.round(train_df['doc_len'].mean() + train_df['doc_len'].std()).astype(int)

    print("pre-processing train data...")
    processed_docs_train = []
    for doc in tqdm(raw_docs_train):
        tokens = tokenizer.tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        processed_docs_train.append(" ".join(filtered))

    processed_docs_test = []
    for doc in tqdm(raw_docs_test):
        tokens = tokenizer.tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        processed_docs_test.append(" ".join(filtered))

    print("tokenizing input data...")
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
    tokenizer.fit_on_texts(processed_docs_train + processed_docs_test)  # leaky
    word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)
    word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)
    word_index = tokenizer.word_index
    print("dictionary size: ", len(word_index))

    # pad sequences
    word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)
    word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len)

    # training params
    batch_size = 256
    num_epochs = 8

    # model parameters
    num_filters = 512
    embed_dim = 300
    weight_decay = 1e-4

    # embedding matrix
    print('preparing embedding matrix...')
    words_not_found = []
    nb_words = min(MAX_NB_WORDS, len(word_index))
    embedding_matrix = np.zeros((nb_words, embed_dim))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if (embedding_vector is not None) and len(embedding_vector) > 0:
            # words not found in embedding index will be all-zeros
            embedding_matrix[i] = embedding_vector
        else:
            words_not_found.append(word)
    print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

    # CNN architecture
    print("training CNN ...")
    model = Sequential()
    model.add(Embedding(nb_words, embed_dim, weights=[embedding_matrix],
                        input_length=max_seq_len, trainable=False))
    model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
    model.add(Dense(num_classes, activation='sigmoid'))  # multi-label (k-hot encoding)

    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
    model.summary()

    # define callbacks
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
    callbacks_list = [early_stopping]

    # model training
    hist = model.fit(word_seq_train, y_train,
                     batch_size=batch_size,
                     epochs=num_epochs,
                     callbacks=callbacks_list,
                     validation_split=0.1,
                     shuffle=True,
                     verbose=2)

    y_test = model.predict(word_seq_test)

    # create a submission
    submission_df = pd.DataFrame(columns=['id'] + label_names)
    submission_df['id'] = test_df['id'].values
    submission_df[label_names] = y_test
    submission_df.to_csv(address + "cnn_fasttext_submission_512.csv", index=False)
MAX_SEQUENCE_LENGTH = 32
MAX_NUM_WORDS = 20000  # max token count is around 19361
EMBEDDING_DIM = 300

print("MAXIMUM")
print(max([len(X_train_pre["headline"][i].strip().split(" ")) for i in range(len(X_train_pre))]))
print(max([len(X_valid_pre["headline"][i].strip().split(" ")) for i in range(len(X_valid_pre))]))
print(max([len(X_test_pre["headline"][i].strip().split(" ")) for i in range(len(X_test_pre))]))
# print(X_test_pre["headline"][2])

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(X_train_pre['headline'].values)
X_train = tokenizer.texts_to_sequences(X_train_pre['headline'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of train data tensor:', X_train.shape)
# print("X_train[0]:", X_train[0])

X_test = tokenizer.texts_to_sequences(X_test_pre['headline'].values)  # every word got a new number
X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test data tensor:', X_test.shape)

X_valid = tokenizer.texts_to_sequences(X_valid_pre['headline'].values)  # every word got a new number
    tupleIndex += 1
    words = tuple[0].split()
    wordsList = []
    for word in words:
        wordNF = tokenizeWord(morph.parse(word)[0].normal_form)
        uniqWords.append(wordNF)
        wordsList.append(wordNF)
    testSentences.append(wordsList)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

print(len(uniqWords))
tokenizer = Tokenizer(num_words=len(uniqWords))
tokenizer.fit_on_texts(sentences)
vocab_size = len(tokenizer.word_index) + 1
maxlen = 1000

X_train = tokenizer.texts_to_sequences(sentences)
X_test = tokenizer.texts_to_sequences(testSentences)
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
print(vocab_size)

from keras.utils import to_categorical
y_train = df["label"]
y_test = test["label"]