def tokenize(self, writeDictionaryToCsv=False):
    print('Tokenizing')
    print('adding n-grams')
    self.trainData = self.addNgGrams(self.trainData)
    self.testData = self.addNgGrams(self.testData)
    all_reviews = self.trainData.append(self.testData)

    tokenizer = Tokenizer(num_words=30000)
    print('fitting')
    tokenizer.fit_on_texts(all_reviews)

    print('texts_to_sequences')
    self.trainData = tokenizer.texts_to_sequences(self.trainData)
    self.testData = tokenizer.texts_to_sequences(self.testData)

    print('sequences_to_matrix')
    self.trainData = tokenizer.sequences_to_matrix(self.trainData)
    self.testData = tokenizer.sequences_to_matrix(self.testData)
    all_reviews = np.vstack((self.trainData, self.testData))

    # Row-normalize the count matrices (note: this does not work with the SVM pipeline; not reworked yet)
    self.trainData = self.trainData / self.trainData.sum(axis=1)[:, None]
    self.testData = self.testData / self.testData.sum(axis=1)[:, None]

    if writeDictionaryToCsv:
        self.ExportFeatureSpace(tokenizer)
    print('Finished tokenizing')
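The `addNgGrams` helper called at the top of `tokenize` is not shown in this snippet. A minimal sketch of what such a method could look like, assuming it appends joined word n-grams to each text so the Tokenizer counts them as extra vocabulary entries (the method name comes from the snippet; the joining scheme and the `n` parameter are assumptions):

def addNgGrams(self, texts, n=2):
    # Hypothetical sketch: append joined n-grams (e.g. "very_good") to each text
    # so they become additional tokens in the Tokenizer's vocabulary.
    augmented = []
    for text in texts:
        words = text.split()
        ngrams = ['_'.join(words[i:i + n]) for i in range(len(words) - n + 1)]
        augmented.append(' '.join(words + ngrams))
    return augmented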
def nn_execute(nnmodelconfig):
    resultString = validate_model_json.validate_json(nnmodelconfig)
    if resultString == True:
        modelJSON = make_model_json.makeKerasModel(nnmodelconfig)

        np.random.seed(0)
        number_of_features = 1000

        np_load_old = np.load
        np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)
        (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=number_of_features)
        np.load = np_load_old

        tokenizer = Tokenizer(num_words=number_of_features)
        train_features = tokenizer.sequences_to_matrix(train_data, mode='binary')
        test_features = tokenizer.sequences_to_matrix(test_data, mode='binary')

        model = keras.models.model_from_json(modelJSON)
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        model.fit(train_features, train_labels, epochs=3, verbose=1, batch_size=100)

        val_loss, val_acc = model.evaluate(test_features, test_labels)

        results = {}
        results['loss'] = float(val_loss)
        results['accuracy'] = float(val_acc)
        results = json.dumps(results)

        # TODO: change according to frontend
        emit('sample_response', results, namespace='/samplenamespace')
    else:
        results = {}
        results['error'] = resultString
        results = json.dumps(results)

        # TODO: change according to frontend
        emit('sample_response', results, namespace='/samplenamespace')
def creat_train_data(maxWordsCount=15000, xLen=1000, step=100):
    tokenizer = Tokenizer(num_words=maxWordsCount)
    tokenizer.fit_on_texts(trainText)

    # Convert the texts to sequences of indices according to the frequency dictionary
    trainWordIndexes = tokenizer.texts_to_sequences(trainText)
    testWordIndexes = tokenizer.texts_to_sequences(testText)

    # Build the training and test sets
    xTrain, yTrain = createSetsMultiClasses(trainWordIndexes, xLen, step)
    xTest, yTest = createSetsMultiClasses(testWordIndexes, xLen, step)

    # Convert the index sequences into 0/1 matrices using the Bag of Words approach
    xTrain01 = tokenizer.sequences_to_matrix(xTrain.tolist())
    xTest01 = tokenizer.sequences_to_matrix(xTest.tolist())

    return xTrain, xTrain01, yTrain, xTest, xTest01, yTest
def creat_train_data(maxWordsCount=15000, xLen=1000, step=100):
    tokenizer = Tokenizer(num_words=maxWordsCount)
    tokenizer.fit_on_texts(trainText)  # "Feed" the texts to the method that builds the frequency dictionary

    # Convert the texts to sequences of indices according to the frequency dictionary
    trainWordIndexes = tokenizer.texts_to_sequences(trainText)  # Training texts to indices
    testWordIndexes = tokenizer.texts_to_sequences(testText)  # Test texts to indices

    # Build the training and test sets
    xTrain, yTrain = createSetsMultiClasses(trainWordIndexes, xLen, step)  # extract the training set
    xTest, yTest = createSetsMultiClasses(testWordIndexes, xLen, step)  # extract the test set

    # Convert the index sequences into 0/1 matrices using the Bag of Words approach
    xTrain01 = tokenizer.sequences_to_matrix(xTrain.tolist())  # Pass xTrain as a list so the method works
    xTest01 = tokenizer.sequences_to_matrix(xTest.tolist())  # Pass xTest as a list so the method works

    return xTrain, xTrain01, yTrain, xTest, xTest01, yTest
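`createSetsMultiClasses` is called in both versions of `creat_train_data` (and again in later snippets) but is never defined here. A plausible sketch, assuming it slides a window of `xLen` word indices with stride `step` over each class's sequence and returns the windows together with one-hot class labels:

import numpy as np
from tensorflow.keras.utils import to_categorical

def createSetsMultiClasses(wordIndexes, xLen, step):
    # Hypothetical sketch: wordIndexes holds one index sequence per class.
    # Cut each sequence into overlapping windows of xLen indices with stride `step`
    # and label every window with the class it was cut from.
    xSamples, ySamples = [], []
    for classIdx, indexes in enumerate(wordIndexes):
        for start in range(0, len(indexes) - xLen + 1, step):
            xSamples.append(indexes[start:start + xLen])
            ySamples.append(classIdx)
    return np.array(xSamples), to_categorical(ySamples, len(wordIndexes))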
def execute(self, testData):
    tokenizer = Tokenizer(num_words=3000)
    with open(os.getcwd() + '/model/email_classifier/dictionary.json', 'r') as dictionary_file:
        dictionary = json.load(dictionary_file)

    # read in your saved model structure
    json_file = open(os.getcwd() + '/model/email_classifier/model.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    # and create a model from that
    model = model_from_json(loaded_model_json)
    # and weight your nodes with your saved values
    model.load_weights(os.getcwd() + '/model/email_classifier/model.h5')

    labels = ['unknown_template', 'status_template', 'track_template']

    # convert each of the four text fields to word-index arrays using the saved dictionary
    testArr1 = executeModel.convert_text_to_index_array(testData[0], dictionary)
    testArr2 = executeModel.convert_text_to_index_array(testData[1], dictionary)
    testArr3 = executeModel.convert_text_to_index_array(testData[2], dictionary)
    testArr4 = executeModel.convert_text_to_index_array(testData[3], dictionary)

    input1 = tokenizer.sequences_to_matrix([testArr1], mode='binary')
    input2 = tokenizer.sequences_to_matrix([testArr2], mode='binary')
    input3 = tokenizer.sequences_to_matrix([testArr3], mode='binary')
    input4 = tokenizer.sequences_to_matrix([testArr4], mode='binary')

    # predict which bucket your input belongs in
    pred = model.predict([input1, input2, input3, input4])
    print('pred:', pred)
    # and print it for the humans
    print("%s sentiment; %f%% confidence" % (labels[np.argmax(pred)], pred[0][np.argmax(pred)] * 100))
    print('matches:', np.argmax(pred))
    return np.argmax(pred)
def create_tfidf(dataset_name):
    total = 0
    for path in Path(data_folder_name).glob('**/*.txt'):
        total += 1

    file = open(dataset_name + ".pv", "w")
    tokenizer_obj = Tokenizer()
    # tokenizer_obj.fit_on_texts(notewhorty_words_set)

    logging.info("FITTING WORDS")
    with tqdm(total=total) as pbar:
        for path in Path(data_folder_name).glob('**/*.txt'):
            document = [0 for _ in notewhorty_words]
            f = open(str(path), "r")
            text = f.read()
            f.close()
            text = text.lower()
            text = ''.join(c for c in text if c not in string.punctuation)
            words = text.split()
            new_text = []
            for word in words:
                if word in notewhorty_words_set:
                    new_text.append(word)
            tokenizer_obj.fit_on_texts(new_text)
            pbar.update(1)

    logging.info("COMPUTE TF-IDF WORDS")
    with tqdm(total=total) as pbar:
        for path in Path(data_folder_name).glob('**/*.txt'):
            document = [0 for _ in notewhorty_words]
            f = open(str(path), "r")
            text = f.read()
            f.close()
            file.write(str(path))
            file.write(" ")
            text = text.lower()
            text = ''.join(c for c in text if c not in string.punctuation)
            words = text.split()
            new_text = []
            for word in words:
                if word in notewhorty_words_set:
                    new_text.append(word)
            sequence = tokenizer_obj.texts_to_sequences([new_text])
            document = tokenizer_obj.sequences_to_matrix(sequence, mode='tfidf')
            for word in document[0]:
                file.write(str(word))
                file.write(" ")
            for target in get_target(str(path)):
                file.write(str(target))
                file.write(" ")
            file.write('\n')
            pbar.update(1)
def closure(mu):
    (x_train, y_train), (_, _) = imdb.load_data()
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_sequences(x_train)
    x_train = tokenizer.sequences_to_matrix(x_train, mode="tfidf")

    # Note: svd_solver='full' is needed on the GPU server
    x_train = PCA(n_components=100, svd_solver='full').fit_transform(x_train)

    ds = {"data": x_train, "target": y_train}

    # Apply noise and return
    res = preprocess_and_noise(dataset=ds, mu=mu)
    return res
train_labels = open('train_labels.txt', 'rb').read().decode('utf-8').split('\n')
test_texts = open('test_contents.txt', 'rb').read().decode('utf-8').split('\n')
test_labels = open('test_labels.txt', 'rb').read().decode('utf-8').split('\n')
all_texts = train_texts + test_texts
all_labels = train_labels + test_labels

print('(2) doc to var...')
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_texts)
sequences = tokenizer.texts_to_sequences(all_texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = tokenizer.sequences_to_matrix(sequences, mode='tfidf')
labels = to_categorical(np.asarray(all_labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

print('(3) split data set...')
p1 = int(len(data) * (1 - VALIDATION_SPLIT - TEST_SPLIT))
p2 = int(len(data) * (1 - TEST_SPLIT))
x_train = data[:p1]
y_train = labels[:p1]
x_val = data[p1:p2]
y_val = labels[p1:p2]
x_test = data[p2:]
y_test = labels[p2:]
print('train docs: ' + str(len(x_train)))
num_words=None, skip_top=0, maxlen=None)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(X_train[np.random.randint(0, len(X_train))])
print(set(y_train))

V = 5000
tokenizer = Tokenizer(num_words=V)
X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
print(X_train[np.random.randint(0, len(X_train))])
print(X_train.shape)
print(X_test.shape)

classes = len(set(y_train))
y_train = keras.utils.to_categorical(y_train, classes)
y_test = keras.utils.to_categorical(y_test, classes)
print(y_train.shape)
print(y_test.shape)

print("tokens = " + str(V))
i_layer = Input(shape=(V,))
def main():
    if not os.path.exists(os.getcwd() + '/model/'):
        os.mkdir(os.getcwd() + '/model/')
    if not os.path.exists(os.getcwd() + '/model/email_classifier'):
        os.mkdir(os.getcwd() + '/model/email_classifier')

    training = np.genfromtxt(os.getcwd() + '/data/dnn_model_training_data.txt',
                             delimiter=',', skip_header=1,
                             usecols=(0, 1, 2, 3, 4), dtype=None, encoding='utf-8')
    feature = [[str(x[1]), str(x[2]), str(x[3]), str(x[4])] for x in training]

    # index all the sentiment labels
    label = np.asarray([x[0] for x in training])

    # only work with the 3000 most popular words found in our dataset
    max_words = 3000

    # create a new Tokenizer
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(feature)

    dictionary = tokenizer.word_index
    with open(os.getcwd() + '/model/email_classifier/dictionary.json', 'w') as dictionary_file:
        json.dump(dictionary, dictionary_file)

    wordIndicesList1 = []
    wordIndicesList2 = []
    wordIndicesList3 = []
    wordIndicesList4 = []
    # convert each of the four text fields of every row to word-index arrays
    for featurerow in feature:
        wordIndices1 = convert_text_to_index_array(featurerow[0], dictionary)
        wordIndices2 = convert_text_to_index_array(featurerow[1], dictionary)
        wordIndices3 = convert_text_to_index_array(featurerow[2], dictionary)
        wordIndices4 = convert_text_to_index_array(featurerow[3], dictionary)
        wordIndicesList1.append(wordIndices1)
        wordIndicesList2.append(wordIndices2)
        wordIndicesList3.append(wordIndices3)
        wordIndicesList4.append(wordIndices4)

    wordIndicesList1 = np.asarray(wordIndicesList1)
    wordIndicesList2 = np.asarray(wordIndicesList2)
    wordIndicesList3 = np.asarray(wordIndicesList3)
    wordIndicesList4 = np.asarray(wordIndicesList4)

    feature1 = tokenizer.sequences_to_matrix(wordIndicesList1, mode='binary')
    feature2 = tokenizer.sequences_to_matrix(wordIndicesList2, mode='binary')
    feature3 = tokenizer.sequences_to_matrix(wordIndicesList3, mode='binary')
    feature4 = tokenizer.sequences_to_matrix(wordIndicesList4, mode='binary')

    label = kr.utils.to_categorical(label, 3)

    model = Sequential()
    model.add(Dense(512, input_shape=(max_words,), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(x=[feature1, feature2, feature3, feature4], y=label,
              batch_size=200, epochs=5, verbose=1, validation_split=0.2, shuffle=True)

    model_json = model.to_json()
    with open(os.getcwd() + '/model/email_classifier/model.json', 'w') as json_file:
        json_file.write(model_json)
    model.save_weights(os.getcwd() + '/model/email_classifier/model.h5')
    print('saved model!')
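`convert_text_to_index_array` is used both here and in the `execute` snippet above but is not defined in either. A minimal sketch, assuming it maps each word of a text to its index in the saved `word_index` dictionary and drops unseen words:

from tensorflow.keras.preprocessing.text import text_to_word_sequence

def convert_text_to_index_array(text, dictionary):
    # Hypothetical sketch: look up every word in the saved word_index dictionary,
    # skipping words that were not seen when the Tokenizer was fit.
    return [dictionary[word] for word in text_to_word_sequence(text) if word in dictionary]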
batch_size = 32
epochs = 5

print("Loading data...")
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words, test_split=0.2)
print(len(x_train), "train sequences")
print(len(x_test), "test sequences")

num_classes = np.max(y_train) + 1
print(num_classes, "classes")

print("Vectorizing sequence data...")
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode="binary")
x_test = tokenizer.sequences_to_matrix(x_test, mode="binary")
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)

print("Convert class vector to binary class matrix "
      "(for use with categorical_crossentropy)")
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

print("Building model...")
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation("relu"))
class TypeClassifier:
    def __init__(self,
                 max_words=10000,
                 xmldata='/home/leon/Documents/Golden_Agents/saa-notarialTexts/page',
                 ydata='/home/leon/Documents/Golden_Agents/page-corrector/classification/data/scans.csv'):
        self.xmldata = xmldata
        self._parseScanData(csvpath=ydata)

        self.tokenizer = Tokenizer(num_words=max_words,
                                   filters='#$%&/;<=>@[\\]^_`{|}~\t',
                                   lower=True,
                                   split=' ',
                                   char_level=False,
                                   oov_token=None)

        if max_words:
            self.max_words = max_words
        else:
            self.max_words = len(self.tokenizer.word_counts)

        # get texts and labels
        texts, labels = zip(*self.getTexts(folder=self.xmldata, train_only=True))

        # fit on the corpus (`xmldata`)
        self.tokenizer.fit_on_texts(texts)
        self.X = self.tokenizer.texts_to_sequences(texts)

        # encode class labels
        self.encoder = LabelEncoder()
        self.y = self.encoder.fit_transform(labels)

        # Split in train and test
        X_train, X_test, y_train, y_test = train_test_split(self.X,
                                                            self.y,
                                                            test_size=0.2,
                                                            random_state=42,
                                                            shuffle=True,
                                                            stratify=self.y)

        self.X_train = self.tokenizer.sequences_to_matrix(X_train, mode='tfidf')
        self.X_test = self.tokenizer.sequences_to_matrix(X_test, mode='tfidf')

        self.n_classes = len(list(self.encoder.classes_))
        self.y_train = keras.utils.to_categorical(y_train, self.n_classes)
        self.y_test = keras.utils.to_categorical(y_test, self.n_classes)

        # construct model
        self.constructModel()

        # train
        self.train()

    def getTexts(self, folder: str, train_only=True):
        if train_only:
            for f in os.listdir(folder):
                filepath = os.path.join(folder, f)
                scanid = os.path.splitext(f)[0].upper()
                if scanid in self.scan2type:
                    yield xml2text(filepath), self.scan2type[scanid]
        else:
            return (xml2text(os.path.join(folder, f)) for f in os.listdir(folder))

    def constructModel(self):
        model = Sequential()
        model.add(Dense(512, input_shape=(self.max_words,), activation='relu'))
        model.add(Dropout(0.5))
        # model.add(Dense(256, activation='relu'))
        # model.add(Dropout(0.5))
        # model.add(Dense(128, activation='relu'))
        # model.add(Dropout(0.5))
        model.add(Dense(self.n_classes, activation='sigmoid'))

        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy', f1])
        print(model.metrics_names)

        self.model = model

    def train(self, batch_size=32, epochs=10):
        self.history = self.model.fit(self.X_train,
                                      self.y_train,
                                      batch_size=batch_size,
                                      shuffle=True,
                                      epochs=epochs,
                                      verbose=1,
                                      validation_data=(self.X_test, self.y_test))

        score = self.model.evaluate(self.X_test,
                                    self.y_test,
                                    batch_size=batch_size,
                                    verbose=1)

        y_val_pred = self.model.predict(self.X_test)
        # classification_report expects class indices, not one-hot vectors or probabilities
        y_pred_bool = np.argmax(y_val_pred, axis=1)
        y_true = np.argmax(self.y_test, axis=1)

        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
        print('Test f1-score', score[2])
        print(classification_report(y_true,
                                    y_pred_bool,
                                    target_names=self.encoder.classes_))

    def _parseScanData(self, csvpath: str):
        df = pd.read_csv(csvpath)

        scan2type = dict()
        scan2record = dict()
        for d in df.to_dict(orient='records'):
            scanid = d['scan'].split('/Scan/')[1].upper()
            scan2type[scanid] = d['type']
            scan2record[scanid] = d['record']

        self.scan2type = scan2type
        self.scan2record = scan2record
test = neg_test + pos_test
train_string = process_doc(train, vocab)
test_string = process_doc(test, vocab)
full_text_string = neg_train + neg_test + pos_train + pos_test

for key, value in vocab.items():
    index_to_word[value] = key

max_words = 10000
tokenizer = Tokenizer(num_words=len(vocab))
tokenizer.fit_on_texts(train)
Xtrain = tokenizer.texts_to_sequences(train)
Xtrain = tokenizer.sequences_to_matrix(Xtrain)
tokenizer.fit_on_texts(test)
Xtest = tokenizer.texts_to_sequences(test)
Xtest = tokenizer.sequences_to_matrix(Xtest)

ytrain = np.asarray([0 for _ in range(len(neg_train))] +
                    [1 for _ in range(len(pos_train))]).astype('float64').reshape(-1, 1)
ytest = np.asarray([0 for _ in range(len(neg_test))] +
                   [1 for _ in range(len(pos_test))]).astype('float64').reshape(-1, 1)

# define network
model = Sequential()
# each sample is a bag-of-words vector, so the input shape is the vocabulary size,
# not the full (samples, features) shape of Xtrain
model.add(Dense(50, input_shape=(Xtrain.shape[1],), activation='relu'))
class SequenceTokenizer():
    def __init__(self, annotations, node_list, padding='post', maxlen=2000, truncating='post',
                 agg_mode=None, tokenizer=None, verbose=False) -> None:
        """
        Handles text tokenizing for DNA/RNA/Protein sequences.

        Args:
            padding: ['post', 'pre', None]
            maxlen (int): pad all RNA sequence strings to this length
            truncating: ['post', 'pre', 'random']. If 'random', then 'post' or 'pre'
                truncating is chosen randomly for each sequence at each iteration
            agg_mode: one of {"count", "tfidf", "binary", "freq"}, default None.
                If not None, instead of returning sequence encodings,
                get_sequence_encodings will return an aggregated numpy vector.
            tokenizer: pass an existing tokenizer instead of creating one
        """
        self.maxlen = maxlen
        self.padding = padding
        self.truncating = truncating
        self.agg_mode = agg_mode
        self.annotations = annotations
        self.node_list = node_list
        self.verbose = verbose

        if tokenizer is not None:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = Tokenizer(char_level=True, lower=False)
            self.tokenizer.fit_on_texts(self.annotations.loc[self.node_list, SEQUENCE_COL])

        if self.verbose:
            print("word index:", self.tokenizer.word_index)

    def sample_sequences(self, sequences):
        return sequences.apply(lambda x: random.choice(x) if isinstance(x, list) else x)

    def get_sequence_encodings(self, node_list: list, variable_length=False, minlen=None):
        """
        Returns an ndarray of shape (batch_size, sequence length, n_words) given a list
        of node ids (indexing from self.node_list).

        :param node_list: a list of node names to fetch transcript sequences
        :param variable_length: returns a list of sequences with different timestep length
        :param minlen: pad all sequences with length lower than this minlen
        """
        annotations = self.annotations
        seqs = annotations.loc[node_list, SEQUENCE_COL]
        padded_encoded_seqs = self.encode_texts(seqs, maxlen=self.maxlen, minlen=minlen,
                                                variable_length=variable_length)
        return padded_encoded_seqs

    def encode_texts(self, texts, maxlen=None, minlen=None, variable_length=False):
        """
        Returns a one-hot-vector for a string of RNA transcript sequence.

        :param texts: [str | list(str)]
        :param maxlen: pad sequences to this maximum length
        :param minlen: pad all sequences with length lower than this minlen
        :param variable_length: if True, return the un-padded integer-encoded sequences
        """
        # integer encode
        encoded = self.tokenizer.texts_to_sequences(texts)

        if variable_length:
            return encoded
        elif self.agg_mode:
            return self.tokenizer.sequences_to_matrix(encoded, mode=self.agg_mode)

        # Pad sequences to the same length
        batch_maxlen = max([len(x) for x in encoded])
        if batch_maxlen < self.maxlen:
            maxlen = batch_maxlen

        if minlen and len(texts[0]) < minlen:
            maxlen = minlen

        # pad encoded sequences
        encoded = pad_sequences(
            encoded, maxlen=maxlen, padding=self.padding,
            truncating=np.random.choice(["post", "pre"]) if self.truncating == "random" else self.truncating,
            dtype=SEQ_DTYPE)

        return encoded
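A hypothetical usage sketch for `SequenceTokenizer`, assuming `annotations` is a pandas DataFrame indexed by node id and that the module-level `SEQUENCE_COL` constant names the column holding the raw sequence strings:

import pandas as pd

# Toy data: two nodes with short DNA sequences (SEQUENCE_COL is the module constant the class reads).
annotations = pd.DataFrame({SEQUENCE_COL: ["ACGTACGT", "GGTTAACC"]}, index=["node1", "node2"])

seq_tok = SequenceTokenizer(annotations, node_list=["node1", "node2"], maxlen=10, agg_mode="count")
# With agg_mode set, get_sequence_encodings returns an aggregated (n_nodes, n_tokens) count matrix
# instead of padded index sequences.
counts = seq_tok.get_sequence_encodings(["node1", "node2"])
print(counts.shape)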
cc = []
for i in x_train_token:
    cc.append(len(i))
plt.hist(cc, bins=50)
plt.show()

len(x_train_token), len(xTrain)

ind = [len(i) < 300 for i in x_train_token]  # Drop all examples longer than 300 tokens
sum(ind)

x_Train = xTrain[ind]
x_train_text = np.array(x_train_token)[ind]
xTrainC01 = tokenizer.sequences_to_matrix(x_train_text, mode="count")
xTrainC01.shape

y_Train = yTrain[ind]

# Print the shapes of the training set
# to check that everything was assembled correctly
print(x_Train.shape)
print(xTrainC01.shape)
print(y_Train.shape)

# Normalize the apartment size feature in xTrain
xScaler = StandardScaler()  # Create a standard (normal-distribution) scaler
xScaler.fit(x_Train[:, -1].reshape(-1, 1))  # Fit it on the apartment areas (the last column of xTrain)
xTrainScaled = x_Train.copy()
xTrainScaled[:, -1] = xScaler.transform(x_Train[:, -1].reshape(-1, 1)).flatten()  # Scale the data with the fitted scaler
items = list(tokenizer.word_index.items())  # Pull out the word indices for inspection

# Convert the texts to sequences of indices according to the frequency dictionary
trainWordIndexes = tokenizer.texts_to_sequences(trainText)  # Training texts to indices
testWordIndexes = tokenizer.texts_to_sequences(testText)  # Test texts to indices

# Set the basic parameters
xLen = 1000  # Length of the text segment (in words) used for analysis
step = 100   # Step for splitting the source text into training vectors

# Build the training and test sets
xTrain, yTrain = createSetsMultiClasses(trainWordIndexes, xLen, step)  # extract the training set
xTest, yTest = createSetsMultiClasses(testWordIndexes, xLen, step)  # extract the test set

# Convert the index sequences into 0/1 matrices using the Bag of Words approach
xTrain01 = tokenizer.sequences_to_matrix(xTrain.tolist())  # Pass xTrain as a list so the method works
xTest01 = tokenizer.sequences_to_matrix(xTest.tolist())  # Pass xTest as a list so the method works

# Print the array shapes
print(xTrain01.shape)
print(yTrain.shape)
print(xTest01.shape)
print(yTest.shape)

tokenizer.word_index

# Set the basic parameters
xLen = 1000  # Length of the text segment (in words) used for analysis
step = 100   # Step for splitting the source text into training vectors

# Build the training and test sets
def создать_выборки(xLen):
    print('Creating the training dataset')
    print('This may take a few minutes...')
    path = 'content/Болезни/'
    text = []
    classes = []
    n = 0
    codecs_list = ['UTF-8', 'Windows-1251']
    for filename in os.listdir(path):  # Walk over all files in the diseases directory
        n += 1
        for codec_s in codecs_list:
            try:
                text.append(readText(path + filename, codec_s))  # Read the file into a single string and append it
                classes.append(filename.replace(".txt", ""))
                break
            except UnicodeDecodeError:
                print('Could not read file: ', path + currdir + '/' + filename, codec_s)
        else:
            pass

    stop_words = nltk.corpus.stopwords.words('russian')
    lexeme_list = ['POS', 'animacy', 'aspect', 'case', 'gender', 'involvement', 'mood',
                   'number', 'person', 'tense', 'transitivity', 'voice']

    words = []     # All the word lists, one per disease description
    tags = []      # All the lists of grammeme lists, one per word
    tags_all = []  # All the grammeme lists of every word, used to train the tokenizer
    for i in range(len(text)):
        word, tag = text2Words(text[i])
        words.append(word)
        tags.append(tag)
    for k in tags:
        for t in k:
            tags_all.append(t)

    #################
    # Convert the text data into numeric/vector form for training the network
    #################

    # Maximum number of words in the vocabulary
    maxWordsCount = 1100
    # Keras tokenizer
    tokenizer = Tokenizer(num_words=maxWordsCount,
                          filters='!"#$%&()*+,-––—./:;<=>?@[\\]^_`{|}~\t\n\xa0',
                          lower=True, split=' ', oov_token='unknown', char_level=False)
    # Feed it the words
    tokenizer.fit_on_texts(words)
    items = list(tokenizer.word_index.items())

    tokenizer_json1 = tokenizer.to_json()
    with io.open('tokenizer1.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(tokenizer_json1, ensure_ascii=False))
    with open('tokenizer1.json') as f:
        data = json.load(f)
    tokenizer = tokenizer_from_json(data)
    items = list(tokenizer.word_index.items())
    # The first 10 words of the vocabulary could be printed here

    # Maximum number of entries in the grammeme vocabulary
    maxWordsCount2 = 50
    # Keras tokenizer for the grammemes
    tokenizer2 = Tokenizer(num_words=maxWordsCount2,
                           filters='!"#$%&()*+,-––—./:;<=>?@[\\]^_`{|}~\t\n\xa0',
                           lower=True, split=' ', oov_token='unknown', char_level=False)
    # Feed it the grammemes
    tokenizer2.fit_on_texts(tags_all)
    items2 = list(tokenizer2.word_index.items())

    tokenizer_json2 = tokenizer2.to_json()
    with io.open('tokenizer2.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(tokenizer_json2, ensure_ascii=False))
    with open('tokenizer2.json') as f:
        data = json.load(f)
    tokenizer2 = tokenizer_from_json(data)
    items2 = list(tokenizer2.word_index.items())

    # Convert the texts to sequences of indices according to the frequency dictionary
    xTrainIndexes = tokenizer.texts_to_sequences(words)  # Training texts to indices
    # Convert the tags to sequences of indices according to the frequency dictionary
    xTrainTagsIndexes = []
    # The tags have an extra level of nesting, so iterate in the format the tokenizer expects
    for tag in tags:
        xTrainTagsIndexes.append(tokenizer2.texts_to_sequences(tag))

    nVal = 200       # Number of words held out for the validation set
    trainWords = []  # Words for the training set
    valWords = []    # Words for the validation set
    for i in range(len(xTrainIndexes)):
        trainWords.append(xTrainIndexes[i][:-nVal])
        valWords.append(xTrainIndexes[i][-nVal:])

    trainTagsWords = []  # Tags for the training set
    valTagsWords = []    # Tags for the validation set
    for i in range(len(xTrainTagsIndexes)):
        trainTagsWords.append(xTrainTagsIndexes[i][:-nVal])
        valTagsWords.append(xTrainTagsIndexes[i][-nVal:])

    step = 1  # step size
    # Build "sliced" samples of length xLen, 4000 examples per class
    (xTrain, yTrain) = createSetsMultiClassesBallanced(trainWords, xLen, step, 4000)
    # (xTrain, yTrain) = createSetsMultiClasses(trainWords, xLen, step)
    (xTrainTags, _) = createSetsMultiClassesBallanced(trainTagsWords, xLen, step, 4000)

    # Convert the training index sequences into Bag of Words matrices (tf-idf weighted)
    xTrain01 = tokenizer.sequences_to_matrix(xTrain.tolist(), mode="tfidf")  # Pass xTrain as a list so the method works
    # xTrainTags01 = tokenizer2.sequences_to_matrix(xTrainTags)

    # Build "sliced" samples of length xLen for the validation set
    (xVal, yVal) = createSetsMultiClasses(valWords, xLen, step)
    # (xVal, yVal) = createSetsMultiClassesBallanced(valWords, xLen, step, 600)
    (xTagsVal, _) = createSetsMultiClasses(valTagsWords, xLen, step)
    # (xTagsVal, _) = createSetsMultiClassesBallanced(valTagsWords, xLen, step, 600)

    # Convert the validation index sequences into Bag of Words matrices (tf-idf weighted)
    xVal01 = tokenizer.sequences_to_matrix(xVal.tolist(), mode="tfidf")  # Pass xVal as a list so the method works
    # xTagsVal01 = tokenizer2.sequences_to_matrix(xTagsVal.tolist())

    xTrainTags = np.reshape(xTrainTags, (xTrainTags.shape[0], -1))
    xTagsVal = np.reshape(xTagsVal, (xTagsVal.shape[0], -1))
    xTrainTags01 = tokenizer2.sequences_to_matrix(xTrainTags.tolist())
    xTagsVal01 = tokenizer2.sequences_to_matrix(xTagsVal.tolist())

    x_train = [xTrain, xTrain01, xTrainTags01]
    y_train = yTrain
    x_val = [xVal, xVal01, xTagsVal01]
    y_val = yVal
    valw = (valWords, valTagsWords, tokenizer, tokenizer2)

    display.clear_output(wait=True)
    print('Dataset creation finished')
    return (x_train, y_train), (x_val, y_val)
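`readText` (used at the top of `создать_выборки` when loading the disease descriptions) is not shown. A minimal sketch, assuming it simply reads the whole file with the given codec and returns it as one string; a `UnicodeDecodeError` would propagate to the caller, which retries with the next codec from `codecs_list`:

def readText(fileName, codec='UTF-8'):
    # Hypothetical sketch: read the file with the requested encoding and return its text.
    with open(fileName, 'r', encoding=codec) as f:
        return f.read()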
# Random seed
np.random.seed(0)

number_of_features = 10000

np_load_old = np.load
# modify the default parameters of np.load
np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)
# call load_data with allow_pickle implicitly set to true
(data_train, target_train), (data_test, target_test) = imdb.load_data(num_words=number_of_features)

# Convert the movie review data into a one-hot encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features)
features_train = tokenizer.sequences_to_matrix(data_train, mode="binary")
features_test = tokenizer.sequences_to_matrix(data_test, mode="binary")

# Build the neural network model
network = models.Sequential()

# Add a dropout layer as the input layer
network.add(layers.Dropout(0.3, input_shape=(number_of_features,)))

# Add a fully connected layer with a ReLU activation function
network.add(layers.Dense(units=128, activation="relu"))

# Add a dropout layer for previous hidden layer
network.add(layers.Dropout(0.6))
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

np.random.seed(100)

from tensorflow.keras.datasets import reuters
from tensorflow.keras.preprocessing.text import Tokenizer

# Get data
np_load_old = np.load
np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)
(x, y), (xtest, ytest) = reuters.load_data(num_words=10000)
np.load = np_load_old

# Process text
tokenizer = Tokenizer(num_words=10000)
xtrain = tokenizer.sequences_to_matrix(x, mode='binary')
xtest = tokenizer.sequences_to_matrix(xtest, mode='binary')
ytrain = to_categorical(y)
ytest = to_categorical(ytest)

# Initialize model
model = Sequential()
model.add(Dense(512, activation='relu'))
model.add(Dropout(.5))
model.add(Dense(ytrain.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit model
char_level=False)
tokenizer.fit_on_texts(XTrainInfo)  # "Feed" the texts to the method that builds the frequency dictionary
items = list(tokenizer.word_index.items())  # Pull out the word indices for inspection
print(len(items))

# Convert the training and test sets into index sequences
XTrainInfoIndexes = tokenizer.texts_to_sequences(XTrainInfo)
XTestInfoIndexes = tokenizer.texts_to_sequences(XTestInfo)

# Convert the training set from index sequences into 0/1 matrices using the Bag of Words approach
x_train01 = tokenizer.sequences_to_matrix(XTrainInfoIndexes)  # Pass XTrainInfoIndexes as a list so the method works
print(x_train01.shape)  # Shape of the Bag of Words training set
print(x_train01[500][0:20])
print(len(x_train01[500]))

# Convert the test set from index sequences into 0/1 matrices using the Bag of Words approach
x_test01 = tokenizer.sequences_to_matrix(XTestInfoIndexes)  # Pass XTestInfoIndexes as a list so the method works
print(x_test01.shape)  # Shape of the Bag of Words test set
print(x_test01[500][0:20])
print(len(x_test01[500]))