Example #1
    def tokenize(self, writeDictionaryToCsv=False):
        print('Tokenizing')
        print('adding n-grams')
        self.trainData = self.addNgGrams(self.trainData)
        self.testData = self.addNgGrams(self.testData)

        all_reviews = self.trainData.append(self.testData)
        tokenizer = Tokenizer(num_words=30000)
        print('fitting')
        tokenizer.fit_on_texts(all_reviews)

        print('texts_to_sequences')
        self.trainData = tokenizer.texts_to_sequences(self.trainData)
        self.testData = tokenizer.texts_to_sequences(self.testData)
        print('sequences_to_matrix')
        self.trainData = tokenizer.sequences_to_matrix(self.trainData)
        self.testData = tokenizer.sequences_to_matrix(self.testData)

        all_reviews = np.vstack((self.trainData, self.testData))

        # Note: this row normalization does not work with the SVM path; no time to change the code
        self.trainData = self.trainData / self.trainData.sum(axis=1)[:, None]
        self.testData = self.testData / self.testData.sum(axis=1)[:, None]

        if writeDictionaryToCsv:
            self.ExportFeatureSpace(tokenizer)

        print('Finished tokenizing')
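A minimal sketch of a safer variant of the row normalization above (an addition, not part of the original example): sequences_to_matrix can produce rows that sum to zero when a review contains no in-vocabulary tokens, and guarding the division avoids NaNs and runtime warnings.

import numpy as np

def normalize_rows(bow):
    # Row-normalize a bag-of-words matrix, leaving all-zero rows as zeros.
    row_sums = bow.sum(axis=1, keepdims=True)
    return np.divide(bow, row_sums,
                     out=np.zeros_like(bow, dtype=float),
                     where=row_sums != 0)

# hypothetical usage in place of the manual division above:
# self.trainData = normalize_rows(self.trainData)
# self.testData = normalize_rows(self.testData)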
Example #2
def nn_execute(nnmodelconfig):

    resultString = validate_model_json.validate_json(nnmodelconfig)

    if resultString is True:

        modelJSON = make_model_json.makeKerasModel(nnmodelconfig)

        np.random.seed(0)

        number_of_features = 1000

        np_load_old = np.load

        np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)

        (train_data, train_labels), (test_data, test_labels) = imdb.load_data(
            num_words=number_of_features)

        np.load = np_load_old

        tokenizer = Tokenizer(num_words=number_of_features)
        train_features = tokenizer.sequences_to_matrix(train_data,
                                                       mode='binary')
        test_features = tokenizer.sequences_to_matrix(test_data, mode='binary')

        model = keras.models.model_from_json(modelJSON)

        model.compile(loss='binary_crossentropy',
                      optimizer='rmsprop',
                      metrics=['accuracy'])

        model.fit(train_features,
                  train_labels,
                  epochs=3,
                  verbose=1,
                  batch_size=100)

        val_loss, val_acc = model.evaluate(test_features, test_labels)

        results = {}
        results['loss'] = float(val_loss)
        results['accuracy'] = float(val_acc)
        results = json.dumps(results)

        #TO_DO - change according to frontend
        emit('sample_response', results, namespace='/samplenamespace')

    else:
        results = {}
        results['error'] = resultString
        results = json.dumps(results)

        #TO_DO - change according to frontend
        emit('sample_response', results, namespace='/samplenamespace')
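The np.load monkey-patch above works around older Keras dataset loaders that call np.load without allow_pickle=True; a small sketch (an addition, not the author's code) wraps the same patch in a context manager so np.load is restored even if load_data raises. Newer TensorFlow/Keras releases no longer need the patch.

import contextlib
import numpy as np

@contextlib.contextmanager
def numpy_allow_pickle():
    # Temporarily force allow_pickle=True for np.load, then restore the original.
    np_load_old = np.load
    np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)
    try:
        yield
    finally:
        np.load = np_load_old

# hypothetical usage:
# with numpy_allow_pickle():
#     (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=1000)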
Example #3
def creat_train_data(maxWordsCount = 15000, xLen = 1000, step = 100):
    tokenizer = Tokenizer(num_words=maxWordsCount)
    tokenizer.fit_on_texts(trainText)

    # Convert the texts into sequences of indices according to the frequency dictionary
    trainWordIndexes = tokenizer.texts_to_sequences(trainText)
    testWordIndexes = tokenizer.texts_to_sequences(testText)

    # Build the training and test sets
    xTrain, yTrain = createSetsMultiClasses(trainWordIndexes, xLen, step)
    xTest, yTest = createSetsMultiClasses(testWordIndexes, xLen, step)

    # Convert the resulting index sequences into matrices of zeros and ones (Bag of Words)
    xTrain01 = tokenizer.sequences_to_matrix(xTrain.tolist())
    xTest01 = tokenizer.sequences_to_matrix(xTest.tolist())

    return xTrain, xTrain01, yTrain, xTest, xTest01, yTest
def creat_train_data(maxWordsCount = 15000, xLen = 1000, step = 100):
    tokenizer = Tokenizer(num_words=maxWordsCount)
    tokenizer.fit_on_texts(trainText) # "Feed" our texts to the method that builds the frequency dictionary

    # Convert the texts into sequences of indices according to the frequency dictionary
    trainWordIndexes = tokenizer.texts_to_sequences(trainText) # training texts to indices
    testWordIndexes = tokenizer.texts_to_sequences(testText)  # test texts to indices

    # Build the training and test sets
    xTrain, yTrain = createSetsMultiClasses(trainWordIndexes, xLen, step) # extract the training set
    xTest, yTest = createSetsMultiClasses(testWordIndexes, xLen, step)    # extract the test set

    # Convert the resulting index sequences into matrices of zeros and ones (Bag of Words)
    xTrain01 = tokenizer.sequences_to_matrix(xTrain.tolist()) # Pass xTrain as a list so the method works correctly
    xTest01 = tokenizer.sequences_to_matrix(xTest.tolist())   # Pass xTest as a list so the method works correctly

    return xTrain, xTrain01, yTrain, xTest, xTest01, yTest
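For readers unfamiliar with the Bag of Words step above, a tiny self-contained illustration (an addition; the toy corpus and the exact indices are made up) of what sequences_to_matrix returns in its default 'binary' mode: one row per sequence, one column per word index, with 1 where the word occurs.

from tensorflow.keras.preprocessing.text import Tokenizer

tok = Tokenizer(num_words=10)
tok.fit_on_texts(["the cat sat", "the dog sat"])
seqs = tok.texts_to_sequences(["the cat sat", "the dog sat"])
print(seqs)                           # e.g. [[1, 3, 2], [1, 4, 2]]
print(tok.sequences_to_matrix(seqs))  # shape (2, 10): rows of 0/1 word flags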
Example #5
    def execute(self, testData):
        tokenizer = Tokenizer(num_words=3000)

        with open(os.getcwd() + '/model/email_classifier/dictionary.json',
                  'r') as dictionary_file:
            dictionary = json.load(dictionary_file)
        # read in your saved model structure
        json_file = open(os.getcwd() + '/model/email_classifier/model.json',
                         'r')
        loaded_model_json = json_file.read()
        json_file.close()
        # and create a model from that
        model = model_from_json(loaded_model_json)
        # and weight your nodes with your saved values
        model.load_weights(os.getcwd() + '/model/email_classifier/model.h5')
        labels = ['unknown_template', 'status_template', 'track_template']
        prediction = 0
        testArr1 = executeModel.convert_text_to_index_array(
            testData[0], dictionary)
        testArr2 = executeModel.convert_text_to_index_array(
            testData[1], dictionary)
        testArr3 = executeModel.convert_text_to_index_array(
            testData[2], dictionary)
        testArr4 = executeModel.convert_text_to_index_array(
            testData[3], dictionary)
        #print('testArr:', testArr1)
        #print('testArr:', testArr2)
        #print('testArr:', testArr3)
        #print('testArr:', testArr4)
        input1 = tokenizer.sequences_to_matrix([testArr1], mode='binary')
        input2 = tokenizer.sequences_to_matrix([testArr2], mode='binary')
        input3 = tokenizer.sequences_to_matrix([testArr3], mode='binary')
        input4 = tokenizer.sequences_to_matrix([testArr4], mode='binary')
        #print('input1:', input1)
        #print('input2:', input2)
        #print('input3:', input3)
        #print('input4:', input4)
        # predict which bucket your input belongs in
        pred = model.predict([input1, input2, input3, input4])
        print('pred:', pred)
        # and print it for the humans
        print("%s sentiment; %f%% confidence" %
              (labels[np.argmax(pred)], pred[0][np.argmax(pred)] * 100))
        print('matches:', np.argmax(pred))
        return np.argmax(pred)
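convert_text_to_index_array is not shown in this example; a hypothetical sketch of what such a helper usually looks like (the body below is an assumption, not the author's code): split the text with Keras' text_to_word_sequence and keep only the words present in the saved dictionary.

from tensorflow.keras.preprocessing.text import text_to_word_sequence

def convert_text_to_index_array(text, dictionary):
    # Map each known word to its saved index; silently drop out-of-vocabulary words.
    return [dictionary[word] for word in text_to_word_sequence(text)
            if word in dictionary]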
def create_tfidf(dataset_name):

    total = 0
    for path in Path(data_folder_name).glob('**/*.txt'):
        total += 1

    file = open(dataset_name + ".pv", "w")

    tokenizer_obj = Tokenizer()
    #tokenizer_obj.fit_on_texts(notewhorty_words_set)

    logging.info("FITTING WORDS")
    with tqdm(total=total) as pbar:
        for path in Path(data_folder_name).glob('**/*.txt'):
            document = [0 for _ in notewhorty_words]
            f = open(str(path), "r")
            text = f.read()
            f.close()
            text = text.lower()
            text = ''.join(c for c in text if c not in string.punctuation)
            words = text.split()
            new_text = []
            for word in words:
                if word in notewhorty_words_set:
                    new_text.append(word)
            tokenizer_obj.fit_on_texts(new_text)
            pbar.update(1)

    logging.info("COMPUTE TF-IDF WORDS")
    with tqdm(total=total) as pbar:
        for path in Path(data_folder_name).glob('**/*.txt'):
            document = [0 for _ in notewhorty_words]
            f = open(str(path), "r")
            text = f.read()
            f.close()
            file.write(str(path))
            file.write(" ")
            text = text.lower()
            text = ''.join(c for c in text if c not in string.punctuation)
            words = text.split()
            new_text = []
            for word in words:
                if word in notewhorty_words_set:
                    new_text.append(word)
            sequence = tokenizer_obj.texts_to_sequences([new_text])
            document = tokenizer_obj.sequences_to_matrix(sequence, mode='tfidf')
            for word in document[0]:
                file.write(str(word))
                file.write(" ")
            for target in get_target(str(path)):
                file.write(str(target))
                file.write(" ")
            file.write('\n')
            pbar.update(1)
Example #7
    def closure(mu):
        (x_train, y_train), (_, _) = imdb.load_data()
        tokenizer = Tokenizer(num_words=5000)
        tokenizer.fit_on_sequences(x_train)
        x_train = tokenizer.sequences_to_matrix(x_train, "tfidf")
        # Note: svd_solver=full is needed on GPU server
        x_train = PCA(n_components=100,
                      svd_solver='full').fit_transform(x_train)
        ds = {"data": x_train, "target": y_train}

        # Apply noise and return
        res = preprocess_and_noise(dataset=ds, mu=mu)
        return res
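One detail worth noting in this example: mode="tfidf" needs document statistics, and the fit_on_sequences call is what records them; for "binary" or "count" the fit step can be skipped when the data is already index-encoded. A short sketch (an addition, using made-up index sequences):

from tensorflow.keras.preprocessing.text import Tokenizer

seqs = [[1, 2, 2, 3], [2, 3, 3, 4]]        # toy, already index-encoded sequences
tok = Tokenizer(num_words=10)
tok.fit_on_sequences(seqs)                 # records per-index document frequencies
tfidf = tok.sequences_to_matrix(seqs, mode='tfidf')    # raises without the fit above
binary = tok.sequences_to_matrix(seqs, mode='binary')  # works even without fitting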
Example #8
train_labels = open('train_labels.txt', 'rb').read().decode('utf-8').split('\n')
test_texts = open('test_contents.txt', 'rb').read().decode('utf-8').split('\n')
test_labels = open('test_labels.txt', 'rb').read().decode('utf-8').split('\n')
all_texts = train_texts + test_texts
all_labels = train_labels + test_labels


print('(2) doc to var...')


tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_texts)
sequences = tokenizer.texts_to_sequences(all_texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = tokenizer.sequences_to_matrix(sequences, mode='tfidf')
labels = to_categorical(np.asarray(all_labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)


print('(3) split data set...')
p1 = int(len(data)*(1-VALIDATION_SPLIT-TEST_SPLIT))
p2 = int(len(data)*(1-TEST_SPLIT))
x_train = data[:p1]
y_train = labels[:p1]
x_val = data[p1:p2]
y_val = labels[p1:p2]
x_test = data[p2:]
y_test = labels[p2:]
print('train docs: '+str(len(x_train)))
Example #9
                                                      num_words=None,
                                                      skip_top=0,
                                                      maxlen=None)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

print(X_train[np.random.randint(0, len(X_train))])

print(set(y_train))

V = 5000
tokenizer = Tokenizer(num_words=V)
X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
print(X_train[np.random.randint(0, len(X_train))])

print(X_train.shape)
print(X_test.shape)

classes = len(set(y_train))
y_train = keras.utils.to_categorical(y_train, classes)
y_test = keras.utils.to_categorical(y_test, classes)
print(y_train.shape)
print(y_test.shape)

print("tokens = " + str(V))

i_layer = Input(shape=(V, ))
Example #10
def main():
    if not os.path.exists(os.getcwd() + '/model/'):
        os.mkdir(os.getcwd() + '/model/')
    if not os.path.exists(os.getcwd() + '/model/email_classifier'):
        os.mkdir(os.getcwd() + '/model/email_classifier')
    training = np.genfromtxt(os.getcwd() + '/data/dnn_model_training_data.txt',
                             delimiter=',',
                             skip_header=1,
                             usecols=(0, 1, 2, 3, 4),
                             dtype=None,
                             encoding='utf-8')

    feature = [[str(x[1]), str(x[2]), str(x[3]), str(x[4])] for x in training]
    # index all the sentiment labels
    label = np.asarray([x[0] for x in training])

    # only work with the 3000 most popular words found in our dataset
    max_words = 3000

    # create a new Tokenizer
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(feature)

    dictionary = tokenizer.word_index
    with open(os.getcwd() + '/model/email_classifier/dictionary.json',
              'w') as dictionary_file:
        json.dump(dictionary, dictionary_file)

    wordIndicesList1 = []
    wordIndicesList2 = []
    wordIndicesList3 = []
    wordIndicesList4 = []
    for featurerow in feature:
        #print(featurerow[0], featurerow[1], featurerow[2], featurerow[3])
        wordIndices1 = convert_text_to_index_array(featurerow[0], dictionary)
        wordIndices2 = convert_text_to_index_array(featurerow[1], dictionary)
        wordIndices3 = convert_text_to_index_array(featurerow[2], dictionary)
        wordIndices4 = convert_text_to_index_array(featurerow[3], dictionary)
        wordIndicesList1.append(wordIndices1)
        wordIndicesList2.append(wordIndices2)
        wordIndicesList3.append(wordIndices3)
        wordIndicesList4.append(wordIndices4)

    wordIndicesList1 = np.asarray(wordIndicesList1)
    wordIndicesList2 = np.asarray(wordIndicesList2)
    wordIndicesList3 = np.asarray(wordIndicesList3)
    wordIndicesList4 = np.asarray(wordIndicesList4)

    feature1 = tokenizer.sequences_to_matrix(wordIndicesList1, mode='binary')
    feature2 = tokenizer.sequences_to_matrix(wordIndicesList2, mode='binary')
    feature3 = tokenizer.sequences_to_matrix(wordIndicesList3, mode='binary')
    feature4 = tokenizer.sequences_to_matrix(wordIndicesList4, mode='binary')

    label = kr.utils.to_categorical(label, 3)

    model = Sequential()
    model.add(Dense(512, input_shape=(max_words, ), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(x=[feature1, feature2, feature3, feature4],
              y=label,
              batch_size=200,
              epochs=5,
              verbose=1,
              validation_split=0.2,
              shuffle=True)

    model_json = model.to_json()
    with open(os.getcwd() + '/model/email_classifier/model.json',
              'w') as json_file:
        json_file.write(model_json)

    model.save_weights(os.getcwd() + '/model/email_classifier/model.h5')

    print('saved model!')
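Note that the Sequential model above has a single input while model.fit is handed a list of four feature matrices, which single-input Keras models reject. A hedged sketch (not the author's code) of a functional-API variant with four explicit inputs concatenated before the dense stack, which also matches the model.predict([input1, input2, input3, input4]) call in the earlier email-classifier example:

from tensorflow.keras.layers import Input, Dense, Dropout, concatenate
from tensorflow.keras.models import Model

max_words = 3000  # same vocabulary size as above
inputs = [Input(shape=(max_words,)) for _ in range(4)]
merged = concatenate(inputs)
hidden = Dense(512, activation='relu')(merged)
hidden = Dropout(0.5)(hidden)
hidden = Dense(256, activation='sigmoid')(hidden)
hidden = Dropout(0.5)(hidden)
outputs = Dense(3, activation='softmax')(hidden)

model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.fit([feature1, feature2, feature3, feature4], label, ...) now matches the input structure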
Example #11
batch_size = 32
epochs = 5

print("Loading data...")
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words,
                                                         test_split=0.2)

print(len(x_train), "train sequences")
print(len(x_test), "test sequences")

num_classes = np.max(y_train) + 1
print(num_classes, "classes")

print("Vectorizing sequence data...")
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode="binary")
x_test = tokenizer.sequences_to_matrix(x_test, mode="binary")
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)

print("Convert class vector to binary class matrix "
      "(for use with categorical_crossentropy)")
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

print("Building model...")
model = Sequential()
model.add(Dense(512, input_shape=(max_words, )))
model.add(Activation("relu"))
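A tiny illustration (an addition, with made-up labels) of the to_categorical conversion the comment above refers to: integer class ids become one-hot rows, one column per class, which is what categorical_crossentropy expects.

import numpy as np
from tensorflow.keras.utils import to_categorical

y = np.array([0, 2, 1, 2])
print(to_categorical(y, num_classes=3))
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]
#  [0. 0. 1.]]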
Example #12
class TypeClassifier:
    def __init__(
            self,
            max_words=10000,
            xmldata='/home/leon/Documents/Golden_Agents/saa-notarialTexts/page',
            ydata='/home/leon/Documents/Golden_Agents/page-corrector/classification/data/scans.csv'
    ):

        self.xmldata = xmldata

        self._parseScanData(csvpath=ydata)

        self.tokenizer = Tokenizer(num_words=max_words,
                                   filters='#$%&/;<=>@[\\]^_`{|}~\t',
                                   lower=True,
                                   split=' ',
                                   char_level=False,
                                   oov_token=None)

        if max_words:
            self.max_words = max_words
        else:
            self.max_words = len(self.tokenizer.word_counts)

        # get texts and labels
        texts, labels = zip(
            *self.getTexts(folder=self.xmldata, train_only=True))

        # fit on the corpus (`xmldata`)
        self.tokenizer.fit_on_texts(texts)
        self.X = self.tokenizer.texts_to_sequences(texts)

        # encode class labels
        self.encoder = LabelEncoder()
        self.y = self.encoder.fit_transform(labels)

        # Split in train and text
        X_train, X_test, y_train, y_test = train_test_split(self.X,
                                                            self.y,
                                                            test_size=0.2,
                                                            random_state=42,
                                                            shuffle=True,
                                                            stratify=self.y)

        self.X_train = self.tokenizer.sequences_to_matrix(X_train,
                                                          mode='tfidf')
        self.X_test = self.tokenizer.sequences_to_matrix(X_test, mode='tfidf')

        self.n_classes = len(list(self.encoder.classes_))
        self.y_train = keras.utils.to_categorical(y_train, self.n_classes)
        self.y_test = keras.utils.to_categorical(y_test, self.n_classes)

        # construct model
        self.constructModel()

        # train
        self.train()

    def getTexts(self, folder: str, train_only=True):

        if train_only:
            for f in os.listdir(folder):
                filepath = os.path.join(folder, f)
                scanid = os.path.splitext(f)[0].upper()
                if scanid in self.scan2type:
                    yield xml2text(filepath), self.scan2type[scanid]
        else:
            return (xml2text(os.path.join(folder, f))
                    for f in os.listdir(folder))

    def constructModel(self):

        model = Sequential()
        model.add(Dense(512, input_shape=(self.max_words, ),
                        activation='relu'))
        model.add(Dropout(0.5))
        # model.add(Dense(256, activation='relu'))
        # model.add(Dropout(0.5))
        # model.add(Dense(128, activation='relu'))
        # model.add(Dropout(0.5))
        model.add(Dense(self.n_classes, activation='sigmoid'))

        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy', f1])

        print(model.metrics_names)

        self.model = model

        # self.model = keras.Sequential([
        #     keras.layers.Embedding(encoder.vocab_size, 16),
        #     keras.layers.GlobalAveragePooling1D(),
        #     keras.layers.Dense(1, activation='sigmoid')
        # ])

        # self.model.compile(optimizer='adam',
        #                    loss='binary_crossentropy',
        #                    metrics=['accuracy'])

    def train(self, batch_size=32, epochs=10):

        self.history = self.model.fit(self.X_train,
                                      self.y_train,
                                      batch_size=batch_size,
                                      shuffle=True,
                                      epochs=epochs,
                                      verbose=1,
                                      validation_data=(self.X_test,
                                                       self.y_test))

        score = self.model.evaluate(self.X_test,
                                    self.y_test,
                                    batch_size=batch_size,
                                    verbose=1)

        y_val_pred = self.model.predict(self.X_test)
        # argmax over the probability / one-hot rows so classification_report gets class ids
        y_pred_labels = np.argmax(y_val_pred, axis=1)
        y_true_labels = np.argmax(self.y_test, axis=1)

        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
        print('Test f1-score', score[2])

        print(
            classification_report(y_true_labels,
                                  y_pred_labels,
                                  target_names=self.encoder.classes_))

        # self.history = self.model.fit(train_batches,
        #                               epochs=epochs,
        #                               validation_data=validation_data,
        #                               validation_steps=validation_steps)

    def _parseScanData(self, csvpath: str):

        df = pd.read_csv(csvpath)
        scan2type = dict()
        scan2record = dict()

        for d in df.to_dict(orient='records'):
            scanid = d['scan'].split('/Scan/')[1].upper()

            scan2type[scanid] = d['type']
            scan2record[scanid] = d['record']

        self.scan2type = scan2type
        self.scan2record = scan2record
Example #13
    test = neg_test + pos_test

    train_string = process_doc(train, vocab)

    test_string = process_doc(test, vocab)

    full_text_string = neg_train + neg_test + pos_train + pos_test

    for key, value in vocab.items():
        index_to_word[value] = key

    max_words = 10000
    tokenizer = Tokenizer(num_words=len(vocab))
    tokenizer.fit_on_texts(train)
    Xtrain = tokenizer.texts_to_sequences(train)
    Xtrain = tokenizer.sequences_to_matrix(Xtrain)

    # reuse the tokenizer fitted on the training data so word indices stay consistent
    Xtest = tokenizer.texts_to_sequences(test)
    Xtest = tokenizer.sequences_to_matrix(Xtest)

    ytrain = np.asarray([0 for _ in range(len(neg_train))] +
                        [1 for _ in range(len(pos_train))]).astype(
                            'float64').reshape(-1, 1)
    ytest = np.asarray([0 for _ in range(len(neg_test))] +
                       [1 for _ in range(len(pos_test))]).astype(
                           'float64').reshape(-1, 1)

    # define network
    model = Sequential()
    model.add(Dense(50, input_shape=(Xtrain.shape[1], ), activation='relu'))
class SequenceTokenizer():
    def __init__(self,
                 annotations,
                 node_list,
                 padding='post',
                 maxlen=2000,
                 truncating='post',
                 agg_mode=None,
                 tokenizer=None,
                 verbose=False) -> None:
        """
        Handles text tokenizing for DNA/RNA/Protein sequences.

        Args:
            padding: ['post', 'pre', None]
            maxlen (int): pad all RNA sequence strings to this length
            truncating: ['post', 'pre', 'random']. If 'random', then 'post' or 'pre' truncating is chosen randomly for each sequence at each iteration
            agg_mode: one of {"count", "tfidf", "binary", "freq"}, default None. If not None, instead of returning sequence
                encoding, get_sequence_encoding will return an aggregated numpy vector.
            tokenizer: pass an existing tokenizer instead of creating one
        """
        self.maxlen = maxlen
        self.padding = padding
        self.truncating = truncating
        self.agg_mode = agg_mode
        self.annotations = annotations
        self.node_list = node_list
        self.verbose = verbose

        if tokenizer is not None:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = Tokenizer(char_level=True, lower=False)
            self.tokenizer.fit_on_texts(self.annotations.loc[self.node_list,
                                                             SEQUENCE_COL])
            print("word index:",
                  self.tokenizer.word_index) if self.verbose else None

    def sample_sequences(self, sequences):
        return sequences.apply(lambda x: random.choice(x)
                               if isinstance(x, list) else x)

    def get_sequence_encodings(self,
                               node_list: list,
                               variable_length=False,
                               minlen=None):
        """
        Returns an ndarray of shape (batch_size, sequence length, n_words) given a list of node ids
        (indexing from self.node_list)
        :param node_list: a list of node names to fetch transcript sequences
        :param variable_length: returns a list of sequences with different timestep length
        :param minlen: pad all sequences with length lower than this minlen
        """
        annotations = self.annotations
        seqs = annotations.loc[node_list, SEQUENCE_COL]

        padded_encoded_seqs = self.encode_texts(
            seqs,
            maxlen=self.maxlen,
            minlen=minlen,
            variable_length=variable_length)

        return padded_encoded_seqs

    def encode_texts(self,
                     texts,
                     maxlen=None,
                     minlen=None,
                     variable_length=False):
        """
        Returns a one-hot-vector for a string of RNA transcript sequence
        :param texts: [str | list(str)]
        :param maxlen: Set length to maximum length
        :param single: Set to True if texts is not a list (i.e. only a single node name string).
        :return:
        """
        # integer encode
        encoded = self.tokenizer.texts_to_sequences(texts)

        if variable_length:
            return encoded
        elif self.agg_mode:
            return self.tokenizer.sequences_to_matrix(encoded,
                                                      mode=self.agg_mode)

        # Pad sequences to the same length
        batch_maxlen = max([len(x) for x in encoded])
        if batch_maxlen < self.maxlen:
            maxlen = batch_maxlen

        if minlen and len(texts[0]) < minlen:
            maxlen = minlen

        # pad encoded sequences
        encoded = pad_sequences(
            encoded,
            maxlen=maxlen,
            padding=self.padding,
            truncating=np.random.choice(["post", "pre"])
            if self.truncating == "random" else self.truncating,
            dtype=SEQ_DTYPE)

        return encoded
cc= []
for i in x_train_token:
    cc.append(len(i))
plt.hist(cc,bins = 50)
plt.show()

len(x_train_token), len(xTrain)

ind = [len(i) < 300 for i in x_train_token] # keep only the samples shorter than 300 tokens
sum(ind)

x_Train = xTrain[ind]
x_train_text = np.array(x_train_token)[ind]

xTrainC01 = tokenizer.sequences_to_matrix(x_train_text, mode ="count")

xTrainC01.shape

y_Train = yTrain[ind]
# Print the training set shapes
# to check that everything was assembled correctly
print(x_Train.shape)
print(xTrainC01.shape)
print(y_Train.shape)

# Normalize the apartment size column in xTrain
xScaler = StandardScaler() # Create a standard (z-score) scaler
xScaler.fit(x_Train[:,-1].reshape(-1, 1)) # Fit it on the apartment areas (the last column of xTrain)
xTrainScaled = x_Train.copy()
xTrainScaled[:,-1] = xScaler.transform(x_Train[:,-1].reshape(-1, 1)).flatten() # Transform the data with the scaler
items = list(tokenizer.word_index.items()) # Pull out the word indices for inspection

# Convert the texts into sequences of indices according to the frequency dictionary
trainWordIndexes = tokenizer.texts_to_sequences(trainText) # training texts to indices
testWordIndexes = tokenizer.texts_to_sequences(testText)  # test texts to indices

# Set the basic parameters
xLen = 1000 # Length of the text window to analyze, in words
step = 100  # Step for slicing the source text into training vectors

# Build the training and test sets
xTrain, yTrain = createSetsMultiClasses(trainWordIndexes, xLen, step) # extract the training set
xTest, yTest = createSetsMultiClasses(testWordIndexes, xLen, step)    # extract the test set

# Convert the resulting index sequences into matrices of zeros and ones (Bag of Words)
xTrain01 = tokenizer.sequences_to_matrix(xTrain.tolist()) # Pass xTrain as a list so the method works correctly
xTest01 = tokenizer.sequences_to_matrix(xTest.tolist())   # Pass xTest as a list so the method works correctly

# Print the array shapes
print(xTrain01.shape)
print(yTrain.shape)
print(xTest01.shape)
print(yTest.shape)

tokenizer.word_index

# Set the basic parameters
xLen = 1000 # Length of the text window to analyze, in words
step = 100  # Step for slicing the source text into training vectors

# Build the training and test sets
Example #17
def создать_выборки(xLen):
  print('Происходит создание выборки для обучения')
  print('Это может занять несколько минут...')
  path = 'content/Болезни/'
  text = []
  classes = []
  n = 0
  codecs_list = ['UTF-8', 'Windows-1251']

  for filename in os.listdir(path): # Iterate over all files in the directory
      n +=1
      for codec_s in codecs_list:
        try:
            text.append(readText(path+filename, codec_s)) # Read the file into a single string and append it
            classes.append(filename.replace(".txt", ""))
            break
        except UnicodeDecodeError:
            print('Не прочитался файл: ', path+filename, codec_s)

  stop_words = nltk.corpus.stopwords.words('russian')
  lexeme_list = ['POS', 'animacy', 'aspect', 'case', 'gender', 'involvement', 'mood', 'number', 'person', 'tense', 'transitivity', 'voice']

  words = [] # All word lists for each disease description go here
  tags = []   # All lists of grammeme lists for each word go here
  tags_all = [] # All grammeme lists of all words, used to train the tokenizer
  for i in range(len(text)):
    word, tag = text2Words(text[i])
    words.append(word)
    tags.append(tag)
  for k in tags:
    for t in k:
      tags_all.append(t)

  #################
  # Convert the text data into numeric/vector form for training the neural network
  #################

  # Maximum number of words in the dictionary
  maxWordsCount = 1100 
  # Keras tokenizer
  tokenizer = Tokenizer(num_words=maxWordsCount, filters='!"#$%&()*+,-––—./:;<=>?@[\\]^_`{|}~\t\n\xa0', lower=True, split=' ', oov_token='unknown', char_level=False)
  # Feed it the words
  tokenizer.fit_on_texts(words) 
  items = list(tokenizer.word_index.items())


  tokenizer_json1 = tokenizer.to_json()
  with io.open('tokenizer1.json', 'w', encoding='utf-8') as f:
      f.write(json.dumps(tokenizer_json1, ensure_ascii=False))
  with open('tokenizer1.json') as f:
      data = json.load(f)
      tokenizer = tokenizer_from_json(data)

  items = list(tokenizer.word_index.items()) 
  # Print the first 10 words of the dictionary

  # Maximum number of words in the dictionary
  maxWordsCount2 = 50
  # Keras tokenizer
  tokenizer2 = Tokenizer(num_words=maxWordsCount2, filters='!"#$%&()*+,-––—./:;<=>?@[\\]^_`{|}~\t\n\xa0', lower=True, split=' ', oov_token='unknown', char_level=False)
  # Feed it the words
  tokenizer2.fit_on_texts(tags_all) 
  items2 = list(tokenizer2.word_index.items()) 

  tokenizer_json2 = tokenizer2.to_json()
  with io.open('tokenizer2.json', 'w', encoding='utf-8') as f:
      f.write(json.dumps(tokenizer_json2, ensure_ascii=False))

  with open('tokenizer2.json') as f:
      data = json.load(f)
      tokenizer2 = tokenizer_from_json(data)

  items2 = list(tokenizer2.word_index.items())

  # Convert the texts into sequences of indices according to the frequency dictionary
  xTrainIndexes = tokenizer.texts_to_sequences(words) # training texts to indices
  # Convert the tags into sequences of indices according to the frequency dictionary
  xTrainTagsIndexes = []
  for tag in tags:  # the tags contain extra nested lists, so iterate in the format the tokenizer expects
    xTrainTagsIndexes.append(tokenizer2.texts_to_sequences(tag))

  nVal = 200   # Number of words for the validation set

  trainWords = []  # Words for the training set go here
  valWords = []    # Words for the validation set go here

  for i in range(len(xTrainIndexes)):
    trainWords.append(xTrainIndexes[i][:-nVal])
    valWords.append(xTrainIndexes[i][-nVal:])

  trainTagsWords = []  # Tags for the training set go here
  valTagsWords = []    # Tags for the validation set go here

  for i in range(len(xTrainTagsIndexes)):
    trainTagsWords.append(xTrainTagsIndexes[i][:-nVal])
    valTagsWords.append(xTrainTagsIndexes[i][-nVal:])

  step = 1    # step

  # Create "sliced" samples of length xLen, 4000 instances per class
  (xTrain, yTrain) = createSetsMultiClassesBallanced(trainWords, xLen, step, 4000)
  #(xTrain, yTrain) = createSetsMultiClasses(trainWords, xLen, step)

  (xTrainTags, _) = createSetsMultiClassesBallanced(trainTagsWords, xLen, step, 4000)
  # Convert the training index sequences into Bag of Words matrices (tf-idf weighted)
  xTrain01 = tokenizer.sequences_to_matrix(xTrain.tolist(), mode="tfidf")     # Pass xTrain as a list so the method works correctly
  #xTrainTags01 = tokenizer2.sequences_to_matrix(xTrainTags)
  # Create "sliced" samples of length xLen for the validation set
  (xVal, yVal) = createSetsMultiClasses(valWords, xLen, step)
  #(xVal, yVal) = createSetsMultiClassesBallanced(valWords, xLen, step, 600)

  (xTagsVal, _) = createSetsMultiClasses(valTagsWords, xLen, step)
  #(xTagsVal, _) = createSetsMultiClassesBallanced(valTagsWords, xLen, step, 600)

  # Convert the validation index sequences into Bag of Words matrices (tf-idf weighted)
  xVal01 = tokenizer.sequences_to_matrix(xVal.tolist(), mode="tfidf")         # Pass xVal as a list so the method works correctly
  #xTagsVal01 = tokenizer2.sequences_to_matrix(xTagsVal.tolist())

  xTrainTags = np.reshape(xTrainTags, (xTrainTags.shape[0], -1))
  xTagsVal = np.reshape(xTagsVal, (xTagsVal.shape[0], -1))
  xTrainTags01 = tokenizer2.sequences_to_matrix(xTrainTags.tolist())
  xTagsVal01 = tokenizer2.sequences_to_matrix(xTagsVal.tolist())

  x_train = [xTrain, xTrain01, xTrainTags01]
  y_train = yTrain
  x_val = [xVal, xVal01, xTagsVal01]
  y_val = yVal
  valw = (valWords, valTagsWords, tokenizer, tokenizer2)
  display.clear_output(wait=True)
  print('Формирование выборки завершено')
  return (x_train, y_train), (x_val, y_val)
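A minimal round-trip sketch (an addition, on a made-up toy corpus) of the tokenizer save/load pattern used above: Tokenizer.to_json written to disk, then restored with tokenizer_from_json.

import io
import json
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json

tok = Tokenizer(num_words=100, oov_token='unknown')
tok.fit_on_texts(["a small toy corpus", "another toy sentence"])

with io.open('tokenizer_demo.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tok.to_json(), ensure_ascii=False))

with open('tokenizer_demo.json') as f:
    restored = tokenizer_from_json(json.load(f))

print(restored.word_index == tok.word_index)  # True: the vocabulary survives the round trip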
Example #18
# random seed
np.random.seed(0)
number_of_features = 10000

np_load_old = np.load

# modify the default parameters of np.load
np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)

# call load_data with allow_pickle implicitly set to true
(data_train, target_train), (data_test, target_test) = imdb.load_data(num_words=number_of_features)


# Convert the movie review data into a one-hot encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features)
features_train = tokenizer.sequences_to_matrix(data_train, mode="binary")
features_test = tokenizer.sequences_to_matrix(data_test, mode="binary")


# Build the neural network model.
network = models.Sequential()

# Add a dropout layer as the input layer.
network.add(layers.Dropout(0.3, input_shape=(number_of_features,)))

# Add a fully connected layer with ReLU activation.
network.add(layers.Dense(units=128, activation="relu"))

# Add a dropout layer for previous hidden layer
network.add(layers.Dropout(0.6))
Example #19
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
np.random.seed(100)
from tensorflow.keras.datasets import reuters
from tensorflow.keras.preprocessing.text import Tokenizer

# Get data
np_load_old = np.load
np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)
(x, y), (xtest, ytest) = reuters.load_data(num_words=10000)
np.load = np_load_old

# Process text
tokenizer = Tokenizer(num_words=10000)
xtrain = tokenizer.sequences_to_matrix(x, mode='binary')
xtest = tokenizer.sequences_to_matrix(xtest, mode='binary')

ytrain = to_categorical(y)
ytest = to_categorical(ytest)

# Initialize model
model = Sequential()
model.add(Dense(512, activation='relu'))
model.add(Dropout(.5))
model.add(Dense(ytrain.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Fit model
                      char_level=False)

tokenizer.fit_on_texts(
    XTrainInfo
)  # "feed" our texts to the method that builds the frequency dictionary
items = list(
    tokenizer.word_index.items())  # pull out the word indices for inspection
print(len(items))

# Convert the training and test sets to indices
XTrainInfoIndexes = tokenizer.texts_to_sequences(XTrainInfo)
XTestInfoIndexes = tokenizer.texts_to_sequences(XTestInfo)

# Convert the training set from index sequences into matrices of zeros and ones (Bag of Words)
x_train01 = tokenizer.sequences_to_matrix(
    XTrainInfoIndexes
)  # pass XTrainInfoIndexes as a list so the method works correctly
print(
    x_train01.shape)  # shape of the training set built with Bag of Words
print(x_train01[500][0:20])
print(len(x_train01[500]))

# Convert the test set from index sequences into matrices of zeros and ones (Bag of Words)
x_test01 = tokenizer.sequences_to_matrix(
    XTestInfoIndexes
)  # pass XTestInfoIndexes as a list so the method works correctly
print(
    x_test01.shape)  # shape of the test set built with Bag of Words
print(x_test01[500][0:20])
print(len(x_test01[500]))