def process(self, X):
        # split into chunks, since we can't feed texts that are too long into the network.
        split_threshold = self.model.split_threshold
        text_split = split_long_texts(X, split_threshold)  # the index is now the work id, duplicated across chunks

        # word-level preprocessing.
        # pull the parameters and the fitted helpers out of the model.
        nm = self.model.normalizer
        words_tokenizer = self.model.word_tokenizer
        MAX_TEXT_WORDS = self.model.params['MAX_TEXT_WORDS']

        # preprocess the input.
        filtered_data = filter_chars(text_split)
        text_word = nm.normalize(filtered_data)
        text_word = preprocessing(text_word, encode=words_tokenizer.texts_to_sequences, inputlen=MAX_TEXT_WORDS)

        # character-level preprocessing (elided in the source; this part builds text_char)
        ...

        return (text_word, text_char)
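
Neither `split_long_texts` nor `preprocessing` is shown in the listing. Below is a minimal sketch of what they could look like, assuming `X` is a pandas Series of texts indexed by work id and that `preprocessing` wraps the usual encode-then-pad step; both implementations are assumptions, not the source code:

import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences

def split_long_texts(X, split_threshold):
    """Cut every text into chunks of at most split_threshold characters,
    repeating the original index (the work id) for each chunk.
    Sketch only; the real helper is not shown in the source."""
    chunks, index = [], []
    for work_id, text in X.items():
        for start in range(0, len(text), split_threshold):
            chunks.append(text[start:start + split_threshold])
            index.append(work_id)
    return pd.Series(chunks, index=index)

def preprocessing(texts, encode, inputlen):
    """Encode texts into integer id sequences and pad/truncate to inputlen.
    Assumed to wrap texts_to_sequences + pad_sequences, as used above."""
    return pad_sequences(encode(texts), maxlen=inputlen)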
Example #2
                    "--train",
                    action="store_true",
                    help="Run the training of the model")
parser.add_argument(
    "-p",
    "--preprocess",
    action="store_true",
    help=
    "Update the train and test csv files with the new images in dataset, used this if you added new images in dataset"
)

args = parser.parse_args()

if args.preprocess:
    print("Preprocessing..")
    preprocessing()
    print("Preprocessing finished!")

cuda_available = torch.cuda.is_available()

# Create the results directory if it does not exist yet
# (RESULTS_PATH is defined elsewhere in the original script).
os.makedirs(RESULTS_PATH, exist_ok=True)

# Dataset normalization statistics; `m` and `s` are the per-channel
# mean and standard deviation computed elsewhere in the original script.
mean = m
std_dev = s

transform_train = transforms.Compose([
    transforms.RandomApply([transforms.ColorJitter(0.1, 0.1, 0.1, 0.1)],
                           p=0.5),
    # The original listing is cut off here; the remaining transforms
    # (presumably ToTensor and Normalize(mean, std_dev)) are not shown.
])
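
The `m` and `s` statistics above come from the surrounding script. Here is a minimal sketch of how such per-channel values could be computed, assuming the images load as equally sized CHW tensors in (image, label) pairs; the `channel_stats` helper and its DataLoader settings are illustrative, not from the source:

import torch
from torch.utils.data import DataLoader

def channel_stats(dataset):
    """Compute per-channel mean and std over a dataset of CHW image tensors.
    Assumes every sample is an (image, label) pair of the same spatial size."""
    loader = DataLoader(dataset, batch_size=64, num_workers=2)
    n, mean, sq_mean = 0, 0.0, 0.0
    for images, _ in loader:
        b = images.size(0)
        images = images.view(b, images.size(1), -1)   # (B, C, H*W)
        mean += images.mean(dim=2).sum(dim=0)         # sum of per-image channel means
        sq_mean += (images ** 2).mean(dim=2).sum(dim=0)
        n += b
    mean /= n
    std = (sq_mean / n - mean ** 2).sqrt()
    return mean, std  # candidates for `m` and `s` above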
Example #3
import tensorflow as tf
import config as cfg
import dataset
from model import build_model

from tensorflow.keras.callbacks import TensorBoard

import os

os.environ['CUDA_VISIBLE_DEVICES'] = '4,5,6,7'  # comma-separated, no spaces
physical_devices = tf.config.experimental.list_physical_devices('GPU')  # the four GPUs made visible above

dense_features, sparse_features, total_data = dataset.preprocessing()

# write_grads, embeddings_layer_names and embeddings_metadata are TF1-era
# arguments; tf.keras in TF2 ignores them with a warning.
tbCallBack = TensorBoard(log_dir='./logs',
                         histogram_freq=0,
                         write_graph=True,
                         write_grads=True,
                         write_images=True,
                         embeddings_freq=0,
                         update_freq='batch',
                         embeddings_layer_names=None,
                         embeddings_metadata=None)

# Shuffle, then split: the first 500,000 rows for training, the rest for validation.
total_data = total_data.sample(frac=1.0, random_state=1)
train_data = total_data.iloc[:500000]
val_data = total_data.iloc[500000:]

train_dense = [train_data[f].values for f in dense_features]
train_sparse = [train_data[f].values for f in sparse_features]
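
The listing stops before the model is trained. A hedged sketch of how the run could continue, assuming `build_model` takes the two feature lists and the DataFrame has a binary `label` column; the signature, column name, optimizer, loss, epochs, and batch size are all assumptions:

# Assumed continuation, not code from the source.
val_dense = [val_data[f].values for f in dense_features]
val_sparse = [val_data[f].values for f in sparse_features]

model = build_model(dense_features, sparse_features)  # signature assumed
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(train_dense + train_sparse,         # one input array per feature
          train_data['label'].values,         # 'label' column is an assumption
          validation_data=(val_dense + val_sparse, val_data['label'].values),
          epochs=5,
          batch_size=1024,
          callbacks=[tbCallBack])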