Example #1
    def run(self):  # create W2V model
        sentences = []
        count = 0
        print("Start to create W2V model...")
        print("Get input data...")
        ds = datetime.datetime.now()
        with open(get_abs_path(self.Config, "data_corpus_path"),
                  'r',
                  encoding='UTF-8') as f:
            for line in f:
                if len(line.strip()) == 0:
                    continue
                count += 1
                words = line.strip().split()
                sentences.append(words)
        de = datetime.datetime.now()
        print("Got %d lines from file %s in %s" %
              (count, get_abs_path(self.Config, "data_corpus_path"),
               get_formatted_date(ds, de)))
        numpy.random.shuffle(sentences)

        logger = EpochLogger(self.epochs)
        w2v = Word2Vec(size=self.ndim, window=10, min_count=3, workers=10)
        ds = datetime.datetime.now()
        print("Build vocabulary...")
        w2v.build_vocab(sentences)
        de = datetime.datetime.now()
        print("Vocabulary is built in %s" % (get_formatted_date(ds, de)))
        print("Train model...")
        ds = datetime.datetime.now()
        w2v.train(sentences,
                  epochs=int(self.Config["epochs_total"]),
                  total_examples=len(sentences),
                  callbacks=[logger])
        de = datetime.datetime.now()
        print("W2V model is completed in %s" % (get_formatted_date(ds, de)))

        created_model_path = get_abs_path(self.Config, "model_path")
        if self.Config["include_current_time_in_model_name"]:
            modelName = os.path.basename(created_model_path)
            dt = "-" + datetime.datetime.now().strftime("%Y-%b-%d-%H%M%S")
            pInd = modelName.rfind(".")
            if pInd > 0:
                modelName = modelName[:pInd] + dt + modelName[pInd:]
            else:
                modelName += dt
        finalPath = os.path.dirname(created_model_path) + "/" + modelName
        ds = datetime.datetime.now()
        w2v.wv.save_word2vec_format(finalPath, binary=False)
        de = datetime.datetime.now()
        print("W2V model %s is saved in the text format in %s\n" %
              (finalPath, get_formatted_date(ds, de)))
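Example #1 (and most of the examples below) relies on the project helpers get_abs_path and get_formatted_date and on an EpochLogger callback that are not shown on this page. A minimal sketch of what they might look like, assuming get_abs_path resolves a config key against the project home folder, get_formatted_date formats an elapsed interval, and EpochLogger is a gensim CallbackAny2Vec subclass; all three bodies are assumptions, not the project's actual code:

import datetime
import os

from gensim.models.callbacks import CallbackAny2Vec


def get_abs_path(config, key, opt=None):
    # Hypothetical helper: resolve a config entry against the project home folder.
    # The handling of the optional "opt" argument is a guess.
    path = os.path.join(config["home"], config[key])
    return os.path.join(path, config[opt]) if opt else path


def get_formatted_date(ds, de):
    # Hypothetical helper: format the elapsed time between two datetime objects.
    return str(de - ds)


class EpochLogger(CallbackAny2Vec):
    # Hypothetical gensim callback: report Word2Vec training progress per epoch.
    def __init__(self, total_epochs):
        self.total_epochs = total_epochs
        self.epoch = 0

    def on_epoch_end(self, model):
        self.epoch += 1
        print("Epoch %d of %d is done" % (self.epoch, self.total_epochs))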
Example #2
    def getWordVectorsSum(self):
        self.nfWords = 0
        self.sdict = dict()
        self.tmpCount = 0
        if self.model.Config["type_of_execution"] != "test":
            ds = datetime.datetime.now()
            self.model.trainArrays = numpy.concatenate([
                self.getDocsArray(x.words, 'Train')
                for x in self.model.Config[self.keyTrain]
            ])
            self.model.trainLabels = numpy.concatenate([
                numpy.array(x.labels).reshape(
                    1, len(self.model.Config["predefined_categories"]))
                for x in self.model.Config[self.keyTrain]
            ])

            if self.addValSet:
                ind = int(
                    len(self.model.trainArrays) *
                    (1 - self.validation_data_size))
                self.model.valArrays = self.model.trainArrays[ind:]
                self.model.valLabels = self.model.trainLabels[ind:]
                self.model.trainArrays = self.model.trainArrays[:ind]
                self.model.trainLabels = self.model.trainLabels[:ind]

                de = datetime.datetime.now()
                print("Prepare train and validation data in %s" %
                      (get_formatted_date(ds, de)))
            else:
                de = datetime.datetime.now()
                print("Prepare train data in %s" %
                      (get_formatted_date(ds, de)))

        self.tmpCount = 0
        ds = datetime.datetime.now()
        self.model.testArrays = numpy.concatenate([
            self.getDocsArray(x.words, "Test")
            for x in self.model.Config[self.keyTest]
        ])
        self.model.testLabels = numpy.concatenate([
            numpy.array(x.labels).reshape(
                1, len(self.model.Config["predefined_categories"]))
            for x in self.model.Config[self.keyTest]
        ])
        if self.model.isCV:
            return
        de = datetime.datetime.now()
        print("Prepare test data in %s" % (get_formatted_date(ds, de)))
        print("Unique words in all documents: %d" % (len(self.sdict)))
        print("Words not found in the w2v vocabulary: %d" % (self.nfWords))
Example #3
 def run(self):
     lib_path = get_abs_path(self.Config,
                             "set_of_docs_lang_tokenization_lib_path")
     print("GRISHA use set_of_docs_lang_tokenization")
     if not lib_path or not os.path.exists(lib_path):
         raise ValueError(
             "Wrong path to the tagger's jar. Tokenization can't be done")
     if not self.Config["source_path"] or self.Config[
             "source_path"] == self.Config["target_path"]:
         raise ValueError(
             "Wrong source/target path(s). Tokenization can't be done.")
     in_path = self.Config["home"] + "/" + self.Config["source_path"]
     out_path = self.Config["home"] + "/" + self.Config["target_path"]
     stop_words = ",".join(stopwords.words(
         'arabic')) if self.Config["stop_words"] == "True" else ""
     ds = datetime.datetime.now()
     srv = subprocess.Popen(
         'java -Xmx2g -jar ' + lib_path + ' "' + in_path + '" "' +
         out_path + '" "' + self.Config["exclude_positions"] + '" "' +
         stop_words + '" "' + self.Config["extra_words"] + '" "' +
         self.Config["normalization"] + '" "' +
         self.Config["language_tokenization"] + '"',
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         shell=True)
     reply = srv.communicate()  # communicate() waits for the process; wait() with PIPE can deadlock
     de = datetime.datetime.now()
     print(reply[0].decode())
     print("All process is done in %s" % (get_formatted_date(ds, de)))
Example #4
 def load_data(self):
     if self.Config["enable_tokenization"] == "True":
         print("Start loading and preprocessing of data...")
     else:
         print ("Start loading data...")
     ds = datetime.datetime.now()
     self.Config["predefined_categories"] = self.get_categories(get_abs_path(self.Config, "train_data_path"))
     train_docs = self.get_data_docs(get_abs_path(self.Config, "train_data_path"))
     if not self.splitTrain:
         test_docs = self.get_data_docs(get_abs_path(self.Config, "test_data_path"))
     else:
         ind = int(len(train_docs) * (1 - self.sz))
         random.shuffle(train_docs)
         test_docs = train_docs[ind:]
         train_docs = train_docs[:ind]
     de = datetime.datetime.now()
     self.Config["train_docs"] = random.sample(train_docs, len(train_docs))
     self.Config["test_docs"] = random.sample(test_docs, len(test_docs))
     self.get_max_seq_len()
     self.get_max_chars_length()
     if self.Config["enable_tokenization"] == "True" \
             and self.Config["language_tokenization"] == "True" \
             and self.Config["use_java"] == "True":
         self.jar.stdin.write('!!! STOP !!!\n')
         self.jar.stdin.flush()
     print ("Input data loaded in %s"%(get_formatted_date(ds, de)))
     print ("Training set contains %d documents."%(len(self.Config["train_docs"])))
     print ("Testing set contains %d documents."%(len(self.Config["test_docs"])))
     print ("Documents belong to %d categories."%(len(self.Config["predefined_categories"])))
Example #5
def tokenize_file(Config, parser, inPath, outPath):
    outFile = open(outPath, 'w', encoding='UTF-8')
    ds = datetime.datetime.now()
    q = 0
    qt = 0
    with open(inPath, 'r', encoding='UTF-8') as f:
        for line in f:
            q += 1
            if q > 1:
                result = '\n'
            else:
                result = ''
            line = line.replace('\r', '').replace('\n', '')
            if not line:
                continue
            toks = line.split()
            if len(toks) < 3:
                continue
            qt += len(toks)
            tArr = parser.tag(line.split())
            result += joinTokens(tArr, Config).strip()
            outFile.write(result)
    de = datetime.datetime.now()
    print("File %s (%d lines, %d tokens): in %s" %
          (outPath, q, qt, get_formatted_date(ds, de)))
    outFile.close()
Example #6
def tokens_from_tagger(Config):
    print("GRISHA tokens_from_tagger()")
    test_path(Config, "set_of_docs_lang_tokenization_lib_path",
              "Wrong path to the tagger's jar. Tokenization can't be done")
    tagger_path = get_abs_path(Config,
                               "set_of_docs_lang_tokenization_lib_path")
    source_path = Config["home"] + "/" + Config["source_path"]
    target_path = Config["home"] + "/" + Config["target_path"]
    stop_words = ",".join(list(
        stopwords.words('arabic'))) if Config["stop_words"] == "True" else ""
    ds = datetime.datetime.now()
    srv = subprocess.Popen(
        'java -Xmx2g -jar ' + tagger_path + ' "' + source_path + '" "' +
        target_path + '" "' + Config["exclude_positions"] + '" "' +
        stop_words + '" "' + Config["extra_words"] + '" "' +
        Config["normalization"] + '" "' + Config["language_tokenization"] +
        '"',
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True)
    reply = srv.communicate()  # communicate() waits for the process; wait() with PIPE can deadlock
    de = datetime.datetime.now()
    print(reply[0].decode())
    print("All process is done in %s" % (get_formatted_date(ds, de)))
Example #7
 def trainSKLModel(self):
     ds = datetime.datetime.now()
     print("Start training...")
     self.model.fit(self.trainArrays, self.trainLabels)
     de = datetime.datetime.now()
     print("Model is trained in %s" % (get_formatted_date(ds, de)))
     if self.isCV:
         return
     joblib.dump(
         self.model,
         get_abs_path(self.Config, "created_model_path", opt="name"))
     print("Model is saved in %s" %
           get_abs_path(self.Config, "created_model_path", opt="name"))
     print("Model evaluation...")
     prediction = self.model.predict(self.testArrays)
     print('Final accuracy is %.2f' %
           accuracy_score(self.testLabels, prediction))
     de = datetime.datetime.now()
     print("Evaluated in %s" % get_formatted_date(ds, de))
Example #8
def tokenize(Config):
    parser = CoreNLPParser(url='http://localhost:' + Config["servport"],
                           tagtype='pos')
    inPath = Config["home"] + "/" + Config["source_path"]
    outPath = Config["home"] + "/" + Config["target_path"]

    fds = datetime.datetime.now()
    tokenize_data(Config, parser, inPath, outPath)
    fde = datetime.datetime.now()
    print("Tokenization complited in %s" % (get_formatted_date(fds, fde)))
Example #9
 def testNNModel(self):
     print("Start testing...")
     print("Rank threshold: %.2f" % self.rank_threshold)
     ds = datetime.datetime.now()
     self.predictions = self.model.predict(self.testArrays)
     de = datetime.datetime.now()
     print("Test dataset containing %d documents predicted in %s\n" %
           (len(self.testArrays), get_formatted_date(ds, de)))
     if self.isCV:
         return
     self.prepare_resources_for_runtime("keras")
     self.get_metrics()
     self.save_results()
Example #10
 def trainNNModel(self):
     checkpoints = []
     if self.save_intermediate_results and not self.isCV:
         checkpoint = ModelCheckpoint(
             get_abs_path(self.Config, "intermediate_results_path") +
             "/tempModel.hdf5",
             monitor='val_acc',
             verbose=self.verbose,
             save_best_only=True,
             mode='auto')
         checkpoints.append(checkpoint)
     print("Start training...              ")
     ds = datetime.datetime.now()
     self.model.fit(self.trainArrays,
                    self.trainLabels,
                    epochs=self.epochs,
                    validation_data=(self.valArrays, self.valLabels),
                    batch_size=self.train_batch,
                    verbose=self.verbose,
                    callbacks=checkpoints,
                    shuffle=False)
     de = datetime.datetime.now()
     print("Model is trained in %s" % (get_formatted_date(ds, de)))
     if self.isCV:
         return
     print("Model evaluation...")
     scores1 = self.model.evaluate(self.testArrays,
                                   self.testLabels,
                                   verbose=self.verbose)
     print("Final model accuracy: %.2f%%" % (scores1[1] * 100))
     if self.save_intermediate_results:
         model1 = load_model(
             get_abs_path(self.Config, "intermediate_results_path") +
             "/tempModel.hdf5")
         scores2 = model1.evaluate(self.testArrays,
                                   self.testLabels,
                                   verbose=self.verbose)
         print("Last saved model accuracy: %.2f%%" % (scores2[1] * 100))
         if scores1[1] < scores2[1]:
             self.model = model1  # keep the better checkpointed model for the final save below
         pref = "The best model "
     else:
         pref = "Model "
     self.model.save(
         get_abs_path(self.Config, "created_model_path", opt="name"))
     print(pref + "is saved in %s" %
           get_abs_path(self.Config, "created_model_path", opt="name"))
Example #11
 def load_w2v_model(self):
     if self.Config["w2vmodel"] != None:
         print("W2V model is already loaded...")
         self.w2vModel = self.Config["w2vmodel"]
         return
     print("Load W2V model... ")
     ds = datetime.datetime.now()
     self.w2vModel = gensim.models.KeyedVectors.load_word2vec_format(
         get_abs_path(self.Config, "model_path"))
     de = datetime.datetime.now()
     print("Load W2V model (%s) in %s" % (get_abs_path(
         self.Config, "model_path"), get_formatted_date(ds, de)))
     self.Config["resources"]["w2v"]["created_model_path"] = get_abs_path(
         self.Config, "model_path")
     self.Config["resources"]["w2v"]["ndim"] = self.ndim
Example #12
 def launch_crossvalidation(self):
     print("Start cross-validation...")
     ds = datetime.datetime.now()
     dp = DataPreparation(self, self.addValSet)
     pSize = len(self.cvDocs) // self.cross_validations_total
     ind = 0
     f1 = 0
     attr_metrics = []
     for i in range(self.cross_validations_total):
         print("Cross-validation, cycle %d from %d..." %
               ((i + 1), self.cross_validations_total))
         if i == 0:
             self.Config["cross_validations_train_docs"] = self.cvDocs[pSize:]
             self.Config["cross_validations_test_docs"] = self.cvDocs[:pSize]
         elif i == self.cross_validations_total - 1:
             self.Config["cross_validations_train_docs"] = self.cvDocs[:ind]
             self.Config["cross_validations_test_docs"] = self.cvDocs[ind:]
         else:
             self.Config["cross_validations_train_docs"] = (
                 self.cvDocs[:ind] + self.cvDocs[ind + pSize:])
             self.Config["cross_validations_test_docs"] = self.cvDocs[ind:ind + pSize]
         ind += pSize
         dp.getVectors(self.handleType)
         self.model = self.create_model()
         self.train_model()
         self.test_model()
         ModelMetrics(self)
         attr_metrics.append(self.metrics)
         cycleF1 = self.metrics["all"]["f1"]
         print("Resulting F1-Measure: %f\n" % cycleF1)
         if cycleF1 > f1:
             if self.Config["save_cross_validations_datasets"]:
                 self.save_data_sets()
             f1 = cycleF1
     de = datetime.datetime.now()
     print("Cross-validation is done in %s" % get_formatted_date(ds, de))
     print_averaged_metrics(attr_metrics, self.Config)
     print("The best result is %f" % (f1))
     print("Corresponding data sets are saved in the folder %s" %
           get_abs_path(self.Config, "cross_validations_datasets_path"))
Example #13
 def testSKLModel(self):
     print("Start testing...")
     if self.useProbabilities:
         print("Rank threshold: %.2f" % self.rank_threshold)
     else:
         print("Model doesn't calculate probabilities.")
     ds = datetime.datetime.now()
     if not self.useProbabilities:
         self.predictions = self.model.predict(self.testArrays)
     else:
         self.predictions = self.model.predict_proba(self.testArrays)
     de = datetime.datetime.now()
     print("Test dataset containing %d documents predicted in %s" %
           (self.testArrays.shape[0], get_formatted_date(ds, de)))
     if self.isCV:
         return
     self.prepare_resources_for_runtime("skl")
     self.get_metrics()
     self.save_results()
Example #14
 def getDataForSklearnClassifiers(self):
     mlb = None
     ds = datetime.datetime.now()
     if self.model.Config["type_of_execution"] != "test":
         nmCats = [""] * len(self.model.Config["predefined_categories"])
         for k in list(self.model.Config["predefined_categories"].keys()):
             nmCats[self.model.Config["predefined_categories"][k]] = k
         mlb = MultiLabelBinarizer(classes=nmCats)
         wev = (TfidfVectorizer(ngram_range=(1, 3), max_df=0.50).fit(
             [x.lines for x in self.model.Config[self.keyTrain]],
             [x.nlabs for x in self.model.Config[self.keyTrain]]))
         self.model.trainArrays = wev.transform(
             [x.lines for x in self.model.Config[self.keyTrain]])
         self.model.trainLabels = mlb.fit_transform(
             [x.nlabs for x in self.model.Config[self.keyTrain]])
         if not self.model.isCV:
             with open(get_abs_path(self.model.Config, "binarizer_path"),
                       'wb') as handle:
                 pickle.dump(mlb, handle, protocol=pickle.HIGHEST_PROTOCOL)
             with open(get_abs_path(self.model.Config, "vectorizer_path"),
                       'wb') as handle:
                 pickle.dump(wev, handle, protocol=pickle.HIGHEST_PROTOCOL)
     if not mlb:
         with open(get_abs_path(self.model.Config, "binarizer_path"),
                   'rb') as handle:
             mlb = pickle.load(handle)
         with open(get_abs_path(self.model.Config, "vectorizer_path"),
                   'rb') as handle:
             wev = pickle.load(handle)
     self.model.testArrays = wev.transform(
         [x.lines for x in self.model.Config[self.keyTest]])
     # the binarizer is already fitted, so only transform the test labels
     self.model.testLabels = mlb.transform(
         [x.nlabs for x in self.model.Config[self.keyTest]])
     de = datetime.datetime.now()
     print("Prepare all data in %s" % (get_formatted_date(ds, de)))
Example #15
 def getCharVectors(self):
     ds = datetime.datetime.now()
     """
     if self.model.Config["max_chars_doc_len"] > self.model.Config["max_chars_seq_len"]:
         print(
             "Most of documents from training set have less then %d characters. Longer documents will be truncated." % (
                 self.model.Config["max_chars_seq_len"]))
     """
     if self.model.Config["type_of_execution"] != "test":
         self.model.trainArrays = numpy.concatenate([
             self.stringToIndexes(" ".join(x.words))
             for x in self.model.Config[self.keyTrain]
         ])
         self.model.trainLabels = numpy.concatenate([
             numpy.array(x.labels).reshape(
                 1, len(self.model.Config["predefined_categories"]))
             for x in self.model.Config[self.keyTrain]
         ])
         if self.addValSet:
             ind = int(
                 len(self.model.trainArrays) *
                 (1 - self.validation_data_size))
             self.model.valArrays = self.model.trainArrays[ind:]
             self.model.valLabels = self.model.trainLabels[ind:]
             self.model.trainArrays = self.model.trainArrays[:ind]
             self.model.trainLabels = self.model.trainLabels[:ind]
     self.model.testArrays = numpy.concatenate([
         self.stringToIndexes(" ".join(x.words))
         for x in self.model.Config[self.keyTest]
     ])
     self.model.testLabels = numpy.concatenate([
         numpy.array(x.labels).reshape(
             1, len(self.model.Config["predefined_categories"]))
         for x in self.model.Config[self.keyTest]
     ])
     if self.model.isCV:
         return
     de = datetime.datetime.now()
     print("Prepare all data in %s" % (get_formatted_date(ds, de)))
Example #16
 def prepare_resources_for_runtime(self):
     tokenization_options = [
         "language_tokenization", "normalization", "stop_words",
         "exclude_positions", "extra_words", "max_seq_len",
         "max_chars_seq_len", "single_doc_lang_tokenization_lib_path"
     ]
     self.Config["resources"]["tokenization"] = {}
     ds = datetime.datetime.now()
     self.outDir = get_abs_path(self.Config, "saved_resources_path") + "/"
     for t in tokenization_options:
         if t != "single_doc_lang_tokenization_lib_path":
             self.Config["resources"]["tokenization"][t] = self.Config[t]
         elif self.Config["language_tokenization"] == "True":
             self.Config["resources"]["tokenization"]["single_doc_lang_tokenization_lib_path"] = \
                 self.copyFile(get_abs_path(self.Config, "single_doc_lang_tokenization_lib_path"))
     isW2VNeeded = False
     for key, val in self.Config["resources"]["models"].items():
         val["created_model_path"] = self.copyFile(
             val["created_model_path"])
         if "w2v" in val and val["w2v"] == "True":
             isW2VNeeded = True
     if not isW2VNeeded and "w2v" in self.Config["resources"]:
         self.Config["resources"].pop("w2v", None)
     if "w2v" in self.Config["resources"]:
         w2vDict = {}
         isFirstLine = True
         fEmbeddings = open(
             self.Config["resources"]["w2v"]["created_model_path"],
             encoding="utf-8")
         for line in fEmbeddings:
             if isFirstLine:
                 isFirstLine = False
                 continue
             split = line.strip().split(" ")
             word = split[0]
             vector = numpy.array([float(num) for num in split[1:]])
             w2vDict[word] = vector
         fEmbeddings.close()
         with open(
                 self.Config["resources"]["w2v"]["created_model_path"] +
                 '.pkl', 'wb') as file:
             pickle.dump(w2vDict, file, pickle.HIGHEST_PROTOCOL)
         self.Config["resources"]["w2v"]["created_model_path"] = \
             self.copyFile(self.Config["resources"]["w2v"]["created_model_path"] + '.pkl')
     if "indexer" in self.Config["resources"]:
         self.Config["resources"]["indexer"] = self.copyFile(
             self.Config["resources"]["indexer"])
     if "vectorizer" in self.Config["resources"]:
         self.Config["resources"]["vectorizer"] = self.copyFile(
             self.Config["resources"]["vectorizer"])
     if "ptBertModel" in self.Config["resources"]:
         self.Config["resources"]["ptBertModel"] = self.copyFile(
             self.Config["resources"]["ptBertModel"])
         self.Config["resources"]["vocabPath"] = self.copyFile(
             self.Config["resources"]["vocabPath"])
     cNames = [''] * len(self.Config["predefined_categories"])
     for k, v in self.Config["predefined_categories"].items():
         cNames[v] = k
     with open(self.outDir + 'labels.txt', 'w', encoding="utf-8") as file:
         file.write(",".join(cNames))
     self.Config["resources"]["labels"] = "labels.txt"
     self.Config["resources"]["consolidatedRank"] = self.rank_threshold
     with open(self.outDir + 'config.json', 'w', encoding="utf-8") as file:
         json.dump(self.Config["resources"], file, indent=4)
     de = datetime.datetime.now()
     print("\nArtifacts are copied into the folder %s in %s" %
           (get_abs_path(self.Config, "saved_resources_path"),
            get_formatted_date(ds, de)))
Example #17
 def load_w2v_model(self):
     print ("Load W2V model...")
     ds = datetime.datetime.now()
     self.Config["w2vmodel"] = \
         gensim.models.KeyedVectors.load_word2vec_format(get_abs_path(self.Config, "model_path"))
     de = datetime.datetime.now()
     print("Load W2V model (%s) in %s" % (get_abs_path(self.Config, "model_path"), get_formatted_date(ds, de)))
Example #18
 def getWordVectorsMatrix(self):
     tokenizer = None
     ds = datetime.datetime.now()
     if self.model.Config["type_of_execution"] != "test":
         tokenizer = Tokenizer(num_words=self.maxWords)
         trainTexts = []
         for t in self.model.Config[self.keyTrain]:
             trainTexts.append(t.lines)
         tokenizer.fit_on_texts(trainTexts)
         if not self.model.isCV:
             with open(get_abs_path(self.model.Config, "indexer_path"),
                       'wb') as handle:
                 pickle.dump(tokenizer,
                             handle,
                             protocol=pickle.HIGHEST_PROTOCOL)
             if self.model.Config["max_doc_len"] > self.model.Config[
                     "max_seq_len"]:
                 print(
                     "Most of documents from training set have less then %d tokens. Longer documents will be truncated."
                     % (self.model.Config["max_seq_len"]))
         self.model.trainArrays = pad_sequences(
             tokenizer.texts_to_sequences(trainTexts),
             maxlen=self.model.Config["max_seq_len"])
         self.model.trainLabels = numpy.concatenate([
             numpy.array(x.labels).reshape(
                 1, len(self.model.Config["predefined_categories"]))
             for x in self.model.Config[self.keyTrain]
         ])
         if self.addValSet:
             ind = int(
                 len(self.model.trainArrays) *
                 (1 - self.validation_data_size))
             self.model.valArrays = self.model.trainArrays[ind:]
             self.model.valLabels = self.model.trainLabels[ind:]
             self.model.trainArrays = self.model.trainArrays[:ind]
             self.model.trainLabels = self.model.trainLabels[:ind]
     if tokenizer is None:
         with open(get_abs_path(self.model.Config, "indexer_path"),
                   'rb') as handle:
             tokenizer = pickle.load(handle)
     testTexts = []
     for t in self.model.Config[self.keyTest]:
         testTexts.append(t.lines)
     self.model.testArrays = pad_sequences(
         tokenizer.texts_to_sequences(testTexts),
         maxlen=self.model.Config["max_seq_len"])
     self.model.testLabels = numpy.concatenate([
         numpy.array(x.labels).reshape(
             1, len(self.model.Config["predefined_categories"]))
         for x in self.model.Config[self.keyTest]
     ])
     embedding_matrix = numpy.zeros((self.maxWords, self.ndim))
     word_index = tokenizer.word_index
     nf = 0
     for word, i in word_index.items():
         if i < self.maxWords:
             try:
                 embedding_vector = self.model.w2vModel[word]
             except KeyError:
                 nf += 1
                 continue
             if embedding_vector is not None:
                 embedding_matrix[i] = embedding_vector
     self.model.embMatrix = embedding_matrix
     self.model.maxWords = self.maxWords
     if self.model.isCV:
         return
     de = datetime.datetime.now()
     print('Found %s unique tokens.' % len(tokenizer.word_index))
     print('Tokens not found in W2V vocabulary: %d' % nf)
     print("All data prepared and embedding matrix built in %s" %
           (get_formatted_date(ds, de)))
     return embedding_matrix, self.maxWords
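Example #18 returns the embedding matrix together with the vocabulary size; in a typical Keras pipeline these would initialize a frozen Embedding layer. A minimal sketch under that assumption (the helper name and the decision to freeze the weights are illustrative, not taken from the source):

from keras.layers import Embedding


def build_embedding_layer(emb_matrix, max_words, max_seq_len):
    # Embedding layer seeded with the pre-trained W2V vectors and kept frozen during training.
    return Embedding(input_dim=max_words,
                     output_dim=emb_matrix.shape[1],
                     weights=[emb_matrix],
                     input_length=max_seq_len,
                     trainable=False)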