def run(self):
    # Create the W2V model.
    sentences = []
    count = 0
    print("Start to create W2V model...")
    print("Get input data...")
    ds = datetime.datetime.now()
    with open(get_abs_path(self.Config, "data_corpus_path"), 'r', encoding='UTF-8') as f:
        for line in f:
            if len(line.strip()) == 0:
                continue
            count += 1
            words = [w for w in line.strip().split()]
            sentences.append(words)
    de = datetime.datetime.now()
    print("Got %d lines from file %s in %s" % (count, get_abs_path(self.Config, "data_corpus_path"),
                                               get_formatted_date(ds, de)))
    numpy.random.shuffle(sentences)
    logger = EpochLogger(self.epochs)
    w2v = Word2Vec(size=self.ndim, window=10, min_count=3, workers=10)
    ds = datetime.datetime.now()
    print("Build vocabulary...")
    w2v.build_vocab(sentences)
    de = datetime.datetime.now()
    print("Vocabulary is built in %s" % (get_formatted_date(ds, de)))
    print("Train model...")
    ds = datetime.datetime.now()
    w2v.train(sentences, epochs=int(self.Config["epochs_total"]),
              total_examples=len(sentences), callbacks=[logger])
    de = datetime.datetime.now()
    print("W2V model is completed in %s" % (get_formatted_date(ds, de)))
    created_model_path = get_abs_path(self.Config, "model_path")
    # Start from the configured file name and optionally add a timestamp to it.
    modelName = os.path.basename(created_model_path)
    if self.Config["include_current_time_in_model_name"]:
        dt = "-" + datetime.datetime.now().strftime("%Y-%b-%d-%H%M%S")
        pInd = modelName.rfind(".")
        if pInd > 0:
            modelName = modelName[:pInd] + dt + modelName[pInd:]
        else:
            modelName += dt
    finalPath = os.path.dirname(created_model_path) + "/" + modelName
    ds = datetime.datetime.now()
    w2v.wv.save_word2vec_format(finalPath, binary=False)
    de = datetime.datetime.now()
    print("W2V model %s is saved in the text format in %s\n" % (finalPath, get_formatted_date(ds, de)))

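# EpochLogger is used above but its definition is not part of this listing. A minimal
# sketch of what such a callback could look like, assuming it is based on gensim's
# CallbackAny2Vec and only reports progress (the constructor argument and messages are
# illustrative, not the original implementation):
from gensim.models.callbacks import CallbackAny2Vec

class EpochLogger(CallbackAny2Vec):
    def __init__(self, total_epochs):
        self.total_epochs = total_epochs
        self.epoch = 0

    def on_epoch_begin(self, model):
        print("Epoch %d of %d..." % (self.epoch + 1, self.total_epochs))

    def on_epoch_end(self, model):
        self.epoch += 1
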
def getWordVectorsSum(self):
    self.nfWords = 0
    self.sdict = dict()
    self.tmpCount = 0
    if self.model.Config["type_of_execution"] != "test":
        ds = datetime.datetime.now()
        self.model.trainArrays = numpy.concatenate([
            self.getDocsArray(x.words, 'Train')
            for x in self.model.Config[self.keyTrain]
        ])
        self.model.trainLabels = numpy.concatenate([
            numpy.array(x.labels).reshape(1, len(self.model.Config["predefined_categories"]))
            for x in self.model.Config[self.keyTrain]
        ])
        if self.addValSet:
            # Split the tail of the training set off as a validation set.
            ind = int(len(self.model.trainArrays) * (1 - self.validation_data_size))
            self.model.valArrays = self.model.trainArrays[ind:]
            self.model.valLabels = self.model.trainLabels[ind:]
            self.model.trainArrays = self.model.trainArrays[:ind]
            self.model.trainLabels = self.model.trainLabels[:ind]
            de = datetime.datetime.now()
            print("Prepare train and validation data in %s" % (get_formatted_date(ds, de)))
        else:
            de = datetime.datetime.now()
            print("Prepare train data in %s" % (get_formatted_date(ds, de)))
    self.tmpCount = 0
    ds = datetime.datetime.now()
    self.model.testArrays = numpy.concatenate([
        self.getDocsArray(x.words, "Test")
        for x in self.model.Config[self.keyTest]
    ])
    self.model.testLabels = numpy.concatenate([
        numpy.array(x.labels).reshape(1, len(self.model.Config["predefined_categories"]))
        for x in self.model.Config[self.keyTest]
    ])
    if self.model.isCV:
        return
    de = datetime.datetime.now()
    print("Prepare test data in %s" % (get_formatted_date(ds, de)))
    print("Unique words in all documents: %d" % (len(self.sdict)))
    print("Words not found in the w2v vocabulary: %d" % (self.nfWords))

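# getDocsArray() is called above but not shown. A minimal standalone sketch of the idea
# behind it - represent a document as the sum of the w2v vectors of its words - with
# illustrative names only (the real method also tracks the sdict/nfWords statistics):
import numpy

def sum_word_vectors(words, w2v_model, ndim):
    vec = numpy.zeros((1, ndim))  # one row per document, ready for numpy.concatenate
    for w in words:
        if w in w2v_model:
            vec += w2v_model[w]
    return vec
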
def run(self):
    lib_path = get_abs_path(self.Config, "set_of_docs_lang_tokenization_lib_path")
    print("GRISHA use set_of_docs_lang_tokenization")
    if not lib_path or not os.path.exists(lib_path):
        raise ValueError("Wrong path to the tagger's jar. Tokenization can't be done.")
    in_path = self.Config["home"] + "/" + self.Config["source_path"]
    if not self.Config["source_path"] or self.Config["source_path"] == self.Config["target_path"]:
        raise ValueError("Wrong source/target path(s). Tokenization can't be done.")
    out_path = self.Config["home"] + "/" + self.Config["target_path"]
    stop_words = ",".join(list(stopwords.words('arabic'))) if self.Config["stop_words"] == "True" else ""
    ds = datetime.datetime.now()
    # Run the external tagger; all options are passed as quoted command-line arguments.
    srv = subprocess.Popen(
        'java -Xmx2g -jar ' + lib_path + ' "' + in_path + '" "' + out_path + '" "'
        + self.Config["exclude_positions"] + '" "' + stop_words + '" "'
        + self.Config["extra_words"] + '" "' + self.Config["normalization"] + '" "'
        + self.Config["language_tokenization"] + '"',
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    # communicate() waits for the process to finish and avoids the wait()+PIPE deadlock.
    reply = srv.communicate()
    de = datetime.datetime.now()
    print(reply[0].decode())
    print("The whole process is done in %s" % (get_formatted_date(ds, de)))

def load_data(self):
    if self.Config["enable_tokenization"] == "True":
        print("Start loading and preprocessing data...")
    else:
        print("Start loading data...")
    ds = datetime.datetime.now()
    self.Config["predefined_categories"] = self.get_categories(get_abs_path(self.Config, "train_data_path"))
    train_docs = self.get_data_docs(get_abs_path(self.Config, "train_data_path"))
    if not self.splitTrain:
        test_docs = self.get_data_docs(get_abs_path(self.Config, "test_data_path"))
    else:
        # No separate test set: shuffle the training documents and split off a test part.
        ind = int(len(train_docs) * (1 - self.sz))
        random.shuffle(train_docs)
        test_docs = train_docs[ind:]
        train_docs = train_docs[:ind]
    de = datetime.datetime.now()
    self.Config["train_docs"] = random.sample(train_docs, len(train_docs))
    self.Config["test_docs"] = random.sample(test_docs, len(test_docs))
    self.get_max_seq_len()
    self.get_max_chars_length()
    if self.Config["enable_tokenization"] == "True" \
            and self.Config["language_tokenization"] == "True" \
            and self.Config["use_java"] == "True":
        # Ask the external tokenizer process to shut down.
        self.jar.stdin.write('!!! STOP !!!\n')
        self.jar.stdin.flush()
    print("Input data loaded in %s" % (get_formatted_date(ds, de)))
    print("Training set contains %d documents." % (len(self.Config["train_docs"])))
    print("Testing set contains %d documents." % (len(self.Config["test_docs"])))
    print("Documents belong to %d categories." % (len(self.Config["predefined_categories"])))

def tokenize_file(Config, parser, inPath, outPath):
    outFile = open(outPath, 'w', encoding='UTF-8')
    ds = datetime.datetime.now()
    q = 0   # lines read
    qt = 0  # tokens processed
    with open(inPath, 'r', encoding='UTF-8') as f:
        for line in f:
            q += 1
            result = '\n' if q > 1 else ''
            line = line.replace('\r', '').replace('\n', '')
            if not line:
                continue
            toks = line.split()
            if len(toks) < 3:
                continue
            qt += len(toks)
            tArr = parser.tag(toks)
            result += joinTokens(tArr, Config).strip()
            outFile.write(result)
    de = datetime.datetime.now()
    print("File %s (%d lines, %d tokens) tokenized in %s" % (outPath, q, qt, get_formatted_date(ds, de)))
    outFile.close()

def tokens_from_tagger(Config):
    print("GRISHA tokens_from_tagger()")
    test_path(Config, "set_of_docs_lang_tokenization_lib_path",
              "Wrong path to the tagger's jar. Tokenization can't be done")
    tagger_path = get_abs_path(Config, "set_of_docs_lang_tokenization_lib_path")
    source_path = Config["home"] + "/" + Config["source_path"]
    target_path = Config["home"] + "/" + Config["target_path"]
    stop_words = ",".join(list(stopwords.words('arabic'))) if Config["stop_words"] == "True" else ""
    ds = datetime.datetime.now()
    # Run the external tagger; all options are passed as quoted command-line arguments.
    srv = subprocess.Popen(
        'java -Xmx2g -jar ' + tagger_path + ' "' + source_path + '" "' + target_path + '" "'
        + Config["exclude_positions"] + '" "' + stop_words + '" "'
        + Config["extra_words"] + '" "' + Config["normalization"] + '" "'
        + Config["language_tokenization"] + '"',
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    # communicate() waits for the process to finish and avoids the wait()+PIPE deadlock.
    reply = srv.communicate()
    de = datetime.datetime.now()
    print(reply[0].decode())
    print("The whole process is done in %s" % (get_formatted_date(ds, de)))

def trainSKLModel(self):
    ds = datetime.datetime.now()
    print("Start training...")
    self.model.fit(self.trainArrays, self.trainLabels)
    de = datetime.datetime.now()
    print("Model is trained in %s" % (get_formatted_date(ds, de)))
    if self.isCV:
        return
    joblib.dump(self.model, get_abs_path(self.Config, "created_model_path", opt="name"))
    print("Model is saved in %s" % get_abs_path(self.Config, "created_model_path", opt="name"))
    print("Model evaluation...")
    ds = datetime.datetime.now()
    prediction = self.model.predict(self.testArrays)
    print('Final accuracy is %.2f' % accuracy_score(self.testLabels, prediction))
    de = datetime.datetime.now()
    print("Evaluated in %s" % get_formatted_date(ds, de))

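# Note on the accuracy reported above: with multi-label indicator arrays, sklearn's
# accuracy_score computes subset accuracy - a document counts as correct only if
# every one of its labels matches. A small self-contained illustration:
import numpy
from sklearn.metrics import accuracy_score

y_true = numpy.array([[1, 0, 1], [0, 1, 0]])
y_pred = numpy.array([[1, 0, 1], [0, 1, 1]])
print(accuracy_score(y_true, y_pred))  # 0.5: only the first document matches exactly
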
def tokenize(Config):
    parser = CoreNLPParser(url='http://localhost:' + Config["servport"], tagtype='pos')
    inPath = Config["home"] + "/" + Config["source_path"]
    outPath = Config["home"] + "/" + Config["target_path"]
    fds = datetime.datetime.now()
    tokenize_data(Config, parser, inPath, outPath)
    fde = datetime.datetime.now()
    print("Tokenization completed in %s" % (get_formatted_date(fds, fde)))

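# tokenize() assumes a Stanford CoreNLP server is already listening on
# localhost:<servport>; one is typically started from the CoreNLP distribution
# directory with something like (memory size and port are illustrative):
#   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000
# A hypothetical minimal Config for a call to tokenize() could then look like:
Config = {
    "servport": "9000",
    "home": "/data/project",          # illustrative paths, not from the original code
    "source_path": "corpus/raw.txt",
    "target_path": "corpus/tokenized.txt",
}
tokenize(Config)
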
def testNNModel(self):
    print("Start testing...")
    print("Rank threshold: %.2f" % self.rank_threshold)
    ds = datetime.datetime.now()
    self.predictions = self.model.predict(self.testArrays)
    de = datetime.datetime.now()
    print("Test dataset containing %d documents predicted in %s\n"
          % (len(self.testArrays), get_formatted_date(ds, de)))
    if self.isCV:
        return
    self.prepare_resources_for_runtime("keras")
    self.get_metrics()
    self.save_results()

def trainNNModel(self):
    checkpoints = []
    if self.save_intermediate_results and not self.isCV:
        # Keep the best model (by validation accuracy) seen during training.
        checkpoint = ModelCheckpoint(
            get_abs_path(self.Config, "intermediate_results_path") + "/tempModel.hdf5",
            monitor='val_acc', verbose=self.verbose, save_best_only=True, mode='auto')
        checkpoints.append(checkpoint)
    print("Start training...")
    ds = datetime.datetime.now()
    self.model.fit(self.trainArrays, self.trainLabels,
                   epochs=self.epochs,
                   validation_data=(self.valArrays, self.valLabels),
                   batch_size=self.train_batch,
                   verbose=self.verbose,
                   callbacks=checkpoints,
                   shuffle=False)
    de = datetime.datetime.now()
    print("Model is trained in %s" % (get_formatted_date(ds, de)))
    if self.isCV:
        return
    self.model.save(get_abs_path(self.Config, "created_model_path", opt="name"))
    print("Model evaluation...")
    scores1 = self.model.evaluate(self.testArrays, self.testLabels, verbose=self.verbose)
    print("Final model accuracy: %.2f%%" % (scores1[1] * 100))
    if self.save_intermediate_results:
        model1 = load_model(get_abs_path(self.Config, "intermediate_results_path") + "/tempModel.hdf5")
        scores2 = model1.evaluate(self.testArrays, self.testLabels, verbose=self.verbose)
        print("Last saved model accuracy: %.2f%%" % (scores2[1] * 100))
        if scores1[1] < scores2[1]:
            # The checkpointed model is better: keep and save it instead.
            self.model = model1
            pref = "The best model "
        else:
            pref = "Model "
        self.model.save(get_abs_path(self.Config, "created_model_path", opt="name"))
        print(pref + "is saved in %s" % get_abs_path(self.Config, "created_model_path", opt="name"))

def load_w2v_model(self):
    if self.Config["w2vmodel"] is not None:
        print("W2V model is already loaded...")
        self.w2vModel = self.Config["w2vmodel"]
        return
    print("Load W2V model... ")
    ds = datetime.datetime.now()
    self.w2vModel = gensim.models.KeyedVectors.load_word2vec_format(get_abs_path(self.Config, "model_path"))
    de = datetime.datetime.now()
    print("Load W2V model (%s) in %s" % (get_abs_path(self.Config, "model_path"), get_formatted_date(ds, de)))
    self.Config["resources"]["w2v"]["created_model_path"] = get_abs_path(self.Config, "model_path")
    self.Config["resources"]["w2v"]["ndim"] = self.ndim

def launch_crossvalidation(self):
    print("Start cross-validation...")
    ds = datetime.datetime.now()
    dp = DataPreparation(self, self.addValSet)
    pSize = len(self.cvDocs) // self.cross_validations_total
    ind = 0
    f1 = 0
    attr_metrics = []
    for i in range(self.cross_validations_total):
        print("Cross-validation, cycle %d of %d..." % ((i + 1), self.cross_validations_total))
        # Use the i-th slice of cvDocs as the test fold and the rest for training.
        if i == 0:
            self.Config["cross_validations_train_docs"] = self.cvDocs[pSize:]
            self.Config["cross_validations_test_docs"] = self.cvDocs[:pSize]
        elif i == self.cross_validations_total - 1:
            self.Config["cross_validations_train_docs"] = self.cvDocs[:ind]
            self.Config["cross_validations_test_docs"] = self.cvDocs[ind:]
        else:
            self.Config["cross_validations_train_docs"] = self.cvDocs[:ind] + self.cvDocs[ind + pSize:]
            self.Config["cross_validations_test_docs"] = self.cvDocs[ind:ind + pSize]
        ind += pSize
        dp.getVectors(self.handleType)
        self.model = self.create_model()
        self.train_model()
        self.test_model()
        ModelMetrics(self)
        attr_metrics.append(self.metrics)
        cycleF1 = self.metrics["all"]["f1"]
        print("Resulting F1-Measure: %f\n" % cycleF1)
        if cycleF1 > f1:
            if self.Config["save_cross_validations_datasets"]:
                self.save_data_sets()
            f1 = cycleF1
    de = datetime.datetime.now()
    print("Cross-validation is done in %s" % get_formatted_date(ds, de))
    print_averaged_metrics(attr_metrics, self.Config)
    print("The best result is %f" % (f1))
    print("Corresponding data sets are saved in the folder %s"
          % get_abs_path(self.Config, "cross_validations_datasets_path"))

def testSKLModel(self):
    print("Start testing...")
    if self.useProbabilities:
        print("Rank threshold: %.2f" % self.rank_threshold)
    else:
        print("Model doesn't calculate probabilities.")
    ds = datetime.datetime.now()
    if not self.useProbabilities:
        self.predictions = self.model.predict(self.testArrays)
    else:
        self.predictions = self.model.predict_proba(self.testArrays)
    de = datetime.datetime.now()
    print("Test dataset containing %d documents predicted in %s"
          % (self.testArrays.shape[0], get_formatted_date(ds, de)))
    if self.isCV:
        return
    self.prepare_resources_for_runtime("skl")
    self.get_metrics()
    self.save_results()

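# get_metrics() and save_results() are not shown here. A minimal sketch of how a rank
# threshold is typically applied to per-category probabilities to obtain multi-label
# decisions (illustrative only - the actual logic lives in get_metrics()):
import numpy

def probabilities_to_labels(probabilities, rank_threshold):
    # Mark every category whose predicted probability reaches the threshold.
    return (numpy.asarray(probabilities) >= rank_threshold).astype(int)

print(probabilities_to_labels([[0.9, 0.2, 0.6], [0.1, 0.4, 0.7]], 0.5))
# [[1 0 1]
#  [0 0 1]]
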
def getDataForSklearnClassifiers(self):
    mlb = None
    ds = datetime.datetime.now()
    if self.model.Config["type_of_execution"] != "test":
        # Order the category names by their indexes so the label columns match them.
        nmCats = [""] * len(self.model.Config["predefined_categories"])
        for k in list(self.model.Config["predefined_categories"].keys()):
            nmCats[self.model.Config["predefined_categories"][k]] = k
        mlb = MultiLabelBinarizer(classes=nmCats)
        wev = TfidfVectorizer(ngram_range=(1, 3), max_df=0.50).fit(
            [x.lines for x in self.model.Config[self.keyTrain]],
            [x.nlabs for x in self.model.Config[self.keyTrain]])
        self.model.trainArrays = wev.transform([x.lines for x in self.model.Config[self.keyTrain]])
        self.model.trainLabels = mlb.fit_transform([x.nlabs for x in self.model.Config[self.keyTrain]])
        if not self.model.isCV:
            with open(get_abs_path(self.model.Config, "binarizer_path"), 'wb') as handle:
                pickle.dump(mlb, handle, protocol=pickle.HIGHEST_PROTOCOL)
            with open(get_abs_path(self.model.Config, "vectorizer_path"), 'wb') as handle:
                pickle.dump(wev, handle, protocol=pickle.HIGHEST_PROTOCOL)
    if not mlb:
        # Test-only run: load the binarizer and vectorizer saved during training.
        with open(get_abs_path(self.model.Config, "binarizer_path"), 'rb') as handle:
            mlb = pickle.load(handle)
        with open(get_abs_path(self.model.Config, "vectorizer_path"), 'rb') as handle:
            wev = pickle.load(handle)
    self.model.testArrays = wev.transform([x.lines for x in self.model.Config[self.keyTest]])
    self.model.testLabels = mlb.fit_transform([x.nlabs for x in self.model.Config[self.keyTest]])
    de = datetime.datetime.now()
    print("Prepare all data in %s" % (get_formatted_date(ds, de)))

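# Why MultiLabelBinarizer is given classes=nmCats above: fixing the classes pins the
# column order of the label matrix to the category indexes in predefined_categories.
# A small self-contained illustration:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(classes=["economy", "politics", "sport"])
print(mlb.fit_transform([["sport"], ["politics", "economy"]]))
# [[0 0 1]
#  [1 1 0]]
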
def getCharVectors(self):
    ds = datetime.datetime.now()
    """
    if self.model.Config["max_chars_doc_len"] > self.model.Config["max_chars_seq_len"]:
        print("Most documents in the training set have fewer than %d characters. "
              "Longer documents will be truncated." % (self.model.Config["max_chars_seq_len"]))
    """
    if self.model.Config["type_of_execution"] != "test":
        self.model.trainArrays = numpy.concatenate([
            self.stringToIndexes(" ".join(x.words))
            for x in self.model.Config[self.keyTrain]
        ])
        self.model.trainLabels = numpy.concatenate([
            numpy.array(x.labels).reshape(1, len(self.model.Config["predefined_categories"]))
            for x in self.model.Config[self.keyTrain]
        ])
        if self.addValSet:
            # Split the tail of the training set off as a validation set.
            ind = int(len(self.model.trainArrays) * (1 - self.validation_data_size))
            self.model.valArrays = self.model.trainArrays[ind:]
            self.model.valLabels = self.model.trainLabels[ind:]
            self.model.trainArrays = self.model.trainArrays[:ind]
            self.model.trainLabels = self.model.trainLabels[:ind]
    self.model.testArrays = numpy.concatenate([
        self.stringToIndexes(" ".join(x.words))
        for x in self.model.Config[self.keyTest]
    ])
    self.model.testLabels = numpy.concatenate([
        numpy.array(x.labels).reshape(1, len(self.model.Config["predefined_categories"]))
        for x in self.model.Config[self.keyTest]
    ])
    if self.model.isCV:
        return
    de = datetime.datetime.now()
    print("Prepare all data in %s" % (get_formatted_date(ds, de)))

def prepare_resources_for_runtime(self):
    tokenization_options = [
        "language_tokenization", "normalization", "stop_words",
        "exclude_positions", "extra_words", "max_seq_len",
        "max_chars_seq_len", "single_doc_lang_tokenization_lib_path"
    ]
    self.Config["resources"]["tokenization"] = {}
    ds = datetime.datetime.now()
    self.outDir = get_abs_path(self.Config, "saved_resources_path") + "/"
    # Copy the tokenization settings (and, if needed, the tokenization jar) into the resources.
    for t in tokenization_options:
        if t != "single_doc_lang_tokenization_lib_path":
            self.Config["resources"]["tokenization"][t] = self.Config[t]
        elif self.Config["language_tokenization"] == "True":
            self.Config["resources"]["tokenization"]["single_doc_lang_tokenization_lib_path"] = \
                self.copyFile(get_abs_path(self.Config, "single_doc_lang_tokenization_lib_path"))
    isW2VNeeded = False
    for key, val in self.Config["resources"]["models"].items():
        val["created_model_path"] = self.copyFile(val["created_model_path"])
        if "w2v" in val and val["w2v"] == "True":
            isW2VNeeded = True
    if not isW2VNeeded and "w2v" in self.Config["resources"]:
        self.Config["resources"].pop("w2v", None)
    if "w2v" in self.Config["resources"]:
        # Convert the text-format w2v model into a pickled word -> vector dictionary.
        w2vDict = {}
        isFirstLine = True
        fEmbeddings = open(self.Config["resources"]["w2v"]["created_model_path"], encoding="utf-8")
        for line in fEmbeddings:
            if isFirstLine:
                isFirstLine = False
                continue
            split = line.strip().split(" ")
            word = split[0]
            vector = numpy.array([float(num) for num in split[1:]])
            w2vDict[word] = vector
        fEmbeddings.close()
        with open(self.Config["resources"]["w2v"]["created_model_path"] + '.pkl', 'wb') as file:
            pickle.dump(w2vDict, file, pickle.HIGHEST_PROTOCOL)
        self.Config["resources"]["w2v"]["created_model_path"] = \
            self.copyFile(self.Config["resources"]["w2v"]["created_model_path"] + '.pkl')
    if "indexer" in self.Config["resources"]:
        self.Config["resources"]["indexer"] = self.copyFile(self.Config["resources"]["indexer"])
    if "vectorizer" in self.Config["resources"]:
        self.Config["resources"]["vectorizer"] = self.copyFile(self.Config["resources"]["vectorizer"])
    if "ptBertModel" in self.Config["resources"]:
        self.Config["resources"]["ptBertModel"] = self.copyFile(self.Config["resources"]["ptBertModel"])
        self.Config["resources"]["vocabPath"] = self.copyFile(self.Config["resources"]["vocabPath"])
    # Save the list of category names in index order.
    cNames = [''] * len(self.Config["predefined_categories"])
    for k, v in self.Config["predefined_categories"].items():
        cNames[v] = k
    with open(self.outDir + 'labels.txt', 'w', encoding="utf-8") as file:
        file.write(",".join(cNames))
    self.Config["resources"]["labels"] = "labels.txt"
    self.Config["resources"]["consolidatedRank"] = self.rank_threshold
    with open(self.outDir + 'config.json', 'w', encoding="utf-8") as file:
        json.dump(self.Config["resources"], file, indent=4)
    de = datetime.datetime.now()
    print("\nArtifacts are copied into the folder %s in %s"
          % (get_abs_path(self.Config, "saved_resources_path"), get_formatted_date(ds, de)))

def load_w2v_model(self):
    print("Load W2V model...")
    ds = datetime.datetime.now()
    self.Config["w2vmodel"] = \
        gensim.models.KeyedVectors.load_word2vec_format(get_abs_path(self.Config, "model_path"))
    de = datetime.datetime.now()
    print("Load W2V model (%s) in %s" % (get_abs_path(self.Config, "model_path"), get_formatted_date(ds, de)))

def getWordVectorsMatrix(self):
    tokenizer = None
    ds = datetime.datetime.now()
    if self.model.Config["type_of_execution"] != "test":
        tokenizer = Tokenizer(num_words=self.maxWords)
        trainTexts = []
        for t in self.model.Config[self.keyTrain]:
            trainTexts.append(t.lines)
        tokenizer.fit_on_texts(trainTexts)
        if not self.model.isCV:
            with open(get_abs_path(self.model.Config, "indexer_path"), 'wb') as handle:
                pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        if self.model.Config["max_doc_len"] > self.model.Config["max_seq_len"]:
            print("Most documents in the training set have fewer than %d tokens. "
                  "Longer documents will be truncated." % (self.model.Config["max_seq_len"]))
        self.model.trainArrays = pad_sequences(tokenizer.texts_to_sequences(trainTexts),
                                               maxlen=self.model.Config["max_seq_len"])
        self.model.trainLabels = numpy.concatenate([
            numpy.array(x.labels).reshape(1, len(self.model.Config["predefined_categories"]))
            for x in self.model.Config[self.keyTrain]
        ])
        if self.addValSet:
            # Split the tail of the training set off as a validation set.
            ind = int(len(self.model.trainArrays) * (1 - self.validation_data_size))
            self.model.valArrays = self.model.trainArrays[ind:]
            self.model.valLabels = self.model.trainLabels[ind:]
            self.model.trainArrays = self.model.trainArrays[:ind]
            self.model.trainLabels = self.model.trainLabels[:ind]
    if tokenizer is None:
        # Test-only run: load the tokenizer saved during training.
        with open(get_abs_path(self.model.Config, "indexer_path"), 'rb') as handle:
            tokenizer = pickle.load(handle)
    testTexts = []
    for t in self.model.Config[self.keyTest]:
        testTexts.append(t.lines)
    self.model.testArrays = pad_sequences(tokenizer.texts_to_sequences(testTexts),
                                          maxlen=self.model.Config["max_seq_len"])
    self.model.testLabels = numpy.concatenate([
        numpy.array(x.labels).reshape(1, len(self.model.Config["predefined_categories"]))
        for x in self.model.Config[self.keyTest]
    ])
    # Build the embedding matrix: row i holds the w2v vector of the token with index i.
    embedding_matrix = numpy.zeros((self.maxWords, self.ndim))
    word_index = tokenizer.word_index
    nf = 0
    for word, i in word_index.items():
        if i < self.maxWords:
            try:
                embedding_vector = self.model.w2vModel[word]
            except KeyError:
                nf += 1
                continue
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    self.model.embMatrix = embedding_matrix
    self.model.maxWords = self.maxWords
    if self.model.isCV:
        return
    de = datetime.datetime.now()
    print('Found %s unique tokens.' % len(tokenizer.word_index))
    print('Tokens not found in the W2V vocabulary: %d' % nf)
    print("All data prepared and embedding matrix built in %s" % (get_formatted_date(ds, de)))
    return embedding_matrix, self.maxWords

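# The network that consumes embMatrix and maxWords is not part of this listing. A
# minimal sketch of a typical consumer, assuming a Keras model whose Embedding layer
# is initialized with the pre-trained matrix (layer sizes and the architecture are
# illustrative, not the original model):
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

def build_model(emb_matrix, max_words, max_seq_len, n_categories):
    model = Sequential()
    model.add(Embedding(max_words, emb_matrix.shape[1],
                        weights=[emb_matrix], input_length=max_seq_len,
                        trainable=False))                  # keep the w2v vectors fixed
    model.add(LSTM(128))
    model.add(Dense(n_categories, activation='sigmoid'))   # one output per category
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
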