def load_w2v_model(self):
    """Load word2vec-format vectors into ``self.Config["w2vmodel"]``.

    The file location is whatever ``get_abs_path`` returns for the
    "model_path" key.  Progress and elapsed time are printed to stdout.
    """
    print("Load W2V model...")
    started = datetime.datetime.now()
    vectors = gensim.models.KeyedVectors.load_word2vec_format(
        get_abs_path(self.Config, "model_path"))
    self.Config["w2vmodel"] = vectors
    finished = datetime.datetime.now()
    summary = (get_abs_path(self.Config, "model_path"),
               get_formatted_date(started, finished))
    print("Load W2V model (%s) in %s" % summary)
def load_data(self):
    """Read the training/testing documents and store them in ``self.Config``.

    Categories come from ``self.get_categories`` and documents from
    ``self.get_data_docs``.  When ``self.splitTrain`` is set, the training
    documents are shuffled and their tail (fraction ``self.sz``) becomes the
    testing set; otherwise the testing set is read from "test_data_path".
    Both sets are stored in random order under "train_docs"/"test_docs".
    """
    cfg = self.Config
    if cfg["enable_tokenization"] == "True":
        print("Start loading and preprocessing of data...")
    else:
        print("Start loading data...")
    started = datetime.datetime.now()
    cfg["predefined_categories"] = self.get_categories(
        get_abs_path(cfg, "train_data_path"))
    train_docs = self.get_data_docs(get_abs_path(cfg, "train_data_path"))
    if self.splitTrain:
        # The split index is computed before shuffling, as only the length
        # matters; the slices are taken from the shuffled list.
        split_at = int(len(train_docs) * (1 - self.sz))
        random.shuffle(train_docs)
        train_docs, test_docs = train_docs[:split_at], train_docs[split_at:]
    else:
        test_docs = self.get_data_docs(get_abs_path(cfg, "test_data_path"))
    finished = datetime.datetime.now()

    cfg["train_docs"] = random.sample(train_docs, len(train_docs))
    cfg["test_docs"] = random.sample(test_docs, len(test_docs))
    self.get_max_seq_len()
    self.get_max_chars_length()

    java_tagger_in_use = all(
        cfg[flag] == "True"
        for flag in ("enable_tokenization", "language_tokenization", "use_java"))
    if java_tagger_in_use:
        # Send the stop marker line to the process behind self.jar.
        self.jar.stdin.write('!!! STOP !!!\n')
        self.jar.stdin.flush()

    print("Input data loaded in %s" % (get_formatted_date(started, finished)))
    print("Training set contains %d documents." % (len(cfg["train_docs"])))
    print("Testing set contains %d documents." % (len(cfg["test_docs"])))
    print("Documents belong to %d categories." % (len(cfg["predefined_categories"])))
def run(self):
    """Create a Word2Vec model from the configured corpus and save it.

    Reads the file at "data_corpus_path" (one whitespace-separated sentence
    per non-blank line), shuffles the sentences, builds the vocabulary,
    trains for ``int(self.Config["epochs_total"])`` epochs and writes the
    vectors in word2vec text format into the directory of "model_path".
    When "include_current_time_in_model_name" is truthy, a timestamp is
    inserted before the file extension (or appended when there is none).
    """
    print("Start to create W2V model...")
    print("Get input data...")
    ds = datetime.datetime.now()
    corpus_path = get_abs_path(self.Config, "data_corpus_path")
    with open(corpus_path, 'r', encoding='UTF-8') as f:
        # split() without arguments ignores surrounding whitespace and
        # returns [] for blank lines, which are dropped.
        sentences = [words for words in (line.split() for line in f) if words]
    count = len(sentences)
    de = datetime.datetime.now()
    print("Got %d lines from file %s in %s" %
          (count, corpus_path, get_formatted_date(ds, de)))

    numpy.random.shuffle(sentences)
    logger = EpochLogger(self.epochs)
    w2v = Word2Vec(size=self.ndim, window=10, min_count=3, workers=10)
    ds = datetime.datetime.now()
    print("Build vocabulary...")
    w2v.build_vocab(sentences)
    de = datetime.datetime.now()
    print("Vocabulary is built in %s" % (get_formatted_date(ds, de)))

    print("Train model...")
    ds = datetime.datetime.now()
    w2v.train(sentences,
              epochs=int(self.Config["epochs_total"]),
              total_examples=len(sentences),
              callbacks=[logger])
    de = datetime.datetime.now()
    print("W2V model is completed in %s" % (get_formatted_date(ds, de)))

    created_model_path = get_abs_path(self.Config, "model_path")
    # Bind the file name up front so the save below also works when no
    # timestamp is requested (previously that path hit an unbound name).
    model_name = os.path.basename(created_model_path)
    # NOTE(review): other options in this project are compared with "True";
    # this one is tested by truthiness, so the string "False" enables it.
    if self.Config["include_current_time_in_model_name"]:
        stamp = "-" + datetime.datetime.now().strftime("%Y-%b-%d-%H%M%S")
        dot = model_name.rfind(".")
        if dot > 0:
            model_name = model_name[:dot] + stamp + model_name[dot:]
        else:
            model_name += stamp
    final_path = os.path.dirname(created_model_path) + "/" + model_name

    ds = datetime.datetime.now()
    w2v.wv.save_word2vec_format(final_path, binary=False)
    de = datetime.datetime.now()
    print("W2V model %s is saved in the text format in %s\n" %
          (final_path, get_formatted_date(ds, de)))
def trainNNModel(self):
    """Fit ``self.model``, then (outside cross-validation) save and evaluate it.

    When ``self.save_intermediate_results`` is set and this is not a
    cross-validation run, a ``ModelCheckpoint`` keeps the best intermediate
    model in "tempModel.hdf5".  After training, that checkpoint is evaluated
    against the final model and the one with the higher score at index 1 of
    ``evaluate()``'s result becomes ``self.model`` and is saved.
    Returns None; in cross-validation mode it returns right after fitting.
    """
    checkpoints = []
    if self.save_intermediate_results and not self.isCV:
        checkpoints.append(ModelCheckpoint(
            get_abs_path(self.Config, "intermediate_results_path") + "/tempModel.hdf5",
            monitor='val_acc',
            verbose=self.verbose,
            save_best_only=True,
            mode='auto'))

    print("Start training... ")
    ds = datetime.datetime.now()
    self.model.fit(self.trainArrays,
                   self.trainLabels,
                   epochs=self.epochs,
                   validation_data=(self.valArrays, self.valLabels),
                   batch_size=self.train_batch,
                   verbose=self.verbose,
                   callbacks=checkpoints,
                   shuffle=False)
    de = datetime.datetime.now()
    print("Model is trained in %s" % (get_formatted_date(ds, de)))
    if self.isCV:
        return

    model_path = get_abs_path(self.Config, "created_model_path", opt="name")
    self.model.save(model_path)
    print("Model evaluation...")
    scores1 = self.model.evaluate(self.testArrays, self.testLabels,
                                  verbose=self.verbose)
    print("Final model accuracy: %.2f%%" % (scores1[1] * 100))

    if self.save_intermediate_results:
        model1 = load_model(
            get_abs_path(self.Config, "intermediate_results_path") + "/tempModel.hdf5")
        scores2 = model1.evaluate(self.testArrays, self.testLabels,
                                  verbose=self.verbose)
        print("Last saved model accuracy: %.2f%%" % (scores2[1] * 100))
        if scores1[1] < scores2[1]:
            # The checkpoint wins: make it the model that is saved below.
            # (It used to be bound to an unused local, so it was never kept.)
            self.model = model1
        pref = "The best model "
    else:
        pref = "Model "
    self.model.save(model_path)
    print(pref + "is saved in %s" % model_path)
def load_w2v_model(self):
    """Make the W2V vectors available as ``self.w2vModel``.

    Reuses ``self.Config["w2vmodel"]`` when it is not None.  Otherwise loads
    the word2vec-format file at "model_path" and records that path and
    ``self.ndim`` under ``self.Config["resources"]["w2v"]``.
    """
    cached = self.Config["w2vmodel"]
    # Identity test on purpose: `!= None` dispatches to the object's own
    # comparison, which may be overloaded (e.g. element-wise on arrays).
    if cached is not None:
        print("W2V model is already loaded...")
        self.w2vModel = cached
        return

    print("Load W2V model... ")
    ds = datetime.datetime.now()
    self.w2vModel = gensim.models.KeyedVectors.load_word2vec_format(
        get_abs_path(self.Config, "model_path"))
    de = datetime.datetime.now()
    print("Load W2V model (%s) in %s" %
          (get_abs_path(self.Config, "model_path"), get_formatted_date(ds, de)))
    self.Config["resources"]["w2v"]["created_model_path"] = get_abs_path(
        self.Config, "model_path")
    self.Config["resources"]["w2v"]["ndim"] = self.ndim
def start_server(Config):
    """Launch the Stanford CoreNLP server in a background thread.

    Changes the working directory to the "servsource" folder, sets
    CLASSPATH to "*", starts the java process through the shell on port
    ``Config["servport"]``, then sleeps 10 seconds before reporting that the
    server is running.  When the java process exits, the worker thread
    changes back to ``initial_dir`` and prints "Server is down".
    """
    stanford_path = get_abs_path(Config, "servsource") + "/"
    os.chdir(stanford_path)
    os.environ["CLASSPATH"] = "*"

    def _on_server_exit():
        # NOTE(review): initial_dir is not assigned in this function;
        # presumably a module-level name - confirm it exists.
        os.chdir(initial_dir)
        print("Server is down")

    def _serve():
        # The command line is assembled inside the worker thread, as before.
        command = (
            'java -Xmx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -serverProperties '
            + stanford_path
            + 'StanfordCoreNLP-arabic.properties -preload tokenize,ssplit,pos '
            + '-status_port ' + Config["servport"]
            + ' -port ' + Config["servport"]
            + ' -timeout 20000')
        server = Popen(command, shell=True)
        server.wait()
        _on_server_exit()

    worker = threading.Thread(target=_serve)
    worker.start()
    time.sleep(10)
    print("Server is running")
def run(self):
    """Tokenize a set of documents by running the external tagger jar.

    Validates the jar path and the source/target settings, runs
    ``java -Xmx2g -jar <jar> ...`` through the shell with the tokenization
    options as quoted arguments, then prints the process's stdout and the
    elapsed time.

    Raises:
        ValueError: if the jar path is empty or missing, or if "source_path"
            is empty or equal to "target_path".
    """
    lib_path = get_abs_path(self.Config,
                            "set_of_docs_lang_tokenization_lib_path")
    print("GRISHA use set_of_docs_lang_tokenization")
    if not lib_path or not os.path.exists(lib_path):
        raise ValueError(
            "Wrong path to the tagger's jar. Tokenization can't be done")
    in_path = self.Config["home"] + "/" + self.Config["source_path"]
    if not self.Config["source_path"] or \
            self.Config["source_path"] == self.Config["target_path"]:
        raise ValueError(
            "Wrong source/target path(s). Tokenization can't be done.")
    out_path = self.Config["home"] + "/" + self.Config["target_path"]
    if self.Config["stop_words"] == "True":
        stop_words = ",".join(stopwords.words('arabic'))
    else:
        stop_words = ""

    ds = datetime.datetime.now()
    srv = subprocess.Popen(
        'java -Xmx2g -jar ' + lib_path + ' "' + in_path + '" "' + out_path
        + '" "' + self.Config["exclude_positions"] + '" "' + stop_words
        + '" "' + self.Config["extra_words"] + '" "'
        + self.Config["normalization"] + '" "'
        + self.Config["language_tokenization"] + '"',
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True)
    # communicate() drains both pipes while waiting for the process to exit.
    # Calling wait() first can deadlock once the child fills a pipe buffer.
    reply = srv.communicate()
    de = datetime.datetime.now()
    print(reply[0].decode())
    print("All process is done in %s" % (get_formatted_date(ds, de)))
def save_additions(self):
    """Flag the resources this model needs at runtime.

    Marks "w2v" and "indexer" as required and sets the handle type to
    "wordVectorsMatrix" in ``self.resources``.  The indexer path is added to
    the shared ``self.Config["resources"]`` only if it is not there yet.
    """
    self.resources["w2v"] = "True"
    shared = self.Config["resources"]
    if "indexer" not in shared:
        shared["indexer"] = get_abs_path(self.Config, "indexer_path")
    self.resources.update(indexer="True", handleType="wordVectorsMatrix")
def save_data_sets(self):
    """Dump the current cross-validation train/test documents to disk.

    Recreates the "cross_validations_datasets_path" folder from scratch and
    writes every document into ``<root>/train/<label>/<name>`` or
    ``<root>/test/<label>/<name>``, once for each of its labels.
    """
    root = get_abs_path(self.Config, "cross_validations_datasets_path")
    # Start from an empty folder; a root that does not exist yet is fine.
    if os.path.isdir(root):
        shutil.rmtree(root)
    os.makedirs(root)

    def write_docs(target_dir, docs):
        # Write each document into one sub-folder per label it carries.
        os.mkdir(target_dir)
        created = set()
        for doc in docs:
            for label in doc.nlabs:
                label_dir = target_dir + "/" + label
                if label not in created:
                    os.mkdir(label_dir)
                    created.add(label)
                with open(label_dir + '/' + doc.name, 'w',
                          encoding="utf-8") as out:
                    out.write(doc.lines)

    write_docs(root + "/train", self.Config["cross_validations_train_docs"])
    write_docs(root + "/test", self.Config["cross_validations_test_docs"])
def tokens_from_tagger(Config):
    """Tokenize a set of documents by running the external tagger jar.

    The jar path setting is checked with ``test_path`` first.  The jar is
    then run through the shell with the source/target folders and the
    tokenization options as quoted arguments; its stdout and the elapsed
    time are printed.
    """
    print("GRISHA tokens_from_tagger()")
    test_path(Config, "set_of_docs_lang_tokenization_lib_path",
              "Wrong path to the tagger's jar. Tokenization can't be done")
    tagger_path = get_abs_path(Config,
                               "set_of_docs_lang_tokenization_lib_path")
    source_path = Config["home"] + "/" + Config["source_path"]
    target_path = Config["home"] + "/" + Config["target_path"]
    if Config["stop_words"] == "True":
        stop_words = ",".join(stopwords.words('arabic'))
    else:
        stop_words = ""

    ds = datetime.datetime.now()
    srv = subprocess.Popen(
        'java -Xmx2g -jar ' + tagger_path + ' "' + source_path + '" "'
        + target_path + '" "' + Config["exclude_positions"] + '" "'
        + stop_words + '" "' + Config["extra_words"] + '" "'
        + Config["normalization"] + '" "'
        + Config["language_tokenization"] + '"',
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True)
    # communicate() drains both pipes while waiting for the process to exit.
    # Calling wait() first can deadlock once the child fills a pipe buffer.
    reply = srv.communicate()
    de = datetime.datetime.now()
    print(reply[0].decode())
    print("All process is done in %s" % (get_formatted_date(ds, de)))
def run(self):
    """Validate the data settings, set up preprocessing, then load the data.

    Steps, in order: check the training/testing paths (or the split size
    when no testing path is given), create the tokenization helpers that the
    configuration asks for, optionally load the W2V model, call
    ``self.load_data()`` and, if requested, ``self.analysis()``.

    Raises:
        ValueError: for a test split size outside (0, 1) when no testing
            path is given, a missing W2V model file, or a non-integer
            "vectors_dimension".
    """
    # NOTE(review): test_path is defined elsewhere; presumably it raises
    # with the given message when the configured path is invalid - confirm.
    test_path(self.Config, "train_data_path", "Wrong path to training set. Data can't be loaded.")
    if self.Config["test_data_path"]:
        test_path(self.Config, "test_data_path", "Wrong path to testing set. Data can't be loaded.")
    else:
        # No separate testing set: part of the training set will be used.
        self.splitTrain = True
        try:
            self.sz = float(self.Config["test_data_size"])
        except ValueError:
            # An unparsable size becomes 0 and is rejected just below.
            self.sz = 0
    if not self.Config["test_data_path"] and (self.sz <= 0 or self.sz >= 1):
        raise ValueError("Wrong size of testing set. Data can't be loaded.")
    if self.Config["enable_tokenization"] == "True":
        if self.Config["language_tokenization"] == "True":
            print("GRISHA use single_doc_lang_tokenization")
            if self.Config["use_java"] == "True":
                test_path(self.Config, 'single_doc_lang_tokenization_lib_path',
                          "Wrong path to the tagger's jar. Preprocessing can't be done.")
                lib_path = get_abs_path(self.Config, 'single_doc_lang_tokenization_lib_path')
                command_line = 'java -Xmx2g -jar ' + lib_path + ' "' + self.Config["exclude_positions"] + '"'
                # Start the jar with text-mode pipes; the process is kept on self.jar.
                self.jar = subprocess.Popen(command_line, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                            stderr=subprocess.PIPE, shell=True, encoding="utf-8")
            else:
                self.nlp_tokenizer = stanfordnlp.Pipeline(lang="ar", processors='tokenize,mwt', use_gpu=True)
        if self.Config["stop_words"] == "True":
            self.stop_words = set(nltk.corpus.stopwords.words('arabic'))
        else:
            self.stop_words = set()
        if self.Config["normalization"] == "True":
            self.normalizer = ArabicNormalizer()
    if self.Config["load_w2v_model"] == "True":
        if not self.Config["model_path"] or not os.path.isfile(get_abs_path(self.Config, "model_path")):
            raise ValueError("Wrong path to W2V model. Stop.")
        try:
            self.ndim = int(self.Config["vectors_dimension"])
        except ValueError:
            raise ValueError("Wrong size of vectors' dimentions. Stop.")
        # Record the model location and dimension in the shared resources.
        self.Config["resources"]["w2v"]["created_model_path"] = get_abs_path(self.Config, "model_path")
        self.Config["resources"]["w2v"]["ndim"] = self.ndim
        self.load_w2v_model()
    else:
        self.Config["w2vmodel"] = None
    self.load_data()
    if self.Config["analysis"] == "True":
        self.analysis()
def trainSKLModel(self):
    """Fit ``self.model``, then (outside cross-validation) save and evaluate it.

    The fitted model is dumped with joblib to the created-model path, after
    which predictions on ``self.testArrays`` are scored with
    ``accuracy_score``.  Training and evaluation times are printed
    separately.  Returns None; in cross-validation mode it returns right
    after fitting.
    """
    train_start = datetime.datetime.now()
    print("Start training...")
    self.model.fit(self.trainArrays, self.trainLabels)
    train_end = datetime.datetime.now()
    print("Model is trained in %s" % (get_formatted_date(train_start, train_end)))
    if self.isCV:
        return

    model_path = get_abs_path(self.Config, "created_model_path", opt="name")
    joblib.dump(self.model, model_path)
    print("Model is saved in %s" % model_path)

    print("Model evaluation...")
    # Start the clock here so the reported time excludes saving the model.
    eval_start = datetime.datetime.now()
    prediction = self.model.predict(self.testArrays)
    print('Final accuracy is %.2f' %
          accuracy_score(self.testLabels, prediction))
    eval_end = datetime.datetime.now()
    print("Evaluated in %s" % get_formatted_date(eval_start, eval_end))
def run(self):
    """Consolidate the collected results and produce the requested outputs.

    Does nothing beyond a message when ``self.Config["results"]`` is empty.
    Otherwise it enables report saving and/or runtime-resource collection
    when the corresponding option is "True" and its folder exists, computes
    consolidated results and metrics if they are needed, saves the reports
    and finally empties the runtime folder and fills it with artifacts.
    """
    cfg = self.Config
    print("\nCalculate consolidated metrics...")
    if not cfg["results"]:
        print(
            "No results to consolidate them. Consolidation can not be performed."
        )
        return

    if cfg["save_reports"] == "True":
        reports_dir_ok = bool(cfg["reports_path"]) and os.path.isdir(
            get_abs_path(cfg, "reports_path"))
        if reports_dir_ok:
            self.save_reports = True
        else:
            print(
                "Wrong path to the folder, containing reports. Reports can not be created."
            )

    if cfg["prepare_resources_for_runtime"] == "True":
        runtime_dir_ok = bool(cfg["saved_resources_path"]) and os.path.isdir(
            get_abs_path(cfg, "saved_resources_path"))
        if runtime_dir_ok:
            self.runtime = True
        else:
            print(
                "Wrong path to the folder, containing resources for runtime. Resources can not be saved."
            )

    print("Rank threshold for consolidated results: %.2f" %
          (self.rank_threshold))
    if self.save_reports or cfg["show_consolidated_results"] == "True":
        self.getConsolidatedResults()
        self.get_metrics()
    if self.save_reports:
        self.saveReports()
    if not self.runtime:
        return

    saved_rc_path = get_abs_path(cfg, "saved_resources_path")
    if os.listdir(saved_rc_path):
        print(
            "Warning: folder %s is not empty. All its content will be deleted."
            % saved_rc_path)
        shutil.rmtree(saved_rc_path)
    # Recreate the folder if it has just been removed.
    os.makedirs(saved_rc_path, exist_ok=True)
    print("\nCollect arfifacts for runtime...")
    self.prepare_resources_for_runtime()
def saveReports(self):
    """Write a JSON report for the current request into "reports_path".

    The report holds the preprocessing options, the category names, the
    actual labels of every test document, the labels each model assigned
    (those whose score reaches that model's rank, formatted "name[score]"),
    the metrics and ranks of every model and, when more than one model
    produced results, the consolidated labels taken from
    ``self.predictions``.  The file is named ``<reqid>.json``.
    """
    print("Save report...")
    cfg = self.Config
    test_docs = cfg["test_docs"]

    report = Report()
    report.requestId = cfg["reqid"]
    report.sourcesPath = cfg["actual_path"]
    report.datasetPath = cfg["test_data_path"]
    tokenization_options = [
        "language_tokenization", "normalization", "stop_words",
        "exclude_positions", "extra_words", "exclude_categories"
    ]
    for option in tokenization_options:
        report.preprocess[option] = cfg[option]
    for doc in test_docs:
        report.docs[doc.name] = {}
        report.docs[doc.name]["actual"] = ",".join(doc.nlabs)

    if cfg["exclude_categories"]:
        exclude_categories = cfg["exclude_categories"].split(",")
    else:
        exclude_categories = []
    # NOTE(review): assumes the indices of the remaining categories fit into
    # the shortened list - confirm against how categories are numbered.
    cNames = [''] * (len(cfg["predefined_categories"]) - len(exclude_categories))
    for name, index in cfg["predefined_categories"].items():
        if name not in exclude_categories:
            cNames[index] = name
    report.categories = cNames

    for key, val in cfg["results"].items():
        for i, scores in enumerate(val):
            labs = [
                "%s[%.2f]" % (cNames[j], scores[j])
                for j in range(self.qLabs)
                if scores[j] >= cfg["ranks"][key]
            ]
            report.docs[test_docs[i].name][key] = ",".join(labs)
    for key, val in cfg["metrics"].items():
        report.models[key] = val
    for key, val in cfg["ranks"].items():
        report.ranks[key] = val

    if len(cfg["results"]) > 1:
        for i, flags in enumerate(self.predictions):
            labs = [cNames[j] for j in range(self.qLabs) if flags[j] == 1]
            report.docs[test_docs[i].name]["consolidated"] = ",".join(labs)
        report.models["consolidated"] = self.rank_threshold

    rPath = get_abs_path(cfg, "reports_path") + "/" + cfg["reqid"] + ".json"
    with open(rPath, 'w', encoding="utf-8") as file:
        json.dump(report.toJSON(), file, indent=4)
def prepare_resources_for_runtime(self, type):
    """Describe this model in the shared runtime resources.

    Fills ``self.resources`` with the model path, the model ``type`` and the
    rank threshold (1.0 unless ``self.useProbabilities``), lets
    ``self.save_additions()`` add model-specific entries, forces the handle
    type to "vectorize" for type "skl", and registers the result under
    ``"Model<modelid>"`` in ``self.Config["resources"]["models"]``.
    """
    self.resources["created_model_path"] = get_abs_path(
        self.Config, "created_model_path", opt="name")
    self.resources["modelType"] = type
    self.resources["rank_threshold"] = (
        self.rank_threshold if self.useProbabilities else 1.0)
    self.save_additions()
    if type == "skl":
        self.resources["handleType"] = "vectorize"
    model_key = "Model" + str(self.Config["modelid"])
    self.Config["resources"]["models"][model_key] = self.resources
def getDataForSklearnClassifiers(self):
    """Build TF-IDF features and binarized labels for scikit-learn models.

    Unless the execution type is "test", a ``TfidfVectorizer`` (1-3 grams,
    max_df=0.5) and a ``MultiLabelBinarizer`` over the predefined categories
    are fitted on the training documents and, outside cross-validation,
    pickled to "vectorizer_path"/"binarizer_path".  In "test" mode both are
    loaded from those files instead.  Results are stored on ``self.model``
    as trainArrays/trainLabels and testArrays/testLabels.
    """
    cfg = self.model.Config
    mlb = None
    ds = datetime.datetime.now()
    if cfg["type_of_execution"] != "test":
        categories = cfg["predefined_categories"]
        nmCats = [""] * len(categories)
        for name, index in categories.items():
            nmCats[index] = name
        mlb = MultiLabelBinarizer(classes=nmCats)

        train_docs = cfg[self.keyTrain]
        train_texts = [x.lines for x in train_docs]
        train_labels = [x.nlabs for x in train_docs]
        wev = TfidfVectorizer(ngram_range=(1, 3), max_df=0.50).fit(
            train_texts, train_labels)
        self.model.trainArrays = wev.transform(train_texts)
        self.model.trainLabels = mlb.fit_transform(train_labels)
        if not self.model.isCV:
            with open(get_abs_path(cfg, "binarizer_path"), 'wb') as handle:
                pickle.dump(mlb, handle, protocol=pickle.HIGHEST_PROTOCOL)
            with open(get_abs_path(cfg, "vectorizer_path"), 'wb') as handle:
                pickle.dump(wev, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Explicit None test: the truth value of an estimator object is not a
    # reliable "was it created" signal.
    if mlb is None:
        # NOTE(review): pickle.load executes code from the file; these files
        # are expected to be the ones written above - do not point the
        # paths at untrusted data.
        with open(get_abs_path(cfg, "binarizer_path"), 'rb') as handle:
            mlb = pickle.load(handle)
        with open(get_abs_path(cfg, "vectorizer_path"), 'rb') as handle:
            wev = pickle.load(handle)

    test_docs = cfg[self.keyTest]
    self.model.testArrays = wev.transform([x.lines for x in test_docs])
    # NOTE(review): fit_transform re-fits the binarizer on the test labels;
    # presumably harmless when the classes are fixed - confirm.
    self.model.testLabels = mlb.fit_transform([x.nlabs for x in test_docs])
    de = datetime.datetime.now()
    print("Prepare all data in %s" % (get_formatted_date(ds, de)))
def launch_crossvalidation(self):
    """Run k-fold cross-validation over ``self.cvDocs``.

    The documents are cut into ``self.cross_validations_total`` consecutive
    folds of equal size (the last fold also takes the remainder).  For every
    fold a fresh model is created, trained, tested and measured.  The best
    F1 ("all" -> "f1") is tracked and, when saving is enabled, the data sets
    of the best fold so far are written with ``self.save_data_sets()``.
    Averaged metrics are printed at the end.
    """
    print("Start cross-validation...")
    ds = datetime.datetime.now()
    dp = DataPreparation(self, self.addValSet)
    total = self.cross_validations_total
    pSize = len(self.cvDocs) // total
    # The option is a "True"/"False" string elsewhere in this project, so a
    # plain truthiness test would treat "False" as enabled.
    save_datasets = str(self.Config["save_cross_validations_datasets"]) == "True"
    datasets_saved = False
    ind = 0
    f1 = 0
    attr_metrics = []
    for i in range(total):
        print("Cross-validation, cycle %d from %d..." % ((i + 1), total))
        # The test fold is [ind:end]; the last fold runs to the end of the list.
        end = len(self.cvDocs) if i == total - 1 else ind + pSize
        self.Config["cross_validations_train_docs"] = (
            self.cvDocs[:ind] + self.cvDocs[end:])
        self.Config["cross_validations_test_docs"] = self.cvDocs[ind:end]
        ind += pSize

        dp.getVectors(self.handleType)
        self.model = self.create_model()
        self.train_model()
        self.test_model()
        ModelMetrics(self)
        attr_metrics.append(self.metrics)
        cycleF1 = self.metrics["all"]["f1"]
        print("Resulting F1-Measure: %f\n" % cycleF1)
        if cycleF1 > f1:
            if save_datasets:
                self.save_data_sets()
                datasets_saved = True
            f1 = cycleF1

    de = datetime.datetime.now()
    print("Cross-validation is done in %s" % get_formatted_date(ds, de))
    print_averaged_metrics(attr_metrics, self.Config)
    print("The best result is %f" % (f1))
    if datasets_saved:
        print("Corresponding data sets are saved in the folder %s" %
              get_abs_path(self.Config, "cross_validations_datasets_path"))
def compose_tsv(model, type):
    """Write one data split as a TSV file for BERT.

    For ``type == "train"`` the training documents go to "train.tsv"; for
    any other value the testing documents go to "dev.tsv".  Both files live
    under "resulting_bert_files_path".  Each row holds the comma-joined
    labels, a tab and the text with carriage returns removed and newlines
    replaced by "."; rows are newline-separated with no trailing newline.
    """
    # Select the split that matches the file: train.tsv used to be filled
    # with the testing documents.
    if type == "train":
        file_name, data = "/train.tsv", model.Config[model.keyTrain]
    else:
        file_name, data = "/dev.tsv", model.Config[model.keyTest]
    tsv_path = get_abs_path(model.Config, "resulting_bert_files_path",
                            opt=file_name)
    rows = (
        ",".join(doc.nlabs) + "\t"
        + doc.lines.replace('\r', '').replace('\n', '.')
        for doc in data
    )
    with open(tsv_path, "w", encoding="utf-8") as target:
        target.write("\n".join(rows))
def __init__(self, Config):
    """Collect the JSON reports created since the requested day.

    ``Config["info_from"]`` is either "today" or a string whose first word
    is the number of days to go back.  Files in "reports_path" whose name
    (without extension) is not less than the start id "YYYYMMDD000000" are
    parsed as JSON into ``self.info``.  If any report was found, the
    documents dictionary and the HTML are built.

    Note: this changes the process working directory to "reports_path".
    """
    print("Start to create info...")
    self.Config = Config
    self.curDir = os.path.dirname(__file__)
    self.info = {}

    # Read the clock once so year, month and day always belong to one date.
    start_day = date.today()
    if self.Config["info_from"] != "today":
        prev_days = int(self.Config["info_from"].split()[0])
        start_day = start_day - timedelta(days=prev_days)
    self.startId = "%d%0.2d%0.2d000000" % (start_day.year, start_day.month,
                                           start_day.day)

    self.path = get_abs_path(Config, "reports_path")
    os.chdir(self.path)
    for f in glob.glob("*"):
        resPath = self.path + "/" + f
        dot = f.rfind(".")
        key = f if dot < 0 else f[:dot]
        # String comparison against the start id, as ids are fixed-width.
        if key < self.startId:
            continue
        with open(resPath, 'r', encoding='utf-8') as json_file:
            try:
                self.info[key] = json.load(json_file)
            except json.JSONDecodeError:
                print("Warning: file %s doesn't have json format. Skipped." % resPath)

    if not self.info:
        print("Folder %s doesn't contain reports, created in required diapason of dates. Exit." % self.path)
        return
    self.html = ""
    self.qReqs = 0
    self.footer = "</table></body></html>"
    self.docsDict = self.getDocsDictionary()
    self.createHtml()
def prepare_resources_for_runtime(self):
    """Copy every artifact needed at runtime into "saved_resources_path".

    Collects the tokenization options, copies the models and, if any model
    needs it, the W2V vectors (converted to a pickled ``{word: vector}``
    dict), the indexer, the vectorizer and the BERT files with
    ``self.copyFile``, replacing each path in ``self.Config["resources"]``
    by ``copyFile``'s result.  Finally writes "labels.txt" and "config.json"
    into the output folder.
    """
    resources = self.Config["resources"]
    lib_key = "single_doc_lang_tokenization_lib_path"
    tokenization_options = [
        "language_tokenization", "normalization", "stop_words",
        "exclude_positions", "extra_words", "max_seq_len",
        "max_chars_seq_len", lib_key
    ]
    resources["tokenization"] = {}
    ds = datetime.datetime.now()
    self.outDir = get_abs_path(self.Config, "saved_resources_path") + "/"
    for option in tokenization_options:
        if option != lib_key:
            resources["tokenization"][option] = self.Config[option]
        elif self.Config["language_tokenization"] == "True":
            # The tagger library is copied only when it will be used.
            resources["tokenization"][lib_key] = self.copyFile(
                get_abs_path(self.Config, lib_key))

    isW2VNeeded = False
    for val in resources["models"].values():
        val["created_model_path"] = self.copyFile(val["created_model_path"])
        if val.get("w2v") == "True":
            isW2VNeeded = True
    if not isW2VNeeded:
        resources.pop("w2v", None)

    if "w2v" in resources:
        w2v_path = resources["w2v"]["created_model_path"]
        w2vDict = {}
        # `with` closes the file even if a malformed line raises.
        with open(w2v_path, encoding="utf-8") as fEmbeddings:
            # Skip the first line; presumably the word2vec text header.
            next(fEmbeddings, None)
            for line in fEmbeddings:
                split = line.strip().split(" ")
                w2vDict[split[0]] = numpy.array(
                    [float(num) for num in split[1:]])
        with open(w2v_path + '.pkl', 'wb') as file:
            pickle.dump(w2vDict, file, pickle.HIGHEST_PROTOCOL)
        resources["w2v"]["created_model_path"] = self.copyFile(
            w2v_path + '.pkl')

    for name in ("indexer", "vectorizer"):
        if name in resources:
            resources[name] = self.copyFile(resources[name])
    if "ptBertModel" in resources:
        resources["ptBertModel"] = self.copyFile(resources["ptBertModel"])
        resources["vocabPath"] = self.copyFile(resources["vocabPath"])

    cNames = [''] * len(self.Config["predefined_categories"])
    for name, index in self.Config["predefined_categories"].items():
        cNames[index] = name
    with open(self.outDir + 'labels.txt', 'w', encoding="utf-8") as file:
        file.write(",".join(cNames))
    resources["labels"] = "labels.txt"
    resources["consolidatedRank"] = self.rank_threshold
    with open(self.outDir + 'config.json', 'w', encoding="utf-8") as file:
        json.dump(resources, file, indent=4)
    de = datetime.datetime.now()
    print("\nArtifacts are copied into the folder %s in %s" %
          (get_abs_path(self.Config, "saved_resources_path"),
           get_formatted_date(ds, de)))
def loadSKLModel(self):
    """Return the object joblib loads from the created-model path."""
    model_file = get_abs_path(self.Config, "created_model_path", opt="name")
    return joblib.load(model_file)
def loadNNModel(self):
    """Return the model that ``load_model`` reads from the created-model path."""
    model_file = get_abs_path(self.Config, "created_model_path", opt="name")
    return load_model(model_file)
def save_additions(self):
    """Flag the vectorizer as a runtime resource of this model.

    The vectorizer path is added to the shared ``self.Config["resources"]``
    only if it is not there yet; ``self.resources["vectorizer"]`` is always
    set to "True".
    """
    shared = self.Config["resources"]
    if "vectorizer" not in shared:
        shared["vectorizer"] = get_abs_path(self.Config, "vectorizer_path")
    self.resources["vectorizer"] = "True"
def run(self):
    """Validate the model settings and instantiate the requested model.

    Checks the data and model paths, the training parameters (unless the
    execution type is "test"), the custom rank threshold and the
    cross-validation settings, then constructs the model class selected by
    ``self.Config["type"]`` (matched case-insensitively).

    Raises:
        ValueError: for any invalid path or non-numeric parameter.
    """
    cfg = self.Config
    execution = cfg["type_of_execution"]
    try:
        self.test_data_size = float(cfg["test_data_size"])
    except ValueError:
        self.test_data_size = -1

    if not correct_path(cfg, "train_data_path"):
        if execution != "test" or not cfg["test_data_path"]:
            raise ValueError(
                "Wrong path to the training set: folder %s doesn't exist." %
                get_abs_path(cfg, "train_data_path"))
    if not correct_path(cfg, "test_data_path"):
        # A missing testing path is fine only when a valid split size is given.
        split_requested = (len(cfg["test_data_path"]) == 0
                           and 0 < self.test_data_size < 1)
        if not split_requested:
            # %s, not %d: the value is a path string, and %d would turn this
            # error into a TypeError.
            raise ValueError(
                "Wrong path to the testing set: folder %s doesn't exist." %
                get_abs_path(cfg, "test_data_path"))
    test_path(cfg, "created_model_path", "Wrong path to the models' folder.")
    if not cfg["name"]:
        cfg["name"] = cfg["type"] + str(cfg["modelid"])
    mPath = get_abs_path(cfg, "created_model_path", opt="name")
    if execution == "test" and not os.path.isfile(mPath):
        raise ValueError("Wrong path to the tested model.")

    if execution != "test":
        try:
            self.epochs = int(cfg["epochs"])
        except ValueError:
            raise ValueError("Wrong quantity of epochs for training.")
        try:
            self.train_batch = int(cfg["train_batch"])
        except ValueError:
            raise ValueError("Wrong batch size for training.")
        try:
            self.verbose = int(cfg["verbose"])
        except ValueError:
            raise ValueError("Wrong value of 'verbose' flag for training.")
        if cfg["save_intermediate_results"] == "True":
            if not cfg["intermediate_results_path"] or \
                    not os.path.isdir(get_abs_path(cfg, "intermediate_results_path")):
                raise ValueError(
                    "Wrong path to folder with intermediate results.")

    if execution != "train" and cfg["customrank"] == "True":
        try:
            self.rank_threshold = float(cfg["rank_threshold"])
        except ValueError:
            raise ValueError("Wrong custom rank threshold.")

    if execution == "crossvalidation":
        if cfg["save_cross_validations_datasets"] == "True":
            test_path(
                cfg, "cross_validations_datasets_path",
                "Wrong path to the cross-validation's resulting folder.")
        try:
            # Parsed only to validate the setting; the value is not used here.
            int(cfg["cross_validations_total"])
        except ValueError:
            raise ValueError("Wrong k-fold value.")

    # An if/elif chain (not a lookup table) so that only the selected model
    # class has to be importable.
    model_type = cfg["type"].lower()
    if model_type == "snn":
        SnnModel(cfg)
    elif model_type == "ltsm":
        LTSMModel(cfg)
    elif model_type == "cnn":
        CNNModel(cfg)
    elif model_type == "pac":
        PacModel(cfg)
    elif model_type == "ridge":
        RidgeModel(cfg)
    elif model_type == "svc":
        SVCModel(cfg)
    elif model_type == "perceptron":
        PerceptronModel(cfg)
    elif model_type == "sgd":
        SGDModel(cfg)
    elif model_type == "bert":
        BertModel(cfg)
def getWordVectorsMatrix(self):
    """Turn documents into padded index sequences and build the embedding matrix.

    Unless the execution type is "test", a ``Tokenizer`` limited to
    ``self.maxWords`` words is fitted on the training texts (and pickled to
    "indexer_path" outside cross-validation); in "test" mode it is loaded
    from that file.  Train/validation/test arrays and labels are stored on
    ``self.model``, together with ``embMatrix`` (one W2V vector per word
    index below ``self.maxWords``; zeros for words missing from the W2V
    model) and ``maxWords``.

    Returns:
        ``(embedding_matrix, self.maxWords)``, or None in cross-validation.
    """
    cfg = self.model.Config
    categories_total = len(cfg["predefined_categories"])

    def stack_labels(docs):
        # One row of label flags per document.
        return numpy.concatenate([
            numpy.array(x.labels).reshape(1, categories_total) for x in docs
        ])

    tokenizer = None
    ds = datetime.datetime.now()
    if cfg["type_of_execution"] != "test":
        tokenizer = Tokenizer(num_words=self.maxWords)
        trainTexts = [t.lines for t in cfg[self.keyTrain]]
        tokenizer.fit_on_texts(trainTexts)
        if not self.model.isCV:
            with open(get_abs_path(cfg, "indexer_path"), 'wb') as handle:
                pickle.dump(tokenizer, handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
            if cfg["max_doc_len"] > cfg["max_seq_len"]:
                print(
                    "Most of documents from training set have less then %d tokens. Longer documents will be truncated."
                    % (cfg["max_seq_len"]))
        self.model.trainArrays = pad_sequences(
            tokenizer.texts_to_sequences(trainTexts),
            maxlen=cfg["max_seq_len"])
        self.model.trainLabels = stack_labels(cfg[self.keyTrain])
        if self.addValSet:
            # The tail of the training data becomes the validation set.
            ind = int(
                len(self.model.trainArrays) * (1 - self.validation_data_size))
            self.model.valArrays = self.model.trainArrays[ind:]
            self.model.valLabels = self.model.trainLabels[ind:]
            self.model.trainArrays = self.model.trainArrays[:ind]
            self.model.trainLabels = self.model.trainLabels[:ind]

    if tokenizer is None:
        # NOTE(review): pickle.load executes code from the file; it is
        # expected to be the indexer written by a previous training run.
        with open(get_abs_path(cfg, "indexer_path"), 'rb') as handle:
            tokenizer = pickle.load(handle)

    testTexts = [t.lines for t in cfg[self.keyTest]]
    self.model.testArrays = pad_sequences(
        tokenizer.texts_to_sequences(testTexts), maxlen=cfg["max_seq_len"])
    self.model.testLabels = stack_labels(cfg[self.keyTest])

    embedding_matrix = numpy.zeros((self.maxWords, self.ndim))
    nf = 0
    for word, i in tokenizer.word_index.items():
        if i >= self.maxWords:
            continue
        try:
            embedding_vector = self.model.w2vModel[word]
        except KeyError:
            # Word unknown to the W2V model: its row stays all zeros.
            nf += 1
            continue
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    self.model.embMatrix = embedding_matrix
    self.model.maxWords = self.maxWords
    if self.model.isCV:
        return

    de = datetime.datetime.now()
    print('Found %s unique tokens.' % len(tokenizer.word_index))
    print('Tokens not found in W2V vocabulary: %d' % nf)
    print("All data prepared and embedding matrix built in %s" %
          (get_formatted_date(ds, de)))
    return embedding_matrix, self.maxWords