def loadData(self):
    """Load categories and documents and store them in ``self.Config``.

    Fills ``Config["cats"]``, ``Config["traindocs"]`` and
    ``Config["testdocs"]``. When ``self.splitTrain`` is set, the testing set
    is carved out of the shuffled training documents using the share
    ``self.sz``; otherwise it is read from the configured test path.
    """
    cfg = self.Config
    if cfg["datatoks"] != "yes":
        print("Start loading data...")
    else:
        print("Start loading and preprocessing of data...")
    started = datetime.datetime.now()
    cfg["cats"] = self.getCategories(fullPath(cfg, "trainpath"))
    trainSet = self.getDataDocs(fullPath(cfg, "trainpath"))
    if self.splitTrain:
        # The last `sz` share of the shuffled training documents becomes
        # the testing set.
        cut = int(len(trainSet) * (1 - self.sz))
        random.shuffle(trainSet)
        testSet = trainSet[cut:]
        trainSet = trainSet[:cut]
    else:
        testSet = self.getDataDocs(fullPath(cfg, "testpath"))
    finished = datetime.datetime.now()
    # random.sample over the full length yields shuffled copies.
    cfg["traindocs"] = random.sample(trainSet, len(trainSet))
    cfg["testdocs"] = random.sample(testSet, len(testSet))
    self.getMaxSeqLen()
    self.getMaxCharsLength()
    if cfg["datatoks"] == "yes" and cfg["actualtoks"] == "yes":
        # Send the stop marker to the tagger process started elsewhere.
        tagger_in = self.jar.stdin
        tagger_in.write('!!! STOP !!!\n')
        tagger_in.flush()
    print("Input data loaded in %s" % (showTime(started, finished)))
    print("Training set contains %d documents." % (len(cfg["traindocs"])))
    print("Testing set contains %d documents." % (len(cfg["testdocs"])))
    print("Documents belong to %d categories." % (len(cfg["cats"])))
def getDataForSklearnClassifiers(self):
    """Build TF-IDF arrays and binarized labels for scikit-learn models.

    Unless the run is test-only, a ``TfidfVectorizer`` and a
    ``MultiLabelBinarizer`` are fitted on the training documents and, when
    not cross-validating, pickled to the configured paths. For a test-only
    run both objects are loaded from those paths instead. The results are
    stored on ``self.model`` as ``trainArrays``/``trainLabels`` and
    ``testArrays``/``testLabels``.
    """
    cfg = self.model.Config
    mlb = None
    ds = datetime.datetime.now()
    if cfg["runfor"] != "test":
        # Order the category names by their numeric index so that the
        # binarizer's columns follow the indices stored in Config["cats"].
        nmCats = [""] * len(cfg["cats"])
        for name, index in cfg["cats"].items():
            nmCats[index] = name
        mlb = MultiLabelBinarizer(classes=nmCats)
        trainDocs = cfg[self.keyTrain]
        trainTexts = [x.lines for x in trainDocs]
        trainLabs = [x.nlabs for x in trainDocs]
        wev = TfidfVectorizer(ngram_range=(1, 3), max_df=0.50).fit(
            trainTexts, trainLabs)
        self.model.trainArrays = wev.transform(trainTexts)
        self.model.trainLabels = mlb.fit_transform(trainLabs)
        if not self.model.isCV:
            with open(fullPath(cfg, "binarizerpath"), 'wb') as handle:
                pickle.dump(mlb, handle, protocol=pickle.HIGHEST_PROTOCOL)
            with open(fullPath(cfg, "vectorizerpath"), 'wb') as handle:
                pickle.dump(wev, handle, protocol=pickle.HIGHEST_PROTOCOL)
    if mlb is None:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # these paths must only point to files produced by this pipeline.
        with open(fullPath(cfg, "binarizerpath"), 'rb') as handle:
            mlb = pickle.load(handle)
        with open(fullPath(cfg, "vectorizerpath"), 'rb') as handle:
            wev = pickle.load(handle)
    testDocs = cfg[self.keyTest]
    self.model.testArrays = wev.transform([x.lines for x in testDocs])
    # The binarizer is already fitted (on the training labels or loaded from
    # disk); the test labels are only encoded with it, not used to refit it.
    self.model.testLabels = mlb.transform([x.nlabs for x in testDocs])
    de = datetime.datetime.now()
    print("Prepare all data in %s" % (showTime(ds, de)))
def __init__(self, Config, DefConfig, kwargs):
    """Validate the word-embedding settings and, if requested, build a model.

    Config is updated from DefConfig and kwargs through ``updateParams``.
    On any validation failure a message is printed, ``Config["error"]`` is
    set to True and the constructor returns early. The W2V model is created
    only when ``Config["w2vcreate"]`` equals "yes".
    """
    print ("=== Word Embedding ===")
    updateParams(Config, DefConfig, kwargs)
    self.Config = Config
    self.DefConfig = DefConfig
    if not os.path.isdir(os.path.dirname(fullPath(Config, "w2vmodelpath"))):
        print ("Wrong path to W2V model. Word Embedding can't be done.")
        Config["error"] = True
        return
    if Config["w2vcreate"] != "yes":
        return
    if len(Config["w2vcorpuspath"]) == 0 or not os.path.isfile(fullPath(Config, "w2vcorpuspath")):
        print ("Wrong corpus path. W2V model can't be created.")
        Config["error"] = True
        return
    try:
        self.epochs = int(self.Config["w2vepochs"])
    except ValueError:
        print ("Wrong quantity of epochs for training. W2V model can't be created.")
        Config["error"] = True
        return
    try:
        self.ndim = int(self.Config["w2vdim"])
    except ValueError:
        print ("Wrong size of resulting vectors. W2V model can't be created.")
        # Config is used as a dict everywhere else in this method; the
        # previous attribute assignment would raise on a plain dict instead
        # of flagging the error.
        Config["error"] = True
        return
    self.createW2VModel()
def getWordVectorsMatrix(self):
    """Convert documents to padded index sequences and build the embedding matrix.

    Unless the run is test-only, a Keras ``Tokenizer`` is fitted on the
    training texts (and pickled when not cross-validating); otherwise it is
    loaded from the configured indexer path. Training, optional validation
    and testing arrays/labels are stored on ``self.model`` together with
    ``embMatrix`` and ``maxWords``.

    Returns:
        ``(embedding_matrix, self.maxWords)``; in cross-validation mode the
        method returns None after storing the results on ``self.model``.
    """
    cfg = self.model.Config
    tokenizer = None
    ds = datetime.datetime.now()
    if cfg["runfor"] != "test":
        tokenizer = Tokenizer(num_words=self.maxWords)
        trainTexts = [doc.lines for doc in cfg[self.keyTrain]]
        tokenizer.fit_on_texts(trainTexts)
        if not self.model.isCV:
            with open(fullPath(cfg, "indexerpath"), 'wb') as handle:
                pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        if cfg["maxdoclen"] > cfg["maxseqlen"]:
            print("Most of documents from training set have less then %d tokens. Longer documents will be truncated."%(
                cfg["maxseqlen"]))
        self.model.trainArrays = pad_sequences(tokenizer.texts_to_sequences(trainTexts),
                                               maxlen=cfg["maxseqlen"])
        self.model.trainLabels = numpy.concatenate([
            numpy.array(x.labels).reshape(1, len(cfg["cats"]))
            for x in cfg[self.keyTrain]])
        if self.addValSet:
            # The tail of the training data becomes the validation set.
            ind = int(len(self.model.trainArrays) * (1 - self.valSize))
            self.model.valArrays = self.model.trainArrays[ind:]
            self.model.valLabels = self.model.trainLabels[ind:]
            self.model.trainArrays = self.model.trainArrays[:ind]
            self.model.trainLabels = self.model.trainLabels[:ind]
    if tokenizer is None:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # the indexer path must only point to a file produced by this code.
        with open(fullPath(cfg, "indexerpath"), 'rb') as handle:
            tokenizer = pickle.load(handle)
    testTexts = [doc.lines for doc in cfg[self.keyTest]]
    self.model.testArrays = pad_sequences(tokenizer.texts_to_sequences(testTexts),
                                          maxlen=cfg["maxseqlen"])
    self.model.testLabels = numpy.concatenate([
        numpy.array(x.labels).reshape(1, len(cfg["cats"]))
        for x in cfg[self.keyTest]])
    embedding_matrix = numpy.zeros((self.maxWords, self.ndim))
    nf = 0  # tokens without a vector in the W2V model
    for word, i in tokenizer.word_index.items():
        if i < self.maxWords:
            try:
                embedding_vector = self.model.w2vModel[word]
            except KeyError:
                nf += 1
                continue
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    self.model.embMatrix = embedding_matrix
    self.model.maxWords = self.maxWords
    if self.model.isCV:
        return
    de = datetime.datetime.now()
    print('Found %s unique tokens.' % len(tokenizer.word_index))
    print ('Tokens not found in W2V vocabulary: %d'%nf)
    print("All data prepared and embedding matrix built in %s"%(showTime(ds, de)))
    return embedding_matrix, self.maxWords
def loadW2VModel(self):
    """Load the word2vec-format model into ``Config["w2vmodel"]`` and report timing."""
    print("Load W2V model...")
    started = datetime.datetime.now()
    vectors = gensim.models.KeyedVectors.load_word2vec_format(
        fullPath(self.Config, "w2vmodelpath"))
    self.Config["w2vmodel"] = vectors
    finished = datetime.datetime.now()
    elapsed = showTime(started, finished)
    print("Load W2V model (%s) in %s" %
          (fullPath(self.Config, "w2vmodelpath"), elapsed))
def loadW2VModel(self):
    """Make the W2V model available as ``self.w2vModel``.

    Reuses the model cached in ``Config["w2vmodel"]`` when there is one.
    Otherwise the model is loaded from the configured path and its path and
    dimension are recorded in ``Config["resources"]["w2v"]``.
    """
    # Identity test: `!= None` would invoke the model object's own
    # comparison operator, which is not what is meant here.
    if self.Config["w2vmodel"] is not None:
        print("W2V model is already loaded...")
        self.w2vModel = self.Config["w2vmodel"]
        return
    print("Load W2V model... ")
    ds = datetime.datetime.now()
    modelPath = fullPath(self.Config, "w2vmodelpath")
    self.w2vModel = gensim.models.KeyedVectors.load_word2vec_format(modelPath)
    de = datetime.datetime.now()
    print("Load W2V model (%s) in %s" % (modelPath, showTime(ds, de)))
    self.Config["resources"]["w2v"]["modelPath"] = modelPath
    self.Config["resources"]["w2v"]["ndim"] = self.ndim
def saveDataSets(self):
    """Write the current cross-validation split to disk.

    The folder at the configured ``cvpath`` is recreated from scratch and
    filled with ``train/<label>/<doc name>`` and ``test/<label>/<doc name>``
    files. A document with several labels is written once per label.
    """

    def writeDocs(docs, targetPath):
        # Store every document under one sub-folder per label it carries.
        os.mkdir(targetPath)
        created = set()
        for doc in docs:
            for label in doc.nlabs:
                foldPath = targetPath + "/" + label
                if label not in created:
                    os.mkdir(foldPath)
                    created.add(label)
                with open(foldPath + '/' + doc.name, 'w',
                          encoding="utf-8") as file:
                    file.write(doc.lines)

    root = fullPath(self.Config, "cvpath")
    # Start from an empty folder; a missing folder is simply created.
    if os.path.isdir(root):
        shutil.rmtree(root)
    os.makedirs(root, exist_ok=True)
    writeDocs(self.Config["cvtraindocs"], root + "/train")
    writeDocs(self.Config["cvtestdocs"], root + "/test")
def startServer(self):
    """Launch the Stanford CoreNLP server in a background thread.

    The working directory is switched to the server sources folder while the
    server runs and restored to ``self.curdir`` once the server process
    exits. The method sleeps for 10 seconds before reporting the server as
    running.
    """
    serverDir = fullPath(self.Config, "servsource") + "/"
    os.chdir(serverDir)
    os.environ["CLASSPATH"] = "*"

    def serverDown():
        # Executed in the worker thread after the java process terminates.
        os.chdir(self.curdir)
        print("Server is down")

    def serve():
        command = (
            'java -Xmx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -serverProperties '
            + serverDir
            + 'StanfordCoreNLP-arabic.properties -preload tokenize,ssplit,pos '
            + '-status_port ' + self.Config["servport"]
            + ' -port ' + self.Config["servport"]
            + ' -timeout 20000')
        process = Popen(command, shell=True)
        process.wait()
        serverDown()

    worker = threading.Thread(target=serve)
    worker.start()
    time.sleep(10)
    print("Server is running")
def tokenize(self, Config):
    """Run the external tagger jar over the source folder.

    Prints the tagger's standard output and the elapsed time. When the
    tagger jar cannot be found, a message is printed and
    ``Config["error"]`` is set to True.
    """
    taggerPath = fullPath(Config, "taggerpath")
    if (len(taggerPath) == 0 or not os.path.exists(taggerPath)):
        print ("Wrong path to the tagger's jar. Tokenization can't be done")
        Config["error"] = True
        return
    inPath = Config["home"] + "/" + Config["sourcepath"]
    outPath = Config["home"] + "/" + Config["targetpath"]
    stopWords = ""
    if Config["stopwords"] == "yes":
        stopWords = ",".join(stopwords.words('arabic'))
    ds = datetime.datetime.now()
    srv = subprocess.Popen('java -Xmx2g -jar ' + taggerPath + ' "' + inPath + '" "' + outPath + '" "' +
                           Config["expos"] + '" "' + stopWords + '" "' +
                           Config["extrawords"] + '" "' + Config["normalization"] + '" "' +
                           Config["actualtoks"] + '"',
                           stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    # communicate() drains both pipes while waiting for the process to end.
    # Calling wait() first can deadlock once the child fills a pipe buffer.
    reply = srv.communicate()
    de = datetime.datetime.now()
    print(reply[0].decode())
    print("All process is done in %s" % (showTime(ds, de)))
def saveAdditions(self):
    """Mark this model's resources as using the W2V model and the indexer.

    The indexer path is registered in the shared ``Config["resources"]``
    only if no indexer entry exists there yet.
    """
    own = self.resources
    own["w2v"] = "yes"
    shared = self.Config["resources"]
    if "indexer" not in shared:
        shared["indexer"] = fullPath(self.Config, "indexerpath")
    own.update(indexer="yes", handleType="wordVectorsMatrix")
def __init__(self, Config):
    """Validate W2V-related settings, load the W2V model and start the process.

    On a validation failure a message is printed, ``Config["error"]`` is set
    to True and the constructor returns early. Data are prepared here unless
    the run is a cross-validation.
    """
    super().__init__(Config)
    # The model file is only required when no model is cached in Config.
    if self.Config["w2vmodel"] is None:
        if len(Config["w2vmodelpath"]) == 0 or not os.path.isfile(
                fullPath(Config, "w2vmodelpath")):
            print("Wrong path to W2V model. Stop.")
            Config["error"] = True
            return
    try:
        self.valSize = float(Config["valsize"])
    except ValueError:
        self.valSize = 0
    if self.valSize <= 0 or self.valSize >= 1:
        print("Wrong size of validation data set. Stop.")
        Config["error"] = True
        return
    try:
        self.ndim = int(self.Config["w2vdim"])
    except ValueError:
        print("Wrong size of vectors' dimentions. Stop.")
        Config["error"] = True
        return
    self.addValSet = True
    self.handleType = "wordVectorsSum"
    self.tempSave = Config["tempsave"] == "yes"
    self.useProbabilities = True
    self.w2vModel = None
    self.loadW2VModel()
    if Config["runfor"] != "crossvalidation":
        self.prepareData()
    self.launchProcess()
def trainSKLModel(self):
    """Fit the scikit-learn model; outside cross-validation also save and evaluate it."""
    fitStarted = datetime.datetime.now()
    print("Start training...")
    self.model.fit(self.trainArrays, self.trainLabels)
    fitDone = datetime.datetime.now()
    print("Model is trained in %s" % (showTime(fitStarted, fitDone)))
    if not self.isCV:
        joblib.dump(self.model, fullPath(self.Config, "modelpath", opt="name"))
        print("Model is saved in %s" %
              (fullPath(self.Config, "modelpath", opt="name")))
        print("Model evaluation...")
        predicted = self.model.predict(self.testArrays)
        accuracy = accuracy_score(self.testLabels, predicted)
        print('Final accuracy is %.2f' % (accuracy))
        # The reported evaluation time is measured from the end of training.
        evalDone = datetime.datetime.now()
        print("Evaluated in %s" % (showTime(fitDone, evalDone)))
def createW2VModel(self):
    """Train a Word2Vec model on the configured corpus and save it as text.

    Each non-blank line of the corpus file becomes one whitespace-tokenized
    sentence. The trained vectors are written in word2vec text format to the
    configured model path, optionally with a timestamp inserted into the
    file name.
    """
    sentences = []
    print ("Start to create W2V model...")
    print ("Get input data...")
    corpusPath = fullPath(self.Config, "w2vcorpuspath")
    ds = datetime.datetime.now()
    with open(corpusPath, 'r', encoding='UTF-8') as f:
        for line in f:
            words = line.split()
            if words:  # skip blank lines
                sentences.append(words)
    count = len(sentences)
    de = datetime.datetime.now()
    print("Got %d lines from file %s in %s"% (count, corpusPath, showTime(ds, de)))
    numpy.random.shuffle(sentences)

    logger = EpochLogger(self.epochs)
    w2v = Word2Vec(size=self.ndim, window=10, min_count=3, workers=10)
    ds = datetime.datetime.now()
    print("Build vocabulary...")
    w2v.build_vocab(sentences)
    de = datetime.datetime.now()
    print("Vocabulary is built in %s" % (showTime(ds, de)))
    print("Train model...")
    ds = datetime.datetime.now()
    w2v.train(sentences, epochs=int(self.Config["w2vepochs"]),
              total_examples=len(sentences), callbacks=[logger])
    de = datetime.datetime.now()
    print("W2V model is completed in %s" % (showTime(ds, de)))

    modelPath = fullPath(self.Config, "w2vmodelpath")
    # Default to the configured path so that the save below also works when
    # no timestamp is requested.
    finalPath = modelPath
    # NOTE(review): the flag is tested for truthiness; if it is a "yes"/"no"
    # string like other options, "no" is truthy too - confirm its type.
    if self.Config["w2vtimeinname"]:
        modelName = os.path.basename(modelPath)
        dt = "-" + datetime.datetime.now().strftime("%Y-%b-%d-%H%M%S")
        pInd = modelName.rfind(".")
        if pInd > 0:
            # Insert the timestamp in front of the extension.
            modelName = modelName[:pInd] + dt + modelName[pInd:]
        else:
            modelName += dt
        finalPath = os.path.dirname(modelPath) + "/" + modelName
    ds = datetime.datetime.now()
    w2v.wv.save_word2vec_format(finalPath, binary=False)
    de = datetime.datetime.now()
    print("W2V model %s is saved in the text format in %s\n" % (finalPath, showTime(ds, de)))
def trainNNModel(self):
    """Train the neural model; outside cross-validation evaluate and save it.

    When ``self.tempSave`` is set (and not cross-validating) the best
    intermediate model is checkpointed during training. After training, the
    final model and the checkpoint are both evaluated on the test data and
    the one with the higher score is saved to the model path.
    """
    checkpoints = []
    if self.tempSave and not self.isCV:
        checkpoint = ModelCheckpoint(fullPath(self.Config, "temppath") + "/tempModel.hdf5",
                                     monitor='val_acc', verbose=self.verbose,
                                     save_best_only=True, mode='auto')
        checkpoints.append(checkpoint)
    print("Start training... ")
    ds = datetime.datetime.now()
    self.model.fit(self.trainArrays, self.trainLabels, epochs=self.epochs,
                   validation_data=(self.valArrays, self.valLabels),
                   batch_size=self.trainBatch, verbose=self.verbose,
                   callbacks=checkpoints, shuffle=False)
    de = datetime.datetime.now()
    print("Model is trained in %s" % (showTime(ds, de)))
    if self.isCV:
        return
    modelPath = fullPath(self.Config, "modelpath", opt="name")
    self.model.save(modelPath)
    print("Model evaluation...")
    scores1 = self.model.evaluate(self.testArrays, self.testLabels, verbose=self.verbose)
    print("Final model accuracy: %.2f%%" % (scores1[1] * 100))
    pref = "Model "
    if self.tempSave:
        model1 = load_model(fullPath(self.Config, "temppath") + "/tempModel.hdf5")
        scores2 = model1.evaluate(self.testArrays, self.testLabels, verbose=self.verbose)
        print("Last saved model accuracy: %.2f%%" % (scores2[1] * 100))
        if scores1[1] < scores2[1]:
            # Keep the better checkpoint as the model. Binding it to a local
            # name would leave the weaker final model to be saved below.
            self.model = model1
        pref = "The best model "
    self.model.save(modelPath)
    print(pref + "is saved in %s" % (modelPath))
def saveReports(self):
    """Collect the request's results into a ``Report`` and dump it as JSON.

    For every test document the report holds the actual labels and, per
    model, the predicted labels split by that model's rank threshold
    (``"above | below"``). When more than one model produced results, the
    consolidated predictions, metrics and threshold are added as well. The
    file is written to ``<reportspath>/<reqid>.json``.
    """
    print("Save report...")
    cfg = self.Config
    testDocs = cfg["testdocs"]
    report = Report()
    report.requestId = cfg["reqid"]
    report.sourcesPath = cfg["actualpath"]
    report.datasetPath = cfg["testpath"]
    for option in ("actualtoks", "normalization", "stopwords", "expos",
                   "extrawords", "excats"):
        report.preprocess[option] = cfg[option]
    for doc in testDocs:
        report.docs[doc.name] = {}
        report.docs[doc.name]["actual"] = ",".join(doc.nlabs)

    # Category names ordered by index, skipping excluded categories.
    exCats = cfg["excats"].split(",") if len(cfg["excats"]) != 0 else []
    cNames = [''] * (len(cfg["cats"]) - len(exCats))
    for catName, catIndex in cfg["cats"].items():
        if catName not in exCats:
            cNames[catIndex] = catName
    report.categories = cNames

    for modelKey, scores in cfg["results"].items():
        for i in range(len(scores)):
            above = []
            below = []
            for j in range(self.qLabs):
                if scores[i][j] >= cfg["ranks"][modelKey]:
                    target = above
                else:
                    target = below
                target.append("%s[%.2f]" % (cNames[j], scores[i][j]))
            report.docs[testDocs[i].name][modelKey] = \
                ",".join(above) + " | " + ",".join(below)
    for modelKey, metrics in cfg["metrics"].items():
        report.models[modelKey] = metrics
    for modelKey, rank in cfg["ranks"].items():
        report.ranks[modelKey] = rank

    if len(cfg["results"]) > 1:
        for i in range(len(self.predictions)):
            chosen = [cNames[j] for j in range(self.qLabs)
                      if self.predictions[i][j] == 1]
            report.docs[testDocs[i].name]["consolidated"] = ",".join(chosen)
        report.models["consolidated"] = self.metrics
        report.ranks["consolidated"] = self.rankThreshold

    rPath = fullPath(cfg, "reportspath") + "/" + cfg["reqid"] + ".json"
    with open(rPath, 'w', encoding="utf-8") as out:
        json.dump(report.toJSON(), out, indent=4)
def __init__(self, Config):
    """Check the binarizer and vectorizer paths, then start the process.

    An existing file is always accepted. A missing file is an error for a
    test-only run, or when a path is given whose parent folder does not
    exist. On error ``Config["error"]`` is set and the constructor returns.
    """
    super().__init__(Config)
    checks = (
        ("binarizerpath", "Wrong path to binarizer. Stop."),
        ("vectorizerpath", "Wrong path to vectorizer. Stop."),
    )
    for option, message in checks:
        if len(Config[option]) != 0 and os.path.isfile(fullPath(Config, option)):
            continue
        if Config["runfor"] == "test" or (
                len(Config[option]) != 0 and not os.path.isdir(
                    os.path.dirname(fullPath(Config, option)))):
            print (message)
            Config["error"] = True
            return
    self.useProbabilities = False
    self.handleType = "vectorize"
    if Config["runfor"] != "crossvalidation":
        self.prepareData()
    self.launchProcess()
def __init__(self, Config):
    """Validate the BERT paths, set the training parameters and start the process.

    On a wrong path a message is printed, ``Config["error"]`` is set to True
    and the constructor returns early.
    """
    super().__init__(Config)
    self.Config = Config
    self.useProbabilities = True
    self.maxBertSeqLength = 512
    self.device = 'cpu'
    self.n_gpu = torch.cuda.device_count()
    self.model_to_save = None

    if not (len(Config["bertpath"]) != 0
            and os.path.isfile(fullPath(Config, "bertpath"))):
        print("Wrong path to archive with pre-trained BERT model. Stop.")
        Config["error"] = True
        return
    if not (len(Config["bertoutpath"]) != 0
            and os.path.isdir(fullPath(Config, "bertoutpath"))):
        print("Wrong path to folder with resulting BERT files. Stop.")
        Config["error"] = True
        return

    modelArchive = fullPath(self.Config, "bertpath")
    outFolder = fullPath(self.Config, "bertoutpath")
    self.args = Args(modelArchive, outFolder)
    self.max_seq_length = min(self.maxBertSeqLength, self.Config["maxseqlen"])
    goal = self.Config["runfor"]
    # The flags are only created for the goals that need them.
    if goal != "test":
        self.do_train = True
    if goal != "train":
        self.do_eval = True
    self.do_lower_case = False
    self.train_batch_size = min(self.trainBatch, 32)
    self.eval_batch_size = 8
    self.learning_rate = 5e-5
    self.num_train_epochs = self.epochs
    self.warmup_proportion = 0.1
    self.no_cuda = True
    self.local_rank = -1
    self.seed = 42
    self.gradient_accumulation_steps = 1
    self.keyTrain = "traindocs"
    self.keyTest = "testdocs"
    if goal != "crossvalidation":
        self.prepareData()
    self.launchProcess()
def composeTsv(model, type):
    """Write the train or test documents to a TSV file in the BERT output folder.

    Args:
        model: object exposing ``Config``, ``keyTrain`` and ``keyTest``.
        type: "train" writes ``train.tsv`` from the training documents; any
            other value writes ``dev.tsv`` from the testing documents.

    Each row is ``<comma-separated labels>\\t<text>``. Carriage returns are
    removed from the text and line feeds are replaced with dots. Rows are
    separated by a newline, with no trailing newline.
    """
    if type == "train":
        bertPath = fullPath(model.Config, "bertoutpath", opt="/train.tsv")
        data = model.Config[model.keyTrain]
    else:
        bertPath = fullPath(model.Config, "bertoutpath", opt="/dev.tsv")
        data = model.Config[model.keyTest]
    rows = [
        ",".join(doc.nlabs) + "\t" + doc.lines.replace('\r', '').replace('\n', '.')
        for doc in data
    ]
    # The context manager closes the file even if writing fails.
    with open(bertPath, "w", encoding="utf-8") as target:
        target.write("\n".join(rows))
def __init__(self, Config):
    """Consolidate the results of the models run in this process chain.

    Nothing is done when no documents were classified. Otherwise the
    consolidated rank threshold is read, consolidated results and metrics
    are computed (when reports or result display are requested), reports are
    saved when a valid reports folder is configured, and runtime resources
    are collected into the (emptied) resources folder when requested.
    """
    self.Config = Config
    if "testdocs" not in Config or len(Config["results"]) == 0:
        print("Documents have not been classified in this process chain.")
        print("Consolidation can't be performed.")
        return
    self.rankThreshold = 0.5
    if Config["consolidatedrank"] == "yes":
        try:
            self.rankThreshold = float(Config["consolidatedrankthreshold"])
        except ValueError:
            self.rankThreshold = 0.5
    self.testLabels = numpy.concatenate([
        numpy.array(x.labels).reshape(1, len(self.Config["cats"]))
        for x in self.Config["testdocs"]
    ])
    self.qLabs = len(self.Config["cats"])
    self.predictions = numpy.zeros([len(self.testLabels), self.qLabs])
    self.metrics = {}
    self.useProbabilities = False
    self.reports = False
    self.runtime = False
    # The results are known to be non-empty here (checked at the top), so no
    # second emptiness check is needed.
    print("\nCalculate consolidated metrics...")
    if Config["reports"] == "yes":
        if len(Config["reportspath"]) == 0 or not os.path.isdir(
                fullPath(Config, "reportspath")):
            print("Wrong path to the folder, containing reports.")
            print("Reports can't be created.")
        else:
            self.reports = True
    if Config["saveresources"] == "yes":
        if len(Config["resourcespath"]) == 0 or not os.path.isdir(
                fullPath(Config, "resourcespath")):
            print(
                "Wrong path to the folder, containing resources for runtime."
            )
            print("Resources can't be saved.")
        else:
            self.runtime = True
    print("Rank threshold for consolidated results: %.2f" %
          (self.rankThreshold))
    if self.reports or self.Config["showresults"] == "yes":
        self.getConsolidatedResults()
        self.getMetrics()
        if self.reports:
            self.saveReports()
    if self.runtime:
        resourcesPath = fullPath(self.Config, "resourcespath")
        if len(os.listdir(resourcesPath)) > 0:
            print(
                "Warning: folder %s is not empty. All its content will be deleted."
                % (resourcesPath))
            shutil.rmtree(resourcesPath)
        os.makedirs(resourcesPath, exist_ok=True)
        print("\nCollect arfifacts for runtime...")
        self.saveResources()
def saveResources(self, type):
    """Register this BERT model's runtime resources in ``Config["resources"]``.

    The pre-trained model and vocabulary paths are added to the shared
    resources only if no ``ptBertModel`` entry exists there yet.
    """
    modelKey = str(self.Config["modelid"])
    own = self.resources
    own["id"] = modelKey
    own["modelPath"] = fullPath(self.Config, "bertoutpath", opt="name")
    own["modelType"] = type
    shared = self.Config["resources"]
    if "ptBertModel" not in shared:
        shared["ptBertModel"] = self.args.bert_model
        shared["vocabPath"] = self.vocabPath
    own["ptBertModel"] = "yes"
    own["handleType"] = "bert"
    own["rankThreshold"] = self.rankThreshold
    self.Config["resources"]["models"]["Model" + str(self.Config["modelid"])] = own
def saveResources(self, type):
    """Register this model's runtime resources in ``Config["resources"]["models"]``.

    Models that do not use probabilities get a rank threshold of 1.0; models
    of type "skl" are always handled as "vectorize".
    """
    own = self.resources
    own["modelPath"] = fullPath(self.Config, "modelpath", opt="name")
    own["modelType"] = type
    own["rankThreshold"] = self.rankThreshold if self.useProbabilities else 1.0
    self.saveAdditions()
    if type == "skl":
        self.resources["handleType"] = "vectorize"
    registry = self.Config["resources"]["models"]
    registry["Model" + str(self.Config["modelid"])] = self.resources
def parseConfigInfo(path):
    """Read the config file at *path* into ``Config`` and start ``InfoCreator``.

    All options of all sections are copied into the module-level ``Config``.
    ``home`` defaults to the user's home folder and ``infofrom`` to "today".
    An ``infofrom`` value other than "today" must look like "<N> day(s)".
    On a wrong option value or a parsing error a message is printed and the
    function returns without creating the info.
    """
    parser = ConfigParser()
    # Close the config file as soon as it has been read.
    with open(path) as configFile:
        parser.read_file(configFile)
    try:
        for section in parser.sections():
            for name, value in parser.items(section):
                Config[name] = value
        if not Config["home"]:
            Config["home"] = str(Path.home())
        if not Config["infofrom"]:
            Config["infofrom"] = "today"
        if Config["infofrom"] != "today":
            chk = Config["infofrom"].split()
            # Both conditions must hold for a valid value. Testing them with
            # `or` also avoids indexing chk[1] on a one-word value.
            if len(chk) != 2 or not chk[1].startswith("day"):
                print("Wrong value of 'infofrom' option. Exit.")
                return
            try:
                int(chk[0])  # only checks that the count is numeric
            except ValueError:
                print("Wrong value of 'infofrom' option. Exit.")
                return
        if len(Config["reportspath"]) == 0 or not os.path.isdir(
                fullPath(Config, "reportspath")):
            print("Wrong path to the folder, containing reports. Exit.")
            return
        if len(Config["actualpath"]) == 0 or not os.path.isdir(
                fullPath(Config, "actualpath")):
            print(
                "Warning: wrong path to the folder, containing original documents."
            )
            print("It will not be possible to view this documents.")
    except Error:
        print("Config file's parsing error. Exit.")
        return
    InfoCreator(Config)
def __init__(self, Config):
    """Collect the JSON reports of the requested period and build the HTML info.

    Report files are taken from the reports folder (which becomes the
    current working directory). Files whose name, without extension, sorts
    before the start id ``YYYYMMDD000000`` are skipped, and files that are
    not valid JSON are reported and skipped.
    """
    print("Start to create info...")
    self.Config = Config
    self.curDir = os.path.dirname(__file__)
    self.info = {}
    firstDay = date.today()
    if self.Config["infofrom"] != "today":
        daysBack = int(self.Config["infofrom"].split()[0])
        firstDay = date.today() - timedelta(days=daysBack)
    self.startId = "%d%0.2d%0.2d000000" % (
        firstDay.year, firstDay.month, firstDay.day)
    self.path = fullPath(Config, "reportspath")
    os.chdir(self.path)
    for fileName in glob.glob("*"):
        resPath = self.path + "/" + fileName
        dot = fileName.rfind(".")
        key = fileName if dot < 0 else fileName[:dot]
        if key < self.startId:
            continue
        with open(resPath, 'r', encoding='utf-8') as json_file:
            try:
                self.info[key] = json.load(json_file)
            except json.JSONDecodeError:
                print(
                    "Warning: file %s doesn't have json format. Skipped."
                    % (resPath))
    if not len(self.info):
        print(
            "Folder %s doesn't contain reports, created in required diapason of dates. Exit."
            % (self.path))
        return
    self.html = ""
    self.qReqs = 0
    self.footer = "</table></body></html>"
    self.docsDict = self.getDocsDictionary()
    self.createHtml()
def launchCrossValidation(self):
    """Run k-fold cross-validation over all loaded documents.

    Training and testing documents are merged, shuffled and cut into
    ``self.kfold`` consecutive parts; the last part also takes the remainder.
    For every fold the data are prepared and a fresh model is created,
    trained and tested. Whenever a fold improves the best F1 and
    ``Config["cvsave"]`` is "yes", its data sets are saved. Averaged metrics
    are printed at the end.
    """
    print("Start cross-validation...")
    ds = datetime.datetime.now()
    self.cvDocs = self.Config["traindocs"] + self.Config["testdocs"]
    random.shuffle(self.cvDocs)
    self.keyTrain = "cvtraindocs"
    self.keyTest = "cvtestdocs"
    pSize = len(self.cvDocs) // self.kfold
    # cvsave is a "yes"/"no" string; plain truthiness would treat "no" as
    # a request to save.
    saveBest = self.Config["cvsave"] == "yes"
    saved = False
    ind = 0
    f1 = 0
    arrMetrics = []
    for i in range(self.kfold):
        print("Cross-validation, cycle %d from %d..." %
              ((i + 1), self.kfold))
        # The last fold runs to the end so that the remainder is not lost.
        end = len(self.cvDocs) if i == self.kfold - 1 else ind + pSize
        self.Config["cvtraindocs"] = self.cvDocs[:ind] + self.cvDocs[end:]
        self.Config["cvtestdocs"] = self.cvDocs[ind:end]
        ind += pSize
        self.prepareData()
        self.model = self.createModel()
        self.trainModel()
        self.testModel()
        arrMetrics.append(self.metrics)
        cycleF1 = self.metrics["all"]["f1"]
        print("Resulting F1-Measure: %f\n" % (cycleF1))
        if cycleF1 > f1:
            if saveBest:
                self.saveDataSets()
                saved = True
            f1 = cycleF1
    de = datetime.datetime.now()
    print("Cross-validation is done in %s" % (showTime(ds, de)))
    printAveragedMetrics(arrMetrics, self.Config)
    print("The best result is %f" % (f1))
    if saved:
        print("Corresponding data sets are saved in the folder %s" %
              (fullPath(self.Config, "cvpath")))
def __init__(self, Config, DefConfig, kwargs):
    """Validate a model request and create the model of the requested type.

    Config is updated from DefConfig and kwargs through ``updateParams``.
    All problems found in the request are printed; if any was found,
    ``Config["error"]`` is set to True and no model is created. Nothing is
    created either when the goal of the request is "none".
    """
    Config["modelid"] += 1
    print("=== Model " + str(Config["modelid"]) + " ===")
    updateParams(Config, DefConfig, kwargs)
    Config["type"] = Config["type"].lower()
    Config["runfor"] = Config["runfor"].lower()
    if Config["runfor"] != "none" and Config["type"] not in modelTypes:
        print(
            "Request contains definition of model with wrong type. Stop.")
        Config["error"] = True
        return
    if Config["runfor"] not in modelGoals:
        print("Request doesn't define the goal of the model process")
        print(
            "It should be one of 'trainAndTest', 'train', 'test', 'crossValidation' or 'none'. Stop."
        )
        Config["error"] = True
        return
    if Config["runfor"] != "none":
        print("Model type: " + Config["type"].upper() + ", " +
              userInfo[Config["runfor"]])
    else:
        print("Model : " + userInfo[Config["runfor"]])
    if Config["runfor"] == "none":
        return
    self.Config = Config
    self.DefConfig = DefConfig
    if "cats" not in Config or "traindocs" not in Config or "testdocs" not in Config:
        print("Input data isn't loaded. Stop.")
        Config["error"] = True
        return
    stop = False
    try:
        self.testSize = float(Config["testsize"])
    except ValueError:
        self.testSize = -1
    if len(Config["trainpath"]) == 0 or not os.path.isdir(
            fullPath(Config, "trainpath")):
        if Config["runfor"] != "test" or len(Config["testpath"]) == 0:
            print(
                "Wrong path to the training set: folder %s doesn't exist."
                % (fullPath(Config, "trainpath")))
            stop = True
    if len(Config["testpath"]) == 0 or not os.path.isdir(
            fullPath(Config, "testpath")):
        # An empty test path is fine when a valid test share is given.
        if not (len(Config["testpath"]) == 0 and self.testSize > 0
                and self.testSize < 1):
            # The path is a string, so it is formatted with %s; %d raised a
            # TypeError instead of printing the message.
            print(
                "Wrong path to the testing set: folder %s doesn't exist."
                % (fullPath(Config, "testpath")))
            stop = True
    if len(Config["modelpath"]) == 0 or not os.path.isdir(
            fullPath(Config, "modelpath")):
        print("Wrong path to the models' folder.")
        stop = True
    if len(Config["name"]) == 0:
        Config["name"] = Config["type"] + str(Config["modelid"])
    mPath = fullPath(Config, "modelpath", opt="name")
    if Config["runfor"] == "test" and not os.path.isfile(mPath):
        print("Wrong path to the tested model.")
        stop = True
    if Config["runfor"] != "test":
        try:
            self.epochs = int(Config["epochs"])
        except ValueError:
            print("Wrong quantity of epochs for training.")
            stop = True
        try:
            self.trainBatch = int(Config["trainbatch"])
        except ValueError:
            print("Wrong batch size for training.")
            stop = True
        try:
            self.verbose = int(Config["verbose"])
        except ValueError:
            print("Wrong value of 'verbose' flag for training.")
            stop = True
        if Config["tempsave"] == "yes":
            if len(Config["temppath"]) == 0 or not os.path.isdir(
                    fullPath(Config, "temppath")):
                print("Wrong path to folder with intermediate results.")
                stop = True
    if Config["runfor"] != "train" and Config["customrank"] == "yes":
        try:
            self.rankThreshold = float(Config["rankthreshold"])
        except ValueError:
            print("Wrong custom rank threshold.")
            stop = True
    if Config["runfor"] == "crossvalidation":
        if Config["cvsave"] == "yes":
            if len(Config["cvpath"]) == 0 or not os.path.isdir(
                    fullPath(Config, "cvpath")):
                print(
                    "Wrong path to the cross-validation's resulting folder."
                )
                stop = True
        try:
            int(Config["kfold"])  # validation only; the value is not kept
        except ValueError:
            print("Wrong k-fold value.")
            stop = True

    if stop:
        print("Stop.")
        Config["error"] = True
        return

    # Config["type"] has already been lower-cased above.
    modelType = Config["type"]
    if modelType == "snn":
        SnnModel(Config)
    elif modelType == "ltsm":
        LTSMModel(Config)
    elif modelType == "cnn":
        CNNModel(Config)
    elif modelType == "pac":
        PacModel(Config)
    elif modelType == "ridge":
        RidgeModel(Config)
    elif modelType == "svc":
        SVCModel(Config)
    elif modelType == "perceptron":
        PerceptronModel(Config)
    elif modelType == "sgd":
        SGDModel(Config)
    elif modelType == "bert":
        BertModel(Config)
def __init__(self, Config, DefConfig, kwargs):
    """Validate the data-loading settings and load the input data.

    Config is updated from DefConfig and kwargs through ``updateParams``.
    On a validation failure a message is printed, ``Config["error"]`` is set
    to True and the constructor returns early. Depending on the options, the
    external tagger process is started, stop words and the normalizer are
    prepared and the W2V model is loaded before the documents are read.
    """
    print("=== Loading data ===")
    updateParams(Config, DefConfig, kwargs)
    self.Config = Config
    self.DefConfig = DefConfig
    self.exCats = Config["excats"].split(",")
    self.sz = 0
    self.splitTrain = False
    self.topBound = 0.9
    self.charsTopBound = 0.6
    if len(Config["trainpath"]) == 0 or not os.path.isdir(
            fullPath(Config, "trainpath")):
        print("Wrong path to training set. Data can't be loaded.")
        Config["error"] = True
        return
    if len(Config["testpath"]) > 0 and not os.path.isdir(
            fullPath(Config, "testpath")):
        print("Wrong path to testing set. Data can't be loaded.")
        Config["error"] = True
        return
    elif len(Config["testpath"]) == 0:
        # No test folder: the testing set is split off the training set.
        self.splitTrain = True
        try:
            self.sz = float(Config["testsize"])
        except ValueError:
            self.sz = 0
    if len(Config["testpath"]) == 0 and (self.sz <= 0 or self.sz >= 1):
        print("Wrong size of testing set. Data can't be loaded.")
        Config["error"] = True
        return
    if Config["datatoks"] == "yes":
        if Config["actualtoks"] == "yes":
            taggerPath = fullPath(Config, 'rttaggerpath')
            # Check the length of the option: comparing the string itself
            # with 0 is never true, so an empty path was not detected here.
            if (len(self.Config["rttaggerpath"]) == 0
                    or not os.path.exists(taggerPath)):
                print(
                    "Wrong path to the tagger's jar. Preprocessing can't be done"
                )
                Config["error"] = True
                return
            self.jar = subprocess.Popen('java -Xmx2g -jar ' + taggerPath +
                                        ' "' + self.Config["expos"] + '"',
                                        stdin=subprocess.PIPE,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE,
                                        shell=True,
                                        encoding="utf-8")
        if self.Config["stopwords"] == "yes":
            self.stopWords = set(nltk.corpus.stopwords.words('arabic'))
        else:
            self.stopWords = set()
        if self.Config["normalization"] == "yes":
            self.normalizer = ArabicNormalizer()
    if Config["w2vload"] == "yes":
        if len(Config["w2vmodelpath"]) == 0 or not os.path.isfile(
                fullPath(Config, "w2vmodelpath")):
            print("Wrong path to W2V model. Stop.")
            Config["error"] = True
            return
        try:
            self.ndim = int(self.Config["w2vdim"])
        except ValueError:
            print("Wrong size of vectors' dimentions. Stop.")
            Config["error"] = True
            return
        self.Config["resources"]["w2v"]["modelPath"] = fullPath(
            Config, "w2vmodelpath")
        self.Config["resources"]["w2v"]["ndim"] = self.ndim
        self.loadW2VModel()
    else:
        self.Config["w2vmodel"] = None
    self.loadData()
    if Config["analysis"] == "yes":
        self.analysis()
def saveResources(self):
    """Copy all runtime artifacts into the resources folder and write its config.

    Tokenization options, model files and - when present in
    ``Config["resources"]`` - the W2V vectors (converted to a pickled dict),
    the indexer, the vectorizer and the BERT files are copied through
    ``self.copyFile``. The category names go to ``labels.txt`` and the
    resulting resources description to ``config.json``.
    """
    tokOpts = [
        "actualtoks", "normalization", "stopwords", "expos", "extrawords",
        "maxseqlen", "maxcharsseqlen", "rttaggerpath"
    ]
    resources = self.Config["resources"]
    resources["tokenization"] = {}
    ds = datetime.datetime.now()
    self.outDir = fullPath(self.Config, "resourcespath") + "/"
    for opt in tokOpts:
        if opt != "rttaggerpath":
            resources["tokenization"][opt] = self.Config[opt]
        elif self.Config["actualtoks"] == "yes":
            # The tagger jar is only shipped when it is actually used.
            resources["tokenization"]["rttaggerpath"] = \
                self.copyFile(fullPath(self.Config, "rttaggerpath"))
    isW2VNeeded = False
    for val in resources["models"].values():
        val["modelPath"] = self.copyFile(val["modelPath"])
        if "w2v" in val and val["w2v"] == "yes":
            isW2VNeeded = True
    if not isW2VNeeded and "w2v" in resources:
        resources.pop("w2v", None)
    if "w2v" in resources:
        w2vPath = resources["w2v"]["modelPath"]
        w2vDict = {}
        # The context manager closes the file even if a line fails to parse.
        with open(w2vPath, encoding="utf-8") as fEmbeddings:
            next(fEmbeddings, None)  # the first line is skipped
            for line in fEmbeddings:
                split = line.strip().split(" ")
                w2vDict[split[0]] = numpy.array(
                    [float(num) for num in split[1:]])
        with open(w2vPath + '.pkl', 'wb') as file:
            pickle.dump(w2vDict, file, pickle.HIGHEST_PROTOCOL)
        resources["w2v"]["modelPath"] = self.copyFile(w2vPath + '.pkl')
    if "indexer" in resources:
        resources["indexer"] = self.copyFile(resources["indexer"])
    if "vectorizer" in resources:
        resources["vectorizer"] = self.copyFile(resources["vectorizer"])
    if "ptBertModel" in resources:
        resources["ptBertModel"] = self.copyFile(resources["ptBertModel"])
        resources["vocabPath"] = self.copyFile(resources["vocabPath"])
    # Category names ordered by their index.
    cNames = [''] * len(self.Config["cats"])
    for k, v in self.Config["cats"].items():
        cNames[v] = k
    with open(self.outDir + 'labels.txt', 'w', encoding="utf-8") as file:
        file.write(",".join(cNames))
    resources["labels"] = "labels.txt"
    resources["consolidatedRank"] = self.rankThreshold
    with open(self.outDir + 'config.json', 'w', encoding="utf-8") as file:
        json.dump(resources, file, indent=4)
    de = datetime.datetime.now()
    print("\nArtifacts are copied into the folder %s in %s" %
          (fullPath(self.Config, "resourcespath"), showTime(ds, de)))
def loadSKLModel(self):
    """Return the model loaded with joblib from the configured model path."""
    modelFile = fullPath(self.Config, "modelpath", opt="name")
    return joblib.load(modelFile)
def loadNNModel(self):
    """Return the model loaded with ``load_model`` from the configured model path."""
    modelFile = fullPath(self.Config, "modelpath", opt="name")
    return load_model(modelFile)
def saveAdditions(self):
    """Mark this model's resources as using the vectorizer.

    The vectorizer path is registered in the shared ``Config["resources"]``
    only if no vectorizer entry exists there yet.
    """
    shared = self.Config["resources"]
    if "vectorizer" not in shared:
        shared["vectorizer"] = fullPath(self.Config, "vectorizerpath")
    self.resources["vectorizer"] = "yes"