def __init__(self, Config, DefConfig, kwargs):
    """Validate word-embedding settings and, if requested, build the W2V model.

    Every validation failure prints a message, sets ``Config["error"] = True``
    and returns without creating the model.

    Args:
        Config: Mutable settings mapping, read through string keys. It is
            updated in place by ``updateParams`` and flagged on error.
        DefConfig: Forwarded to ``updateParams`` and stored on the instance
            (presumably the default settings -- confirm against caller).
        kwargs: Forwarded to ``updateParams`` (presumably per-request
            overrides -- confirm against caller).
    """
    print("=== Word Embedding ===")
    updateParams(Config, DefConfig, kwargs)
    self.Config = Config
    self.DefConfig = DefConfig

    # The model file itself may not exist yet; only its folder has to.
    model_dir = os.path.dirname(fullPath(Config, "w2vmodelpath"))
    if not os.path.isdir(model_dir):
        print("Wrong path to W2V model. Word Embedding can't be done.")
        Config["error"] = True
        return

    # Nothing else to validate unless a new model must be created.
    if Config["w2vcreate"] != "yes":
        return

    if len(Config["w2vcorpuspath"]) == 0 or not os.path.isfile(
            fullPath(Config, "w2vcorpuspath")):
        print("Wrong corpus path. W2V model can't be created.")
        Config["error"] = True
        return

    try:
        self.epochs = int(self.Config["w2vepochs"])
    except ValueError:
        print("Wrong quantity of epochs for training. "
              "W2V model can't be created.")
        Config["error"] = True
        return

    try:
        self.ndim = int(self.Config["w2vdim"])
    except ValueError:
        print("Wrong size of resulting vectors. W2V model can't be created.")
        # Fixed: this was ``Config.error = True``, which raises
        # AttributeError on a dict and never sets the flag callers check.
        Config["error"] = True
        return

    self.createW2VModel()
def __init__(self, Config, DefConfig, kwargs):
    """Register a new model request and validate its type and goal.

    Increments ``Config["modelid"]``, applies ``updateParams``, lower-cases
    the ``"type"`` and ``"type_of_execution"`` settings and prints a banner
    describing the request.

    Args:
        Config: Mutable settings mapping; stored on the instance and updated
            in place.
        DefConfig: Stored on the instance and forwarded to ``updateParams``.
        kwargs: Forwarded to ``updateParams``.

    Raises:
        ValueError: If the model type is not in ``modelTypes`` (for any goal
            other than ``"none"``), if the goal is not in ``modelGoals``, or
            if the input-data keys are missing from ``Config``.
    """
    self.Config = Config
    self.DefConfig = DefConfig

    settings = self.Config
    settings["modelid"] += 1
    print("=== Model " + str(settings["modelid"]) + " ===")
    updateParams(settings, DefConfig, kwargs)

    # Normalise the case once so later comparisons use lower-case values.
    settings["type"] = settings["type"].lower()
    settings["type_of_execution"] = settings["type_of_execution"].lower()
    goal = settings["type_of_execution"]

    if goal != "none" and settings["type"] not in modelTypes:
        raise ValueError(
            "Request contains definition of model with wrong type. Stop.")
    if goal not in modelGoals:
        raise ValueError(
            "Request doesn't define the goal of the model process. "
            "It should be one of 'trainAndTest', 'train', 'test', 'crossValidation' or 'none'. Stop."
        )

    if goal == "none":
        # Nothing to run for this request: announce it and stop here.
        print("Model : " + userInfo[goal])
        return

    print("Model type: " + settings["type"].upper() + ", " + userInfo[goal])

    required_keys = ("predefined_categories", "train_docs", "test_docs")
    if any(key not in settings for key in required_keys):
        raise ValueError("Input data isn't loaded. Stop.")
def __init__(self, Config, DefConfig, kwargs):
    """Set up the data-loading state and immediately start ``self.run()``.

    Args:
        Config: Mutable settings mapping; updated in place by
            ``updateParams`` and stored on the instance.
        DefConfig: Forwarded to ``updateParams`` and stored on the instance.
        kwargs: Forwarded to ``updateParams``.
    """
    print("=== Loading data ===")
    updateParams(Config, DefConfig, kwargs)
    self.Config, self.DefConfig = Config, DefConfig

    # Comma-separated setting turned into a list of names.
    raw_excluded = Config["exclude_categories"]
    self.exclude_categories = raw_excluded.split(",")

    # Initial values; how they are used is defined elsewhere in the class.
    self.sz = 0
    self.splitTrain = False
    self.topBound, self.charsTopBound = 0.9, 0.6

    self.run()
def __init__(self, Config, DefConfig, kwargs):
    """Check the source/target paths and run preprocessing when they are usable.

    If the source path is empty or equals the target path, a message is
    printed and ``Config["error"]`` is set to True. Otherwise
    ``self.process(Config)`` is called.

    Args:
        Config: Mutable settings mapping; updated in place.
        DefConfig: Forwarded to ``updateParams`` and stored on the instance.
        kwargs: Forwarded to ``updateParams``.
    """
    print("=== Preprocessing ===")
    updateParams(Config, DefConfig, kwargs)
    self.Config, self.DefConfig = Config, DefConfig

    source = Config["sourcepath"]
    if len(source) > 0 and source != Config["targetpath"]:
        self.process(Config)
        return

    # Reached only when the source is empty or equals the target.
    print("Wrong source/target path(s). Tokenization can't be done.")
    Config["error"] = True
def __init__(self, Config, DefConfig, kwargs):
    """Validate tokenization settings and run the selected tokenizer.

    Args:
        Config: Mutable settings mapping; updated in place.
        DefConfig: Forwarded to ``updateParams`` and stored on the instance.
        kwargs: Forwarded to ``updateParams``.
    """
    print("=== Tokenization ===")
    updateParams(Config, DefConfig, kwargs)
    self.Config, self.DefConfig = Config, DefConfig

    # A former early-exit on Config["language_tokenization"] != "True" is
    # disabled in the original and stays disabled here.

    source = Config["source_path"]
    if not source or source == Config["target_path"]:
        print("Wrong source/target path(s). Tokenization can't be done.")
        Config["error"] = True
        return

    mode = Config["typetoks"]
    if mode not in ("server", "tagger"):
        # NOTE(review): unlike the path check above, this branch does not
        # set Config["error"] -- confirm whether that is intended.
        print("Wrong tokenization type. Tokenization can't be done.")
        return

    tokenize = tokens_from_server if mode == "server" else tokens_from_tagger
    tokenize(Config)
def __init__(self, Config, DefConfig, kwargs):
    """Validate tokenization settings and run the selected tokenizer.

    Args:
        Config: Mutable settings mapping; updated in place.
        DefConfig: Forwarded to ``updateParams`` and stored on the instance.
        kwargs: Forwarded to ``updateParams``.
    """
    print("=== Tokenization ===")
    updateParams(Config, DefConfig, kwargs)
    self.Config, self.DefConfig = Config, DefConfig

    # A former early-exit on Config["actualtoks"] != "yes" is disabled in
    # the original and stays disabled here.

    source = Config["sourcepath"]
    if len(source) == 0 or source == Config["targetpath"]:
        print("Wrong source/target path(s). Tokenization can't be done.")
        Config["error"] = True
        return

    mode = Config["typetoks"]
    if mode not in ("server", "tagger"):
        # NOTE(review): unlike the path check above, this branch does not
        # set Config["error"] -- confirm whether that is intended.
        print("Wrong tokenization type. Tokenization can't be done.")
        return

    tokenizer = TokensFromServer if mode == "server" else TokensFromTagger
    tokenizer(Config)
def __init__(self, Config, DefConfig, kwargs):
    """Validate word-embedding settings and parse the numeric training options.

    Args:
        Config: Mutable settings mapping; updated in place by
            ``updateParams`` and stored on the instance.
        DefConfig: Forwarded to ``updateParams`` and stored on the instance.
        kwargs: Forwarded to ``updateParams``.

    Raises:
        ValueError: If ``"epochs_total"`` or ``"vectors_dimension"`` cannot
            be converted to ``int``. ``test_path`` may raise on its own;
            its behaviour is defined elsewhere.
    """
    print("=== Word Embedding ===")
    updateParams(Config, DefConfig, kwargs)
    self.Config, self.DefConfig = Config, DefConfig

    test_path(Config, "model_path",
              "Wrong path to W2V model. Word Embedding can't be done.")

    # The remaining settings only matter when a model must be created.
    if Config["need_create_model"] == "True":
        test_path(Config, "data_corpus_path",
                  "Wrong corpus path. W2V model can't be created.")

        numeric_settings = (
            ("epochs", "epochs_total",
             "Wrong quantity of epochs for training. W2V model can't be created."),
            ("ndim", "vectors_dimension",
             "Wrong size of resulting vectors. W2V model can't be created."),
        )
        for attribute, key, message in numeric_settings:
            try:
                parsed = int(Config[key])
            except ValueError:
                raise ValueError(message)
            setattr(self, attribute, parsed)
def __init__(self, Config, DefConfig, kwargs):
    """Validate a model request and, if it is valid, run the matching model.

    Increments ``Config["modelid"]``, applies ``updateParams``, lower-cases
    the ``"type"`` and ``"runfor"`` settings, validates paths and numeric
    options, then instantiates the model class selected by
    ``Config["type"]``. Every validation failure is printed and sets
    ``Config["error"] = True`` instead of raising.

    Args:
        Config: Mutable settings mapping; updated in place.
        DefConfig: Forwarded to ``updateParams``; stored on the instance
            once the request is known to be runnable.
        kwargs: Forwarded to ``updateParams``.
    """
    Config["modelid"] += 1
    print("=== Model " + str(Config["modelid"]) + " ===")
    updateParams(Config, DefConfig, kwargs)
    Config["type"] = Config["type"].lower()
    Config["runfor"] = Config["runfor"].lower()

    if Config["runfor"] != "none" and Config["type"] not in modelTypes:
        print("Request contains definition of model with wrong type. Stop.")
        Config["error"] = True
        return
    if Config["runfor"] not in modelGoals:
        print("Request doesn't define the goal of the model process")
        print(
            "It should be one of 'trainAndTest', 'train', 'test', 'crossValidation' or 'none'. Stop."
        )
        Config["error"] = True
        return

    if Config["runfor"] != "none":
        print("Model type: " + Config["type"].upper() + ", " +
              userInfo[Config["runfor"]])
    else:
        print("Model : " + userInfo[Config["runfor"]])
    if Config["runfor"] == "none":
        return

    self.Config = Config
    self.DefConfig = DefConfig
    if ("cats" not in Config or "traindocs" not in Config
            or "testdocs" not in Config):
        print("Input data isn't loaded. Stop.")
        Config["error"] = True
        return

    # From here on, collect every problem before stopping so that all of
    # them are reported in a single run.
    stop = False

    try:
        self.testSize = float(Config["testsize"])
    except ValueError:
        # -1 marks "no usable test size" for the range check below.
        self.testSize = -1

    if len(Config["trainpath"]) == 0 or not os.path.isdir(
            fullPath(Config, "trainpath")):
        # A missing training set is tolerated only for a pure test run
        # that has a test path configured.
        if Config["runfor"] != "test" or len(Config["testpath"]) == 0:
            print("Wrong path to the training set: folder %s doesn't exist."
                  % (fullPath(Config, "trainpath")))
            stop = True

    if len(Config["testpath"]) == 0 or not os.path.isdir(
            fullPath(Config, "testpath")):
        # An empty test path is acceptable when a valid split ratio in
        # (0, 1) is configured.
        if not (len(Config["testpath"]) == 0 and 0 < self.testSize < 1):
            # Fixed: this was ``%d``, which raises TypeError for a string
            # path instead of reporting the problem.
            print("Wrong path to the testing set: folder %s doesn't exist."
                  % (fullPath(Config, "testpath")))
            stop = True

    if len(Config["modelpath"]) == 0 or not os.path.isdir(
            fullPath(Config, "modelpath")):
        print("Wrong path to the models' folder.")
        stop = True

    if len(Config["name"]) == 0:
        # Default model name: "<type><modelid>".
        Config["name"] = Config["type"] + str(Config["modelid"])
    mPath = fullPath(Config, "modelpath", opt="name")
    if Config["runfor"] == "test" and not os.path.isfile(mPath):
        print("Wrong path to the tested model.")
        stop = True

    if Config["runfor"] != "test":
        try:
            self.epochs = int(Config["epochs"])
        except ValueError:
            print("Wrong quantity of epochs for training.")
            stop = True
        try:
            self.trainBatch = int(Config["trainbatch"])
        except ValueError:
            print("Wrong batch size for training.")
            stop = True
        try:
            self.verbose = int(Config["verbose"])
        except ValueError:
            print("Wrong value of 'verbose' flag for training.")
            stop = True
        if Config["tempsave"] == "yes":
            if len(Config["temppath"]) == 0 or not os.path.isdir(
                    fullPath(Config, "temppath")):
                print("Wrong path to folder with intermediate results.")
                stop = True

    if Config["runfor"] != "train" and Config["customrank"] == "yes":
        try:
            self.rankThreshold = float(Config["rankthreshold"])
        except ValueError:
            print("Wrong custom rank threshold.")
            stop = True

    if Config["runfor"] == "crossvalidation":
        if Config["cvsave"] == "yes":
            if len(Config["cvpath"]) == 0 or not os.path.isdir(
                    fullPath(Config, "cvpath")):
                print(
                    "Wrong path to the cross-validation's resulting folder."
                )
                stop = True
        try:
            # Validation only: the parsed value is not used in this method.
            int(Config["kfold"])
        except ValueError:
            print("Wrong k-fold value.")
            stop = True

    if stop:
        print("Stop.")
        Config["error"] = True
        return

    # Config["type"] was lower-cased above, so compare it directly.
    model_type = Config["type"]
    if model_type == "snn":
        SnnModel(Config)
    elif model_type == "ltsm":
        LTSMModel(Config)
    elif model_type == "cnn":
        CNNModel(Config)
    elif model_type == "pac":
        PacModel(Config)
    elif model_type == "ridge":
        RidgeModel(Config)
    elif model_type == "svc":
        SVCModel(Config)
    elif model_type == "perceptron":
        PerceptronModel(Config)
    elif model_type == "sgd":
        SGDModel(Config)
    elif model_type == "bert":
        BertModel(Config)
def __init__(self, Config, DefConfig, kwargs):
    """Validate data-loading settings, prepare helpers and load the data.

    Any validation failure prints a message, sets ``Config["error"] = True``
    and returns before the data is loaded. On success the method may start
    the external tagger process, prepare the stop-word set and normalizer,
    load the W2V model, then call ``self.loadData()`` and optionally
    ``self.analysis()``.

    Args:
        Config: Mutable settings mapping; updated in place.
        DefConfig: Forwarded to ``updateParams`` and stored on the instance.
        kwargs: Forwarded to ``updateParams``.
    """
    print("=== Loading data ===")
    updateParams(Config, DefConfig, kwargs)
    self.Config = Config
    self.DefConfig = DefConfig
    self.exCats = Config["excats"].split(",")
    self.sz = 0
    self.splitTrain = False
    self.topBound = 0.9
    self.charsTopBound = 0.6

    if len(Config["trainpath"]) == 0 or not os.path.isdir(
            fullPath(Config, "trainpath")):
        print("Wrong path to training set. Data can't be loaded.")
        Config["error"] = True
        return

    if len(Config["testpath"]) > 0 and not os.path.isdir(
            fullPath(Config, "testpath")):
        print("Wrong path to testing set. Data can't be loaded.")
        Config["error"] = True
        return
    elif len(Config["testpath"]) == 0:
        # No separate test set: the training set will be split instead.
        self.splitTrain = True
        try:
            self.sz = float(Config["testsize"])
        except ValueError:
            self.sz = 0  # rejected by the range check below

    if len(Config["testpath"]) == 0 and (self.sz <= 0 or self.sz >= 1):
        print("Wrong size of testing set. Data can't be loaded.")
        Config["error"] = True
        return

    if Config["datatoks"] == "yes":
        if Config["actualtoks"] == "yes":
            taggerPath = fullPath(Config, 'rttaggerpath')
            # Fixed: this was ``self.Config["rttaggerpath"] == 0``, which
            # is always False for a string, so an empty setting was never
            # rejected here.
            if (len(self.Config["rttaggerpath"]) == 0
                    or not os.path.exists(taggerPath)):
                print(
                    "Wrong path to the tagger's jar. Preprocessing can't be done"
                )
                Config["error"] = True
                return
            # NOTE(review): the command is a shell string built from
            # configuration values (shell=True). If those values can come
            # from an untrusted source this is a command-injection risk;
            # consider an argument list with shell=False.
            self.jar = subprocess.Popen('java -Xmx2g -jar ' + taggerPath +
                                        ' "' + self.Config["expos"] + '"',
                                        stdin=subprocess.PIPE,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE,
                                        shell=True,
                                        encoding="utf-8")
        if self.Config["stopwords"] == "yes":
            self.stopWords = set(nltk.corpus.stopwords.words('arabic'))
        else:
            self.stopWords = set()
        if self.Config["normalization"] == "yes":
            self.normalizer = ArabicNormalizer()

    if Config["w2vload"] == "yes":
        if len(Config["w2vmodelpath"]) == 0 or not os.path.isfile(
                fullPath(Config, "w2vmodelpath")):
            print("Wrong path to W2V model. Stop.")
            Config["error"] = True
            return
        try:
            self.ndim = int(self.Config["w2vdim"])
        except ValueError:
            print("Wrong size of vectors' dimentions. Stop.")
            Config["error"] = True
            return
        self.Config["resources"]["w2v"]["modelPath"] = fullPath(
            Config, "w2vmodelpath")
        self.Config["resources"]["w2v"]["ndim"] = self.ndim
        self.loadW2VModel()
    else:
        self.Config["w2vmodel"] = None

    self.loadData()
    if Config["analysis"] == "yes":
        self.analysis()
def __init__(self, Config, DefConfig, kwargs):
    """Announce the preprocessing stage and store the updated settings.

    Args:
        Config: Settings mapping; updated in place by ``updateParams`` and
            stored on the instance.
        DefConfig: Forwarded to ``updateParams`` and stored on the instance.
        kwargs: Forwarded to ``updateParams``.
    """
    print("=== Preprocessing ===")
    updateParams(Config, DefConfig, kwargs)
    self.Config, self.DefConfig = Config, DefConfig