示例#1
0
    def __init__(self, Config, DefConfig, kwargs):
        """Check the word-embedding settings and build the W2V model if requested.

        Every validation failure prints a message, sets ``Config["error"]``
        to ``True`` and returns without creating the model.

        Args:
            Config: Mutable mapping of settings; it is updated in place.
            DefConfig: Forwarded to ``updateParams`` and stored on the instance.
            kwargs: Forwarded to ``updateParams``.
        """
        print("=== Word Embedding ===")
        # NOTE(review): presumably merges kwargs/defaults into Config - confirm
        # against the definition of updateParams.
        updateParams(Config, DefConfig, kwargs)
        self.Config = Config
        self.DefConfig = DefConfig

        # Only the directory that should hold the model has to exist here.
        model_dir = os.path.dirname(fullPath(Config, "w2vmodelpath"))
        if not os.path.isdir(model_dir):
            print("Wrong path to W2V model. Word Embedding can't be done.")
            Config["error"] = True
            return

        # Nothing more to validate unless a new model has to be created.
        if Config["w2vcreate"] != "yes":
            return

        if len(Config["w2vcorpuspath"]) == 0 or not os.path.isfile(fullPath(Config, "w2vcorpuspath")):
            print("Wrong corpus path. W2V model can't be created.")
            Config["error"] = True
            return
        try:
            self.epochs = int(self.Config["w2vepochs"])
        except ValueError:
            print("Wrong quantity of epochs for training. W2V model can't be created.")
            Config["error"] = True
            return
        try:
            self.ndim = int(self.Config["w2vdim"])
        except ValueError:
            print("Wrong size of resulting vectors. W2V model can't be created.")
            # Item assignment, as in every other branch: attribute assignment
            # on a dict would raise AttributeError instead of flagging the error.
            Config["error"] = True
            return

        self.createW2VModel()
示例#2
0
 def __init__(self, Config, DefConfig, kwargs):
     """Register a new model request and validate its type and goal.

     Increments ``Config["modelid"]``, lower-cases the ``type`` and
     ``type_of_execution`` settings and prints a short banner.

     Raises:
         ValueError: if the model type or the execution goal is unknown, or
             if the input data keys are missing from the configuration.
     """
     self.Config = Config
     self.DefConfig = DefConfig
     cfg = self.Config

     cfg["modelid"] += 1
     print("=== Model {!s} ===".format(cfg["modelid"]))
     updateParams(cfg, DefConfig, kwargs)

     cfg["type"] = cfg["type"].lower()
     goal = cfg["type_of_execution"].lower()
     cfg["type_of_execution"] = goal
     is_active = goal != "none"

     if is_active and cfg["type"] not in modelTypes:
         raise ValueError(
             "Request contains definition of model with wrong type. Stop.")
     if goal not in modelGoals:
         raise ValueError(
             "Request doesn't define the goal of the model process. "
             "It should be one of 'trainAndTest', 'train', 'test', 'crossValidation' or 'none'. Stop."
         )

     # A request with goal "none" is only announced, never checked further.
     if not is_active:
         print("Model : " + userInfo[goal])
         return
     print("Model type: " + cfg["type"].upper() + ", " + userInfo[goal])

     required_keys = ("predefined_categories", "train_docs", "test_docs")
     if any(key not in cfg for key in required_keys):
         raise ValueError("Input data isn't loaded. Stop.")
示例#3
0
 def __init__(self, Config, DefConfig, kwargs):
     """Prepare the data-loading state and immediately start ``run()``.

     Args:
         Config: Mapping of settings; ``exclude_categories`` is read as a
             comma-separated string.
         DefConfig: Forwarded to ``updateParams`` and stored on the instance.
         kwargs: Forwarded to ``updateParams``.
     """
     print ("=== Loading data ===")
     updateParams(Config, DefConfig, kwargs)
     self.Config, self.DefConfig = Config, DefConfig

     excluded = Config["exclude_categories"]
     self.exclude_categories = excluded.split(",")

     # Initial values of the loader state.
     self.sz = 0
     self.splitTrain = False
     self.topBound, self.charsTopBound = 0.9, 0.6

     self.run()
示例#4
0
 def __init__(self, Config, DefConfig, kwargs):
     """Validate the source/target paths and run ``process`` when they are usable.

     If the source path is empty or equal to the target path, a message is
     printed and ``Config["error"]`` is set to ``True`` instead.
     """
     print("=== Preprocessing ===")
     updateParams(Config, DefConfig, kwargs)
     self.Config = Config
     self.DefConfig = DefConfig

     source = Config["sourcepath"]
     # The target path is only looked up when the source path is non-empty.
     paths_ok = len(source) != 0 and source != Config["targetpath"]
     if paths_ok:
         self.process(Config)
         return

     print("Wrong source/target path(s). Tokenization can't be done.")
     Config["error"] = True
示例#5
0
 def __init__(self, Config, DefConfig, kwargs):
     """Validate the tokenization settings and run the selected tokenizer.

     ``Config["typetoks"]`` selects ``tokens_from_server`` ("server") or
     ``tokens_from_tagger`` ("tagger"). Any validation failure prints a
     message and sets ``Config["error"]`` to ``True``.

     Args:
         Config: Mutable mapping of settings; it is updated in place.
         DefConfig: Forwarded to ``updateParams`` and stored on the instance.
         kwargs: Forwarded to ``updateParams``.
     """
     print("=== Tokenization ===")
     updateParams(Config, DefConfig, kwargs)
     self.Config = Config
     self.DefConfig = DefConfig
     # NOTE: the "language_tokenization" switch is not consulted here; the
     # check that used to skip this step was disabled in the original code.
     if not Config["source_path"] or Config["source_path"] == Config[
             "target_path"]:
         print("Wrong source/target path(s). Tokenization can't be done.")
         Config["error"] = True
         return
     if Config["typetoks"] == "server":
         tokens_from_server(Config)
     elif Config["typetoks"] == "tagger":
         tokens_from_tagger(Config)
     else:
         print("Wrong tokenization type. Tokenization can't be done.")
         # Flag the failure like the path check above does, so the error is
         # not only printed but also visible through the configuration.
         Config["error"] = True
示例#6
0
 def __init__(self, Config, DefConfig, kwargs):
     """Validate the tokenization settings and run the selected tokenizer.

     ``Config["typetoks"]`` selects ``TokensFromServer`` ("server") or
     ``TokensFromTagger`` ("tagger"). Any validation failure prints a
     message and sets ``Config["error"]`` to ``True``.

     Args:
         Config: Mutable mapping of settings; it is updated in place.
         DefConfig: Forwarded to ``updateParams`` and stored on the instance.
         kwargs: Forwarded to ``updateParams``.
     """
     print("=== Tokenization ===")
     updateParams(Config, DefConfig, kwargs)
     self.Config = Config
     self.DefConfig = DefConfig
     # NOTE: the "actualtoks" switch is not consulted here; the check that
     # used to skip this step was disabled in the original code.
     if len(Config["sourcepath"]
            ) == 0 or Config["sourcepath"] == Config["targetpath"]:
         print("Wrong source/target path(s). Tokenization can't be done.")
         Config["error"] = True
         return
     if Config["typetoks"] == "server":
         TokensFromServer(Config)
     elif Config["typetoks"] == "tagger":
         TokensFromTagger(Config)
     else:
         print("Wrong tokenization type. Tokenization can't be done.")
         # Flag the failure like the path check above does, so the error is
         # not only printed but also visible through the configuration.
         Config["error"] = True
示例#7
0
 def __init__(self, Config, DefConfig, kwargs):
     """Validate the word-embedding settings.

     The model path is always checked through ``test_path``. The corpus path
     and the numeric settings are checked only when
     ``Config["need_create_model"]`` equals ``"True"``.

     Raises:
         ValueError: if ``epochs_total`` or ``vectors_dimension`` cannot be
             converted to ``int``.
     """
     print("=== Word Embedding ===")
     updateParams(Config, DefConfig, kwargs)
     self.Config = Config
     self.DefConfig = DefConfig

     test_path(Config, "model_path",
               "Wrong path to W2V model. Word Embedding can't be done.")
     if Config["need_create_model"] == "True":
         test_path(Config, "data_corpus_path",
                   "Wrong corpus path. W2V model can't be created.")
         # (attribute to set, configuration key, message on failure)
         integer_settings = (
             ("epochs", "epochs_total",
              "Wrong quantity of epochs for training. W2V model can't be created."),
             ("ndim", "vectors_dimension",
              "Wrong size of resulting vectors. W2V model can't be created."),
         )
         for attribute, key, message in integer_settings:
             try:
                 parsed = int(self.Config[key])
             except ValueError:
                 raise ValueError(message)
             setattr(self, attribute, parsed)
示例#8
0
 def __init__(self, Config, DefConfig, kwargs):
     """Validate a model request and start the model of the requested type.

     Increments ``Config["modelid"]``, lower-cases ``type`` and ``runfor``,
     then checks paths and numeric settings depending on the requested goal.
     Problems are printed. On failure ``Config["error"]`` is set to ``True``
     and no model is started.

     Args:
         Config: Mutable mapping of settings; it is updated in place.
         DefConfig: Forwarded to ``updateParams`` and stored on the instance.
         kwargs: Forwarded to ``updateParams``.
     """
     Config["modelid"] += 1
     print("=== Model " + str(Config["modelid"]) + " ===")
     updateParams(Config, DefConfig, kwargs)
     Config["type"] = Config["type"].lower()
     Config["runfor"] = Config["runfor"].lower()
     if Config["runfor"] != "none" and Config["type"] not in modelTypes:
         print(
             "Request contains definition of model with wrong type. Stop.")
         Config["error"] = True
         return
     if Config["runfor"] not in modelGoals:
         print("Request doesn't define the goal of the model process")
         print(
             "It should be one of 'trainAndTest', 'train', 'test', 'crossValidation' or 'none'. Stop."
         )
         Config["error"] = True
         return
     # A request with goal "none" is only announced, never checked further.
     if Config["runfor"] == "none":
         print("Model : " + userInfo[Config["runfor"]])
         return
     print("Model type: " + Config["type"].upper() + ", " +
           userInfo[Config["runfor"]])
     self.Config = Config
     self.DefConfig = DefConfig
     if "cats" not in Config or "traindocs" not in Config or "testdocs" not in Config:
         print("Input data isn't loaded. Stop.")
         Config["error"] = True
         return

     # From here on all problems are collected so that every one of them is
     # reported before stopping.
     stop = False
     try:
         self.testSize = float(Config["testsize"])
     except ValueError:
         # -1 marks "no usable test size"; see the test path check below.
         self.testSize = -1
     if len(Config["trainpath"]) == 0 or not os.path.isdir(
             fullPath(Config, "trainpath")):
         # A missing training folder is tolerated only for a pure test run
         # that names a test path.
         if Config["runfor"] != "test" or len(Config["testpath"]) == 0:
             print(
                 "Wrong path to the training set: folder %s doesn't exist."
                 % (fullPath(Config, "trainpath"),))
             stop = True
     if len(Config["testpath"]) == 0 or not os.path.isdir(
             fullPath(Config, "testpath")):
         # An empty test path is fine when a valid split ratio is given.
         if not (len(Config["testpath"]) == 0 and 0 < self.testSize < 1):
             # %s, not %d: the path is a string and %d would raise TypeError.
             print(
                 "Wrong path to the testing set: folder %s doesn't exist." %
                 (fullPath(Config, "testpath"),))
             stop = True
     if len(Config["modelpath"]) == 0 or not os.path.isdir(
             fullPath(Config, "modelpath")):
         print("Wrong path to the models' folder.")
         stop = True
     if len(Config["name"]) == 0:
         # Default model name: model type followed by the running model id.
         Config["name"] = Config["type"] + str(Config["modelid"])
     mPath = fullPath(Config, "modelpath", opt="name")
     if Config["runfor"] == "test" and not os.path.isfile(mPath):
         print("Wrong path to the tested model.")
         stop = True
     if Config["runfor"] != "test":
         try:
             self.epochs = int(Config["epochs"])
         except ValueError:
             print("Wrong quantity of epochs for training.")
             stop = True
         try:
             self.trainBatch = int(Config["trainbatch"])
         except ValueError:
             print("Wrong batch size for training.")
             stop = True
         try:
             self.verbose = int(Config["verbose"])
         except ValueError:
             print("Wrong value of 'verbose' flag for training.")
             stop = True
         if Config["tempsave"] == "yes":
             if len(Config["temppath"]) == 0 or not os.path.isdir(
                     fullPath(Config, "temppath")):
                 print("Wrong path to folder with intermediate results.")
                 stop = True
     if Config["runfor"] != "train" and Config["customrank"] == "yes":
         try:
             self.rankThreshold = float(Config["rankthreshold"])
         except ValueError:
             print("Wrong custom rank threshold.")
             stop = True
     if Config["runfor"] == "crossvalidation":
         if Config["cvsave"] == "yes":
             if len(Config["cvpath"]) == 0 or not os.path.isdir(
                     fullPath(Config, "cvpath")):
                 print(
                     "Wrong path to the cross-validation's resulting folder."
                 )
                 stop = True
         try:
             # Only validated here; the parsed value itself is not kept.
             int(Config["kfold"])
         except ValueError:
             print("Wrong k-fold value.")
             stop = True
     if stop:
         print("Stop.")
         Config["error"] = True
         return

     # Start the model matching the (already lower-cased) type.
     model_type = Config["type"].lower()
     if model_type == "snn":
         SnnModel(Config)
     elif model_type == "ltsm":
         LTSMModel(Config)
     elif model_type == "cnn":
         CNNModel(Config)
     elif model_type == "pac":
         PacModel(Config)
     elif model_type == "ridge":
         RidgeModel(Config)
     elif model_type == "svc":
         SVCModel(Config)
     elif model_type == "perceptron":
         PerceptronModel(Config)
     elif model_type == "sgd":
         SGDModel(Config)
     elif model_type == "bert":
         BertModel(Config)
示例#9
0
    def __init__(self, Config, DefConfig, kwargs):
        """Validate the data-loading settings, prepare resources and load the data.

        Checks the training/testing paths (or the split ratio), optionally
        starts the tagger process and prepares stop words / normalizer,
        optionally loads the W2V model, then calls ``loadData`` and, when
        requested, ``analysis``. Any validation failure prints a message,
        sets ``Config["error"]`` to ``True`` and returns early.

        Args:
            Config: Mutable mapping of settings; it is updated in place.
            DefConfig: Forwarded to ``updateParams`` and stored on the instance.
            kwargs: Forwarded to ``updateParams``.
        """
        print("=== Loading data ===")
        updateParams(Config, DefConfig, kwargs)
        self.Config = Config
        self.DefConfig = DefConfig
        self.exCats = Config["excats"].split(",")
        self.sz = 0
        self.splitTrain = False
        self.topBound = 0.9
        self.charsTopBound = 0.6

        if len(Config["trainpath"]) == 0 or not os.path.isdir(
                fullPath(Config, "trainpath")):
            print("Wrong path to training set. Data can't be loaded.")
            Config["error"] = True
            return
        if len(Config["testpath"]) > 0 and not os.path.isdir(
                fullPath(Config, "testpath")):
            print("Wrong path to testing set. Data can't be loaded.")
            Config["error"] = True
            return
        elif len(Config["testpath"]) == 0:
            # No test folder: the test set is split off the training set, so
            # a ratio strictly between 0 and 1 is required.
            self.splitTrain = True
            try:
                self.sz = float(Config["testsize"])
            except ValueError:
                self.sz = 0
            if self.sz <= 0 or self.sz >= 1:
                print("Wrong size of testing set. Data can't be loaded.")
                Config["error"] = True
                return
        if Config["datatoks"] == "yes":
            if Config["actualtoks"] == "yes":
                taggerPath = fullPath(Config, 'rttaggerpath')
                # Truthiness test: comparing the path string with the integer
                # 0 is never true, so an empty path used to slip through.
                if (not self.Config["rttaggerpath"]
                        or not os.path.exists(taggerPath)):
                    print(
                        "Wrong path to the tagger's jar. Preprocessing can't be done"
                    )
                    Config["error"] = True
                    return
                # NOTE(review): the command is built by string concatenation
                # and run through the shell; confirm that both values come
                # from trusted configuration only.
                self.jar = subprocess.Popen('java -Xmx2g -jar ' + taggerPath +
                                            ' "' + self.Config["expos"] + '"',
                                            stdin=subprocess.PIPE,
                                            stdout=subprocess.PIPE,
                                            stderr=subprocess.PIPE,
                                            shell=True,
                                            encoding="utf-8")
            if self.Config["stopwords"] == "yes":
                self.stopWords = set(nltk.corpus.stopwords.words('arabic'))
            else:
                self.stopWords = set()
            if self.Config["normalization"] == "yes":
                self.normalizer = ArabicNormalizer()
        if Config["w2vload"] == "yes":
            if len(Config["w2vmodelpath"]) == 0 or not os.path.isfile(
                    fullPath(Config, "w2vmodelpath")):
                print("Wrong path to W2V model. Stop.")
                Config["error"] = True
                return
            try:
                self.ndim = int(self.Config["w2vdim"])
            except ValueError:
                print("Wrong size of vectors' dimentions. Stop.")
                Config["error"] = True
                return
            # Record where the model lives and its dimension before loading.
            self.Config["resources"]["w2v"]["modelPath"] = fullPath(
                Config, "w2vmodelpath")
            self.Config["resources"]["w2v"]["ndim"] = self.ndim
            self.loadW2VModel()
        else:
            self.Config["w2vmodel"] = None

        self.loadData()
        if Config["analysis"] == "yes":
            self.analysis()
示例#10
0
 def __init__(self, Config, DefConfig, kwargs):
     """Announce the preprocessing stage and keep the configuration on the instance.

     Args:
         Config: Settings object; passed to ``updateParams`` and stored as
             ``self.Config``.
         DefConfig: Passed to ``updateParams`` and stored as
             ``self.DefConfig``.
         kwargs: Passed to ``updateParams``.
     """
     print("=== Preprocessing ===")
     # NOTE(review): presumably merges kwargs/defaults into Config - confirm
     # against the definition of updateParams.
     updateParams(Config, DefConfig, kwargs)
     self.Config = Config
     self.DefConfig = DefConfig