Example #1
 def loadData(self):
     if self.Config["datatoks"] == "yes":
         print("Start loading and preprocessing of data...")
     else:
         print("Start loading data...")
     ds = datetime.datetime.now()
     self.Config["cats"] = self.getCategories(
         fullPath(self.Config, "trainpath"))
     traindocs = self.getDataDocs(fullPath(self.Config, "trainpath"))
     if not self.splitTrain:
         testdocs = self.getDataDocs(fullPath(self.Config, "testpath"))
     else:
         ind = int(len(traindocs) * (1 - self.sz))
         random.shuffle(traindocs)
         testdocs = traindocs[ind:]
         traindocs = traindocs[:ind]
     de = datetime.datetime.now()
     self.Config["traindocs"] = random.sample(traindocs, len(traindocs))
     self.Config["testdocs"] = random.sample(testdocs, len(testdocs))
     self.getMaxSeqLen()
     self.getMaxCharsLength()
     if self.Config["datatoks"] == "yes" and self.Config[
             "actualtoks"] == "yes":
         self.jar.stdin.write('!!! STOP !!!\n')
         self.jar.stdin.flush()
     print("Input data loaded in %s" % (showTime(ds, de)))
     print("Training set contains %d documents." %
           (len(self.Config["traindocs"])))
     print("Testing set contains %d documents." %
           (len(self.Config["testdocs"])))
     print("Documents belong to %d categories." %
           (len(self.Config["cats"])))
Example #2
 def getDataForSklearnClassifiers(self):
     mlb = None
     ds = datetime.datetime.now()
     if self.model.Config["runfor"] != "test":
         nmCats = [""] * len(self.model.Config["cats"])
         cKeys = list(self.model.Config["cats"].keys())
         for i in range(len(cKeys)):
             nmCats[self.model.Config["cats"][cKeys[i]]] = cKeys[i]
         mlb = MultiLabelBinarizer(classes=nmCats)
         wev = TfidfVectorizer(ngram_range=(1, 3), max_df=0.50).fit(
             [x.lines for x in self.model.Config[self.keyTrain]])
         self.model.trainArrays = wev.transform([x.lines for x in self.model.Config[self.keyTrain]])
         self.model.trainLabels = mlb.fit_transform([x.nlabs for x in self.model.Config[self.keyTrain]])
         if not self.model.isCV:
             with open(fullPath(self.model.Config, "binarizerpath"), 'wb') as handle:
                 pickle.dump(mlb, handle, protocol=pickle.HIGHEST_PROTOCOL)
             with open(fullPath(self.model.Config, "vectorizerpath"), 'wb') as handle:
                 pickle.dump(wev, handle, protocol=pickle.HIGHEST_PROTOCOL)
     if mlb is None:
         with open(fullPath(self.model.Config, "binarizerpath"), 'rb') as handle:
             mlb = pickle.load(handle)
         with open(fullPath(self.model.Config, "vectorizerpath"), 'rb') as handle:
             wev = pickle.load(handle)
     self.model.testArrays = wev.transform([x.lines for x in self.model.Config[self.keyTest]])
     self.model.testLabels = mlb.transform([x.nlabs for x in self.model.Config[self.keyTest]])
     de = datetime.datetime.now()
     print("Prepare all data in %s" % (showTime(ds, de)))
Example #3
    def __init__(self, Config, DefConfig, kwargs):
        print ("=== Word Embedding ===")
        updateParams(Config, DefConfig, kwargs)
        self.Config = Config
        self.DefConfig = DefConfig
        if not os.path.isdir(os.path.dirname(fullPath(Config, "w2vmodelpath"))):
            print ("Wrong path to W2V model. Word Embedding can't be done.")
            Config["error"] = True
            return
        if Config["w2vcreate"] != "yes":
            return
        if len(Config["w2vcorpuspath"]) == 0 or not os.path.isfile(fullPath(Config, "w2vcorpuspath")):
            print ("Wrong corpus path. W2V model can't be created.")
            Config["error"] = True
            return
        try:
            self.epochs = int(self.Config["w2vepochs"])
        except ValueError:
            print ("Wrong quantity of epochs for training. W2V model can't be created.")
            Config["error"] = True
            return
        try:
            self.ndim = int(self.Config["w2vdim"])
        except ValueError:
            print ("Wrong size of resulting vectors. W2V model can't be created.")
            Config["error"] = True
            return

        self.createW2VModel()
Example #4
 def getWordVectorsMatrix(self):
     tokenizer = None
     ds = datetime.datetime.now()
     if self.model.Config["runfor"] != "test":
         tokenizer = Tokenizer(num_words=self.maxWords)
         trainTexts = [doc.lines for doc in self.model.Config[self.keyTrain]]
         tokenizer.fit_on_texts(trainTexts)
         if not self.model.isCV:
             with open(fullPath(self.model.Config, "indexerpath"), 'wb') as handle:
                 pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
             if self.model.Config["maxdoclen"] > self.model.Config["maxseqlen"]:
                 print("Most documents in the training set have fewer than %d tokens. Longer documents will be truncated." % (
                     self.model.Config["maxseqlen"]))
         self.model.trainArrays = pad_sequences(tokenizer.texts_to_sequences(trainTexts),
                                                 maxlen=self.model.Config["maxseqlen"])
         self.model.trainLabels = numpy.concatenate([numpy.array(x.labels).
                         reshape(1, len(self.model.Config["cats"])) for x in self.model.Config[self.keyTrain]])
         if self.addValSet:
             ind = int(len(self.model.trainArrays) * (1 - self.valSize))
             self.model.valArrays = self.model.trainArrays[ind:]
             self.model.valLabels = self.model.trainLabels[ind:]
             self.model.trainArrays = self.model.trainArrays[:ind]
             self.model.trainLabels = self.model.trainLabels[:ind]
     if tokenizer is None:
         with open(fullPath(self.model.Config, "indexerpath"), 'rb') as handle:
             tokenizer = pickle.load(handle)
     testTexts = [doc.lines for doc in self.model.Config[self.keyTest]]
     self.model.testArrays = pad_sequences(tokenizer.texts_to_sequences(testTexts),
                                           maxlen=self.model.Config["maxseqlen"])
     self.model.testLabels = numpy.concatenate([numpy.array(x.labels).
                         reshape(1, len(self.model.Config["cats"])) for x in self.model.Config[self.keyTest]])
     embedding_matrix = numpy.zeros((self.maxWords, self.ndim))
     word_index = tokenizer.word_index
     nf = 0
     for word, i in word_index.items():
         if i < self.maxWords:
             try:
                 embedding_vector = self.model.w2vModel[word]
             except KeyError:
                 nf += 1
                 continue
             if embedding_vector is not None:
                 embedding_matrix[i] = embedding_vector
     self.model.embMatrix = embedding_matrix
     self.model.maxWords = self.maxWords
     if self.model.isCV:
         return
     de = datetime.datetime.now()
     print('Found %s unique tokens.' % len(tokenizer.word_index))
     print ('Tokens not found in W2V vocabulary: %d'%nf)
     print("All data prepared and embedding matrix built in %s"%(showTime(ds, de)))
     return embedding_matrix, self.maxWords
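The matrix returned by getWordVectorsMatrix() is typically consumed by a frozen Keras Embedding layer. A hedged sketch of that wiring, assuming the same Keras API used elsewhere in these examples (layer sizes and names are illustrative only, not the project's actual architecture):

    from keras.models import Sequential
    from keras.layers import Embedding, LSTM, Dense

    def buildModel(embedding_matrix, max_words, ndim, max_seq_len, n_cats):
        model = Sequential()
        # Rows of embedding_matrix are indexed by tokenizer.word_index values.
        model.add(Embedding(max_words, ndim,
                            weights=[embedding_matrix],
                            input_length=max_seq_len,
                            trainable=False))            # keep pre-trained W2V vectors frozen
        model.add(LSTM(128))
        model.add(Dense(n_cats, activation='sigmoid'))   # multi-label output
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        return model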
Example #5
 def loadW2VModel(self):
     print("Load W2V model...")
     ds = datetime.datetime.now()
     self.Config["w2vmodel"] = gensim.models.KeyedVectors.load_word2vec_format(
         fullPath(self.Config, "w2vmodelpath"))
     de = datetime.datetime.now()
     print("W2V model (%s) loaded in %s" %
           (fullPath(self.Config, "w2vmodelpath"), showTime(ds, de)))
Example #6
 def loadW2VModel(self):
     if self.Config["w2vmodel"] != None:
         print("W2V model is already loaded...")
         self.w2vModel = self.Config["w2vmodel"]
         return
     print("Load W2V model... ")
     ds = datetime.datetime.now()
     self.w2vModel = gensim.models.KeyedVectors.load_word2vec_format(
         fullPath(self.Config, "w2vmodelpath"))
     de = datetime.datetime.now()
     print("Load W2V model (%s) in %s" %
           (fullPath(self.Config, "w2vmodelpath"), showTime(ds, de)))
     self.Config["resources"]["w2v"]["modelPath"] = fullPath(
         self.Config, "w2vmodelpath")
     self.Config["resources"]["w2v"]["ndim"] = self.ndim
Example #7
 def saveDataSets(self):
     root = fullPath(self.Config, "cvpath")
     shutil.rmtree(root)
     os.mkdir(root)
     trainPath = root + "/train"
     testPath = root + "/test"
     folds = {}
     os.mkdir(trainPath)
     for i in range(len(self.Config["cvtraindocs"])):
         doc = self.Config["cvtraindocs"][i]
         for j in range(len(doc.nlabs)):
             foldPath = trainPath + "/" + doc.nlabs[j]
             if doc.nlabs[j] not in folds:
                 os.mkdir(foldPath)
                 folds[doc.nlabs[j]] = True
             with open(foldPath + '/' + doc.name, 'w',
                       encoding="utf-8") as file:
                 file.write(doc.lines)
             file.close()
     folds = {}
     os.mkdir(testPath)
     for i in range(len(self.Config["cvtestdocs"])):
         doc = self.Config["cvtestdocs"][i]
         for j in range(len(doc.nlabs)):
             foldPath = testPath + "/" + doc.nlabs[j]
             if doc.nlabs[j] not in folds:
                 os.mkdir(foldPath)
                 folds[doc.nlabs[j]] = True
             with open(foldPath + '/' + doc.name, 'w',
                       encoding="utf-8") as file:
                 file.write(doc.lines)
             file.close()
Example #8
    def startServer(self):
        stanford_path = fullPath(self.Config, "servsource") + "/"
        os.chdir(stanford_path)
        os.environ["CLASSPATH"] = "*"

        def runServer(onExit, popenArgs):
            def runInThread(onExit, popenArgs):
                srv = Popen(
                    'java -Xmx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -serverProperties '
                    + stanford_path +
                    'StanfordCoreNLP-arabic.properties -preload tokenize,ssplit,pos '
                    + '-status_port ' + self.Config["servport"] + ' -port ' +
                    self.Config["servport"] + ' -timeout 20000',
                    shell=True)
                srv.wait()
                onExit()
                return

            thread = threading.Thread(target=runInThread, args=(onExit, ''))
            thread.start()
            return thread

        def onExit():
            os.chdir(self.curdir)
            print("Server is down")

        runServer(onExit, '')
        time.sleep(10)
        print("Server is running")
Example #9
 def tokenize(self, Config):
     taggerPath = fullPath(Config, "taggerpath")
     if (len(taggerPath) == 0 or not os.path.exists(taggerPath)):
         print ("Wrong path to the tagger's jar. Tokenization can't be done")
         Config["error"] = True
         return
     inPath = Config["home"] + "/" + Config["sourcepath"]
     outPath = Config["home"] + "/" + Config["targetpath"]
     stopWords = ""
     if Config["stopwords"] == "yes":
         sWords = list(stopwords.words('arabic'))
         for i in range(len(sWords)):
             if i > 0:
                 stopWords += ","
             stopWords += sWords[i]
     ds = datetime.datetime.now()
     srv = subprocess.Popen('java -Xmx2g -jar ' + taggerPath + ' "' + inPath +  '" "'  +
                            outPath + '" "' + Config["expos"] + '" "'+ stopWords + '" "' +
                            Config["extrawords"] + '" "' + Config["normalization"] + '" "' +
                            Config["actualtoks"] + '"',
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
     srv.wait()
     reply = srv.communicate()
     de = datetime.datetime.now()
     print(reply[0].decode())
     print("All process is done in %s" % (showTime(ds, de)))
Example #10
 def saveAdditions(self):
     self.resources["w2v"] = "yes"
     if not "indexer" in self.Config["resources"]:
         self.Config["resources"]["indexer"] = fullPath(
             self.Config, "indexerpath")
     self.resources["indexer"] = "yes"
     self.resources["handleType"] = "wordVectorsMatrix"
Example #11
 def __init__(self, Config):
     super().__init__(Config)
     if self.Config["w2vmodel"] == None:
         if len(Config["w2vmodelpath"]) == 0 or not os.path.isfile(
                 fullPath(Config, "w2vmodelpath")):
             print("Wrong path to W2V model. Stop.")
             Config["error"] = True
             return
     try:
         self.valSize = float(Config["valsize"])
     except ValueError:
         self.valSize = 0
     if self.valSize <= 0 or self.valSize >= 1:
         print("Wrong size of validation data set. Stop.")
         Config["error"] = True
         return
     try:
         self.ndim = int(self.Config["w2vdim"])
     except ValueError:
         print("Wrong size of vectors' dimentions. Stop.")
         Config["error"] = True
         return
     self.addValSet = True
     self.handleType = "wordVectorsSum"
     self.tempSave = Config["tempsave"] == "yes"
     self.useProbabilities = True
     self.w2vModel = None
     self.loadW2VModel()
     if Config["runfor"] != "crossvalidation":
         self.prepareData()
     self.launchProcess()
Example #12
 def trainSKLModel(self):
     ds = datetime.datetime.now()
     print("Start training...")
     self.model.fit(self.trainArrays, self.trainLabels)
     de = datetime.datetime.now()
     print("Model is trained in %s" % (showTime(ds, de)))
     if self.isCV:
         return
     joblib.dump(self.model, fullPath(self.Config, "modelpath", opt="name"))
     print("Model is saved in %s" %
           (fullPath(self.Config, "modelpath", opt="name")))
     print("Model evaluation...")
     ds = datetime.datetime.now()
     prediction = self.model.predict(self.testArrays)
     print('Final accuracy is %.2f' %
           (accuracy_score(self.testLabels, prediction)))
     de = datetime.datetime.now()
     print("Evaluated in %s" % (showTime(ds, de)))
Example #13
    def createW2VModel(self):
        sentences = []
        count = 0
        print ("Start to create W2V model...")
        print ("Get input data...")
        ds = datetime.datetime.now()
        with open(fullPath(self.Config, "w2vcorpuspath"), 'r', encoding='UTF-8') as f:
            for line in f:
                if len(line.strip()) == 0:
                    continue
                count += 1
                words = [w for w in line.strip().split()]
                sentences.append(words)
        f.close()
        de = datetime.datetime.now()
        print("Got %d lines from file %s in %s"% (count, fullPath(self.Config, "w2vcorpuspath"), showTime(ds, de)))
        numpy.random.shuffle(sentences)

        logger = EpochLogger(self.epochs)
        w2v = Word2Vec(size=self.ndim, window=10, min_count=3, workers=10)
        ds = datetime.datetime.now()
        print("Build vocabulary...")
        w2v.build_vocab(sentences)
        de = datetime.datetime.now()
        print("Vocabulary is built in %s" % (showTime(ds, de)))
        print("Train model...")
        ds = datetime.datetime.now()
        w2v.train(sentences, epochs=int(self.Config["w2vepochs"]), total_examples=len(sentences), callbacks=[logger])
        de = datetime.datetime.now()
        print("W2V model is completed in %s" % (showTime(ds, de)))

        modelPath = fullPath(self.Config, "w2vmodelpath")
        modelName = os.path.basename(modelPath)
        if self.Config["w2vtimeinname"] == "yes":
            dt = "-" + datetime.datetime.now().strftime("%Y-%b-%d-%H%M%S")
            pInd = modelName.rfind(".")
            if pInd > 0:
                modelName = modelName[:pInd] + dt + modelName[pInd:]
            else:
                modelName += dt
        finalPath = os.path.dirname(modelPath) + "/" + modelName
        ds = datetime.datetime.now()
        w2v.wv.save_word2vec_format(finalPath, binary=False)
        de = datetime.datetime.now()
        print("W2V model %s is saved in the text format in %s\n" % (finalPath, showTime(ds, de)))
Example #14
 def trainNNModel(self):
     checkpoints = []
     if self.tempSave and not self.isCV:
         checkpoint = ModelCheckpoint(fullPath(self.Config, "temppath") +
                                      "/tempModel.hdf5",
                                      monitor='val_acc',
                                      verbose=self.verbose,
                                      save_best_only=True,
                                      mode='auto')
         checkpoints.append(checkpoint)
     print("Start training...              ")
     ds = datetime.datetime.now()
     self.model.fit(self.trainArrays,
                    self.trainLabels,
                    epochs=self.epochs,
                    validation_data=(self.valArrays, self.valLabels),
                    batch_size=self.trainBatch,
                    verbose=self.verbose,
                    callbacks=checkpoints,
                    shuffle=False)
     de = datetime.datetime.now()
     print("Model is trained in %s" % (showTime(ds, de)))
     if self.isCV:
         return
     self.model.save(fullPath(self.Config, "modelpath", opt="name"))
     print("Model evaluation...")
     scores1 = self.model.evaluate(self.testArrays,
                                   self.testLabels,
                                   verbose=self.verbose)
     print("Final model accuracy: %.2f%%" % (scores1[1] * 100))
     if self.tempSave:
         model1 = load_model(
             fullPath(self.Config, "temppath") + "/tempModel.hdf5")
         scores2 = model1.evaluate(self.testArrays,
                                   self.testLabels,
                                   verbose=self.verbose)
         print("Last saved model accuracy: %.2f%%" % (scores2[1] * 100))
         if scores1[1] < scores2[1]:
             self.model = model1
         pref = "The best model "
     else:
         pref = "Model "
     self.model.save(fullPath(self.Config, "modelpath", opt="name"))
     print(pref + "is saved in %s" %
           (fullPath(self.Config, "modelpath", opt="name")))
Example #15
    def saveReports(self):
        print("Save report...")
        report = Report()
        report.requestId = self.Config["reqid"]
        report.sourcesPath = self.Config["actualpath"]
        report.datasetPath = self.Config["testpath"]

        tokOpts = [
            "actualtoks", "normalization", "stopwords", "expos", "extrawords",
            "excats"
        ]
        for opt in tokOpts:
            report.preprocess[opt] = self.Config[opt]
        for doc in self.Config["testdocs"]:
            report.docs[doc.name] = {}
            report.docs[doc.name]["actual"] = ",".join(doc.nlabs)
        if len(self.Config["excats"]) == 0:
            exCats = []
        else:
            exCats = self.Config["excats"].split(",")
        cNames = [''] * (len(self.Config["cats"]) - len(exCats))
        for k, v in self.Config["cats"].items():
            if k not in exCats:
                cNames[v] = k
        report.categories = cNames
        for key, val in self.Config["results"].items():
            for i in range(len(val)):
                labs = []
                addLabs = []
                for j in range(self.qLabs):
                    #if val[i][j] >= self.rankThreshold:
                    if val[i][j] >= self.Config["ranks"][key]:
                        labs.append("%s[%.2f]" % (cNames[j], val[i][j]))
                    else:
                        addLabs.append("%s[%.2f]" % (cNames[j], val[i][j]))
                report.docs[self.Config["testdocs"][i].name][key] = ",".join(
                    labs) + " | " + ",".join(addLabs)
        for key, val in self.Config["metrics"].items():
            report.models[key] = val
        for key, val in self.Config["ranks"].items():
            report.ranks[key] = val
        if len(self.Config["results"]) > 1:
            for i in range(len(self.predictions)):
                labs = []
                for j in range(self.qLabs):
                    if self.predictions[i][j] == 1:
                        labs.append(cNames[j])
                report.docs[self.Config["testdocs"]
                            [i].name]["consolidated"] = ",".join(labs)
            report.models["consolidated"] = self.metrics
            report.ranks["consolidated"] = self.rankThreshold
        rPath = fullPath(self.Config,
                         "reportspath") + "/" + self.Config["reqid"] + ".json"
        with open(rPath, 'w', encoding="utf-8") as file:
            json.dump(report.toJSON(), file, indent=4)
Example #16
 def __init__(self, Config):
     super().__init__(Config)
     if len(Config["binarizerpath"]) == 0 or not os.path.isfile(fullPath(Config, "binarizerpath")):
         if Config["runfor"] == "test" or (len(Config["binarizerpath"]) != 0 and not os.path.isdir(
                 os.path.dirname(fullPath(Config, "binarizerpath")))):
             print ("Wrong path to binarizer. Stop.")
             Config["error"] = True
             return
     if len(Config["vectorizerpath"]) == 0 or not os.path.isfile(fullPath(Config, "vectorizerpath")):
         if Config["runfor"] == "test" or (len(Config["vectorizerpath"]) != 0 and not os.path.isdir(
                 os.path.dirname(fullPath(Config, "vectorizerpath")))):
             print ("Wrong path to vectorizer. Stop.")
             Config["error"] = True
             return
     self.useProbabilities = False
     self.handleType = "vectorize"
     if Config["runfor"] != "crossvalidation":
         self.prepareData()
     self.launchProcess()
Example #17
 def __init__(self, Config):
     super().__init__(Config)
     self.Config = Config
     self.useProbabilities = True
     self.maxBertSeqLength = 512
     self.device = 'cpu'
     self.n_gpu = torch.cuda.device_count()
     self.model_to_save = None
     if len(Config["bertpath"]) == 0 or not os.path.isfile(
             fullPath(Config, "bertpath")):
         print("Wrong path to archive with pre-trained BERT model. Stop.")
         Config["error"] = True
         return
     if len(Config["bertoutpath"]) == 0 or not os.path.isdir(
             fullPath(Config, "bertoutpath")):
         print("Wrong path to folder with resulting BERT files. Stop.")
         Config["error"] = True
         return
     self.args = Args(fullPath(self.Config, "bertpath"),
                      fullPath(self.Config,
                               "bertoutpath"))  # model: pytorch_ber.gz
     self.max_seq_length = min(self.maxBertSeqLength,
                               self.Config["maxseqlen"])
     if self.Config["runfor"] != "test":
         self.do_train = True
     if self.Config["runfor"] != "train":
         self.do_eval = True
     self.do_lower_case = False
     self.train_batch_size = min(self.trainBatch, 32)
     self.eval_batch_size = 8
     self.learning_rate = 5e-5
     self.num_train_epochs = self.epochs
     self.warmup_proportion = 0.1
     self.no_cuda = True
     self.local_rank = -1
     self.seed = 42
     self.gradient_accumulation_steps = 1
     self.keyTrain = "traindocs"
     self.keyTest = "testdocs"
     #self.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
     if self.Config["runfor"] != "crossvalidation":
         self.prepareData()
     self.launchProcess()
Example #18
def composeTsv(model, type):
    cNames = [''] * len(model.Config["cats"])
    for k, v in model.Config["cats"].items():
        cNames[v] = k
    if type == "train":
        bertPath = fullPath(model.Config, "bertoutpath", opt="/train.tsv")
        data = model.Config[model.keyTrain]
    else:
        bertPath = fullPath(model.Config, "bertoutpath", opt="/dev.tsv")
        data = model.Config[model.keyTest]
    with open(bertPath, "w", encoding="utf-8") as target:
        for i in range(len(data)):
            conts = data[i].lines.replace('\r', '').replace('\n', '.')
            nl = '\n' if i > 0 else ''
            target.write(nl + ",".join(data[i].nlabs) + "\t" + conts)
Example #19
 def __init__(self, Config):
     self.Config = Config
     if "testdocs" not in Config or len(Config["results"]) == 0:
         print("Documents have not been classified in this process chain.")
         print("Consolidation can't be performed.")
         return
     self.rankThreshold = 0.5
     if Config['consolidatedrank'] == "yes":
         try:
             self.rankThreshold = float(Config["consolidatedrankthreshold"])
         except ValueError:
             self.rankThreshold = 0.5
     self.testLabels = numpy.concatenate([
         numpy.array(x.labels).reshape(1, len(self.Config["cats"]))
         for x in self.Config["testdocs"]
     ])
     self.qLabs = len(self.Config["cats"])
     self.predictions = numpy.zeros([len(self.testLabels), self.qLabs])
     self.metrics = {}
     self.useProbabilities = False
     self.reports = False
     self.runtime = False
     print("\nCalculate consolidated metrics...")
     if len(self.Config["results"]) == 0:
         print("No results to consolidate them.")
         print("Consolidation can't be performed.")
         return
     if Config["reports"] == "yes":
         if len(Config["reportspath"]) == 0 or not os.path.isdir(
                 fullPath(Config, "reportspath")):
             print("Wrong path to the folder, containing reports.")
             print("Reports can't be created.")
         else:
             self.reports = True
     if Config["saveresources"] == "yes":
         if len(Config["resourcespath"]) == 0 or not os.path.isdir(
                 fullPath(Config, "resourcespath")):
             print("Wrong path to the folder containing resources for runtime.")
             print("Resources can't be saved.")
         else:
             self.runtime = True
     print("Rank threshold for consolidated results: %.2f" %
           (self.rankThreshold))
     if self.reports or self.Config["showresults"] == "yes":
         self.getConsolidatedResults()
         self.getMetrics()
         if self.reports:
             self.saveReports()
     if self.runtime:
         if len(os.listdir(fullPath(self.Config, "resourcespath"))) > 0:
             print(
                 "Warning: folder %s is not empty. All its content will be deleted."
                 % (fullPath(self.Config, "resourcespath")))
             shutil.rmtree(fullPath(self.Config, "resourcespath"))
             os.makedirs(fullPath(self.Config, "resourcespath"),
                         exist_ok=True)
         print("\nCollect arfifacts for runtime...")
         self.saveResources()
Example #20
 def saveResources(self, type):
     self.resources["id"] = str(self.Config["modelid"])
     self.resources["modelPath"] = fullPath(self.Config,
                                            "bertoutpath",
                                            opt="name")
     self.resources["modelType"] = type
     if not "ptBertModel" in self.Config["resources"]:
         self.Config["resources"]["ptBertModel"] = self.args.bert_model
         self.Config["resources"]["vocabPath"] = self.vocabPath
     self.resources["ptBertModel"] = "yes"
     self.resources["handleType"] = "bert"
     self.resources["rankThreshold"] = self.rankThreshold
     self.Config["resources"]["models"][
         "Model" + str(self.Config["modelid"])] = self.resources
Example #21
 def saveResources(self, type):
     self.resources["modelPath"] = fullPath(self.Config,
                                            "modelpath",
                                            opt="name")
     self.resources["modelType"] = type
     if self.useProbabilities:
         self.resources["rankThreshold"] = self.rankThreshold
     else:
         self.resources["rankThreshold"] = 1.0
     self.saveAdditions()
     if type == "skl":
         self.resources["handleType"] = "vectorize"
     self.Config["resources"]["models"][
         "Model" + str(self.Config["modelid"])] = self.resources
Example #22
def parseConfigInfo(path):
    parser = ConfigParser()
    with open(path) as f:
        parser.read_file(f)
    try:
        for section in parser.sections():
            for key, value in parser.items(section):
                Config[key] = value
        if not Config["home"]:
            Config["home"] = str(Path.home())
        if not Config["infofrom"]:
            Config["infofrom"] = "today"
        if Config["infofrom"] != "today":
            chk = Config["infofrom"].split()
            if len(chk) != 2 or not chk[1].startswith("day"):
                print("Wrong value of 'infofrom' option. Exit.")
                return
            try:
                days = int(chk[0])
            except ValueError:
                print("Wrong value of 'infofrom' option. Exit.")
                return
        if len(Config["reportspath"]) == 0 or not os.path.isdir(
                fullPath(Config, "reportspath")):
            print("Wrong path to the folder, containing reports. Exit.")
            return
        if len(Config["actualpath"]) == 0 or not os.path.isdir(
                fullPath(Config, "actualpath")):
            print(
                "Warning: wrong path to the folder, containing original documents."
            )
            print("It will not be possible to view this documents.")
    except Error:
        print("Config file's parsing error. Exit.")
        return
    InfoCreator(Config)
Example #23
 def __init__(self, Config):
     print("Start to create info...")
     self.Config = Config
     self.curDir = os.path.dirname(__file__)
     self.info = {}
     self.startId = "%d%0.2d%0.2d000000" % (
         date.today().year, date.today().month, date.today().day)
     if self.Config["infofrom"] != "today":
         arr = self.Config["infofrom"].split()
         prevDays = int(arr[0])
         startDay = date.today() - timedelta(days=prevDays)
         self.startId = "%d%0.2d%0.2d000000" % (
             startDay.year, startDay.month, startDay.day)
     self.path = fullPath(Config, "reportspath")
     os.chdir(self.path)
     for f in glob.glob("*"):
         resPath = self.path + "/" + f
         try:
             ind = f.rindex(".")
         except ValueError:
             ind = len(f)
         key = f[:ind]
         if (key < self.startId):
             continue
         with open(resPath, 'r', encoding='utf-8') as json_file:
             try:
                 self.info[key] = json.load(json_file)
             except json.JSONDecodeError:
                 print(
                     "Warning: file %s is not valid JSON. Skipped." %
                     (resPath))
     if len(self.info) == 0:
         print(
             "Folder %s doesn't contain reports created in the required date range. Exit."
             % (self.path))
         return
     self.html = ""
     self.qReqs = 0
     self.footer = "</table></body></html>"
     self.docsDict = self.getDocsDictionary()
     self.createHtml()
Example #24
 def launchCrossValidation(self):
     print("Start cross-validation...")
     ds = datetime.datetime.now()
     self.cvDocs = self.Config["traindocs"] + self.Config["testdocs"]
     random.shuffle(self.cvDocs)
     self.keyTrain = "cvtraindocs"
     self.keyTest = "cvtestdocs"
     pSize = len(self.cvDocs) // self.kfold
     ind = 0
     f1 = 0
     arrMetrics = []
     for i in range(self.kfold):
         print("Cross-validation, cycle %d from %d..." %
               ((i + 1), self.kfold))
         if i == 0:
             self.Config["cvtraindocs"] = self.cvDocs[pSize:]
             self.Config["cvtestdocs"] = self.cvDocs[:pSize]
         elif i == self.kfold - 1:
             self.Config["cvtraindocs"] = self.cvDocs[:ind]
             self.Config["cvtestdocs"] = self.cvDocs[ind:]
         else:
             self.Config["cvtraindocs"] = self.cvDocs[:ind] + self.cvDocs[
                 ind + pSize:]
             self.Config["cvtestdocs"] = self.cvDocs[ind:ind + pSize]
         ind += pSize
         self.prepareData()
         self.model = self.createModel()
         self.trainModel()
         self.testModel()
         arrMetrics.append(self.metrics)
         cycleF1 = self.metrics["all"]["f1"]
         print("Resulting F1-Measure: %f\n" % (cycleF1))
         if cycleF1 > f1:
             if self.Config["cvsave"]:
                 self.saveDataSets()
             f1 = cycleF1
     de = datetime.datetime.now()
     print("Cross-validation is done in %s" % (showTime(ds, de)))
     printAveragedMetrics(arrMetrics, self.Config)
     print("The best result is %f" % (f1))
     print("Corresponding data sets are saved in the folder %s" %
           (fullPath(self.Config, "cvpath")))
Example #25
 def __init__(self, Config, DefConfig, kwargs):
     Config["modelid"] += 1
     print("=== Model " + str(Config["modelid"]) + " ===")
     updateParams(Config, DefConfig, kwargs)
     Config["type"] = Config["type"].lower()
     Config["runfor"] = Config["runfor"].lower()
     if Config["runfor"] != "none" and Config["type"] not in modelTypes:
         print(
             "Request contains definition of model with wrong type. Stop.")
         Config["error"] = True
         return
     if Config["runfor"] not in modelGoals:
         print("Request doesn't define the goal of the model process")
         print(
             "It should be one of 'trainAndTest', 'train', 'test', 'crossValidation' or 'none'. Stop."
         )
         Config["error"] = True
         return
     if Config["runfor"] != "none":
         print("Model type: " + Config["type"].upper() + ", " +
               userInfo[Config["runfor"]])
     else:
         print("Model : " + userInfo[Config["runfor"]])
     if Config["runfor"] == "none":
         return
     self.Config = Config
     self.DefConfig = DefConfig
     if "cats" not in Config or "traindocs" not in Config or "testdocs" not in Config:
         print("Input data isn't loaded. Stop.")
         Config["error"] = True
         return
     stop = False
     try:
         self.testSize = float(Config["testsize"])
     except ValueError:
         self.testSize = -1
     if len(Config["trainpath"]) == 0 or not os.path.isdir(
             fullPath(Config, "trainpath")):
         if Config["runfor"] != "test" or len(Config["testpath"]) == 0:
             print(
                 "Wrong path to the training set: folder %s doesn't exist."
                 % (fullPath(Config, "trainpath")))
             stop = True
     if len(Config["testpath"]) == 0 or not os.path.isdir(
             fullPath(Config, "testpath")):
         if not (len(Config["testpath"]) == 0 and self.testSize > 0
                 and self.testSize < 1):
             print(
                 "Wrong path to the testing set: folder %d doesn't exist." %
                 (fullPath(Config, "testpath")))
             stop = True
     if len(Config["modelpath"]) == 0 or not os.path.isdir(
             fullPath(Config, "modelpath")):
         print("Wrong path to the models' folder.")
         stop = True
     if len(Config["name"]) == 0:
         Config["name"] = Config["type"] + str(Config["modelid"])
     mPath = fullPath(Config, "modelpath", opt="name")
     if Config["runfor"] == "test" and not os.path.isfile(mPath):
         print("Wrong path to the tested model.")
         stop = True
     if Config["runfor"] != "test":
         try:
             self.epochs = int(Config["epochs"])
         except ValueError:
             print("Wrong quantity of epochs for training.")
             stop = True
         try:
             self.trainBatch = int(Config["trainbatch"])
         except ValueError:
             print("Wrong batch size for training.")
             stop = True
         try:
             self.verbose = int(Config["verbose"])
         except ValueError:
             print("Wrong value of 'verbose' flag for training.")
             stop = True
         if Config["tempsave"] == "yes":
             if len(Config["temppath"]) == 0 or not os.path.isdir(
                     fullPath(Config, "temppath")):
                 print("Wrong path to folder with intermediate results.")
                 stop = True
     if Config["runfor"] != "train" and Config["customrank"] == "yes":
         try:
             self.rankThreshold = float(Config["rankthreshold"])
         except ValueError:
             print("Wrong custom rank threshold.")
             stop = True
     if Config["runfor"] == "crossvalidation":
         if Config["cvsave"] == "yes":
             if len(Config["cvpath"]) == 0 or not os.path.isdir(
                     fullPath(Config, "cvpath")):
                 print("Wrong path to the cross-validation results folder.")
                 stop = True
         try:
             kfold = int(Config["kfold"])
         except ValueError:
             print("Wrong k-fold value.")
             stop = True
     if stop:
         print("Stop.")
         Config["error"] = True
         return
     if Config["type"].lower() == "snn":
         SnnModel(Config)
     elif Config["type"].lower() == "ltsm":
         LTSMModel(Config)
     elif Config["type"].lower() == "cnn":
         CNNModel(Config)
     elif Config["type"].lower() == "pac":
         PacModel(Config)
     elif Config["type"].lower() == "ridge":
         RidgeModel(Config)
     elif Config["type"].lower() == "svc":
         SVCModel(Config)
     elif Config["type"] == "perceptron":
         PerceptronModel(Config)
     elif Config["type"] == "sgd":
         SGDModel(Config)
     elif Config["type"] == "bert":
         BertModel(Config)
Example #26
    def __init__(self, Config, DefConfig, kwargs):
        print("=== Loading data ===")
        updateParams(Config, DefConfig, kwargs)
        self.Config = Config
        self.DefConfig = DefConfig
        self.exCats = Config["excats"].split(",")
        self.sz = 0
        self.splitTrain = False
        self.topBound = 0.9
        self.charsTopBound = 0.6

        if len(Config["trainpath"]) == 0 or not os.path.isdir(
                fullPath(Config, "trainpath")):
            print("Wrong path to training set. Data can't be loaded.")
            Config["error"] = True
            return
        if len(Config["testpath"]) > 0 and not os.path.isdir(
                fullPath(Config, "testpath")):
            print("Wrong path to testing set. Data can't be loaded.")
            Config["error"] = True
            return
        elif len(Config["testpath"]) == 0:
            self.splitTrain = True
            try:
                self.sz = float(Config["testsize"])
            except ValueError:
                self.sz = 0
            if len(Config["testpath"]) == 0 and (self.sz <= 0 or self.sz >= 1):
                print("Wrong size of testing set. Data can't be loaded.")
                Config["error"] = True
                return
        if Config["datatoks"] == "yes":
            if Config["actualtoks"] == "yes":
                taggerPath = fullPath(Config, 'rttaggerpath')
                if (self.Config["rttaggerpath"] == 0
                        or not os.path.exists(taggerPath)):
                    print(
                        "Wrong path to the tagger's jar. Preprocessing can't be done"
                    )
                    Config["error"] = True
                    return
                self.jar = subprocess.Popen('java -Xmx2g -jar ' + taggerPath +
                                            ' "' + self.Config["expos"] + '"',
                                            stdin=subprocess.PIPE,
                                            stdout=subprocess.PIPE,
                                            stderr=subprocess.PIPE,
                                            shell=True,
                                            encoding="utf-8")
            if self.Config["stopwords"] == "yes":
                self.stopWords = set(nltk.corpus.stopwords.words('arabic'))
            else:
                self.stopWords = set()
            if self.Config["normalization"] == "yes":
                self.normalizer = ArabicNormalizer()
        if Config["w2vload"] == "yes":
            if len(Config["w2vmodelpath"]) == 0 or not os.path.isfile(
                    fullPath(Config, "w2vmodelpath")):
                print("Wrong path to W2V model. Stop.")
                Config["error"] = True
                return
            try:
                self.ndim = int(self.Config["w2vdim"])
            except ValueError:
                print("Wrong size of vectors' dimentions. Stop.")
                Config["error"] = True
                return
            self.Config["resources"]["w2v"]["modelPath"] = fullPath(
                Config, "w2vmodelpath")
            self.Config["resources"]["w2v"]["ndim"] = self.ndim
            self.loadW2VModel()
        else:
            self.Config["w2vmodel"] = None

        self.loadData()
        if Config["analysis"] == "yes":
            self.analysis()
Example #27
 def saveResources(self):
     tokOpts = [
         "actualtoks", "normalization", "stopwords", "expos", "extrawords",
         "maxseqlen", "maxcharsseqlen", "rttaggerpath"
     ]
     self.Config["resources"]["tokenization"] = {}
     ds = datetime.datetime.now()
     self.outDir = fullPath(self.Config, "resourcespath") + "/"
     for opt in tokOpts:
         if opt != "rttaggerpath":
             self.Config["resources"]["tokenization"][opt] = self.Config[opt]
         elif self.Config["actualtoks"] == "yes":
             self.Config["resources"]["tokenization"]["rttaggerpath"] = \
                 self.copyFile(fullPath(self.Config, "rttaggerpath"))
     isW2VNeeded = False
     for key, val in self.Config["resources"]["models"].items():
         val["modelPath"] = self.copyFile(val["modelPath"])
         if "w2v" in val and val["w2v"] == "yes":
             isW2VNeeded = True
     if not isW2VNeeded and "w2v" in self.Config["resources"]:
         self.Config["resources"].pop("w2v", None)
     if "w2v" in self.Config["resources"]:
         w2vDict = {}
         isFirstLine = True
         with open(self.Config["resources"]["w2v"]["modelPath"],
                   encoding="utf-8") as fEmbeddings:
             for line in fEmbeddings:
                 if isFirstLine:
                     isFirstLine = False
                     continue
                 split = line.strip().split(" ")
                 word = split[0]
                 vector = numpy.array([float(num) for num in split[1:]])
                 w2vDict[word] = vector
         with open(self.Config["resources"]["w2v"]["modelPath"] + '.pkl',
                   'wb') as file:
             pickle.dump(w2vDict, file, pickle.HIGHEST_PROTOCOL)
         self.Config["resources"]["w2v"]["modelPath"] = self.copyFile(
             self.Config["resources"]["w2v"]["modelPath"] + '.pkl')
     if "indexer" in self.Config["resources"]:
         self.Config["resources"]["indexer"] = self.copyFile(
             self.Config["resources"]["indexer"])
     if "vectorizer" in self.Config["resources"]:
         self.Config["resources"]["vectorizer"] = self.copyFile(
             self.Config["resources"]["vectorizer"])
     if "ptBertModel" in self.Config["resources"]:
         self.Config["resources"]["ptBertModel"] = self.copyFile(
             self.Config["resources"]["ptBertModel"])
         self.Config["resources"]["vocabPath"] = self.copyFile(
             self.Config["resources"]["vocabPath"])
     cNames = [''] * len(self.Config["cats"])
     for k, v in self.Config["cats"].items():
         cNames[v] = k
     with open(self.outDir + 'labels.txt', 'w', encoding="utf-8") as file:
         file.write(",".join(cNames))
     self.Config["resources"]["labels"] = "labels.txt"
     self.Config["resources"]["consolidatedRank"] = self.rankThreshold
     with open(self.outDir + 'config.json', 'w', encoding="utf-8") as file:
         json.dump(self.Config["resources"], file, indent=4)
     de = datetime.datetime.now()
     print("\nArtifacts are copied into the folder %s in %s" %
           (fullPath(self.Config, "resourcespath"), showTime(ds, de)))
Example #28
 def loadSKLModel(self):
     return joblib.load(fullPath(self.Config, "modelpath", opt="name"))
Example #29
 def loadNNModel(self):
     return load_model(fullPath(self.Config, "modelpath", opt="name"))
Example #30
 def saveAdditions(self):
     if not "vectorizer" in self.Config["resources"]:
         self.Config["resources"]["vectorizer"] = fullPath(
             self.Config, "vectorizerpath")
     self.resources["vectorizer"] = "yes"