def mainTrainModel():
    """Run the full training pipeline from the raw balanced-15000 ticket files.

    Loads data, preprocesses it, builds (and saves) a vocabulary, converts
    tickets to index sequences and targets to one-hot vectors, filters OOV
    sequences, persists the train/test split, and launches model training.
    """
    print("============ Start =============\n")

    print("1 - Load Configuration\n")
    cfg = cg.Config()
    loader = dt.Data(cfg)

    print("2 - Load Data and Targets\n")
    map_labels = loader.loadMapFromJson(cfg.data_path + "map_labels.json")
    tickets = loader.loadDataInArray(
        cfg.data_path + "tickets_balanced_15000.txt", cfg.csv_encoding)
    targets = loader.loadDataInArray(
        cfg.data_path + "target_balanced_15000.txt")
    labels = loader.getfirstLevelTargets(map_labels['map'])

    print("3 - Preprocess Data\n")
    tickets, targets = ut.removeIdenticalTickets(tickets, targets)
    lowered_tickets, targets, words = preprocessData(tickets, targets, labels)

    print("4 - Build Vocabulary\n")
    # Build the vocabulary from the preprocessed words and persist it.
    vocab = vc.Vocabulary(cfg)
    dictionary, reverse_dict = vocab.build_dictionary(words, labels)
    vocab.saveDictionary(dictionary, "vocabulary")

    print("5 - Create Ticket Sequences and Targets Hot Vectors\n")
    # Index sequences for the tickets, one-hot vectors for the targets.
    sequences = loader.createDataSequence(lowered_tickets, dictionary)
    target_vectors = loader.transformInOneHotVector(labels, targets)

    print("6 - Filter Data - Removeing Token OOV\n")
    data_filter = fd.FilterData(cfg, labels)
    sequences, target_vectors, trash = data_filter.removeTokenOOV(
        sequences, target_vectors, dictionary)
    print(" *** Class Trash len : " + str(len(trash)))

    print("7 - Generate Training and Testing Dataset\n")
    X_train, X_test, y_train, y_test = ut.get_train_and_test(
        sequences, target_vectors, test_size=0.2)
    loader.writeArrayStringInFile(
        X_train, 'parsed_sequences_15000/tickets_training.txt', "utf-8")
    loader.writeArrayStringInFile(
        X_test, 'parsed_sequences_15000/tickets_test.txt', "utf-8")
    loader.writeArrayStringInFile(
        y_train, 'parsed_sequences_15000/target_training.txt', "utf-8")
    loader.writeArrayStringInFile(
        y_test, 'parsed_sequences_15000/target_test.txt', "utf-8")
    print(" *** Training Size : " + str(len(X_train)) + "\n")

    if cfg.use_pretrained_embs:
        # Optionally seed the model with a pretrained skipgram embedding matrix.
        print(" *** Uso pretrained Words Embedding\n")
        skipgram = sk.SkipgramModel(cfg)
        skipgram_model = skipgram.get_skipgram()
        cfg.skipgramEmbedding = skipgram.getCustomEmbeddingMatrix(
            skipgram_model, reverse_dict)

    print("8 - Start Training\n")
    ml.runTraining(cfg, X_train, y_train, labels)
    print("============ End =============\n")
def mainTrainModelOnPreprocessedData():
    """Train the model from already-preprocessed sequence files.

    Loads stored integer-sequence tickets and their targets, parses each
    ticket line (text like "[1, 2, 3]") back into a list of ints, reloads
    the saved vocabulary, and starts training.
    """
    print("============ Start =============\n")

    print("1 - Load Configuration\n")
    config = cg.Config()
    dataL = dt.Data(config)

    print("2 - Load Data and Targets Sequences\n")
    map_labels = dataL.loadMapFromJson(config.data_path + "map_labels.json")
    tickets = dataL.loadDataInArray(
        config.main_path + "parsed_sequences_b/tickets_training.txt",
        config.csv_encoding)
    targets = dataL.loadDataInArray(
        config.main_path + "parsed_sequences_b/target_training.txt")
    labels = dataL.getfirstLevelTargets(map_labels['map'])
    oneHotVectorTarget = dataL.transformListStringInOneHot(targets)
    print("*** Training Size : " + str(len(tickets)) + "\n")

    # Each stored line looks like "[1, 2, 3]". Take the text between the
    # first "[" and the following "]" and parse the comma-separated ints.
    # (Plain str.split replaces the previous re.split calls, which used
    # non-raw "\[" / "\]" patterns — invalid escape sequences that raise
    # SyntaxWarning on Python 3.12+.)
    tickets_parsed = []
    for t in tickets:
        inner = t.split("[", 1)[1].split("]", 1)[0]
        tickets_parsed.append([int(tok) for tok in inner.split(",")])

    print("3 - Load Vocabulary\n")
    voc = vc.Vocabulary(config)
    dictionary = voc.loadDictionary("vocabulary")
    reverse_dict = voc.getReverseDictionary(dictionary)
    print("*** Training Size : " + str(len(tickets)) + "\n")

    if config.use_pretrained_embs:
        # Optionally seed the model with a pretrained skipgram embedding matrix.
        print("*** Uso pretrained Words Embedding\n")
        skip = sk.SkipgramModel(config)
        skipgramModel = skip.get_skipgram()
        skipgramEmbedding = skip.getCustomEmbeddingMatrix(
            skipgramModel, reverse_dict)
        config.skipgramEmbedding = skipgramEmbedding

    print("4 - Start Training\n")
    ml.runTraining(config, tickets_parsed, oneHotVectorTarget, labels)
    print("============ End =============\n")
def trainPriority():
    """Train the priority classifier (labels "1".."5") from the
    onlyAperturaPriority dataset, building or loading the vocabulary as
    dictated by ``config.loadOrbuild_dictionary``.
    """
    print("============ Start =============\n")

    print("1 - Load Configuration\n")
    cfg = cg.Config()
    cfg.configFromFile("config/priority_config.json")
    loader = dt.Data(cfg)

    print("2 - Load Data and Targets\n")
    base = cfg.main_path + "onlyAperturaPriority/"
    train_tickets = loader.loadDataInArray(
        base + "tickets_training.txt", cfg.csv_encoding)
    test_tickets = loader.loadDataInArray(
        base + "tickets_test.txt", cfg.csv_encoding)
    train_targets = loader.loadDataInArray(
        base + "targets_training.txt", cfg.csv_encoding)
    test_targets = loader.loadDataInArray(
        base + "targets_test.txt", cfg.csv_encoding)
    labels = ["1", "2", "3", "4", "5"]

    print("3 - Preprocess Data\n")
    train_lowered, train_targets, words = preprocessData(
        train_tickets, train_targets, labels)
    test_lowered, test_targets, _unused_words = preprocessData(
        test_tickets, test_targets, labels)

    vocab = vc.Vocabulary(cfg)
    if cfg.loadOrbuild_dictionary == "build":
        print("4 - Build Vocabulary\n")
        # Build a fresh vocabulary from the training words and persist it.
        dictionary, reverse_dict = vocab.build_dictionary(words, labels)
        vocab.saveDictionary(dictionary, "vocabulary")
        print("*** Vocabulary saved \n")
    else:
        print("4 - Load Vocabulary\n")
        # Reuse the previously saved vocabulary.
        dictionary = vocab.loadDictionary("vocabulary")
        reverse_dict = vocab.getReverseDictionary(dictionary)

    print("5 - Create Ticket Sequences and Targets Hot Vectors\n")
    # Index sequences for the tickets, one-hot vectors for the targets.
    train_sequences = loader.createDataSequence(train_lowered, dictionary)
    train_target_vectors = loader.transformInOneHotVector(
        labels, train_targets)
    test_sequences = loader.createDataSequence(test_lowered, dictionary)
    test_target_vectors = loader.transformInOneHotVector(labels, test_targets)

    print("6 - Filter Data - Removeing Token OOV\n")
    data_filter = fd.FilterData(cfg, labels)
    train_sequences, train_target_vectors, trash = data_filter.removeTokenOOV(
        train_sequences, train_target_vectors, dictionary)
    print(" *** Classe Cestino in Training : " + str(len(trash)) + "\n")
    test_sequences, test_target_vectors, trash = data_filter.removeTokenOOV(
        test_sequences, test_target_vectors, dictionary)
    print(" *** Classe Cestino in Test : " + str(len(trash)) + "\n")

    print("7 - Generate Training and Testing Dataset\n")
    loader.writeArrayInFileCompleteDataPath(
        train_sequences,
        cfg.data_sequences_path + '/tickets_training.txt', "utf-8")
    loader.writeArrayInFileCompleteDataPath(
        test_sequences,
        cfg.data_sequences_path + '/tickets_test.txt', "utf-8")
    loader.writeArrayInFileCompleteDataPath(
        train_target_vectors,
        cfg.data_sequences_path + '/target_training.txt', "utf-8")
    loader.writeArrayInFileCompleteDataPath(
        test_target_vectors,
        cfg.data_sequences_path + '/target_test.txt', "utf-8")
    print(" *** Training Size : " + str(len(train_sequences)) + "\n")
    print(" *** Test Size : " + str(len(test_sequences)) + "\n")

    if cfg.use_pretrained_embs:
        # Optionally seed the model with a pretrained skipgram embedding matrix.
        print(" *** Use pretrained Words Embedding\n")
        skipgram = sk.SkipgramModel(cfg)
        skipgram_model = skipgram.get_skipgram()
        cfg.skipgramEmbedding = skipgram.getCustomEmbeddingMatrix(
            skipgram_model, reverse_dict)

    print("8 - Start Training\n")
    ml.runTraining(cfg, train_sequences, train_target_vectors, labels)
    print("============ End =============\n")
def training_model(main_path, type, config_file, from_date, to_date, customer):
    """Fetch newly closed tickets from the API, merge them with the stored
    dataset, rebuild the vocabulary, and retrain the classification model.

    Args:
        main_path: Root path for the customer's data files.
        type: Ticket/classification type passed through to gatherData.
        config_file: Path to the model configuration JSON.
        from_date: Lower bound (closedfrom) for the ticket query.
        to_date: Upper bound (closedto) for the ticket query.
        customer: Customer name used to locate the connector config.

    All failures are caught at this boundary, printed, and logged via
    QIUserLogger.error.
    """
    QIUserLogger.info(
        "-----------------------------------------------------------------")
    QIUserLogger.info(
        "------------------------Training Start---------------------------")
    QIUserLogger.info("** Initialization start... **")
    QIUserLogger.info(" MainPath - " + str(main_path))
    QIUserLogger.info(" Type - " + str(type))
    QIUserLogger.info(" ConfigFile - " + str(config_file))
    QIUserLogger.info(" FromDate - " + str(from_date))
    QIUserLogger.info(" ToDate - " + str(to_date))
    QIUserLogger.info("** Initialization End **")
    try:
        QIUserLogger.info("1 - Load Configurations")
        QIUserLogger.info(" ** Config for Classification")
        # Load the model configuration and point it at this customer's data.
        configModel = cg.Config()
        configModel.configFromFile(config_file)
        configModel.main_path = main_path
        configModel.updateDataOfMainPath(config_file, main_path)
        dataL = dt.Data(configModel)

        QIUserLogger.info("2 - Login In API")
        configConnection = con.ConfigConnection()
        dir_path = os.path.dirname(os.path.realpath(__file__))
        configConnection.configFromFile(
            dir_path + "/config/" + customer + "/connector_config.json")
        connector = con.Connector(configConnection)
        # Persistent session so login cookies carry over to later calls.
        Reqsess = requests.session()
        connector.login(Reqsess)

        QIUserLogger.info("3 - GET TICKETS FROM API")
        params = "closedfrom=" + str(from_date) + "&closedto=" + str(
            to_date) + "&maxnum=" + str(configConnection.max_tickets_to_get)
        responseTicket = connector.getTickets(Reqsess, params)

        if len(responseTicket) > 0:
            # Invert the label map (label -> id becomes id -> label).
            id2lab = dict(
                zip(configModel.labels_map.values(),
                    configModel.labels_map.keys()))
            gather_tickets, gather_targets = gatherData(
                type, responseTicket, configModel, id2lab)

            QIUserLogger.info("4 - REMOVE STOP WORDS FROM NEW TICKETS")
            tok = tk.Tokenizer(gather_tickets)
            tok.tokenizeTickets()
            tickets_to_lower = tok.toLower()
            gather_tickets, gather_targets = tok.removeStopWordsToString(
                tickets_to_lower, gather_targets)

            QIUserLogger.info("5 - GET STORED DATA TICKETS")
            tickets_train = dataL.loadDataInArray(
                configModel.data_path + "/tickets.txt",
                configModel.csv_encoding)
            targets_train = dataL.loadDataInArray(configModel.data_path +
                                                  "/targets.txt")

            QIUserLogger.info("6 - MERGE THE DATA - STORED AND GATHERED")
            # Retrain on the whole dataset rather than transfer-learning,
            # so the vocabulary is always rebuilt up to date.
            max_length = configModel.max_num_tickets
            len_gather_tickets = len(gather_tickets)
            len_tickets = len(tickets_train)
            tickets = tickets_train + gather_tickets
            targets = targets_train + gather_targets
            reached_dim = len_gather_tickets + len_tickets
            if reached_dim > max_length:
                # Drop the oldest entries to stay within max_num_tickets.
                # BUGFIX: the old code assigned tickets[...] to the targets
                # variable and vice versa, swapping the two lists whenever
                # the cap was exceeded.
                elem_to_cut = reached_dim - max_length
                tickets = tickets[elem_to_cut:]
                targets = targets[elem_to_cut:]
                reached_dim = max_length

            QIUserLogger.info("7 - REMOVE IDENTICAL TICKETS")
            tickets, targets = ut.removeIdenticalTicketsFromNew(
                tickets, targets, len_tickets, reached_dim)

            QIUserLogger.info("8 - SAVING MERGED DATA")
            dataL.writeArrayInFileCompleteDataPath(
                tickets, configModel.data_path + '/tickets.txt', "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                targets, configModel.data_path + '/targets.txt', "utf-8")

            QIUserLogger.info("9 - EXTRACT WORDS FROM TICKETS")
            words = tok.extractWordsTicketString(tickets)

            QIUserLogger.info("10 - BUILD NEW VOCABULARY")
            voc = vc.Vocabulary(configModel)
            dictionary, reverse_dict = voc.build_dictionary(
                words, configModel.labels)
            voc.saveDictionary(dictionary, "vocabulary")
            QIUserLogger.info("*** Vocabulary saved")

            QIUserLogger.info("11 -- SPLIT DATA IN TRAINING AND TEST DATASET")
            tickets_training, tickets_test, Target_training, Target_test = \
                ut.get_train_and_test(tickets, targets)
            dataL.writeArrayInFileCompleteDataPath(
                tickets_training,
                configModel.data_path + '/tickets_training.txt', "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                Target_training,
                configModel.data_path + '/targets_training.txt', "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                tickets_test,
                configModel.data_path + '/tickets_test.txt', "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                Target_test,
                configModel.data_path + '/targets_test.txt', "utf-8")

            QIUserLogger.info("12 - CREATE TICKETS AND TARGETS SEQUENCES")
            # Index sequences for the tickets, one-hot vectors for the targets.
            tickets_training_sequences = dataL.createDataSequenceTicketsString(
                tickets_training, dictionary)
            oneHotVectorTarget_training = dataL.transformInOneHotVector(
                configModel.labels, Target_training)

            QIUserLogger.info("13 - FILTER OUT DATA - Removing Token OOV")
            filtdata = fd.FilterData(configModel, configModel.labels)
            tickets_training_sequences, oneHotVectorTarget_training, trash = \
                filtdata.removeTokenOOV(tickets_training_sequences,
                                        oneHotVectorTarget_training,
                                        dictionary)
            QIUserLogger.info(" *** Classe Cestino in Training : " +
                              str(len(trash)))

            QIUserLogger.info("14 - SAVING TRAINING SEQUENCES")
            dataL.writeArrayInFileCompleteDataPath(
                tickets_training_sequences,
                configModel.data_sequences_path + '/tickets_training.txt',
                "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                oneHotVectorTarget_training,
                configModel.data_sequences_path + '/target_training.txt',
                "utf-8")
            QIUserLogger.info(" *** Training Size : " +
                              str(len(tickets_training_sequences)) + "\n")

            if configModel.use_pretrained_embs:
                # Seed the model with a pretrained skipgram embedding matrix.
                QIUserLogger.info(" *** Use pretrained Words Embedding")
                skip = sk.SkipgramModel(configModel)
                skipgramModel = skip.get_skipgram()
                skipgramEmbedding = skip.getCustomEmbeddingMatrix(
                    skipgramModel, reverse_dict)
                configModel.skipgramEmbedding = skipgramEmbedding

            QIUserLogger.info("15 - START TRAINING")
            ml.runTraining(configModel, tickets_training_sequences,
                           oneHotVectorTarget_training, configModel.labels)
            QIUserLogger.info("============ End =============")
        else:
            QIUserLogger.info(
                "No New Tickets found. There is no need of a new training.")

        # Log out and release the API session.
        connector.logout(Reqsess)
    except Exception as e:
        # Top-level boundary: report and log, never propagate.
        print(str(e))
        QIUserLogger.error("Error in training_model " + str(e))