def copy_file_to_temp(filename: str) -> Path:
    """
    Copies a file from the files folder to the temp folder.
    A fast and reliable way to copy files internally.

    :param filename: The actual filename. The file must be present in the files folder!
    :return: The path of the copied file in the temp folder (None if the source file was not found)
    """
    try:
        shutil.copyfile(file_folder.joinpath(filename), temp_folder.joinpath(filename))
        return temp_folder.joinpath(filename)
    except FileNotFoundError as err:
        print(err)
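# Hedged usage sketch: 'example.csv' is a hypothetical filename. The file must already exist
# in file_folder for the copy to succeed; otherwise copy_file_to_temp prints the error and
# implicitly returns None.
def _example_copy_file_to_temp():
    tmp_path = copy_file_to_temp('example.csv')
    if tmp_path is not None:
        print('Copied to:', tmp_path)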
def load_models(self, model_name: str = None):
    model_path: Path = file_folder.joinpath('models')
    training_files = list(model_path.glob('*_tf_training_data'))
    intents_files = list(model_path.glob('*_intents.json'))
    if model_name is None:
        # No name given: index every training data and intents file found in the models folder
        for file in training_files:
            model_name = file.name[:-len('_tf_training_data')]
            self._models['training_data'][model_name] = file.absolute()
        for file in intents_files:
            model_name = file.name[:-len('_intents.json')]
            self._models['intents'][model_name] = file.absolute()
    else:
        # Only pick up the files that belong to the requested model
        for file in training_files:
            if model_name in file.name:
                self._models['training_data'][model_name] = file.absolute()
        for file in intents_files:
            if model_name in file.name:
                self._models['intents'][model_name] = file.absolute()
    return self
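# Usage sketch (hypothetical; `classifier` stands in for an instance of the enclosing class,
# which is not shown here, with `_models` initialised as {'training_data': {}, 'intents': {}}):
#
#   classifier.load_models()                  # index every *_tf_training_data / *_intents.json file
#   classifier.load_models(model_name='vw')   # or restrict the lookup to one model name
#   classifier._models['intents']             # e.g. {'vw': PosixPath('.../models/vw_intents.json')}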
def load_tflearn_model(self, activation='softmax'):
    # Only build and load the model if it hasn't been loaded yet
    if self.tflearn_model is None:
        try:
            # Reset underlying graph data
            tf.reset_default_graph()
            # Rebuild the same network topology that was used for training
            net = tflearn.input_data(shape=[None, len(self.train_x[0])])
            net = tflearn.fully_connected(net, 8)
            net = tflearn.fully_connected(net, 8)
            net = tflearn.fully_connected(net, len(self.train_y[0]), activation)
            net = tflearn.regression(net)
            self.tflearn_model: DNN = tflearn.DNN(net)
            tflearn_model_path = file_folder.joinpath('models')
            os.chdir(tflearn_model_path)
            self.tflearn_model.load('{}_model.tflearn'.format(self.brand))
        except Exception as err:
            print("Couldn't load tflearn model")
            print(err)
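# Usage sketch (hypothetical; assumes the instance's train_x/train_y were already restored from
# the pickled *_tf_training_data file and that <brand>_model.tflearn exists in the models folder):
#
#   classifier.load_tflearn_model()            # rebuilds the network and loads the saved weights
#   classifier.tflearn_model.predict([bag])    # `bag` is a bag-of-words vector of len(train_x[0])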
def get_positive_brand_intents(brand_name_or_path: Union[str, PosixPath]) -> Union[dict, None]:
    if isinstance(brand_name_or_path, PosixPath):
        # A concrete path was given: load the json file directly
        try:
            with open(brand_name_or_path.absolute(), 'r') as json_file:
                return json.load(json_file)
        except Exception as err:
            print("Couldn't open json file:", brand_name_or_path.absolute())
            print(err)
    elif isinstance(brand_name_or_path, str):
        # A brand name was given: look for a matching *_positive.json file
        manual_intents_path = file_folder.joinpath('models').joinpath('manual_intents')
        intents_files = list(manual_intents_path.glob('*_positive.json'))
        for file in intents_files:
            intent_name = file.name[:-len('_positive.json')]
            if intent_name == brand_name_or_path:
                try:
                    with open(file.absolute(), 'r') as json_file:
                        return json.load(json_file)
                except Exception as err:
                    print("Couldn't open json file:", file.absolute())
                    print(err)
    return None
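# Hedged usage sketch: 'VW' is a hypothetical brand name; a file
# <file_folder>/models/manual_intents/VW_positive.json is assumed to exist for either call
# to return anything other than None.
def _example_get_positive_brand_intents():
    # Lookup by brand name scans models/manual_intents/*_positive.json for a matching file
    by_name = get_positive_brand_intents(brand_name_or_path='VW')
    # Alternatively, a concrete path can be passed and is loaded directly
    by_path = get_positive_brand_intents(
        brand_name_or_path=file_folder.joinpath('models', 'manual_intents', 'VW_positive.json'))
    return by_name or by_path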
def save_positive_brand_intents(brand_name: str, wiki_name: str, patterns: list) -> None:
    intent_path = file_folder.joinpath('models').joinpath(
        'manual_intents').joinpath('{}_positive.json'.format(brand_name))
    original_brand_intents = get_positive_brand_intents(brand_name_or_path=intent_path)
    # Drop patterns that only repeat the brand name itself
    patterns = [
        pattern for pattern in patterns
        if pattern.casefold() != brand_name.casefold()
    ]
    if original_brand_intents is not None:
        data_updated = False
        for intent in original_brand_intents['intents']:
            if intent['tag'] == wiki_name:
                # Merge the new patterns into the existing ones without duplicates
                original_patterns = intent['patterns']
                combined_patterns = list(set(original_patterns) | set(patterns))
                intent['patterns'] = combined_patterns
                data_updated = True
        if not data_updated:
            new_data = dict()
            new_data['tag'] = wiki_name
            new_data['patterns'] = patterns
            original_brand_intents['intents'].append(new_data)
            data_updated = True
    else:
        # No intents file existed yet: start a fresh structure for this brand
        original_brand_intents = dict()
        new_data = dict()
        new_data['tag'] = wiki_name
        new_data['patterns'] = patterns
        original_brand_intents['intents'] = []
        original_brand_intents['intents'].append(new_data)
    try:
        if original_brand_intents is not None:
            with open(intent_path.absolute(), 'w') as outfile:
                json.dump(original_brand_intents, outfile)
    except Exception as err:
        print("Couldn't write json intent file:", intent_path.absolute())
        print(err)
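# Hedged usage sketch: 'VW' and 'VW Golf' are hypothetical values. The call writes (or merges
# into) <file_folder>/models/manual_intents/VW_positive.json.
def _example_save_positive_brand_intents():
    save_positive_brand_intents(brand_name='VW', wiki_name='VW Golf',
                                patterns=['Golf GTI', 'Golf Variant', 'VW'])
    # 'VW' itself is filtered out because it only repeats the brand name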
def train(train_x, train_y, model_name, words, classes, n_epoch=1000, batch_size=16,
          activation='softmax'):
    os.chdir(file_folder.joinpath('models'))
    # Reset underlying graph data
    tf.reset_default_graph()
    # Build neural network
    net = tflearn.input_data(shape=[None, len(train_x[0])])
    net = tflearn.fully_connected(net, 8)
    net = tflearn.fully_connected(net, 8)
    net = tflearn.fully_connected(net, len(train_y[0]), activation)
    net = tflearn.regression(net)
    # Define model and set up tensorboard
    model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')
    # Start training (apply gradient descent algorithm)
    model.fit(train_x, train_y, n_epoch=n_epoch, batch_size=batch_size, show_metric=True,
              run_id=model_name)
    tflearn_model = Path.cwd().joinpath('{}_model.tflearn'.format(model_name))
    tf_training_data = Path.cwd().joinpath('{}_tf_training_data'.format(model_name))
    # Remove stale artefacts before saving the freshly trained model
    delete_file(file=tflearn_model)
    delete_file(file=tf_training_data)
    model.save(tflearn_model.name)
    with open(tf_training_data, "wb") as training_data_file:
        pickle.dump({'words': words, 'classes': classes, 'train_x': train_x, 'train_y': train_y},
                    training_data_file)
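# Hedged usage sketch: train() expects bag-of-words vectors (train_x) and matching one-hot
# class vectors (train_y). The tiny toy arrays, vocabulary and class names below are
# hypothetical and only illustrate the expected shapes.
def _example_train():
    words = ['golf', 'gti', 'passat']          # vocabulary of tokens
    classes = ['VW Golf', 'VW Passat']         # intent tags
    train_x = [[1, 1, 0], [0, 0, 1]]           # one bag-of-words row per pattern
    train_y = [[1, 0], [0, 1]]                 # one-hot class row per pattern
    train(train_x, train_y, model_name='VW', words=words, classes=classes,
          n_epoch=10, batch_size=2)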
def preprocess_training_data(brand: str, class_limit: int = None, manufacturer_cars: dict = None,
                             max_sentence: int = None, remove_list: list = None,
                             manual_intents_marker: bool = True, wiki_pages: bool = True,
                             intent_cars_only: bool = False):
    intents = {}
    training_data = []
    carcounter = 0
    car: str
    manual_intents = get_positive_brand_intents(brand_name_or_path=brand)
    for car in manufacturer_cars:
        if class_limit is not None and carcounter > class_limit:
            break
        carcounter += 1
        # Tags of all other cars of the same manufacturer, without the brand name
        other_tags = [k.replace(brand, '').strip() for k in manufacturer_cars.keys() if k != car]
        car_data = {}
        patterns = []
        brand_aliases: list = []
        wiki_name = manufacturer_cars[car].wiki_name
        manual_car_intents = None
        tag_name = wiki_name
        car_data['tag'] = tag_name
        car_name = manufacturer_cars[car].car_name
        # Seed the patterns with the tag and car name in different capitalisations
        patterns.append(tag_name)
        patterns.append(tag_name.casefold())
        patterns.append(tag_name.title())
        patterns.append(car_name)
        patterns.append(car_name.casefold())
        patterns.append(car_name.title())
        # tag_name_words = nltk.word_tokenize(tag_name)
        # tag_name_powerset = powerset(tag_name_words)
        # tag_name_powerset = [power for power in tag_name_powerset
        #                      if power.casefold() != brand.casefold()]
        # tag_name_powerset_lower = [power.casefold() for power in tag_name_powerset]
        # patterns.extend(tag_name_powerset)
        # patterns.extend(tag_name_powerset_lower)
        if brand in manufacturer_aliases:
            brand_aliases.extend(manufacturer_aliases[brand])
        if manual_intents_marker and manual_intents is not None:
            for intent in manual_intents['intents']:
                if intent['tag'] == wiki_name:
                    print("Using manual intents for car:", wiki_name)
                    manual_car_intents = intent['patterns']
        if manual_car_intents is not None:
            for intent in manual_car_intents:
                intent_words = nltk.word_tokenize(intent)
                intent_powerset = powerset(intent_words)
                # Drop powerset entries that only repeat the brand name
                intent_powerset = [power for power in intent_powerset
                                   if power.casefold() != brand.casefold()]
                intent_powerset_lower = [word.casefold() for word in intent_powerset]
                patterns.extend(intent_powerset)
                patterns.extend(intent_powerset_lower)
                patterns.append(intent)
                patterns.append(intent.casefold())
                patterns.append(intent.title())
        if manual_car_intents is None and wiki_pages is False:
            pass
        elif wiki_pages:
            car_text: str = manufacturer_cars[car].page_text
            if not isinstance(car_text, str):
                sentences: list = ['']
            else:
                sentences: list = nltk.sent_tokenize(car_text)
            for alias in brand_aliases:
                # Combine the brand alias with the car name, e.g. "VW Golf"
                pattern_words_combined = '{} {}'.format(alias, car_name)
                patterns.append(pattern_words_combined)
                patterns.append(pattern_words_combined.capitalize())
                patterns.append(pattern_words_combined.title())
            sentence_counter = 0
            sentences_length: int = len(sentences)
            for sentence in sentences:
                if sentence_counter >= sentences_length - 1 or \
                        (max_sentence is not None and sentence_counter >= max_sentence):
                    break
                cleaned_sentences = check_sentence(sentence=sentence, model_name=car,
                                                   remove_list=remove_list, other_tags=other_tags)
                sentence_counter += 1
                patterns.extend(cleaned_sentence.replace(brand, '').replace(brand.casefold(), '')
                                for cleaned_sentence in cleaned_sentences)
            if sentence_counter == 0:
                # Fall back to the first ~10 % of the raw sentences if nothing survived cleaning
                sub_counter = 0
                break_counter = (len(sentences) / 100) * 10
                for sentence in sentences:
                    if sub_counter >= break_counter:
                        break
                    patterns.append(sentence.replace(brand, '').replace(brand.casefold(), ''))
                    sub_counter += 1
        if len(patterns) > 0:
            if intent_cars_only and manual_car_intents is None:
                # Skip cars without manual intents when only manually curated intents are wanted
                pass
            else:
                patterns = sorted(list(set(patterns)))
                car_data['patterns'] = patterns
                training_data.append(car_data)
    intents['intents'] = training_data
    with open(file_folder.joinpath('models').joinpath("{}_intents.json".format(brand)), "w") as outfile:
        json.dump(intents, outfile)
    return intents
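# Hedged usage sketch: preprocess_training_data expects the manufacturer_cars values to expose
# .wiki_name, .car_name and .page_text. CarStub is a hypothetical stand-in for the real car
# objects, and the 'VW'/'VW Golf' data is made up; check_sentence is assumed to be available
# elsewhere in this module.
def _example_preprocess_training_data():
    from collections import namedtuple
    CarStub = namedtuple('CarStub', ['wiki_name', 'car_name', 'page_text'])
    cars = {
        'VW Golf': CarStub(wiki_name='VW Golf', car_name='Golf',
                           page_text='The Golf is a compact car. It has been built since 1974. '
                                     'It is one of the best selling cars worldwide.'),
    }
    # Writes <file_folder>/models/VW_intents.json and returns the resulting intents dict
    return preprocess_training_data(brand='VW', manufacturer_cars=cars, max_sentence=5)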
from multiprocessing.dummy import Pool as ThreadPool
from pathlib import Path, PosixPath

import nltk
import numpy as np
import tensorflow as tf
import tflearn
from tflearn import DNN
from tqdm import tqdm

from openfuelservice.server import file_folder, ann_settings, ofs_settings, car_brands, verbose
from openfuelservice.server.utils.database.queries import Wikipedia
from openfuelservice.server.utils.misc.data_handling import powerset, get_positive_brand_intents
from openfuelservice.server.utils.misc.file_management import delete_file

word_list_en_path = file_folder.joinpath(ann_settings['word_lists']['en'])
word_list_de_path = file_folder.joinpath(ann_settings['word_lists']['de'])
_wordlist_en = None
cpu = ofs_settings['general']['cpu']
manufacturer_aliases = car_brands['aliases']

if not verbose:
    tf.logging.set_verbosity(tf.logging.ERROR)


def preprocess_remove_list(remove_list, manufacturer):
    return {x.replace(manufacturer, '').replace(manufacturer.casefold(), '') for x in remove_list}
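# Hedged usage sketch: preprocess_remove_list strips the manufacturer name (in both the original
# and casefolded spelling) from every entry of a remove list; the values below are hypothetical.
def _example_preprocess_remove_list():
    cleaned = preprocess_remove_list({'Volkswagen Beetle', 'volkswagen bus'}, 'Volkswagen')
    # -> {' Beetle', ' bus'}  (leading whitespace is kept; callers strip it where needed)
    return cleaned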