Example #1
def copy_file_to_temp(filename: str) -> Path:
    """
    Copies a file from the files folder to the temp folder.
    A quick helper for staging files internally.
    :param filename: The actual filename. The file must be present in the files folder!
    :return: The path of the copied file in the temp folder, or None if the source was not found
    """
    try:
        shutil.copyfile(file_folder.joinpath(filename),
                        temp_folder.joinpath(filename))
        return temp_folder.joinpath(filename)
    except FileNotFoundError as err:
        # Source file missing: report the error and return None.
        print(err)
        return None
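A minimal usage sketch, assuming file_folder and temp_folder are the module-level Path objects and that 'example.csv' is a hypothetical file already present in the files folder:

# Hypothetical usage: 'example.csv' must already exist in file_folder.
temp_path = copy_file_to_temp('example.csv')
if temp_path is not None and temp_path.exists():
    print('Copied to', temp_path)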
Example #2
def load_models(self, model_name: str = None):
    model_path: Path = file_folder.joinpath('models')
    training_files = list(model_path.glob('*_tf_training_data'))
    intents_files = list(model_path.glob('*_intents.json'))
    if model_name is None:
        # No name given: register every model found in the models folder.
        for file in training_files:
            model_name = file.name[:-len('_tf_training_data')]
            self._models['training_data'][model_name] = file.absolute()
        for file in intents_files:
            model_name = file.name[:-len('_intents.json')]
            self._models['intents'][model_name] = file.absolute()
    else:
        # Register only the files matching the requested model name.
        for file in training_files:
            if model_name in file.name:
                self._models['training_data'][model_name] = file.absolute()
        for file in intents_files:
            if model_name in file.name:
                self._models['intents'][model_name] = file.absolute()
    return self
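A hedged usage sketch; ModelHandler is an invented name for the class that owns this method, assumed to initialise self._models as a dict with 'training_data' and 'intents' sub-dicts:

# Hypothetical usage; ModelHandler and the model name 'audi' are assumptions.
handler = ModelHandler()
handler.load_models()         # register every model found on disk
handler.load_models('audi')   # or only the files matching one name
print(handler._models['training_data'])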
Example #3
def load_tflearn_model(self, activation='softmax'):
    # Only build and load the model if it hasn't been loaded yet.
    if self.tflearn_model is None:
        try:
            # Reset underlying graph data.
            tf.reset_default_graph()
            # Rebuild the network with the same architecture used for training.
            net = tflearn.input_data(shape=[None, len(self.train_x[0])])
            net = tflearn.fully_connected(net, 8)
            net = tflearn.fully_connected(net, 8)
            net = tflearn.fully_connected(net, len(self.train_y[0]), activation)
            net = tflearn.regression(net)
            self.tflearn_model: DNN = tflearn.DNN(net)
            tflearn_model_path = file_folder.joinpath('models')
            os.chdir(tflearn_model_path)
            self.tflearn_model.load('{}_model.tflearn'.format(self.brand))
        except Exception as err:
            print("Couldn't load tflearn model")
            print(err)
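A usage sketch with the hypothetical handler from the Example #2 sketch; brand, train_x and train_y are assumed to be set on the instance beforehand, so the rebuilt network matches the saved weights:

# Hypothetical usage; the handler fields are assumptions.
handler.load_tflearn_model(activation='softmax')
if handler.tflearn_model is not None:
    # Predict class probabilities for one bag-of-words vector.
    probabilities = handler.tflearn_model.predict([handler.train_x[0]])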
Example #4
# Union and Optional assume: from typing import Optional, Union
def get_positive_brand_intents(
        brand_name_or_path: Union[str, PosixPath]) -> Optional[dict]:
    if isinstance(brand_name_or_path, PosixPath):
        try:
            # Load the intents file directly from the given path.
            with open(brand_name_or_path.absolute(), 'r') as f:
                return json.load(f)
        except Exception as err:
            print("Couldn't open json file:", brand_name_or_path.absolute())
            print(err)
    elif isinstance(brand_name_or_path, str):
        # Resolve the brand name against the manual intents folder.
        manual_intents_path = file_folder.joinpath('models').joinpath(
            'manual_intents')
        intents_files = list(manual_intents_path.glob('*_positive.json'))
        for file in intents_files:
            intent_name = file.name[:-len('_positive.json')]
            if intent_name == brand_name_or_path:
                try:
                    with open(file.absolute(), 'r') as f:
                        return json.load(f)
                except Exception as err:
                    print("Couldn't open json file:", file.absolute())
                    print(err)
    return None
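Both call styles, sketched with an invented brand name 'audi':

# By name: resolved against models/manual_intents/audi_positive.json.
intents = get_positive_brand_intents(brand_name_or_path='audi')
# By path: loaded directly from the given file.
intents = get_positive_brand_intents(
    brand_name_or_path=file_folder.joinpath('models', 'manual_intents',
                                            'audi_positive.json'))
if intents is not None:
    for intent in intents['intents']:
        print(intent['tag'], len(intent['patterns']))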
Example #5
def save_positive_brand_intents(brand_name: str, wiki_name: str,
                                patterns: list) -> None:
    intent_path = file_folder.joinpath('models').joinpath(
        'manual_intents').joinpath('{}_positive.json'.format(brand_name))
    original_brand_intents = get_positive_brand_intents(
        brand_name_or_path=intent_path)
    # Drop patterns that merely repeat the brand name.
    patterns = [
        pattern for pattern in patterns
        if pattern.casefold() != brand_name.casefold()
    ]
    if original_brand_intents is not None:
        data_updated = False
        for intent in original_brand_intents['intents']:
            if intent['tag'] == wiki_name:
                # Merge the new patterns into the existing intent.
                intent['patterns'] = list(
                    set(intent['patterns']) | set(patterns))
                data_updated = True
        if not data_updated:
            # No existing intent carries this tag; append a new one.
            original_brand_intents['intents'].append(
                {'tag': wiki_name, 'patterns': patterns})
    else:
        # No intents file yet; start a fresh structure.
        original_brand_intents = {
            'intents': [{'tag': wiki_name, 'patterns': patterns}]}
    try:
        with open(intent_path.absolute(), 'w') as outfile:
            json.dump(original_brand_intents, outfile)
    except Exception as err:
        print("Couldn't write json intent file:", intent_path.absolute())
        print(err)
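A hypothetical call, assuming the brand 'audi' and the wiki tag 'Audi A4'; it merges the new patterns into models/manual_intents/audi_positive.json, creating the file if it does not exist yet:

# Patterns equal to the brand name itself are filtered out before saving.
save_positive_brand_intents(
    brand_name='audi',
    wiki_name='Audi A4',
    patterns=['a4', 'audi a4 sedan', 'audi'])  # 'audi' will be dropped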
Example #6
def train(train_x, train_y, model_name, words, classes, n_epoch=1000, batch_size=16, activation='softmax'):
    os.chdir(file_folder.joinpath('models'))
    # Reset underlying graph data
    tf.reset_default_graph()
    # Build neural network: two hidden layers of 8 units each
    net = tflearn.input_data(shape=[None, len(train_x[0])])
    net = tflearn.fully_connected(net, 8)
    net = tflearn.fully_connected(net, 8)
    net = tflearn.fully_connected(net, len(train_y[0]), activation)
    net = tflearn.regression(net)

    # Define model and set up tensorboard
    model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')
    # Start training (apply gradient descent algorithm)
    model.fit(train_x, train_y, n_epoch=n_epoch, batch_size=batch_size, show_metric=True, run_id=model_name)
    tflearn_model = Path.cwd().joinpath('{}_model.tflearn'.format(model_name))
    tf_training_data = Path.cwd().joinpath("{}_tf_training_data".format(model_name))
    # Remove stale artifacts before saving the fresh model and training data
    delete_file(file=tflearn_model)
    delete_file(file=tf_training_data)
    model.save(tflearn_model.name)
    with open(tf_training_data, "wb") as f:
        pickle.dump({'words': words, 'classes': classes, 'train_x': train_x, 'train_y': train_y}, f)
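A hypothetical call; train_x/train_y, words and classes are assumed to come from the preprocessing step or the pickled training data:

# Writes <model_name>_model.tflearn and <model_name>_tf_training_data
# into the models folder, replacing any stale copies first.
train(train_x, train_y,
      model_name='audi',
      words=words, classes=classes,
      n_epoch=1000, batch_size=16, activation='softmax')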
Example #7
def preprocess_training_data(brand: str, class_limit: int = None, manufacturer_cars: dict = None,
                             max_sentence: int = None, remove_list: list = None, manual_intents_marker: bool = True,
                             wiki_pages: bool = True, intent_cars_only: bool = False):
    intents = {}
    training_data = []
    carcounter = 0
    car: str
    manual_intents = get_positive_brand_intents(brand_name_or_path=brand)
    for car in manufacturer_cars:
        if class_limit is not None and carcounter > class_limit:
            break
        carcounter += 1
        other_tags = [k.replace(brand, '').strip() for k in manufacturer_cars.keys() if k != car]
        car_data = {}
        patterns = []
        brand_aliases: [] = []
        wiki_name = manufacturer_cars[car].wiki_name
        manual_car_intents = None
        tag_name = wiki_name
        car_data['tag'] = tag_name
        car_name = manufacturer_cars[car].car_name
        patterns.append(tag_name)
        patterns.append(tag_name.casefold())
        patterns.append(tag_name.title())
        patterns.append(car_name)
        patterns.append(car_name.casefold())
        patterns.append(car_name.title())
        # tag_name_words = nltk.word_tokenize(tag_name)
        # tag_name_powerset = powerset(tag_name_words)
        # tag_name_powerset = [power for power in tag_name_powerset if
        #                      power != brand or power.casefold() != brand.casefold()]
        # tag_name_powerset_lower = [power.casefold() for power in tag_name_powerset]
        # [patterns.append(power) for power in tag_name_powerset]
        # [patterns.append(power) for power in tag_name_powerset_lower]
        if brand in manufacturer_aliases:
            brand_aliases.extend(manufacturer_aliases[brand])
        if manual_intents_marker and manual_intents is not None:
            for intent in manual_intents['intents']:
                if intent['tag'] == wiki_name:
                    print("Using manual intents for car:", wiki_name)
                    manual_car_intents = intent['patterns']
        if manual_car_intents is not None:
            for intent in manual_car_intents:
                intent_words = nltk.word_tokenize(intent)
                intent_powerset = powerset(intent_words)
                # Drop the bare brand token (case-insensitive) from the powerset.
                intent_powerset = [power for power in intent_powerset
                                   if power.casefold() != brand.casefold()]
                intent_powerset_lower = [word.casefold() for word in intent_powerset]
                patterns.extend(intent_powerset)
                patterns.extend(intent_powerset_lower)
                patterns.append(intent)
                patterns.append(intent.casefold())
                patterns.append(intent.title())
        if manual_car_intents is None and wiki_pages is False:
            pass
        elif wiki_pages:
            car_text: str = manufacturer_cars[car].page_text
            if not isinstance(car_text, str):
                sentences: list = ['']
            else:
                sentences: list = nltk.sent_tokenize(car_text)
            for alias in brand_aliases:
                # Combine the brand alias with the car name into a single pattern.
                pattern_words_combined = '{} {}'.format(alias, car_name)
                patterns.append(pattern_words_combined)
                patterns.append(pattern_words_combined.capitalize())
                patterns.append(pattern_words_combined.title())
            sentence_counter = 0
            sentences_length: int = len(sentences)
            for sentence in sentences:
                if (sentence_counter >= sentences_length - 1
                        or (max_sentence is not None and sentence_counter >= max_sentence)):
                    break
                else:
                    cleaned_sentences = check_sentence(sentence=sentence, model_name=car, remove_list=remove_list,
                                                       other_tags=other_tags)
                    sentence_counter += 1
                    # Strip the brand name out of every cleaned sentence.
                    patterns.extend(
                        cleaned_sentence.replace(brand, '').replace(brand.casefold(), '')
                        for cleaned_sentence in cleaned_sentences)
            if sentence_counter == 0:
                sub_counter = 0
                # Fallback: keep roughly the first 10% of sentences unfiltered.
                break_counter = (len(sentences) / 100) * 10
                for sentence in sentences:
                    if sub_counter >= break_counter:
                        break
                    else:
                        patterns.append(sentence.replace(brand, '').replace(brand.casefold(), ''))
        if patterns and not (intent_cars_only and manual_car_intents is None):
            # Deduplicate and sort the patterns before storing the class.
            patterns = sorted(set(patterns))
            car_data['patterns'] = patterns
            training_data.append(car_data)
    intents['intents'] = training_data
    with open(file_folder.joinpath('models').joinpath("{}_intents.json".format(brand)), "w") as outfile:
        json.dump(intents, outfile)
    return intents
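A hypothetical call; manufacturer_cars is assumed to map car names to objects exposing wiki_name, car_name and page_text, as the attribute access above requires:

# remove_list is first normalised with preprocess_remove_list (Example #8).
intents = preprocess_training_data(
    brand='audi',
    manufacturer_cars=manufacturer_cars,
    class_limit=None,
    max_sentence=50,
    remove_list=preprocess_remove_list(remove_list, 'audi'),
    manual_intents_marker=True,
    wiki_pages=True,
    intent_cars_only=False)
print(len(intents['intents']), "classes written to audi_intents.json")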
Example #8
# json, os and pickle are assumed here; the helpers in Examples #6 and #7 rely on them.
import json
import os
import pickle
from multiprocessing.dummy import Pool as ThreadPool
from pathlib import Path, PosixPath

import nltk
import numpy as np
import tensorflow as tf
import tflearn
from tflearn import DNN
from tqdm import tqdm

from openfuelservice.server import file_folder, ann_settings, ofs_settings, car_brands, verbose
from openfuelservice.server.utils.database.queries import Wikipedia
from openfuelservice.server.utils.misc.data_handling import powerset, get_positive_brand_intents
from openfuelservice.server.utils.misc.file_management import delete_file

word_list_en_path = file_folder.joinpath(ann_settings['word_lists']['en'])
word_list_de_path = file_folder.joinpath(ann_settings['word_lists']['de'])
_wordlist_en = None
cpu = ofs_settings['general']['cpu']
manufacturer_aliases = car_brands['aliases']
if not verbose:
    tf.logging.set_verbosity(tf.logging.ERROR)


def preprocess_remove_list(remove_list, manufacturer):
    return {x.replace(manufacturer, '').replace(manufacturer.casefold(), '') for x in remove_list}


def preprocess_training_data(brand: str, class_limit: int = None, manufacturer_cars: dict = None,
                             max_sentence: int = None, remove_list: list = None, manual_intents_marker: bool = True,
                             wiki_pages: bool = True, intent_cars_only: bool = False):
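A minimal sketch of preprocess_remove_list with invented data; it strips the manufacturer name (original and casefolded spelling) from every entry and returns a set:

# Invented stop phrases; note the leftover leading spaces after stripping.
stop_phrases = {'Audi dealership', 'audi forum', 'price list'}
print(preprocess_remove_list(stop_phrases, 'Audi'))
# -> {' dealership', ' forum', 'price list'}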