Exemplo n.º 1
0
    def test_parser_error(self):
        """Every malformed usage string below must be rejected with ValueError."""
        malformed_specs = (
            ':without_type',
            ':pos str with_default_val',
            ':arg invalid_type',
            ':too int many args',
        )
        for spec in malformed_specs:
            with pytest.raises(ValueError):
                Usage(spec)
Exemplo n.º 2
0
 def test_renderers(self):
     """Keyword renderers add computed attributes to the parsed result."""
     spec = '''
         desc 1
         :alpha str
         :beta str
     '''
     usage = Usage(spec, alpha_beta=lambda ns: ns.alpha + ns.beta)

     parsed = usage.parse(['aval', 'bval'])
     assert parsed.alpha == 'aval'
     assert parsed.beta == 'bval'
     assert parsed.alpha_beta == 'avalbval'

     # A renderer carried by a merged-in Usage is applied as well.
     usage |= Usage(betabeta=lambda ns: ns.beta * 2)
     reparsed = usage.parse(['aval', 'bval'])
     assert reparsed.betabeta == 'bvalbval'
Exemplo n.º 3
0
 def test_or(self):
     """``|`` unions the arguments; description and clashing help come from the left side."""
     left = Usage('''
         desc 1
         :-a --alpha
             alpha 1
         :-b --beta
     ''')
     right = Usage('''
         desc 2
         :-a --alpha
             alpha 2
         :-g --gamma
     ''')
     merged = left | right

     assert set(merged.arguments) == {'alpha', 'beta', 'gamma'}
     assert merged.desc == 'desc 1'
     assert merged.arguments['alpha']['help'] == 'alpha 1'
Exemplo n.º 4
0
 def test_all(self):
     """A complete usage string yields the expected description and argument table."""
     spec = '''
     <description line 1>
     <description line 2>
     
     :alpha int
         <alpha help line 1>
         <alpha help line 2>
     :-b --beta str beta_default
         <beta desc>
     :-g --gamma
         <gamma desc>
     '''
     expected_arguments = {
         'alpha': {
             '_0': 'alpha',
             'type': int,
             'help': '<alpha help line 1> <alpha help line 2>',
         },
         'beta': {
             '_0': '-b',
             '_1': '--beta',
             'type': str,
             'help': '<beta desc>. Default: beta_default',
             'default': 'beta_default',
         },
         'gamma': {
             '_0': '-g',
             '_1': '--gamma',
             'help': '<gamma desc>',
             'action': 'store_true',
         },
     }

     parsed = Usage(spec)
     assert len(parsed.arguments) == 3
     assert parsed.desc == '<description line 1> <description line 2>'
     assert parsed.arguments == expected_arguments
Exemplo n.º 5
0
 def test_fuzzy_indent(self):
     """Constructing a Usage from unevenly indented text must not raise."""
     unevenly_indented = '''
       description
           :-a --alpha
         this is alpha
             description
         :-b --beta
     '''
     Usage(unevenly_indented)
Exemplo n.º 6
0
    def test_add_to_parser(self):
        """``Usage.apply`` extends a parser that already has its own arguments."""
        parser = ArgumentParser(description='desc')
        parser.add_argument('alpha')

        beta_usage = Usage(':beta int')
        beta_usage.apply(parser)

        parsed = parser.parse_args(['<alpha>', '32'])
        assert parsed.alpha == '<alpha>'
        assert parsed.beta == 32
Exemplo n.º 7
0
 def test_overlap(self):
     """Check duplicate handling inside one Usage and across parser applications."""
     parser = ArgumentParser()
     base_usage = Usage('''
         description
         :-a --alpha
         :-b --beta
     ''')
     base_usage.apply(parser)

     # Within one usage string the later definition of an argument wins.
     duplicated = Usage('''
             description
             :-a --alpha
                 desc 1
             :-a --alpha
                 desc 2
         ''')
     assert len(duplicated.arguments) == 1
     assert duplicated.arguments['alpha']['help'] == 'desc 2'

     # Re-adding an option the parser already has is an argparse error ...
     with pytest.raises(ArgumentError):
         Usage(':-a --alpha').apply(parser)
     # ... while a new option is accepted.
     Usage(':-g --gamma').apply(parser)
Exemplo n.º 8
0
def main():
    """Run the live audio loop: capture input, process it, play it back.

    Arguments are parsed from the module docstring through ``Usage``. HRIR
    data is loaded from the zip named by ``args.hrir_zip`` (its
    ``{data_folder}`` placeholder is filled with the ``data`` directory next
    to this file) and handed to a ``Processor``. A duplex PyAudio stream then
    feeds every captured buffer through ``processor.process`` until the
    stream stops being active.

    The PyAudio instance and the stream are released even when opening or
    starting the stream fails.
    """
    Usage.types['evaluator'] = evaluator_type
    usage = Usage(__doc__)
    args = usage.parse()

    data_folder = join(dirname(realpath(__file__)), 'data')
    mat = load_mat_from_zip(args.hrir_zip.format(data_folder=data_folder))
    hrir = hrir_data_from_mat(mat)

    processor = Processor(hrir,
                          calc_azimuth=args.azimuth,
                          calc_elevation=args.elevation,
                          calc_distance=args.distance,
                          update_interval=args.update_interval)

    def callback(in_data, frame_count, time_info, status):
        """PyAudio stream callback: process one captured buffer."""
        # The buffer is read as float32 pairs; only the first column is used.
        audio = np.frombuffer(in_data, dtype=np.float32).reshape((-1, 2))[:, 0]
        left, right = processor.process(audio)
        # Re-interleave the two processed channels into one float32 buffer.
        out_data = np.vstack(
            (left, right)).T.flatten().astype('float32').tobytes()
        return out_data, pyaudio.paContinue

    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=pyaudio.paFloat32,
                        channels=2,
                        rate=44100,
                        input=True,
                        output=True,
                        frames_per_buffer=args.chunk_size,
                        stream_callback=callback)
        try:
            stream.start_stream()
            # Processing happens in the callback; just idle while it runs.
            while stream.is_active():
                sleep(0.2)
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        # Always release PortAudio, including when p.open() itself raised.
        p.terminate()
class BaseScript:
    """Shared scaffolding for scripts described by a ``Usage`` object.

    Subclasses override ``usage`` and ``run``. A script can be started from
    the command line with ``run_main`` or built programmatically with
    ``create``.
    """
    usage = Usage()

    def __init__(self, args):
        self.args = args

    @classmethod
    def create(cls, **args):
        """Instantiate the script from keyword arguments instead of argv.

        Each argument declared in ``cls.usage`` is taken from ``args`` when
        present; otherwise its declared default is used, converted with the
        declared type (``bool`` for typeless ``store_*`` actions).

        Raises:
            TypeError: a positional argument without a default was omitted.
        """
        resolved = {}
        for name, spec in cls.usage.arguments.items():
            if name in args:
                resolved[name] = args.pop(name)
                continue

            is_required = ('default' not in spec and name
                           and not spec['_0'].startswith('-'))
            if is_required:
                message = 'Calling script without required "{}" argument.'
                raise TypeError(message.format(name))

            convert = spec.get('type')
            if spec.get('action', '').startswith('store_') and not convert:
                convert = bool
            default = spec.get('default')
            # Without any converter the default is passed through untouched.
            resolved[name] = convert(default) if convert else default

        namespace = Namespace(**resolved)
        cls.usage.render_args(namespace)
        return cls(namespace)

    @abstractmethod
    def run(self):
        """Execute the script; subclasses provide the implementation."""
        pass

    @classmethod
    def run_main(cls):
        """Parse ``sys.argv`` with ``cls.usage``, build the script and run it.

        A ``ValueError`` raised by the constructor is reported through
        ``parser.error``; a ``KeyboardInterrupt`` during ``run`` only prints
        a newline.
        """
        parser = ArgumentParser()
        cls.usage.apply(parser)
        parsed = cls.usage.render_args(parser.parse_args())

        try:
            script = cls(parsed)
        except ValueError as err:
            parser.error('Error parsing args: ' + str(err))
            raise SystemExit(1)

        try:
            script.run()
        except KeyboardInterrupt:
            print()
Exemplo n.º 10
0
class CropScript(BaseScript):
    """Cut one image per csv row out of the dataset images."""
    usage = Usage('''
        Crop images based on network output csv

        :data_csv str
            Network output csv
        
        :dataset_folder str
            Dataset folder

        :out_folder str
            Output folder to write cropped images to
        
        :-t --test
            Use test images
        
        :-s --size str 3380,2710
            Bounding box image size
    ''',
                  size=lambda x: tuple(map(int, x.size.split(','))))

    def run(self):
        """Write ``<image_id>-NN.jpg`` crops into ``args.out_folder``.

        Box coordinates from the csv are divided by ``args.size``, clamped to
        [0, 1] and scaled to the source image before cropping.
        """

        def clamp_unit(value):
            return max(0.0, min(1.0, value))

        opts = self.args
        dataset = Dataset.from_folder(opts.dataset_folder)
        makedirs(opts.out_folder, exist_ok=True)
        source_folder = dataset.images_folder[1 if opts.test else 0]
        table = pd.read_csv(opts.data_csv)

        for image_id, rows in table.groupby('image_id'):
            image_file_name = join(source_folder, image_id + '.jpg')
            for index, (_, row) in enumerate(rows.iterrows()):
                suffix = '-{:02}.jpg'.format(index)
                out_file_name = join(opts.out_folder, image_id + suffix)
                # The image is opened afresh for every crop.
                img = pyvips.Image.new_from_file(
                    image_file_name, access='sequential')  # type: pyvips.Image
                max_x = img.width - 1
                max_y = img.height - 1
                box_w, box_h = opts.size
                xmin = clamp_unit(row['xmin'] / box_w)
                ymin = clamp_unit(row['ymin'] / box_h)
                xmax = clamp_unit(row['xmax'] / box_w)
                ymax = clamp_unit(row['ymax'] / box_h)
                left = int(max_x * xmin)
                top = int(max_y * ymin)
                # The crop is never smaller than one pixel per side.
                crop_w = max(1, int(max_x * (xmax - xmin)))
                crop_h = max(1, int(max_y * (ymax - ymin)))
                cropped = img.crop(left, top, crop_w, crop_h)
                cropped.write_to_file(out_file_name)
                print('Wrote to {}.'.format(out_file_name))
Exemplo n.º 11
0
class TrainStageThreeScript(BaseScript):
    """Train the stage three network on a ``StageThreeDataset``.

    ``--best-model-file`` is declared as a string option with an empty
    default. Without a type it would be parsed as a boolean flag, and the
    renderer below would hand ``True`` to the trainer instead of a path.
    When the option is omitted the renderer falls back to ``model_file``.
    """
    usage = Usage('''
        Train the stage three network
        
        :model_file str
            Model file to load from/save to

        :-b --best-model-file str -
            Save best model file
    ''',
                  best_model_file=lambda x: x.best_model_file or x.model_file
                  ) | StageThreeDataset.usage

    def run(self):
        """Build the dataset from the parsed args and start training."""
        args = self.args
        dataset = StageThreeDataset.from_args(args)
        train_stage_three(dataset, args.best_model_file, args.model_file)
class MycroftScript(BaseScript):
    """Run a ``PreciseEngine`` through ``PreciseRunner`` and report its output."""
    usage = Usage(__doc__)

    def __init__(self, args):
        super().__init__(args)

        # The name 'hey-mycroft' is replaced by None before reaching the engine.
        if args.model == 'hey-mycroft':
            args.model = None

        self.engine = PreciseEngine(
            exe_file=None,
            model_file=args.model,
            chunk_size=args.chunk_size,
        )
        self.runner = PreciseRunner(
            self.engine,
            args.trigger_level,
            sensitivity=args.sensitivity,
            on_activation=self.on_activation,
            on_prediction=self.on_prediction,
        )
        session_id = '%09d' % randint(0, 999999999)
        self.session_id = session_id
        self.chunk_num = 0

    def on_activation(self):
        """Notify about the activation and, if configured, save the audio."""
        activate_notify()

        if not self.args.save_dir:
            return

        file_name = (self.args.save_prefix + self.session_id + '.' +
                     str(self.chunk_num) + '.wav')
        nm = join(self.args.save_dir, file_name)
        # NOTE(review): self.audio_buffer is not assigned anywhere in this
        # class -- confirm it is provided elsewhere before relying on save_dir.
        save_audio(nm, self.audio_buffer)
        print()
        print('Saved to ' + nm + '.')
        self.chunk_num += 1

    def on_prediction(self, conf):
        """Print one prediction, as a single character or as a bar."""
        if self.args.basic_mode:
            print('!' if conf > 0.7 else '.', end='', flush=True)
            return

        max_width = 80
        width = min(get_terminal_size()[0], max_width)
        filled = int(round(conf * width))
        bar = 'X' * filled + '-' * (width - filled)
        # Marks at or beyond the cutoff column are drawn in lower case.
        cutoff = round((1.0 - self.args.sensitivity) * width)
        head, tail = bar[:cutoff], bar[cutoff:]
        print(head + tail.replace('X', 'x'))

    def run(self):
        """Start the runner and block the calling thread indefinitely."""
        self.runner.start()
        Event().wait()  # Wait forever
Exemplo n.º 13
0
class EngineScript(BaseScript):
    """Read audio from stdin and write one prediction per line to stdout."""
    usage = Usage('''
        stdin should be a stream of raw int16 audio, written in
        groups of CHUNK_SIZE samples. If no CHUNK_SIZE is given
        it will read until EOF. For every chunk, an inference
        will be given via stdout as a float string, one per line

        :model_name str
            Keras or TensorFlow model to read from

        ...
    ''')
    usage.add_argument(
        '-v', '--version', action='version', version=__version__)
    usage.add_argument(
        'chunk_size',
        type=int,
        nargs='?',
        default=-1,
        help=('Number of bytes to read before making a prediction. '
              'Higher values are less computationally expensive'))
    usage.add_customizer(add_audio_pipe_to_parser)

    def __init__(self, args):
        super().__init__(args)
        # An interactive terminal on stdin means no audio was piped in.
        if sys.stdin.isatty():
            raise ValueError('Please pipe audio via stdin using < audio.wav')

    def run(self):
        """Feed stdin to the listener until EOF or Ctrl-C.

        ``sys.stdout`` is pointed at stderr for the duration of the loop and
        restored afterwards; predictions go to the original stdout.
        """
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
        real_stdout = sys.stdout
        sys.stdout = sys.stderr
        listener = Listener(self.args.model_name, self.args.chunk_size)

        try:
            while True:
                conf = listener.update(sys.stdin.buffer)
                line = (str(conf) + '\n').encode('ascii')
                real_stdout.buffer.write(line)
                real_stdout.buffer.flush()
        except (EOFError, KeyboardInterrupt):
            pass
        finally:
            sys.stdout = real_stdout
Exemplo n.º 14
0
class TestScript(BaseScript):
    """Evaluate a model on a dataset and print its statistics."""
    usage = Usage('''
        Test a model against a dataset

        :model str
            Either Keras (.net) or TensorFlow (.pb) model to test

        :-u --use-train
            Evaluate training data instead of test data

        :-nf --no-filenames
            Don't print out the names of files that failed

        :-t --threshold float 0.5
            Network output required to be considered an activation

        ...
    ''') | TrainData.usage

    def run(self):
        """Load the data, run predictions and print the results.

        Either the training or the test split is evaluated, depending on
        ``--use-train``. Unless ``--no-filenames`` is given, the files behind
        false positives and false negatives are listed before the summary.
        """
        args = self.args
        inject_params(args.model)
        data = TrainData.from_both(args.tags_file, args.tags_folder, args.folder)
        train, test = data.load(args.use_train, not args.use_train, shuffle=False)
        inputs, targets = train if args.use_train else test

        # Flatten the per-group filename lists in a single pass;
        # ``sum(groups, [])`` copies the accumulated list on every addition.
        file_groups = data.train_files if args.use_train else data.test_files
        filenames = [name for group in file_groups for name in group]
        predictions = Listener.find_runner(args.model)(args.model).predict(inputs)
        stats = Stats(predictions, targets, filenames)

        print('Data:', data)

        if not args.no_filenames:
            fp_files = stats.calc_filenames(False, True, args.threshold)
            fn_files = stats.calc_filenames(False, False, args.threshold)
            print('=== False Positives ===')
            print('\n'.join(fp_files))
            print()
            print('=== False Negatives ===')
            print('\n'.join(fn_files))
            print()
        print(stats.counts_str(args.threshold))
        print()
        print(stats.summary_str(args.threshold))
Exemplo n.º 15
0
class RunStageThreeScript(BaseScript):
    """Run the stage three predictor over the boxes produced by stage two."""
    usage = Usage('''
        Train the stage three network
        
        :model_file str
            Model file to load from
        
        :stage_two_file str
            Csv file with output from stage two
        
        :image_folder str
            Folder with source images
        
        :-g --gpu
            Train with GPU
        
        :-o --output-file str -
            Output csv to write to
    ''')

    def run(self):
        """Predict for every image in the stage two csv.

        One ``StageThreeRow`` is collected per predicted position. The rows
        gathered so far are written to ``args.output_file`` (when set) even
        if the loop is interrupted or fails.
        """
        args = self.args
        boxes = pandas.read_csv(args.stage_two_file)
        predictor = StageThreePredictor(args.model_file, args.gpu)
        rows = []
        try:
            for image_id, image_rows in boxes.groupby('image_id'):
                image_filename = join(args.image_folder,
                                      '{}.jpg'.format(image_id))
                # Transposing the two-column frame yields one array per column.
                xcenters, ycenters = image_rows[['center_x',
                                                 'center_y']].values.T
                label = predictor.predict([image_filename], [xcenters],
                                          [ycenters])[0]
                for box_id, pos in enumerate(label):
                    rows.append(StageThreeRow(image_id, box_id, *pos))
                print(label)
        except KeyboardInterrupt:
            print('Stopping...')
        finally:
            if args.output_file:
                pandas.DataFrame(data=rows).to_csv(args.output_file)
Exemplo n.º 16
0
class RunStageTwoScript(BaseScript):
    """Run the stage two predictor over a csv of predicted boxes."""
    usage = Usage('''
        Train the stage two network
        
        :model_file str
            Model file to load from
        
        :boxes_file str
            Csv file with predicted boxes
        
        :cropped_folder str
            String with cropped images of boxes
        
        :-g --gpu
            Train with GPU
        
        :-o --output-file str -
            Output csv to write to
    ''')

    def run(self):
        """Predict a label for each box row and optionally save them as csv.

        Rows collected so far are written to ``args.output_file`` (when set)
        even if the loop is interrupted or fails.
        """
        opts = self.args
        boxes = pandas.read_csv(opts.boxes_file)
        predictor = StageTwoPredictor(opts.model_file, opts.gpu)
        results = []
        corner_columns = ['xmin', 'ymin', 'xmax', 'ymax']
        try:
            for _, row in boxes.iterrows():
                image_id = row['image_id']
                box_id = row['box_id']
                box = [row[column] for column in corner_columns]
                crop_name = '{}-{:02}.jpg'.format(image_id, box_id)
                image_filename = join(opts.cropped_folder, crop_name)
                label = predictor.predict([image_filename], [box])[0]
                results.append(StageTwoRow(image_id, box_id, *label))
                print(label)
        except KeyboardInterrupt:
            print('Stopping...')
        finally:
            if opts.output_file:
                frame = pandas.DataFrame(data=results)
                frame.to_csv(opts.output_file)
Exemplo n.º 17
0
class PocketsphinxListenScript(BaseScript):
    """Listen on the microphone with a Pocketsphinx-backed engine."""
    usage = Usage('''
        Run Pocketsphinx on microphone audio input

        :key_phrase str
            Key phrase composed of words from dictionary

        :dict_file str
            Filename of dictionary with word pronunciations

        :hmm_folder str
            Folder containing hidden markov model

        :-th --threshold str 1e-90
            Threshold for activations

        :-c --chunk-size int 2048
            Samples between inferences
    ''')

    def run(self):
        """Wire the listener into a ``PreciseRunner`` and block forever."""
        opts = self.args

        def on_prediction(conf):
            # One character per prediction, printed on a single line.
            print('!' if conf > 0.5 else '.', end='', flush=True)

        def on_activation():
            activate_notify()

        listener = PocketsphinxListener(opts.key_phrase, opts.dict_file,
                                        opts.hmm_folder, opts.threshold,
                                        opts.chunk_size)
        engine = ListenerEngine(listener)
        runner = PreciseRunner(engine,
                               3,
                               on_activation=on_activation,
                               on_prediction=on_prediction)
        runner.start()
        Event().wait()  # Wait forever
Exemplo n.º 18
0
class TrainIncrementalScript(TrainScript):
    """Train a model by saving its false activations and retraining on them.

    Audio files from ``--random-data-folder`` are run through the current
    model chunk by chunk. Every chunk whose confidence exceeds
    ``--threshold`` is saved as a generated ``not-wake-word`` sample, and the
    model is retrained once ``--delay-samples`` such samples have piled up.

    NOTE(review): ``self.sampled_data``, ``self.test``, ``self.epoch`` and
    ``self.callbacks`` are not assigned in this class -- presumably they come
    from ``TrainScript``; confirm against the parent.
    """
    usage = Usage('''
        Train a model to inhibit activation by
        marking false activations and retraining

        :-e --epochs int 1
            Number of epochs to train before continuing evaluation

        :-ds --delay-samples int 10
            Number of false activations to save before re-training

        :-c --chunk-size int 2048
            Number of samples between testing the neural network

        :-r --random-data-folder str data/random
            Folder with properly encoded wav files of
            random audio that should not cause an activation

        :-th --threshold float 0.5
            Network output to be considered activated

        ...
    ''') | TrainScript.usage

    def __init__(self, args):
        super().__init__(args)

        # Make sure both output folders for generated samples exist.
        for i in (
                join(self.args.folder, 'not-wake-word', 'generated'),
                join(self.args.folder, 'test', 'not-wake-word', 'generated')
        ):
            makedirs(i, exist_ok=True)

        # Files handled in earlier sessions; run() skips them.
        self.trained_fns = load_trained_fns(self.args.model)
        # Rolling window of the most recent audio, written out on activation.
        self.audio_buffer = np.zeros(pr.buffer_samples, dtype=float)

        params = ModelParams(
            skip_acc=self.args.no_validation, extra_metrics=self.args.extra_metrics,
            loss_bias=1.0 - self.args.sensitivity
        )
        model = create_model(self.args.model, params)
        self.listener = Listener(self.args.model, self.args.chunk_size, runner_cls=KerasRunner)
        # Replace the listener's runner and point it at the model created above.
        self.listener.runner = KerasRunner(self.args.model)
        self.listener.runner.model = model
        # Activations saved since the last retrain.
        self.samples_since_train = 0

    @staticmethod
    def load_data(args: Any):
        """Load data from the tags file; the second flag requests the
        validation split unless ``--no-validation`` is set."""
        data = TrainData.from_tags(args.tags_file, args.tags_folder)
        return data.load(True, not args.no_validation)

    def retrain(self):
        """Train for a session, pulling in any new data from the filesystem"""
        folder = TrainData.from_folder(self.args.folder)
        train_data, test_data = folder.load(True, not self.args.no_validation)

        # Combine the freshly loaded folder data with the data already held.
        train_data = TrainData.merge(train_data, self.sampled_data)
        test_data = TrainData.merge(test_data, self.test)
        train_inputs, train_outputs = train_data
        print()
        try:
            self.listener.runner.model.fit(
                train_inputs, train_outputs, self.args.batch_size, self.epoch + self.args.epochs,
                validation_data=test_data, callbacks=self.callbacks, initial_epoch=self.epoch
            )
        finally:
            # The model is saved even when fitting is interrupted or fails.
            self.listener.runner.model.save(self.args.model)

    def train_on_audio(self, fn: str):
        """Run through a single audio file"""
        # Decided once per file: when True, its activations go to the test
        # folder and never trigger a retrain.
        # NOTE(review): presumably ``random`` is ``random.random`` -- confirm.
        save_test = random() > 0.8
        audio = load_audio(fn)
        num_chunks = len(audio) // self.args.chunk_size

        self.listener.clear()

        for i, chunk in enumerate(chunk_audio(audio, self.args.chunk_size)):
            print('\r' + str(i * 100. / num_chunks) + '%', end='', flush=True)
            # Slide the window: drop the oldest samples, append the new chunk.
            self.audio_buffer = np.concatenate((self.audio_buffer[len(chunk):], chunk))
            conf = self.listener.update(chunk)
            if conf > self.args.threshold:
                self.samples_since_train += 1
                name = splitext(basename(fn))[0] + '-' + str(i) + '.wav'
                name = join(self.args.folder, 'test' if save_test else '', 'not-wake-word',
                            'generated', name)
                save_audio(name, self.audio_buffer)
                print()
                print('Saved to:', name)

            if not save_test and self.samples_since_train >= self.args.delay_samples and \
                    self.args.epochs > 0:
                self.samples_since_train = 0
                self.retrain()

    def run(self):
        """
        Begin reading through audio files, saving false
        activations and retraining when necessary
        """
        for fn in glob_all(self.args.random_data_folder, '*.wav'):
            if fn in self.trained_fns:
                print('Skipping ' + fn + '...')
                continue

            print('Starting file ' + fn + '...')
            self.train_on_audio(fn)
            print('\r100%                 ')

            # Persist progress after every file so a later session can skip it.
            self.trained_fns.append(fn)
            save_trained_fns(self.trained_fns, self.args.model)
Exemplo n.º 19
0
 def test_indent(self):
     """Indented description lines are joined into one space-separated string."""
     spec = '''
     <line one>
     <line two>
     '''
     parsed = Usage(spec)
     assert parsed.desc == '<line one> <line two>'
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from prettyparse import Usage

# Command-line interface: three positional arguments (username, password,
# url) plus optional --delay, --screenshot and --quiet.
usage = Usage('''
    Get the content of pages shielded by Shibboleth

    :username str
        Illinois netid to use
    :password str
        Corresponding password for netid
    :url str
        Protected URL to get
    :-d --delay float 2.0
        Seconds to wait for page to load
    :-s --screenshot str -
    	Take a screenshot to the given .png file
    :-q --quiet
        Hide status output
''')


def shibboleth_get(username, password, url, driver=None, debug=True):
    """Returns a selenium webdriver with the authenticated page"""
    # Only the start of this function is visible here: when no driver is
    # supplied, Chrome options are created for a new one.
    if driver is None:
        if debug:
            # NOTE(review): `sys` is not among the imports visible above this
            # function -- confirm it is imported elsewhere in the module.
            print('Launching headless Chrome...', file=sys.stderr)
        chrome_options = webdriver.ChromeOptions()
Exemplo n.º 21
0
class CalcThresholdScript(BaseScript):
    """Derive threshold parameters for a model from a saved stats file."""
    usage = Usage('''
        Update the threshold values of a model for a dataset.
        This makes the sensitivity more accurate and linear

        :model str
            Either Keras (.net) or TensorFlow (.pb) model to adjust

        :input_file str
            Input stats file that was outputted from precise-graph

        :-k --model-key str -
            Custom model name to use from the stats.json

        :-s --smoothing float 1.2
            Amount of extra smoothing to apply

        :-c --center float 0.2
            Decoded threshold that is mapped to 0.5. Proportion of
            false negatives at sensitivity=0.5
    ''')

    def run(self):
        """Compute mean and spread of the positive outputs and save them.

        The stats for the chosen model are read from ``args.input_file``.
        Outputs of exactly 0 or 1 are dropped, the rest are mapped through
        the inverse sigmoid, and the mean and (smoothed) standard deviation
        of the samples whose target exceeds 0.5 are stored as
        ``threshold_config`` in the model's params.

        Raises:
            SystemExit: the model name is not present in the stats file.
        """
        args = self.args
        import numpy as np

        # The 'data' entry is a pickled dict inside an object array, which
        # NumPy >= 1.16.3 refuses to load unless allow_pickle=True is given.
        # Unpickling can execute arbitrary code: only load trusted stats files.
        stats_archive = np.load(args.input_file, allow_pickle=True)
        model_data = {
            name: Stats.from_np_dict(data)
            for name, data in stats_archive['data'].item().items()
        }
        model_name = args.model_key or basename(splitext(args.model)[0])

        if model_name not in model_data:
            print(
                "Could not find model '{}' in saved models in stats file: {}".
                format(model_name, list(model_data)))
            raise SystemExit(1)

        stats = model_data[model_name]

        # Outputs of exactly 0 or 1 would become infinite under the inverse
        # sigmoid below, so they are excluded.
        save_spots = (stats.outputs != 0) & (stats.outputs != 1)
        if save_spots.sum() == 0:
            print('No data (or all NaN)')
            return

        stats.outputs = stats.outputs[save_spots]
        stats.targets = stats.targets[save_spots]
        # Inverse sigmoid: log(p / (1 - p)).
        inv = -np.log(1 / stats.outputs - 1)

        pos = np.extract(stats.targets > 0.5, inv)
        pos_mu = pos.mean().item()
        # Population standard deviation, widened by the smoothing factor.
        pos_std = sqrt(np.mean((pos - pos_mu)**2)) * args.smoothing

        print('Peak: {:.2f} mu, {:.2f} std'.format(pos_mu, pos_std))
        pr = inject_params(args.model)
        pr.__dict__.update(threshold_config=((pos_mu, pos_std), ))
        save_params(args.model)
        print('Saved params to {}.params'.format(args.model))
Exemplo n.º 22
0
class ListenScript(BaseScript):
    """Run a model on live microphone audio and display its predictions."""
    usage = Usage('''
        Run a model on microphone audio input

        :model str
            Either Keras (.net) or TensorFlow (.pb) model to run

        :-c --chunk-size int 2048
            Samples between inferences

        :-l --trigger-level int 3
            Number of activated chunks to cause an activation

        :-s --sensitivity float 0.5
            Network output required to be considered activated

        :-b --basic-mode
            Report using . or ! rather than a visual representation

        :-d --save-dir str -
            Folder to save false positives

        :-p --save-prefix str -
            Prefix for saved filenames
    ''')

    def __init__(self, args):
        super().__init__(args)
        self.listener = Listener(args.model, args.chunk_size)
        # Rolling window of recent audio, saved to disk on activation.
        buffer_length = self.listener.pr.buffer_samples
        self.audio_buffer = np.zeros(buffer_length, dtype=float)
        self.engine = ListenerEngine(self.listener, args.chunk_size)
        # Route predictions through this object so the buffer stays current.
        self.engine.get_prediction = self.get_prediction
        self.runner = PreciseRunner(
            self.engine,
            args.trigger_level,
            sensitivity=args.sensitivity,
            on_activation=self.on_activation,
            on_prediction=self.on_prediction,
        )
        session_id = '%09d' % randint(0, 999999999)
        self.session_id = session_id
        self.chunk_num = 0

    def on_activation(self):
        """Notify about the activation and, if configured, save the audio."""
        activate_notify()

        if not self.args.save_dir:
            return

        file_name = (self.args.save_prefix + self.session_id + '.' +
                     str(self.chunk_num) + '.wav')
        nm = join(self.args.save_dir, file_name)
        save_audio(nm, self.audio_buffer)
        print()
        print('Saved to ' + nm + '.')
        self.chunk_num += 1

    def on_prediction(self, conf):
        """Print one prediction, as a single character or as a bar."""
        if self.args.basic_mode:
            print('!' if conf > 0.7 else '.', end='', flush=True)
            return

        max_width = 80
        width = min(get_terminal_size()[0], max_width)
        filled = int(round(conf * width))
        bar = 'X' * filled + '-' * (width - filled)
        # Marks at or beyond the cutoff column are drawn in lower case.
        cutoff = round((1.0 - self.args.sensitivity) * width)
        head, tail = bar[:cutoff], bar[cutoff:]
        print(head + tail.replace('X', 'x'))

    def get_prediction(self, chunk):
        """Append the chunk to the rolling buffer and return its prediction."""
        audio = buffer_to_audio(chunk)
        kept = self.audio_buffer[len(audio):]
        self.audio_buffer = np.concatenate((kept, audio))
        return self.listener.update(chunk)

    def run(self):
        """Start the runner and block the calling thread indefinitely."""
        self.runner.start()
        Event().wait()  # Wait forever
Exemplo n.º 23
0
class TrainOptimizeScript(TrainScript):
    """Tune model hyperparameters with a black box optimizer.

    The usage expression is bound to ``usage``. Left as a bare expression it
    was discarded, the class inherited ``TrainScript.usage`` unchanged, and
    ``--trials-name`` / ``--cycles`` were never registered.

    NOTE(review): ``self.test``, ``self.sampled_data``, ``self.epoch`` and
    ``self.callbacks`` are not assigned here -- presumably they come from
    ``TrainScript``; confirm against the parent.
    """
    usage = Usage('''
        Use black box optimization to tune model hyperparameters

        :-t --trials-name str -
            Filename to save hyperparameter optimization trials in
            '.bbopt.json' will automatically be appended

        :-c --cycles int 20
            Number of cycles of optimization to run

        :-m --model str .cache/optimized.net
            Model to load from

        ...
    ''') | TrainScript.usage

    def __init__(self, args):
        super().__init__(args)
        from bbopt import BlackBoxOptimizer
        self.bb = BlackBoxOptimizer(file=self.args.trials_name)
        # Evaluation below needs test data; load it if the parent left none.
        if not self.test:
            data = TrainData.from_both(self.args.tags_file,
                                       self.args.tags_folder, self.args.folder)
            _, self.test = data.load(False, True)

        from keras.callbacks import ModelCheckpoint
        # Iterate over a copy because entries are removed from the list.
        for i in list(self.callbacks):
            if isinstance(i, ModelCheckpoint):
                self.callbacks.remove(i)

    def process_args(self, args: Any):
        """Clear old model files and normalise the trials file name."""
        model_parts = glob(splitext(args.model)[0] + '.*')
        # Existing files are only deleted when fewer than five match.
        if len(model_parts) < 5:
            for name in model_parts:
                if isfile(name):
                    remove(name)
                else:
                    rmtree(name)
        # Strip the extension here; '.bbopt.json' is re-appended where used.
        args.trials_name = args.trials_name.replace('.bbopt.json',
                                                    '').replace('.json', '')
        if not args.trials_name:
            # Without an explicit name, start from a fresh default trials file.
            if isfile(join('.cache', 'trials.bbopt.json')):
                remove(join('.cache', 'trials.bbopt.json'))
            args.trials_name = join('.cache', 'trials')

    def run(self):
        """Run ``--cycles`` optimisation rounds and print the best one.

        Each round draws hyperparameters from the optimizer, trains and
        evaluates a fresh model, and reports a fitness that combines the
        false positive/negative rates with a parameter-count penalty.
        """
        from math import exp

        print('Writing to:', self.args.trials_name + '.bbopt.json')
        for i in range(self.args.cycles):
            self.bb.run(backend="random")
            print("\n= %d = (example #%d)" %
                  (i + 1, len(self.bb.get_data()["examples"]) + 1))

            params = ModelParams(recurrent_units=self.bb.randint("units",
                                                                 1,
                                                                 70,
                                                                 guess=50),
                                 dropout=self.bb.uniform("dropout",
                                                         0.1,
                                                         0.9,
                                                         guess=0.6),
                                 extra_metrics=self.args.extra_metrics,
                                 skip_acc=self.args.no_validation,
                                 loss_bias=1.0 - self.args.sensitivity)
            print('Testing with:', params)
            model = create_model(self.args.model, params)
            # Multiplying by the boolean keeps self.test when validation is
            # enabled and yields an empty sequence when it is disabled.
            model.fit(*self.sampled_data,
                      batch_size=self.args.batch_size,
                      epochs=self.epoch + self.args.epochs,
                      validation_data=self.test *
                      (not self.args.no_validation),
                      callbacks=self.callbacks,
                      initial_epoch=self.epoch)
            resp = model.evaluate(*self.test, batch_size=self.args.batch_size)
            # A single value means only the loss was reported.
            if not isinstance(resp, (list, tuple)):
                resp = [resp, None]
            test_loss, test_acc = resp
            predictions = model.predict(self.test[0],
                                        batch_size=self.args.batch_size)

            num_false_positive = numpy.sum(predictions *
                                           (1 - self.test[1]) > 0.5)
            num_false_negative = numpy.sum(
                (1 - predictions) * self.test[1] > 0.5)
            false_positives = num_false_positive / numpy.sum(
                self.test[1] < 0.5)
            false_negatives = num_false_negative / numpy.sum(
                self.test[1] > 0.5)

            # Logistic penalty: equals 0.5 at 11000 parameters and falls
            # towards 0 as the model grows.
            param_score = 1.0 / (1.0 + exp(
                (model.count_params() - 11000) / 2000))
            fitness = param_score * (1.0 - 0.2 * false_negatives -
                                     0.8 * false_positives)

            self.bb.remember({
                "test loss": test_loss,
                "test accuracy": test_acc,
                "false positive%": false_positives,
                "false negative%": false_negatives,
                "fitness": fitness
            })

            print("False positive: ", false_positives * 100, "%")

            self.bb.maximize(fitness)
            pprint(self.bb.get_current_run())
        best_example = self.bb.get_optimal_run()
        print("\n= BEST = (example #%d)" %
              self.bb.get_data()["examples"].index(best_example))
        pprint(best_example)
Exemplo n.º 24
0
class TrainSampledScript(TrainScript):
    """Train a model on a growing subset of misclassified training samples.

    Every cycle predicts on the whole training set, adds a limited number of
    not-yet-selected misclassified samples to the persisted sample set,
    records sampling metrics and then fits the model on ``self.sampled_data``
    (NOTE(review): ``sampled_data``, ``train``, ``test``, ``model`` and
    ``load_sample_data`` are presumably provided by ``TrainScript`` — confirm).
    """

    usage = Usage('''
        Train a model, sampling data points with the highest loss from a larger dataset

        :-c --cycles int 200
            Number of sampling cycles of size {epoch} to run

        :-n --num-sample-chunk int 50
            Number of new samples to introduce at a time between training cycles

        :-sf --samples-file str -
            Json file to write selected samples to.
            Default = {model_base}.samples.json

        :-is --invert-samples
            Unused parameter
        ...
    ''') | TrainScript.usage

    def __init__(self, args):
        """Resolve the samples file, load existing samples and open the metrics log.

        Raises:
            ValueError: if ``--invert-samples`` was passed.
        """
        super().__init__(args)
        if self.args.invert_samples:
            raise ValueError('--invert-samples should be left blank')
        # An empty/missing --samples-file falls back to a name derived from the model
        self.args.samples_file = (self.args.samples_file
                                  or '{model_base}.samples.json').format(
                                      model_base=self.model_base)
        self.samples, self.hash_to_ind = self.load_sample_data(
            self.args.samples_file, self.train)
        self.metrics_fiti = Fitipy(self.model_base + '.logs',
                                   'sampling-metrics.txt')

    def write_sampling_metrics(self, predicted):
        """Print the training-set accuracy of ``predicted`` and append it to the metrics file.

        Each appended line is ``<sampled fraction>\\t<accuracy>``.
        """
        correct = float(
            sum((predicted > 0.5) == (self.train[1] > 0.5)) /
            len(self.train[1]))
        print('Successfully calculated: {0:.3%}'.format(correct))

        lines = self.metrics_fiti.read().lines()
        lines.append('{}\t{}'.format(
            len(self.samples) / len(self.train[1]), correct))
        self.metrics_fiti.write().lines(lines)

    def choose_new_samples(self, predicted):
        """Return an iterator over at most ``num_sample_chunk`` new misclassified sample hashes.

        A sample counts as misclassified when its prediction and its target
        fall on different sides of 0.5. Hashes already in ``self.samples``
        are excluded.
        """
        failed_samples = {
            calc_sample_hash(inp, target)
            for inp, pred, target in zip(self.train[0], predicted,
                                         self.train[1])
            if (pred > 0.5) != (target > 0.5)
        }
        remaining_failed_samples = failed_samples - self.samples
        print('Remaining failed samples:', len(remaining_failed_samples))
        return islice(remaining_failed_samples, self.args.num_sample_chunk)

    def run(self):
        """Run ``--cycles`` rounds of predict / sample selection / fit."""
        print('Writing to:', self.args.samples_file)
        print('Writing metrics to:', self.metrics_fiti.path)
        for _ in range(self.args.cycles):
            print('Calculating on whole dataset...')
            predicted = self.model.predict(self.train[0])

            # Materialize the selection so the number actually added can be
            # reported; it may be smaller than the requested chunk size.
            new_samples = list(self.choose_new_samples(predicted))
            self.samples.update(new_samples)
            Fitipy(self.args.samples_file).write().set(self.samples)
            print('Added', len(new_samples), 'samples')

            self.write_sampling_metrics(predicted)

            self.model.fit(*self.sampled_data,
                           batch_size=self.args.batch_size,
                           epochs=self.epoch + self.args.epochs,
                           callbacks=self.callbacks,
                           initial_epoch=self.epoch,
                           validation_data=self.test)
Exemplo n.º 25
0
class TrainGeneratedScript(BaseScript):
    """A trainer that runs on generated data by overlaying wakewords on background audio"""

    usage = Usage('''
        Train a model on infinitely generated batches

        :model str
            Keras .net model file to load from and write to

        :-e --epochs int 100
            Number of epochs to train on

        :-b --batch-size int 200
            Number of samples in each batch

        :-t --steps-per-epoch int 100
            Number of steps that are considered an epoch

        :-c --chunk-size int 2048
            Number of audio samples between generating a training sample

        :-r --random-data-folder str data/random
            Folder with properly encoded wav files of
            random audio that should not cause an activation

        :-s --sensitivity float 0.2
            Weighted loss bias. Higher values decrease increase positives

        :-sb --save-best
            Only save the model each epoch if its stats improve

        :-nv --no-validation
            Disable accuracy and validation calculation
            to improve speed during training

        :-mm --metric-monitor str loss
            Metric used to determine when to save

        :-em --extra-metrics
            Add extra metrics during training

        :-p --save-prob float 0.0
            Probability of saving audio into debug/ww and debug/nww folders

        ...
    ''') | TrainData.usage

    def __init__(self, args):
        """Create the model, listener, callbacks and the endless file iterators."""
        super().__init__(args)
        # Rolling buffers holding the most recent audio and per-sample target values
        self.audio_buffer = np.zeros(pr.buffer_samples, dtype=float)
        self.vals_buffer = np.zeros(pr.buffer_samples, dtype=float)

        params = ModelParams(skip_acc=args.no_validation,
                             extra_metrics=args.extra_metrics,
                             loss_bias=1.0 - args.sensitivity)
        self.model = create_model(args.model, params)
        self.listener = Listener('',
                                 args.chunk_size,
                                 runner_cls=lambda x: None)

        from keras.callbacks import ModelCheckpoint, TensorBoard
        checkpoint = ModelCheckpoint(args.model,
                                     monitor=args.metric_monitor,
                                     save_best_only=args.save_best)
        # The epoch counter is persisted next to the model so training can resume
        epoch_fiti = Fitipy(splitext(args.model)[0] + '.epoch')
        self.epoch = epoch_fiti.read().read(0, int)

        def on_epoch_end(_a, _b):
            self.epoch += 1
            epoch_fiti.write().write(self.epoch, str)

        self.model_base = splitext(self.args.model)[0]

        self.callbacks = [
            checkpoint,
            TensorBoard(log_dir=self.model_base + '.logs', ),
            LambdaCallback(on_epoch_end=on_epoch_end)
        ]

        self.data = TrainData.from_both(args.tags_file, args.tags_folder,
                                        args.folder)
        pos_files, neg_files = self.data.train_files
        # cycle() already returns an iterator, so no extra iter() is needed
        self.neg_files_it = cycle(neg_files)
        self.pos_files_it = cycle(pos_files)

    def layer_with(self, sample: np.ndarray, value: int) -> np.ndarray:
        """Create a 2 x len(sample) array: row 0 is the sample, row 1 is filled with value"""
        b = np.full((2, len(sample)), value, dtype=float)
        b[0] = sample
        return b

    def generate_wakeword_pieces(self, volume):
        """Endlessly generate labelled audio pieces that represent the wakeword stream

        Alternates between a randomly chosen positive or negative sample
        (normalized to ``volume`` and labelled 1 or 0) and a stretch of
        silence of random length labelled 0.
        """
        while True:
            target = 1 if random() > 0.5 else 0
            it = self.pos_files_it if target else self.neg_files_it
            sample_file = next(it)
            yield self.layer_with(
                self.normalize_volume_to(load_audio(sample_file), volume),
                target)
            yield self.layer_with(
                np.zeros(int(pr.sample_rate * (0.5 + 2.0 * random()))), 0)

    def chunk_audio_pieces(self, pieces, chunk_size):
        """Convert variable-length pieces of audio into a series of equally sized chunks

        Pieces carry time on their last axis (see ``layer_with``). Whatever
        does not fill a whole chunk is carried over and prepended to the next
        piece.
        """
        left_over = np.array([])
        for piece in pieces:
            if left_over.size == 0:
                combined = piece
            else:
                combined = np.concatenate([left_over, piece], axis=-1)
            for chunk in chunk_audio(combined.T, chunk_size):
                yield chunk.T
            # The remainder must be measured along the time (last) axis of
            # everything fed to chunk_audio. len(piece) would be the row
            # count, and a plain [-0:] slice would keep the whole array.
            # NOTE(review): assumes chunk_audio emits consecutive
            # chunk_size-long windows and drops the incomplete tail - confirm.
            num_samples = combined.shape[-1]
            remainder = num_samples % chunk_size
            left_over = combined[..., num_samples - remainder:]

    def calc_volume(self, sample: np.ndarray):
        """Find the RMS of the audio"""
        return sqrt(np.mean(np.square(sample)))

    def normalize_volume_to(self, sample, volume):
        """Normalize the volume to a certain RMS

        Completely silent audio (RMS of 0) cannot be scaled and is returned
        as silence instead of dividing by zero.
        """
        current_volume = self.calc_volume(sample)
        if current_volume == 0:
            return sample * 0.0
        return volume * sample / current_volume

    def merge(self, a, b, ratio):
        """Perform a weighted sum of a and b. ratio=1.0 means 100% of b and 0% of a"""
        return (1.0 - ratio) * a + ratio * b

    @staticmethod
    def max_run_length(x: np.ndarray, val: int):
        """Finds the maximum continuous length of the given value in the sequence"""
        if x.size == 0:
            return 0
        # Indices where a run ends: every position whose successor differs,
        # plus the final element
        changes = np.array(x[1:] != x[:-1])
        run_ends = np.append(np.where(changes), len(x) - 1)
        run_lengths = np.diff(np.append(-1, run_ends))
        run_length_values = x[run_ends]
        return max(
            (rl for rl, v in zip(run_lengths, run_length_values) if v == val),
            default=0)

    def vectors_from_fn(self, fn: str):
        """
        Run through a single background audio file, overlaying with wake words.
        Generates (mfccs, target) where mfccs is a series of mfcc values and
        target is a single integer classification of the target network output for that chunk
        """
        audio = load_audio(fn)
        audio_volume = self.calc_volume(audio)
        audio_volume *= 0.4 + 0.5 * random()
        audio = self.normalize_volume_to(audio, audio_volume)

        self.listener.clear()
        chunked_bg = chunk_audio(audio, self.args.chunk_size)
        chunked_ww = self.chunk_audio_pieces(
            self.generate_wakeword_pieces(audio_volume), self.args.chunk_size)

        for i, (chunk_bg, (chunk_ww,
                           targets)) in enumerate(zip(chunked_bg, chunked_ww)):
            chunk = self.merge(chunk_bg, chunk_ww, 0.6)
            # Slide both rolling buffers forward by one chunk
            self.vals_buffer = np.concatenate(
                (self.vals_buffer[len(targets):], targets))
            self.audio_buffer = np.concatenate(
                (self.audio_buffer[len(chunk):], chunk))
            mfccs = self.listener.update_vectors(chunk)
            percent_overlapping = self.max_run_length(
                self.vals_buffer, 1) / len(self.vals_buffer)

            # Positive only once the wake word has just ended and fills most
            # of the buffer; ambiguous overlaps in between are skipped
            if self.vals_buffer[-1] == 0 and percent_overlapping > 0.8:
                target = 1
            elif percent_overlapping < 0.5:
                target = 0
            else:
                continue

            if random() > 1.0 - self.args.save_prob:
                name = splitext(basename(fn))[0]
                wav_file = join('debug', 'ww' if target == 1 else 'nww',
                                '{} - {}.wav'.format(name, i))
                save_audio(wav_file, self.audio_buffer)
            yield mfccs, target

    @staticmethod
    def samples_to_batches(samples: Iterable, batch_size: int):
        """Chunk a series of network inputs and outputs into larger batches

        Yields ``(inputs, outputs)`` array pairs of up to ``batch_size``
        samples. A final partial batch is still yielded; the generator then
        finishes normally.
        """
        it = iter(samples)
        while True:
            batch_in, batch_out = [], []
            with suppress(StopIteration):
                for _ in range(batch_size):
                    sample_in, sample_out = next(it)
                    batch_in.append(sample_in)
                    batch_out.append(sample_out)
            if not batch_in:
                # A plain return ends the generator; raising StopIteration
                # here would surface as RuntimeError under PEP 479.
                return
            yield np.array(batch_in), np.array(batch_out)

    def generate_samples(self):
        """Generate training samples (network inputs and outputs)

        Raises:
            ValueError: if the random data folder contains no wav files
                (otherwise the endless loop below would spin without ever
                yielding).
        """
        filenames = glob_all(self.args.random_data_folder, '*.wav')
        if not filenames:
            raise ValueError('No wav files found in: {}'.format(
                self.args.random_data_folder))
        shuffle(filenames)
        while True:
            for fn in filenames:
                for x, y in self.vectors_from_fn(fn):
                    yield x, y

    def run(self):
        """Train the model on randomly generated batches"""
        _, test_data = self.data.load(train=False, test=True)
        try:
            self.model.fit_generator(self.samples_to_batches(
                self.generate_samples(), self.args.batch_size),
                                     steps_per_epoch=self.args.steps_per_epoch,
                                     epochs=self.epoch + self.args.epochs,
                                     validation_data=test_data,
                                     callbacks=self.callbacks,
                                     initial_epoch=self.epoch)
        finally:
            # Always persist the model, even when training is interrupted
            self.model.save(self.args.model)
            save_params(self.args.model)
Exemplo n.º 26
0
import json
import sys
from collections import deque

from prettyparse import Usage
from shibboleth_get import shibboleth_get

# Command line definition: the usage text declares the credentials and the
# output flags; the arguments below are registered through add_argument.
usage = Usage('''
    Mediaspace m3u8 extractor using Selenium
    
    :username str
        Illinois netid to use
    :password str
        Corresponding password for netid
    :-j --json
        Output as json lines
    :-t --title
        Output title of video
    :-q --quiet
        Hide status output
    ...
''')
usage.add_argument(
    'mediaspace_urls',
    nargs='*',
    help='Mediaspace URLs to extract m3u8 from',
)
usage.add_argument(
    '-m',
    '--metadata',
    default={},
    type=json.loads,
    help='Extra json metadata to attach to each element',
)


def main():
    """Parse the command line and open a session on the first Mediaspace URL.

    Does nothing when no URLs were given.
    """
    args = usage.parse()
    urls = args.mediaspace_urls
    if urls:
        verbose = not args.quiet
        driver = shibboleth_get(args.username, args.password, urls[0], debug=verbose)
Exemplo n.º 27
0
 def test_extra(self):
     """A trailing ``...`` line is not included in the description."""
     parsed = Usage('''
     <desc>
     ...
     ''')
     assert parsed.desc == '<desc>'
Exemplo n.º 28
0
    def test_newline(self):
        usage = '''
<line one>
<line two>
'''
        assert Usage(usage).desc == '<line one> <line two>'
Exemplo n.º 29
0
 def test_description(self):
     """A single-line usage string is returned unchanged as the description."""
     text = 'This is the description'
     parsed = Usage(text)
     assert parsed.desc == text
Exemplo n.º 30
0
    def test_customizer(self):
        """Customizers run on apply() and are carried over, in order, by ``|=``."""

        def appender(suffix):
            # Build a customizer that appends `suffix` to the parser's usage text
            def customize(parser):
                parser.usage = (parser.usage or '') + suffix

            return customize

        first_parser = ArgumentParser()
        assert first_parser.usage is None
        combined = Usage()
        combined.add_customizer(appender('hello'))
        combined.apply(first_parser)
        assert first_parser.usage == 'hello'

        extra = Usage()
        extra.add_customizer(appender(' hi'))
        combined |= extra
        second_parser = ArgumentParser()
        combined.apply(second_parser)
        assert second_parser.usage == 'hello hi'