# Example 1
# 0
def split_data_txt(
        training_data_output: OutputDirectory(type='AnyDirectory'),
        validation_data_output: OutputDirectory(type='AnyDirectory'),
        test_data_output: OutputDirectory(type='AnyDirectory'),
        input_dir: InputDirectory(type='AnyDirectory') = None,
        training_data_ratio=0.7,
        validation_data_ratio=0.1,
        random_split=False,
        seed=0):
    """Split the lines of the input text file into train/validation/test sets.

    The test split receives whatever remains after the training and
    validation ratios are taken.  When ``random_split`` is False the shuffle
    seed is pinned to 0, so the split is reproducible regardless of ``seed``.
    Each split is written to its own output directory and split sizes are
    logged as run metrics.
    """

    def _dump_split(output_dir, file_name, lines):
        # Write one split to <output_dir>/<file_name> and echo the result.
        os.makedirs(output_dir, exist_ok=True)
        path = os.path.join(output_dir, file_name)
        with open(path, 'w', encoding='utf-8') as f:
            f.writelines(lines)
        print(path)
        print(os.listdir(output_dir))

    print('============================================')
    print(
        f"value of input_dir:'{input_dir}', type of input_dir:'{type(input_dir)}'"
    )
    # NOTE(review): input_dir is opened directly as a file here, while the
    # sibling variant joins 'data.txt' onto it — confirm this port maps to a
    # single file rather than a directory.
    with open(input_dir, 'r', encoding='utf-8') as f:
        data = f.readlines()
    random.seed(seed if random_split else 0)
    # list shuffle
    random.shuffle(data)
    n = len(data)
    # for logging
    run = Run.get_context()
    training_data_num = int(n * training_data_ratio)
    dev_data_num = int(n * validation_data_ratio)
    train = data[:training_data_num]
    dev = data[training_data_num:training_data_num + dev_data_num]
    test = data[training_data_num + dev_data_num:]
    print('num of total data:', len(data))
    print('num of training data:', len(train))
    print('num of validation data:', len(dev))
    print('num of test_data:', len(test))
    # for metrics
    run.log(name='num of total data', value=len(data))
    run.log(name='num of training data', value=len(train))
    run.log(name='num of validation data', value=len(dev))
    run.log(name='num of test_data', value=len(test))

    # The three output blocks were identical except for directory/file/lines;
    # the helper removes the triplication.
    _dump_split(training_data_output, "train.txt", train)
    _dump_split(validation_data_output, "dev.txt", dev)
    _dump_split(test_data_output, "test.txt", test)
    print('============================================')
# Example 2
# 0
def split_data_txt(training_data_output: OutputDirectory(),
                   validation_data_output: OutputDirectory(),
                   test_data_output: OutputDirectory(),
                   input_dir: InputDirectory() = None,
                   training_data_ratio=0.7,
                   validation_data_ratio=0.1,
                   random_split=False,
                   seed=0):
    """Split <input_dir>/data.txt into train/validation/test 'data.txt' files.

    The test split receives the remainder after the training and validation
    ratios are taken.  'label.txt' and 'word_to_index.json' from input_dir
    are copied next to each split so downstream steps are self-contained.
    When random_split is False the shuffle seed is pinned to 0, making the
    split reproducible regardless of ``seed``.
    """
    print('============================================')
    print(
        f"value of input_dir:'{input_dir}', type of input_dir:'{type(input_dir)}'"
    )
    path_input_data = os.path.join(input_dir, 'data.txt')
    with open(path_input_data, 'r', encoding='utf-8') as f:
        data = f.readlines()
    # seed is only honored when a random split was requested
    random.seed(seed if random_split else 0)
    random.shuffle(data)
    n = len(data)
    # for metrics
    run = Run.get_context()
    training_data_num = int(n * training_data_ratio)
    dev_data_num = int(n * validation_data_ratio)
    train = data[:training_data_num]
    dev = data[training_data_num:training_data_num + dev_data_num]
    test = data[training_data_num + dev_data_num:]
    print('num of total data:', len(data))
    print('num of training data:', len(train))
    print('num of validation data:', len(dev))
    print('num of test_data:', len(test))
    # for metrics
    run.log(name='num of total data', value=len(data))
    run.log(name='num of training data', value=len(train))
    run.log(name='num of validation data', value=len(dev))
    run.log(name='num of test_data', value=len(test))
    path_label = os.path.join(input_dir, 'label.txt')
    path_word_to_index = os.path.join(input_dir, 'word_to_index.json')

    # NOTE(review): assumes the output directories already exist (created by
    # the pipeline runtime) — confirm; shutil.copy into a missing dst fails.
    shutil.copy(src=path_label, dst=training_data_output)
    shutil.copy(src=path_word_to_index, dst=training_data_output)
    path = os.path.join(training_data_output, "data.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(train)

    shutil.copy(src=path_label, dst=validation_data_output)
    shutil.copy(src=path_word_to_index, dst=validation_data_output)
    path = os.path.join(validation_data_output, "data.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(dev)

    shutil.copy(src=path_label, dst=test_data_output)
    shutil.copy(src=path_word_to_index, dst=test_data_output)
    path = os.path.join(test_data_output, "data.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(test)
    print('============================================')
# Example 3
# 0
def gdal_sample(
        ## define interface (input, output, parameters) of the module here
        output_dir1: OutputDirectory(),
        output_dir2: OutputDirectory(),
        input_dir1: InputDirectory(),
        input_dir2: InputDirectory()):
    """Skeleton module: load a DataFrameDirectory from input_dir1 and preview it.

    NOTE(review): output_dir1/output_dir2 and input_dir2 are declared but
    unused — this is a template meant to be filled in with custom logic.
    """
    print('I am in module definition')
    print(f'input_dir1: {Path(input_dir1).resolve()}')
    print(f'input_dir2: {Path(input_dir2).resolve()}')

    ## add custom logic here

    # DataFrameDirectory wrapper; .data is the underlying pandas DataFrame.
    dfd1 = load_data_frame_from_directory(input_dir1)
    data_frame1 = dfd1.data
    print(data_frame1.head(10))
def compare_two_models(
    the_better_model: OutputDirectory(),
    first_trained_model: InputDirectory(type='AnyDirectory') = None,
    first_trained_result: InputDirectory(type='AnyDirectory') = None,
    second_trained_model: InputDirectory(type='AnyDirectory') = None,
    second_trained_result: InputDirectory(type='AnyDirectory') = None,
):
    """Pick the model with the higher 'acc' in its result.json and copy its
    'BestModel' file into the_better_model.

    Ties favor the first model.  The chosen side is logged as the
    'which one' run metric.
    """
    print('=====================================================')
    print(f'input_dir: {Path(first_trained_model).resolve()}')
    print(f'input_dir: {Path(first_trained_result).resolve()}')
    print(f'input_dir: {Path(second_trained_model).resolve()}')
    print(f'input_dir: {Path(second_trained_result).resolve()}')
    # for logging
    run = Run.get_context()
    # Use context managers so the result files are closed deterministically
    # (the original leaked the handles from json.load(open(...))); this also
    # matches the style of the other compare_two_models variant in this file.
    path = os.path.join(first_trained_result, 'result.json')
    with open(path, 'r', encoding='utf-8') as f:
        acc_first = json.load(f)['acc']

    path = os.path.join(second_trained_result, 'result.json')
    with open(path, 'r', encoding='utf-8') as f:
        acc_second = json.load(f)['acc']

    dst = os.path.join(the_better_model, 'BestModel')
    if acc_first >= acc_second:
        print('choose the first model')
        run.log(name='which one', value='first')
        src = os.path.join(first_trained_model, 'BestModel')
    else:
        print('choose the second model')
        run.log(name='which one', value='second')
        src = os.path.join(second_trained_model, 'BestModel')
    shutil.copy(src=src, dst=dst)
    print('=====================================================')
# Example 5
# 0
def fasttext_evaluation(
        model_testing_result: OutputDirectory(type='AnyDirectory'),
        trained_model_dir: InputDirectory(type='AnyDirectory') = None,
        test_data_dir: InputDirectory(type='AnyDirectory') = None,
        char2index_dir: InputDirectory(type='AnyDirectory') = None):
    """Evaluate the saved model on <test_data_dir>/test.txt and write the
    accuracy as {'acc': ...} to <model_testing_result>/result.json.
    """
    print('=====================================================')
    print(f'trained_model_dir: {Path(trained_model_dir).resolve()}')
    print(f'test_data_dir: {Path(test_data_dir).resolve()}')
    print(f'char2index_dir: {Path(char2index_dir).resolve()}')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    max_len_ = 38  # fixed sequence length used at training time
    path = os.path.join(test_data_dir, 'test.txt')
    test_samples = load_dataset(file_path=path,
                                max_len=max_len_,
                                char2index_dir=char2index_dir)

    test_iter = DataIter(test_samples)

    path = os.path.join(trained_model_dir, 'BestModel')
    model = torch.load(f=path)

    # Ensure the output directory exists and close the metric file
    # deterministically (the original leaked the handle from
    # json.dump(..., open(path, 'w'))).
    os.makedirs(model_testing_result, exist_ok=True)
    path = os.path.join(model_testing_result, 'result.json')
    acc_ = test(model, test_iter, device)
    with open(path, 'w') as f:
        json.dump({"acc": acc_}, f)
    print('\n============================================')
# Example 6
# 0
def fasttext_evaluation(model_testing_result: OutputDirectory(),
                        trained_model_dir: InputDirectory() = None,
                        test_data_dir: InputDirectory() = None):
    """Evaluate the trained FastText model on <test_data_dir>/data.txt and
    write {'acc': ...} to <model_testing_result>/result.json.

    The vocabulary, label map and shared hyper-parameters saved alongside the
    data/model during training are reloaded so evaluation matches training.
    """
    print('=====================================================')
    print(f'trained_model_dir: {Path(trained_model_dir).resolve()}')
    print(f'test_data_dir: {Path(test_data_dir).resolve()}')
    path_word_to_index = os.path.join(test_data_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(test_data_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    # hyper-parameters shared between training and evaluation (max_len, ngram_size)
    path = os.path.join(trained_model_dir, 'shared_params.json')
    with open(path, 'r', encoding='utf-8') as f:
        shared_params = json.load(f)
    path = os.path.join(test_data_dir, 'data.txt')
    test_samples = load_dataset(file_path=path,
                                max_len=shared_params['max_len'],
                                ngram_size=shared_params['ngram_size'],
                                word_to_index=word_to_index,
                                map_label_id=map_label_id)
    test_iter = DataIter(samples=test_samples, shuffle=False, device=device)
    # map_location lets a GPU-trained checkpoint load on a CPU-only node
    path = os.path.join(trained_model_dir, 'BestModel')
    model = torch.load(f=path, map_location=device)
    # NOTE(review): assumes model_testing_result already exists — confirm the
    # runtime creates output ports before the module runs.
    path = os.path.join(model_testing_result, 'result.json')
    acc_ = test(model, test_iter)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump({"acc": acc_}, f)
    print('\n============================================')
def fasttext_score(
        scored_data_output_dir: OutputDirectory(),
        fasttext_model_dir: InputDirectory() = '.'
):
    """Parallel-run init: load vocab, labels, shared params and the BestModel
    from fasttext_model_dir, then return a ``run(files)`` callable that scores
    mini-batches and writes the predictions to parquet files in
    scored_data_output_dir.
    """
    print('=====================================================')
    print(f'fasttext_model: {Path(fasttext_model_dir).resolve()}')
    print(f'scored_data_output_dir: {scored_data_output_dir}')
    path_word_to_index = os.path.join(fasttext_model_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(fasttext_model_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    # hyper-parameters shared with training (max_len, ngram_size)
    path = os.path.join(fasttext_model_dir, 'shared_params.json')
    with open(path, 'r', encoding='utf-8') as f:
        shared_params = json.load(f)
    # map_location lets a GPU-trained checkpoint load on a CPU-only node
    path = os.path.join(fasttext_model_dir, 'BestModel')
    model = torch.load(f=path, map_location=device)

    def run(files):
        # Called once per mini-batch; `files` lists the items to score.
        # Returns the predicted classes and persists a <uuid>.parquet file
        # with columns Filename/Class.
        if len(files) == 0:
            return []
        with torch.no_grad():
            test_samples = load_dataset(file_path=files, max_len=shared_params['max_len'],
                                        ngram_size=shared_params['ngram_size'], word_to_index=word_to_index,
                                        map_label_id=map_label_id)
            test_iter = DataIter(samples=test_samples, batch_size=1, shuffle=False, device=device)
            results = predict_parallel(model, test_iter, map_id_label)
            dict_ = {'Filename': files, 'Class': results}
            df = pd.DataFrame(data=dict_)
            output_file = os.path.join(scored_data_output_dir, f"{uuid4().hex}.parquet")
            df.to_parquet(output_file, index=False)
        return results

    return run
# Example 8
# 0
def fasttext_score_parallel(
        scored_dataset: OutputDirectory(type='AnyDirectory'),
        fasttext_model: InputDirectory(type='AnyDirectory') = '.',
        char2index_dir: InputDirectory(type='AnyDirectory') = None
):
    """Parallel-run init: load the BestModel from fasttext_model and return a
    ``run(files)`` callable that scores text files and writes the predictions
    to parquet files in scored_dataset.
    """
    print('=====================================================')
    print(f'fasttext_model: {Path(fasttext_model).resolve()}')
    print(f'char2index_dir: {Path(char2index_dir).resolve()}')
    print(f'scored_dataset: {scored_dataset}')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    max_len_ = 38  # fixed sequence length used at training time
    path = os.path.join(fasttext_model, 'BestModel')
    # map_location added for consistency with the other loaders in this file:
    # without it, a checkpoint saved on GPU fails to load on a CPU-only node.
    model = torch.load(f=path, map_location=device)

    def run(files):
        # Called once per mini-batch; `files` lists the text files to score.
        if len(files) == 0:
            return []
        print(f"Ready to process {len(files)} texts.")
        print('\n'.join(files))

        with torch.no_grad():
            test_samples = load_dataset_parallel(files=files, max_len=max_len_, char2index_dir=char2index_dir)
            test_iter = DataIter_Parallel(test_samples, shuffle=False)
            results = predict_parallel(model, test_iter, device)
            dict_ = {'Filename': files, 'Class': results}
            df = pd.DataFrame(data=dict_)
            print("Result:")
            print(df)
            output_file = os.path.join(scored_dataset, f"{uuid4().hex}.parquet")
            df.to_parquet(output_file, index=False)
        return results

    return run
# Example 9
# 0
def basic_module(
    output_dir: OutputDirectory(),
    input_dir: InputDirectory() = '.',
    str_param='some_string',
):
    """Echo the parameters and write str_param to <output_dir>/output.txt."""
    print(f'input_dir: {Path(input_dir).resolve()}')
    print(f'str_param: {str_param}')
    # Write the parameter value as the module's only output artifact.
    out_path = Path(output_dir) / "output.txt"
    with out_path.open('w') as fout:
        fout.write(str_param)
# Example 10
# 0
def parallel_score_images(
    scored_dataset: OutputDirectory(),
    trained_model: InputDirectory() = None,
):
    """Parallel-run init: load an MNIST image classifier and return a
    ``run(files)`` callable that scores image files and writes a parquet of
    class probabilities to scored_dataset.
    """
    # Use the path of a prepared model if trained_model is None
    if trained_model is None:
        trained_model = str(
            Path(__file__).parent /
            'tests/parallel_score_images/inputs/trained_model/')
    print("Scored dataset:", scored_dataset)
    print("Trained model:", trained_model)
    # Deserialize onto the CPU when no GPU is present.
    map_location = 'cpu' if not torch.cuda.is_available() else None
    model = torch.load(os.path.join(trained_model, 'model.pt'),
                       map_location=map_location)
    os.makedirs(scored_dataset, exist_ok=True)
    print("Model is loaded:", model)

    def run(files):
        # Called once per mini-batch; `files` lists the image paths to score.
        if len(files) == 0:
            return []
        results = []
        nthreads = min(2 * cpu_count(), len(files))

        print(f"Ready to process {len(files)} images.")
        print('\n'.join(files))
        # Decode the images concurrently (I/O bound).  Bug fix: the original
        # then re-opened every file inside the loop, discarding this work.
        with ThreadPool(nthreads) as pool:
            imgs = pool.map(Image.open, files)

        softmax = nn.Softmax(dim=1)  # hoisted: loop-invariant
        for f, img in zip(files, imgs):
            tensor = transform(img).unsqueeze(0)
            if torch.cuda.is_available():
                tensor = tensor.cuda()

            with torch.no_grad():
                output = model(tensor)
                pred_probs = softmax(output).cpu().numpy()[0]
                index = torch.argmax(output, 1)[0].cpu().item()
                result = {
                    'Filename': Path(f).name,
                    'Class': MNIST.classes[index]
                }
                for c, prob in zip(MNIST.classes, pred_probs):
                    result[f"Prob of {c}"] = prob
            results.append(result)
        columns = sorted(list(results[0].keys()))
        df = pd.DataFrame(results, columns=columns)
        print("Result:")
        print(df)
        output_file = os.path.join(scored_dataset, f"{uuid4().hex}.parquet")
        df.to_parquet(output_file, index=False)
        return results

    return run
# Example 11
# 0
def slice_video(
    input_video: InputDirectory(
        description="input directory of video file") = './data/input/video',
    output_audio: OutputDirectory(
        description="output directory of audio from video"
    ) = '/data/output/video',
    output_images: OutputDirectory(
        description="output directory of images slice from video"
    ) = '/data/output/images',
):
    """Extract the audio track and per-frame JPEGs from input_video via ffmpeg.

    Writes <output_audio>/video.aac and <output_images>/%05d_video.jpg.

    Raises:
        subprocess.CalledProcessError: if either ffmpeg invocation fails.
    """

    ## this module takes input video, and slice the video into images with ffmpeg

    # Argument-list form with shell=False avoids shell injection and
    # word-splitting when paths contain spaces or metacharacters.
    subprocess.run(
        ["ffmpeg", "-i", str(input_video), f"{output_audio}/video.aac"],
        check=True)

    subprocess.run(
        ["ffmpeg", "-i", str(input_video),
         f"{output_images}/%05d_video.jpg", "-hide_banner"],
        check=True)
# Example 12
# 0
def train(model, trained_model_dir: OutputDirectory(type='AnyDirectory'), train_iter, dev_iter=None,
          epochs=20, learning_rate=0.0001, stop_patience=3, device=None):
    """Train ``model`` with Adam + cross-entropy, keeping the best checkpoint.

    On the last batch of each epoch the model is evaluated on ``dev_iter``;
    whenever validation loss improves, the whole model object is saved to
    <trained_model_dir>/BestModel.  Training stops early when the loss has
    not improved for ``stop_patience`` epochs.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print('device:', device)

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    model.to(device)
    loss = torch.nn.CrossEntropyLoss()

    # (best validation loss so far, epoch it was reached)
    min_loss_epoch = (None, None)
    stop_flag = False

    model_name = model._get_name()
    # for metrics
    run = Run.get_context()
    for epoch in range(epochs):
        loss_value_list = []
        total_iter = len(train_iter)
        for i, (btach_x, btach_y) in enumerate(train_iter):
            outputs = model(btach_x)
            optimizer.zero_grad()
            loss_value = loss(outputs, btach_y)
            loss_value.backward()
            optimizer.step()
            loss_value_list.append(loss_value.cpu().data.numpy())
            # for metrics
            # NOTE(review): logs the running epoch-mean loss on EVERY step,
            # which can be very chatty for large datasets — confirm intended.
            run.log(name='CrossEntropyLoss', value=np.mean(loss_value_list))
            if i % 50 == 0:
                str_ = f"{model_name} epoch:{epoch + 1}/{epochs} step:{i + 1}/{total_iter} mean_loss:{np.mean(loss_value_list): .4f}"
                print(str_)

            # Validate on the final batch of the epoch.
            if (i + 1) == total_iter and dev_iter is not None:
                loss_, acc_, prec_, recall_, f1_ = evaluation(model, dev_iter)
                str_ = f" validation loss:{loss_:.4f}  acc:{acc_:.4f}"
                print(str_)
                # back to train mode (presumably evaluation() switched the
                # model to eval mode — confirm)
                model.train()

                if (min_loss_epoch[0] is None) or (min_loss_epoch[0] > loss_):
                    min_loss_epoch = (loss_, epoch)
                    os.makedirs(trained_model_dir, exist_ok=True)
                    path = os.path.join(trained_model_dir, "BestModel")
                    # saves the full model object, not just the state_dict
                    torch.save(obj=model, f=path)
                else:
                    # no improvement for stop_patience epochs -> early stop
                    if (epoch - min_loss_epoch[1]) >= stop_patience:
                        stop_flag = True
                        break

        if stop_flag is True:
            break
# Example 13
# 0
def sample_module(
    # The input/output port are defined using the following 4 annotations.
    # Note that you need to register data type using
    # DataType.create_data_type(ws, 'MyDirectory', description=description, is_directory=True)
    # DataType.create_data_type(ws, 'MyFile', description=description, is_directory=False)
    # See https://docs.microsoft.com/en-us/python/api/azureml-pipeline-core/azureml.pipeline.core.graph.datatype?view=azure-ml-py#create-data-type-workspace--name--description--is-directory--parent-datatypes-none-
    output_dir: OutputDirectory(type='MyDirectory'),
    output_file: OutputFile(type='MyFile'),
    input_dir: InputDirectory(type='MyDirectory') = None,
    input_file: InputFile(type='MyFile') = None,
    # The parameter with default values will be considered as annotated with such type,
    # Now we support the following 5 types: str, int, float, bool, enum
    str_param='abc',
    int_param=1,
    float_param=0.1,
    bool_param=False,
    enum_param=MyEnum.Enum0,
    # If the default value is None without annotation, it will be treated as str.
    none_param=None,
):
    """A sample module use different parameter types and customized input/output ports.

    Behavior: echoes every argument; ``data`` is the input file's content if
    input_file is given, otherwise str_param.  If input_dir is given it is
    copied to output_dir, otherwise output_dir gets a test.txt containing
    ``data``.  ``data`` is always also written to output_file.
    """
    print(f"Arg 'input_dir' = '{input_dir}', type='{type(input_dir)}'")
    if input_dir:
        print(f"Contents of input directory:")
        print('\n'.join(f.name for f in Path(input_dir).iterdir()))
    print(f"Arg 'input_file' = {input_file}, type='{type(input_file)}'")
    print(f"Arg 'output_dir' = {output_dir}, type='{type(output_dir)}'")
    print(f"Arg 'output_file' = {output_file}, type='{type(output_file)}'")
    print(f"Arg 'str_param' = {str_param}, type='{type(str_param)}'")
    print(f"Arg 'int_param' = {int_param}, type='{type(int_param)}'")
    print(f"Arg 'float_param' = {float_param}, type='{type(float_param)}'")
    print(f"Arg 'bool_param' = {bool_param}, type='{type(bool_param)}'")
    print(f"Arg 'enum_param' = {enum_param}, type='{type(enum_param)}'")
    print(f"Arg 'none_param' = {none_param}, type='{type(none_param)}'")

    data = str_param
    if input_file:
        with open(input_file, 'r') as fin:
            data = fin.read()
        print("Content of input file:", data)
    if input_dir:
        # NOTE(review): shutil.copytree requires output_dir to NOT exist yet
        # (no dirs_exist_ok here) — confirm the runtime leaves it uncreated.
        shutil.copytree(input_dir, output_dir)
    else:
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, "test.txt"), 'w') as fout:
            fout.write(data)
    with open(output_file, 'w') as fout:
        fout.write(data)
def fasttext_train(
        trained_model_dir: OutputDirectory(type='AnyDirectory'),
        training_data_dir: InputDirectory(type='AnyDirectory') = None,
        validation_data_dir: InputDirectory(type='AnyDirectory') = None,
        char2index_dir: InputDirectory(type='AnyDirectory') = None,
        epochs=2,
        batch_size=32,
        learning_rate=0.0005,
        embedding_dim=128):
    """Train a FastText classifier on <training_data_dir>/train.txt, validating
    on <validation_data_dir>/dev.txt, and save the best checkpoint into
    trained_model_dir (done inside ``train``).
    """
    print('============================================')
    print('training_data_dir:', training_data_dir)
    print('validation_data_dir:', validation_data_dir)
    c2i = get_vocab(char2index_dir)
    class_ = get_classs()
    max_len_ = 38  # fixed sequence length, shared with evaluation/scoring
    n_class_ = len(class_)
    vocab_size_ = len(c2i)
    stop_patience = 5  # early-stop window passed to train()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    path = os.path.join(training_data_dir, 'train.txt')
    train_samples = load_dataset(file_path=path,
                                 max_len=max_len_,
                                 char2index_dir=char2index_dir)
    path = os.path.join(validation_data_dir, 'dev.txt')
    dev_samples = load_dataset(file_path=path,
                               max_len=max_len_,
                               char2index_dir=char2index_dir)

    train_iter = DataIter(train_samples, batch_size)
    dev_iter = DataIter(dev_samples, batch_size)

    model = FastText(vocab_size=vocab_size_,
                     n_class=n_class_,
                     embed_dim=embedding_dim)
    start = time.time()
    train(model,
          trained_model_dir,
          train_iter,
          dev_iter=dev_iter,
          epochs=epochs,
          learning_rate=learning_rate,
          stop_patience=stop_patience,
          device=device)
    end = time.time()
    print('\nspent time: %.2f sec' % (end - start))
    print('============================================')
# Example 15
# 0
def add(left: InputDirectory(), right: InputDirectory(),
        output: OutputDirectory()):
    """Read one float from each input's `data` file, log and write their sum."""
    left_value = float((Path(left).resolve() / 'data').read_text().strip())
    right_value = float((Path(right).resolve() / 'data').read_text().strip())
    print('left = ', left_value)
    print('right = ', right_value)

    total = left_value + right_value

    # Record operands and result as run metrics.
    run = Run.get_context()
    run.log('result', total)
    run.log('left', left_value)
    run.log('right', right_value)
    run.flush()

    Path(output).absolute().mkdir(parents=True, exist_ok=True)
    (Path(output).resolve() / 'data').write_text(str(total))
# Example 16
# 0
def tokenizer(
    input_file_path: InputDirectory(description="Input text file path"),
    output_dir_path: OutputDirectory(description="Output file directory path"),
    output_to_file: IntParameter(
        description=
        "whether to interpret output_dir_path as file to write to, or folder containing file to write to"
    ) = 0,
    input_is_tsv: IntParameter(
        description="bool determining whether to use tsv related options") = 0,
    delimiter: StringParameter(
        description="optional, delimiter to use if parsing a tsv type file"
    ) = None,
    ignore_cols: IntParameter(
        description="indices of columns to ignore if parsing a tsv") = None,
    mode: EnumParameter(
        enum=EnumMode,
        description="Tokenizer to use [train, inference, spacy]") = EnumMode.
    train,
    type: EnumParameter(
        enum=EnumType,
        description="Whether to use word tokenizer or sentence tokenizer"
    ) = EnumType.word,
):
    """Build a CLI argument vector and execute tokenizer.py in-process.

    Note: the ``type`` parameter shadows the builtin, but renaming it would
    change the component's public interface, so it is kept.
    """
    # Mandatory arguments first; the enum parameters pass their .value string.
    sys.argv = [
        'tokenizer.py',
        '-i',
        str(input_file_path),
        '-o',
        str(output_dir_path),
        '--output_to_file',
        str(output_to_file),
        '--input_is_tsv',
        str(input_is_tsv),
        '-m',
        mode.value,
        '-t',
        type.value,
    ]
    # Optional flags are only appended when explicitly provided.
    if delimiter is not None:
        sys.argv += ['--delimiter', str(delimiter)]
    if ignore_cols is not None:
        sys.argv += ['--ignore_cols', str(ignore_cols)]
    print(' '.join(sys.argv))
    # Runs the script as if invoked from the command line.
    runpy.run_path('tokenizer.py', run_name='__main__')
# Example 17
# 0
def prepare_data(
    output_data: OutputDirectory(),
    input_data: InputDirectory() = None,
    str_param: str = None,
    int_param: int = 0,
    enum_param: EnumEnumParam = None,
):
    """Build a command line for prepare_data.py and execute it in-process."""
    argv = [
        'prepare_data.py',
        '--input_data', str(input_data),
        '--output_data', str(output_data),
        '--int_param', str(int_param),
    ]
    # Optional arguments are appended only when explicitly provided.
    if str_param is not None:
        argv.extend(['--str_param', str(str_param)])
    if enum_param is not None:
        argv.extend(['--enum_param', enum_param.value])
    print(' '.join(argv))
    sys.argv = argv
    runpy.run_path('prepare_data.py', run_name='__main__')
# Example 18
# 0
def copy_files(
    output_dir: OutputDirectory(),
    input_dir: InputDirectory() = '.',
    str_param='some_string',
):
    """Validate that input_dir is a non-empty directory, then write str_param
    to <output_dir>/output.txt.

    Raises:
        ValueError: if input_dir is not a directory or contains no files.
    """
    input_dir = Path(input_dir)
    print(f'input_dir: {input_dir.resolve()}')
    print(f'str_param: {str_param}')

    files = [str(f) for f in input_dir.iterdir()] if input_dir.is_dir() else []
    if not files:
        # Fixed grammar of the original message ("an directory").
        raise ValueError('input_dir should be a directory with files')

    output_dir = Path(output_dir)
    with open(output_dir / "output.txt", 'w') as fout:
        fout.write(str_param)
# Example 19
# 0
def style_transform_parallel(
    model_dir: InputDirectory(
        description="saved torch model to be used for stylizing the image."),
    output_path: OutputDirectory(
        description="directory holding the output images"),
    style: StringParameter(description="style name") = None,
):
    """Parallel-run init: load the <style>.pth TransformerNet checkpoint and
    return a ``run(mini_batch)`` callable that stylizes each image and saves
    it under output_path with the same basename.
    """
    print(f'output path: {output_path}')
    print(f'Cuda available? {torch.cuda.is_available()}')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        style_model = TransformerNet()
        state_dict = torch.load(os.path.join(model_dir, style + ".pth"))
        # remove saved deprecated running_* keys in InstanceNorm from the checkpoint
        for k in list(state_dict.keys()):
            if re.search(r'in\d+\.running_(mean|var)$', k):
                del state_dict[k]
        style_model.load_state_dict(state_dict)
        style_model.to(device)
    print(f'Model loaded successfully. Path: {model_dir}')

    def run(mini_batch):
        # Called once per mini-batch; returns the list of output file paths.
        result = []
        for image_file_path in mini_batch:
            img = load_image(image_file_path)
            print(f'load image from: {image_file_path}')
            with torch.no_grad():
                # Scale pixels to [0, 255] as the model expects.
                content_transform = transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Lambda(lambda x: x.mul(255))
                ])
                content_image = content_transform(img)
                content_image = content_image.unsqueeze(0).to(device)
                output = style_model(content_image).cpu()
                output_file_path = os.path.join(
                    output_path, os.path.basename(image_file_path))
                save_image(output_file_path, output[0])
                result.append(output_file_path)
                print(f'transferred image saved in: {output_file_path}')
        return result

    return run
# Example 20
# 0
def mpi_module(
    output_dir: OutputDirectory(),
    input_dir: InputDirectory() = '.',
    param0: str = 'abc',
    param1: int = 10,
):
    """MPI demo module: only rank 0 writes param0+param1 to output.txt."""
    from mpi4py import MPI
    # Echo all arguments for debugging.
    for k, v in locals().items():
        print(f"{k}: {v}")
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    print(f"This is an MPI module, I'm rank {rank}/{size}.")
    if rank != 0:
        # Non-root ranks contribute nothing.
        print("I don't return data.")
        return
    print("I will write data.")
    out_file = Path(output_dir) / "output.txt"
    with open(out_file, 'w') as fout:
        fout.write(param0)
        fout.write(str(param1))
# Example 21
# 0
def stitch_video(
        input_images: InputDirectory(description="input directory of images"),
        input_audio: InputDirectory(description="input directory of audio"),
        output_video: OutputDirectory(
            description="output directory of stitched video file")):
    """Stitch the sliced frames and audio track back into a video with ffmpeg.

    Reads <input_images>/%05d_video.jpg and <input_audio>/video.aac, and
    writes video_without_audio.mp4 then video_with_audio.mp4 into
    output_video.

    Raises:
        subprocess.CalledProcessError: if either ffmpeg invocation fails.
    """
    # Bug fix: the original referenced an undefined `args` namespace
    # (args.images_dir / args.output_dir / args.input_audio), which raised
    # NameError at runtime — use the function parameters instead.  Arg-list
    # form with shell=False also avoids shell quoting issues.
    subprocess.run(
        ["ffmpeg", "-framerate", "30",
         "-i", f"{input_images}/%05d_video.jpg",
         "-c:v", "libx264", "-profile:v", "high", "-crf", "20",
         "-pix_fmt", "yuv420p",
         "-y", f"{output_video}/video_without_audio.mp4"],
        check=True)

    subprocess.run(
        ["ffmpeg",
         "-i", f"{output_video}/video_without_audio.mp4",
         "-i", f"{input_audio}/video.aac",
         "-map", "0:0", "-map", "1:0",
         "-vcodec", "copy", "-acodec", "copy",
         "-y", f"{output_video}/video_with_audio.mp4"],
        check=True)
def compare_two_models(the_better_model: OutputDirectory(),
                       first_trained_model: InputDirectory() = None,
                       first_trained_result: InputDirectory() = None,
                       second_trained_model: InputDirectory() = None,
                       second_trained_result: InputDirectory() = None):
    """Copy the BestModel with the higher 'acc' in its result.json into
    the_better_model, along with the first model's vocab/label/params files.

    Ties favor the first model; the chosen side is logged as the
    'which one' run metric.
    """
    print('=====================================================')
    print(f'input_dir: {Path(first_trained_model).resolve()}')
    print(f'input_dir: {Path(first_trained_result).resolve()}')
    print(f'input_dir: {Path(second_trained_model).resolve()}')
    print(f'input_dir: {Path(second_trained_result).resolve()}')
    # for metrics
    run = Run.get_context()
    path = os.path.join(first_trained_result, 'result.json')
    with open(path, 'r', encoding='utf-8') as f:
        result_first = json.load(f)['acc']
    path = os.path.join(second_trained_result, 'result.json')
    with open(path, 'r', encoding='utf-8') as f:
        second_first = json.load(f)['acc']
    dst = the_better_model
    if result_first >= second_first:
        print('choose the first model')
        run.log(name='which one', value='first')
        src = os.path.join(first_trained_model, 'BestModel')
        shutil.copy(src=src, dst=dst)
    else:
        print('choose the second model')
        run.log(name='which one', value='second')
        src = os.path.join(second_trained_model, 'BestModel')
        shutil.copy(src=src, dst=dst)
    # NOTE(review): the auxiliary files are always taken from the FIRST model,
    # even when the second one wins — confirm both models share them.
    path_word_to_index = os.path.join(first_trained_model,
                                      'word_to_index.json')
    path_label = os.path.join(first_trained_model, 'label.txt')
    path_shared_params = os.path.join(first_trained_model,
                                      'shared_params.json')
    shutil.copy(src=path_word_to_index, dst=dst)
    shutil.copy(src=path_label, dst=dst)
    shutil.copy(src=path_shared_params, dst=dst)
    print('=====================================================')
# Example 23
# 0
def merge(
        cleaned_yellow_data: InputDirectory(
            description=
            "cleaned yellow data, needs to be read as pandas dataframe"),
        cleaned_green_data: InputDirectory(
            description=
            "cleaned green data, needs to be read as pandas dataframe"),
        merged_output: OutputDirectory(description="output data after merge"),
):
    """Concatenate the cleaned green and yellow taxi datasets.

    Reads each input as CSV, stacks the rows (green first, then yellow)
    with a fresh integer index, and writes the result to
    ``<merged_output>/merged.csv`` when an output directory was supplied.
    """
    green_df = pd.read_csv(cleaned_green_data)
    yellow_df = pd.read_csv(cleaned_yellow_data)

    print("Argument (output merge taxi data path): %s" % merged_output)

    # DataFrame.append was deprecated and removed in pandas 2.0;
    # pd.concat with ignore_index=True yields the same stacked frame.
    merge_df = pd.concat([green_df, yellow_df], ignore_index=True)
    merge_df.reset_index(inplace=True, drop=True)

    if merged_output is not None:
        os.makedirs(merged_output, exist_ok=True)
        print("merge output folder %s created" % merged_output)
        path = os.path.join(merged_output, "merged.csv")
        merge_df.to_csv(path)
Exemplo n.º 24
0
def fasttext_train(trained_model_dir: OutputDirectory(type='ModelDirectory'),
                   training_data_dir: InputDirectory() = None,
                   validation_data_dir: InputDirectory() = None,
                   epochs=1,
                   batch_size=64,
                   max_len=32,
                   embed_dim=300,
                   hidden_size=256,
                   ngram_size=200000,
                   dropout=0.5,
                   learning_rate=0.001):
    """Train a FastText text classifier and save artifacts for scoring.

    Loads the vocabulary, label map, and 'data.txt' datasets from
    ``training_data_dir`` / ``validation_data_dir``, builds a FastText
    model, and delegates the training loop to ``train``. The vocabulary,
    label file, and the shared dataset parameters (max_len / ngram_size)
    are copied/written into ``trained_model_dir`` so the scoring step can
    reproduce preprocessing.
    """
    print('============================================')
    print('training_data_dir:', training_data_dir)
    print('validation_data_dir:', validation_data_dir)
    path_word_to_index = os.path.join(training_data_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(training_data_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    class_num = len(map_id_label)
    vocab_size = len(word_to_index)
    # stop training after this many epochs without validation improvement
    stop_patience = 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    # load training dataset
    path = os.path.join(training_data_dir, 'data.txt')
    train_samples = load_dataset(file_path=path,
                                 word_to_index=word_to_index,
                                 map_label_id=map_label_id,
                                 max_len=max_len,
                                 ngram_size=ngram_size)
    train_iter = DataIter(samples=train_samples,
                          batch_size=batch_size,
                          shuffle=True,
                          device=device)
    # load validation dataset
    path = os.path.join(validation_data_dir, 'data.txt')
    dev_samples = load_dataset(file_path=path,
                               word_to_index=word_to_index,
                               map_label_id=map_label_id,
                               max_len=max_len,
                               ngram_size=ngram_size)
    dev_iter = DataIter(samples=dev_samples,
                        batch_size=batch_size,
                        shuffle=True,
                        device=device)

    model = FastText(vocab_size=vocab_size,
                     class_num=class_num,
                     dropout=dropout,
                     embed_dim=embed_dim,
                     hidden_size=hidden_size,
                     ngram_size=ngram_size)
    # watch parameters (prints the bound method, whose repr includes the
    # module structure)
    print(model.parameters)
    # Ensure the output directory exists before writing into it, as the
    # other components in this file do for their output directories.
    os.makedirs(trained_model_dir, exist_ok=True)
    # copy word_to_index.json and label.txt for later scoring.
    shutil.copy(src=path_word_to_index, dst=trained_model_dir)
    shutil.copy(src=path_label, dst=trained_model_dir)
    # shared parameters for loading dataset
    shared_params = {'max_len': max_len, 'ngram_size': ngram_size}
    path = os.path.join(trained_model_dir, 'shared_params.json')
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(shared_params, f)
    start = time.time()
    train(model,
          trained_model_dir,
          train_iter=train_iter,
          dev_iter=dev_iter,
          epochs=epochs,
          learning_rate=learning_rate,
          stop_patience=stop_patience,
          device=device)
    end = time.time()
    print('\nduration of training process: %.2f sec' % (end - start))
    print('============================================')
Exemplo n.º 25
0
def enter_num_manually(output: OutputDirectory(), num='0'):
    """Write the number string ``num`` into a file named 'data' in ``output``."""
    out_dir = Path(output)
    out_dir.absolute().mkdir(parents=True, exist_ok=True)
    target = out_dir.resolve() / 'data'
    target.write_text(num)
Exemplo n.º 26
0
def train(model,
          trained_model_dir: OutputDirectory(type='AnyDirectory'),
          train_iter,
          dev_iter=None,
          epochs=20,
          learning_rate=0.0001,
          stop_patience=3,
          device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    model.to(device)
    loss = torch.nn.CrossEntropyLoss()

    min_loss_epoch = (None, None)
    stop_flag = False

    model_name = model._get_name()
    tip_str = f"\n{model_name} start training....."
    print(tip_str)
    # for logging
    run = Run.get_context()
    for epoch in range(epochs):
        loss_value_list = []
        total_iter = len(train_iter)
        for i, (x_batch, y_batch) in enumerate(train_iter):
            x_batch = torch.LongTensor(x_batch).to(device)
            y_batch = torch.LongTensor(y_batch).to(device)

            outputs = model(x_batch)
            optimizer.zero_grad()
            loss_value = loss(outputs, y_batch)
            loss_value.backward()
            optimizer.step()
            # for metrics
            run.log(name='CrossEntropyLoss', value=np.mean(loss_value_list))
            loss_value_list.append(loss_value.cpu().data.numpy())
            str_ = f"{model_name} epoch:{epoch + 1}/{epochs} step:{i + 1}/{total_iter} mean_loss:{np.mean(loss_value_list): .4f}"
            sys.stdout.write('\r' + str_)
            sys.stdout.flush()

            if (i + 1) == total_iter and dev_iter is not None:
                loss_, acc_, prec_, recall_, f1_ = eval(
                    model, dev_iter, device)
                str_ = f" validation loss:{loss_:.4f}  acc:{acc_:.4f}"
                sys.stdout.write(str_)
                sys.stdout.flush()
                print()

                model.train()

                if (min_loss_epoch[0] is None) or (min_loss_epoch[0] > loss_):
                    min_loss_epoch = (loss_, epoch)
                    os.makedirs(trained_model_dir, exist_ok=True)
                    path = trained_model_dir
                    torch.save(obj=model, f=path)
                else:
                    if (epoch - min_loss_epoch[1]) >= stop_patience:
                        stop_flag = True
                        break

        if stop_flag is True:
            break