def xgboost_predict(
    data_path: InputPath('CSV'),  # Also supports LibSVM
    model_path: InputPath('XGBoostModel'),
    predictions_path: OutputPath('Text'),
    label_column: int = None,
):
    '''Make predictions using a trained XGBoost model.

    Args:
        data_path: Path for the feature data in CSV format.
        model_path: Path for the trained model in binary XGBoost format.
        predictions_path: Output path for the predictions.
        label_column: Column containing the label data.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    from pathlib import Path

    import numpy
    import xgboost

    csv_data_spec = data_path + '?format=csv'
    # Only specifying the column if it's passed.
    if label_column is not None:
        csv_data_spec += '&label_column=' + str(label_column)
    testing_data = xgboost.DMatrix(csv_data_spec)

    model = xgboost.Booster(model_file=model_path)

    predictions = model.predict(testing_data)

    Path(predictions_path).parent.mkdir(parents=True, exist_ok=True)
    numpy.savetxt(predictions_path, predictions)
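# A minimal sketch, not part of the original component: functions written
# against InputPath/OutputPath like xgboost_predict above are typically turned
# into reusable pipeline components with the KFP v1 SDK. The base image and
# package list below are illustrative assumptions, not a published
# configuration; assumes the kfp package is installed.
from kfp.components import create_component_from_func

xgboost_predict_op = create_component_from_func(
    xgboost_predict,
    base_image='python:3.8',
    packages_to_install=['xgboost', 'numpy'],
)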
def split_data(
    x_path: InputPath(),
    y_path: InputPath(),
    train_x_path: OutputPath(str),
    test_x_path: OutputPath(str),
    train_y_path: OutputPath(str),
    test_y_path: OutputPath(str),
):
    import numpy
    from sklearn.model_selection import train_test_split

    iris_x = numpy.load(x_path)
    iris_y = numpy.load(y_path)

    train_x, test_x, train_y, test_y = train_test_split(iris_x, iris_y, test_size=0.2)
    print(f"Training data size - X: {train_x.size}, y: {train_y.size}")
    print(f"Testing data size - X: {test_x.size}, y: {test_y.size}")

    with open(train_x_path, 'wb') as f:
        numpy.save(f, train_x)
    with open(test_x_path, 'wb') as f:
        numpy.save(f, test_x)
    with open(train_y_path, 'wb') as f:
        numpy.save(f, train_y)
    with open(test_y_path, 'wb') as f:
        numpy.save(f, test_y)
def wikiqa_train(
    dataset_path: InputPath(str),
    wikiqa_path: InputPath(str),
    load_path,
    shared_path,
    run_id,
    sent_size_th,
    ques_size_th,
    num_epochs,
    num_steps,
    eval_period,
    save_period,
    device,
    device_type,
    num_gpus,
    model_path: OutputPath(str),
):
    import tensorflow as tf
    from basic.cli import main

    input_dir = wikiqa_path + "/wikiqa-class"
    output_dir = model_path + "/out/wikiqa"
    full_load_path = dataset_path + load_path
    full_shared_path = dataset_path + shared_path

    tf.app.run(
        main,
        argv=[
            "./basic/cli.py",
            "--data_dir", input_dir,
            "--out_base_dir", output_dir,
            "--load_path", full_load_path,
            "--shared_path", full_shared_path,
            "--load_trained_model",
            "--run_id", run_id,
            "--sent_size_th", sent_size_th,
            "--ques_size_th", ques_size_th,
            "--num_epochs", num_epochs,
            "--num_steps", num_steps,
            "--eval_period", eval_period,
            "--save_period", save_period,
            "--device", device,
            "--device_type", device_type,
            "--num_gpus", num_gpus,
        ],
    )
def consume_file_path(
    number: int,
    number_1a_path: str,
    number_1b_file: str,
    number_1c_file_path: str,
    number_1d_path_file: str,
    number_2a_path: InputPath(str),
    number_2b_file: InputPath(str),
    number_2c_file_path: InputPath(str),
    number_2d_path_file: InputPath(str),
    number_3a_path: InputTextFile(str),
    number_3b_file: InputTextFile(str),
    number_3c_file_path: InputTextFile(str),
    number_3d_path_file: InputTextFile(str),
    number_4a_path: InputBinaryFile(str),
    number_4b_file: InputBinaryFile(str),
    number_4c_file_path: InputBinaryFile(str),
    number_4d_path_file: InputBinaryFile(str),
    output_number_2a_path: OutputPath(str),
    output_number_2b_file: OutputPath(str),
    output_number_2c_file_path: OutputPath(str),
    output_number_2d_path_file: OutputPath(str),
    output_number_3a_path: OutputTextFile(str),
    output_number_3b_file: OutputTextFile(str),
    output_number_3c_file_path: OutputTextFile(str),
    output_number_3d_path_file: OutputTextFile(str),
    output_number_4a_path: OutputBinaryFile(str),
    output_number_4b_file: OutputBinaryFile(str),
    output_number_4c_file_path: OutputBinaryFile(str),
    output_number_4d_path_file: OutputBinaryFile(str),
):
    pass
def calculate_regression_metrics_from_csv(
    true_values_path: InputPath(),
    predicted_values_path: InputPath(),
) -> NamedTuple('Outputs', [
    ('number_of_items', int),
    ('max_absolute_error', float),
    ('mean_absolute_error', float),
    ('mean_squared_error', float),
    ('root_mean_squared_error', float),
    ('metrics', dict),
]):
    '''Calculates regression metrics.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import math
    import numpy

    true_values = numpy.loadtxt(true_values_path, dtype=numpy.float64)
    predicted_values = numpy.loadtxt(predicted_values_path, dtype=numpy.float64)

    if len(predicted_values.shape) != 1:
        raise NotImplementedError('Only single prediction values are supported.')
    if len(true_values.shape) != 1:
        raise NotImplementedError('Only single true values are supported.')

    if predicted_values.shape != true_values.shape:
        raise ValueError('Input shapes are different: {} != {}'.format(
            predicted_values.shape, true_values.shape))

    number_of_items = true_values.size
    errors = (true_values - predicted_values)
    abs_errors = numpy.abs(errors)
    squared_errors = errors**2
    max_absolute_error = numpy.max(abs_errors)
    mean_absolute_error = numpy.average(abs_errors)
    mean_squared_error = numpy.average(squared_errors)
    root_mean_squared_error = math.sqrt(mean_squared_error)

    metrics = dict(
        number_of_items=number_of_items,
        max_absolute_error=max_absolute_error,
        mean_absolute_error=mean_absolute_error,
        mean_squared_error=mean_squared_error,
        root_mean_squared_error=root_mean_squared_error,
    )

    return (
        number_of_items,
        max_absolute_error,
        mean_absolute_error,
        mean_squared_error,
        root_mean_squared_error,
        metrics,
    )
def train_knn(train_x_path: InputPath(), train_y_path: InputPath(), model_path: OutputPath()):
    import numpy
    import pickle
    from sklearn.neighbors import KNeighborsClassifier

    train_x = numpy.load(train_x_path)
    train_y = numpy.load(train_y_path)
    print(f"Training data size - X: {train_x.size}, y: {train_y.size}")

    knn = KNeighborsClassifier()
    knn.fit(train_x, train_y)

    with open(model_path, 'wb') as f:
        pickle.dump(knn, f)
def prep_data_op(anomalous_data_path: InputPath(str), non_anomalous_data_path: InputPath(str)):
    import sys, subprocess
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas'])

    import os
    import pandas as pd

    df_small_noise = pd.read_csv(non_anomalous_data_path)
    df_daily_jumpsup = pd.read_csv(anomalous_data_path)

    print(df_small_noise.head())
    print(df_daily_jumpsup.head())
def train_mnist(data_path: InputPath(), model_output: OutputPath()):
    import tensorflow as tf
    import numpy as np

    with np.load(data_path, allow_pickle=True) as f:
        x_train, y_train = f['x_train'], f['y_train']
        x_test, y_test = f['x_test'], f['y_test']

    print(x_train.shape)
    print(y_train.shape)

    model = tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(10)
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )

    model.fit(
        x_train,
        y_train,
    )

    model.evaluate(x_test, y_test)
    model.save(model_output)
def export_pickle(data_path: InputPath('Pickle')):
    '''
    Export pickle to bucket
    '''
    # Client library for Google Cloud Storage uploads.
    from google.cloud import storage

    def build_storage_client(project_id):
        '''
        Build Storage client to perform requests to GCP buckets

        Params:
            project_id: the respective project of GCP
        '''
        # step.apply(gcp.use_gcp_secret('user-gcp-sa')) in the dsl.ContainerOP()
        storage_client = storage.Client(project_id)
        return storage_client

    def load_to_bucket(storage_client, bucket_name, file_name):
        '''
        Load file to GCP bucket
        '''
        bucket = storage_client.get_bucket(bucket_name)
        blob = bucket.blob(file_name)
        with open(file_name, "rb") as f:
            blob.upload_from_file(f)

    storage_client = build_storage_client('beto-cloud')
    bucket_name = 'stroke'
    load_to_bucket(storage_client, bucket_name, data_path)
def prepro_class(dataset_path: InputPath(str), wikiqa_path: OutputPath(str)):
    import nltk
    from wikiqa.prepro_class import prepro

    nltk.download("punkt")

    def get_args():
        from types import SimpleNamespace

        source_dir = dataset_path + "/WikiQACorpus"
        target_dir = wikiqa_path + "/wikiqa-class"
        glove_dir = dataset_path + "/glove"

        args = SimpleNamespace(
            source_dir=source_dir,
            target_dir=target_dir,
            debug=False,
            glove_corpus="6B",
            glove_dir=glove_dir,
            glove_vec_size="100",
            tokenizer="PTB",
        )
        return args

    args = get_args()
    prepro(args)
def convert_CatBoostModel_to_AppleCoreMLModel(
    model_path: InputPath('CatBoostModel'),
    converted_model_path: OutputPath('AppleCoreMLModel'),
):
    '''Convert CatBoost model to Apple CoreML format.

    Args:
        model_path: Path of a trained model in binary CatBoost model format.
        converted_model_path: Output path for the converted model.

    Outputs:
        converted_model: Model in Apple CoreML format.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    from catboost import CatBoost

    model = CatBoost()
    model.load_model(model_path)
    model.save_model(
        converted_model_path,
        format="coreml",
        # export_parameters={'prediction_type': 'probability'},
        # export_parameters={'prediction_type': 'raw'},
    )
def print_text(
    text_path: InputPath()
):  # The "text" input is untyped so that any data can be printed
    '''Print text'''
    with open(text_path, 'r') as reader:
        for line in reader:
            print(line, end='')
def Pandas_Transform_DataFrame_in_ApacheParquet_format(
    table_path: InputPath('ApacheParquet'),
    transformed_table_path: OutputPath('ApacheParquet'),
    transform_code: 'PythonCode',
):
    '''Transform DataFrame loaded from an ApacheParquet file.

    Inputs:
        table: DataFrame to transform.
        transform_code: Transformation code. Code is written in Python and can consist of multiple lines.
            The DataFrame variable is called "df".
            Examples:
            - `df['prod'] = df['X'] * df['Y']`
            - `df = df[['X', 'prod']]`
            - `df.insert(0, "is_positive", df["X"] > 0)`

    Outputs:
        transformed_table: Transformed DataFrame.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import pandas

    df = pandas.read_parquet(table_path)
    # The namespace is needed so that the code can replace `df`. For example: df = df[['X']]
    namespace = locals()
    exec(transform_code, namespace)
    namespace['df'].to_parquet(transformed_table_path)
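# A minimal local sketch, not part of the original component: it shows how the
# transform_code parameter is meant to be used. The /tmp file paths and the
# "X"/"Y" columns are hypothetical; assumes pandas with parquet support
# (pyarrow) is installed and that the function above is defined, i.e. kfp's
# InputPath/OutputPath were imported when it was evaluated.
if __name__ == '__main__':
    import pandas

    pandas.DataFrame({'X': [1, 2, 3], 'Y': [4, 5, 6]}).to_parquet('/tmp/table.parquet')
    Pandas_Transform_DataFrame_in_ApacheParquet_format(
        table_path='/tmp/table.parquet',
        transformed_table_path='/tmp/transformed_table.parquet',
        transform_code="df['prod'] = df['X'] * df['Y']",
    )
    print(pandas.read_parquet('/tmp/transformed_table.parquet'))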
def split_dataset_huggingface(
    dataset_dict_path: InputPath('HuggingFaceDatasetDict'),
    dataset_split_path: OutputPath('HuggingFaceDataset'),
    dataset_path: OutputPath('HuggingFaceArrowDataset'),
    # dataset_indices_path: OutputPath('HuggingFaceArrowDataset'),
    dataset_info_path: OutputPath(dict),
    dataset_state_path: OutputPath(dict),
    split_name: str = None,
):
    import os
    import shutil
    from datasets import config as datasets_config

    print(f'DatasetDict contents: {os.listdir(dataset_dict_path)}')
    shutil.copytree(os.path.join(dataset_dict_path, split_name), dataset_split_path)
    print(
        f'Dataset contents: {os.listdir(os.path.join(dataset_dict_path, split_name))}'
    )
    shutil.copy(
        os.path.join(dataset_dict_path, split_name,
                     datasets_config.DATASET_ARROW_FILENAME), dataset_path)
    # shutil.copy(os.path.join(dataset_dict_path, split_name, datasets_config.DATASET_INDICES_FILENAME), dataset_indices_path)
    shutil.copy(
        os.path.join(dataset_dict_path, split_name,
                     datasets_config.DATASET_INFO_FILENAME), dataset_info_path)
    shutil.copy(
        os.path.join(dataset_dict_path, split_name,
                     datasets_config.DATASET_STATE_JSON_FILENAME), dataset_state_path)
def train(
    # Use InputPath to get a locally accessible path for the input artifact
    # of type `Dataset`.
    dataset_one_path: InputPath('Dataset'),
    # Use Input[T] to get a metadata-rich handle to the input artifact
    # of type `Dataset`.
    dataset_two: Input[Dataset],
    # An input parameter of type string.
    message: str,
    # Use Output[T] to get a metadata-rich handle to the output artifact
    # of type `Dataset`.
    model: Output[Model],
    # An input parameter of type int with a default value.
    num_steps: int = 100):
    '''Dummy Training step'''
    with open(dataset_one_path, 'r') as input_file:
        dataset_one_contents = input_file.read()

    with open(dataset_two.path, 'r') as input_file:
        dataset_two_contents = input_file.read()

    line = "dataset_one_contents: {} || dataset_two_contents: {} || message: {}\n".format(
        dataset_one_contents, dataset_two_contents, message)

    with open(model.path, 'w') as output_file:
        for i in range(num_steps):
            output_file.write("Step {}\n{}\n=====\n".format(i, line))

    # Use `model` to get a Model artifact, which has a .metadata dictionary
    # to store arbitrary metadata for the output artifact.
    model.metadata['accuracy'] = 0.9
def Pandas_Transform_DataFrame_in_CSV_format(
    table_path: InputPath('CSV'),
    transformed_table_path: OutputPath('CSV'),
    transform_code: 'PythonCode',
):
    '''Transform DataFrame loaded from a CSV file.

    Inputs:
        table: Table to transform.
        transform_code: Transformation code. Code is written in Python and can consist of multiple lines.
            The DataFrame variable is called "df".
            Examples:
            - `df['prod'] = df['X'] * df['Y']`
            - `df = df[['X', 'prod']]`
            - `df.insert(0, "is_positive", df["X"] > 0)`

    Outputs:
        transformed_table: Transformed table.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import pandas

    df = pandas.read_csv(table_path)
    # The namespace is needed so that the code can replace `df`. For example: df = df[['X']]
    namespace = locals()
    exec(transform_code, namespace)
    namespace['df'].to_csv(
        transformed_table_path,
        index=False,
    )
def consume_file_path(number_file_path: InputPath(int) = None) -> int:
    result = -1
    if number_file_path:
        with open(number_file_path) as f:
            string_data = f.read()
            result = int(string_data)
    return result
def catboost_predict_class_probabilities(
    data_path: InputPath('CSV'),
    model_path: InputPath('CatBoostModel'),
    predictions_path: OutputPath(),
    label_column: int = None,
):
    '''Predict class probabilities with a CatBoost model.

    Args:
        data_path: Path for the data in CSV format.
        model_path: Path for the trained model in binary CatBoostModel format.
        label_column: Column containing the label data.
        predictions_path: Output path for the predictions.

    Outputs:
        predictions: Predictions in text format.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import tempfile

    from catboost import CatBoost, Pool
    import numpy

    if label_column:
        column_descriptions = {label_column: 'Label'}
        column_description_path = tempfile.NamedTemporaryFile(delete=False).name
        with open(column_description_path, 'w') as column_description_file:
            for idx, kind in column_descriptions.items():
                column_description_file.write('{}\t{}\n'.format(idx, kind))
    else:
        column_description_path = None

    eval_data = Pool(
        data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoost()
    model.load_model(model_path)

    predictions = model.predict(eval_data, prediction_type='Probability')

    numpy.savetxt(predictions_path, predictions)
def test_build_function_with_input_output_artifacts(self):
    nb_file = get_tmp_notebook(notebook_source)
    builder = NbComponentBuilder('op1', inject_notebook_path=nb_file.name)
    builder.add_input_artifact('a_in')
    builder.add_output_artifact('a_out')
    func = builder.build_component_function()
    self.assertEqual(type(func.__annotations__['a_in']), type(InputPath()))
    self.assertEqual(type(func.__annotations__['a_out']), type(OutputPath()))
def prepro_class(
    dataset_path: InputPath(str),
    class_dir: InputPath(str),
    train_ratio,
    glove_vec_size,
    mode,
    tokenizer,
    url,
    port,
    prepro_squad_dir: OutputPath(str),
):
    import nltk
    from squad.prepro_class import prepro

    nltk.download("punkt")

    train_ratio = float(train_ratio)
    glove_vec_size = int(glove_vec_size)
    port = int(port)

    def get_args():
        from types import SimpleNamespace

        source_dir = class_dir + "/data/squad-class"
        target_dir = prepro_squad_dir + "/squad-class"
        glove_dir = dataset_path + "/data/glove"

        args = SimpleNamespace(
            source_dir=source_dir,
            target_dir=target_dir,
            debug=False,
            train_ratio=train_ratio,
            glove_corpus="6B",
            glove_dir=glove_dir,
            glove_vec_size=glove_vec_size,
            mode=mode,
            single_path="",
            tokenizer=tokenizer,
            url=url,
            port=port,
            split=False,
        )
        return args

    args = get_args()
    prepro(args)
def test_model(test_x_path: InputPath(), test_y_path: InputPath(), model_path: InputPath()):
    import numpy
    import pickle
    import random
    from sklearn.metrics import classification_report

    # Fail randomly about half of the time.
    p = random.random()
    print(p)
    if p > 0.5:
        raise Exception()

    test_x = numpy.load(test_x_path)
    test_y = numpy.load(test_y_path)
    print(f"Testing data size - X: {test_x.size}, y: {test_y.size}")

    with open(model_path, 'rb') as f:
        model = pickle.load(f)

    pred_y = model.predict(test_x)
    print(classification_report(test_y, pred_y))
def train_logistics(train_x_path: InputPath(), train_y_path: InputPath(), model_path: OutputPath()):
    import numpy
    import pickle
    import random
    from sklearn.linear_model import LogisticRegression

    # Fail randomly about half of the time.
    p = random.random()
    print(p)
    if p > 0.5:
        raise Exception()

    train_x = numpy.load(train_x_path)
    train_y = numpy.load(train_y_path)
    print(f"Training data size - X: {train_x.size}, y: {train_y.size}")

    model = LogisticRegression()
    model.fit(train_x, train_y)

    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
def convert_values_to_int(text_path: InputPath('Text'), output_path: OutputPath('Text')):
    """Reads numeric values from a text file and writes them back as integers."""
    import numpy as np

    result = np.loadtxt(text_path)
    np.savetxt(output_path, result, fmt='%d')
def read_csv(input_csv_path: InputPath("CSV")):
    import subprocess
    subprocess.run(["pip", "install", "pandas"])

    import pandas as pd

    df = pd.read_csv(input_csv_path, index_col=0)
    print(f"input_csv_path: {input_csv_path}")
    print(f"type: {type(input_csv_path)}")
    print(df.head())
def convert_to_tensorflow_saved_model_from_onnx_model(
    model_path: InputPath('OnnxModel'),
    converted_model_path: OutputPath('TensorflowSavedModel'),
):
    import onnx
    import onnx_tf

    onnx_model = onnx.load(model_path)
    tf_rep = onnx_tf.backend.prepare(onnx_model)
    tf_rep.export_graph(converted_model_path)
def train(dataset: InputPath('Dataset'), model: OutputPath('Model'), num_steps: int = 100):
    '''Dummy Training Step.'''
    with open(dataset, 'r') as input_file:
        input_string = input_file.read()

    with open(model, 'w') as output_file:
        for i in range(num_steps):
            output_file.write("Step {}\n{}\n=====\n".format(i, input_string))
def train(x_path: InputPath(str), y_path: InputPath(str), model_path: OutputPath(str)):
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    import joblib

    x = pd.read_parquet(x_path)
    y = pd.read_parquet(y_path)

    model = LogisticRegression()
    model.fit(x, y)
    joblib.dump(model, model_path)

    # TODO: output artifact of model stats
    coefs = {
        feature: round(value, 2)
        for feature, value in zip(x.columns, model.coef_.flatten())
    }
    coefs['intercept'] = round(model.intercept_[0], 2)
    return coefs
def prepare_embeddings(
    gcp_bucket: str,
    num_words: int,
    w2v_model_path: str,
    embedding_dim: int,
    json_tokenizer_path: InputPath(str),
    num_classes: int,
    output_emb_matrix_path: OutputPath(str)
) -> NamedTuple('PrepareEmbOutput', [('vocabulary_size', int)]):
    from gensim.models import Word2Vec
    from google.cloud import storage
    from tensorflow.keras.preprocessing.text import tokenizer_from_json
    import os
    import json
    import numpy as np
    from collections import namedtuple

    # Storage client for loading the w2v model
    storage_client = storage.Client()
    bucket = storage_client.bucket(gcp_bucket)

    # Load w2v model
    model = Word2Vec()
    blob_w2v = bucket.get_blob(w2v_model_path)
    destination_uri = '{}/{}'.format(".", blob_w2v.name)
    if not os.path.exists(destination_uri):
        os.mkdir("/model")
    blob_w2v.download_to_filename(destination_uri)
    w2v_model = model.wv.load(destination_uri)
    word_vectors = w2v_model.wv

    # Load JSON tokenizer
    # blob_tok = bucket.get_blob(json_tokenizer_path)
    with open(json_tokenizer_path) as f:
        json_token = json.load(f)
    tokenizer = tokenizer_from_json(json_token)

    word_index = tokenizer.word_index
    vocabulary_size = min(len(word_index) + 1, num_words)
    embedding_matrix = np.zeros((vocabulary_size, embedding_dim), dtype=np.int32)

    for word, i in word_index.items():
        if i >= num_words:
            continue
        try:
            embedding_vector = word_vectors[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), embedding_dim)

    del word_vectors

    # Save the matrix to output_emb_matrix_path
    embedding_matrix.tofile(output_emb_matrix_path)

    PrepareEmbOutput = namedtuple('PrepareEmbOutput', ['vocabulary_size'])
    return PrepareEmbOutput(vocabulary_size)
def split_text_lines(source_path: InputPath(str),
                     odd_lines_path: OutputPath(str),
                     even_lines_path: OutputPath(str)):
    with open(source_path, 'r') as reader:
        with open(odd_lines_path, 'w') as odd_writer:
            with open(even_lines_path, 'w') as even_writer:
                while True:
                    line = reader.readline()
                    if line == "":
                        break
                    odd_writer.write(line)

                    line = reader.readline()
                    if line == "":
                        break
                    even_writer.write(line)
def split_data(
    x_path: InputPath(str),
    y_path: InputPath(str),
    x_train_path: OutputPath(str),
    y_train_path: OutputPath(str),
    x_test_path: OutputPath(str),
    y_test_path: OutputPath(str),
    test_size: float = 0.2
):
    # Split into training and test sets
    import pandas as pd
    from sklearn.model_selection import train_test_split

    x = pd.read_parquet(x_path)
    y = pd.read_parquet(y_path)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=1)

    x_train.to_parquet(x_train_path)
    x_test.to_parquet(x_test_path)
    y_train.to_parquet(y_train_path)
    y_test.to_parquet(y_test_path)