def execute(self, **kwargs):
    """Download the pickled word/author datasets and load them into
    ``self.initial_dataset``.

    Reads ``self.params["authors_file_path"]`` and
    ``self.params["word_file_path"]``, downloads both files through
    MarvinData and unpickles them into
    ``{"word_data": ..., "authors": ...}``.
    """
    from marvin_python_toolbox.common.data import MarvinData
    import pickle
    try:
        import cPickle  # Python 2 C-accelerated pickle
    except ImportError:
        # Python 3: pickle is already C-accelerated; keep the old name.
        import pickle as cPickle

    # the words (features) and authors (labels), already largely preprocessed
    # this preprocessing will be repeated in the text learning mini-project
    print("Downloading files ....")
    authors_file_path = MarvinData.download_file(self.params["authors_file_path"])
    word_file_path = MarvinData.download_file(self.params["word_file_path"])

    print("Loading files ....")
    # Pickle data is binary: open in "rb". The original text-mode "r"
    # corrupts the stream on Windows and fails outright on Python 3.
    # `with` guarantees the handles are closed even if load() raises.
    with open(authors_file_path, "rb") as authors_file_handler:
        authors = pickle.load(authors_file_handler)

    with open(word_file_path, "rb") as words_file_handler:
        word_data = cPickle.load(words_file_handler)

    self.initial_dataset = {"word_data": word_data, "authors": authors}

    print("Done!")
def test_load_data_from_filesystem_exception(data_path_key, data_path):
    """An IOError raised while opening the file must propagate out of load_data."""
    relative_path = os.path.join('named_features', 'brands.json')
    with mock.patch('marvin_python_toolbox.common.data.open') as patched_open:
        patched_open.side_effect = IOError
        with pytest.raises(IOError):
            MarvinData.load_data(relative_path)
def test_download_file(mocked_requests, mocked_progressbar):
    """The local path is derived from the URL basename unless an explicit
    local_file_name is supplied."""
    url = 'google.com/file.json'
    assert MarvinData.download_file(url) == '/tmp/data/file.json'
    assert MarvinData.download_file(url, local_file_name='myfile') == '/tmp/data/myfile'
def execute(self, params, **kwargs):
    """Download the Titanic train/test CSVs and store them in
    ``self.marvin_initial_dataset`` under the keys 'train' and 'test'."""
    from marvin_python_toolbox.common.data import MarvinData
    import pandas as pd

    base_url = "https://s3.amazonaws.com/marvin-engines-data/titanic/"
    datasets = {
        split: pd.read_csv(MarvinData.download_file(base_url + split + ".csv"))
        for split in ('train', 'test')
    }

    print("{} samples to train with {} features...".format(
        datasets['train'].shape[0], datasets['train'].shape[1]))
    print("{} samples to test...".format(datasets['test'].shape[0]))

    self.marvin_initial_dataset = datasets
def test_download_file_dont_write_file_if_no_content(mocked_requests, mocked_progressbar):
    """When the HTTP response yields no content, nothing is written to the file."""
    from requests import Response

    empty_response = mock.Mock(spec=Response)
    empty_response.iter_content.return_value = ''
    mocked_requests.get.return_value = empty_response

    patched_open = mock.mock_open()
    with mock.patch('marvin_python_toolbox.common.data.open', patched_open, create=True):
        MarvinData.download_file('google.com/file.json', force=True)

    patched_open.assert_called_once_with('/tmp/data/file.json', 'wb')
    assert patched_open().write.call_count == 0
def engine_httpserver(ctx, action, params_file, initial_dataset, dataset, model, metrics, protocol, spark_conf, http_host, http_port, executor_path, max_workers, max_rpc_workers, extra_executor_parameters):
    """Start the engine gRPC server and the Java HTTP executor, then block
    until interrupted; on Ctrl-C both child processes are terminated.

    Exits with status 1 if either server fails to start, 0 on a clean
    keyboard-interrupt shutdown.
    """
    logger.info("Starting http and grpc servers ...")
    grpcserver = None
    httpserver = None

    def _params(**kwargs):
        # Build "-key value" CLI pairs, skipping keys whose value is None.
        params = []
        if kwargs is not None:
            for key, value in iteritems(kwargs):
                if value is not None:
                    params.append("-{0}".format(str(key)))
                    params.append(str(value))
        return params

    try:
        optional_args = _params(id=initial_dataset, d=dataset, m=model, me=metrics, pf=params_file, c=spark_conf)
        grpcserver = subprocess.Popen(
            ['marvin', 'engine-grpcserver', '-a', action,
             '-w', str(max_workers), '-rw', str(max_rpc_workers)] + optional_args)
        # Give the grpc server a moment to come up before the executor starts.
        time.sleep(3)
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
    # are not swallowed; still logged with the full traceback.
    except Exception:
        logger.exception("Could not start grpc server!")
        sys.exit(1)

    try:
        if not (executor_path and os.path.exists(executor_path)):
            # No usable local executor jar: fetch the configured one.
            executor_url = Config.get("executor_url", section="marvin")
            executor_path = MarvinData.download_file(executor_url, force=False)
        command_list = ['java']
        command_list.append('-DmarvinConfig.engineHome={}'.format(ctx.obj['config']['inidir']))
        command_list.append('-DmarvinConfig.ipAddress={}'.format(http_host))
        command_list.append('-DmarvinConfig.port={}'.format(http_port))
        command_list.append('-DmarvinConfig.protocol={}'.format(protocol))
        if extra_executor_parameters:
            command_list.append(extra_executor_parameters)
        command_list.append('-jar')
        command_list.append(executor_path)
        httpserver = subprocess.Popen(command_list)
    except Exception:
        logger.exception("Could not start http server!")
        if grpcserver:
            grpcserver.terminate()
        sys.exit(1)

    try:
        # Idle loop: the work happens in the child processes.
        while True:
            time.sleep(100)
    except KeyboardInterrupt:
        logger.info("Terminating http and grpc servers...")
        if grpcserver:
            grpcserver.terminate()
        if httpserver:
            httpserver.terminate()
        logger.info("Http and grpc servers terminated!")
        sys.exit(0)
def execute(self, params, **kwargs):
    """Download the spam CSV, keep label/text columns, add a 0/1 numeric
    label, and store the frame as the initial dataset."""
    csv_path = MarvinData.download_file(
        "https://s3.amazonaws.com/marvin-engines-data/spam.csv")

    dataset = pd.read_csv(csv_path, encoding='latin-1')
    dataset = (
        dataset
        .drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
        .rename(columns={"v1": "label", "v2": "text"})
    )
    # ham -> 0, spam -> 1
    dataset['label_num'] = dataset.label.map({'ham': 0, 'spam': 1})

    self.marvin_initial_dataset = dataset
def execute(self, params, **kwargs):
    """Download the produtos CSV, merge name + description into a single
    'text' column, drop incomplete rows, and store the result."""
    from marvin_python_toolbox.common.data import MarvinData
    import pandas as pd

    csv_path = MarvinData.download_file("https://s3.amazonaws.com/automl-example/produtos.csv")
    dataset = pd.read_csv(csv_path, delimiter=";", encoding='utf-8')

    dataset["text"] = dataset["nome"] + " " + dataset["descricao"]
    dataset.drop(["descricao", "nome"], axis=1, inplace=True)
    dataset.dropna(inplace=True)

    self.marvin_initial_dataset = dataset
def test_download_file_delete_file_if_exception(mocked_requests, mocked_progressbar):
    """A failed download must not leave a partial file on disk."""
    mocked_requests.get.side_effect = Exception()

    # Pre-create the target file so we can verify it is cleaned up.
    with open('/tmp/data/error.json', 'w') as f:
        f.write('test')

    # Removed the unused `excinfo`/`file_path` bindings from the original.
    with pytest.raises(Exception):
        MarvinData.download_file('google.com/error.json', force=True)

    assert os.path.exists('/tmp/data/error.json') is False
def execute(self, params, **kwargs):
    """Download the Iris CSV and store it as the initial dataset."""
    import pandas as pd
    from marvin_python_toolbox.common.data import MarvinData

    csv_path = MarvinData.download_file(
        url="https://s3.amazonaws.com/marvin-engines-data/Iris.csv")
    self.marvin_initial_dataset = pd.read_csv(csv_path)
def test_load_data_from_filesystem(data_path_key, data_path):
    """load_data falls back to the filesystem and returns the file contents."""
    expected = 'return value'

    # If the data was not found try to load from filesystem
    with mock.patch('marvin_python_toolbox.common.data.open', create=True) as patched_open:
        patched_open.return_value = mock.MagicMock(spec=IOBase)
        fake_fp = patched_open.return_value.__enter__.return_value
        fake_fp.read.return_value = expected

        content = MarvinData.load_data(os.path.join('named_features', 'brands.json'))
        fake_fp.read.assert_called_once()

    assert content == expected
def execute(self, **kwargs):
    """Ensure the dataset archive is present (download + extract if not),
    then load the train/validation sample files into
    ``self.initial_dataset`` as a (train, val) tuple.

    Expects ``self.params`` to provide 'DATA' (archive URL), 'TRAIN' and
    'VALID' (file names relative to MarvinData.data_path).
    """
    import tarfile

    data = os.path.join(MarvinData.data_path, os.path.basename(self.params['DATA']))

    if not os.path.exists(data):
        print("Downloading...")
        data = MarvinData.download_file(url=self.params["DATA"])
        print("Extracting...")
        # Use the stdlib tarfile module instead of the original
        # os.system('tar xvf {} ...'): portable, and the path is never
        # interpreted by a shell. Compression is auto-detected, like
        # `tar xvf`.
        with tarfile.open(data) as archive:
            archive.extractall(path=MarvinData.data_path)
        print("Done.")

    train = self.read_samples(os.path.join(MarvinData.data_path, self.params['TRAIN']))
    val = self.read_samples(os.path.join(MarvinData.data_path, self.params['VALID']))

    self.initial_dataset = (train, val)
def execute(self, params, **kwargs):
    """Download the Iris CSV, drop the Id column, and store the frame."""
    import pandas as pd
    # Using MarvinData utility to download file
    from marvin_python_toolbox.common.data import MarvinData

    # getting the initial data set
    iris_csv = MarvinData.download_file(url="https://s3.amazonaws.com/marvin-engines-data/Iris.csv")
    dataset = pd.read_csv(iris_csv)
    dataset.drop('Id', axis=1, inplace=True)

    print(dataset.head(2))
    self.marvin_initial_dataset = dataset
def engine_httpserver(ctx, action, params_file, initial_dataset, dataset, model, metrics, spark_conf, http_host, http_port, executor_path, max_workers, max_rpc_workers):
    """Start the engine gRPC server and the Java HTTP executor, then block
    until interrupted; on Ctrl-C both child processes are terminated.

    Exits with status 1 if either server fails to start, 0 on a clean
    keyboard-interrupt shutdown.
    """
    logger.info("Starting http and grpc servers ...")
    grpcserver = None
    httpserver = None

    try:
        grpcserver = subprocess.Popen([
            'marvin', 'engine-grpcserver',
            '-a', action,
            '-w', str(max_workers),
            '-rw', str(max_rpc_workers)
        ])
        # Give the grpc server a moment to come up before the executor starts.
        time.sleep(3)
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
    # are not swallowed; still logged with the full traceback.
    except Exception:
        logger.exception("Could not start grpc server!")
        sys.exit(1)

    try:
        if not (executor_path and os.path.exists(executor_path)):
            # No usable local executor jar: fetch the configured one.
            executor_url = Config.get("executor_url", section="marvin")
            executor_path = MarvinData.download_file(executor_url, force=False)
        httpserver = subprocess.Popen([
            'java',
            '-DmarvinConfig.engineHome={}'.format(ctx.obj['config']['inidir']),
            '-DmarvinConfig.ipAddress={}'.format(http_host),
            '-DmarvinConfig.port={}'.format(http_port),
            '-jar', executor_path
        ])
    except Exception:
        logger.exception("Could not start http server!")
        if grpcserver:
            grpcserver.terminate()
        sys.exit(1)

    try:
        # Idle loop: the work happens in the child processes.
        while True:
            time.sleep(100)
    except KeyboardInterrupt:
        logger.info("Terminating http and grpc servers...")
        if grpcserver:
            grpcserver.terminate()
        if httpserver:
            httpserver.terminate()
        logger.info("Http and grpc servers terminated!")
        sys.exit(0)
def execute(self, params, **kwargs):
    """Download the produtos CSV and build a cleaned 'text' column:
    name + description, latin characters only, lowercased, Portuguese
    stopwords removed. Stores the result in ``self.marvin_initial_dataset``.
    """
    import nltk
    import unicodedata
    import pandas as pd
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from marvin_python_toolbox.common.data import MarvinData

    nltk.download('stopwords')
    # Build the stopword set ONCE; the original rebuilt it inside
    # pre_processor for every single row.
    stops = set(stopwords.words('portuguese'))

    initial_dataset = pd.read_csv(
        MarvinData.download_file("https://s3.amazonaws.com/automl-example/produtos.csv"),
        delimiter=";", encoding='utf-8')

    def remove_nonlatin(string):
        # Keep only LATIN letters and spaces; newlines become spaces.
        new_chars = []
        for char in string:
            if char == '\n':
                new_chars.append(' ')
                continue
            try:
                if unicodedata.name(char).startswith(('LATIN', 'SPACE')):
                    new_chars.append(char)
            # unicodedata.name raises ValueError for characters with no
            # name; the original bare `except:` hid every other error too.
            except ValueError:
                continue
        return ''.join(new_chars)

    def pre_processor(text):
        # Strip non-latin chars, lowercase, drop stopwords.
        text = remove_nonlatin(text)
        words = text.lower().split()
        return ' '.join([w for w in words if w not in stops])

    initial_dataset["text"] = initial_dataset["nome"] + " " + initial_dataset["descricao"]
    initial_dataset.drop(["descricao", "nome"], axis=1, inplace=True)
    initial_dataset.dropna(inplace=True)
    initial_dataset['text'] = initial_dataset['text'].apply(pre_processor)

    self.marvin_initial_dataset = initial_dataset
def test_data_key_using_abspath(data_path_key, data_path):
    """An absolute path under data_path is reduced to its basename key."""
    absolute_path = os.path.join(data_path, 'brands.json')
    key = MarvinData._convert_path_to_key(absolute_path)
    assert key == 'brands.json'
def execute(self, **kwargs):
    """Download the CSV at ``self.params['data_url']``, drop the Id
    column, and store the frame as the initial dataset."""
    csv_path = MarvinData.download_file(url=self.params["data_url"])
    dataset = pd.read_csv(csv_path)
    dataset.drop('Id', axis=1, inplace=True)
    self.initial_dataset = dataset
def execute(self, params, **kwargs):
    """Load a CSV described by params ('url', 'separator', 'encoding')
    into ``self.marvin_initial_dataset``."""
    csv_path = MarvinData.download_file(params.get("url"))
    frame = pd.read_csv(
        csv_path,
        sep=str(params.get("separator")),
        encoding=params.get("encoding"),
        engine="python")
    self.marvin_initial_dataset = frame