def execute(self, **kwargs):
        """Download the pickled word/author datasets and store them as the initial dataset.

        Reads ``self.params["authors_file_path"]`` and ``self.params["word_file_path"]``
        as download URLs and sets ``self.initial_dataset`` to a dict with keys
        ``word_data`` and ``authors``.
        """
        from marvin_python_toolbox.common.data import MarvinData
        import pickle

        # the words (features) and authors (labels), already largely preprocessed
        # this preprocessing will be repeated in the text learning mini-project

        print("Downloading files ....")
        authors_file_path = MarvinData.download_file(
            self.params["authors_file_path"])
        word_file_path = MarvinData.download_file(
            self.params["word_file_path"])

        print("Loading files ....")
        # NOTE(review): unpickling downloaded data executes arbitrary code if the
        # URLs are not trusted -- confirm these sources are controlled.
        # Binary mode is required for pickle streams on Python 3.
        with open(authors_file_path, "rb") as authors_file_handler:
            authors = pickle.load(authors_file_handler)

        with open(word_file_path, "rb") as words_file_handler:
            word_data = pickle.load(words_file_handler)

        self.initial_dataset = {"word_data": word_data, "authors": authors}

        print("Done!")
# Example 2
def test_download_file(mocked_requests, mocked_progressbar):
    """Downloaded files land in /tmp/data, named after the URL unless overridden."""
    url = 'google.com/file.json'

    # Default: the local name is derived from the URL basename.
    assert MarvinData.download_file(url) == '/tmp/data/file.json'

    # An explicit local_file_name takes precedence over the URL basename.
    assert MarvinData.download_file(url, local_file_name='myfile') == '/tmp/data/myfile'
    def execute(self, params, **kwargs):
        """Download the Titanic train/test CSVs and keep them as the initial dataset."""
        from marvin_python_toolbox.common.data import MarvinData
        import pandas as pd

        urls = {
            'train': "https://s3.amazonaws.com/marvin-engines-data/titanic/train.csv",
            'test': "https://s3.amazonaws.com/marvin-engines-data/titanic/test.csv",
        }
        frames = {split: pd.read_csv(MarvinData.download_file(url))
                  for split, url in urls.items()}

        print ("{} samples to train with {} features...".format(frames['train'].shape[0], frames['train'].shape[1]))
        print ("{} samples to test...".format(frames['test'].shape[0]))

        self.marvin_initial_dataset = frames
# Example 4
def test_download_file_dont_write_file_if_no_content(mocked_requests, mocked_progressbar):
    """An empty response body must open the target file but never write to it."""
    from requests import Response

    empty_response = mock.Mock(spec=Response)
    empty_response.iter_content.return_value = ''
    mocked_requests.get.return_value = empty_response

    mocked_open = mock.mock_open()
    with mock.patch('marvin_python_toolbox.common.data.open', mocked_open, create=True):
        MarvinData.download_file('google.com/file.json', force=True)

    # The file is opened for binary write, but no chunk is ever written.
    mocked_open.assert_called_once_with('/tmp/data/file.json', 'wb')
    assert mocked_open().write.call_count == 0
# Example 5
def engine_httpserver(ctx, action, params_file, initial_dataset, dataset, model, metrics, protocol, spark_conf, http_host,
                      http_port, executor_path, max_workers, max_rpc_workers, extra_executor_parameters):
    """Start the engine gRPC server and the Java HTTP executor, then block until Ctrl-C.

    Exits the process with status 1 if either server fails to start, and with
    status 0 after terminating both servers on KeyboardInterrupt.
    """
    logger.info("Starting http and grpc servers ...")

    grpcserver = None
    httpserver = None

    def _params(**kwargs):
        # Flatten keyword arguments into a ["-key", "value", ...] CLI list,
        # skipping entries whose value is None. (kwargs is always a dict,
        # so no None check is needed.)
        params = []
        for key, value in iteritems(kwargs):
            if value is not None:
                params.append("-{0}".format(str(key)))
                params.append(str(value))
        return params

    try:
        optional_args = _params(id=initial_dataset, d=dataset, m=model, me=metrics, pf=params_file, c=spark_conf)
        grpcserver = subprocess.Popen(['marvin', 'engine-grpcserver', '-a', action, '-w', str(max_workers), '-rw', str(max_rpc_workers)] + optional_args)

        # Give the grpc server a moment to come up before starting the executor.
        time.sleep(3)

    except Exception:
        # Narrowed from a bare except so Ctrl-C/SystemExit are not swallowed.
        logger.exception("Could not start grpc server!")
        sys.exit(1)

    try:
        if not (executor_path and os.path.exists(executor_path)):
            executor_url = Config.get("executor_url", section="marvin")
            executor_path = MarvinData.download_file(executor_url, force=False)

        command_list = ['java']
        command_list.append('-DmarvinConfig.engineHome={}'.format(ctx.obj['config']['inidir']))
        command_list.append('-DmarvinConfig.ipAddress={}'.format(http_host))
        command_list.append('-DmarvinConfig.port={}'.format(http_port))
        command_list.append('-DmarvinConfig.protocol={}'.format(protocol))

        if extra_executor_parameters:
            # NOTE(review): appended as a single argv element -- multiple
            # space-separated flags would reach the JVM as one argument.
            command_list.append(extra_executor_parameters)

        command_list.append('-jar')
        command_list.append(executor_path)

        httpserver = subprocess.Popen(command_list)

    except Exception:
        logger.exception("Could not start http server!")
        if grpcserver:
            grpcserver.terminate()
        sys.exit(1)

    try:
        # Park the main thread; the servers run as child processes.
        while True:
            time.sleep(100)

    except KeyboardInterrupt:
        logger.info("Terminating http and grpc servers...")
        if grpcserver:
            grpcserver.terminate()
        if httpserver:
            httpserver.terminate()
        logger.info("Http and grpc servers terminated!")
        sys.exit(0)
    def execute(self, params, **kwargs):
        """Download the spam CSV and normalize it into text/label columns."""
        csv_path = MarvinData.download_file(
            "https://s3.amazonaws.com/marvin-engines-data/spam.csv")
        dataset = pd.read_csv(csv_path, encoding='latin-1')

        # Drop the empty trailing columns the raw export carries along.
        dataset = dataset.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
        # Replace the generic v1/v2 headers with meaningful names.
        dataset = dataset.rename(columns={"v1": "label", "v2": "text"})
        # Numeric target column: ham -> 0, spam -> 1.
        dataset['label_num'] = dataset.label.map({'ham': 0, 'spam': 1})

        self.marvin_initial_dataset = dataset
# Example 7
    def execute(self, params, **kwargs):
        """Download the produtos CSV and merge name+description into one text column."""
        from marvin_python_toolbox.common.data import MarvinData
        import pandas as pd

        csv_path = MarvinData.download_file("https://s3.amazonaws.com/automl-example/produtos.csv")
        dataset = pd.read_csv(csv_path, delimiter=";", encoding='utf-8')

        # Combine product name and description into a single free-text feature.
        dataset["text"] = dataset["nome"] + " " + dataset["descricao"]
        dataset.drop(["descricao", "nome"], axis=1, inplace=True)
        dataset.dropna(inplace=True)

        self.marvin_initial_dataset = dataset
    def execute(self, params, **kwargs):
        """Download the Iris CSV and load it as the initial dataset."""
        import pandas as pd
        from marvin_python_toolbox.common.data import MarvinData

        local_path = MarvinData.download_file(
            url="https://s3.amazonaws.com/marvin-engines-data/Iris.csv")

        self.marvin_initial_dataset = pd.read_csv(local_path)
# Example 9
def test_download_file_delete_file_if_exception(mocked_requests, mocked_progressbar):
    """A failed download must remove the partially written local file."""
    mocked_requests.get.side_effect = Exception()

    # Pre-create the target file so we can observe it being cleaned up.
    with open('/tmp/data/error.json', 'w') as f:
        f.write('test')

    # The download raises; we only care about the cleanup side effect,
    # so the unused file_path/excinfo bindings were dropped.
    with pytest.raises(Exception):
        MarvinData.download_file('google.com/error.json', force=True)

    assert os.path.exists('/tmp/data/error.json') is False
# Example 10
 def execute(self, **kwargs):
     """Download and extract the data archive if absent, then load train/val samples."""
     archive = os.path.join(MarvinData.data_path,
                            os.path.basename(self.params['DATA']))
     if not os.path.exists(archive):
         print("Downloading...")
         archive = MarvinData.download_file(url=self.params["DATA"])
         print("Extracting...")
         # NOTE(review): shells out to tar; assumes a POSIX tar on PATH.
         os.system('tar xvf {} --directory {}'.format(
             archive, MarvinData.data_path))
         print("Done.")

     train = self.read_samples(
         os.path.join(MarvinData.data_path, self.params['TRAIN']))
     val = self.read_samples(
         os.path.join(MarvinData.data_path, self.params['VALID']))
     self.initial_dataset = (train, val)
# Example 11
    def execute(self, params, **kwargs):
        """Download the Iris CSV, drop its Id column and keep it as the initial dataset."""
        import pandas as pd

        # Using MarvinData utility to download file
        from marvin_python_toolbox.common.data import MarvinData

        # getting the initial data set
        local_path = MarvinData.download_file(url="https://s3.amazonaws.com/marvin-engines-data/Iris.csv")
        iris_df = pd.read_csv(local_path)

        # The Id column is just a row counter, not a feature.
        iris_df.drop('Id', axis=1, inplace=True)

        # Quick sanity peek at the loaded data.
        print(iris_df.head(2))

        self.marvin_initial_dataset = iris_df
# Example 12
def engine_httpserver(ctx, action, params_file, initial_dataset, dataset,
                      model, metrics, spark_conf, http_host, http_port,
                      executor_path, max_workers, max_rpc_workers):
    """Start the engine gRPC server and the Java HTTP executor, then block until Ctrl-C.

    Exits the process with status 1 if either server fails to start, and with
    status 0 after terminating both servers on KeyboardInterrupt.
    """
    logger.info("Starting http and grpc servers ...")

    grpcserver = None
    httpserver = None

    try:
        grpcserver = subprocess.Popen([
            'marvin', 'engine-grpcserver', '-a', action, '-w',
            str(max_workers), '-rw',
            str(max_rpc_workers)
        ])
        # Give the grpc server a moment to come up before starting the executor.
        time.sleep(3)

    except Exception:
        # Narrowed from a bare except so Ctrl-C/SystemExit are not swallowed.
        logger.exception("Could not start grpc server!")
        sys.exit(1)

    try:
        if not (executor_path and os.path.exists(executor_path)):
            executor_url = Config.get("executor_url", section="marvin")
            executor_path = MarvinData.download_file(executor_url, force=False)

        httpserver = subprocess.Popen([
            'java',
            '-DmarvinConfig.engineHome={}'.format(ctx.obj['config']['inidir']),
            '-DmarvinConfig.ipAddress={}'.format(http_host),
            '-DmarvinConfig.port={}'.format(http_port), '-jar', executor_path
        ])

    except Exception:
        logger.exception("Could not start http server!")
        if grpcserver:
            grpcserver.terminate()
        sys.exit(1)

    try:
        # Park the main thread; the servers run as child processes.
        while True:
            time.sleep(100)
    except KeyboardInterrupt:
        logger.info("Terminating http and grpc servers...")
        if grpcserver:
            grpcserver.terminate()
        if httpserver:
            httpserver.terminate()
        logger.info("Http and grpc servers terminated!")
        sys.exit(0)
# Example 13
    def execute(self, params, **kwargs):
        """Download the produtos CSV and pre-process its text column.

        Builds ``text`` from name + description, strips non-Latin characters,
        lowercases, removes Portuguese stopwords, and stores the result in
        ``self.marvin_initial_dataset``.
        """
        import nltk
        import unicodedata
        import pandas as pd
        from nltk.corpus import stopwords
        from marvin_python_toolbox.common.data import MarvinData

        nltk.download('stopwords')
        # Build the stopword set ONCE; the original rebuilt it inside
        # pre_processor for every row (O(rows x stopwords)).
        stops = set(stopwords.words('portuguese'))

        initial_dataset = pd.read_csv(
            MarvinData.download_file("https://s3.amazonaws.com/automl-example/produtos.csv"),
            delimiter=";", encoding='utf-8')


        def remove_nonlatin(string):
            # Keep only LATIN letters and spaces; newlines become spaces.
            new_chars = []
            for char in string:
                if char == '\n':
                    new_chars.append(' ')
                    continue
                try:
                    if unicodedata.name(char).startswith(('LATIN', 'SPACE')):
                        new_chars.append(char)
                except ValueError:
                    # unicodedata.name raises ValueError for unnamed code points.
                    continue
            return ''.join(new_chars)


        def pre_processor(text):
            # Normalize one document: latin-only, lowercase, stopwords removed.
            text = remove_nonlatin(text)
            words = text.lower().split()
            return ' '.join(w for w in words if w not in stops)


        initial_dataset["text"] = initial_dataset["nome"] + " " + initial_dataset["descricao"]
        initial_dataset.drop(["descricao", "nome"], axis=1, inplace=True)
        initial_dataset.dropna(inplace=True)
        initial_dataset['text'] = initial_dataset['text'].apply(pre_processor)

        self.marvin_initial_dataset = initial_dataset
# Example 14
    def execute(self, **kwargs):
        """Download the CSV configured in params, drop Id, and keep it as the initial dataset."""
        local_path = MarvinData.download_file(url=self.params["data_url"])

        dataset = pd.read_csv(local_path)
        # The Id column is just a row counter, not a feature.
        dataset.drop('Id', axis=1, inplace=True)
        self.initial_dataset = dataset
# Example 15
 def execute(self, params, **kwargs):
     """Read the CSV described by params (url/separator/encoding) as the initial dataset."""
     local_path = MarvinData.download_file(params.get("url"))
     # engine="python" tolerates multi-character/regex separators.
     self.marvin_initial_dataset = pd.read_csv(local_path,
                                               sep=str(params.get("separator")),
                                               encoding=params.get("encoding"),
                                               engine="python")