def execute(self, **kwargs):
    """Download the pickled word/author datasets and load them into
    ``self.initial_dataset``.

    Reads ``self.params["authors_file_path"]`` and
    ``self.params["word_file_path"]``, downloads both files through
    MarvinData and unpickles them into
    ``{"word_data": ..., "authors": ...}``.
    """
    from marvin_python_toolbox.common.data import MarvinData
    import pickle
    try:
        import cPickle  # Python 2 C-accelerated pickle
    except ImportError:
        # Python 3: pickle is already C-accelerated; keep the old name.
        import pickle as cPickle

    # the words (features) and authors (labels), already largely preprocessed
    # this preprocessing will be repeated in the text learning mini-project
    print("Downloading files ....")
    authors_file_path = MarvinData.download_file(self.params["authors_file_path"])
    word_file_path = MarvinData.download_file(self.params["word_file_path"])

    print("Loading files ....")
    # Pickle data is binary: open in "rb". The original text-mode "r"
    # corrupts the stream on Windows and fails outright on Python 3.
    # `with` guarantees the handles are closed even if load() raises.
    with open(authors_file_path, "rb") as authors_file_handler:
        authors = pickle.load(authors_file_handler)

    with open(word_file_path, "rb") as words_file_handler:
        word_data = cPickle.load(words_file_handler)

    self.initial_dataset = {"word_data": word_data, "authors": authors}

    print("Done!")
def test_load_data_from_filesystem_exception(data_path_key, data_path):
    """An IOError raised while opening the file must propagate out of load_data."""
    relative_path = os.path.join('named_features', 'brands.json')
    with mock.patch('marvin_python_toolbox.common.data.open') as patched_open:
        patched_open.side_effect = IOError
        with pytest.raises(IOError):
            MarvinData.load_data(relative_path)
def test_download_file(mocked_requests, mocked_progressbar):
    """The local path is derived from the URL basename unless an explicit
    local_file_name is supplied."""
    url = 'google.com/file.json'
    assert MarvinData.download_file(url) == '/tmp/data/file.json'
    assert MarvinData.download_file(url, local_file_name='myfile') == '/tmp/data/myfile'
def execute(self, params, **kwargs):
    """Download the Titanic train/test CSVs and store them in
    ``self.marvin_initial_dataset`` under the keys 'train' and 'test'."""
    from marvin_python_toolbox.common.data import MarvinData
    import pandas as pd

    base_url = "https://s3.amazonaws.com/marvin-engines-data/titanic/"
    datasets = {
        split: pd.read_csv(MarvinData.download_file(base_url + split + ".csv"))
        for split in ('train', 'test')
    }

    print("{} samples to train with {} features...".format(
        datasets['train'].shape[0], datasets['train'].shape[1]))
    print("{} samples to test...".format(datasets['test'].shape[0]))

    self.marvin_initial_dataset = datasets
def test_download_file_dont_write_file_if_no_content(mocked_requests, mocked_progressbar):
    """When the HTTP response yields no content, nothing is written to the file."""
    from requests import Response

    empty_response = mock.Mock(spec=Response)
    empty_response.iter_content.return_value = ''
    mocked_requests.get.return_value = empty_response

    patched_open = mock.mock_open()
    with mock.patch('marvin_python_toolbox.common.data.open', patched_open, create=True):
        MarvinData.download_file('google.com/file.json', force=True)

    patched_open.assert_called_once_with('/tmp/data/file.json', 'wb')
    assert patched_open().write.call_count == 0
def engine_httpserver(ctx, action, params_file, initial_dataset, dataset, model, metrics, protocol, spark_conf, http_host, http_port, executor_path, max_workers, max_rpc_workers, extra_executor_parameters):
    """Start the engine gRPC server and the Java HTTP executor, then block
    until interrupted; on Ctrl-C both child processes are terminated.

    Exits with status 1 if either server fails to start, 0 on a clean
    keyboard-interrupt shutdown.
    """
    logger.info("Starting http and grpc servers ...")
    grpcserver = None
    httpserver = None

    def _params(**kwargs):
        # Build "-key value" CLI pairs, skipping keys whose value is None.
        params = []
        if kwargs is not None:
            for key, value in iteritems(kwargs):
                if value is not None:
                    params.append("-{0}".format(str(key)))
                    params.append(str(value))
        return params

    try:
        optional_args = _params(id=initial_dataset, d=dataset, m=model, me=metrics, pf=params_file, c=spark_conf)
        grpcserver = subprocess.Popen(
            ['marvin', 'engine-grpcserver', '-a', action,
             '-w', str(max_workers), '-rw', str(max_rpc_workers)] + optional_args)
        # Give the grpc server a moment to come up before the executor starts.
        time.sleep(3)
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
    # are not swallowed; still logged with the full traceback.
    except Exception:
        logger.exception("Could not start grpc server!")
        sys.exit(1)

    try:
        if not (executor_path and os.path.exists(executor_path)):
            # No usable local executor jar: fetch the configured one.
            executor_url = Config.get("executor_url", section="marvin")
            executor_path = MarvinData.download_file(executor_url, force=False)
        command_list = ['java']
        command_list.append('-DmarvinConfig.engineHome={}'.format(ctx.obj['config']['inidir']))
        command_list.append('-DmarvinConfig.ipAddress={}'.format(http_host))
        command_list.append('-DmarvinConfig.port={}'.format(http_port))
        command_list.append('-DmarvinConfig.protocol={}'.format(protocol))
        if extra_executor_parameters:
            command_list.append(extra_executor_parameters)
        command_list.append('-jar')
        command_list.append(executor_path)
        httpserver = subprocess.Popen(command_list)
    except Exception:
        logger.exception("Could not start http server!")
        if grpcserver:
            grpcserver.terminate()
        sys.exit(1)

    try:
        # Idle loop: the work happens in the child processes.
        while True:
            time.sleep(100)
    except KeyboardInterrupt:
        logger.info("Terminating http and grpc servers...")
        if grpcserver:
            grpcserver.terminate()
        if httpserver:
            httpserver.terminate()
        logger.info("Http and grpc servers terminated!")
        sys.exit(0)
def execute(self, params, **kwargs):
    """Download the spam CSV, keep label/text columns, add a 0/1 numeric
    label, and store the frame as the initial dataset."""
    csv_path = MarvinData.download_file(
        "https://s3.amazonaws.com/marvin-engines-data/spam.csv")

    dataset = pd.read_csv(csv_path, encoding='latin-1')
    dataset = (
        dataset
        .drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
        .rename(columns={"v1": "label", "v2": "text"})
    )
    # ham -> 0, spam -> 1
    dataset['label_num'] = dataset.label.map({'ham': 0, 'spam': 1})

    self.marvin_initial_dataset = dataset
def execute(self, params, **kwargs):
    """Download the produtos CSV, merge name + description into a single
    'text' column, drop incomplete rows, and store the result."""
    from marvin_python_toolbox.common.data import MarvinData
    import pandas as pd

    csv_path = MarvinData.download_file("https://s3.amazonaws.com/automl-example/produtos.csv")
    dataset = pd.read_csv(csv_path, delimiter=";", encoding='utf-8')

    dataset["text"] = dataset["nome"] + " " + dataset["descricao"]
    dataset.drop(["descricao", "nome"], axis=1, inplace=True)
    dataset.dropna(inplace=True)

    self.marvin_initial_dataset = dataset
def test_download_file_delete_file_if_exception(mocked_requests, mocked_progressbar):
    """A failed download must not leave a partial file on disk."""
    mocked_requests.get.side_effect = Exception()

    # Pre-create the target file so we can verify it is cleaned up.
    with open('/tmp/data/error.json', 'w') as f:
        f.write('test')

    # Removed the unused `excinfo`/`file_path` bindings from the original.
    with pytest.raises(Exception):
        MarvinData.download_file('google.com/error.json', force=True)

    assert os.path.exists('/tmp/data/error.json') is False
def execute(self, params, **kwargs):
    """Download the Iris CSV and store it as the initial dataset."""
    import pandas as pd
    from marvin_python_toolbox.common.data import MarvinData

    csv_path = MarvinData.download_file(
        url="https://s3.amazonaws.com/marvin-engines-data/Iris.csv")
    self.marvin_initial_dataset = pd.read_csv(csv_path)
def test_load_data_from_filesystem(data_path_key, data_path):
    """load_data falls back to the filesystem and returns the file contents."""
    expected = 'return value'

    # If the data was not found try to load from filesystem
    with mock.patch('marvin_python_toolbox.common.data.open', create=True) as patched_open:
        patched_open.return_value = mock.MagicMock(spec=IOBase)
        fake_fp = patched_open.return_value.__enter__.return_value
        fake_fp.read.return_value = expected

        content = MarvinData.load_data(os.path.join('named_features', 'brands.json'))
        fake_fp.read.assert_called_once()

    assert content == expected
def execute(self, **kwargs):
    """Ensure the dataset archive is present (download + extract if not),
    then load the train/validation sample files into
    ``self.initial_dataset`` as a (train, val) tuple.

    Expects ``self.params`` to provide 'DATA' (archive URL), 'TRAIN' and
    'VALID' (file names relative to MarvinData.data_path).
    """
    import tarfile

    data = os.path.join(MarvinData.data_path, os.path.basename(self.params['DATA']))

    if not os.path.exists(data):
        print("Downloading...")
        data = MarvinData.download_file(url=self.params["DATA"])
        print("Extracting...")
        # Use the stdlib tarfile module instead of the original
        # os.system('tar xvf {} ...'): portable, and the path is never
        # interpreted by a shell. Compression is auto-detected, like
        # `tar xvf`.
        with tarfile.open(data) as archive:
            archive.extractall(path=MarvinData.data_path)
        print("Done.")

    train = self.read_samples(os.path.join(MarvinData.data_path, self.params['TRAIN']))
    val = self.read_samples(os.path.join(MarvinData.data_path, self.params['VALID']))

    self.initial_dataset = (train, val)
def execute(self, params, **kwargs):
    """Download the Iris CSV, drop the Id column, and store the frame."""
    import pandas as pd
    # Using MarvinData utility to download file
    from marvin_python_toolbox.common.data import MarvinData

    # getting the initial data set
    iris_csv = MarvinData.download_file(url="https://s3.amazonaws.com/marvin-engines-data/Iris.csv")
    dataset = pd.read_csv(iris_csv)
    dataset.drop('Id', axis=1, inplace=True)

    print(dataset.head(2))
    self.marvin_initial_dataset = dataset
def engine_httpserver(ctx, action, params_file, initial_dataset, dataset, model, metrics, spark_conf, http_host, http_port, executor_path, max_workers, max_rpc_workers):
    """Start the engine gRPC server and the Java HTTP executor, then block
    until interrupted; on Ctrl-C both child processes are terminated.

    Exits with status 1 if either server fails to start, 0 on a clean
    keyboard-interrupt shutdown.
    """
    logger.info("Starting http and grpc servers ...")
    grpcserver = None
    httpserver = None

    try:
        grpcserver = subprocess.Popen([
            'marvin', 'engine-grpcserver',
            '-a', action,
            '-w', str(max_workers),
            '-rw', str(max_rpc_workers)
        ])
        # Give the grpc server a moment to come up before the executor starts.
        time.sleep(3)
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
    # are not swallowed; still logged with the full traceback.
    except Exception:
        logger.exception("Could not start grpc server!")
        sys.exit(1)

    try:
        if not (executor_path and os.path.exists(executor_path)):
            # No usable local executor jar: fetch the configured one.
            executor_url = Config.get("executor_url", section="marvin")
            executor_path = MarvinData.download_file(executor_url, force=False)
        httpserver = subprocess.Popen([
            'java',
            '-DmarvinConfig.engineHome={}'.format(ctx.obj['config']['inidir']),
            '-DmarvinConfig.ipAddress={}'.format(http_host),
            '-DmarvinConfig.port={}'.format(http_port),
            '-jar', executor_path
        ])
    except Exception:
        logger.exception("Could not start http server!")
        if grpcserver:
            grpcserver.terminate()
        sys.exit(1)

    try:
        # Idle loop: the work happens in the child processes.
        while True:
            time.sleep(100)
    except KeyboardInterrupt:
        logger.info("Terminating http and grpc servers...")
        if grpcserver:
            grpcserver.terminate()
        if httpserver:
            httpserver.terminate()
        logger.info("Http and grpc servers terminated!")
        sys.exit(0)
def execute(self, params, **kwargs):
    """Download the produtos CSV and build a cleaned 'text' column:
    name + description, latin characters only, lowercased, Portuguese
    stopwords removed. Stores the result in ``self.marvin_initial_dataset``.
    """
    import nltk
    import unicodedata
    import pandas as pd
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from marvin_python_toolbox.common.data import MarvinData

    nltk.download('stopwords')
    # Build the stopword set ONCE; the original rebuilt it inside
    # pre_processor for every single row.
    stops = set(stopwords.words('portuguese'))

    initial_dataset = pd.read_csv(
        MarvinData.download_file("https://s3.amazonaws.com/automl-example/produtos.csv"),
        delimiter=";", encoding='utf-8')

    def remove_nonlatin(string):
        # Keep only LATIN letters and spaces; newlines become spaces.
        new_chars = []
        for char in string:
            if char == '\n':
                new_chars.append(' ')
                continue
            try:
                if unicodedata.name(char).startswith(('LATIN', 'SPACE')):
                    new_chars.append(char)
            # unicodedata.name raises ValueError for characters with no
            # name; the original bare `except:` hid every other error too.
            except ValueError:
                continue
        return ''.join(new_chars)

    def pre_processor(text):
        # Strip non-latin chars, lowercase, drop stopwords.
        text = remove_nonlatin(text)
        words = text.lower().split()
        return ' '.join([w for w in words if w not in stops])

    initial_dataset["text"] = initial_dataset["nome"] + " " + initial_dataset["descricao"]
    initial_dataset.drop(["descricao", "nome"], axis=1, inplace=True)
    initial_dataset.dropna(inplace=True)
    initial_dataset['text'] = initial_dataset['text'].apply(pre_processor)

    self.marvin_initial_dataset = initial_dataset
def test_data_key_using_abspath(data_path_key, data_path):
    """An absolute path under data_path is reduced to its basename key."""
    absolute_path = os.path.join(data_path, 'brands.json')
    key = MarvinData._convert_path_to_key(absolute_path)
    assert key == 'brands.json'
def execute(self, **kwargs):
    """Download the CSV at ``self.params['data_url']``, drop the Id
    column, and store the frame as the initial dataset."""
    csv_path = MarvinData.download_file(url=self.params["data_url"])
    dataset = pd.read_csv(csv_path)
    dataset.drop('Id', axis=1, inplace=True)
    self.initial_dataset = dataset
def execute(self, params, **kwargs):
    """Load a CSV described by params ('url', 'separator', 'encoding')
    into ``self.marvin_initial_dataset``."""
    csv_path = MarvinData.download_file(params.get("url"))
    frame = pd.read_csv(
        csv_path,
        sep=str(params.get("separator")),
        encoding=params.get("encoding"),
        engine="python")
    self.marvin_initial_dataset = frame