def main(argv=None):
    parser = create_parser()
    args = parser.parse_args(argv)
    init_logging(args.debug)
    try:
        transformer = Transformer(args.path)
    except (TransformerSchemaException, IOError) as e:
        logging.warn('Invalid feature model: %s' % e.message)
        print_exception(e)
        return INVALID_TRANSFORMER_CONFIG

    try:
        if args.input is not None:
            # Train from a local data file; format is taken from the extension.
            file_format = os.path.splitext(args.input)[1][1:]
            with open(args.input, 'r') as train_fp:
                transformer.train(
                    streamingiterload(train_fp, source_format=file_format))
        elif args.extraction is not None:
            # Train from an import handler extraction plan.
            train_context = list_to_dict(args.train_params)
            try:
                plan = ExtractionPlan(args.extraction)
                train_handler = ImportHandler(plan, train_context)
            except ImportHandlerException as e:
                logging.warn('Invalid extraction plan: %s' % e.message)
                print_exception(e)
                return INVALID_EXTRACTION_PLAN
            logging.info('Starting training with params:')
            for key, value in train_context.items():
                logging.info('%s --> %s' % (key, value))
            transformer.train(train_handler)
        else:
class InputDatasourceTest(unittest.TestCase):
    def setUp(self):
        from cloudml.importhandler.importhandler import ExtractionPlan
        self._plan = ExtractionPlan(
            os.path.join(BASEDIR, 'extractorxml',
                         'input-datasource-handler.xml'))

    def test_json(self):
        from cloudml.importhandler.importhandler import ImportHandler
        self._extractor = ImportHandler(self._plan, {
            'contractor_info': '{ "skills":[{"skl_status":"0","ts_tests_count"\
:"0","skl_name":"microsoft-excel","skl_external_link":"http:\/\/en.wikipedia.\
org\/wiki\/Microsoft_Excel","skl_has_tests":"1","skl_pretty_name":"Microsoft\
Excel","skill_uid":"475721704063008779","skl_rank":"1","skl_description":\
"Microsoft Excel is a proprietary commercial spreadsheet application written\
and distributed by Microsoft for Microsoft Windows and Mac OS X. It features\
calculation, graphing tools, pivot tables, and a macro programming language\
called Visual Basic for Applications."},{"skl_status":"0","ts_tests_count":\
"0","skl_name":"microsoft-word","skl_external_link":"http:\/\/en.wikipedia.\
org\/wiki\/Microsoft_Word","skl_has_tests":"1","skl_pretty_name":"Microsoft\
Word","skill_uid":"475721704071397377","skl_rank":"2","skl_description":\
"Microsoft Office Word is a word processor designed by Microsoft."}]}',
        })
        row = self._extractor.next()
        self.assertEqual(row['contractor.skills'],
                         'microsoft-excel,microsoft-word')
def main(argv=None):
    parser = create_parser()
    args = parser.parse_args(argv)
    init_logging(args.debug)
    try:
        if args.user_params is not None:
            # Parse key=value pairs from the command line into a dict.
            param_list = [x.split('=', 1) for x in args.user_params]
            context = dict((key, value) for (key, value) in param_list)
        else:
            context = {}

        logging.info('User-defined parameters:')
        for key, value in context.items():
            logging.info('%s --> %s' % (key, value))
        try:
            plan = ExtractionPlan(args.path)
            extractor = ImportHandler(plan, context)
        except ImportHandlerException as e:
            logging.warn('Invalid extraction plan: {}'.format(e.message))
            print_exception(e)
            return INVALID_EXTRACTION_PLAN

        if args.output is not None:
            logging.info('Storing data to %s...' % args.output)
            # Dispatch on the requested format; fall back to JSON.
            getattr(extractor, 'store_data_{}'.format(args.format),
                    extractor.store_data_json)(args.output)
            logging.info('Total %s lines' % (extractor.count, ))
            logging.info('Ignored %s lines' % (extractor.ignored, ))
class HttpXMLPlanTest(unittest.TestCase):
    def setUp(self):
        self._plan = ExtractionPlan(
            os.path.join(BASEDIR, 'extractorxml',
                         'http-train-import-handler.xml'))

    def test_http_datasource(self):
        with HTTMock(http_mock):
            self._extractor = ImportHandler(self._plan, PARAMS)
            row = self._extractor.next()
            self.assertEqual(row['application_id'], 123456)

    def test_http_query(self):
        with HTTMock(http_mock):
            self._plan.entity.query = '/some/other/path.json'
            self._extractor = ImportHandler(self._plan, PARAMS)
            row = self._extractor.next()
            self.assertEqual(row['application_id'], 78910)
class CsvXMLPlanTest(unittest.TestCase):
    def setUp(self):
        self._plan = ExtractionPlan(
            os.path.join(BASEDIR, 'extractorxml',
                         'csv-train-import-handler.xml'))

    def test_csv_datasource(self):
        self._extractor = ImportHandler(self._plan, PARAMS)
        row = self._extractor.next()
        self.assertEqual(row['class'], 'hire')
        self.assertEqual(row['money'], 10)
class CompositeTypeTest(unittest.TestCase):
    def setUp(self):
        self._plan = ExtractionPlan(
            os.path.join(BASEDIR, 'extractorxml',
                         'composite-type-import-handler.xml'))

    @patch('cloudml.importhandler.datasources.DbDataSource._get_iter',
           return_value=db_row_iter_mock())
    def test_composite(self, mock_db):
        self._extractor = ImportHandler(self._plan, {
            'start': '2012-12-03',
            'end': '2012-12-04',
        })
        row = self._extractor.next()
        self.assertEqual(row['country_pair'], 'Australia,Philippines')
        self.assertEqual(
            row['tsexams']['English Spelling Test (U.S. Version)'], 5)
class PigXMLPlanTest(unittest.TestCase):
    PIG_DS = 'cloudml.importhandler.datasources.PigDataSource'

    def setUp(self):
        super(PigXMLPlanTest, self).setUp()
        self.pill = StreamPill(debug=True)
        self.session = boto3.session.Session()
        boto3.DEFAULT_SESSION = self.session

    @patch('subprocess.Popen')
    @patch('time.sleep', return_value=None)
    def test_pig_datasource(self, sleep_mock, sqoop_mock):
        # Amazon mock: replay recorded AWS responses via placebo.
        self.pill.attach(
            self.session,
            os.path.abspath(
                os.path.join(os.path.dirname(__file__),
                             'placebo_responses/importhandler/pigxml')))
        self.pill.playback()
        self._plan = ExtractionPlan(
            os.path.join(BASEDIR, 'extractorxml',
                         'pig-train-import-handler.xml'))

        # Sqoop import subprocess mock
        process_mock = Mock()
        attrs = {'wait.return_value': 0,
                 'stdout.readlines.return_value': []}
        process_mock.configure_mock(**attrs)
        sqoop_mock.return_value = process_mock

        with patch('psycopg2.extras.DictCursor.execute'):
            with patch('psycopg2.connect'):
                self._extractor = ImportHandler(self._plan, PARAMS)
                pig_ds = self._extractor.plan.datasources['pig']
                # Checking iterator
                row = self._extractor.next()
                self.assertEqual(row['opening_id'], 57)
def main(argv=None):
    parser = create_parser()
    args = parser.parse_args(argv)
    init_logging(args.debug)
    try:
        with open(args.path, 'r') as fp:
            trainer = load_trainer(fp)
    except (IOError, InvalidTrainerFile) as exc:
        logging.warn('Invalid trainer file: {0!s}'.format(exc))
        print_exception(exc)
        return INVALID_TRAINER

    try:
        iterator = None
        if args.input is not None:
            # Read evaluation data from file.
            eval_fp = open(args.input, 'r')
            file_format = determine_data_format(args.input)
            iterator = streamingiterload(eval_fp,
                                         source_format=file_format)
        elif args.extraction is not None:
            # Use import handler
            try:
                eval_context = list_to_dict(args.eval_params)
                plan = ExtractionPlan(args.extraction)
                eval_handler = ImportHandler(plan, eval_context)
            except ImportHandlerException as e:
                logging.warn('Invalid extraction plan: %s' % e.message)
                print_exception(e)
                return INVALID_EXTRACTION_PLAN
            logging.info('Starting evaluation with params:')
            for key, value in eval_context.items():
                logging.info('%s --> %s' % (key, value))
            iterator = eval_handler
        else:
class ImportHandlerTest(unittest.TestCase):
    def setUp(self):
        self._plan = ExtractionPlan(
            os.path.join(BASEDIR, 'extractorxml',
                         'train-import-handler.xml'))
        self._plan_for_script = ExtractionPlan(
            os.path.join(BASEDIR, 'extractorxml',
                         'train-import-handler-script-file.xml'))

    @patch('cloudml.importhandler.datasources.DbDataSource._get_iter',
           return_value=db_iter_mock())
    def test_imports(self, mock_db):
        self._extractor = ImportHandler(self._plan, PARAMS)
        row = self._extractor.next()
        self.assertTrue(mock_db.called)

        # Checking types
        self.assertEqual(row['check_float'], float(ROW["float_field"]))
        self.assertEqual(row['check_string'], ROW["float_field"])
        self.assertEqual(row['check_int'], int(ROW["int_field"]))
        self.assertEqual(row['check_boolean'], True)
        self.assertEqual(row['check_integer_with_float'], None)
        self.assertEqual(row['check_json'], ROW["json_field"])
        self.assertEqual(row['check_json_jsonpath'], "Professional and \
experienced person")

        # Checking subentries as json datasources
        self.assertEqual(row['employer.country'], 'Philippines')
        # Checking jsonpath and join
        self.assertEqual(row['autors'], 'Nigel and Evelyn')
        # Checking regex and split
        self.assertEqual(row['say_hello'], 'hello')
        self.assertEqual(row['words'], ['Words', 'words', 'words'])
        # Checking javascript func
        self.assertEqual(row['test_script'], 99)
        self.assertEqual(row['test_script_tag'], 99)
        # Checking dataFormat
        self.assertEqual(row['date'], datetime(2014, 6, 1, 13, 33))
        # Checking template
        self.assertEqual(row['template'],
                         "Greatings: hello and hi and pruvit.")
        # Checking global nested datasources
        self.assertEqual(row['application_title'], 'Application Title')
        self.assertEqual(
            mock_db.call_args_list[1][0][0],
            "SELECT title FROM applications where id==%s;"
            % ROW['application'])

    @patch('cloudml.importhandler.datasources.DbDataSource._get_iter',
           return_value=db_iter_mock())
    def test_imports_script_src(self, mock_db):
        # Checking js function calls work from <script src=""/>
        self._extractor_script = ImportHandler(self._plan_for_script,
                                               PARAMS)
        row = self._extractor_script.next()
        self.assertTrue(mock_db.called)
        self.assertEqual(row['test_script'], 99)
        self.assertEqual(row['test_script_tag'], 99)

    @patch('cloudml.importhandler.datasources.DbDataSource._get_iter',
           return_value=db_iter_mock())
    def test_store_data_json(self, mock_db):
        self._extractor = ImportHandler(self._plan, PARAMS)
        self._extractor.store_data_json("data.json.bak")
        self.assertTrue(os.path.isfile("data.json.bak"))
        with open("data.json.bak") as fp:
            json_data = fp.read()
        data = json.loads(json_data)
        self.assertEqual(data['application_id'], 555)
        os.remove("data.json.bak")

        # Compressed output
        self._extractor.store_data_json("data.gz.bak", True)
        self.assertTrue(os.path.isfile("data.gz.bak"))
        os.remove("data.gz.bak")

    @patch('cloudml.importhandler.datasources.DbDataSource._get_iter',
           return_value=db_iter_mock())
    def test_store_data_csv(self, mock_db):
        self._extractor = ImportHandler(self._plan, PARAMS)
        self._extractor.store_data_csv("data.csv.bak")
        self.assertTrue(os.path.isfile("data.csv.bak"))
        with open("data.csv.bak") as fp:
            reader = csv.reader(fp)
            rows = [row for row in reader]
        self.assertEqual(len(rows), 2)
        os.remove("data.csv.bak")

    @patch('cloudml.importhandler.datasources.DbDataSource._get_iter',
           return_value=db_iter_mock())
    def test_store_data_csv_compressed(self, mock_db):
        self._extractor = ImportHandler(self._plan, PARAMS)
        self._extractor.store_data_csv("data.gz.bak", True)
        self.assertTrue(os.path.isfile("data.gz.bak"))
        os.remove("data.gz.bak")

    def test_validate_input_params(self):
        self._extractor = ImportHandler(self._plan, PARAMS)
        with self.assertRaisesRegexp(ImportHandlerException,
                                     "Missing input parameters"):
            self._extractor.process_input_params({'end': '2013-01-30'})
        with self.assertRaisesRegexp(ImportHandlerException,
                                     "Missing input parameters"):
            self._extractor.process_input_params({})
        with self.assertRaisesRegexp(ImportHandlerException,
                                     "Missing input parameters"):
            self._extractor.process_input_params(None)
            trainer.test(
                streamingiterload(test_fp, source_format=file_format),
                test_percent)
        if args.test is not None and args.skip_tests is False:
            file_format = os.path.splitext(args.test)[1][1:]
            with open(args.test, 'r') as test_fp:
                trainer.test(
                    streamingiterload(test_fp, source_format=file_format))
    elif args.extraction is not None:
        train_context = list_to_dict(args.train_params)
        try:
            plan = ExtractionPlan(args.extraction)
            train_handler = ImportHandler(plan, train_context)
        except ImportHandlerException as e:
            logging.warn('Invalid extraction plan: %s' % e.message)
            print_exception(e)
            return INVALID_EXTRACTION_PLAN
        logging.info('Starting training with params:')
        for key, value in train_context.items():
            logging.info('%s --> %s' % (key, value))
        trainer.train(train_handler, test_percent)
        if args.skip_tests is False:
            if test_percent != 0:
                if args.test_params is None:
                    test_handler = ImportHandler(plan, train_context)