def _run_wordcount_it(self, **opts):
    """Run the wordcount example as an integration test.

    Builds a unique output location, attaches pipeline-state and file-checksum
    verifiers, and launches the pipeline with options taken from the
    --test-pipeline-options command-line argument. Keyword arguments in
    ``opts`` override the generated extra options.
    """
    test_pipeline = TestPipeline(is_integration_test=True)

    # Millisecond timestamp keeps concurrent runs from colliding on output.
    run_id = str(int(time.time() * 1000))
    output = '/'.join([test_pipeline.get_option('output'), run_id, 'results'])

    raw_sleep = test_pipeline.get_option('sleep_secs')
    sleep_secs = None if raw_sleep is None else int(raw_sleep)

    verifiers = [
        PipelineStateMatcher(),
        FileChecksumMatcher(
            output + '*-of-*', self.DEFAULT_CHECKSUM, sleep_secs),
    ]
    extra_opts = {
        'output': output,
        'on_success_matcher': all_of(*verifiers),
    }
    # Caller-supplied options take precedence over the generated ones.
    extra_opts.update(opts)

    # Register clean up before pipeline execution so output files are
    # removed even if the run fails.
    self.addCleanup(delete_files, [output + '*'])

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))
def test_basics(self):
    """End-to-end check: run wordcount on sample text and verify the counts.

    Expected counts are computed locally with the same tokenization the
    pipeline uses, then compared against the parsed shard output.
    """
    temp_path = self.create_temp_file(self.SAMPLE_TEXT)

    # Tally expected word frequencies from the sample text.
    expected = collections.defaultdict(int)
    for token in re.findall(r'[\w\']+', self.SAMPLE_TEXT, re.UNICODE):
        expected[token] += 1

    wordcount.run(
        ['--input=%s*' % temp_path, '--output=%s.result' % temp_path],
        save_main_session=False)

    # Parse result file and compare.
    parsed = []
    with open_shards(temp_path + '.result-*-of-*') as result_file:
        for line in result_file:
            found = re.search(r'(\S+): ([0-9]+)', line)
            if found is not None:
                parsed.append((found.group(1), int(found.group(2))))

    self.assertEqual(sorted(parsed), sorted(expected.items()))
def test_wordcount_it(self):
    """Integration test: verify pipeline state and output file checksum."""
    test_pipeline = TestPipeline(is_integration_test=True)

    # Output path is scoped by job name so each test job writes its own
    # results directory.
    output = '/'.join([
        test_pipeline.get_option('output'),
        test_pipeline.get_option('job_name'),
        'results',
    ])
    success_matcher = all_of(
        PipelineStateMatcher(),
        FileChecksumMatcher(output + '*-of-*', self.DEFAULT_CHECKSUM))
    extra_opts = {
        'output': output,
        'on_success_matcher': success_matcher,
    }

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))
def test_basics(self):
    """Run wordcount on a sample file and verify the aggregated counts.

    Fix: ``dict.iteritems()`` was removed in Python 3 and raises
    AttributeError there; ``dict.items()`` behaves identically for this
    use (it is passed straight to ``sorted``) on both Python 2 and 3.
    """
    temp_path = self.create_temp_file(self.SAMPLE_TEXT)

    # Compute expected word frequencies locally.
    expected_words = collections.defaultdict(int)
    for word in re.findall(r'\w+', self.SAMPLE_TEXT):
        expected_words[word] += 1

    wordcount.run([
        '--input=%s*' % temp_path,
        '--output=%s.result' % temp_path])

    # Parse result file and compare.
    results = []
    with open(temp_path + '.result-00000-of-00001') as result_file:
        for line in result_file:
            match = re.search(r'([a-z]+): ([0-9]+)', line)
            if match is not None:
                results.append((match.group(1), int(match.group(2))))

    # items() replaces the Python-2-only iteritems().
    self.assertEqual(sorted(results), sorted(expected_words.items()))
def test_basics(self):
    """Run wordcount on sample text and verify counts from shard output.

    Fix: the expected-word keys were encoded to UTF-8 ``bytes``
    (``word.encode('utf-8')``) while the results parsed from the output
    file are ``str``, so on Python 3 the final ``assertEqual`` compared
    ``bytes`` keys against ``str`` keys and could never pass. Keep the
    keys as text throughout, matching what the regex search yields.
    """
    temp_path = self.create_temp_file(self.SAMPLE_TEXT)

    # Tokenize the same way the pipeline does and count expected words,
    # keeping keys as str so they compare equal to the parsed output.
    expected_words = collections.defaultdict(int)
    for word in re.findall(r'[\w\']+', self.SAMPLE_TEXT, re.UNICODE):
        expected_words[word] += 1

    wordcount.run([
        '--input=%s*' % temp_path,
        '--output=%s.result' % temp_path])

    # Parse result file and compare.
    results = []
    with open_shards(temp_path + '.result-*-of-*') as result_file:
        for line in result_file:
            match = re.search(r'(\S+): ([0-9]+)', line, re.UNICODE)
            if match is not None:
                results.append((match.group(1), int(match.group(2))))

    self.assertEqual(sorted(results), sorted(expected_words.items()))
def test_wordcount_it(self):
    """Run the wordcount integration test and check its success matchers."""
    test_pipeline = TestPipeline(is_integration_test=True)

    # Set extra options to the pipeline for test purpose: results are
    # written under <output>/<job_name>/results.
    parts = [
        test_pipeline.get_option('output'),
        test_pipeline.get_option('job_name'),
        'results',
    ]
    output = '/'.join(parts)

    state_ok = PipelineStateMatcher()
    checksum_ok = FileChecksumMatcher(
        output + '*-of-*', self.DEFAULT_CHECKSUM)
    extra_opts = {
        'output': output,
        'on_success_matcher': all_of(state_ok, checksum_ok),
    }

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))
def test_wordcount_it(self):
    """Integration test with timestamped output and optional verifier delay."""
    test_pipeline = TestPipeline(is_integration_test=True)

    # Second-resolution timestamp keeps each run's output separate.
    stamp = str(int(time.time()))
    output = '/'.join([test_pipeline.get_option('output'), stamp, 'results'])

    # Optional delay before the checksum verifier reads the output files.
    sleep_opt = test_pipeline.get_option('sleep_secs')
    sleep_secs = int(sleep_opt) if sleep_opt is not None else None

    matcher = all_of(
        PipelineStateMatcher(),
        FileChecksumMatcher(
            output + '*-of-*', self.DEFAULT_CHECKSUM, sleep_secs))
    extra_opts = {'output': output, 'on_success_matcher': matcher}

    # Register clean up before pipeline execution.
    self.addCleanup(delete_files, [output + '*'])

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))
def run_wordcount_example():
    """Run the wordcount example pipeline with its default options.

    Thin wrapper around ``wordcount.run()``; options come from whatever
    ``run`` reads by default (presumably the command line — confirm with
    the wordcount module).
    """
    wordcount.run()