def run(self):
  db_list = [s.path for s in self.input()]
  parallel.mapreduce(
    parallel.Collection.from_sharded_list(db_list),
    mapper=MergeUpdatesMapper(),
    reducer=MergeUpdatesReducer(),
    output_prefix=self.output().path)
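# The run() methods in this file all follow the same pattern: wrap the
# inputs in a parallel.Collection, then hand parallel.mapreduce() a mapper
# and a reducer. A minimal sketch of a custom pair follows. It assumes the
# map(key, value, output) / reduce(key, values, output) signatures implied
# by IdentityMapper and IdentityReducer elsewhere in this file;
# PassThroughMapper and LastWriteWinsReducer are illustrative names only,
# not part of the pipeline.
class PassThroughMapper(parallel.Mapper):
  def map(self, key, value, output):
    # Emit each record unchanged under its original key.
    output.add(key, value)


class LastWriteWinsReducer(parallel.Reducer):
  def reduce(self, key, values, output):
    # Keep only the last value seen for each key (values may be any
    # iterable, so materialize it first).
    output.put(key, list(values)[-1])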
def run(self):
  input_dir = self.input().path
  output_dir = self.output().path
  common.shell_cmd("mkdir -p %s", dirname(output_dir))
  NEEDS_HEADERS = {"estabtypes.txt": ["establishment_type_id", "description"]}
  inputs = []
  for input_file in glob.glob(input_dir + "/*.txt"):
    if basename(input_file) in REMAPPED_FILES:
      continue
    header_key = basename(input_file)
    fieldnames = NEEDS_HEADERS.get(header_key, None)
    inputs.append(
      parallel.Collection.from_glob(
        input_file,
        parallel.CSVDictLineInput(
          delimiter="|",
          fieldnames=fieldnames,
          quoting=csv.QUOTE_NONE,
          escapechar="\\")))
  parallel.mapreduce(
    inputs=inputs,
    mapper=TXT2JSONMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def run_mr(self, prefix, input_data,
           input_format=parallel.LineInput(),
           mapper=parallel.IdentityMapper(),
           reducer=parallel.IdentityReducer(),
           output_format=parallel.LevelDBOutput(),
           num_shards=5):
  os.system('rm -rf "%s"' % prefix)
  source = self.make_files(os.path.join(prefix, 'input'), input_data, input_format)
  output_prefix = os.path.join(prefix, 'output')
  parallel.mapreduce(source,
                     mapper=mapper,
                     reducer=reducer,
                     output_format=output_format,
                     output_prefix=output_prefix,
                     num_shards=num_shards)
  if isinstance(output_format, parallel.LevelDBOutput):
    return sorted(list(parallel.ShardedDB.open(output_prefix)))
  if isinstance(output_format, parallel.JSONOutput):
    return json.load(open(output_prefix))
  if isinstance(output_format, parallel.JSONLineOutput):
    result = []
    with open(output_prefix, 'r') as input_f:
      for line in input_f:
        result.append(json.loads(line))
    return result
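# Hypothetical usage of the run_mr() helper above from a test. The exact
# shape make_files() expects for input_data is an assumption here, since
# its definition is not shown; with the default LevelDBOutput, run_mr()
# returns the sorted (key, value) pairs read back from the sharded DB.
def test_identity_roundtrip(self):
  rows = self.run_mr('/tmp/test-roundtrip', ['a', 'b', 'c'])
  assert len(rows) == 3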
def run(self):
  harmonized_file = self.input()[0].path
  parallel.mapreduce(
    parallel.Collection.from_sharded(self.input()[1].path),
    annotate.AnnotateMapper(harmonized_file),
    parallel.IdentityReducer(),
    self.output().path)
def _run(self):
  json_dir = self.input()['data'].path
  mapper = LoadJSONMapper(
    config.es_host(),
    index_name=self.index_name,
    type_name=self.type_name,
    docid_key=self.docid_key,
    incremental=self.use_checksum)
  parallel.mapreduce(
    parallel.Collection.from_sharded(json_dir),
    mapper=mapper,
    reducer=parallel.IdentityReducer(),
    output_format=parallel.NullOutput(),
    map_workers=self.load_json_workers,
    num_shards=1,
    output_prefix=config.tmp_dir('%s/load-json' % self.index_name))

  # Update the metadata index.
  elasticsearch_requests.update_process_datetime(
    config.es_client(), self.index_name, arrow.utcnow().format('YYYY-MM-DD'))

  # Optimize the index, if requested.
  if self.optimize_index:
    optimize_index(self.index_name, wait_for_merge=False)
def run(self):
  # Since we only iterate over dates in the umbrella process, we need to
  # skip batch files that do not exist.
  output_file = self.output().path
  if not os.path.exists(self.batch):
    common.shell_cmd('touch %s', output_file)
    return
  input_file = self.input()[1].path
  es = elasticsearch.Elasticsearch(self.es_host)
  index_util.start_index_transaction(es, 'druglabel', self.epoch)
  parallel.mapreduce(
    input_collection=parallel.Collection.from_sharded(input_file),
    mapper=index_util.LoadJSONMapper(self.es_host,
                                     'druglabel',
                                     'spl',
                                     self.epoch,
                                     docid_key='set_id',
                                     version_key='version'),
    reducer=parallel.NullReducer(),
    output_prefix='/tmp/loadjson.druglabel',
    num_shards=1,
    map_workers=1)
  index_util.commit_index_transaction(es, 'druglabel')
  common.shell_cmd('touch %s', output_file)
def run(self):
  parallel.mapreduce(
    parallel.Collection.from_sharded_list([batch.path for batch in self.input()]),
    mapper=SPLSetIDMapper(),
    reducer=parallel.ListReducer(),
    output_prefix=self.output().path,
    num_shards=16)
def run(self):
  files = glob.glob(self.input().path + '/*/*.txt')
  if self.loader_task == 'init':
    input_files = [f for f in files if not any(i for i in IGNORE_FILES if i in f)]
  else:
    input_files = [f for f in files if self.loader_task in f]

  # Load and cache device problem codes.
  problem_codes_reference = {}
  device_problem_codes = {}
  reader = csv.reader(open(DEVICE_PROBLEM_CODES_FILE),
                      quoting=csv.QUOTE_NONE, delimiter='|')
  for idx, line in enumerate(reader):
    if len(line) > 1:
      problem_codes_reference[line[0]] = line[1].strip()
  reader = csv.reader(open(DEVICE_PROBLEMS_FILE),
                      quoting=csv.QUOTE_NONE, delimiter='|')
  for idx, line in enumerate(reader):
    if len(line) > 1:
      if device_problem_codes.get(line[0]) is None:
        device_problem_codes[line[0]] = [line[1]]
      else:
        device_problem_codes[line[0]] = device_problem_codes[line[0]] + [line[1]]

  parallel.mapreduce(
    parallel.Collection.from_glob(
      input_files,
      parallel.CSVLineInput(quoting=csv.QUOTE_NONE, delimiter='|')),
    mapper=CSV2JSONMapper(problem_codes_reference=problem_codes_reference,
                          device_problem_codes=device_problem_codes),
    reducer=CSV2JSONJoinReducer(),
    output_prefix=self.output().path)
def run(self):
  input_glob = glob.glob(SPL_S3_DIR + '/*/barcodes/otc-bars.json')
  parallel.mapreduce(
    parallel.Collection.from_sharded(self.input()[1].path),
    mapper=UpcMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def run(self):
  parallel.mapreduce(
    glob.glob(self.input()[1].path + '*-of-*'),
    load_json_mapper,
    parallel.null_reducer,
    output_prefix=self.output().path,
    num_shards=1,
    map_workers=1)
def run(self):
  parallel.mapreduce(
    parallel.Collection.from_sharded(self.input()[1].path),
    LoadJSONMapper(),
    parallel.NullReducer(),
    output_prefix=self.output().path,
    num_shards=1,
    map_workers=1)
def run(self):
  harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict()
  db_list = [s.path for s in self.input()[1:]]
  parallel.mapreduce(
    parallel.Collection.from_sharded_list(db_list),
    mapper=MaudeAnnotationMapper(harmonized_db=harmonized_db),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def run(self):
  parallel.mapreduce(
    parallel.Collection.from_glob(
      self.input().path, parallel.CSVDictLineInput(delimiter='|')),
    mapper=RXNorm2JSONMapper(),
    reducer=RXNormReducer(),
    output_prefix=self.output().path,
    num_shards=10)
def run(self):
  parallel.mapreduce(
    parallel.Collection.from_glob(
      self.input().path, parallel.CSVDictLineInput(delimiter='\t')),
    mapper=NDC2JSONMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    num_shards=1)
def run(self):
  parallel.mapreduce(
    parallel.Collection.from_glob(
      self.input().path, parallel.JSONLineInput()),
    mapper=parallel.IdentityMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    num_shards=1)
def run(self):
  db_list = [s.path for s in self.input()]
  # Note: this variant calls mapreduce/Collection unqualified, presumably
  # imported directly from the parallel package rather than via `parallel.`.
  mapreduce(
    Collection.from_sharded_list(db_list),
    mapper=JoinMapper(),
    reducer=PivotReducer(),
    output_prefix=self.output().path,
    num_shards=10)
def run(self):
  tables = ['owner_operator', 'contact_addresses', 'official_correspondent']
  join_keys = ['contact_id']
  parallel.mapreduce(
    parallel.Collection.from_sharded(self.input().path),
    mapper=JoinMapper(tables=tables, join_keys=join_keys),
    reducer=OwnerOperatorJoinReducer(),
    output_prefix=self.output().path,
    num_shards=10)
def run(self):
  file_name = join(self.input().path, CAERS_FILE)
  parallel.mapreduce(
    parallel.Collection.from_glob(
      file_name,
      parallel.CSVDictLineInput(delimiter=',', quoting=csv.QUOTE_MINIMAL)),
    mapper=CSV2JSONMapper(),
    reducer=CSV2JSONReducer(),
    output_prefix=self.output().path,
    num_shards=10)
def run(self):
  harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict()
  parallel.mapreduce(
    parallel.Collection.from_sharded(self.input()[1].path),
    mapper=PMAAnnotateMapper(harmonized_db=harmonized_db),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    num_shards=10)
def run(self):
  common.shell_cmd('mkdir -p %s', dirname(self.output().path))
  input_files = glob.glob(self.input().path + '/*.txt')
  parallel.mapreduce(
    parallel.Collection.from_glob(
      input_files, parallel.CSVDictLineInput(delimiter='|', strip_str='\0')),
    mapper=PMAMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def run(self):
  harmonized_file = self.input()[0].path
  parallel.mapreduce(
    glob.glob(self.input()[1].path + '*-of-*'),
    annotate.AnnotateMapper(harmonized_file),
    parallel.identity_reducer,
    self.output().path,
    num_shards=10,
    map_workers=2)
def run(self):
  tables = ['listing_estabtypes', 'estabtypes']
  join_keys = ['establishment_type_id']
  parallel.mapreduce(
    parallel.Collection.from_sharded(self.input().path),
    mapper=JoinMapper(tables=tables, join_keys=join_keys),
    reducer=JoinEstablishmentTypesReducer(),
    output_prefix=self.output().path,
    num_shards=10)
def run(self):
  file_name = join(self.input().path, CAERS_FILE)
  parallel.mapreduce(
    parallel.Collection.from_glob(
      file_name,
      parallel.CSVDictLineInput(delimiter=',', quoting=csv.QUOTE_MINIMAL)),
    mapper=CSV2JSONMapper(),
    reducer=CSV2JSONReducer(),
    output_prefix=self.output().path,
    num_shards=10)
def run(self):
  csv2json_db = parallel.ShardedDB.open(self.input()[1].path).as_dict()
  parallel.mapreduce(
    parallel.Collection.from_glob(
      self.input()[0].path, parallel.CSVDictLineInput()),
    mapper=RecallDownloaderAndMapper(csv2json_db=csv2json_db),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    map_workers=10,  # Do not hit fda.gov too hard here.
    num_shards=10)
def run(self):
  common.shell_cmd('mkdir -p %s', join(BASE_DIR, 'tmp'))
  files = glob.glob(self.input().path + '/*/*.json')
  parallel.mapreduce(
    parallel.Collection.from_glob(files, parallel.JSONLineInput()),
    mapper=ParallelExportMapper(output_dir=self.output().path),
    reducer=parallel.NullReducer(),
    output_prefix=join(BASE_DIR, 'tmp'),
    output_format=parallel.NullOutput(),
    map_workers=10)
def run(self):
  input_db = self.input()[0].path
  harmonized_file = self.input()[1].path
  parallel.mapreduce(
    parallel.Collection.from_sharded(input_db),
    mapper=AnnotateMapper(harmonized_file),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    # TODO: improve the code to avoid having to limit the number of shards to one.
    num_shards=1)
def run(self):
  input_db = self.input()[0].path
  harmonized_file = self.input()[1].path
  parallel.mapreduce(
    parallel.Collection.from_sharded(input_db),
    mapper=annotate.AnnotateMapper(harmonized_file),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    num_shards=1)
def run(self):
  harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict()
  json_glob = glob.glob(self.input()[1].path + '/*.json')
  parallel.mapreduce(
    parallel.Collection.from_glob(json_glob, parallel.JSONLineInput()),
    mapper=MaudeAnnotationMapper(harmonized_db=harmonized_db),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    num_shards=10,
    map_workers=5)
def run(self):
  input_db = self.input()[0].path
  harmonized_file = self.input()[1].path
  parallel.mapreduce(
    parallel.Collection.from_sharded(input_db),
    mapper=annotate.AnnotateMapper(harmonized_file),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    num_shards=1,
    map_workers=1)
def run(self):
  # Read the entire packaging DB into memory for speed.
  package_db = parallel.ShardedDB.open(self.input()[0].path).as_dict()
  parallel.mapreduce(
    parallel.Collection.from_sharded(self.input()[1].path),
    mapper=ProductAndPackagingMergingMapper(package_db=package_db),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    num_shards=1)
def run(self):
  input_dir = dirname(self.input().path)
  for csv_filename in glob.glob('%(input_dir)s/clean-*.csv' % locals()):
    parallel.mapreduce(
      parallel.Collection.from_glob(csv_filename, parallel.CSVDictLineInput()),
      mapper=CSV2JSONMapper(),
      reducer=parallel.IdentityReducer(),
      output_prefix=self.output().path,
      num_shards=1,
      map_workers=8)
def run(self):
  input_dir = self.input().path
  for xml_filename in glob.glob('%(input_dir)s/*.xml' % locals()):
    parallel.mapreduce(
      parallel.Collection.from_glob(xml_filename, parallel.XMLDictInput(depth=1)),
      mapper=XML2JSONMapper(),
      reducer=parallel.IdentityReducer(),
      output_prefix=self.output().path,
      num_shards=1,
      map_workers=8)
def run(self):
  common.shell_cmd('mkdir -p %s', dirname(self.output().path))
  input_files = glob.glob(self.input().path + '/*.txt')
  parallel.mapreduce(
    parallel.Collection.from_glob(
      input_files,
      parallel.CSVDictLineInput(delimiter='|',
                                quoting=csv.QUOTE_NONE,
                                escapechar='\\')),
    mapper=ClassificationMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def run(self):
  with open(join(EXTRACTED_DIR, 'ApplicationsDocsType_Lookup.txt')) as fin:
    rows = (line.split('\t') for line in fin)
    doc_lookup = {row[0]: row[1].rstrip() for row in rows}
  parallel.mapreduce(
    parallel.Collection.from_glob(
      join(self.input().path, 'ApplicationDocs.txt'),
      parallel.CSVDictLineInput(delimiter='\t')),
    mapper=ApplicationsDocs2JSONMapper(doc_lookup=doc_lookup),
    reducer=parallel.ListReducer(),
    output_prefix=self.output().path)
def run(self):
  tables = ['intermediate_owner_operator', 'registration', 'us_agent']
  join_keys = ['reg_key']
  db_list = [s.path for s in self.input()]
  parallel.mapreduce(
    parallel.Collection.from_sharded_list(db_list),
    mapper=JoinMapper(tables=tables, join_keys=join_keys),
    reducer=RegistrationJoinReducer(),
    output_prefix=self.output().path,
    num_shards=10)
def run(self):
  tables = [
    'owner_operator', 'contact_addresses', 'official_correspondent'
  ]
  join_keys = ['contact_id']
  parallel.mapreduce(
    parallel.Collection.from_sharded(self.input().path),
    mapper=JoinMapper(tables=tables, join_keys=join_keys),
    reducer=OwnerOperatorJoinReducer(),
    output_prefix=self.output().path,
    num_shards=10)
def run(self):
  tables = ['intermediate_owner_operator', 'registration', 'us_agent']
  join_keys = ['reg_key']
  db_list = [s.path for s in self.input()]
  parallel.mapreduce(
    parallel.Collection.from_sharded_list(db_list),
    mapper=JoinMapper(tables=tables, join_keys=join_keys),
    reducer=RegistrationJoinReducer(),
    output_prefix=self.output().path,
    num_shards=10)
def run(self):
  input_shards = []
  input_dir = self.input().path
  for xml_filename in glob.glob(input_dir + '/*.xml'):
    input_shards.append(xml_filename)
  parallel.mapreduce(
    parallel.Collection.from_list(input_shards),
    mapper=XML2JSONMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    num_shards=len(input_shards))
def run(self):
  common.shell_cmd('mkdir -p %s', dirname(self.output().path))
  input_files = glob.glob(self.input().path + '/*.txt')
  parallel.mapreduce(
    parallel.Collection.from_glob(
      input_files,
      parallel.CSVDictLineInput(delimiter='|',
                                quoting=csv.QUOTE_NONE,
                                escapechar='\\')),
    mapper=ClassificationMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def run(self):
  input_dir = self.input().path
  for xml_filename in glob.glob('%(input_dir)s/*.xml' % locals()):
    parallel.mapreduce(
      input_collection=parallel.Collection.from_glob(xml_filename,
                                                     parallel.XMLDictInput),
      mapper=XML2JSONMapper(),
      reducer=parallel.IdentityReducer(),
      output_prefix=self.output().path,
      num_shards=1,
      map_workers=1)
def run(self):
  with open(join(EXTRACTED_DIR, 'MarketingStatus_Lookup.txt')) as fin:
    rows = (line.split('\t') for line in fin)
    doc_lookup = {row[0]: row[1] for row in rows}
  parallel.mapreduce(
    parallel.Collection.from_glob(
      join(self.input().path, 'TE.txt'),
      parallel.CSVDictLineInput(delimiter='\t')),
    mapper=TE2JSONMapper(doc_lookup=doc_lookup),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def _run(self):
  json_dir = self.input()['data'].path
  input_glob = glob.glob(json_dir + '/*.json')
  for file_name in input_glob:
    logging.info('Running file %s', file_name)
    parallel.mapreduce(
      parallel.Collection.from_glob(file_name, parallel.JSONLineInput()),
      mapper=index_util.ReloadJSONMapper(config.es_host(), self.index_name, 'maude'),
      reducer=parallel.IdentityReducer(),
      output_format=parallel.NullOutput(),
      output_prefix='/tmp/loadjson.' + self.index_name)
def run(self):
  tables = ["remapped_registration_listing", "listing_pcd", "listing_proprietary_name"]
  join_keys = ["key_val"]
  parallel.mapreduce(
    parallel.Collection.from_sharded(self.input().path),
    mapper=JoinMapper(tables=tables, join_keys=join_keys),
    reducer=ListingJoinReducer(),
    output_prefix=self.output().path,
    num_shards=10,
  )
def run(self):
  tables = [
    'remapped_registration_listing', 'listing_pcd', 'listing_proprietary_name'
  ]
  join_keys = ['key_val']
  parallel.mapreduce(
    parallel.Collection.from_sharded(self.input().path),
    mapper=JoinMapper(tables=tables, join_keys=join_keys),
    reducer=ListingJoinReducer(),
    output_prefix=self.output().path,
    num_shards=10)
def run(self):
  tables = [
    'intermediate_registration_listing',
    'intermediate_establishment_listing',
    'intermediate_registration'
  ]
  join_keys = ['reg_key']
  db_list = [s.path for s in self.input()]
  parallel.mapreduce(
    parallel.Collection.from_sharded_list(db_list),
    mapper=JoinMapper(tables=tables, join_keys=join_keys),
    reducer=JoinAllReducer(),
    output_prefix=self.output().path,
    num_shards=10)
def run(self):
  files = glob.glob(self.input().path + '/*/*.txt')
  if self.loader_task == 'init':
    input_files = [f for f in files if not any(i for i in IGNORE_FILES if i in f)]
  else:
    input_files = [f for f in files if self.loader_task in f]
  parallel.mapreduce(
    parallel.Collection.from_glob(
      input_files,
      parallel.CSVLineInput(quoting=csv.QUOTE_NONE, delimiter='|')),
    mapper=CSV2JSONMapper(),
    reducer=CSV2JSONJoinReducer(),
    output_prefix=self.output().path)
def test_identity(self):
  os.system('rm -rf /tmp/test-identity*')
  source_files = ['/tmp/test-identity-%d' % i for i in range(10)]
  for f in source_files:
    os.system('touch "%s"' % f)
  source = parallel.Collection(source_files, parallel.FilenameInput)
  parallel.mapreduce(source,
                     parallel.IdentityMapper(),
                     parallel.IdentityReducer(),
                     '/tmp/test-identity', 2)
  results = sorted(list(parallel.ShardedDB.open('/tmp/test-identity/')))
  for i in range(10):
    key, value = results[i]
    assert key == '/tmp/test-identity-%d' % i, results[i]
    assert value == ''
def run(self):
  input_shards = []
  input_dir = self.input().path
  for subdir, dirs, files in os.walk(input_dir):
    for file in files:
      if file.endswith('.xml'):
        if file not in NULLIFIED:
          input_shards.append(os.path.join(subdir, file))
        else:
          logging.info("Skipping a nullified case: " + file)
  parallel.mapreduce(
    parallel.Collection.from_list(input_shards),
    mapper=XML2JSONMapper(),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path)
def run(self):
  # AERS_SGML_2007q4.ZIP has files in sqml
  filenames = glob.glob(self.input().path + '/AERS_SGML_*/s[gq]ml/*.SGM')
  filenames.extend(glob.glob(self.input().path + '/FAERS_XML*/xml/*.xml'))
  input_shards = []
  for filename in filenames:
    if 'test' in filename.lower():
      continue
    logging.info('Adding input file to pool: %s', filename)
    input_shards.append(filename)
  parallel.mapreduce(input_shards,
                     xml_to_json.extract_safety_reports,
                     xml_to_json.merge_safety_reports,
                     self.output().path,
                     10)
def test_sum(self):
  os.system('rm -rf /tmp/test-sum*')
  source_files = ['/tmp/test-sum-%d' % i for i in range(10)]
  for filename in source_files:
    with open(filename, 'w') as f:
      print >> f, '\n'.join([str(i) for i in range(100)])
  source = parallel.Collection(source_files, parallel.LineInput)
  parallel.mapreduce(source,
                     parallel.IdentityMapper(),
                     parallel.SumReducer(),
                     '/tmp/test-sum', 5)
  results = dict(parallel.ShardedDB.open('/tmp/test-sum/'))
  for i in range(100):
    assert str(i) in results, str(i)
    value = results[str(i)]
    self.assertEqual(value, str(i * 10.0))
def run(self):
  logging.info('Pipelining...')
  # AERS_SGML_2007q4.ZIP has files in sqml
  sgml_path = '/AERS_SGML_*/s[gq]ml/*.SGM'
  xml_path = '/FAERS_XML*/[Xx][Mm][Ll]/*.xml'
  filenames = glob.glob(self.input().path + sgml_path)
  filenames.extend(glob.glob(self.input().path + xml_path))
  input_shards = []
  for filename in filenames:
    if 'test' in filename.lower():
      continue
    logging.info('Adding input file to pool: %s', filename)
    input_shards.append(filename)
  report_counts = parallel.mapreduce(
    parallel.Collection.from_list(input_shards),
    xml_to_json.ExtractSafetyReportsMapper(),
    xml_to_json.MergeSafetyReportsReducer(),
    self.output().path,
    10)
  combined_counts = collections.defaultdict(int)
  for rc in report_counts:
    for timestamp, count in rc.iteritems():
      combined_counts[timestamp] += count
  print '----REPORT COUNTS----'
  for timestamp, count in sorted(combined_counts.items()):
    print '>> ', timestamp, count
def run(self):
  # AERS_SGML_2007q4.ZIP has files in sqml
  filenames = []
  for input in self.input():
    sgml_path = '/s[gq]ml/*.SGM'
    xml_path = '/[Xx][Mm][Ll]/*.xml'
    logging.info('Checking for inputs in: %s', input.path)
    filenames.extend(glob.glob(input.path + sgml_path))
    filenames.extend(glob.glob(input.path + xml_path))
  assert len(filenames) > 0, 'No files to process for quarter? %s' % self.quarter
  input_shards = []
  for filename in filenames:
    if 'test' in filename.lower():
      continue
    logging.info('Adding input file to pool: %s', filename)
    input_shards.append(filename)
  report_counts = parallel.mapreduce(
    parallel.Collection.from_list(input_shards),
    xml_to_json.ExtractSafetyReportsMapper(),
    xml_to_json.MergeSafetyReportsReducer(),
    self.output().path,
    num_shards=16)
  combined_counts = collections.defaultdict(int)
  for rc in report_counts:
    for timestamp, count in rc.iteritems():
      combined_counts[timestamp] += count
  print '----REPORT COUNTS----'
  for timestamp, count in sorted(combined_counts.items()):
    print '>> ', timestamp, count
def run(self):
  applications_db = self.input()[0].path
  products_db = self.input()[1].path
  applications_docs_db = self.input()[2].path
  submissions_db = self.input()[3].path
  submissions_property_type_db = self.input()[4].path
  marketing_status = self.input()[5].path
  te_db = self.input()[6].path
  parallel.mapreduce(
    parallel.Collection.from_sharded(applications_db),
    mapper=MergeAllMapper(applications_db, products_db, applications_docs_db,
                          submissions_db, submissions_property_type_db,
                          marketing_status, te_db),
    reducer=parallel.IdentityReducer(),
    output_prefix=self.output().path,
    map_workers=1,
    # TODO: improve the code to avoid having to limit the number of shards to one.
    num_shards=1)
def run(self):
  ndc_spl_id_index = {}
  ndc_db = self.input()[1].path
  logging.info('Joining data from NDC DB: %s', ndc_db)
  db = parallel.ShardedDB.open(ndc_db)
  db_iter = db.range_iter(None, None)
  # We want each SPL ID that is in the NDC file so that we always use the
  # same SPL file for both ID and SET_ID based joins.
  for (key, val) in db_iter:
    ndc_spl_id_index[val['id']] = True
  parallel.mapreduce(
    parallel.Collection.from_sharded_list(
      [batch.path for batch in self.input()[0]]),
    mapper=SPLSetIDMapper(index_db=ndc_spl_id_index),
    reducer=parallel.ListReducer(),
    output_prefix=self.output().path,
    num_shards=16)
def run(self):
  es = elasticsearch.Elasticsearch(self.es_host)
  index_util.start_index_transaction(es, 'drugevent', self.epoch)
  parallel.mapreduce(
    parallel.Collection.from_sharded(self.input()[1].path),
    index_util.LoadJSONMapper(self.es_host,
                              'drugevent',
                              'safetyreport',
                              self.epoch,
                              docid_key='@case_number',
                              version_key='@version'),
    parallel.NullReducer(),
    output_prefix='/tmp/loadjson.drugevent',
    num_shards=1,
    map_workers=1)
  index_util.commit_index_transaction(es, 'drugevent')
def run(self):
  files = glob.glob(self.input().path + '/*/*.txt')
  device_problems = glob.glob(self.input().path + '/*/foidevproblem*.txt')
  patient_problems = glob.glob(self.input().path + '/*/patientproblemcode*.txt')
  if self.loader_task == 'init':
    input_files = [
      f for f in files if not any(i for i in IGNORE_FILES if i in f)
    ]
  else:
    input_files = [f for f in files
                   if self.loader_task in f] + device_problems + patient_problems

  # Load and cache device problem codes.
  device_problem_codes_ref = {}
  reader = csv.reader(open(DEVICE_PROBLEM_CODES_FILE),
                      quoting=csv.QUOTE_NONE, delimiter='|')
  for idx, line in enumerate(reader):
    if len(line) > 1:
      device_problem_codes_ref[line[0]] = line[1].strip()

  # Load and cache patient problem codes.
  patient_problem_codes_ref = {}
  reader = csv.reader(open(PATIENT_PROBLEM_CODES_FILE),
                      quoting=csv.QUOTE_NONE, delimiter='|')
  for idx, line in enumerate(reader):
    if len(line) > 1:
      patient_problem_codes_ref[line[0]] = line[1].strip()

  parallel.mapreduce(
    parallel.Collection.from_glob(
      input_files,
      parallel.CSVSplitLineInput(quoting=csv.QUOTE_NONE, delimiter='|')),
    mapper=CSV2JSONMapper(
      device_problem_codes_ref=device_problem_codes_ref,
      patient_problem_codes_ref=patient_problem_codes_ref),
    reducer=CSV2JSONJoinReducer(),
    output_prefix=self.output().path)
def run(self):
  output_file = self.output().path
  input_file = self.input()[1].path
  es = elasticsearch.Elasticsearch(self.es_host)
  index_util.start_index_transaction(es, 'recall', self.epoch)
  parallel.mapreduce(
    input_collection=parallel.Collection.from_sharded(input_file),
    mapper=index_util.LoadJSONMapper(self.es_host,
                                     'recall',
                                     'enforcementreport',
                                     self.epoch,
                                     docid_key='@id',
                                     version_key='@version'),
    reducer=parallel.NullReducer(),
    output_prefix='/tmp/loadjson.recall',
    num_shards=1,
    map_workers=1)
  index_util.commit_index_transaction(es, 'recall')
  common.shell_cmd('touch %s', output_file)