def run(self):
  # Convert each tab-delimited row of SubmissionPropertyType.txt into a JSON record.
  parallel.mapreduce(
    parallel.Collection.from_glob(
      join(self.input().path, 'SubmissionPropertyType.txt'),
      parallel.CSVDictLineInput(delimiter='\t')),
    mapper=SubmissionPropertyType2JSONMapper(),
    reducer=parallel.ListReducer(),
    output_prefix=self.output().path)
def run(self):
  # Run SPLSetIDMapper over the sharded SPL batches and write 16 output shards.
  parallel.mapreduce(
    parallel.Collection.from_sharded_list(
      [batch.path for batch in self.input()]),
    mapper=SPLSetIDMapper(),
    reducer=parallel.ListReducer(),
    output_prefix=self.output().path,
    num_shards=16)
def run(self):
  # Build a doc-type lookup from the extracted lookup table, then convert each
  # ApplicationDocs row to JSON.
  with open(join(EXTRACTED_DIR, 'ApplicationsDocsType_Lookup.txt')) as fin:
    rows = (line.split('\t') for line in fin)
    doc_lookup = {row[0]: row[1].rstrip() for row in rows}

  parallel.mapreduce(
    parallel.Collection.from_glob(
      join(self.input().path, 'ApplicationDocs.txt'),
      parallel.CSVDictLineInput(delimiter='\t')),
    mapper=ApplicationsDocs2JSONMapper(doc_lookup=doc_lookup),
    reducer=parallel.ListReducer(),
    output_prefix=self.output().path)
def run(self):
  ndc_spl_id_index = {}
  ndc_db = self.input()[1].path
  logging.info('Joining data from NDC DB: %s', ndc_db)
  db = parallel.ShardedDB.open(ndc_db)
  db_iter = db.range_iter(None, None)
  # We want each SPL ID that is in the NDC file so that we always use the
  # same SPL file for both ID and SET_ID based joins.
  for (key, val) in db_iter:
    ndc_spl_id_index[val['id']] = True

  parallel.mapreduce(
    parallel.Collection.from_sharded_list(
      [batch.path for batch in self.input()[0]]),
    mapper=SPLSetIDMapper(index_db=ndc_spl_id_index),
    reducer=parallel.ListReducer(),
    output_prefix=self.output().path,
    num_shards=16)
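
# The mapper classes referenced above (SubmissionPropertyType2JSONMapper,
# ApplicationsDocs2JSONMapper, SPLSetIDMapper) are defined elsewhere in the
# pipeline module and are not shown here. Below is a minimal, hypothetical
# sketch of the shape such a mapper typically takes, assuming parallel.Mapper
# exposes a map(self, key, value, output) hook and the output collector
# provides add(key, value); names and fields are illustrative only.
class ExampleTSV2JSONMapper(parallel.Mapper):
  '''Hypothetical mapper: turns one tab-delimited row dict into a JSON-ready record.'''

  def __init__(self, doc_lookup=None):
    # Optional side input, analogous to the doc_lookup passed to
    # ApplicationsDocs2JSONMapper above.
    self.doc_lookup = doc_lookup or {}

  def map(self, key, value, output):
    # `value` is assumed to be a dict produced by parallel.CSVDictLineInput;
    # normalize the keys and emit the transformed record under the row key.
    record = {k.lower(): v for k, v in value.items()}
    output.add(key, record)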