import json
import os
from getpass import getuser
from os.path import join as pjoin
from subprocess import check_call

# make_local_tmp is assumed to live alongside make_hdfs_tmp in eggo.util
from eggo.util import make_local_tmp, make_hdfs_tmp

# Path to the Hadoop streaming jar; site-specific (the value below is an
# illustrative default, not necessarily what eggo uses).
STREAMING_JAR = '/usr/lib/hadoop-mapreduce/hadoop-streaming.jar'


def download_dataset_with_hadoop(datapackage, hdfs_path):
    with make_local_tmp() as tmp_local_dir:
        with make_hdfs_tmp(permissions='777') as tmp_hdfs_dir:
            # NOTE: 777 used so user yarn can write to this dir

            # create input file for MR job that downloads the files and puts
            # them in HDFS
            local_resource_file = pjoin(tmp_local_dir, 'resource_file.txt')
            with open(local_resource_file, 'w') as op:
                for resource in datapackage['resources']:
                    op.write('{0}\n'.format(json.dumps(resource)))
            check_call('hadoop fs -put {0} {1}'.format(local_resource_file,
                                                       tmp_hdfs_dir),
                       shell=True)

            # construct and execute hadoop streaming command to initiate the
            # download
            cmd = ('hadoop jar {streaming_jar} '
                   '-D mapreduce.job.reduces=0 '
                   '-D mapreduce.map.speculative=false '
                   '-D mapreduce.task.timeout=12000000 '
                   '-files {mapper_script_path} '
                   '-input {resource_file} -output {dummy_output} '
                   '-mapper {mapper_script_name} '
                   '-inputformat {input_format} -outputformat {output_format} '
                   '-cmdenv STAGING_PATH={staging_path} ')
            args = {'streaming_jar': STREAMING_JAR,
                    'resource_file': pjoin(tmp_hdfs_dir, 'resource_file.txt'),
                    'dummy_output': pjoin(tmp_hdfs_dir, 'dummy_output'),
                    'mapper_script_name': 'download_mapper.py',
                    'mapper_script_path': pjoin(
                        os.path.dirname(__file__), 'resources',
                        'download_mapper.py'),
                    'input_format': (
                        'org.apache.hadoop.mapred.lib.NLineInputFormat'),
                    'output_format': (
                        'org.apache.hadoop.mapred.lib.NullOutputFormat'),
                    'staging_path': pjoin(tmp_hdfs_dir, 'staging')}
            print(cmd.format(**args))
            check_call(cmd.format(**args), shell=True)

            # move downloaded data to final path
            check_call('hadoop fs -mkdir -p {0}'.format(hdfs_path),
                       shell=True)
            check_call(
                'sudo -u hdfs hadoop fs -chown -R {1}:supergroup {0}'.format(
                    tmp_hdfs_dir, getuser()),
                shell=True)
            check_call(
                'hadoop fs -mv "{0}/*" {1}'.format(
                    pjoin(tmp_hdfs_dir, 'staging'), hdfs_path),
                shell=True)
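The mapper script shipped to the cluster with `-files` is not shown here. For context, below is a hypothetical sketch of what a `download_mapper.py` could look like, not eggo's actual script: the resource fields `url` and `name`, and the use of `curl`, are assumptions. With `NLineInputFormat`, each mapper receives one resource descriptor per stdin line, prefixed by its byte offset and a tab.

#!/usr/bin/env python
# Hypothetical mapper sketch -- NOT the actual eggo download_mapper.py.
# NLineInputFormat delivers each input line as "<byte offset>\t<line>",
# so the JSON resource descriptor is everything after the first tab.
import json
import os
import sys
from subprocess import check_call

staging_path = os.environ['STAGING_PATH']  # set via -cmdenv in the driver
check_call('hadoop fs -mkdir -p {0}'.format(staging_path), shell=True)

for line in sys.stdin:
    resource = json.loads(line.split('\t', 1)[1])
    url = resource['url']    # assumed resource field
    name = resource['name']  # assumed resource field
    local_file = '/tmp/{0}'.format(name)
    # pull the file down locally, then push it into the HDFS staging dir
    check_call('curl -L -o {0} {1}'.format(local_file, url), shell=True)
    check_call('hadoop fs -put {0} {1}/{2}'.format(local_file, staging_path,
                                                   name), shell=True)
    os.remove(local_file)

Because speculative execution is disabled and the reduce count is zero, each resource is downloaded exactly once, in parallel across the cluster, straight into the staging directory.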
import json
import os.path as osp
from os.path import join as pjoin

from eggo.datasets.operations import (
    download_dataset_with_hadoop, vcf_to_adam_variants, locus_partition,
    distcp)
from eggo.util import make_hdfs_tmp

hdfs_uri = 'hdfs:///user/ec2-user'
s3a_uri = 's3a://bdg-eggo'
raw_data_path = 'dbsnp_raw'
adam_nested_path = 'dbsnp_adam'
adam_flat_path = 'dbsnp_adam_flat'

with open(pjoin(osp.dirname(__file__), 'datapackage.json')) as ip:
    datapackage = json.load(ip)

download_dataset_with_hadoop(datapackage, pjoin(hdfs_uri, raw_data_path))

with make_hdfs_tmp('tmp_dbsnp') as tmp_hdfs_path:
    tmp_adam_variant_path = pjoin(tmp_hdfs_path, 'tmp_adam_variants')
    vcf_to_adam_variants(pjoin(hdfs_uri, raw_data_path),
                         tmp_adam_variant_path)
    locus_partition(tmp_adam_variant_path, pjoin(hdfs_uri, adam_nested_path))

distcp(pjoin(hdfs_uri, adam_nested_path), pjoin(s3a_uri, adam_nested_path))
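Both snippets lean on `make_hdfs_tmp` from `eggo.util`, whose implementation isn't shown. A minimal sketch of a context manager consistent with the two call sites above (a positional name prefix and a `permissions` keyword) might look like the following; the directory naming scheme and cleanup behavior are assumptions:

# Hypothetical sketch of make_hdfs_tmp; the real eggo.util version may differ.
from contextlib import contextmanager
from subprocess import check_call
from uuid import uuid4


@contextmanager
def make_hdfs_tmp(prefix='tmp_eggo', permissions='755'):
    # create a uniquely named scratch directory in HDFS
    tmp_dir = '/tmp/{0}_{1}'.format(prefix, uuid4().hex)
    check_call('hadoop fs -mkdir -p {0}'.format(tmp_dir), shell=True)
    check_call('hadoop fs -chmod {0} {1}'.format(permissions, tmp_dir),
               shell=True)
    try:
        yield tmp_dir
    finally:
        # best-effort cleanup of the scratch directory on exit
        check_call('hadoop fs -rm -r -skipTrash {0}'.format(tmp_dir),
                   shell=True)

Wrapping the intermediate ADAM variant output in this context manager means the temporary HDFS data is removed automatically once partitioning has written the final nested layout, leaving only `dbsnp_adam` to be copied to S3 with distcp.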