Example #1
import json
import os
from getpass import getuser
from os.path import join as pjoin
from subprocess import check_call

# NOTE: make_local_tmp, make_hdfs_tmp, and STREAMING_JAR are assumed to come
# from the surrounding eggo package (Example #2 imports make_hdfs_tmp from
# eggo.util); they are not standard-library names.

def download_dataset_with_hadoop(datapackage, hdfs_path):
    with make_local_tmp() as tmp_local_dir:
        with make_hdfs_tmp(permissions='777') as tmp_hdfs_dir:
            # NOTE: 777 used so user yarn can write to this dir
            # create input file for MR job that downloads the files and puts
            # them in HDFS
            local_resource_file = pjoin(tmp_local_dir, 'resource_file.txt')
            with open(local_resource_file, 'w') as op:
                for resource in datapackage['resources']:
                    op.write('{0}\n'.format(json.dumps(resource)))
            check_call('hadoop fs -put {0} {1}'.format(local_resource_file,
                                                       tmp_hdfs_dir),
                       shell=True)

            # construct and execute the hadoop streaming command that runs the download
            cmd = ('hadoop jar {streaming_jar} '
                   '-D mapreduce.job.reduces=0 '
                   '-D mapreduce.map.speculative=false '
                   '-D mapreduce.task.timeout=12000000 '
                   '-files {mapper_script_path} '
                   '-input {resource_file} -output {dummy_output} '
                   '-mapper {mapper_script_name} '
                   '-inputformat {input_format} -outputformat {output_format} '
                   '-cmdenv STAGING_PATH={staging_path} ')
            args = {'streaming_jar': STREAMING_JAR,
                    'resource_file': pjoin(tmp_hdfs_dir, 'resource_file.txt'),
                    'dummy_output': pjoin(tmp_hdfs_dir, 'dummy_output'),
                    'mapper_script_name': 'download_mapper.py',
                    'mapper_script_path': pjoin(
                        os.path.dirname(__file__), 'resources',
                        'download_mapper.py'),
                    'input_format': (
                        'org.apache.hadoop.mapred.lib.NLineInputFormat'),
                    'output_format': (
                        'org.apache.hadoop.mapred.lib.NullOutputFormat'),
                    'staging_path': pjoin(tmp_hdfs_dir, 'staging')}
            print(cmd.format(**args))
            check_call(cmd.format(**args), shell=True)

            # move the downloaded data to its final path
            check_call('hadoop fs -mkdir -p {0}'.format(hdfs_path), shell=True)
            check_call(
                'sudo -u hdfs hadoop fs -chown -R {1}:supergroup {0}'
                .format(tmp_hdfs_dir, getuser()), shell=True)
            check_call(
                'hadoop fs -mv "{0}/*" {1}'.format(
                    pjoin(tmp_hdfs_dir, 'staging'), hdfs_path), shell=True)
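
The driver above ships a download_mapper.py to the cluster, but that script is not part of this snippet. The sketch below is a hypothetical version only, assuming Hadoop Streaming hands the mapper one "<key>\t<resource JSON>" line at a time under NLineInputFormat and that each resource dict carries 'name' and 'url' fields (both field names are guesses based on the driver code); it is not the project's actual mapper.

#!/usr/bin/env python
# Hypothetical sketch only -- not the actual download_mapper.py shipped with eggo.
import json
import os
import sys
from subprocess import check_call

# STAGING_PATH is injected by the driver via -cmdenv
staging_path = os.environ['STAGING_PATH']

for line in sys.stdin:
    if not line.strip():
        continue
    # drop the NLineInputFormat key (if present) and parse the resource JSON
    resource = json.loads(line.split('\t', 1)[-1])
    local_file = resource['name']  # assumed field name
    # fetch the file into the task's working directory, then stage it in HDFS
    check_call('curl -sSL -o {0} {1}'.format(local_file, resource['url']),
               shell=True)
    check_call('hadoop fs -put {0} {1}/'.format(local_file, staging_path),
               shell=True)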
Example #2
import json
import os.path as osp
from os.path import join as pjoin

from eggo.datasets.operations import (
    download_dataset_with_hadoop, vcf_to_adam_variants, locus_partition,
    distcp)
from eggo.util import make_hdfs_tmp


hdfs_uri = 'hdfs:///user/ec2-user'
s3a_uri = 's3a://bdg-eggo'


raw_data_path = 'dbsnp_raw'
adam_nested_path = 'dbsnp_adam'
adam_flat_path = 'dbsnp_adam_flat'


with open(pjoin(osp.dirname(__file__), 'datapackage.json')) as ip:
    datapackage = json.load(ip)


download_dataset_with_hadoop(datapackage, pjoin(hdfs_uri, raw_data_path))

with make_hdfs_tmp('tmp_dbsnp') as tmp_hdfs_path:
    tmp_adam_variant_path = pjoin(tmp_hdfs_path, 'tmp_adam_variants')
    vcf_to_adam_variants(pjoin(hdfs_uri, raw_data_path),
                         tmp_adam_variant_path)
    locus_partition(tmp_adam_variant_path, pjoin(hdfs_uri, adam_nested_path))
    distcp(pjoin(hdfs_uri, adam_nested_path), pjoin(s3a_uri, adam_nested_path))
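
Both examples read a datapackage.json whose contents are not shown here. The sketch below illustrates the minimal structure the code appears to expect (a dict with a list of JSON-serializable resource dicts); the field names and URL are illustrative assumptions, not the real dbSNP data package.

# Hypothetical minimal datapackage dict (field names and URL are assumptions)
datapackage = {
    'name': 'dbsnp',
    'resources': [
        {'name': 'dbsnp.vcf.gz',
         'url': 'ftp://ftp.example.org/pub/dbsnp/dbsnp.vcf.gz'},
    ],
}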