Example #1
def make_indexed_reference(job_inputs):
    logging.info("Indexing reference genome")

    run_shell("dx-contigset-to-fasta %s reference.fasta" % job_inputs['reference']['$dnanexus_link'])
    ref_details = dxpy.DXRecord(job_inputs['reference']['$dnanexus_link']).get_details()
    ref_name = dxpy.DXRecord(job_inputs['reference']['$dnanexus_link']).describe()['name']

    # TODO: test if the genomes near the boundary work OK
    if sum(ref_details['contigs']['sizes']) < 2*1024*1024*1024:
        subprocess.check_call("bwa index -a is reference.fasta", shell=True)
    else:
        subprocess.check_call("bwa index -a bwtsw reference.fasta", shell=True)

    subprocess.check_call("XZ_OPT=-0 tar -cJf reference.tar.xz reference.fasta*", shell=True)
    indexed_ref_dxfile = dxpy.upload_local_file("reference.tar.xz", hidden=True, wait_on_close=True)

    indexed_ref_record = dxpy.new_dxrecord(name=ref_name + " (indexed for BWA)",
                                           types=["BwaLetterContigSetV3"],
                                           details={'index_archive': dxpy.dxlink(indexed_ref_dxfile.get_id()),
                                                    'original_contigset': job_inputs['reference']})
    indexed_ref_record.close()

    # TODO: dxpy project workspace convenience functions
# FIXME
#    if "projectWorkspace" in job:
#        indexed_ref_record.clone(job["projectWorkspace"])

    return indexed_ref_record
Example #2
def make_indexed_reference(ref_ID):

    run_shell("dx-contigset-to-fasta %s reference.fasta" % ref_ID)
    ref_details = dxpy.DXRecord(ref_ID).get_details()
    ref_name = dxpy.DXRecord(ref_ID).describe()['name']

    # call bowtie2-build
    run_shell("bowtie2-build reference.fasta indexed_ref")
    # package it into an archive for uploading
    run_shell("XZ_OPT=-0 tar -cJf reference.tar.xz indexed_ref*")

    indexed_ref_dxfile = dxpy.upload_local_file("reference.tar.xz",
                                                hidden=True,
                                                wait_on_close=True)

    indexed_ref_record = dxpy.new_dxrecord(
        name=ref_name + " (indexed for Bowtie2)",
        types=["BowtieLetterContigSetV2"],
        details={
            'index_archive': dxpy.dxlink(indexed_ref_dxfile.get_id()),
            'original_contigset': dxpy.dxlink(ref_ID)
        })
    indexed_ref_record.close()
    '''
    # TODO: dxpy project workspace convenience functions
    if "projectWorkspace" in job:
        indexed_ref_record.clone(job["projectWorkspace"])
    '''

    return indexed_ref_record.get_id()
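
Both helpers above follow the same shape: convert the ContigSet record to FASTA, build the index, archive it, and wrap the archive in a new typed record. A minimal sketch of how the Bowtie2 variant might be driven from an applet entry point, assuming job_inputs carries a '$dnanexus_link' to a ContigSet record (the input field name and the surrounding main() are illustrative, not part of the examples above):

def main(**job_inputs):
    # job_inputs['reference'] is assumed to be a dxlink dict pointing at a ContigSet record
    ref_id = job_inputs['reference']['$dnanexus_link']
    indexed_ref_id = make_indexed_reference(ref_id)
    # hand the new record back as a job output, wrapped with dxpy.dxlink
    return {'indexed_reference': dxpy.dxlink(indexed_ref_id)}
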
Example #3
    def __init__(self, record_link, fastqs=None):

        self.record_link = record_link.strip()
        link_elements = self.record_link.split(':')
        record_project = link_elements[0]
        record_dxid = link_elements[1]
        self.record = dxpy.DXRecord(dxid=record_dxid, project=record_project)

        # Get record details
        self.details = self.record.get_details()
        self.project_id = self.details['laneProject']
        self.mapping_reference = self.details['mappingReference']
        self.lane_index = int(self.details['lane'])
        self.run_name = self.details['run']
        self.run_date = self.run_name.split('_')[0]
        self.library_id = self.details['library_id']
        self.lane_id = self.details['lane_id']

        # Parse library name ("DL_set2_rep1 rcvd 1/4/16")
        library_label = self.details['library']
        elements = library_label.split('rcvd')
        library_name = elements[0].rstrip()
        self.library_name = re.sub(r"[^a-zA-Z0-9]+", "-", library_name)

        # Get record properties
        self.properties = self.record.get_properties()
        self.mapper = self.properties['mapper']
        self.reference_genome_dxid = self.properties['reference_genome_dxid']
        self.reference_index_dxid = self.properties['reference_index_dxid']
        self.flowcell_id = self.properties['flowcell_id']

        self.fastq_dxids = fastqs
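
A minimal sketch of constructing this object, assuming the enclosing class is named FlowcellLane (the class name, the project/record IDs, and the FASTQ file ID are placeholders):

lane = FlowcellLane('project-B0000000000000000000000x:record-F0000000000000000000000x',
                    fastqs=['file-F0000000000000000000001x'])
print(lane.run_name, lane.lane_index, lane.library_name)
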
Example #4
    def __init__(self, record_link):

        self.record_link = record_link.strip()
        link_elements = self.record_link.split(':')
        self.record_project = link_elements[0]
        self.record_dxid = link_elements[1]
        self.record = dxpy.DXRecord(dxid=self.record_dxid,
                                    project=self.record_project)

        # Get relevant dashboard details
        self.details = self.record.get_details()
        self.run_name = self.details['run']
        self.lane_index = self.details['lane']
        self.library_name = self.details['library']
        self.project_id = self.details['laneProject']

        # Get relevant dashboard properties
        self.properties = self.record.get_properties()
        self.flowcell_id = self.properties['flowcell_id']
        self.lab = self.properties['lab']
        self.operator = 'None'  # Still need to grab this info

        # Get mapping info for mapped lanes
        self.mapper = self.properties['mapper']
        if self.mapper == 'None':
            self.mapper = None
            self.ref_genome_dxid = None
            self.reference_genome = None
        else:
            self.ref_genome_dxid = self.properties['reference_genome_dxid']
            self.reference_genome = self.details['mappingReference']
Example #5
def main(contig_set):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    contig_set = dxpy.DXRecord(contig_set)

    # The following line extracts the name from the file object so that
    # outputs can be named intelligently. It is not automatically generated by
    # the app wizard.

    name = contig_set.describe()['name'].replace(".fa", "")

    # Fill in your application code here.

    subprocess.check_call("dx-contigset-to-fasta %s %s.fa" % (contig_set.get_id(), name), shell=True)
    subprocess.check_call("gzip %s.fa" % name, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    fasta_gz = dxpy.upload_local_file("%s.fa.gz" % name)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["fasta_gz"] = dxpy.dxlink(fasta_gz)

    return output
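
A minimal sketch of driving this entry point against a ContigSet record found by name, assuming placeholder project ID and record name:

contigset = dxpy.find_one_data_object(classname='record',
                                      name='hg19',             # placeholder record name
                                      project='project-xxxx',  # placeholder project ID
                                      zero_ok=False)
result = main(contig_set=dxpy.dxlink(contigset['id']))
print(result['fasta_gz'])
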
Example #6
    def __init__(self, record_link):

        self.record_link = record_link.strip()
        link_elements = self.record_link.split(':')
        record_project = link_elements[0]
        record_dxid = link_elements[1]
        self.record = dxpy.DXRecord(dxid=record_dxid, project=record_project)

        self.properties = self.record.get_properties()
        self.details = self.record.get_details()

        # Details (Used for Dashboard information)
        self.lane_project_id = self.details['laneProject']
        self.run_name = self.details['run']
        self.run_date = self.run_name.split('_')[0]
        self.lane_index = int(self.details['lane'])
        self.library_id = self.details['library_id']
        self.lane_id = self.details['lane_id']

        # Parse library name ("DL_set2_rep1 rcvd 1/4/16")
        library_label = self.details['library']
        elements = library_label.split('rcvd')
        library_name = elements[0].rstrip()
        self.library_name = re.sub(r"[^a-zA-Z0-9]+", "-", library_name)

        # Properties
        self.lims_url = self.properties['lims_url']
        self.lims_token = self.properties['lims_token']
        self.rta_version = self.properties['rta_version']
        self.seq_instrument = self.properties['seq_instrument']
        self.flowcell_id = self.properties['flowcell_id']

        self.lane_project = dxpy.DXProject(dxid=self.lane_project_id)
        self.home = os.getcwd()

        self.sample_sheet = None
        self.output_dir = None
        self.bcl2fastq_version = None
        self.lane_barcode = None
        # (flowcell_id is already populated from the record properties above)

        # Choose bcl2fastq version based on rta_version
        ## DEV: Update version to match official documentation: i.e. 1.18.54 or later
        if StrictVersion(self.rta_version) < StrictVersion('1.18.54'):
            self.bcl2fastq_version = 1
        elif StrictVersion(self.rta_version) >= StrictVersion('1.18.54'):
            self.bcl2fastq_version = 2

        # Get barcode information (codepoint + name) from LIMS
        # Used to add barcode name to FastQ files
        self.connection = Connection(lims_url=self.lims_url,
                                     lims_token=self.lims_token)
        self.run_info = RunInfo(conn=self.connection, run=self.run_name)
        self.lane_info = self.run_info.get_lane(self.lane_index)

        self.barcode_dict = {}
        barcode_list = self.lane_info['barcodes']
        for barcode_info in barcode_list:
            self.barcode_dict[barcode_info['codepoint']] = barcode_info['name']
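
distutils.version.StrictVersion, used above to pick the bcl2fastq major version, is deprecated on recent Pythons; a minimal sketch of the same comparison with packaging.version (an alternative spelling, not what the example itself uses):

from packaging.version import Version

rta_version = '1.18.64'   # e.g. the value read from the record properties above
bcl2fastq_version = 2 if Version(rta_version) >= Version('1.18.54') else 1
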
Example #7
    def __init__(self, project_dxid, record_link, dx_user_id, user_first_name,
                 user_last_name, user_email, viewers, release_note, lims_url,
                 lims_token):
        # This is lane level stuff. Most of this info will be stored in dxrecord.
        if record_link:
            self.record_link = record_link.strip()
            link_elements = self.record_link.split(':')
            record_project = link_elements[0]
            record_dxid = link_elements[1]
            self.record = dxpy.DXRecord(dxid=record_dxid,
                                        project=record_project)
        else:
            self.record = None

        self.project_dxid = project_dxid
        self.dx_user_id = dx_user_id
        self.user_first_name = user_first_name
        self.user_last_name = user_last_name
        self.user_email = user_email
        self.viewers = viewers
        self.release_note = release_note
        self.lims_url = lims_url
        self.lims_token = lims_token

        # Values assigned during project transfer
        self.sponsored_datetime = None
        self.release_project_dxid = None
        self.clone_project_dxid = None

        # Values gotten from DXRecord
        self.properties = None
        self.details = None
        self.lane_index = None
        self.run_name = None
        self.library_name = None
        self.production = None
        self.lab = None

        if self.record:
            self.properties = self.record.get_properties()
            self.details = self.record.get_details()

            self.parse_record_details()
            self.parse_record_properties()
Example #8
def render_bundleddepends(thing):
    from ..bindings.search import find_one_data_object
    from ..exceptions import DXError
    bundles = []
    for item in thing:
        bundle_asset_record = dxpy.DXFile(item["id"]["$dnanexus_link"]).get_properties().get("AssetBundle")
        asset = None

        if bundle_asset_record:
            asset = dxpy.DXRecord(bundle_asset_record)

        if asset:
            try:
                bundles.append(asset.describe().get("name") + " (" + asset.get_id() + ")")
            except DXError:
                asset = None

        if not asset:
            bundles.append(item["name"] + " (" + item["id"]["$dnanexus_link"] + ")")

    return bundles
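
A minimal sketch of feeding render_bundleddepends from an applet's runSpec, assuming a placeholder applet ID; each bundledDepends entry is expected to carry a name and a file dxlink under 'id':

applet = dxpy.DXApplet("applet-xxxx")                        # placeholder ID
bundled = applet.get()["runSpec"].get("bundledDepends", [])
for line in render_bundleddepends(bundled):
    print(line)
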
Example #9
    def __init__(self, record_link):

        self.record_link = record_link.strip()
        link_elements = self.record_link.split(':')
        record_project = link_elements[0]
        record_dxid = link_elements[1]
        self.record = dxpy.DXRecord(dxid=record_dxid, project=record_project)

        # Get relevant dashboard details
        self.details = self.record.get_details()
        self.run_name = self.details['run']
        self.lane_index = self.details['lane']
        self.library_name = self.details['library']
        self.project_dxid = self.details['laneProject']

        # Get relevant dashboard properties
        self.properties = self.record.get_properties()
        self.details = self.record.get_details()

        self.flowcell_id = self.properties['flowcell_id']
        self.lab = self.properties['lab']
        self.operator = 'None'     # Still need to grab this info
        
        # Boolean indicating whether project is part of production pipeline
        self.is_production = None
        production = self.properties['production']
        if production == 'true':
            self.is_production = True
        else:
            self.is_production = False

        # Get mapping info for mapped lanes
        self.mapper = self.properties['mapper']
        if self.mapper == 'None':
            self.mapper = None
            self.ref_genome_dxid = None
            self.reference_genome = None
        else:
            self.ref_genome_dxid = self.properties['reference_genome_dxid']
            self.reference_genome = self.details['mappingReference']
Example #10
def main(**kwargs):

    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args(sys.argv[1:]))

    # Attempt to resolve variants gtable name
    try:
        project, folderpath, entity_result = resolve_existing_path(
            kwargs['path'], expected='entity')
    except ResolutionError as details:
        parser.exit(1, fill(str(details)) + '\n')

    if entity_result is None:
        parser.exit(
            1,
            fill('Could not resolve ' + kwargs['path'] + ' to a data object') +
            '\n')

    filename = kwargs['output']
    if filename is None:
        filename = entity_result['describe']['name'].replace('/',
                                                             '%2F') + ".vcf"

    if kwargs['output'] == '-':
        outputFile = sys.stdout
    else:
        outputFile = open(filename, 'w')
    exportRef = kwargs['export_ref_calls']
    exportNoCall = kwargs['export_no_calls']

    variantsTable = dxpy.open_dxgtable(entity_result['id'])

    try:
        originalContigSet = variantsTable.get_details()['original_contigset']
    except:
        raise dxpy.AppError(
            "The original reference genome must be attached as a detail")
    contigDetails = dxpy.DXRecord(originalContigSet).get_details()

    if kwargs['reference'] is not None:
        refFileName = kwargs['reference']
        if not os.path.isfile(refFileName):
            raise dxpy.AppError(
                "The reference expected by the variants to vcf script was not a valid file"
            )
    else:
        refFileName = tempfile.NamedTemporaryFile(prefix='reference_',
                                                  suffix='.txt',
                                                  delete=False).name
        dxpy.download_dxfile(
            contigDetails['flat_sequence_file']['$dnanexus_link'], refFileName)

    if kwargs['write_header']:

        infos = variantsTable.get_details().get('infos')
        formats = variantsTable.get_details().get('formats')
        alts = variantsTable.get_details().get('alts')
        filters = variantsTable.get_details().get('filters')
        samples = variantsTable.get_details().get('samples')

        outputFile.write("##fileformat=VCFv4.1\n")
        if infos is not None:
            for k, v in collections.OrderedDict(sorted(
                    infos.iteritems())).iteritems():
                outputFile.write("##INFO=<ID=" + k + ",Number=" + v['number'] +
                                 ",Type=" + v['type'] + ",Description=\"" +
                                 v['description'] + "\">\n")

        if len(samples) > 0:
            outputFile.write(
                "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
            )
            outputFile.write(
                "##FORMAT=<ID=AD,Number=.,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed\">\n"
            )
            outputFile.write(
                "##FORMAT=<ID=DP,Number=1,Type=String,Description=\"Approximate read depth (reads with MQ=255 or with bad mates are filtered)\">\n"
            )
        if formats is not None:
            for k, v in collections.OrderedDict(sorted(
                    formats.iteritems())).iteritems():
                outputFile.write("##FORMAT=<ID=" + k + ",Number=" +
                                 v['number'] + ",Type=" + v['type'] +
                                 ",Description=\"" + v['description'] +
                                 "\">\n")
        if alts is not None:
            for k, v in collections.OrderedDict(sorted(
                    alts.iteritems())).iteritems():
                outputFile.write("##ALT=<ID=" + k + ",Description=\"" +
                                 v['description'] + "\">\n")
        if filters is not None:
            for k, v in collections.OrderedDict(sorted(
                    filters.iteritems())).iteritems():
                outputFile.write("##FILTER=<ID=" + k + ",Description=\"" + v +
                                 "\">\n")
        for i in range(len(contigDetails['contigs']['names'])):
            outputFile.write("##contig=<ID=" +
                             contigDetails['contigs']['names'][i] +
                             ",length=" +
                             str(contigDetails['contigs']['sizes'][i]) + ">\n")
        outputFile.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO")

        if len(samples) > 0:
            outputFile.write("\tFORMAT")
            for x in samples:
                outputFile.write("\t" + x)
        outputFile.write("\n")

    chromosomeOffsets = {}
    for i in range(len(contigDetails['contigs']['names'])):
        chromosomeOffsets[contigDetails['contigs']['names']
                          [i]] = contigDetails['contigs']['offsets'][i]

    contigSequence = open(refFileName, 'r').read()

    col = {}
    names = variantsTable.get_col_names()
    for i in range(len(names)):
        col[names[i]] = i + 1
    col = collections.OrderedDict(sorted(col.items()))

    chromosomeList = contigDetails['contigs']['names']
    if kwargs['chr'] is not None:
        intersection = []
        for x in chromosomeList:
            if x in kwargs['chr']:
                intersection.append(x)
        chromosomeList = intersection[:]

    for chromosome in chromosomeList:
        buff = []
        lastPosition = -1
        query = variantsTable.genomic_range_query(chr=chromosome,
                                                  lo=0,
                                                  hi=sys.maxsize)
        for row in variantsTable.get_rows(query=query, limit=1)['data']:
            startRow = row[0]
            for row in variantsTable.iterate_rows(start=startRow):
                if row[1] != chromosome:
                    break
                if lastPosition < row[col["lo"]]:
                    writeBuffer(buff, col, outputFile, contigSequence,
                                chromosomeOffsets, exportRef, exportNoCall)
                    buff = []
                buff.append(row)
                lastPosition = row[col["lo"]]
        writeBuffer(buff, col, outputFile, contigSequence, chromosomeOffsets,
                    exportRef, exportNoCall)
        buff = []
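
The per-contig offset loop above can be written more compactly; an equivalent sketch over the same details structure:

# contigDetails as fetched from dxpy.DXRecord(originalContigSet).get_details() above
chromosomeOffsets = dict(zip(contigDetails['contigs']['names'],
                             contigDetails['contigs']['offsets']))
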
Example #11
def _dump_app_or_applet(executable, omit_resources=False, describe_output={}):
    info = executable.get()

    if info["runSpec"]["interpreter"] == "bash":
        suffix = "sh"
    elif info["runSpec"]["interpreter"] in ["python2.7", "python3", "python3.5"]:
        suffix = "py"
    else:
        print('Sorry, I don\'t know how to get executables with interpreter ' +
              info["runSpec"]["interpreter"] + '\n', file=sys.stderr)
        sys.exit(1)

    # Entry point script
    script = "src/code.%s" % (suffix,)
    os.mkdir("src")
    with open(script, "w") as f:
        f.write(info["runSpec"]["code"])

    def make_cluster_bootstrap_script_file(region, entry_point, code, suffix):
        """
        Writes the string `code` into a file at the relative path
        "src/<region>_<entry_point>_clusterBootstrap.<suffix>"
        """
        script_name = "src/%s_%s_clusterBootstrap.%s" % (region, entry_point, suffix)
        with open(script_name, "w") as f:
            f.write(code)
        return script_name

    # Get all the asset bundles
    asset_depends = []
    deps_to_remove = []

    # When an applet is built bundledDepends are added in the following order:
    # 1. bundledDepends explicitly specified in the dxapp.json
    # 2. resources (contents of resources directory added as bundledDepends)
    # 3. assetDepends (translated into bundledDepends)
    #
    # Therefore while translating bundledDepends to assetDepends, we are traversing the
    # list in reverse order and exiting when we can't find the "AssetBundle" property
    # with the tarball file.
    #
    # NOTE: If last item (and contiguous earlier items) of bundledDepends (#1 above) refers to an
    # AssetBundle tarball, those items will be converted to assetDepends.
    #
    # TODO: The bundledDepends should be annotated with another field called {"asset": true}
    # to distinguish it from non assets. It will be needed to annotate the bundleDepends,
    # when the wrapper record object is no more accessible.

    for dep in reversed(info["runSpec"]["bundledDepends"]):
        file_handle = get_handler(dep["id"])
        if isinstance(file_handle, dxpy.DXFile):
            asset_record_id = file_handle.get_properties().get("AssetBundle")
            asset_record = None
            if asset_record_id:
                asset_record = dxpy.DXRecord(asset_record_id)
                if asset_record:
                    try:
                        asset_json = {"name": asset_record.describe().get("name"),
                                              "project": asset_record.get_proj_id(),
                                              "folder": asset_record.describe().get("folder"),
                                              "version": asset_record.describe(fields={"properties": True}
                                                                               )["properties"]["version"]
                                              }
                        if dep.get("stages"):
                            asset_json["stages"] = dep["stages"]
                        asset_depends.append(asset_json)
                        deps_to_remove.append(dep)
                    except DXError:
                        print("Describe failed on the assetDepends record object with ID - " +
                              asset_record_id + "\n", file=sys.stderr)
                        pass
            else:
                break
    # Reversing the order of the asset_depends[] so that original order is maintained
    asset_depends.reverse()
    # resources/ directory
    created_resources_directory = False
    if not omit_resources:
        for dep in info["runSpec"]["bundledDepends"]:
            if dep in deps_to_remove:
                continue
            handler = get_handler(dep["id"])
            if isinstance(handler, dxpy.DXFile):
                if not created_resources_directory:
                    os.mkdir("resources")
                    created_resources_directory = True
                handler_id = handler.get_id()
                fname = "resources/%s.tar.gz" % (handler_id)
                download_dxfile(handler_id, fname)
                print("Unpacking resources", file=sys.stderr)

                def untar_strip_leading_slash(tarfname, path):
                    t = tarfile.open(tarfname)
                    for m in t.getmembers():
                        if m.name.startswith("/"):
                            m.name = m.name[1:]
                        t.extract(m, path)
                    t.close()

                untar_strip_leading_slash(fname, "resources")
                os.unlink(fname)
                deps_to_remove.append(dep)

    # TODO: if output directory is not the same as executable name we
    # should print a warning and/or offer to rewrite the "name"
    # field in the 'dxapp.json'
    dxapp_json = collections.OrderedDict()
    all_keys = executable._get_required_keys() + executable._get_optional_keys()
    for key in all_keys:
        if key in executable._get_describe_output_keys() and key in describe_output:
            dxapp_json[key] = describe_output[key]
        if key in info:
            dxapp_json[key] = info[key]
    if info.get("hidden", False):
        dxapp_json["hidden"] = True

    # TODO: inputSpec and outputSpec elements should have their keys
    # printed in a sensible (or at least consistent) order too

    # Un-inline code
    del dxapp_json["runSpec"]["code"]
    dxapp_json["runSpec"]["file"] = script

    # Remove resources from bundledDepends
    for dep in deps_to_remove:
        dxapp_json["runSpec"]["bundledDepends"].remove(dep)

    # Add assetDepends to dxapp.json
    if len(asset_depends) > 0:
        dxapp_json["runSpec"]["assetDepends"] = asset_depends

    # Ordering input/output spec keys
    ordered_spec_keys = ("name", "label", "help", "class", "type", "patterns", "optional", "default", "choices",
                         "suggestions", "group")
    for spec_key in "inputSpec", "outputSpec":
        if spec_key not in dxapp_json.keys():
            continue
        for i, spec in enumerate(dxapp_json[spec_key]):
            ordered_spec = collections.OrderedDict()
            # Adding keys, for which the ordering is defined
            for key in ordered_spec_keys:
                if key in spec.keys():
                    ordered_spec[key] = spec[key]
            # Adding the rest of the keys
            for key in spec.keys():
                if key not in ordered_spec_keys:
                    ordered_spec[key] = spec[key]
            dxapp_json[spec_key][i] = ordered_spec

    # Remove dx-toolkit from execDepends
    dx_toolkit = {"name": "dx-toolkit", "package_manager": "apt"}
    if dx_toolkit in dxapp_json["runSpec"].get("execDepends", ()):
        dxapp_json["runSpec"]["execDepends"].remove(dx_toolkit)

    # Remove "bundledDependsByRegion" field from "runSpec". This utility
    # will reconstruct the resources directory based on the
    # "bundledDepends" field, which should be equivalent to
    # "bundledDependsByRegion".
    dxapp_json["runSpec"].pop("bundledDependsByRegion", None)

    # "dx build" parses the "regionalOptions" key from dxapp.json into the
    # "runSpec.systemRequirements" field of applet/new.
    # "dx get" should parse the "systemRequirementsByRegion" field from
    # the response of /app-x/get or /applet-x/get into the "regionalOptions"
    # key in dxapp.json.
    if "systemRequirementsByRegion" in dxapp_json['runSpec']:
        dxapp_json["regionalOptions"] = {}
        for region in dxapp_json['runSpec']["systemRequirementsByRegion"]:
            region_sys_reqs = dxapp_json['runSpec']['systemRequirementsByRegion'][region]

            # handle cluster bootstrap scripts if any are present
            for entry_point in region_sys_reqs:
                try:
                    bootstrap_script = region_sys_reqs[entry_point]['clusterSpec']['bootstrapScript']
                    filename = make_cluster_bootstrap_script_file(region,
                                                                  entry_point,
                                                                  bootstrap_script,
                                                                  suffix)
                    region_sys_reqs[entry_point]['clusterSpec']['bootstrapScript'] = filename
                except KeyError:
                    # either no "clusterSpec" or no "bootstrapScript" within "clusterSpec"
                    continue

            dxapp_json["regionalOptions"][region] = \
                dict(systemRequirements=region_sys_reqs)

    # systemRequirementsByRegion data is stored in regionalOptions,
    # systemRequirements is ignored
    if 'systemRequirementsByRegion' in dxapp_json["runSpec"]:
        del dxapp_json["runSpec"]["systemRequirementsByRegion"]
    if 'systemRequirements' in dxapp_json["runSpec"]:
        del dxapp_json["runSpec"]["systemRequirements"]

    # Cleanup of empty elements. Be careful not to let this step
    # introduce any semantic changes to the app specification. For
    # example, an empty input (output) spec is not equivalent to a
    # missing input (output) spec.
    if 'runSpec' in dxapp_json:
        _recursive_cleanup(dxapp_json['runSpec'])
    if 'access' in dxapp_json:
        _recursive_cleanup(dxapp_json['access'])
    for key in executable._get_cleanup_keys():
        if key in dxapp_json and not dxapp_json[key]:
            del dxapp_json[key]

    readme = info.get("description", "")
    devnotes = info.get("developerNotes", "")

    # Write dxapp.json, Readme.md, and Readme.developer.md
    _write_json_file("dxapp.json", dxapp_json)
    if readme:
        _write_simple_file("Readme.md", readme)
    if devnotes:
        _write_simple_file("Readme.developer.md", devnotes)
Example #12
def main():
    argparser = argparse.ArgumentParser(
        description="Build a dxCompiler release")
    argparser.add_argument("--force",
                           help="Build even if there is an existing version",
                           action='store_true',
                           default=False)
    argparser.add_argument("--multi-region",
                           help="Copy to all supported regions",
                           action='store_true',
                           default=False)
    argparser.add_argument("--dry-run",
                           help="Don't build any artifacts",
                           action='store_true',
                           default=False)
    args = argparser.parse_args()

    # build multi-region jar for releases, or
    # if explicitly specified
    multi_region = args.multi_region

    # Choose which dictionary to use
    if multi_region:
        project_dict = RELEASE_DICT
    else:
        project_dict = TEST_DICT

    project = util.get_project(project_dict[HOME_REGION])
    print("project: {} ({})".format(project.name, project.get_id()))

    # Figure out what the current version is
    version_id = util.get_version_id(top_dir)
    print("version: {}".format(version_id))

    # Set the folder
    folder = "/releases/{}".format(version_id)
    print("folder: {}".format(folder))

    if args.dry_run:
        args.force = False

    # remove the existing directory paths
    if args.force:
        for proj_name in project_dict.values():
            print("removing path {}:{}".format(proj_name, folder))
            dx_proj = util.get_project(proj_name)
            try:
                dx_proj.remove_folder(folder, recurse=True)
            except dxpy.DXError:
                pass

    # Make sure the target directory exists
    project.new_folder(folder, parents=True)

    # Build the asset, and the compiler jar file.
    path_dict = dict(
        map(lambda kv: (kv[0], kv[1] + ":" + folder), project_dict.items()))
    if args.dry_run:
        return

    home_ad = util.build(project, folder, version_id, top_dir, path_dict)

    if multi_region:
        for lang, asset_desc in home_ad.asset_ids.items():
            home_rec = dxpy.DXRecord(asset_desc)
            all_regions = project_dict.keys()

            # Leave only regions where the asset is missing
            target_regions = []
            for dest_region in all_regions:
                dest_proj = util.get_project(project_dict[dest_region])
                dest_asset = util.find_asset(dest_proj, folder, lang)
                if dest_asset == None:
                    target_regions.append(dest_region)

            _clone_asset(home_rec, folder, target_regions, project_dict)
Example #13
def dump_executable(executable,
                    destination_directory,
                    omit_resources=False,
                    describe_output=[]):
    """
    Reconstitutes executable into a directory that would create a
    functionally identical executable if "dx build" were run on it.
    destination_directory will be the root source directory for the
    applet.

    :param executable: executable, i.e. app or applet,  to be dumped
    :type executable: DXExecutable (only DXApp or DXApplet now)
    :param destination_directory: an existing, empty, and writable directory
    :type destination_directory: str
    """

    old_cwd = os.getcwd()
    os.chdir(destination_directory)

    try:
        info = executable.get()

        if info["runSpec"]["interpreter"] == "bash":
            suffix = "sh"
        elif info["runSpec"]["interpreter"] == "python2.7":
            suffix = "py"
        else:
            print(
                'Sorry, I don\'t know how to get executables with interpreter '
                + info["runSpec"]["interpreter"] + '\n',
                file=sys.stderr)
            sys.exit(1)

        # Entry point script
        script = "src/code.%s" % (suffix, )
        os.mkdir("src")
        with open(script, "w") as f:
            f.write(info["runSpec"]["code"])

        # Get all the asset bundles
        asset_depends = []
        deps_to_remove = []

        # When an applet is built bundledDepends are added in the following order:
        # 1. bundledDepends explicitly specified in the dxapp.json
        # 2. resources (contents of resources directory added as bundledDepends)
        # 3. assetDepends (translated into bundledDepends)
        #
        # Therefore while translating bundledDepends to assetDepends, we are traversing the
        # list in reverse order and exiting when we can't find the "AssetBundle" property
        # with the tarball file.
        #
        # NOTE: If last item (and contiguous earlier items) of bundledDepends (#1 above) refers to an
        # AssetBundle tarball, those items will be converted to assetDepends.
        #
        # TODO: The bundledDepends should be annotated with another field called {"asset": true}
        # to distinguish it from non assets. It will be needed to annotate the bundleDepends,
        # when the wrapper record object is no more accessible.

        for dep in reversed(info["runSpec"]["bundledDepends"]):
            file_handle = get_handler(dep["id"])
            if isinstance(file_handle, dxpy.DXFile):
                asset_record_id = file_handle.get_properties().get(
                    "AssetBundle")
                asset_record = None
                if asset_record_id:
                    asset_record = dxpy.DXRecord(asset_record_id)
                    if asset_record:
                        try:
                            asset_depends.append({
                                "name":
                                asset_record.describe().get("name"),
                                "project":
                                asset_record.get_proj_id(),
                                "folder":
                                asset_record.describe().get("folder"),
                                "version":
                                asset_record.describe(
                                    fields={"properties": True})["properties"]
                                ["version"]
                            })
                            deps_to_remove.append(dep)
                        except DXError:
                            print(
                                "Describe failed on the assetDepends record object with ID - "
                                + asset_record_id + "\n",
                                file=sys.stderr)
                            pass
                else:
                    break
        # Reversing the order of the asset_depends[] so that original order is maintained
        asset_depends.reverse()
        # resources/ directory
        created_resources_directory = False
        if not omit_resources:
            for dep in info["runSpec"]["bundledDepends"]:
                if dep in deps_to_remove:
                    continue
                handler = get_handler(dep["id"])
                if isinstance(handler, dxpy.DXFile):
                    if not created_resources_directory:
                        os.mkdir("resources")
                        created_resources_directory = True
                    handler_id = handler.get_id()
                    fname = "resources/%s.tar.gz" % (handler_id)
                    download_dxfile(handler_id, fname)
                    print("Unpacking resources", file=sys.stderr)
                    tar = tarfile.open(fname)
                    tar.extractall("resources")
                    tar.close()
                    os.unlink(fname)
                    deps_to_remove.append(dep)

        # TODO: if output directory is not the same as executable name we
        # should print a warning and/or offer to rewrite the "name"
        # field in the 'dxapp.json'
        dxapp_json = collections.OrderedDict()
        all_keys = executable._get_required_keys(
        ) + executable._get_optional_keys()
        for key in all_keys:
            if key in executable._get_describe_output_keys(
            ) and key in describe_output:
                dxapp_json[key] = describe_output[key]
            if key in info:
                dxapp_json[key] = info[key]
        if info.get("hidden", False):
            dxapp_json["hidden"] = True

        # TODO: inputSpec and outputSpec elements should have their keys
        # printed in a sensible (or at least consistent) order too

        # Un-inline code
        del dxapp_json["runSpec"]["code"]
        dxapp_json["runSpec"]["file"] = script

        # Remove resources from bundledDepends
        for dep in deps_to_remove:
            dxapp_json["runSpec"]["bundledDepends"].remove(dep)

        # Add assetDepends to dxapp.json
        if len(asset_depends) > 0:
            dxapp_json["runSpec"]["assetDepends"] = asset_depends

        # Ordering input/output spec keys
        ordered_spec_keys = ("name", "label", "help", "class", "type",
                             "patterns", "optional", "default", "choices",
                             "suggestions", "group")
        for spec_key in "inputSpec", "outputSpec":
            if spec_key not in dxapp_json.keys():
                continue
            for i, spec in enumerate(dxapp_json[spec_key]):
                ordered_spec = collections.OrderedDict()
                # Adding keys, for which the ordering is defined
                for key in ordered_spec_keys:
                    if key in spec.keys():
                        ordered_spec[key] = spec[key]
                # Adding the rest of the keys
                for key in spec.keys():
                    if key not in ordered_spec_keys:
                        ordered_spec[key] = spec[key]
                dxapp_json[spec_key][i] = ordered_spec

        # Remove dx-toolkit from execDepends
        dx_toolkit = {"name": "dx-toolkit", "package_manager": "apt"}
        if dx_toolkit in dxapp_json["runSpec"]["execDepends"]:
            dxapp_json["runSpec"]["execDepends"].remove(dx_toolkit)

        # Remove "bundledDependsByRegion" field from "runSpec". This utility
        # will reconstruct the resources directory based on the
        # "bundledDepends" field, which should be equivalent to
        # "bundledDependsByRegion".
        dxapp_json["runSpec"].pop("bundledDependsByRegion", None)

        # Cleanup of empty elements. Be careful not to let this step
        # introduce any semantic changes to the app specification. For
        # example, an empty input (output) spec is not equivalent to a
        # missing input (output) spec.
        if 'runSpec' in dxapp_json:
            _recursive_cleanup(dxapp_json['runSpec'])
        if 'access' in dxapp_json:
            _recursive_cleanup(dxapp_json['access'])
        for key in executable._get_cleanup_keys():
            if key in dxapp_json and not dxapp_json[key]:
                del dxapp_json[key]

        readme = info.get("description", "")
        devnotes = info.get("developerNotes", "")

        # Write dxapp.json, Readme.md, and Readme.developer.md
        with open("dxapp.json", "w") as f:
            f.write(
                flatten_json_array(
                    json.dumps(dxapp_json, indent=2, separators=(',', ': ')),
                    "patterns"))
            f.write('\n')
        if readme:
            with open("Readme.md", "w") as f:
                f.write(readme)
        if devnotes:
            with open("Readme.developer.md", "w") as f:
                f.write(devnotes)
    except:
        err_exit()
    finally:
        os.chdir(old_cwd)
Example #14
def main(**job_inputs):
    output = {}
    reportInput = {}

    run_shell("dx-spans-to-bed --output genes.bed " +
              job_inputs["gene_model"]["$dnanexus_link"])
    bed_id = dxpy.upload_local_file("genes.bed").get_id()
    mappings_id = job_inputs["mappings"]["$dnanexus_link"]

    # get contaminant mapping started if we're doing it:
    if "contaminants" in job_inputs:
        if not "original_reads" in job_inputs:
            raise dxpy.AppError(
                "Original Reads must be input to calculate contamination levels. Please also supply the reads object that corresponds to these RNA-Seq mappings"
            )

        name_input = []
        contam_input = []

        #spawn mappings job for each ContigSet
        for contaminant in job_inputs['contaminants']:
            calc_job = map_contaminant(Reads=job_inputs['original_reads'],
                                       Contig=contaminant)

            name_input.append(dxpy.DXRecord(contaminant).describe()['name'])
            contam_input.append({"job": calc_job, "field": "percent_mapped"})

        reportInput['contam'] = contam_input
        reportInput['names'] = name_input
    else:
        reportInput['contam'] = None
        reportInput['names'] = None

    # output mappings as SAM for analysis modules
    run_shell(" ".join([
        "dx-mappings-to-sam", "--discard_unmapped", "--output mappings.sam",
        mappings_id
    ]))
    run_shell(" ".join(
        ["samtools", "view", "-S", "-b", "mappings.sam", ">", "mappings.bam"]))
    bam_id = dxpy.upload_local_file("mappings.bam",
                                    wait_on_close=True).get_id()

    job1 = dxpy.new_dxjob({
        'BED_file': bed_id,
        "BAM_file": dxpy.dxlink(bam_id)
    }, "geneBody_coverage")

    # if paired then do inner distance calculation
    if "chr2" in dxpy.DXGTable(mappings_id).get_col_names():
        job2 = dxpy.new_dxjob(
            {
                'BED_file': bed_id,
                "BAM_file": dxpy.dxlink(bam_id)
            }, "inner_distance")
    else:
        job2 = None

    job3 = dxpy.new_dxjob({
        'BED_file': bed_id,
        "BAM_file": dxpy.dxlink(bam_id)
    }, "junction_annotation")

    job4 = dxpy.new_dxjob({"BAM_file": dxpy.dxlink(bam_id)},
                          "read_duplication")

    # implement this one when we can request a large RAM instance - requires 19GB for human genome
    job5 = dxpy.new_dxjob({
        'BED_file': bed_id,
        "BAM_file": dxpy.dxlink(bam_id)
    }, "read_distribution")
    #                       {"systemRequirements": {"instanceType":"dx_m2.2xlarge"}} )

    reportInput['geneBody'] = {"job": job1.get_id(), "field": "results"}
    if job2 != None:
        reportInput['inner_dist'] = {"job": job2.get_id(), "field": "results"}
    else:
        reportInput['inner_dist'] = None

    reportInput['junc_ann'] = {"job": job3.get_id(), "field": "results"}
    reportInput['read_dup'] = {"job": job4.get_id(), "field": "results"}
    reportInput['read_dist'] = {"job": job5.get_id(), "field": "results"}
    reportInput['mappings'] = job_inputs["mappings"]

    reportJob = dxpy.new_dxjob(reportInput, "generate_report")

    output['report'] = {"job": reportJob.get_id(), "field": "Report"}

    return output
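
The {'job': ..., 'field': ...} dictionaries above are hand-built job-based object references; DXJob handles can produce the $dnanexus_link-wrapped form of the same reference with get_output_ref, e.g. (an alternative spelling, not what the example itself does):

reportInput['geneBody'] = job1.get_output_ref("results")
output['report'] = reportJob.get_output_ref("Report")
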
Example #15
def main():
    '''Get lane, base, and read stats for the given month.
    '''

    args = parse_args(sys.argv[1:])
    outfile = args.outfile

    if args.cron:
        now = datetime.datetime.now()
        year = now.year
        month = now.month - 1
        print 'Info: Collecting metrics for %d-%d' % (year, month)
    else:
        year = args.year
        month = args.month
    monthly_outfile = '%d-%d_seq-stats.txt' % (year, month)

    monthly_metrics = defaultdict()
    '''
    monthly_metrics = {
                       'lane_count' : 0,
                       'base_count' : 0,
                       'read_count' : 0
                      }
    '''

    # Dev: NEED TO QC THIS
    after_date = '%d-%d-01' % (year, month)
    if 1 <= month < 12:
        before_date = '%d-%d-01' % (year, int(month + 1))
    elif month == 12:
        before_date = '%d-1-01' % int(year + 1)
    else:
        print 'Error: invalid month %d' % month
        sys.exit()

    print 'After: %s' % after_date
    print 'Before: %s ' % before_date

    monthly_records = dxpy.find_data_objects(
        classname='record',
        project='project-BY82j6Q0jJxgg986V16FQzjx',
        folder='/',
        typename='SCGPMRun',
        created_after=after_date,
        created_before=before_date)

    MOUT = open(monthly_outfile, 'w')
    for record in monthly_records:
        sequencer_type = None

        print record['id']
        dxrecord = dxpy.DXRecord(record['id'], record['project'])
        lane_details = dxrecord.get_details()
        lane_properties = dxrecord.get_properties()

        try:
            production = lane_properties['production']
            if production == 'false':
                continue
        except:
            print 'Skipping: not production'
            continue

        dxproject = lane_details['laneProject']
        lane_index = int(lane_details['lane'])
        run_name = str(lane_details['run'])
        paired_end = bool(lane_properties['paired_end'])

        #try:
        #    sequencer_type = str(lane_properties['sequencer_type'])
        #except:
        seq_instrument = str(lane_properties['seq_instrument'])
        sequencer_type = classify_instrument(seq_instrument)
        #sequencer_type = sequencer_type.replace(' ', '_')

        # Create monthly_records defaultdict to store cumulative stats
        if not sequencer_type in monthly_metrics.keys():
            monthly_metrics[sequencer_type] = {
                'lane_count': 0,
                'base_count': 0,
                'read_count': 0
            }

        lane_name = '%s_L%d' % (run_name, lane_index)
        print 'Processing %s' % lane_name

        try:
            html_file = dxpy.find_one_data_object(
                classname='file',
                name='*.lane.html',
                name_mode='glob',
                project=dxproject,
                folder='/stage0_bcl2fastq/miscellany',
                more_ok=True,
                zero_ok=False)
        except:
            print 'Warning: Could not get lane.html file. Skipping'
            continue

        html_dxfile = dxpy.DXFile(html_file['id'], html_file['project'])
        lane_metrics = parse_lane_html(html_dxfile, lane_index, lane_name)

        pf_clusters = int(lane_metrics['pf_clusters'].replace(',', ''))
        if paired_end:
            read_count = pf_clusters * 2
        else:
            read_count = pf_clusters

        mbase_count = int(lane_metrics['yield_mbases'].replace(',', ''))
        base_count = mbase_count * 1000000

        monthly_metrics[sequencer_type]['lane_count'] += 1
        monthly_metrics[sequencer_type]['base_count'] += base_count
        monthly_metrics[sequencer_type]['read_count'] += read_count

        # Write individual record data to the monthly out file.
        # One record/lane per line.
        #pdb.set_trace()
        mout_str = ('{}\t'.format(year) + '{}\t'.format(month) +
                    '{}\t'.format(run_name) + '{}\t'.format(lane_index) +
                    '{}\t'.format(read_count) + '{}\t'.format(base_count) +
                    '{}\n'.format(sequencer_type))
        MOUT.write(mout_str)
    MOUT.close()

    # Add header to new outfile
    if not os.path.isfile(outfile):
        with open(outfile, 'w') as OUT:
            OUT.write(
                'Year\tMonth\tLane_Count\tRead_Count\tBase_Count\tSeq_Type\n')
    # Write monthly metrics to outfile
    with open(outfile, 'a') as OUT:
        '''Old string formatting method
        out_str = '%d\t%d\t%d\t%d\t%d\n' % (
                                            year,
                                            month,
                                            monthly_metrics['lane_count'],
                                            monthly_metrics['read_count'],
                                            monthly_metrics['base_count'],
                                            sequencer_type)
        '''
        #pdb.set_trace()
        for sequencer_type in monthly_metrics.keys():
            out_str = (
                '{}\t'.format(year) + '{}\t'.format(month) +
                '{}\t'.format(monthly_metrics[sequencer_type]['lane_count']) +
                '{}\t'.format(monthly_metrics[sequencer_type]['read_count']) +
                '{}\t'.format(monthly_metrics[sequencer_type]['base_count']) +
                '{}\n'.format(sequencer_type))
            OUT.write(out_str)
    out_prefix = outfile.split('.')[0]
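
The month-window arithmetic above (flagged by the Dev note) is easy to get wrong; a minimal sketch of computing the same after/before boundaries with datetime.date, assuming first-of-month boundaries:

import datetime

start = datetime.date(year, month, 1)
end = datetime.date(year + 1, 1, 1) if month == 12 else datetime.date(year, month + 1, 1)
after_date = start.isoformat()     # e.g. '2016-03-01'
before_date = end.isoformat()      # e.g. '2016-04-01'
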
Example #16
def main(**kwargs):

    if len(kwargs) == 0:
        opts = parser.parse_args(sys.argv[1:])
    else:
        opts = parser.parse_args(kwargs)

    if opts.mappings_id == None:
        parser.print_help()
        sys.exit(1)

    mappingsTable = dxpy.DXGTable(opts.mappings_id)
    idAsName = opts.id_as_name
    idPrepend = opts.id_prepend
    writeRowId = opts.write_row_id

    paired = "chr2" in mappingsTable.get_col_names()

    regions = []
    if opts.region_file != "":
        regions = re.findall("-L ([^:]*):(\d+)-(\d+)",
                             open(opts.region_file, 'r').read())

    name = mappingsTable.describe()['name']

    if opts.reference != None:
        originalContig = opts.reference
    else:
        try:
            originalContig = mappingsTable.get_details(
            )['original_contigset']['$dnanexus_link']
        except:
            raise dxpy.AppError(
                "The original reference genome must be attached to mappings table"
            )

    try:
        contigDetails = dxpy.DXRecord(originalContig).get_details()['contigs']
    except:
        raise dxpy.AppError("Unable to access reference with ID " +
                            originalContig)

    contigNames = contigDetails['names']
    contigSizes = contigDetails['sizes']

    if opts.file_name != None:
        outputFile = open(opts.file_name, 'w')
    else:
        outputFile = None

    header = ""

    for i in range(len(contigNames)):
        header += "@SQ\tSN:" + str(contigNames[i]) + "\tLN:" + str(
            contigSizes[i]) + "\n"

    assignReadGroup = opts.assign_read_group
    if assignReadGroup != "":
        header += "@RG\tID:" + assignReadGroup + "\tSM:Sample_0"
    else:
        for i in range(len(mappingsTable.get_details()['read_groups'])):
            header += "@RG\tID:" + str(i) + "\tSM:Sample_" + str(i)
            if opts.read_group_platform != '':
                header += "\tPL:" + opts.read_group_platform
            header += "\n"

    if outputFile != None:
        outputFile.write(header)
    else:
        sys.stdout.write(header)

    col = {}
    names = mappingsTable.get_col_names()
    for i in range(len(names)):
        col[names[i]] = i + 1

    column_descs = mappingsTable.describe()['columns']

    sam_cols = []
    sam_col_names = []
    sam_col_types = {}
    for c in column_descs:
        if c['name'].startswith(
                "sam_field_") or c['name'] == "sam_optional_fields":
            sam_cols.append(c)
            sam_col_names.append(c['name'])
            sam_col_types[c['name']] = c['type']

    defaultCol = {
        "sequence": "",
        "name": "",
        "quality": "",
        "status": "UNMAPPED",
        "chr": "",
        "lo": 0,
        "hi": 0,
        "negative_strand": False,
        "error_probability": 0,
        "qc_fail": False,
        "duplicate": False,
        "cigar": "",
        "mate_id": -1,
        "status2": "",
        "chr2": "",
        "lo2": 0,
        "hi2": 0,
        "negative_strand2": False,
        "proper_pair": False,
        "read_group": 0
    }

    #unmappedFile = open("unmapped.txt", 'w')

    if len(regions) == 0:

        if opts.start_row > mappingsTable.describe()['length']:
            raise dxpy.AppError(
                "Starting row is larger than number of rows in table")
        elif opts.end_row < opts.start_row:
            raise dxpy.AppError("Ending row is before Start")

        if opts.end_row > 0:
            generator = mappingsTable.iterate_rows(start=opts.start_row,
                                                   end=opts.end_row,
                                                   want_dict=True)
        else:
            generator = mappingsTable.iterate_rows(start=opts.start_row,
                                                   want_dict=True)

        # write each row unless we're throwing out unmapped
        for row in generator:
            if row["status"] != "UNMAPPED" or opts.discard_unmapped == False:
                if not paired:
                    writeRow(row, col, defaultCol, outputFile, idAsName,
                             idPrepend, writeRowId, assignReadGroup,
                             column_descs, sam_cols, sam_col_names,
                             sam_col_types)
                elif opts.no_interchromosomal and row["chr"] == row["chr2"]:
                    writeRow(row, col, defaultCol, outputFile, idAsName,
                             idPrepend, writeRowId, assignReadGroup,
                             column_descs, sam_cols, sam_col_names,
                             sam_col_types)
                elif opts.only_interchromosomal and opts.no_interchromosomal == False and (
                        row["chr"] != row["chr2"] or
                    (row["chr"] == "" and row["chr2"] == "")):
                    writeRow(row, col, defaultCol, outputFile, idAsName,
                             idPrepend, writeRowId, assignReadGroup,
                             column_descs, sam_cols, sam_col_names,
                             sam_col_types)
                elif opts.no_interchromosomal == False and opts.only_interchromosomal == False:
                    writeRow(row, col, defaultCol, outputFile, idAsName,
                             idPrepend, writeRowId, assignReadGroup,
                             column_descs, sam_cols, sam_col_names,
                             sam_col_types)

    else:
        for x in regions:
            # generate the query for this region
            query = mappingsTable.genomic_range_query(
                x[0],
                int(x[1]) + opts.region_index_offset,
                int(x[2]) + opts.region_index_offset,
                index='gri')
            for row in mappingsTable.get_rows(query=query, limit=1)['data']:
                startRow = row[0]
                for row in mappingsTable.iterate_rows(start=startRow,
                                                      want_dict=True):
                    if row["chr"] != x[0] or row["lo"] > int(
                            x[2]) + opts.region_index_offset:
                        break
                    if row["status"] != "UNMAPPED" or opts.discard_unmapped == False:
                        if not paired:
                            writeRow(row, col, defaultCol, outputFile,
                                     idAsName, idPrepend, writeRowId,
                                     assignReadGroup, column_descs, sam_cols,
                                     sam_col_names, sam_col_types)
                        elif opts.no_interchromosomal and row["chr"] == row[
                                "chr2"]:
                            writeRow(row, col, defaultCol, outputFile,
                                     idAsName, idPrepend, writeRowId,
                                     assignReadGroup, column_descs, sam_cols,
                                     sam_col_names, sam_col_types)
                        elif opts.only_interchromosomal and opts.no_interchromosomal == False and (
                                row["chr"] != row["chr2"] or
                            (row["chr"] == "" and row["chr2"] == "")):
                            writeRow(row, col, defaultCol, outputFile,
                                     idAsName, idPrepend, writeRowId,
                                     assignReadGroup, column_descs, sam_cols,
                                     sam_col_names, sam_col_types)
                        elif opts.no_interchromosomal == False and opts.only_interchromosomal == False:
                            writeRow(row, col, defaultCol, outputFile,
                                     idAsName, idPrepend, writeRowId,
                                     assignReadGroup, column_descs, sam_cols,
                                     sam_col_names, sam_col_types)

    if outputFile is not None:
        outputFile.close()
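
The four writeRow branches above make the same call under different pairing filters; the only thing that varies is whether a row passes the chr/chr2 test. A minimal sketch of that decision factored into a predicate follows (the helper name should_write is hypothetical and not part of the original applet):

def should_write(row, opts, paired):
    # Mirrors the branching above: unmapped rows are dropped when requested,
    # unpaired data is always written, and paired data is filtered by the
    # interchromosomal options.
    if row["status"] == "UNMAPPED" and opts.discard_unmapped:
        return False
    if not paired:
        return True
    if opts.no_interchromosomal:
        return row["chr"] == row["chr2"]
    if opts.only_interchromosomal:
        return row["chr"] != row["chr2"] or (row["chr"] == "" and row["chr2"] == "")
    return True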
Пример #17
0
def main():
    argparser = argparse.ArgumentParser(description="Build a dxWDL release")
    argparser.add_argument(
        "--force",
        help="Build even if the there is an existing version",
        action='store_true',
        default=False)
    argparser.add_argument("--multi-region",
                           help="Copy to all supported regions",
                           action='store_true',
                           default=False)
    args = argparser.parse_args()

    # build multi-region jar for releases, or
    # if explicitly specified
    multi_region = args.multi_region

    # Choose which dictionary to use
    if multi_region:
        project_dict = RELEASE_DICT
    else:
        project_dict = TEST_DICT

    project = util.get_project(project_dict[HOME_REGION])
    print("project: {} ({})".format(project.name, project.get_id()))

    # Figure out what the current version is
    version_id = util.get_version_id(top_dir)
    print("version: {}".format(version_id))

    # Set the folder
    folder = "/releases/{}".format(version_id)
    print("folder: {}".format(folder))

    # remove the existing directory paths
    if args.force:
        for proj_name in project_dict.values():
            print("removing path {}:{}".format(proj_name, folder))
            dx_proj = util.get_project(proj_name)
            try:
                dx_proj.remove_folder(folder, recurse=True)
            except dxpy.DXError:
                pass

    # Make sure the target directory exists
    project.new_folder(folder, parents=True)

    # Build the asset, and the compiler jar file.
    path_dict = dict(
        map(lambda kv: (kv[0], kv[1] + ":" + folder), project_dict.items()))
    (jar_path, home_ad) = util.build(project, folder, version_id, top_dir,
                                     path_dict)

    if multi_region:
        # download dxWDL runtime library
        home_rec = dxpy.DXRecord(home_ad.asset_id)
        fid = home_rec.get_details()['archiveFileId']['$dnanexus_link']
        fn = dxpy.describe(fid)['name']
        rtlib_path = "/tmp/{}".format(fn)
        print("Download asset file {}".format(fn))
        dxpy.download_dxfile(fid, rtlib_path, show_progress=True)

        # copy to all other regions
        for region in project_dict.keys():
            if region != home_ad.region:
                proj = project_dict[region]
                if proj is None:
                    raise Exception(
                        "No project configured for region {}".format(region))
                dest_proj = util.get_project(proj)
                if dest_proj is not None:
                    dest_ad = util.copy_across_regions(rtlib_path, home_rec,
                                                       region, dest_proj,
                                                       folder)
                else:
                    print("No project named {}".format(proj))

    # Upload compiler jar file
    util.upload_local_file(jar_path, project, folder)
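
The path_dict constructed above maps each region name to a "project-name:folder" string before calling util.build. A sketch of the same mapping written as a dict comprehension, with assumed example values for project_dict and folder (both are placeholders, not the real release configuration):

# Illustrative only: an equivalent construction of path_dict.
project_dict = {"aws:us-east-1": "dxWDL", "azure:westus": "dxWDL_Azure"}  # placeholder project names
folder = "/releases/1.0.0"                                                # placeholder folder
path_dict = {region: proj_name + ":" + folder
             for region, proj_name in project_dict.items()}
# -> {'aws:us-east-1': 'dxWDL:/releases/1.0.0', 'azure:westus': 'dxWDL_Azure:/releases/1.0.0'}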
Пример #18
0
def main():
    argparser = argparse.ArgumentParser(description="Build the dxWDL jar file")
    argparser.add_argument("--folder", help="Destination folder")
    argparser.add_argument("--multi-region",
                           help="Copy to all supported regions",
                           action='store_true',
                           default=False)
    argparser.add_argument("--release",
                           help="Create a dxWDL release, implies multi-region",
                           action='store_true',
                           default=False)
    args = argparser.parse_args()

    # resolve project
    project_dict = None
    if args.release:
        project_dict = RELEASE_DICT
    else:
        project_dict = TEST_DICT
    project = util.get_project(project_dict[HOME_REGION])
    print("project: {} ({})".format(project.name, project.get_id()))

    # Set the folder, build one if necessary
    if args.folder is not None:
        folder = args.folder
    elif args.release:
        folder = time.strftime("/releases/%Y-%m-%d/%H%M%S")
        project.new_folder(folder, parents=True)
    else:
        folder = time.strftime("/builds/%Y-%m-%d/%H%M%S")
        project.new_folder(folder, parents=True)
    print("folder: {}".format(folder))

    # build multi-region jar for releases, or
    # if explicitly specified
    multi_region = args.multi_region
    if args.release:
        multi_region = True

    # Figure out what the current version is
    version_id = util.get_version_id(top_dir)
    print("version: {}".format(version_id))

    # build the asset
    home_ad = util.build(project, folder, version_id, top_dir)

    ad_all = [home_ad]
    if multi_region:
        # download dxWDL runtime library
        home_rec = dxpy.DXRecord(home_ad.asset_id)
        fid = home_rec.get_details()['archiveFileId']['$dnanexus_link']
        fn = dxpy.describe(fid)['name']
        rtlib_path = "/tmp/{}".format(fn)
        print("Download asset file {}".format(fn))
        dxpy.download_dxfile(fid, rtlib_path, show_progress=True)

        # copy to all other regions
        for region in project_dict.keys():
            if region != home_ad.region:
                proj = project_dict[region]
                if proj is None:
                    raise Exception(
                        "No project configured for region {}".format(region))
                dest_proj = util.get_project(proj)
                dest_ad = util.copy_across_regions(rtlib_path, home_rec,
                                                   region, dest_proj, folder)
                ad_all.append(dest_ad)

    # build the final jar file, containing a list of the per-region
    # assets
    jar_path = util.build_final_jar(version_id, top_dir, ad_all)

    # Upload compiler jar file
    if args.release:
        util.upload_local_file(jar_path, project, folder)
Пример #19
0
def clone_asset(record_id, regions, num_retries=0, priority=None):
    """
    This function will attempt to clone the given record into all of the given regions.
    It will return a dictionary with the regions as keys and the record-ids of the
    corresponding asset as the values.  If an asset is not able to be created in a given
    region, the value will be set to None.
    """
    # Get the asset record
    record = dxpy.DXRecord(record_id)
    fid = record.get_details()['archiveFileId']['$dnanexus_link']
    curr_region = dxpy.describe(record.project)['region']

    # Only run once per region
    regions = set(regions) - set([curr_region])
    app_supported_regions = set(
        CLONE_ASSET_APP.describe()['regionalOptions'].keys())
    if len(regions - app_supported_regions) > 0:
        print('Currently no support for the following region(s): [{0}]'.format(
            ', '.join(regions - app_supported_regions)),
              file=sys.stderr)
        sys.exit(1)

    # Get information about the asset
    record_name = record.name
    asset_properties = record.get_properties()
    asset_properties['cloned_from'] = record_id
    asset_file_name = dxpy.describe(fid)['name']
    url = dxpy.DXFile(fid).get_download_url(
        preauthenticated=True,
        project=dxpy.DXFile.NO_PROJECT_HINT,
        duration=URL_DURATION)[0]

    # Fire off a clone process for each region
    pool = multiprocessing.Pool(len(regions))
    manager = multiprocessing.Manager()
    q = manager.Queue()
    clone_asset_func = functools.partial(_clone_asset_into_region,
                                         record_name=record_name,
                                         q=q,
                                         asset_properties=asset_properties,
                                         asset_file_name=asset_file_name,
                                         url=url,
                                         num_retries=num_retries,
                                         priority=priority)
    results = pool.map_async(clone_asset_func, regions)

    # Get and return the results
    remaining_regions = regions
    print('Waiting on region(s): {0} '.format(' '.join(remaining_regions)))
    while True:
        if results.ready():
            break
        else:
            if q.qsize() > 0:
                for i in range(q.qsize()):
                    received = set([q.get()])
                    remaining_regions = remaining_regions - received
                print('\nWaiting on region(s): {0} '.format(
                    ' '.join(remaining_regions)))
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(SLEEP_TIME)

    print('\nDone')
    results = results.get()
    record_ids = {}
    for result in results:
        for region in result:
            if result[region] is None:
                record_ids[region] = None
            else:
                record_ids[region] = result[region]['$dnanexus_link']

    return record_ids
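
A minimal usage sketch for clone_asset, assuming a hypothetical asset record id and target regions; the ids and region names below are placeholders:

# Illustrative only: cloning a placeholder asset record into two regions.
if __name__ == "__main__":
    cloned = clone_asset(
        record_id="record-xxxxxxxxxxxxxxxxxxxxxxxx",  # placeholder asset record id
        regions=["aws:us-east-1", "azure:westus"],    # placeholder target regions
        num_retries=2)
    for region, rec_id in cloned.items():
        print("{}: {}".format(region, rec_id if rec_id else "clone failed"))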
Пример #20
0
def upload_applet(src_dir,
                  uploaded_resources,
                  check_name_collisions=True,
                  overwrite=False,
                  archive=False,
                  project=None,
                  override_folder=None,
                  override_name=None,
                  dx_toolkit_autodep="stable",
                  dry_run=False,
                  **kwargs):
    """
    Creates a new applet object.

    :param project: ID of container in which to create the applet.
    :type project: str, or None to use whatever is specified in dxapp.json
    :param override_folder: folder name for the resulting applet which, if specified, overrides that given in dxapp.json
    :type override_folder: str
    :param override_name: name for the resulting applet which, if specified, overrides that given in dxapp.json
    :type override_name: str
    :param dx_toolkit_autodep: What type of dx-toolkit dependency to
        inject if none is present. "stable" for the APT package; "git"
        for HEAD of dx-toolkit master branch; or False for no
        dependency.
    :type dx_toolkit_autodep: boolean or string

    """
    applet_spec = _get_applet_spec(src_dir)

    if project is None:
        dest_project = applet_spec['project']
    else:
        dest_project = project
        applet_spec['project'] = project

    if 'name' not in applet_spec:
        try:
            applet_spec['name'] = os.path.basename(os.path.abspath(src_dir))
        except:
            raise AppBuilderException(
                "Could not determine applet name from the specification (dxapp.json) or from the name of the working directory (%r)"
                % (src_dir, ))

    if override_folder:
        applet_spec['folder'] = override_folder
    if 'folder' not in applet_spec:
        applet_spec['folder'] = '/'

    if override_name:
        applet_spec['name'] = override_name

    if 'dxapi' not in applet_spec:
        applet_spec['dxapi'] = dxpy.API_VERSION

    applets_to_overwrite = []
    archived_applet = None
    if check_name_collisions and not dry_run:
        destination_path = applet_spec['folder'] + (
            '/' if not applet_spec['folder'].endswith('/') else
            '') + applet_spec['name']
        logger.debug("Checking for existing applet at " + destination_path)
        for result in dxpy.find_data_objects(classname="applet",
                                             name=applet_spec["name"],
                                             folder=applet_spec['folder'],
                                             project=dest_project,
                                             recurse=False):
            if overwrite:
                # Don't remove the old applet until after the new one
                # has been created. This avoids a race condition where
                # we remove the old applet, but that causes garbage
                # collection of the bundled resources that will be
                # shared with the new applet
                applets_to_overwrite.append(result['id'])
            elif archive:
                logger.debug("Archiving applet %s" % (result['id']))
                proj = dxpy.DXProject(dest_project)
                archive_folder = '/.Applet_archive'
                try:
                    proj.list_folder(archive_folder)
                except dxpy.DXAPIError:
                    proj.new_folder(archive_folder)

                proj.move(objects=[result['id']], destination=archive_folder)
                archived_applet = dxpy.DXApplet(result['id'],
                                                project=dest_project)
                now = datetime.datetime.fromtimestamp(archived_applet.created /
                                                      1000).ctime()
                new_name = archived_applet.name + " ({d})".format(d=now)
                archived_applet.rename(new_name)
                logger.info(
                    "Archived applet %s to %s:\"%s/%s\"" %
                    (result['id'], dest_project, archive_folder, new_name))
            else:
                raise AppBuilderException(
                    "An applet already exists at %s (id %s) and the --overwrite (-f) or --archive (-a) options were not given"
                    % (destination_path, result['id']))

    # -----
    # Override various fields from the pristine dxapp.json

    # Carry region-specific values from regionalOptions into the main
    # runSpec
    applet_spec["runSpec"].setdefault("bundledDepends", [])
    applet_spec["runSpec"].setdefault("assetDepends", [])
    if not dry_run:
        region = dxpy.api.project_describe(
            dest_project, input_params={"fields": {
                "region": True
            }})["region"]

        # if regionalOptions contain at least one region, they must include
        # the region of the target project
        if len(applet_spec.get('regionalOptions',
                               {})) != 0 and region not in applet_spec.get(
                                   'regionalOptions', {}):
            err_mesg = "destination project is in region {} but \"regionalOptions\" do not contain this region. ".format(
                region)
            err_mesg += "Please, update your \"regionalOptions\" specification"
            raise AppBuilderException(err_mesg)

        regional_options = applet_spec.get('regionalOptions',
                                           {}).get(region, {})
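        # Illustrative only (not part of the original function): the dxapp.json
        # "regionalOptions" shape this code expects looks roughly like
        #   "regionalOptions": {
        #       "aws:us-east-1": {
        #           "systemRequirements": {"*": {"instanceType": "mem1_ssd1_x4"}},
        #           "bundledDepends": [],
        #           "assetDepends": []
        #       }
        #   }
        # The region name and instance type above are placeholder examples.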

        # We checked earlier that if region-specific values for the
        # fields below are given, the same fields are not also specified
        # in the top-level runSpec. So the operations below should not
        # result in any user-supplied settings being clobbered.

        if 'systemRequirements' in regional_options:
            applet_spec["runSpec"]["systemRequirements"] = regional_options[
                'systemRequirements']

        if 'bundledDepends' in regional_options:
            applet_spec["runSpec"]["bundledDepends"].extend(
                regional_options["bundledDepends"])
        if 'assetDepends' in regional_options:
            applet_spec["runSpec"]["assetDepends"].extend(
                regional_options["assetDepends"])

    # Inline Readme.md and Readme.developer.md
    dxpy.executable_builder.inline_documentation_files(applet_spec, src_dir)

    # Inline the code of the program
    if "file" in applet_spec["runSpec"]:
        # Put it into runSpec.code instead
        with open(os.path.join(src_dir,
                               applet_spec["runSpec"]["file"])) as code_fh:
            applet_spec["runSpec"]["code"] = code_fh.read()
            del applet_spec["runSpec"]["file"]

    # If this applet requires a cluster, inline any bootstrapScript code that may be provided.
    # bootstrapScript is an *optional* clusterSpec parameter.
    # NOTE: assumes bootstrapScript is always provided as a filename
    if "systemRequirements" in applet_spec["runSpec"]:
        sys_reqs = applet_spec["runSpec"]["systemRequirements"]
        for entry_point in sys_reqs:
            try:
                bootstrap_script = os.path.join(
                    src_dir,
                    sys_reqs[entry_point]["clusterSpec"]["bootstrapScript"])
                with open(bootstrap_script) as code_fh:
                    sys_reqs[entry_point]["clusterSpec"][
                        "bootstrapScript"] = code_fh.read()
            except KeyError:
                # either no "clusterSpec" or no "bootstrapScript" within "clusterSpec"
                continue
            except IOError:
                raise AppBuilderException(
                    "The clusterSpec \"bootstrapScript\" could not be read.")

    # Attach bundled resources to the app
    if uploaded_resources is not None:
        applet_spec["runSpec"]["bundledDepends"].extend(uploaded_resources)

    # Validate and process assetDepends
    asset_depends = applet_spec["runSpec"]["assetDepends"]
    if type(asset_depends) is not list or any(
            type(dep) is not dict for dep in asset_depends):
        raise AppBuilderException(
            "Expected runSpec.assetDepends to be an array of objects")
    for asset in asset_depends:
        asset_project = asset.get("project", None)
        asset_folder = asset.get("folder", '/')
        asset_stages = asset.get("stages", None)
        if "id" in asset:
            asset_record = dxpy.DXRecord(asset["id"]).describe(
                fields={'details'}, default_fields=True)
        elif "name" in asset and asset_project is not None and "version" in asset:
            try:
                asset_record = dxpy.find_one_data_object(
                    zero_ok=True,
                    classname="record",
                    typename="AssetBundle",
                    name=asset["name"],
                    properties=dict(version=asset["version"]),
                    project=asset_project,
                    folder=asset_folder,
                    recurse=False,
                    describe={
                        "defaultFields": True,
                        "fields": {
                            "details": True
                        }
                    },
                    state="closed",
                    more_ok=False)
            except dxpy.exceptions.DXSearchError:
                msg = "Found more than one asset record that matches: name={0}, folder={1} in project={2}."
                raise AppBuilderException(
                    msg.format(asset["name"], asset_folder, asset_project))
        else:
            raise AppBuilderException(
                "Each runSpec.assetDepends element must have either {'id'} or "
                "{'name', 'project' and 'version'} field(s).")

        if asset_record:
            if "id" in asset:
                asset_details = asset_record["details"]
            else:
                asset_details = asset_record["describe"]["details"]
            if "archiveFileId" in asset_details:
                archive_file_id = asset_details["archiveFileId"]
            else:
                raise AppBuilderException(
                    "The required field 'archiveFileId' was not found in "
                    "the details of the asset bundle %s " % asset_record["id"])
            archive_file_name = dxpy.DXFile(archive_file_id).describe()["name"]
            bundle_depends = {"name": archive_file_name, "id": archive_file_id}
            if asset_stages:
                bundle_depends["stages"] = asset_stages
            applet_spec["runSpec"]["bundledDepends"].append(bundle_depends)
            # If the file is not found in the applet destination project, clone it from the asset project
            if (not dry_run and
                    dxpy.DXRecord(dxid=asset_record["id"],
                                  project=dest_project).describe()["project"]
                    != dest_project):
                dxpy.DXRecord(
                    asset_record["id"],
                    project=asset_record["project"]).clone(dest_project)
        else:
            raise AppBuilderException(
                "No asset bundle was found that matched the specification %s" %
                (json.dumps(asset)))

    # Include the DNAnexus client libraries as an execution dependency, if they are not already
    # there
    if dx_toolkit_autodep == "git":
        dx_toolkit_dep = {
            "name": "dx-toolkit",
            "package_manager": "git",
            "url": "git://github.com/dnanexus/dx-toolkit.git",
            "tag": "master",
            "build_commands": "make install DESTDIR=/ PREFIX=/opt/dnanexus"
        }
    elif dx_toolkit_autodep == "stable":
        dx_toolkit_dep = {"name": "dx-toolkit", "package_manager": "apt"}
    elif dx_toolkit_autodep:
        raise AppBuilderException(
            "dx_toolkit_autodep must be one of 'stable', 'git', or False; got %r instead"
            % (dx_toolkit_autodep, ))

    if dx_toolkit_autodep:
        applet_spec["runSpec"].setdefault("execDepends", [])
        exec_depends = applet_spec["runSpec"]["execDepends"]
        if type(exec_depends) is not list or any(
                type(dep) is not dict for dep in exec_depends):
            raise AppBuilderException(
                "Expected runSpec.execDepends to be an array of objects")
        dx_toolkit_dep_found = any(
            dep.get('name') in DX_TOOLKIT_PKGS
            or dep.get('url') in DX_TOOLKIT_GIT_URLS for dep in exec_depends)
        if not dx_toolkit_dep_found:
            exec_depends.append(dx_toolkit_dep)
            if dx_toolkit_autodep == "git":
                applet_spec.setdefault("access", {})
                applet_spec["access"].setdefault("network", [])
                # Note: this can be set to "github.com" instead of "*" if the build doesn't download any deps
                if "*" not in applet_spec["access"]["network"]:
                    applet_spec["access"]["network"].append("*")

    merge(applet_spec, kwargs)

    # -----
    # Now actually create the applet

    if dry_run:
        print("Would create the following applet:")
        print(json.dumps(applet_spec, indent=2))
        print("*** DRY-RUN-- no applet was created ***")
        return None, None

    if applet_spec.get("categories", []):
        if "tags" not in applet_spec:
            applet_spec["tags"] = []
        applet_spec["tags"] = list(
            set(applet_spec["tags"]) | set(applet_spec["categories"]))

    applet_id = dxpy.api.applet_new(applet_spec)["id"]

    if archived_applet:
        archived_applet.set_properties({'replacedWith': applet_id})

    # Now it is permissible to delete the old applet(s), if any
    if applets_to_overwrite:
        logger.info("Deleting applet(s) %s" % (','.join(applets_to_overwrite)))
        dxpy.DXProject(dest_project).remove_objects(applets_to_overwrite)

    return applet_id, applet_spec
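
A minimal usage sketch for upload_applet, assuming a hypothetical applet source directory and destination project; every path and id below is a placeholder:

# Illustrative only: building an applet from a placeholder source directory.
if __name__ == "__main__":
    applet_id, spec = upload_applet(
        src_dir="./my_applet",                         # placeholder directory containing dxapp.json
        uploaded_resources=None,                       # no pre-uploaded resource bundle
        overwrite=True,                                # replace any applet with the same name
        project="project-xxxxxxxxxxxxxxxxxxxxxxxx")    # placeholder destination project
    print("created applet {} in folder {}".format(applet_id, spec["folder"]))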
Пример #21
0
def main(**job_inputs):
    print "Beginning processing of RNA data"

    output = {}

    check_reads(job_inputs['reads'])

    # Convert reads tables to FASTQ/FASTA files
    left_reads = []
    right_reads = []

    current_reads = 0
    for reads in job_inputs['reads']:
        print "Converting reads table " + str(reads['$dnanexus_link'])
        left, right = dump_fastqa(reads['$dnanexus_link'],
                                  "reads_" + str(current_reads))

        left_reads.append(left)
        if right is not None:
            right_reads.append(right)

        current_reads += 1

    # Convert Genes Object to GFF file

    run_shell("dx-genes-to-gtf --output genes.gtf " +
              job_inputs['gene_model']['$dnanexus_link'])

    # Create or download indexed genome
    genome = dxpy.DXRecord(job_inputs['reference'])

    if 'indexed_reference' not in job_inputs:
        output['indexed_reference'] = dxpy.dxlink(
            make_indexed_reference(genome.get_id()))
    else:
        output['indexed_reference'] = job_inputs['indexed_reference']
        indexed_genome = dxpy.DXRecord(job_inputs['indexed_reference'])
        dxpy.download_dxfile(indexed_genome.get_details()['index_archive'],
                             "reference.tar.xz")
        run_shell("tar -xJf reference.tar.xz")

    # call tophat
    num_cpus = multiprocessing.cpu_count()

    cmd = " ".join([
        'tophat', "-p",
        str(num_cpus), job_inputs['tophat_options'], "-G genes.gtf",
        "--transcriptome-index=./genes", "-T", "indexed_ref", " ",
        ",".join(left_reads)
    ])

    if len(right_reads) != 0:
        cmd += " " + ",".join(right_reads)

    # Invoke tophat2 with FASTQ/A file(s) and indexed reference
    try:
        run_shell(cmd)
    except:
        raise dxpy.AppError(
            "Error while running Tophat.  This could be caused by an incompatible gene model and reference or incorrect optional parameters.  Please check that these are all correct"
        )

    # upload and import the BAM as a Mappings table
    accepted_hits_file = dxpy.upload_local_file('tophat_out/accepted_hits.bam',
                                                wait_on_close=True)
    name = job_inputs['output_name']
    name += "_mappings"
    sam_importer = dxpy.DXApp(name="sam_importer")
    print "Importing BAM output of Tophat"
    import_job = sam_importer.run({
        "file":
        dxpy.dxlink(accepted_hits_file.get_id()),
        "reference_genome":
        dxpy.dxlink(genome.get_id()),
        "name":
        name
    })

    cuff_cmd = " ".join(
        ['cufflinks', '-p',
         str(num_cpus), '-G genes.gtf', '-o cuff'])

    if 'cufflinks_options' in job_inputs:
        cuff_cmd += " " + job_inputs['cufflinks_options']

    cuff_cmd += " tophat_out/accepted_hits.bam"

    # now with mapped reads in hand we can run cufflinks
    try:
        run_shell(cuff_cmd)
    except:
        raise dxpy.AppError(
            "Error while running Cufflinks.  Please check that your parameters are valid"
        )

    print "Packing, uploading, and parsing cufflinks output"
    # package cufflinks output
    run_shell("tar -czf cufflinks_output.tar.gz cuff/")
    cuff_name = job_inputs['output_name'] + "_cufflinks_output.tar.gz"
    orig_trans_file = dxpy.upload_local_file("cufflinks_output.tar.gz")
    orig_trans_file.rename(cuff_name)
    transcripts_table = upload_transcripts_file('cuff/genes.fpkm_tracking',
                                                job_inputs['output_name'])

    # job-based object reference: resolves to the "mappings" output of the import job
    output['mappings'] = {"job": import_job.get_id(), "field": "mappings"}
    output['transcripts'] = dxpy.dxlink(transcripts_table.get_id())
    output['cufflinks_output'] = dxpy.dxlink(orig_trans_file.get_id())

    print "DONE!"

    return output
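
For reference, a sketch of the job_inputs shape this entry point reads, using placeholder DNAnexus ids (the link values and option strings are assumptions, not taken from a real project):

# Illustrative only: example inputs for main(), with placeholder object ids.
example_inputs = {
    "reads": [{"$dnanexus_link": "gtable-xxxxxxxxxxxxxxxxxxxxxxxx"}],    # reads table(s)
    "reference": {"$dnanexus_link": "record-xxxxxxxxxxxxxxxxxxxxxxxx"},  # ContigSet record
    "gene_model": {"$dnanexus_link": "gtable-xxxxxxxxxxxxxxxxxxxxxxxx"}, # gene model object
    "output_name": "sample1",
    "tophat_options": "--no-novel-juncs",   # placeholder extra tophat flags
}
# main(**example_inputs)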