def test_basic_ops(self):
    """Empty-path handling and ':' resolution basics for resolve_existing_path."""
    from dxpy.utils.resolver import resolve_existing_path, ResolutionError

    # An empty string is tolerated by default...
    resolve_existing_path('')
    # ...but rejected when allow_empty_string is turned off.
    with self.assertRaises(ResolutionError):
        resolve_existing_path('', allow_empty_string=False)
    # A bare ':' resolves to the current workspace.
    project, _path, _entity = resolve_existing_path(':')
    self.assertEqual(project, dxpy.WORKSPACE_ID)
def add(self, input_name, input_value):
    """Add one named input, coercing the raw string value to a usable form.

    ``input_name`` may carry an explicit class as "name:class".  When no
    class is known, the value is interpreted, in order, as a job-based
    object reference ("job-xxxx:field"), as JSON, or as a data object
    path to resolve into a $dnanexus_link; otherwise it is kept as a
    plain string.
    NOTE(review): uses ``long``, so this code path is Python 2 only.
    """
    # Honor the configured input-name prefix; silently drop non-matching names.
    if self.input_name_prefix is not None:
        if input_name.startswith(self.input_name_prefix):
            input_name = input_name[len(self.input_name_prefix):]
        else: # Skip inputs that don't start with prefix
            return
    # "name:class" syntax lets the caller state the class explicitly.
    if ':' in input_name:
        input_class = input_name[input_name.find(':') + 1:]
        input_name = input_name[:input_name.find(':')]
    else:
        input_class = None
    # A known input spec overrides any caller-supplied class.
    if self.input_spec is not None:
        if input_name not in self.input_spec:
            raise Exception('Input field called ' + input_name + ' was not found in the input spec')
        input_class = self.input_spec[input_name]['class']
    if input_class is None:
        # Unknown class: guess the value's type, best-effort.
        done = False
        try:
            # Resolve "job-xxxx:output-name" syntax into a canonical job ref
            job_id, field = split_unescaped(':', input_value)
            if is_job_id(job_id) or is_localjob_id(job_id):
                input_value = {"job": job_id, "field": field}
                done = True
        except:
            pass
        if not done:
            try:
                # Try JSON next; only containers and numbers are accepted
                # (a bare JSON string stays a plain string).
                parsed_input_value = json.loads(input_value, object_pairs_hook=collections.OrderedDict)
                if type(parsed_input_value) in (collections.OrderedDict, list, int, long, float):
                    input_value = parsed_input_value
                else:
                    raise Exception()
            except:
                # Not recognized JSON (list or dict), so resolve it as a name
                try:
                    project, folderpath, entity_result = resolve_existing_path(input_value, expected='entity')
                except:
                    # If not possible, then leave it as a string
                    project, folderpath, entity_result = None, None, None
                if entity_result is not None:
                    # Hash IDs get a bare link; paths get a project-qualified link.
                    if is_hashid(input_value):
                        input_value = {'$dnanexus_link': entity_result['id']}
                    else:
                        input_value = {"$dnanexus_link": {"project": entity_result['describe']['project'], "id": entity_result['id']}}
        # presumably self.inputs maps name -> list (append always used here) — TODO confirm
        self.inputs[input_name].append(input_value)
    else:
        # Input class is known. Respect the "array" class.
        input_value = parse_input_or_jbor(input_class, input_value)
        if input_class.startswith('array:'):
            self.inputs[input_name].append(input_value)
        else:
            self.inputs[input_name] = input_value
def main():
    """Build a manifest for one project folder and write it to disk."""
    cli = argparse.ArgumentParser(description='Create a manifest file for a particular folder in a project')
    cli.add_argument('folder', help='a folder in the current DNAnexus project')
    cli.add_argument('-o', '--output_file', help='Name of the output file', default='manifest.json.bz2')
    cli.add_argument('-r', '--recursive', help='Recursively traverse folders and append to manifest',
                     action='store_true', default=False)
    opts = cli.parse_args()

    project, folder, _ = resolve_existing_path(opts.folder)
    describe_fields = {'id': True, 'name': True, 'folder': True, 'parts': True,
                       'state': True, 'archivalState': True}
    found = dxpy.find_data_objects(classname='file', first_page_size=1000, state='closed',
                                   describe={'fields': describe_fields},
                                   project=project, folder=folder, recurse=opts.recursive)

    entries = []
    manifest = {project: entries}
    for count, result in enumerate(found):
        entries.append(fileID2manifest(result['describe'], project))
        if count % 1000 == 0 and count != 0:
            print("Processed {} files".format(count))

    # Dedup
    # Duplicate filenames are converted to filename_fileid
    name_counts = collections.Counter(entry['name'] for entry in entries)
    duplicated = {name for name, seen in name_counts.items() if seen > 1}
    for entry in entries:
        if entry['name'] in duplicated:
            base, ext = os.path.splitext(entry['name'])
            entry['name'] = base + "_" + entry['id'] + ext

    write_manifest_to_file(opts.output_file, manifest)
    print("Manifest file written to {}".format(opts.output_file))
    print("Total {} objects".format(len(manifest[project])))
def parse_obj(string, klass):
    """Resolve *string* to a data object of class *klass* and return it as a
    $dnanexus_link dict.

    Raises ValueError for an empty input and TypeError when the string does
    not resolve or resolves to an object of a different class.
    """
    if string == '':
        raise ValueError('Error: Nonempty string cannot be resolved')
    project, path, entity_result = resolve_existing_path(string)
    if entity_result is None:
        raise TypeError('Could not resolve \"' + string + '\" to a name or ID')
    found_class = entity_result['describe']['class']
    if found_class != klass:
        raise TypeError('Error: The given object is of class ' + found_class +
                        ' but an object of class ' + klass + ' was expected.')
    # Hash IDs yield a bare link; names carry the project qualifier too.
    if is_hashid(string):
        return {'$dnanexus_link': entity_result['id']}
    return {'$dnanexus_link': {"project": entity_result['describe']['project'],
                               "id": entity_result['id']}}
def main(**kwargs):
    """Export a DNAnexus GTable to delimited text (CSV or TSV).

    Parses sys.argv when called with no kwargs; otherwise kwargs is fed
    straight to parser.parse_args (assumed argv-style — TODO confirm).
    NOTE(review): uses ``unicode`` and binary-mode csv writing, so this
    code path is Python 2 only.
    """
    if len(kwargs) == 0:
        args = parser.parse_args(sys.argv[1:])
    else:
        args = parser.parse_args(kwargs)

    # Attempt to resolve name
    try:
        project, folderpath, entity_result = resolve_existing_path(args.path, expected='entity')
    except ResolutionError as details:
        parser.exit(1, fill(unicode(details)) + '\n')
    if entity_result is None:
        parser.exit(1, fill('Could not resolve ' + args.path + ' to a data object') + '\n')

    # Default the output filename to the object's name, with '/' escaped
    # so it cannot introduce extra path components.
    filename = args.output
    if filename is None:
        filename = entity_result['describe']['name'].replace('/', '%2F')

    dxtable = dxpy.get_handler(entity_result['id'])

    delimiter = ',' if args.csv else '\t'
    if args.output == '-':
        # '-' streams to stdout instead of a file.
        writer = csv.writer(sys.stdout, delimiter=delimiter)
    else:
        # Only auto-append an extension when the filename was derived,
        # not when the user supplied an output name explicitly.
        if args.output is None and not args.no_ext:
            filename += '.csv' if args.csv else '.tsv'
        if not args.overwrite and os.path.exists(filename):
            parser.exit(1, fill('Error: path \"' + filename + '\" already exists but -f/--overwrite was not set') + '\n')
        writer = csv.writer(open(filename, 'wb'), delimiter=delimiter)
    if not args.no_header:
        # Header row: optional __id__ column, then "name:type" per column.
        writer.writerow((['__id__:int'] if args.rowid else []) +
                        [(col['name'] + ':' + col['type']) for col in dxtable.describe()['columns']])

    # Query stuff
    if args.gri is not None:
        try:
            lo = int(args.gri[1])
            hi = int(args.gri[2])
        except:
            parser.exit(1, fill('Error: the LO and HI arguments to --gri must be integers') + '\n')
        gri_query = dxpy.DXGTable.genomic_range_query(args.gri[0], lo, hi, args.gri_mode, args.gri_name)
        iterator = dxtable.iterate_query_rows(query=gri_query, limit=args.limit)
    else:
        iterator = dxtable.iterate_rows(start=args.starting,
                                        end=(None if args.limit is None else args.starting + args.limit))
    # Element 0 of each row is the row id; skipped unless --rowid was given.
    for row in iterator:
        writer.writerow([unicode(item).encode('utf-8') for item in row[0 if args.rowid else 1:]])
def parse_obj(string, klass):
    """Turn a nonempty object path/ID into a $dnanexus_link of class *klass*.

    ValueError on empty input; TypeError on failed resolution or a class
    mismatch.
    """
    if string == '':
        raise ValueError('Error: Nonempty string cannot be resolved')

    project, path, entity_result = resolve_existing_path(string)
    if entity_result is None:
        raise TypeError('Could not resolve \"' + string + '\" to a name or ID')

    desc = entity_result['describe']
    if not desc['class'] == klass:
        raise TypeError('Error: The given object is of class ' + desc['class'] +
                        ' but an object of class ' + klass + ' was expected.')

    if is_hashid(string):
        # A raw hash ID needs no project qualifier.
        link = {'$dnanexus_link': entity_result['id']}
    else:
        link = {
            '$dnanexus_link': {
                "project": desc['project'],
                "id": entity_result['id']
            }
        }
    return link
def main():
    """Build and write a bzip2-compressed JSON manifest for one folder."""
    arg_parser = argparse.ArgumentParser(
        description='Create a manifest file for a particular folder in a project')
    arg_parser.add_argument('folder', help='a folder in the current DNAnexus project')
    arg_parser.add_argument('--outfile', help='Name of the output file',
                            default='manifest.json.bz2')
    arg_parser.add_argument('-r', '--recursive',
                            help='Recursively traverse folders and append to manifest',
                            action='store_true')
    opts = arg_parser.parse_args()

    project, folder, _ = resolve_existing_path(opts.folder)
    found = dxpy.find_data_objects(
        classname='file', first_page_size=1000,
        describe={'id': True, 'name': True, 'folder': True, 'parts': True},
        project=project, folder=folder, recurse=opts.recursive)

    entries = []
    manifest = {project: entries}
    for count, result in enumerate(found):
        entries.append(fileID2manifest(result['describe'], project))
        # Progress marker every 1000 files (skip the very first item).
        if count % 1000 == 0 and count != 0:
            print("Processed {} files".format(count))

    with open(opts.outfile, "w") as out:
        out.write(bz2.compress(json.dumps(manifest, indent=2, sort_keys=True)))

    print("Manifest file written to {}".format(opts.outfile))
    print("Total {} objects".format(len(manifest[project])))
def _parse_cli_bool(text):
    """Parse a boolean command-line value: true/false (any case) or 1/0."""
    lowered = text.strip().lower()
    if lowered in ("true", "1"):
        return True
    if lowered in ("false", "0"):
        return False
    raise argparse.ArgumentTypeError("expected a boolean, got {v!r}".format(v=text))

def parse_args_as_job_input(args, app_spec):
    """Convert ``--<name> <value>`` command-line arguments into a job input dict.

    One argparse option is declared per entry of app_spec["inputSpec"]:
    int/float/boolean inputs are converted to the matching Python type,
    strings pass through, and any other class is parsed as JSON.  A
    non-JSON value for a JSON-classed input is resolved as a platform
    data object and stored as a DNAnexus link.

    :param args: argv-style list of strings to parse
    :param app_spec: executable description containing "inputSpec"
    :returns: dict mapping input name to its parsed value
    :raises ValueError: when a value cannot be resolved to exactly one object

    Fixes over the previous version: the leftover debug ``print`` is
    removed, ``!= None`` is ``is not None``, and boolean inputs are parsed
    explicitly (argparse's ``type=bool`` treated the string "false" as True).
    """
    parser = argparse.ArgumentParser()
    json_inputs = set()
    for ispec in app_spec.get("inputSpec", []):
        kwargs = {}
        if ispec.get("type") == "int":
            kwargs["type"] = int
        elif ispec.get("type") == "float":
            kwargs["type"] = float
        elif ispec.get("type") == "boolean":
            # bool("false") is True, so parse the text ourselves.
            kwargs["type"] = _parse_cli_bool
        elif ispec.get("type") != "string":
            json_inputs.add(ispec["name"])
        if ispec.get("optional") is not None:
            kwargs["required"] = not ispec["optional"]
        parser.add_argument("--" + ispec["name"], **kwargs)

    inputs = {}
    for name, value in vars(parser.parse_args(args)).items():
        if value is None:
            # Optional input that was not supplied.
            continue
        if name in json_inputs:
            try:
                inputs[name] = json.loads(value)
            except ValueError:
                # Not JSON: treat the value as a data object path/ID and
                # store it as a DNAnexus link.
                from dxpy.utils.resolver import resolve_existing_path
                project, path, results = resolve_existing_path(
                    value, ask_to_resolve=False, describe={'id': True}, allow_mult=False)
                if results is None or len(results) != 1:
                    raise ValueError(
                        "Value {v} could not be resolved".format(v=value))
                inputs[name] = dxpy.dxlink(results[0]['id'], project_id=project)
        else:
            inputs[name] = value
    return inputs
def main():
    """CLI entry point: write a manifest for a DNAnexus directory."""
    cli = argparse.ArgumentParser(
        description='Create a manifest file from a DNAnexus directory')
    cli.add_argument('directory')
    cli.add_argument('-r', '--recursive',
                     help='Recursively traverse folders and append to manifest',
                     action='store_true')
    cli.add_argument('--outfile', help='Name of the output file',
                     default='manifest.json.bz2')
    opts = cli.parse_args()

    project, folder, _ = resolve_existing_path(opts.directory)
    generate_manifest_file(folder, project, opts.outfile, opts.recursive)
    print("Manifest file written to {}".format(opts.outfile))
def parse_args_as_job_input(args, app_spec):
    """Translate ``--<name> <value>`` arguments into a job input mapping.

    Declares one argparse option per inputSpec entry; int/float/boolean
    inputs get a type converter, strings pass through, and every other
    class is parsed as JSON (falling back to resolving the value as a
    data object path).
    """
    type_map = {"int": int, "float": float, "boolean": bool}
    parser = argparse.ArgumentParser()
    json_inputs = set()
    for spec_entry in app_spec.get("inputSpec", []):
        arg_kwargs = {}
        declared = spec_entry.get("type")
        if declared in type_map:
            arg_kwargs["type"] = type_map[declared]
        elif declared != "string":
            # Anything not int/float/boolean/string is JSON-classed.
            json_inputs.add(spec_entry["name"])
        if spec_entry.get("optional") != None:
            arg_kwargs["required"] = not spec_entry["optional"]
        parser.add_argument("--" + spec_entry["name"], **arg_kwargs)

    inputs = {}
    for arg_name, arg_value in vars(parser.parse_args(args)).items():
        if arg_value is None:
            continue
        if arg_name not in json_inputs:
            inputs[arg_name] = arg_value
            continue
        try:
            inputs[arg_name] = json.loads(arg_value)
        except ValueError:
            # Not JSON: resolve as a platform data object and link it.
            from dxpy.utils.resolver import resolve_existing_path
            project, path, results = resolve_existing_path(
                arg_value, ask_to_resolve=False, describe={"id": True}, allow_mult=False
            )
            print(project, path, results)
            if results is None or len(results) != 1:
                raise ValueError("Value {v} could not be resolved".format(v=arg_value))
            inputs[arg_name] = dxpy.dxlink(results[0]["id"], project_id=project)
    return inputs
def main(**kwargs):
    """Export a DNAnexus variants GTable to a VCF 4.1 file.

    Accepts parsed options via **kwargs, or parses sys.argv when called
    with none.  NOTE(review): relies on ``unicode``, ``dict.iteritems``
    and ``sys.maxint``, so this code path is Python 2 only.
    """
    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args(sys.argv[1:]))

    # Attempt to resolve variants gtable name
    try:
        project, folderpath, entity_result = resolve_existing_path(
            kwargs['path'], expected='entity')
    except ResolutionError as details:
        parser.exit(1, fill(unicode(details)) + '\n')
    if entity_result is None:
        parser.exit(
            1,
            fill('Could not resolve ' + kwargs['path'] + ' to a data object') +
            '\n')

    # Output defaults to "<object name>.vcf"; '-' means stdout.
    filename = kwargs['output']
    if filename is None:
        filename = entity_result['describe']['name'].replace('/', '%2F') + ".vcf"
    if kwargs['output'] == '-':
        outputFile = sys.stdout
    else:
        outputFile = open(filename, 'w')
    exportRef = kwargs['export_ref_calls']
    exportNoCall = kwargs['export_no_calls']

    variantsTable = dxpy.open_dxgtable(entity_result['id'])

    # The reference genome must be attached as the 'original_contigset' detail.
    try:
        originalContigSet = variantsTable.get_details()['original_contigset']
    except:
        raise dxpy.AppError(
            "The original reference genome must be attached as a detail")
    contigDetails = dxpy.DXRecord(originalContigSet).get_details()

    # Use the user-supplied flat reference file, or download it from the
    # contigset into a temp file (delete=False: it is re-read below).
    if kwargs['reference'] is not None:
        refFileName = kwargs['reference']
        if not os.path.isfile(refFileName):
            raise dxpy.AppError(
                "The reference expected by the variants to vcf script was not a valid file"
            )
    else:
        refFileName = tempfile.NamedTemporaryFile(prefix='reference_',
                                                  suffix='.txt',
                                                  delete=False).name
        dxpy.download_dxfile(
            contigDetails['flat_sequence_file']['$dnanexus_link'], refFileName)

    if kwargs['write_header']:
        # Header metadata (INFO/FORMAT/ALT/FILTER definitions and sample
        # names) lives in the gtable's details.
        infos = variantsTable.get_details().get('infos')
        formats = variantsTable.get_details().get('formats')
        alts = variantsTable.get_details().get('alts')
        filters = variantsTable.get_details().get('filters')
        samples = variantsTable.get_details().get('samples')
        outputFile.write("##fileformat=VCFv4.1\n")
        if infos is not None:
            # Sort keys so header lines come out in deterministic order.
            for k, v in collections.OrderedDict(sorted(
                    infos.iteritems())).iteritems():
                outputFile.write("##INFO=<ID=" + k + ",Number=" + v['number'] +
                                 ",Type=" + v['type'] + ",Description=\"" +
                                 v['description'] + "\">\n")
        # Genotype columns exist only when there are samples; declare the
        # fixed GT/AD/DP FORMAT fields in that case.
        # NOTE(review): len(samples) would raise if 'samples' is absent — confirm.
        if len(samples) > 0:
            outputFile.write(
                "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
            )
            outputFile.write(
                "##FORMAT=<ID=AD,Number=.,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed\">\n"
            )
            outputFile.write(
                "##FORMAT=<ID=DP,Number=1,Type=String,Description=\"Approximate read depth (reads with MQ=255 or with bad mates are filtered)\">\n"
            )
        if formats is not None:
            for k, v in collections.OrderedDict(sorted(
                    formats.iteritems())).iteritems():
                outputFile.write("##FORMAT=<ID=" + k + ",Number=" + v['number'] +
                                 ",Type=" + v['type'] + ",Description=\"" +
                                 v['description'] + "\">\n")
        if alts is not None:
            for k, v in collections.OrderedDict(sorted(
                    alts.iteritems())).iteritems():
                outputFile.write("##ALT=<ID=" + k + ",Description=\"" +
                                 v['description'] + "\">\n")
        if filters is not None:
            for k, v in collections.OrderedDict(sorted(
                    filters.iteritems())).iteritems():
                outputFile.write("##FILTER=<ID=" + k + ",Description=\"" + v +
                                 "\">\n")
        # One ##contig line per contig, with its length.
        for i in range(len(contigDetails['contigs']['names'])):
            outputFile.write("##contig=<ID=" +
                             contigDetails['contigs']['names'][i] + ",length=" +
                             str(contigDetails['contigs']['sizes'][i]) + ">\n")
        outputFile.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO")
        if len(samples) > 0:
            outputFile.write("\tFORMAT")
            for x in samples:
                outputFile.write("\t" + x)
        outputFile.write("\n")

    # Map contig name -> offset of that contig in the flat sequence file.
    chromosomeOffsets = {}
    for i in range(len(contigDetails['contigs']['names'])):
        chromosomeOffsets[contigDetails['contigs']['names']
                          [i]] = contigDetails['contigs']['offsets'][i]

    # NOTE(review): the whole flat reference is read into memory and the
    # handle is never explicitly closed.
    contigSequence = open(refFileName, 'r').read()

    # Map column name -> index within a row (+1 skips the leading row id).
    col = {}
    names = variantsTable.get_col_names()
    for i in range(len(names)):
        col[names[i]] = i + 1
    col = collections.OrderedDict(sorted(col.items()))

    # Restrict to the requested chromosomes while keeping contig order.
    chromosomeList = contigDetails['contigs']['names']
    if kwargs['chr'] is not None:
        intersection = []
        for x in chromosomeList:
            if x in kwargs['chr']:
                intersection.append(x)
        chromosomeList = intersection[:]

    for chromosome in chromosomeList:
        # Rows sharing a genomic position are buffered together; the
        # buffer is flushed whenever the position advances.
        buff = []
        lastPosition = -1
        query = variantsTable.genomic_range_query(chr=chromosome,
                                                  lo=0,
                                                  hi=sys.maxint)
        # get_rows(limit=1) locates the first row of this chromosome;
        # iteration then proceeds linearly from that row index.
        for row in variantsTable.get_rows(query=query, limit=1)['data']:
            startRow = row[0]
            for row in variantsTable.iterate_rows(start=startRow):
                if row[1] != chromosome:
                    break
                if lastPosition < row[col["lo"]]:
                    writeBuffer(buff, col, outputFile, contigSequence,
                                chromosomeOffsets, exportRef, exportNoCall)
                    buff = []
                buff.append(row)
                lastPosition = row[col["lo"]]
        # Flush whatever remains for this chromosome.
        writeBuffer(buff, col, outputFile, contigSequence, chromosomeOffsets,
                    exportRef, exportNoCall)
        buff = []
def add(self, input_name, input_value):
    """Add one named input, coercing the raw string value to a usable form.

    ``input_name`` may carry an explicit class as "name:class".  When no
    class is known, the value is interpreted, in order, as a job-based
    object reference ("job-xxxx:field"), as JSON, or as a data object
    path resolved into a $dnanexus_link; otherwise it stays a string.
    NOTE(review): ``long`` and ``basestring`` make this Python 2 only.
    """
    # Honor the configured input-name prefix; silently drop non-matching names.
    if self.input_name_prefix is not None:
        if input_name.startswith(self.input_name_prefix):
            input_name = input_name[len(self.input_name_prefix):]
        else:  # Skip inputs that don't start with prefix
            return
    # "name:class" syntax lets the caller state the class explicitly.
    if ':' in input_name:
        input_class = input_name[input_name.find(':') + 1:]
        input_name = input_name[:input_name.find(':')]
    else:
        input_class = None
    if self.input_spec is not None:
        # Workflows are exempt from strict spec checking; all other
        # executable classes must declare every supplied input.
        if input_name not in self.input_spec and self._desc.get(
                'class') != 'workflow':
            raise Exception('Input field called ' + input_name +
                            ' was not found in the input spec')
        elif input_name in self.input_spec:
            input_class = self.input_spec[input_name]['class']
    if input_class is None:
        # Unknown class: guess the value's type, best-effort.
        done = False
        try:
            # Resolve "job-xxxx:output-name" syntax into a canonical job ref
            job_id, field = split_unescaped(':', input_value)
            if is_job_id(job_id) or is_localjob_id(job_id):
                input_value = {"job": job_id, "field": field}
                done = True
        except:
            pass
        if not done:
            try:
                # Try JSON next; only containers and numbers are accepted
                # (a bare JSON string stays a plain string).
                parsed_input_value = json.loads(
                    input_value, object_pairs_hook=collections.OrderedDict)
                if type(parsed_input_value) in (collections.OrderedDict, list,
                                                int, long, float):
                    input_value = parsed_input_value
                else:
                    raise Exception()
            except:
                # Not recognized JSON (list or dict), so resolve it as a name
                try:
                    project, folderpath, entity_result = resolve_existing_path(
                        input_value, expected='entity')
                except:
                    # If not possible, then leave it as a string
                    project, folderpath, entity_result = None, None, None
                if entity_result is not None:
                    # Hash IDs get a bare link; paths get a project-qualified one.
                    if is_hashid(input_value):
                        input_value = {
                            '$dnanexus_link': entity_result['id']
                        }
                    else:
                        input_value = {
                            "$dnanexus_link": {
                                "project": entity_result['describe']['project'],
                                "id": entity_result['id']
                            }
                        }
        # Append when the slot currently holds a (non-string) list,
        # otherwise overwrite the single value.
        if isinstance(self.inputs[input_name], list) and \
           not isinstance(self.inputs[input_name], basestring):
            self.inputs[input_name].append(input_value)
        else:
            self.inputs[input_name] = input_value
    else:
        # Input class is known. Respect the "array" class.
        input_value = parse_input_or_jbor(input_class, input_value)
        if input_class.startswith('array:'):
            self.inputs[input_name].append(input_value)
        else:
            self.inputs[input_name] = input_value
def main(**kwargs):
    """Export a DNAnexus variants GTable to a VCF 4.1 file.

    Takes parsed options via **kwargs, or parses sys.argv when called with
    none.  NOTE(review): uses ``unicode``, ``dict.iteritems`` and
    ``sys.maxint`` — Python 2 only.
    """
    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args(sys.argv[1:]))

    # Attempt to resolve variants gtable name
    try:
        project, folderpath, entity_result = resolve_existing_path(kwargs['path'], expected='entity')
    except ResolutionError as details:
        parser.exit(1, fill(unicode(details)) + '\n')
    if entity_result is None:
        parser.exit(1, fill('Could not resolve ' + kwargs['path'] + ' to a data object') + '\n')

    # Output defaults to "<object name>.vcf"; '-' means stdout.
    filename = kwargs['output']
    if filename is None:
        filename = entity_result['describe']['name'].replace('/', '%2F') + ".vcf"
    if kwargs['output'] == '-':
        outputFile = sys.stdout
    else:
        outputFile = open(filename, 'w')
    exportRef = kwargs['export_ref_calls']
    exportNoCall = kwargs['export_no_calls']

    variantsTable = dxpy.open_dxgtable(entity_result['id'])

    # The reference genome must be attached as the 'original_contigset' detail.
    try:
        originalContigSet = variantsTable.get_details()['original_contigset']
    except:
        raise dxpy.AppError("The original reference genome must be attached as a detail")
    contigDetails = dxpy.DXRecord(originalContigSet).get_details()

    # Use the user-supplied flat reference file, or download it from the
    # contigset into a temp file (delete=False: it is re-read below).
    if kwargs['reference'] is not None:
        refFileName = kwargs['reference']
        if not os.path.isfile(refFileName):
            raise dxpy.AppError("The reference expected by the variants to vcf script was not a valid file")
    else:
        refFileName = tempfile.NamedTemporaryFile(prefix='reference_', suffix='.txt', delete=False).name
        dxpy.download_dxfile(contigDetails['flat_sequence_file']['$dnanexus_link'], refFileName)

    if kwargs['write_header']:
        # Header metadata (INFO/FORMAT/ALT/FILTER definitions and sample
        # names) lives in the gtable's details.
        infos = variantsTable.get_details().get('infos')
        formats = variantsTable.get_details().get('formats')
        alts = variantsTable.get_details().get('alts')
        filters = variantsTable.get_details().get('filters')
        samples = variantsTable.get_details().get('samples')
        outputFile.write("##fileformat=VCFv4.1\n")
        if infos is not None:
            # Sort keys for a deterministic header order.
            for k, v in collections.OrderedDict(sorted(infos.iteritems())).iteritems():
                outputFile.write("##INFO=<ID="+k+",Number="+v['number']+",Type="+v['type']+",Description=\""+v['description']+"\">\n")
        # Genotype columns exist only with samples; declare the fixed
        # GT/AD/DP FORMAT fields in that case.
        # NOTE(review): len(samples) would raise if 'samples' is absent — confirm.
        if len(samples) > 0:
            outputFile.write("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
            outputFile.write("##FORMAT=<ID=AD,Number=.,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed\">\n")
            outputFile.write("##FORMAT=<ID=DP,Number=1,Type=String,Description=\"Approximate read depth (reads with MQ=255 or with bad mates are filtered)\">\n")
        if formats is not None:
            for k, v in collections.OrderedDict(sorted(formats.iteritems())).iteritems():
                outputFile.write("##FORMAT=<ID="+k+",Number="+v['number']+",Type="+v['type']+",Description=\""+v['description']+"\">\n")
        if alts is not None:
            for k, v in collections.OrderedDict(sorted(alts.iteritems())).iteritems():
                outputFile.write("##ALT=<ID="+k+",Description=\""+v['description']+"\">\n")
        if filters is not None:
            for k, v in collections.OrderedDict(sorted(filters.iteritems())).iteritems():
                outputFile.write("##FILTER=<ID="+k+",Description=\""+v+"\">\n")
        # One ##contig line per contig, with its length.
        for i in range(len(contigDetails['contigs']['names'])):
            outputFile.write("##contig=<ID="+contigDetails['contigs']['names'][i]+",length="+str(contigDetails['contigs']['sizes'][i])+">\n")
        outputFile.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO")
        if len(samples) > 0:
            outputFile.write("\tFORMAT")
            for x in samples:
                outputFile.write("\t"+x)
        outputFile.write("\n")

    # Map contig name -> offset of that contig in the flat sequence file.
    chromosomeOffsets = {}
    for i in range(len(contigDetails['contigs']['names'])):
        chromosomeOffsets[contigDetails['contigs']['names'][i]] = contigDetails['contigs']['offsets'][i]

    # NOTE(review): whole flat reference read into memory; handle never closed.
    contigSequence = open(refFileName,'r').read()

    # Map column name -> index within a row (+1 skips the leading row id).
    col = {}
    names = variantsTable.get_col_names()
    for i in range(len(names)):
        col[names[i]] = i+1
    col = collections.OrderedDict(sorted(col.items()))

    # Restrict to the requested chromosomes while keeping contig order.
    chromosomeList = contigDetails['contigs']['names']
    if kwargs['chr'] is not None:
        intersection = []
        for x in chromosomeList:
            if x in kwargs['chr']:
                intersection.append(x)
        chromosomeList = intersection[:]

    for chromosome in chromosomeList:
        # Rows sharing a genomic position are buffered together; the
        # buffer is flushed whenever the position advances.
        buff = []
        lastPosition = -1
        query = variantsTable.genomic_range_query(chr=chromosome,
                                                  lo=0,
                                                  hi=sys.maxint)
        # get_rows(limit=1) locates the first row of this chromosome;
        # iteration then proceeds linearly from that row index.
        for row in variantsTable.get_rows(query=query, limit=1)['data']:
            startRow = row[0]
            for row in variantsTable.iterate_rows(start=startRow):
                if row[1] != chromosome:
                    break
                if lastPosition < row[col["lo"]]:
                    writeBuffer(buff, col, outputFile, contigSequence, chromosomeOffsets, exportRef, exportNoCall)
                    buff = []
                buff.append(row)
                lastPosition = row[col["lo"]]
        # Flush whatever remains for this chromosome.
        writeBuffer(buff, col, outputFile, contigSequence, chromosomeOffsets, exportRef, exportNoCall)
        buff = []