def maf2mvf(args):
    """Convert a multiple-alignment (MAF) input into an MVF output file.

    Args:
        args: parsed command-line namespace. This function reads
            ``out``, ``overwrite``, ``sample_tags`` (comma-separated; only
            the text before the first ':' of each tag is kept as the sample
            label), ``ref_tag``, ``line_buffer``, ``command_string`` and the
            ``qprint`` progress-message callable. The whole namespace is
            also passed to ``MultiAlignFile``.

    Returns:
        An empty string.

    Raises:
        IndexError: if ``args.ref_tag`` is not one of the sample labels.
    """
    # ESTABLISH MAF
    args.qprint("Starting ConvertMAF2MVF")
    maf = MultiAlignFile(args)
    args.qprint("MAF Established")
    # ESTABLISH MVF
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    args.qprint("MVF output initialized")
    # PROCESS SAMPLE INFO
    contig_translate = {1: 1}
    samplelabels = [s.split(':')[0] for s in args.sample_tags.split(',')]
    args.qprint("Sample tags processed: {}".format(samplelabels))
    if args.ref_tag not in samplelabels:
        raise IndexError("--ref-tag not in the tags listed in --sample-tags")
    # The reference sample always occupies the first column.
    samplelabels.remove(args.ref_tag)
    samplelabels.insert(0, args.ref_tag)
    mvf.sample_ids = samplelabels[:]
    mvf.sample_indices = list(range(len(mvf.sample_ids)))
    for i, label in enumerate(samplelabels):
        mvf.sample_data[i] = {'id': label, 'index': i}
    mvf.reset_max_sample()
    mvf.metadata['sourceformat'] = maf.metadata['sourceformat']
    # Notes live on the file object itself (same call the other converters
    # in this file use), not on the metadata mapping.
    mvf.notes.append(args.command_string)
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    args.qprint("MAF Headers Written")
    mvfentries = []
    nentry = 0
    total_entries = 0
    args.qprint("Begin data conversion")
    for pos, length, msa in maf:
        # Samples missing from this alignment block are padded with gaps.
        for sname in samplelabels:
            if sname not in msa:
                msa[sname] = '-' * length
        msa['contig'] = 1
        for i in range(length):
            mvf_alleles = encode_mvfstring(
                ''.join(msa[s][i].strip() for s in samplelabels))
            if not mvf_alleles:
                continue
            mvfentries.append(
                (contig_translate.get(msa['contig']),
                 pos + i,
                 (mvf_alleles,)))
            nentry += 1
            if nentry == args.line_buffer:
                total_entries += nentry
                mvf.write_entries(mvfentries, encoded=True)
                args.qprint("{} entries written".format(total_entries))
                mvfentries = []
                nentry = 0
    if mvfentries:
        # Leftover entries were produced by encode_mvfstring exactly like
        # the buffered ones, so they are flushed with the same flag.
        total_entries += nentry
        mvf.write_entries(mvfentries, encoded=True)
        args.qprint("{} entries written".format(total_entries))
    args.qprint("Complete.")
    return ''
def vcf2mvf(args=None):
    """Convert a VCF input into an MVF output file.

    Args:
        args: parsed command-line namespace. This function reads
            ``field_sep``, ``vcf``, ``no_autoindex``, ``out``,
            ``overwrite``, ``command_string``, ``versionx``,
            ``contig_ids`` (iterable of ``"id;vcflabel;mvflabel"``
            strings), ``skip_contig_label_check``, ``ref_label``,
            ``alleles_from``, ``sample_replace``, ``filter_nonref_empty``,
            ``out_flavor``, ``line_buffer`` and the ``qprint``
            progress-message callable. It also sets ``args.fieldsep`` and
            replaces ``args.alleles_from`` with its ':'-split list. The
            namespace is passed on to ``vcf.iterentries``.

    Returns:
        An empty string.

    Raises:
        KeyError: if ``args.field_sep`` is not one of the known separator
            names.
        RuntimeError: if a contig label given in ``args.contig_ids`` is not
            in the VCF header, if a contig id or label is not unique, or if
            the id of one contig equals the label of another.
    """
    sepchars = {
        "TAB": "\t",
        "SPACE": " ",
        "DBLSPACE": "  ",
        "COMMA": ",",
        "MIXED": None,
    }
    args.fieldsep = sepchars[args.field_sep]
    # ESTABLISH VCF
    args.qprint("Opening input VCF: {}".format(args.vcf))
    vcf = VariantCallFile(args.vcf, indexcontigs=(not args.no_autoindex))
    # ESTABLISH MVF
    args.qprint("Establishing output MVF: {}".format(args.out))
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    mvf.notes.append(args.command_string)
    mvf.metadata['mvfversion'] = args.versionx
    # PROCESS CONTIG INFO
    args.qprint("Processing VCF headers.")
    vcfcontigs = vcf.metadata['contigs'].copy()
    args.qprint("{} contigs found.".format(len(vcfcontigs)))
    contig_translate = {}
    if args.contig_ids:
        for cid, cvcf, cmvf in (x.split(';') for x in args.contig_ids):
            # Numeric ids are stored as int; anything else stays a string.
            try:
                cid = int(cid)
            except ValueError:
                pass
            # Explicit raise instead of assert so the check survives -O.
            if not any(vcfcontigs[x]['label'] == cvcf for x in vcfcontigs):
                raise RuntimeError(
                    'Contig label {} given in contig_ids was not found '
                    'in the VCF header'.format(cvcf))
            for vid in vcfcontigs:
                if vcfcontigs[vid]['label'] != cvcf:
                    continue
                contig_translate[cvcf] = [cid, cmvf]
                if cid in mvf.metadata['contigs']:
                    raise RuntimeError(
                        'Contig id {} is not unique'.format(cid))
                mvf.metadata['contigs'][cid] = vcfcontigs[vid].copy()
                if cmvf in mvf.get_contig_labels():
                    raise RuntimeError(
                        'Contig label {} is not unique'.format(cmvf))
                mvf.metadata['contigs'][cid]['label'] = cmvf
    mvf.reset_max_contig()
    mvf.max_contig_index -= 1
    args.qprint("Processing contigs.")
    static_contig_ids = list(mvf.get_contig_ids())
    for vcid in vcfcontigs:
        vlabel = vcfcontigs[vcid]['label']
        if vlabel in static_contig_ids:
            continue
        newindex = mvf.get_next_contig_index()
        # Integer-like or very short labels are reused as the contig id;
        # everything else gets the new index as its id.
        if is_int(vlabel) or len(vlabel) < 3:
            newid = vlabel
        else:
            newid = str(newindex)
        mvf.contig_indices.append(newindex)
        mvf.contig_ids.append(newid)
        mvf.contig_data[newindex] = vcfcontigs[vcid].copy()
        static_contig_ids.append(newid)
        contig_translate[vlabel] = [newindex, vlabel]
    mvf.reset_max_contig()
    new_contigs = [(x, mvf.contig_data[x]['label'])
                   for x in mvf.contig_indices]
    if args.skip_contig_label_check is False:
        args.qprint("Checking contigs for label/id overlap errors.")
        xids = [x[0] for x in new_contigs]
        xlabels = [x[1] for x in new_contigs]
        xintersect = set(xids).intersection(xlabels)
        if xintersect:
            for i, (newid, newlabel) in enumerate(new_contigs):
                if i % 100 == 0:
                    args.qprint("{} contigs processed".format(i))
                # A clash is only possible for values present in both
                # lists, so the set test skips the list scans otherwise.
                if newid in xintersect and (
                        newid in xlabels[:i] or newid in xlabels[i + 1:]):
                    raise RuntimeError(
                        "Error contig id {} is the same as"
                        " the label for another contig"
                        " ({})".format(newid, xlabels.index(newid)))
                if newlabel in xintersect and (
                        newlabel in xids[:i] or newlabel in xids[i + 1:]):
                    raise RuntimeError(
                        "Error contig label {} is the same"
                        " as the id for another contig"
                        " ({})".format(newlabel, xids.index(newlabel)))
    # PROCESS SAMPLE INFO
    args.qprint("Processing samples.")
    samplelabels = [args.ref_label] + vcf.metadata['samples'][:]
    if args.alleles_from:
        args.alleles_from = args.alleles_from.split(':')
        samplelabels += args.alleles_from
    if args.sample_replace:
        # Each item is "old:new"; an item without ':' maps onto itself.
        newsample = [
            x.split(':') if ':' in x else (x, x)
            for x in args.sample_replace]
        unmatched = list(enumerate(samplelabels))
        for old, new in newsample:
            # Rename the first still-unmatched label containing `old`,
            # then retire it so it cannot be matched a second time.
            for j, (i, name) in enumerate(unmatched):
                if old in name:
                    samplelabels[i] = new
                    del unmatched[j]
                    break
    mvf.sample_indices = list(range(len(samplelabels)))
    mvf.sample_ids = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.sample_data[i] = {'id': label}
    mvf.metadata['ncol'] = len(mvf.sample_ids)
    mvf.max_sample_index = len(mvf.sample_ids)
    mvf.metadata['sourceformat'] = vcf.metadata['sourceformat']
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    with_quality = args.out_flavor in ('dnaqual', )
    args.qprint("Processing VCF entries.")
    for vcfrecord in vcf.iterentries(args):
        mvfstring = ''.join(vcfrecord['genotypes'])
        if args.filter_nonref_empty is True:
            # Skip records where every column after the first holds only
            # one of the characters X, x, - or ?.
            if all(x in 'Xx-?' for x in mvfstring[1:]):
                continue
        mvf_alleles = encode_mvfstring(mvfstring)
        if not mvf_alleles:
            continue
        if with_quality:
            qual_alleles = encode_mvfstring(''.join(vcfrecord['qscores']))
            allelesets = (mvf_alleles, qual_alleles)
        else:
            allelesets = (mvf_alleles, )
        mvfentries.append(
            (contig_translate.get(vcfrecord['contig'])[0],
             vcfrecord['coord'],
             allelesets))
        nentry += 1
        if nentry == args.line_buffer:
            mvf.write_entries(mvfentries, encoded=True)
            mvfentries = []
            nentry = 0
    if mvfentries:
        # Leftover entries are encoded exactly like the buffered ones, so
        # they are flushed with the same flag.
        mvf.write_entries(mvfentries, encoded=True)
        mvfentries = []
    return ''
def filter_mvf(args):
    """Filter and/or transform the entries of an MVF file.

    Two modes are supported. When ``args.test`` is set, the single
    ``"location alleles"`` line it contains is run through the actions,
    the outcome of every action is written to stdout, and the process
    exits. Otherwise every entry of ``args.mvf`` is run through the
    actions and the surviving entries are written to ``args.out``.

    Args:
        args: parsed command-line namespace. This function reads
            ``more_help``, ``mvf``, ``out``, ``test``, ``test_nchar``,
            ``labels``, ``actions`` (list of ``"name:arg:..."`` strings,
            rewritten in place), ``verbose``, ``overwrite``,
            ``line_buffer``, ``command_string`` and the ``qprint``
            progress-message callable.

    Returns:
        An empty string (main mode only; help and test modes end with
        ``sys.exit()``).

    Raises:
        RuntimeError: if no input or output file is given outside test
            mode, or if ``args.labels`` is used without an input MVF to
            resolve the labels against.
    """
    args.qprint("Running FilterMVF")
    if args.more_help is True:
        modulehelp()
        sys.exit()
    if args.mvf is None and args.test is None:
        raise RuntimeError("No input file specified with --mvf")
    if args.out is None and args.test is None:
        raise RuntimeError("No output file specified with --out")
    # Establish Input MVF
    mvf = None
    if args.test is not None:
        ncol = args.test_nchar or len(args.test.split()[1])
    else:
        mvf = MultiVariantFile(args.mvf, 'read')
        ncol = mvf.metadata['ncol']
    args.qprint("Input MVF read with {} columns.".format(ncol))
    # Create Actionset
    collapse_actions = ('collapsepriority', 'collapsemerge')
    column_arg_actions = ('columns', 'allelegroup',
                          'notmultigroup', 'reqsample')
    if args.labels:
        if mvf is None:
            # Test mode never opens an MVF, so there is nothing to look
            # the sample labels up in.
            raise RuntimeError(
                "Sample labels cannot be resolved without an input MVF")
        # Replace sample labels in the action arguments by column indices.
        for i, action in enumerate(args.actions):
            arr = action.split(':')
            if arr[0] in collapse_actions:
                arr[1] = ','.join([
                    str(mvf.sample_id_to_index[x])
                    for x in arr[1].split(',')])
            if arr[0] in column_arg_actions:
                for j in range(1, len(arr)):
                    arr[j] = ','.join([
                        str(mvf.sample_id_to_index[x])
                        for x in arr[j].split(',')])
            args.actions[i] = ':'.join(arr)
    # Shift column indices down by the number of lower-numbered columns
    # that earlier collapse actions have removed.
    removed_columns = set()
    for i, action in enumerate(args.actions):
        arr = action.split(':')
        if arr[0] in collapse_actions:
            original_cols = arr[1]
            arr[1] = ','.join([
                str(int(x) - len([y for y in removed_columns if y < int(x)]))
                for x in arr[1].split(',')])
            # Every listed column except the first is recorded as removed.
            removed_columns.update(
                [int(x) for x in original_cols.split(',')[1:]])
        if arr[0] in column_arg_actions:
            for j in range(1, len(arr)):
                arr[j] = ','.join([
                    str(int(x) -
                        len([y for y in removed_columns if y < int(x)]))
                    for x in arr[j].split(',')])
        args.actions[i] = ':'.join(arr)
    actionset = build_actionset(args.actions, ncol)
    args.qprint("Actions established.")
    args.qprint(actionset)
    # TESTING MODE
    if args.test:
        loc, alleles = args.test.split()
        # Parsed [contig, position] form of loc, built on first use so
        # that several location actions can share it.
        loc_fields = None
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        if args.verbose:
            print(alleles)
        linetype = get_linetype(alleles)
        sys.stdout.write("MVF Encoding type '{}' detected\n".format(linetype))
        for actionname, actiontype, actionfunc, _ in actionset:
            sys.stdout.write("Applying action {} ({}): ".format(
                actionname, actiontype))
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
                    sys.stdout.write("Filter Fail\n")
                    break
                sys.stdout.write("Filter Pass\n")
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
                    sys.stdout.write("Transform removed all alleles\n")
                    break
                sys.stdout.write("Transform result {}\n".format(alleles))
            elif actiontype == 'location':
                if loc_fields is None:
                    loc_fields = loc.split(':')
                    loc_fields[1] = int(loc_fields[1])
                if actionfunc(loc_fields) is False:
                    linefail = True
                    sys.stdout.write("Location Fail\n")
                    break
                sys.stdout.write("Location Pass\n")
        if linefail is False:
            if transformed:
                if linetype == 'full':
                    alleles = encode_mvfstring(alleles)
                if alleles:
                    test_output = "{}\t{}\n".format(loc, alleles)
                    sys.stdout.write("Final output = {}\n".format(
                        test_output))
                else:
                    sys.stdout.write("Transform removed all alleles\n")
            else:
                sys.stdout.write("No changes applied\n")
                sys.stdout.write("Final output = {}\n".format(args.test))
        sys.exit()
    # MAIN MODE
    # Set up file handler
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    # NOTE(review): nothing in this function adds to removed_indices, so
    # the index shift below subtracts zero — confirm whether that is meant.
    removed_indices = set()
    # reprocess header if actions are used that filter columns
    header_actions = ('columns',) + collapse_actions
    if any(entry[0] in header_actions for entry in actionset):
        for actionname, _, _, actionarg in actionset:
            if actionname == 'columns':
                if args.labels:
                    # NOTE(review): looks up sample_id_to_index with int
                    # keys — verify against MultiVariantFile.
                    oldindices = [outmvf.sample_id_to_index[int(x)]
                                  for x in actionarg[0]]
                else:
                    oldindices = [int(x) for x in actionarg[0]]
            elif actionname in collapse_actions:
                actionarg[0] = [
                    x - len([y for y in removed_indices if y < x])
                    for x in actionarg[0]]
                oldindices = [x for x in outmvf.sample_indices
                              if x not in actionarg[0][1:]]
            else:
                # Actions that do not change the column layout must not
                # re-apply the index list left by a previous action.
                continue
            outmvf.sample_ids = outmvf.get_sample_ids(oldindices)
            outmvf.sample_data = dict(
                (i, outmvf.sample_data[oldindices[i]])
                for i, _ in enumerate(oldindices))
            if actionname in collapse_actions and len(actionarg) == 2:
                # A second argument supplies a new id for the column that
                # the collapse keeps.
                outmvf.sample_data[actionarg[0][0]]['id'] = actionarg[1][0]
                outmvf.sample_ids[actionarg[0][0]] = actionarg[1][0]
            outmvf.sample_indices = list(range(len(oldindices)))
    outmvf.metadata['ncol'] = len(outmvf.sample_indices)
    outmvf.notes.append(args.command_string)
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF established.")
    # End header editing
    linebuffer = []
    nbuffer = 0
    args.qprint("Processing Entries.")
    write_total = 0
    for chrom, pos, allelesets in mvf.iterentries(decode=False):
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        alleles = allelesets[0]
        linetype = get_linetype(alleles)
        if linetype == 'empty':
            continue
        if args.verbose is True:
            sys.stdout.write(" {} {} ".format(alleles, linetype))
        for _, actiontype, actionfunc, _ in actionset:
            if actiontype == 'filter':
                linefail = not actionfunc(alleles, linetype)
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                linefail = linetype == 'empty'
            elif actiontype == 'location':
                linefail = not actionfunc([chrom, pos])
            if linefail:
                break
        if linefail is False and transformed:
            if linetype == 'full':
                alleles = mvf.encode(alleles)
            if not alleles:
                # Nothing left after re-encoding: drop the entry rather
                # than write an empty allele string.
                linefail = True
        if linefail:
            if args.verbose:
                sys.stdout.write("FAIL\n")
            continue
        nbuffer += 1
        linebuffer.append((chrom, pos, (alleles,)))
        if args.verbose:
            sys.stdout.write("{}\n".format(alleles))
        if nbuffer == args.line_buffer:
            write_total += args.line_buffer
            args.qprint("{} entries written. Total written: {}.".format(
                args.line_buffer, write_total))
            outmvf.write_entries(linebuffer)
            linebuffer = []
            nbuffer = 0
    if linebuffer:
        outmvf.write_entries(linebuffer)
        nflushed = len(linebuffer)
        write_total += nflushed
        # Report the size of this final partial batch, not the buffer size.
        args.qprint("{} entries written. Total written: {}.".format(
            nflushed, write_total))
        linebuffer = []
    return ''