def maf2mvf(args):
    """Convert a MAF alignment file into an MVF file.

    Reads the MAF via MultiAlignFile, reorders samples so the reference
    tag is first, writes the MVF header, then streams encoded allele
    strings in blocks of args.line_buffer entries.

    Fix: the final flush now passes encoded=True, matching the in-loop
    flush — the buffered entries were already encoded by
    encode_mvfstring, so writing them un-flagged would re-encode them.
    """
    # ESTABLISH MAF
    args.qprint("Starting ConvertMAF2MVF")
    maf = MultiAlignFile(args)
    args.qprint("MAF Established")
    # ESTABLISH MVF
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    args.qprint("MVF output initialized")
    # PROCESS SAMPLE INFO
    contig_translate = {1: 1}
    samplelabels = [s.split(':')[0] for s in args.sample_tags.split(',')]
    args.qprint("Sample tags processed: {}".format(samplelabels))
    if args.ref_tag not in samplelabels:
        raise IndexError("--ref-tag not in the tags listed in --sample-tags")
    # Move the reference sample to column 0.
    samplelabels.remove(args.ref_tag)
    samplelabels.insert(0, args.ref_tag)
    mvf.sample_ids = samplelabels[:]
    mvf.sample_indices = list(range(len(mvf.sample_ids)))
    for i, label in enumerate(samplelabels):
        mvf.sample_data[i] = {'id': label, 'index': i}
    mvf.reset_max_sample()
    mvf.metadata['sourceformat'] = maf.metadata['sourceformat']
    mvf.metadata.notes.append(args.command_string)
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    args.qprint("MAF Headers Written")
    mvfentries = []
    nentry = 0
    total_entries = 0
    args.qprint("Begin data conversion")
    for pos, length, msa in maf:
        # Pad samples missing from this alignment block with gaps.
        for sname in samplelabels:
            if sname not in msa:
                msa[sname] = '-'*length
        msa['contig'] = 1
        for i in range(length):
            mvf_alleles = encode_mvfstring(
                ''.join(msa[s][i].strip() for s in samplelabels))
            if mvf_alleles:
                mvfentries.append(
                    (contig_translate.get(msa['contig']),
                     pos+i, (mvf_alleles,)))
                nentry += 1
                if nentry == args.line_buffer:
                    total_entries += nentry
                    mvf.write_entries(mvfentries, encoded=True)
                    args.qprint("{} entries written".format(total_entries))
                    mvfentries = []
                    nentry = 0
    # Flush remaining entries; encoded=True for consistency with the
    # in-loop flush above (entries are already MVF-encoded).
    if mvfentries:
        total_entries += nentry
        mvf.write_entries(mvfentries, encoded=True)
        args.qprint("{} entries written".format(total_entries))
    args.qprint("Complete.")
    return ''
def legacy_annotate_mvf(args):
    """Annotate MVF entries with gene ids from a GFF (legacy interval mode).

    Gene regions come from parse_gff_legacy_annotate as per-contig lists
    of (geneid, start, stop) intervals.  In normal mode, genic entries
    are re-labeled with their gene id; in --nongenic-mode, only entries
    outside genes (plus an optional margin) are kept.
    """
    args.qprint("Running LegacyAnnotateMVF")
    mvf = MultiVariantFile(args.mvf, 'read')
    args.qprint("Input MVF header processed.")
    args.qprint("MVF flavor: {}".format(mvf.flavor))
    gff, geneids = parse_gff_legacy_annotate(
        args.gff, mvf.contig_data, gene_pattern=args.gene_pattern)
    args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite,
                              flavor=mvf.flavor)
    outmvf.copy_headers_from(mvf)
    # In genic mode, output contigs are the genes themselves.
    if args.nongenic_mode is False:
        outmvf.contig_data = geneids.copy()
        outmvf.contig_indices = list(range(len(geneids)))
        outmvf.contig_ids = [geneids[x]['id']
                             for x in outmvf.contig_indices]
        outmvf.contig_labels = [geneids[x]['label']
                                for x in outmvf.contig_indices]
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF established.")
    entrybuffer = []
    nentry = 0
    args.qprint("Processing MVF entries.")
    for contigid, pos, allelesets in mvf.iterentries(decode=False):
        annotated_pos = None
        if contigid in gff:
            for (xgeneid, xstart, xstop) in gff[contigid]:
                if xstart < pos < xstop:
                    # `+ 0` forces a copy of the numeric gene id.
                    annotated_pos = xgeneid + 0
                    break
                # In nongenic mode, also mark positions within
                # `unmargin` bases of a gene so they get excluded.
                if args.nongenic_mode is True and args.unmargin > 0:
                    for xpos in range(pos - args.unmargin,
                                      pos + args.unmargin + 1):
                        if xstart < xpos < xstop:
                            annotated_pos = xgeneid + 0
                            break
        if annotated_pos is not None and not args.nongenic_mode:
            entrybuffer.append((annotated_pos, pos, allelesets))
        elif args.nongenic_mode and annotated_pos is None:
            entrybuffer.append((contigid, pos, allelesets))
        # NOTE(review): in nongenic mode nentry increments even when the
        # entry was annotated (and therefore not appended) — verify this
        # early-flush behavior is intended.
        if args.nongenic_mode or annotated_pos is not None:
            nentry += 1
            if nentry == args.line_buffer:
                args.qprint("Writing block of entries.")
                outmvf.write_entries(entrybuffer)
                entrybuffer = []
                nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        args.qprint("Writing final block of entries.")
        entrybuffer = []
        nentry = 0
    return ''
def annotate_mvf(args):
    """Annotate MVF entries with gene ids from a GFF (position-dict mode).

    parse_gff_annotate returns per-contig mappings of position -> gene
    id, so membership is a dict lookup rather than an interval scan.
    Normal mode relabels genic entries with their gene contig; with
    --nongenic-mode only entries outside genes (plus optional
    `unmargin`) are retained.
    """
    args.qprint("Running AnnotateMVF")
    mvf = MultiVariantFile(args.mvf, 'read')
    args.qprint("Input MVF header processed.")
    args.qprint("MVF flavor: {}".format(mvf.metadata['flavor']))
    gff, geneids = parse_gff_annotate(args.gff, mvf.metadata['contigs'],
                                      gene_prefix=args.gene_prefix)
    args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite,
                              flavor=mvf.metadata['flavor'])
    outmvf.metadata = deepcopy(mvf.metadata)
    # In genic mode, output contigs become the gene set.
    if args.nongenic_mode is False:
        outmvf.metadata['contigs'] = geneids
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF established.")
    entrybuffer = []
    nentry = 0
    args.qprint("Processing MVF entries.")
    for contigid, pos, allelesets in mvf.iterentries(decode=False):
        annotated_pos = False
        if contigid in gff:
            if pos in gff[contigid]:
                annotated_pos = True
            elif args.nongenic_mode is True and args.unmargin > 0:
                # Check positions within `unmargin` bases of a gene.
                for xpos in range(pos - args.unmargin,
                                  pos + args.unmargin + 1):
                    if xpos in gff[contigid]:
                        annotated_pos = True
                        break
        if annotated_pos and not args.nongenic_mode:
            entrybuffer.append((gff[contigid][pos], pos, allelesets))
        elif args.nongenic_mode and not annotated_pos:
            entrybuffer.append((contigid, pos, allelesets))
        if args.nongenic_mode or annotated_pos:
            nentry += 1
            if nentry == args.line_buffer:
                args.qprint("Writing block of entries.")
                outmvf.write_entries(entrybuffer)
                entrybuffer = []
                nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        args.qprint("Writing final block of entries.")
        entrybuffer = []
        nentry = 0
    return ''
def annotate_mvf(args):
    """Annotate MVF entries with gene ids from a GFF (older variant).

    NOTE(review): this duplicates the name `annotate_mvf` defined
    earlier in this file; if both live in the same module, the later
    definition shadows the other — confirm which one is intended.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    gff, geneids = parse_gff_annotate(args.gff, mvf.metadata['contigs'])
    if args.quiet is False:
        print("gff_processed")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(mvf.metadata)
    # In genic mode, output contigs become the gene set.
    if args.nongenic_mode is False:
        outmvf.metadata['contigs'] = geneids
    outmvf.write_data(outmvf.get_header())
    entrybuffer = []
    nentry = 0
    for contigid, pos, allelesets in mvf.iterentries(decode=False):
        annotated_pos = False
        if contigid in gff:
            if pos in gff[contigid]:
                annotated_pos = True
            elif args.nongenic_mode is True and args.unmargin > 0:
                # Check positions within `unmargin` bases of a gene.
                for xpos in range(pos - args.unmargin,
                                  pos + args.unmargin + 1):
                    if xpos in gff[contigid]:
                        annotated_pos = True
                        break
        if args.nongenic_mode is False and annotated_pos is True:
            # Genic mode: relabel the entry with its gene id.
            entrybuffer.append((gff[contigid][pos], pos, allelesets))
            nentry += 1
            if nentry == args.line_buffer:
                outmvf.write_entries(entrybuffer)
                entrybuffer = []
                nentry = 0
        elif args.nongenic_mode is True and annotated_pos is False:
            # Nongenic mode: keep only positions outside genes.
            entrybuffer.append((contigid, pos, allelesets))
            nentry += 1
            if nentry == args.line_buffer:
                outmvf.write_entries(entrybuffer)
                entrybuffer = []
                nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''
def maf2mvf(args): """Main method""" # ESTABLISH MAF maf = MultiAlignFile(args) # ESTABLISH MVF mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite) # PROCESS SAMPLE INFO contig_translate = {1: 1} samplelabels = [s.split(':')[0] for s in args.sample_tags] samplelabels.remove(args.ref_tag) samplelabels.insert(0, args.ref_tag) mvf.metadata['labels'] = samplelabels[:] for i, label in enumerate(samplelabels): mvf.metadata['samples'][i] = {'label': label} mvf.metadata['ncol'] = len(mvf.metadata['labels']) mvf.metadata['sourceformat'] = maf.metadata['sourceformat'] # WRITE MVF HEADER mvf.write_data(mvf.get_header()) mvfentries = [] nentry = 0 for pos, length, msa in maf: for sname in samplelabels: if sname not in msa: msa[sname] = '-'*length msa['contig'] = 1 for i in range(length): mvf_alleles = encode_mvfstring( ''.join(msa[s][i].strip() for s in samplelabels)) if mvf_alleles: mvfentries.append( (contig_translate.get(msa['contig']), pos+i, (mvf_alleles,))) nentry += 1 if nentry == args.line_buffer: mvf.write_entries(mvfentries, encoded=True) mvfentries = [] nentry = 0 if mvfentries: mvf.write_entries(mvfentries) return ''
def fasta2mvf(args):
    """Convert one or more FASTA files into an MVF file.

    Headers may be split into fields with --field-sep to extract contig
    and sample names; alternatively --contig-by-file uses one contig per
    input file, and --manual-coord supplies explicit contig:start..stop
    ranges per file.
    """
    # Map symbolic separator names to regex fragments for header splitting.
    sepchars = dict([("PIPE", "\\|"), ("TAB", "\\t"),
                     ("SPACE", "\\s"), ("DBLSPACE", "\\s\\s"),
                     ("COMMA", "\\,"), ("NONE", None),
                     ("AT", "\\@"), ('UNDER', "\\_"), ("DBLUNDER", "\\_\\_")])
    if args.field_sep is None:
        args.field_sep = ''
    else:
        args.field_sep = re.compile("[{}]".format(''.join(
            [sepchars[x] for x in args.field_sep])))
    if args.manual_coord:
        # One "contig:start..stop" spec per input FASTA file.
        assert len(args.manual_coord) == len(args.fasta)
        args.manual_coord = [
            (x.split(':')[0],
             int(x.split(":")[1].split('..')[0]),
             int(x.split(':')[1].split('..')[1]))
            for x in args.manual_coord]
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    fasta = {}
    current_contig = 0
    fsamples = []
    fcontigs = []
    for ifasta, fastapath in enumerate(args.fasta):
        print("Processing {}".format(fastapath))
        for header, seq in fasta_iter(fastapath):
            # Dead branch: field_sep is '' (not None) after setup above.
            if args.field_sep is None:
                header = header[:]
            if args.field_sep != '' and args.field_sep is not None:
                header = [str(x) for x in re.split(args.field_sep, header)]
            if args.contig_by_file is True:
                # Whole file maps to a single contig named after the file.
                contig = os.path.basename(fastapath[:])
                if args.sample_field is None:
                    sample = header[:]
                else:
                    sample = header[args.sample_field]
            elif (len(header) < max(
                    args.contig_field if args.contig_field is not None
                    else 0,
                    args.sample_field if args.sample_field is not None
                    else 0)
                    or args.contig_field is None
                    or args.sample_field is None):
                # Not enough header fields: fall back to synthetic contig.
                contig = "UNK{}".format(current_contig)
                sample = header[:]
            elif args.manual_coord:
                # NOTE(review): this branch sets only `contig`; `sample`
                # keeps its value from the previous iteration — confirm
                # this is intended for manual-coord inputs.
                contig = args.manual_coord[ifasta][0]
            else:
                contig = header[args.contig_field]
                sample = header[args.sample_field]
            if contig not in fcontigs:
                fcontigs.append(contig)
                fasta[contig] = {}
            if sample not in fsamples:
                fsamples.append(sample)
            fasta[contig][sample] = (len(seq), seq)
    # Move the reference sample (matched by substring) to column 0.
    reflabel = None
    if args.ref_label:
        for i, samplename in enumerate(fsamples):
            if args.ref_label in samplename:
                reflabel = i
                break
    # NOTE(review): truthiness check skips reflabel == 0, but index 0 is
    # already first so the reorder would be a no-op anyway.
    if reflabel:
        newref = fsamples.pop(i)
        fsamples = [newref] + fsamples
    # Register contigs; indices start from the MVF's next free index.
    for i, contig in enumerate(fcontigs):
        new_index = mvf.get_next_contig_index()
        mvf.contig_indices.append(new_index)
        mvf.contig_ids.append(str(new_index))
        mvf.contig_labels.append(contig)
        mvf.contig_label_to_index[contig] = new_index
        mvf.contig_id_to_index[str(new_index)] = new_index
        mvf.contig_data[new_index] = {
            'label': contig,
            'id': str(new_index),
            'length': max([fasta[contig][x][0] for x in fasta[contig]])
        }
    mvf.metadata['labels'] = fsamples[:]
    for i, label in enumerate(fsamples[:]):
        mvf.sample_indices.append(i)
        mvf.sample_id_to_index[label] = i
        mvf.sample_ids.append(label)
        mvf.sample_data[i] = {'id': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = 'fasta'
    # NOTE(review): appending directly to metadata (not metadata.notes
    # as elsewhere in this file) — verify against the metadata API.
    mvf.metadata.append(args.command_string)
    mvf.flavor = args.flavor
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    mvf_alleles = {}
    for cind, contig in enumerate(fcontigs):
        # contig_data keys start at 1 (get_next_contig_index), hence +1.
        for pos in range(mvf.contig_data[cind + 1]['length']):
            # Gap-fill samples missing from this contig or shorter than pos.
            mvf_alleles = encode_mvfstring(
                ''.join(samp not in fasta[contig] and '-' or
                        pos >= fasta[contig][samp][0] and '-' or
                        fasta[contig][samp][1][pos]
                        for samp in fsamples))
            if mvf_alleles:
                if args.flavor == 'dna':
                    # Normalize ambiguous N to X for dna flavor.
                    mvf_alleles = ''.join(
                        ["X" if x in 'NX' else x for x in mvf_alleles])
                mvfentries.append((cind, pos + 1, (mvf_alleles, )))
                nentry += 1
                if nentry == args.write_buffer:
                    mvf.write_entries(mvfentries, encoded=True)
                    mvfentries = []
                    nentry = 0
    if mvfentries:
        mvf.write_entries(mvfentries)
        mvfentries = []
    return ''
def vcf2mvf(args=None):
    """Main method for vcf2mvf"""
    # Map symbolic separator names to literal characters.
    # NOTE(review): DBLSPACE maps to a single space here; presumably it
    # should be two spaces — confirm against the original source.
    sepchars = dict([("TAB", "\t"), ("SPACE", " "), ("DBLSPACE", " "),
                     ("COMMA", ","), ("MIXED", None)])
    args.fieldsep = sepchars[args.field_sep]
    # ESTABLISH VCF
    args.qprint("Opening input VCF: {}".format(args.vcf))
    vcf = VariantCallFile(args.vcf, indexcontigs=(not args.no_autoindex))
    # ESTABLISH MVF
    args.qprint("Establishing output MVF: {}".format(args.out))
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    mvf.notes.append(args.command_string)
    # NOTE(review): attribute name `versionx` looks unusual — confirm it
    # matches the argument parser definition.
    mvf.metadata['mvfversion'] = args.versionx
    # PROCESS CONTIG INFO
    args.qprint("Processing VCF headers.")
    vcfcontigs = vcf.metadata['contigs'].copy()
    args.qprint("{} contigs found.".format(len(vcfcontigs)))
    contig_translate = {}
    if args.contig_ids:
        # Each spec is "id;vcflabel;mvflabel": force a specific contig
        # id/label mapping for matching VCF contigs.
        for cid, cvcf, cmvf in (x.split(';') for x in args.contig_ids):
            try:
                cid = int(cid)
            except ValueError:
                pass
            assert cvcf in [vcfcontigs[x]['label'] for x in vcfcontigs]
            for vid in vcfcontigs:
                if vcfcontigs[vid]['label'] == cvcf:
                    contig_translate[cvcf] = [cid, cmvf]
                    if cid in mvf.metadata['contigs']:
                        raise RuntimeError(
                            'Contig id {} is not unique'.format(cid))
                    mvf.metadata['contigs'][cid] = vcfcontigs[vid].copy()
                    if cmvf in mvf.get_contig_labels():
                        raise RuntimeError(
                            'Contig label {} is not unique'.format(cmvf))
                    mvf.metadata['contigs'][cid]['label'] = cmvf[:]
    mvf.reset_max_contig()
    # NOTE(review): decrement after reset so get_next_contig_index()
    # starts from the current max — confirm against the MVF API.
    mvf.max_contig_index -= 1
    args.qprint("Processing contigs.")
    static_contig_ids = list(mvf.get_contig_ids())
    for vcid in vcfcontigs:
        vlabel = vcfcontigs[vcid]['label']
        if vlabel not in static_contig_ids:
            newindex = mvf.get_next_contig_index()
            # Short or numeric labels are kept as contig ids directly.
            if ((is_int(vlabel) or len(vlabel) < 3) and
                    vlabel not in static_contig_ids):
                newid = vlabel[:]
            else:
                newid = str(newindex)
            mvf.contig_indices.append(newindex)
            mvf.contig_ids.append(newid)
            mvf.contig_data[newindex] = vcfcontigs[vcid].copy()
            static_contig_ids.append(newid)
            contig_translate[vlabel] = [newindex, vlabel]
    mvf.reset_max_contig()
    new_contigs = [(x, mvf.contig_data[x]['label'])
                   for x in mvf.contig_indices]
    if args.skip_contig_label_check is False:
        args.qprint("Checking contigs for label/id overlap errors.")
        xids = [x[0] for x in new_contigs]
        xlabels = [x[1] for x in new_contigs]
        xintersect = set(xids).intersection(xlabels)
        if xintersect:
            # Any id that equals a *different* contig's label (or vice
            # versa) would make lookups ambiguous, so fail loudly.
            for i, (newid, newlabel) in enumerate(new_contigs):
                if i % 100 == 0:
                    args.qprint("{} contigs processed".format(i))
                if newid in xlabels[:i] or newid in xlabels[i + 1:]:
                    raise RuntimeError("Error contig id {} is the same as"
                                       " the label for another contig"
                                       " ({})".format(
                                           newid, xlabels.index(newid)))
                if newlabel in xids[:i] or newlabel in xids[i + 1:]:
                    raise RuntimeError("Error contig label {} is the same"
                                       "as the id for another contig"
                                       "({})".format(
                                           newlabel, xids.index(newlabel)))
    # PROCESS SAMPLE INFO
    args.qprint("Processing samples.")
    samplelabels = [args.ref_label] + vcf.metadata['samples'][:]
    if args.alleles_from:
        args.alleles_from = args.alleles_from.split(':')
        samplelabels += args.alleles_from
    if args.sample_replace:
        # Each spec is "old:new" (or a bare name meaning no rename);
        # each old pattern consumes at most one unmatched sample.
        newsample = [
            x.split(':') if ':' in tuple(x) else tuple([x, x])
            for x in args.sample_replace
        ]
        unmatched = list(enumerate(samplelabels))
        for old, new in newsample:
            labelmatched = False
            for j, (i, name) in enumerate(unmatched):
                if old in name:
                    samplelabels[i] = new
                    labelmatched = j
                    break
            if labelmatched is not False:
                del unmatched[labelmatched]
    mvf.sample_indices = list(range(len(samplelabels)))
    mvf.sample_ids = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.sample_data[i] = {'id': label}
    mvf.metadata['ncol'] = len(mvf.sample_ids)
    mvf.max_sample_index = len(mvf.sample_ids)
    mvf.metadata['sourceformat'] = vcf.metadata['sourceformat']
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    args.qprint("Processing VCF entries.")
    for vcfrecord in vcf.iterentries(args):
        mvfstring = ''.join(vcfrecord['genotypes'])
        # Optionally drop records where all non-reference samples are
        # empty/ambiguous.
        if args.filter_nonref_empty is True:
            if all(x in 'Xx-?' for x in mvfstring[1:]):
                continue
        mvf_alleles = encode_mvfstring(mvfstring)
        if args.out_flavor in ('dnaqual', ):
            qual_alleles = encode_mvfstring(''.join(vcfrecord['qscores']))
        if mvf_alleles:
            mvfentries.append(
                (contig_translate.get(vcfrecord['contig'])[0],
                 vcfrecord['coord'],
                 ((mvf_alleles, qual_alleles)
                  if args.out_flavor in ('dnaqual', )
                  else (mvf_alleles, ))))
            nentry += 1
            if nentry == args.line_buffer:
                mvf.write_entries(mvfentries, encoded=True)
                mvfentries = []
                nentry = 0
    if mvfentries:
        mvf.write_entries(mvfentries)
        mvfentries = []
    return ''
def filter_mvf(args):
    """Filter/transform MVF entries through a chain of actions (legacy).

    Actions are parsed by build_actionset into (name, type, func, args)
    tuples of type 'filter', 'transform', or 'location'.  With --test,
    a single supplied line is run through the chain verbosely and the
    program exits; otherwise entries stream from --mvf to --out.
    """
    if args.more_help is True:
        modulehelp()
        sys.exit()
    if args.mvf is None and args.test is None:
        raise RuntimeError("No input file specified with --mvf")
    if args.out is None and args.test is None:
        raise RuntimeError("No output file specified with --out")
    # Establish Input MVF
    if args.test is not None:
        ncol = args.test_nchar or len(args.test.split()[1])
    else:
        mvf = MultiVariantFile(args.mvf, 'read')
        ncol = mvf.metadata['ncol']
    # Create Actionset
    if args.labels:
        # Translate sample labels in action arguments to column indices.
        labels = mvf.get_sample_labels()[:]
        for i in range(len(args.actions)):
            action = args.actions[i]
            arr = action.split(':')
            if arr[0] in ('columns', 'collapsepriority', 'collapsemerge',
                          'allelegroup', 'notmultigroup'):
                for j in range(1, len(arr)):
                    arr[j] = ','.join(
                        [str(labels.index(x)) for x in arr[j].split(',')])
                args.actions[i] = ':'.join(arr)
    actionset = build_actionset(args.actions, ncol)
    # TESTING MODE
    if args.test:
        loc, alleles = args.test.split()
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        if args.verbose:
            print(alleles)
        linetype = get_linetype(alleles)
        sys.stdout.write("MVF Encoding type '{}' detected\n".format(
            linetype))
        for actionname, actiontype, actionfunc, actionarg in actionset:
            sys.stdout.write("Applying action {} ({}): ".format(
                actionname, actiontype))
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
                    sys.stdout.write("Filter Fail\n")
                    break
                else:
                    sys.stdout.write("Filter Pass\n")
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
                    sys.stdout.write("Transform removed all alleles\n")
                    break
                else:
                    sys.stdout.write("Transform result {}\n".format(
                        alleles))
            elif actiontype == 'location':
                loc = loc.split(':')
                loc[1] = int(loc[1])
                if actionfunc(loc) is False:
                    linefail = True
                    sys.stdout.write("Location Fail\n")
                    break
                else:
                    sys.stdout.write("Location Pass\n")
        if linefail is False:
            if transformed:
                if linetype == 'full':
                    alleles = encode_mvfstring(alleles)
                if alleles:
                    test_output = "{}\t{}\n".format(loc, alleles)
                    sys.stdout.write("Final output = {}\n".format(
                        test_output))
                else:
                    sys.stdout.write("Transform removed all alleles\n")
            else:
                sys.stdout.write("No changes applied\n")
                sys.stdout.write("Final output = {}\n".format(args.test))
        sys.exit()
    # MAIN MODE
    # Set up file handler
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(mvf.metadata)
    # reprocess header if actions are used that filter columns
    if any(x == y[0] for x in ('columns', 'collapsepriority',
                               'collapsemerge')
           for y in actionset):
        if args.labels:
            labels = outmvf.metadata['labels'][:]
        else:
            labels = [x for x in outmvf.metadata['samples']]
        for actionname, actiontype, actionfunc, actionarg in actionset:
            if actionname == 'columns':
                labels = [labels[x] for x in actionarg[0]]
            elif actionname in ('collapsepriority', 'collapsemerge'):
                # Drop all but the first column of a collapse group.
                labels = [
                    labels[x] for x in range(len(labels))
                    if x not in actionarg[0][1:]
                ]
        if args.labels:
            oldindices = mvf.get_sample_indices(labels)
        else:
            oldindices = labels[:]
        newsamples = {}
        for i, _ in enumerate(labels):
            newsamples[i] = mvf.metadata['samples'][oldindices[i]]
        outmvf.metadata['samples'] = newsamples.copy()
        outmvf.metadata['labels'] = labels[:]
    outmvf.write_data(outmvf.get_header())
    # End header editing
    linebuffer = []
    nbuffer = 0
    for chrom, pos, allelesets in mvf.iterentries(decode=False):
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        alleles = allelesets[0]
        linetype = get_linetype(alleles)
        if linetype == 'empty':
            continue
        if args.verbose is True:
            sys.stdout.write(" {} {}".format(alleles, linetype))
        for actionname, actiontype, actionfunc, actionargs in actionset:
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
            elif actiontype == 'location':
                if actionfunc([chrom, pos]) is False:
                    linefail = True
            if linefail:
                break
        if linefail is False:
            if transformed:
                if linetype == 'full':
                    alleles = mvf.encode(alleles)
                if not alleles:
                    linefail = True
            nbuffer += 1
            linebuffer.append((chrom, pos, (alleles, )))
            if args.verbose:
                sys.stdout.write("{}\n".format(alleles))
            if nbuffer == args.line_buffer:
                outmvf.write_entries(linebuffer)
                linebuffer = []
                nbuffer = 0
        elif args.verbose:
            sys.stdout.write("FAIL\n")
    if linebuffer:
        outmvf.write_entries(linebuffer)
        linebuffer = []
    return ''
def filter_mvf(args):
    """Filter/transform MVF entries through a chain of actions.

    Actions are parsed by build_actionset into (name, type, func, args)
    tuples of type 'filter', 'transform', or 'location'.  Column-based
    action arguments are first translated from sample labels to indices
    (when --labels is used) and then re-indexed to account for columns
    removed by earlier collapse actions.  With --test, a single supplied
    line is run through the chain verbosely and the program exits.

    Fixes: removed leftover debug ``print`` statements that polluted
    stdout, and the final progress message now reports the actual size
    of the last partial block instead of args.line_buffer.
    """
    args.qprint("Running FilterMVF")
    if args.more_help is True:
        modulehelp()
        sys.exit()
    if args.mvf is None and args.test is None:
        raise RuntimeError("No input file specified with --mvf")
    if args.out is None and args.test is None:
        raise RuntimeError("No output file specified with --out")
    # Establish Input MVF
    if args.test is not None:
        ncol = args.test_nchar or len(args.test.split()[1])
    else:
        mvf = MultiVariantFile(args.mvf, 'read')
        ncol = mvf.metadata['ncol']
        args.qprint("Input MVF read with {} columns.".format(ncol))
    # Create Actionset
    if args.labels:
        # Translate sample labels in action arguments to column indices.
        for i in range(len(args.actions)):
            action = args.actions[i]
            arr = action.split(':')
            if arr[0] in ('collapsepriority', 'collapsemerge'):
                arr[1] = ','.join([
                    str(mvf.sample_id_to_index[x])
                    for x in arr[1].split(',')])
            if arr[0] in ('columns', 'allelegroup',
                          'notmultigroup', 'reqsample'):
                for j in range(1, len(arr)):
                    arr[j] = ','.join([
                        str(mvf.sample_id_to_index[x])
                        for x in arr[j].split(',')])
            args.actions[i] = ':'.join(arr)
    # Shift indices in later actions to account for columns removed by
    # earlier collapse actions.
    removed_columns = set([])
    for i in range(len(args.actions)):
        action = args.actions[i]
        arr = action.split(':')
        if arr[0] in ('collapsepriority', 'collapsemerge'):
            tmp_arr = arr[1][:]
            arr[1] = ','.join([
                str(int(x) - len([y for y in removed_columns
                                  if y < int(x)]))
                for x in arr[1].split(',')])
            # All but the first column of a collapse group are removed.
            removed_columns.update(
                [int(x) for x in tmp_arr.split(',')[1:]])
        if arr[0] in ('columns', 'allelegroup',
                      'notmultigroup', 'reqsample'):
            for j in range(1, len(arr)):
                arr[j] = ','.join([
                    str(int(x) - len([y for y in removed_columns
                                      if y < int(x)]))
                    for x in arr[j].split(',')])
        args.actions[i] = ':'.join(arr)
    actionset = build_actionset(args.actions, ncol)
    args.qprint("Actions established.")
    args.qprint(actionset)
    # TESTING MODE
    if args.test:
        loc, alleles = args.test.split()
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        if args.verbose:
            print(alleles)
        linetype = get_linetype(alleles)
        sys.stdout.write("MVF Encoding type '{}' detected\n".format(
            linetype))
        for actionname, actiontype, actionfunc, actionarg in actionset:
            sys.stdout.write("Applying action {} ({}): ".format(
                actionname, actiontype))
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
                    sys.stdout.write("Filter Fail\n")
                    break
                sys.stdout.write("Filter Pass\n")
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
                    sys.stdout.write("Transform removed all alleles\n")
                    break
                sys.stdout.write("Transform result {}\n".format(alleles))
            elif actiontype == 'location':
                loc = loc.split(':')
                loc[1] = int(loc[1])
                if actionfunc(loc) is False:
                    linefail = True
                    sys.stdout.write("Location Fail\n")
                    break
                sys.stdout.write("Location Pass\n")
        if linefail is False:
            if transformed:
                if linetype == 'full':
                    alleles = encode_mvfstring(alleles)
                if alleles:
                    test_output = "{}\t{}\n".format(loc, alleles)
                    sys.stdout.write("Final output = {}\n".format(
                        test_output))
                else:
                    sys.stdout.write("Transform removed all alleles\n")
            else:
                sys.stdout.write("No changes applied\n")
                sys.stdout.write("Final output = {}\n".format(args.test))
        sys.exit()
    # MAIN MODE
    # Set up file handler
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    # NOTE(review): removed_indices is read below but never populated —
    # confirm whether collapse actions were meant to update it.
    removed_indices = set([])
    # reprocess header if actions are used that filter columns
    if any(x == y[0] for x in ('columns', 'collapsepriority',
                               'collapsemerge')
           for y in actionset):
        for actionname, actiontype, actionfunc, actionarg in actionset:
            if actionname == 'columns':
                if args.labels:
                    oldindices = [outmvf.sample_id_to_index[int(x)]
                                  for x in actionarg[0]]
                else:
                    oldindices = [int(x) for x in actionarg[0]]
            elif actionname in ('collapsepriority', 'collapsemerge'):
                actionarg[0] = [
                    x - len([y for y in removed_indices if y < x])
                    for x in actionarg[0]]
                oldindices = [x for x in outmvf.sample_indices
                              if x not in actionarg[0][1:]]
            outmvf.sample_ids = outmvf.get_sample_ids(oldindices)
            outmvf.sample_data = dict(
                (i, outmvf.sample_data[oldindices[i]])
                for i, _ in enumerate(oldindices))
            if actionname in ('collapsepriority', 'collapsemerge'):
                # Optional rename of the collapsed column.
                if len(actionarg) == 2:
                    outmvf.sample_data[
                        actionarg[0][0]]['id'] = actionarg[1][0]
                    outmvf.sample_ids[actionarg[0][0]] = actionarg[1][0]
            outmvf.sample_indices = list(range(len(oldindices)))
    outmvf.metadata['ncol'] = len(outmvf.sample_indices)
    outmvf.notes.append(args.command_string)
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF established.")
    # End header editing
    linebuffer = []
    nbuffer = 0
    args.qprint("Processing Entries.")
    write_total = 0
    for chrom, pos, allelesets in mvf.iterentries(decode=False):
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        alleles = allelesets[0]
        linetype = get_linetype(alleles)
        if linetype == 'empty':
            continue
        if args.verbose is True:
            sys.stdout.write(" {} {} ".format(alleles, linetype))
        for actionname, actiontype, actionfunc, _ in actionset:
            if actiontype == 'filter':
                linefail = not actionfunc(alleles, linetype)
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                linefail = linetype == 'empty'
            elif actiontype == 'location':
                linefail = not actionfunc([chrom, pos])
            if linefail:
                break
        if linefail is False:
            if transformed:
                if linetype == 'full':
                    alleles = mvf.encode(alleles)
                if not alleles:
                    linefail = True
            nbuffer += 1
            linebuffer.append((chrom, pos, (alleles,)))
            if args.verbose:
                sys.stdout.write("{}\n".format(alleles))
            if nbuffer == args.line_buffer:
                write_total += args.line_buffer
                args.qprint("{} entries written. Total written: {}.".format(
                    args.line_buffer, write_total))
                outmvf.write_entries(linebuffer)
                linebuffer = []
                nbuffer = 0
        elif args.verbose:
            sys.stdout.write("FAIL\n")
    if linebuffer:
        outmvf.write_entries(linebuffer)
        write_total += len(linebuffer)
        # Report the actual size of the final partial block.
        args.qprint("{} entries written. Total written: {}.".format(
            len(linebuffer), write_total))
        linebuffer = []
    return ''
def mvf_join(args):
    """Concatenate multiple MVF files into one output MVF.

    A unified header is built from all inputs (samples and contigs are
    merged by label); per-file MvfTransformer objects record how each
    input's sample columns and contig ids map into the merged layout,
    and entries are rewritten through those mappings while streaming.
    """
    concatmvf = MultiVariantFile(args.out, 'write',
                                 overwrite=args.overwrite)
    # Copy the first file's metadata
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        else:
            args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        args.main_header_file = 0
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    concatmvf.metadata = first_mvf.metadata.copy()
    # Open each MVF file, read headers to make unified header
    transformers = []
    for mvfname in args.mvf:
        # This will create a dictionary of samples{old:new},
        # contigs{old:new}
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname, 'read')
        for i, label in enumerate(mvf.get_sample_labels()):
            if label not in concatmvf.get_sample_labels():
                concatmvf.metadata['labels'].append(label)
                concatmvf.metadata['samples'][
                    concatmvf.metadata['labels'].index(label)] = {
                        'label': label
                    }
            # Record a column remap when this file's column order
            # differs from the merged order.
            if concatmvf.metadata['labels'].index(label) != i:
                transformer.set_label(
                    i, concatmvf.metadata['labels'].index(label))
        for contigid, contigdata in iter(mvf.metadata['contigs'].items()):
            if contigdata['label'] not in [
                    concatmvf.metadata['contigs'][x]['label']
                    for x in concatmvf.metadata['contigs']
            ]:
                # New contig: keep its id if free, else allocate one.
                newid = (contigid not in concatmvf.metadata['contigs']
                         and contigid
                         or concatmvf.get_next_contig_id())
                concatmvf.metadata['contigs'][newid] = contigdata
            else:
                # Existing label: reuse the merged contig's id.
                for concatid, concatdata in (
                        concatmvf.metadata['contigs'].items()):
                    if contigdata['label'] == concatdata['label']:
                        newid = concatid
                        break
            if newid != contigid:
                transformer.set_contig(contigid, newid)
        transformers.append(transformer)
    # Write output header
    concatmvf.write_data(concatmvf.get_header())
    # Now loop through each file
    entries = []
    nentries = 0
    for ifile, mvfname in enumerate(args.mvf):
        if not args.quiet:
            sys.stderr.write("Processing {} ...\n".format(mvfname))
        transformer = transformers[ifile]
        mvf = MultiVariantFile(mvfname, 'read')
        for contigid, pos, allelesets in mvf.iterentries(
                decode=False, quiet=args.quiet):
            if transformer.labels:
                # Reorder columns into the merged sample order.
                allelesets = [mvf.decode(x) for x in allelesets]
                for j, alleles in enumerate(allelesets):
                    allelesets[j] = concatmvf.encode(''.join([
                        x in transformer.labels
                        and alleles[transformer.labels[x]]
                        or alleles[x]
                        for x in range(len(alleles))
                    ]))
            if transformer.contigs:
                # NOTE(review): subscript access transformer['contigs']
                # vs attribute access transformer.contigs above —
                # confirm MvfTransformer supports both.
                contigid = (contigid in transformer['contigs']
                            and transformer['contigs'][contigid]
                            or contigid)
            entries.append((contigid, pos, allelesets))
            nentries += 1
            if nentries == args.line_buffer:
                concatmvf.write_entries(entries)
                entries = []
                nentries = 0
    if entries:
        concatmvf.write_entries(entries)
        entries = []
        nentries = 0
    if not args.quiet:
        sys.stderr.write("done\n")
    return ''
def translate_mvf(args):
    """Translate a dna-flavor MVF into protein/codon/dna output.

    With --gff, genes are read from the GFF (parse_gff_exome) and each
    gene's CDS positions are fetched via the MVF contig index, reverse-
    complemented when on the minus strand, and translated codon by
    codon.  Without --gff, the input is assumed to already be aligned
    in coding frame and is translated contig by contig.

    Fix: the gene-based contig header assignments referenced
    ``gff_genes``/``gene_order``, which are only bound when args.gff is
    set, so the no-GFF path raised NameError before translating; those
    assignments are now guarded by ``if args.gff``.
    """
    args.qprint("Running TranslateMVF")
    if args.gff:
        args.qprint("Reading and Indexing MVF.")
    else:
        args.qprint("Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff_genes, gene_order = parse_gff_exome(args)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    if args.gff:
        # Output contigs are the genes (CDS coordinate lists excluded).
        outmvf.contig_data = dict(
            (i, dict((y, z) for (y, z) in gff_genes[x].items()
                     if y not in ('cds', )))
            for (i, x) in enumerate(gene_order))
        outmvf.contig_indices = list(range(len(gene_order)))
        outmvf.contig_ids = [gff_genes[x]['id'] for x in gene_order]
        outmvf.contig_labels = [gff_genes[x]['label'] for x in gene_order]
    outmvf.flavor = args.output_data
    outmvf.metadata.notes.append(args.command_string)
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")
    entrybuffer = []
    nentry = 0
    pos = None
    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
            else:
                # Contig changed: translate the buffered contig.
                for _, amino_acids, alleles in iter_codons(
                        inputbuffer, mvf):
                    if all([x in '-X' for x in amino_acids]):
                        continue
                    if args.output_data == 'protein':
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids,)))
                    else:
                        entrybuffer.append((
                            current_contig, pos, (
                                amino_acids, alleles[0],
                                alleles[1], alleles[2])))
                    nentry += 1
                    if nentry == args.line_buffer:
                        outmvf.write_entries(entrybuffer)
                        entrybuffer = []
                        nentry = 0
                inputbuffer = [(pos, allelesets)]
                current_contig = contigid[:]
        if inputbuffer:
            # NOTE(review): this final pass uses outmvf where the
            # in-loop pass uses mvf for iter_codons — confirm which is
            # intended (left as-is to preserve behavior).
            for _, amino_acids, alleles in iter_codons(
                    inputbuffer, outmvf):
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append(
                        (current_contig, pos, (amino_acids,)))
                else:
                    entrybuffer.append((
                        current_contig, pos, (
                            amino_acids, alleles[0],
                            alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        running_gene_index = -1
        for igene, gene in enumerate(gene_order):
            xcontiglabel = gff_genes[gene]['contig']
            xcontig = mvf.get_contig_indices(
                labels=gff_genes[gene]['contig'])
            if xcontig is None:
                print("Warning: contig {} not found".format(
                    gff_genes[gene]['contig']))
            xcontigid = mvf.get_contig_ids(indices=xcontig)[0]
            min_gene_coord = gff_genes[gene]['cds'][0][0]
            max_gene_coord = gff_genes[gene]['cds'][-1][1]
            mvf_entries = {}
            if not igene % 100:
                args.qprint("Processing gene {} on {}".format(
                    gene, xcontiglabel))
            # Collect the entries spanning this gene's CDS range.
            for contigid, pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                if pos < min_gene_coord:
                    continue
                if pos > max_gene_coord:
                    break
                mvf_entries[pos] = allelesets[0]
            reverse_strand = gff_genes[gene]['strand'] == '-'
            coords = []
            running_gene_index += 1
            for elem in gff_genes[gene]['cds']:
                coords.extend(list(range(elem[0], elem[1] + 1)))
            if reverse_strand:
                coords = coords[::-1]
            for codoncoord in range(0, len(coords), 3):
                # Pad trailing partial codons with gaps.
                alleles = tuple(
                    mvf_entries.get(x, '-')
                    for x in coords[codoncoord:codoncoord + 3])
                if len(alleles) < 3:
                    alleles = tuple(list(alleles) +
                                    ['-'] * (3 - len(alleles)))
                if all(len(x) == 1 for x in alleles):
                    # Invariant columns: translate directly.
                    if reverse_strand:
                        alleles = tuple(MLIB.complement_bases[x]
                                        for x in alleles)
                    decoded_alleles = alleles
                    amino_acids = translate_single_codon(''.join(alleles))
                else:
                    # Multi-sample columns: decode, translate per sample,
                    # then re-encode.
                    if reverse_strand is True:
                        decoded_alleles = tuple(
                            tuple(MLIB.complement_bases[y]
                                  for y in mvf.decode(x))
                            for x in alleles)
                        alleles = tuple(outmvf.encode(''.join(x))
                                        for x in decoded_alleles)
                    else:
                        decoded_alleles = tuple(mvf.decode(x)
                                                for x in alleles)
                    amino_acids = tuple(
                        translate_single_codon(''.join(x))
                        for x in zip(*decoded_alleles))
                    amino_acids = outmvf.encode(''.join(amino_acids))
                if args.output_data == 'protein':
                    entrybuffer.append((
                        (xcontigid if args.retain_contigs
                         else running_gene_index),
                        (coords[codoncoord] if args.retain_coords
                         else codoncoord),
                        (amino_acids, )))
                elif args.output_data == 'codon':
                    entrybuffer.append((
                        (xcontigid if args.retain_contigs
                         else running_gene_index),
                        (coords[codoncoord] if args.retain_coords
                         else codoncoord),
                        (amino_acids, alleles[0], alleles[1],
                         alleles[2])))
                elif args.output_data == 'dna':
                    # Emit each base of the codon as its own entry.
                    for j, elem in enumerate(
                            range(codoncoord,
                                  min(codoncoord + 3, len(coords)))):
                        entrybuffer.append((
                            (xcontigid if args.retain_contigs
                             else running_gene_index),
                            (coords[elem] if args.retain_coords
                             else elem + 1),
                            (alleles[j], )))
                nentry += 1
                if nentry >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''
def legacy_translate_mvf(args):
    """Translate a DNA-flavor MVF into protein/codon/dna output (legacy path).

    Two modes:
      * no ``--gff``: sequences are assumed pre-aligned in coding frame and
        are translated codon-by-codon per contig via ``iter_codons``.
      * with ``--gff``: CDS coordinates from the GFF drive codon assembly,
        including reverse-strand complementation.

    Args:
        args: parsed CLI namespace (mvf, out, gff, output_data, line_buffer,
            overwrite, parent_gene_pattern, verbose, qprint, ...).

    Returns:
        Empty string (CLI convention in this module).

    Raises:
        RuntimeError: if the input MVF is not flavor 'dna'.
    """
    args.qprint("Running LegacyTranslateMVF")
    if args.gff:
        args.qprint("Reading and Indexing MVF.")
    else:
        args.qprint("Reading MVF.")
    # Contig index is only needed for random access in the GFF-driven mode.
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff = parse_gff_legacy_translate(
            args.gff, args,
            parent_gene_pattern=args.parent_gene_pattern)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    # Output flavor becomes 'protein', 'codon', or 'dna' per the CLI choice.
    outmvf.flavor = args.output_data
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")
    entrybuffer = []  # buffered output entries, flushed every line_buffer
    nentry = 0
    pos = None
    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        inputbuffer = []       # (pos, allelesets) accumulated per contig
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
            else:
                # Contig changed: translate everything buffered for the
                # previous contig before starting the new one.
                # NOTE(review): entries are written with `pos` from the
                # *new* contig's first record, not the codon position —
                # looks suspicious; confirm against upstream behavior.
                for _, amino_acids, alleles in iter_codons(
                        inputbuffer, mvf):
                    if all([x in '-X' for x in amino_acids]):
                        continue
                    if args.output_data == 'protein':
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids,)))
                    else:
                        entrybuffer.append((
                            current_contig, pos, (
                                amino_acids, alleles[0],
                                alleles[1], alleles[2])))
                    nentry += 1
                    if nentry == args.line_buffer:
                        outmvf.write_entries(entrybuffer)
                        entrybuffer = []
                        nentry = 0
                inputbuffer = [(pos, allelesets)]
                current_contig = contigid[:]
        if inputbuffer:
            # Final contig flush.
            # NOTE(review): this call passes `outmvf` where the loop above
            # passed `mvf` to iter_codons — inconsistent; confirm which is
            # intended before changing either.
            for _, amino_acids, alleles in iter_codons(
                    inputbuffer, outmvf):
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append(
                        (current_contig, pos, (amino_acids,)))
                else:
                    entrybuffer.append((
                        current_contig, pos, (
                            amino_acids, alleles[0],
                            alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        args.qprint("Indexing GFF gene names.")
        # mvfid_to_gffname = outmvf.get_contig_reverse_dict()
        for xcontig in outmvf.get_contig_indices():
            mvf_entries = {}
            xcontiglabel = outmvf.get_contig_labels(indices=xcontig)[0]
            xcontigid = outmvf.get_contig_ids(indices=xcontig)[0]
            if xcontiglabel not in gff:
                if args.verbose:
                    print(
                        ("No entries in GFF, "
                         "skipping contig: index:{} id:{} label:{}").format(
                             xcontig, xcontigid, xcontiglabel))
                continue
            # Progress message every 100 contigs.
            if not xcontig % 100:
                args.qprint("Processing contig: {} {}".format(
                    xcontigid, xcontiglabel))
            # Load this contig's entries into a position-keyed dict for
            # random access by CDS coordinate.
            for contigid, pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                mvf_entries[pos] = allelesets[0]
            for coords in sorted(gff[xcontiglabel]):
                # coords: three codon positions plus strand at index 3.
                reverse_strand = coords[3] == '-'
                alleles = (tuple(mvf_entries.get(x, '-')
                                 for x in coords[2::-1])
                           if reverse_strand is True
                           else tuple(mvf_entries.get(x, '-')
                                      for x in coords[0:3]))
                if all(len(x) == 1 for x in alleles):
                    # Simple case: one base per sample column.
                    if reverse_strand:
                        alleles = tuple(
                            MLIB.complement_bases[x] for x in alleles)
                    decoded_alleles = alleles
                    amino_acids = translate_single_codon(''.join(alleles))
                else:
                    # Encoded multi-allele columns: decode, complement if
                    # needed, translate per sample, then re-encode.
                    if reverse_strand is True:
                        decoded_alleles = tuple(
                            tuple(MLIB.complement_bases[y]
                                  for y in mvf.decode(x))
                            for x in alleles)
                        alleles = tuple(outmvf.encode(''.join(x))
                                        for x in decoded_alleles)
                    else:
                        decoded_alleles = tuple(mvf.decode(x)
                                                for x in alleles)
                    amino_acids = tuple(
                        translate_single_codon(''.join(x))
                        for x in zip(*decoded_alleles))
                    amino_acids = outmvf.encode(''.join(amino_acids))
                # NOTE(review): protein output uses the contig *index*
                # (xcontig) but codon output uses the contig *id*
                # (xcontigid) — confirm this asymmetry is intentional.
                if args.output_data == 'protein':
                    entrybuffer.append((xcontig, coords[0], (amino_acids,)))
                else:
                    entrybuffer.append((
                        xcontigid, coords[0], (
                            amino_acids, alleles[0],
                            alleles[1], alleles[2])))
                nentry += 1
                if nentry >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    # Flush any remaining buffered entries.
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''
def merge_mvf(args):
    """Merge multiple MVF files into one output MVF (index-based API).

    Builds a unified sample/contig header from all inputs, then merges
    entries contig-by-contig, raising on conflicting bases at the same
    position/column.

    Args:
        args: parsed CLI namespace (mvf: list of paths, out,
            main_header_file, skip_index, overwrite, command_string,
            qprint, ...).

    Returns:
        Empty string (CLI convention in this module).

    Raises:
        RuntimeError: if main_header_file is not among the inputs, or if
            two inputs disagree on a base at the same merged position.
    """
    args.qprint("Running MergeMVF")
    if any(fpath.endswith('.gz') for fpath in args.mvf):
        print("WARNING! Running MergeMVF with gzipped input files is "
              "extremely slow and strongly discouraged.")
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # Copy the first file's metadata
    args.qprint("Reading First File and Establishing Output")
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        # Replace the path string with its positional index into args.mvf.
        args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        args.main_header_file = 0
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    concatmvf.copy_header(first_mvf)
    # Open each MVF file, read headers to make unified header
    transformers = []   # per-input sample/contig old->new index maps
    mvfmetadata = []
    inputfiles = []
    for mvfname in args.mvf:
        args.qprint("Reading headers from {}".format(mvfname))
        # This will create a dictionary of samples{old:new}, contigs{old:new}
        args.qprint("Processing Headers and Indexing: {}".format(mvfname))
        transformer = MvfTransformer()
        # When skip_index is set, reuse a pre-built index file instead of
        # re-indexing the contigs.
        mvf = MultiVariantFile(mvfname, 'read',
                               contigindex=(not args.skip_index))
        if args.skip_index:
            mvf.read_index_file()
        mvf.reset_max_contig()
        mvfmetadata.append(mvf.metadata)
        # Register any samples not yet present in the merged output, then
        # map this file's sample column i to the merged column.
        for i, sid in enumerate(mvf.get_sample_ids()):
            if sid not in concatmvf.get_sample_ids():
                new_sindex = concatmvf.max_sample_index + 0
                concatmvf.max_sample_index += 1
                concatmvf.sample_indices.append(new_sindex)
                concatmvf.sample_ids.append(sid)
                concatmvf.sample_data[new_sindex] = {}
                concatmvf.sample_data[new_sindex]['id'] = sid
                concatmvf.sample_id_to_index[sid] = new_sindex
            transformer.set_label(i, concatmvf.sample_id_to_index[sid])
        # Same for contigs: match by label; reuse the input's id when it
        # does not collide, otherwise allocate a fresh index.
        for cindex in mvf.contig_indices:
            if (mvf.contig_data[cindex]['label'] not in
                    concatmvf.contig_label_to_index):
                new_cindex = (mvf.contig_data[cindex]['id']
                              if mvf.contig_data[cindex]['id']
                              not in concatmvf.contig_ids
                              else concatmvf.get_next_contig_index())
                concatmvf.contig_data[new_cindex] = (
                    mvf.contig_data[cindex].copy())
            else:
                new_cindex = concatmvf.contig_label_to_index[
                    mvf.contig_data[cindex]['label']]
            transformer.set_contig(cindex, new_cindex)
        transformers.append(transformer)
        inputfiles.append(mvf)
    # Write output header
    args.qprint("Writing headers to merge output")
    concatmvf.reset_max_sample()
    concatmvf.notes.append(args.command_string)
    concatmvf.write_data(concatmvf.get_header())
    # Now loop through each file
    # One '-' placeholder per merged sample column.
    blank_entry = '-' * len(concatmvf.sample_indices)
    for cons_contig in concatmvf.contig_indices:
        contig_merged_entries = {}  # pos -> merged allele string
        args.qprint("Merging Contig Index: {}".format(cons_contig))
        for ifile, mvffile in enumerate(inputfiles):
            if cons_contig not in transformers[ifile].contigs:
                continue
            localcontig = transformers[ifile].contigs[cons_contig]
            if 'idx' not in mvffile.contig_data[localcontig]:
                # NOTE(review): debug-style message for a contig missing
                # index data; consider a clearer warning.
                print("not found")
                continue
            for _, pos, allelesets in mvffile.itercontigentries(
                    localcontig, decode=True):
                if pos not in contig_merged_entries:
                    contig_merged_entries[pos] = blank_entry[:]
                for j, base in enumerate(allelesets[0]):
                    # Map this file's column j onto the merged column.
                    xcoord = transformers[ifile].labels_rev[j]
                    if contig_merged_entries[pos][xcoord] != '-':
                        if contig_merged_entries[pos][xcoord] == base:
                            continue
                        # Ambiguous/missing incoming bases never overwrite.
                        if base in '-X':
                            continue
                        raise RuntimeError(
                            ("Merging columns have two different bases: "
                             "{} {} {}").format(
                                 pos,
                                 contig_merged_entries[pos][xcoord],
                                 base))
                    # Strings are immutable: rebuild with the base spliced in.
                    contig_merged_entries[pos] = (
                        contig_merged_entries[pos][:xcoord] + base +
                        contig_merged_entries[pos][xcoord + 1:])
        if contig_merged_entries:
            concatmvf.write_entries(
                ((cons_contig, coord, (entry,))
                 for coord, entry in sorted(contig_merged_entries.items())),
                encoded=False)
            args.qprint("Entries written for contig {}: {}".format(
                cons_contig, len(contig_merged_entries)))
    return ''
def merge_mvf(args):
    """Merge multiple MVF files into one output MVF (legacy metadata API).

    NOTE(review): this is a second definition of ``merge_mvf`` in the same
    module — at import time it silently shadows the earlier, index-based
    implementation above. Confirm which version is intended and remove or
    rename the other.

    Args:
        args: parsed CLI namespace (mvf: list of paths, out,
            main_header_file, skip_index, overwrite, qprint, ...).

    Returns:
        Empty string (CLI convention in this module).

    Raises:
        RuntimeError: if main_header_file is not among the inputs, or if
            two inputs disagree on a base at the same merged position.
    """
    args.qprint("Running MergeMVF")
    if any(fpath.endswith('.gz') for fpath in args.mvf):
        print("WARNING! Running MergeMVF with gzipped input files is "
              "extremely slow and strongly discouraged.")
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # Copy the first file's metadata
    args.qprint("Reading First File and Establishing Output")
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        else:
            # Replace the path string with its index into args.mvf.
            args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        args.main_header_file = 0
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    # NOTE(review): shallow copy — nested metadata dicts stay shared with
    # first_mvf; confirm downstream mutation is safe.
    concatmvf.metadata = first_mvf.metadata.copy()
    # Open each MVF file, read headers to make unified header
    transformers = []   # per-input sample/contig old->new id maps
    mvfmetadata = []
    # Reverse lookup: contig label -> merged contig id.
    concatmvf_reverse_contig = dict(
        (x['label'], k)
        for (k, x) in concatmvf.metadata['contigs'].items())
    inputfiles = []
    for mvfname in args.mvf:
        args.qprint("Reading headers from {}".format(mvfname))
        # This will create a dictionary of samples{old:new}, contigs{old:new}
        args.qprint("Processing Headers and Indexing: {}".format(mvfname))
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname, 'read',
                               contigindex=(not args.skip_index))
        if args.skip_index:
            mvf.read_index_file()
        mvf.reset_max_contig_id()
        mvfmetadata.append(mvf.metadata)
        # Register unseen sample labels, then map this file's column i to
        # the merged column index.
        for i, label in enumerate(mvf.get_sample_labels()):
            if label not in concatmvf.get_sample_labels():
                concatmvf.metadata['labels'].append(label)
                concatmvf.metadata['samples'][
                    concatmvf.metadata['labels'].index(label)] = {
                        'label': label
                    }
            # if concatmvf.metadata['labels'].index(label) != i:
            transformer.set_label(
                i, concatmvf.metadata['labels'].index(label))
        # Match contigs by label; reuse the input's id when free, else
        # allocate a fresh one.
        for contigid, contigdata in iter(mvf.metadata['contigs'].items()):
            if contigdata['label'] not in concatmvf_reverse_contig:
                newid = (contigid
                         if contigid not in concatmvf.metadata['contigs']
                         else concatmvf.get_next_contig_id())
                concatmvf.metadata['contigs'][newid] = contigdata
                concatmvf_reverse_contig[contigdata['label']] = newid
            else:
                newid = concatmvf_reverse_contig[contigdata['label']]
            transformer.set_contig(contigid, newid)
        transformers.append(transformer)
        inputfiles.append(mvf)
    # Write output header
    args.qprint("Writing headers to merge output")
    concatmvf.reset_ncol()
    concatmvf.write_data(concatmvf.get_header())
    contigs = concatmvf.metadata['contigs']
    # Now loop through each file
    # One '-' placeholder per merged sample column.
    blank_entry = '-' * len(concatmvf.metadata['samples'])
    for current_contig in contigs:
        contig_merged_entries = {}  # pos -> merged allele string
        args.qprint("Merging Contig: {}".format(current_contig))
        for ifile, mvffile in enumerate(inputfiles):
            if current_contig not in transformers[ifile].contigs:
                continue
            localcontig = transformers[ifile].contigs[current_contig]
            for chrom, pos, allelesets in mvffile.itercontigentries(
                    localcontig, decode=True):
                if pos not in contig_merged_entries:
                    contig_merged_entries[pos] = blank_entry[:]
                for j, base in enumerate(allelesets[0]):
                    # Map this file's column j onto the merged column.
                    xcoord = transformers[ifile].labels_rev[j]
                    if contig_merged_entries[pos][xcoord] != '-':
                        if contig_merged_entries[pos][xcoord] == base:
                            continue
                        # Missing/ambiguous incoming bases never overwrite.
                        if base == '-' or base == 'X':
                            continue
                        raise RuntimeError(
                            "Merging columns have two different bases: {} {} {}"
                            .format(pos,
                                    contig_merged_entries[pos][xcoord],
                                    base))
                    # Strings are immutable: rebuild with the base spliced in.
                    contig_merged_entries[pos] = (
                        contig_merged_entries[pos][:xcoord] + base +
                        contig_merged_entries[pos][xcoord + 1:])
        # NOTE(review): unlike the other merge_mvf, this writes even when
        # contig_merged_entries is empty — presumably harmless; confirm.
        concatmvf.write_entries(
            ((current_contig, coord, (entry,))
             for coord, entry in sorted(contig_merged_entries.items())),
            encoded=False)
        args.qprint("Entries written for contig {}: {}".format(
            current_contig, len(contig_merged_entries)))
    return ''
def translate_mvf(args):
    """Translate a DNA-flavor MVF into protein/codon output.

    Two modes:
      * no ``--gff``: input is assumed pre-aligned in coding frame and is
        translated contig-by-contig via ``iter_codons``.
      * with ``--gff``: codon coordinates from the GFF drive translation,
        including reverse-strand complementation.

    Args:
        args: parsed CLI namespace (mvf, out, gff, output_data,
            line_buffer, overwrite, quiet, ...).

    Returns:
        Empty string (CLI convention in this module).

    Raises:
        RuntimeError: if the input MVF is not flavor 'dna'.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        gff = parse_gff_translate(args.gff, args)
        if not args.quiet:
            print("gff_processed")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # Deep copy so later mutations of output metadata don't touch the input.
    outmvf.metadata = deepcopy(mvf.metadata)
    outmvf.flavor = args.output_data
    outmvf.write_data(outmvf.get_header())
    entrybuffer = []  # buffered output entries, flushed every line_buffer
    nentry = 0
    if not args.gff:
        inputbuffer = []       # (pos, allelesets) accumulated per contig
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
            else:
                # Contig changed: translate the previous contig's buffer.
                # NOTE(review): entries are tagged with `pos` from the new
                # contig's first record — confirm intended.
                for _, amino_acids, alleles in iter_codons(inputbuffer, mvf):
                    if all([x in '-X' for x in amino_acids]):
                        continue
                    if args.output_data == 'protein':
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids, )))
                    else:
                        entrybuffer.append(
                            (current_contig, pos,
                             (amino_acids, alleles[0], alleles[1],
                              alleles[2])))
                    nentry += 1
                    if nentry == args.line_buffer:
                        outmvf.write_entries(entrybuffer)
                        entrybuffer = []
                        nentry = 0
                inputbuffer = [(pos, allelesets)]
                current_contig = contigid[:]
        if inputbuffer:
            # Final contig flush.
            for _, amino_acids, alleles in iter_codons(inputbuffer, mvf):
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append((current_contig, pos, (amino_acids, )))
                else:
                    entrybuffer.append(
                        (current_contig, pos,
                         (amino_acids, alleles[0], alleles[1],
                          alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        # GFF mode: load every entry into memory keyed by contig/position
        # for random access by codon coordinates.
        mvf_entries = {}
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if contigid not in mvf_entries:
                mvf_entries[contigid] = {}
            mvf_entries[contigid][pos] = allelesets[0]
        for contigname in sorted(gff):
            contigid = mvf.get_contig_ids(labels=contigname)[0]
            for coords in sorted(gff[contigname]):
                # coords: three codon positions plus strand at index 3.
                reverse_strand = False
                if coords[3] == '-':
                    reverse_strand = True
                    alleles = [
                        mvf_entries[contigid].get(x, '-')
                        for x in coords[2::-1]
                    ]
                else:
                    alleles = [
                        mvf_entries[contigid].get(x, '-')
                        for x in coords[0:3]
                    ]
                if all(len(x) == 1 for x in alleles):
                    # Simple case: one base per sample column.
                    if reverse_strand:
                        alleles = [MLIB.complement_bases[x] for x in alleles]
                    decoded_alleles = alleles
                    amino_acids = translate(''.join(alleles))[0]
                else:
                    # Encoded multi-allele columns: decode, complement if
                    # needed, translate per sample, then re-encode.
                    if reverse_strand:
                        decoded_alleles = [[
                            MLIB.complement_bases[y] for y in mvf.decode(x)
                        ] for x in alleles]
                        alleles = [
                            mvf.encode(''.join(x)) for x in decoded_alleles
                        ]
                    else:
                        decoded_alleles = [mvf.decode(x) for x in alleles]
                    amino_acids = [
                        translate(''.join(x))
                        for x in zip(*decoded_alleles)
                    ]
                    # translate() returns a sequence; keep first element.
                    amino_acids = mvf.encode(''.join(
                        [x[0] for x in amino_acids]))
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append((contigid, coords[0], (amino_acids, )))
                else:
                    entrybuffer.append(
                        (contigid, coords[0],
                         (amino_acids, alleles[0], alleles[1],
                          alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    # Flush any remaining buffered entries.
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''
def vcf2mvf(args=None):
    """Main method for vcf2mvf: convert a VCF file to MVF format.

    Establishes contig id/label translation between the VCF and MVF,
    processes sample labels (reference label first, optional renames),
    then streams VCF records into buffered MVF entries.

    Args:
        args: parsed CLI namespace (vcf, out, field_sep, contig_ids,
            ref_label, alleles_from, sample_replace, out_flavor,
            line_buffer, no_autoindex, overwrite, ...).

    Returns:
        Empty string (CLI convention in this module).

    Raises:
        RuntimeError: on non-unique contig ids/labels or id<->label clashes.
    """
    # NOTE(review): "DBLSPACE" maps to a single space here, identical to
    # "SPACE" — likely a whitespace-collapsed "  "; confirm against the
    # original source before relying on it.
    sepchars = dict([("TAB", "\t"), ("SPACE", " "), ("DBLSPACE", " "),
                     ("COMMA", ","), ("MIXED", None)])
    args.fieldsep = sepchars[args.field_sep]
    # ESTABLISH VCF
    vcf = VariantCallFile(args.vcf, indexcontigs=(not args.no_autoindex))
    # ESTABLISH MVF
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # PROCESS CONTIG INFO
    vcfcontigs = vcf.metadata['contigs'].copy()
    contig_translate = {}  # vcf contig label -> [mvf id, mvf label]
    if args.contig_ids:
        # User-specified mappings: "id;vcf_label;mvf_label".
        for cid, cvcf, cmvf in (x.split(';') for x in args.contig_ids):
            try:
                cid = int(cid)
            except ValueError:
                pass
            assert cvcf in [vcfcontigs[x]['label'] for x in vcfcontigs]
            for vid in vcfcontigs:
                if vcfcontigs[vid]['label'] == cvcf:
                    contig_translate[cvcf] = [cid, cmvf]
                    if cid in mvf.metadata['contigs']:
                        raise RuntimeError(
                            'Contig id {} is not unique'.format(cid))
                    mvf.metadata['contigs'][cid] = vcfcontigs[vid].copy()
                    if cmvf in mvf.get_contig_labels():
                        raise RuntimeError(
                            'Contig label {} is not unique'.format(cmvf))
                    mvf.metadata['contigs'][cid]['label'] = cmvf[:]
    mvf.reset_max_contig_id()
    # Auto-assign ids for any remaining VCF contigs: keep short/numeric
    # labels as their own id when free, otherwise allocate a fresh id.
    for vcid in vcfcontigs:
        vlabel = vcfcontigs[vcid]['label']
        if vlabel not in mvf.get_contig_labels():
            if ((is_int(vlabel) or len(vlabel) < 3) and
                    vlabel not in mvf.get_contig_ids()):
                newid = vlabel[:]
            else:
                newid = mvf.get_next_contig_id()
            mvf.metadata['contigs'][newid] = vcfcontigs[vcid].copy()
            contig_translate[vlabel] = [newid, vlabel]
    mvf.reset_max_contig_id()
    # Cross-check: no contig's id may equal another contig's label and
    # vice versa, which would make lookups ambiguous.
    new_contigs = [(x, mvf.metadata['contigs'][x]['label'])
                   for x in mvf.metadata['contigs']]
    for i, (newid, newlabel) in enumerate(new_contigs):
        for j, (xid, xlabel) in enumerate(new_contigs):
            if i == j:
                continue
            if newid == xlabel:
                raise RuntimeError("Error contig id {} is the same as"
                                   " the label for another contig"
                                   " ({} {})".format(newid, xid, xlabel))
            if newlabel == xid:
                raise RuntimeError("Error contig label {} is the same"
                                   "as the id for another contig"
                                   "({} {})".format(newlabel, xid, xlabel))
    # PROCESS SAMPLE INFO
    # Reference sample column always comes first.
    samplelabels = [args.ref_label] + vcf.metadata['samples'][:]
    if args.alleles_from:
        args.alleles_from = args.alleles_from.split(':')
        samplelabels += args.alleles_from
    if args.sample_replace:
        # Each spec is "old:new" (or a bare name meaning rename to itself);
        # each spec consumes at most one matching label (first match wins).
        newsample = [
            x.split(':') if ':' in tuple(x) else tuple([x, x])
            for x in args.sample_replace
        ]
        unmatched = [x for x in enumerate(samplelabels)]
        for old, new in newsample:
            labelmatched = False
            for j, (i, name) in enumerate(unmatched):
                if old in name:
                    samplelabels[i] = new
                    labelmatched = j
                    break
            if labelmatched is not False:
                del unmatched[labelmatched]
    mvf.metadata['labels'] = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.metadata['samples'][i] = {'label': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = vcf.metadata['sourceformat']
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []  # buffered entries, flushed every line_buffer records
    nentry = 0
    for vcfrecord in vcf.iterentries(args):
        # try:
        mvf_alleles = encode_mvfstring(''.join(vcfrecord['genotypes']))
        # Quality scores are only carried through for the dnaqual flavor.
        if args.out_flavor in ('dnaqual', ):
            qual_alleles = encode_mvfstring(''.join(vcfrecord['qscores']))
        if mvf_alleles:
            mvfentries.append(
                (contig_translate.get(vcfrecord['contig'])[0],
                 vcfrecord['coord'],
                 ((mvf_alleles, qual_alleles)
                  if args.out_flavor in ('dnaqual', )
                  else (mvf_alleles, ))))
            nentry += 1
            if nentry == args.line_buffer:
                mvf.write_entries(mvfentries, encoded=True)
                mvfentries = []
                nentry = 0
    # except Exception as exception:
    # Flush any remaining buffered entries.
    if mvfentries:
        mvf.write_entries(mvfentries)
        mvfentries = []
    return ''