def calc_sample_coverage(args):
    """Tally, per contig, how many called characters each sample has.

    A site counts toward a sample's coverage unless its character is one
    of ``X``, ``x`` or ``-``.  One row per contig is written to
    ``args.out`` with a ``contig`` column followed by one column per
    selected sample.

    Args:
        args: parsed command-line namespace; reads ``mvf``, ``out``,
            ``sample_indices``, ``sample_labels``, ``contig_ids`` and
            ``contig_labels``.

    Returns:
        An empty string.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    coverage = {}
    all_labels = mvf.get_sample_labels()

    # Sample columns: explicit indices take precedence over labels.
    if args.sample_indices is not None:
        sample_indices = [
            int(token) for token in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        wanted_labels = args.sample_labels[0].split(",")
        sample_indices = mvf.get_sample_indices(labels=wanted_labels)
    else:
        sample_indices = mvf.get_sample_indices()

    # Contigs: explicit ids take precedence over labels.
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        wanted_contigs = args.contig_labels[0].split(",")
        contig_ids = mvf.get_contig_ids(labels=wanted_contigs)
    else:
        contig_ids = mvf.get_contig_ids()

    entries = mvf.iterentries(
        contigs=contig_ids, subset=sample_indices, decode=True)
    for contig, _, allelesets in entries:
        row = coverage.get(contig)
        if row is None:
            # Every sample label starts at zero, selected or not.
            row = dict.fromkeys(all_labels, 0)
            row['contig'] = contig
            coverage[contig] = row
        # With subset=..., column j of the decoded site belongs to the
        # j-th selected sample.
        for column, sample_index in enumerate(sample_indices):
            is_called = allelesets[0][column] not in 'Xx-'
            row[all_labels[sample_index]] += int(is_called)

    header = ["contig"]
    header.extend(all_labels[index] for index in sample_indices)
    outfile = OutputFile(path=args.out, headers=header)
    for row in coverage.values():
        outfile.write_entry(row)
    return ''
def mvf2phy(args):
    """Convert an MVF alignment into a sequential PHYLIP file.

    Each selected sample is streamed into its own temporary file (padded
    label first, then the sequence).  The temporary files are then
    concatenated into ``args.out`` behind a ``<nsamples> <length>`` header.
    When ``args.partition`` is true, ``<out>.part`` receives one
    ``PREFIX, label = start-end`` line per contig.

    Args:
        args: parsed command-line namespace; reads ``mvf``, ``out``,
            ``output_data``, ``regions``, ``sample_indices``,
            ``sample_labels``, ``buffer``, ``partition``, ``label_type``
            (``'long'`` pads labels to 100 characters, ``'short'`` to 20)
            and ``temp_dir``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if ``args.output_data`` is incompatible with the
            flavor of the input MVF.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    if (mvf.flavor in ("dna", "rna") and args.output_data == "prot") or (
            mvf.flavor == "prot" and args.output_data in ("dna", "rna")):
        raise RuntimeError(
            "--output-data {} incompatible with '{}' flavor mvf".format(
                args.output_data, mvf.flavor))
    max_region_coord = dict((x, None) for x in mvf.get_contig_ids())
    if args.regions is not None:
        _, max_region_coord, _ = parse_regions_arg(
            args.regions, mvf.get_contig_ids())
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    sample_labels = mvf.get_sample_ids(indices=sample_indices)
    skipcontig = ''
    # Temporary files are created inside args.temp_dir so that the path
    # they are opened with is the same path they are later removed from.
    if not os.path.exists(args.temp_dir):
        os.mkdir(args.temp_dir)
    tmp_files = dict(
        (label,
         open(os.path.join(
             args.temp_dir,
             "{}-{}.tmp".format(label, randint(1000000, 9999999))),
              'w+', args.buffer))
        for label in sample_labels)
    labelwritten = dict.fromkeys(sample_labels, False)
    current_contig_id = None
    current_contig_start = 1
    current_contig_end = 1
    if args.partition is True:
        partprefix = "PROT" if args.output_data == "prot" else "DNA"
        partitionfile = open("{}.part".format(args.out), 'w')
    for contig, _, allelesets in mvf.iterentries(
            contig_ids=(mvf.get_contig_ids() if args.regions is None
                        # list() works for both a dict (keys) and a list
                        # (copy); slicing a dict would raise TypeError.
                        else list(max_region_coord)),
            decode=True):
        if contig == skipcontig:
            continue
        if contig not in max_region_coord:
            skipcontig = contig[:]
            continue
        if current_contig_id is None:
            current_contig_id = contig[:]
        elif contig != current_contig_id:
            if args.partition is True:
                if current_contig_end > current_contig_start:
                    partitionfile.write("{}, {} = {}-{}\n".format(
                        partprefix,
                        mvf.get_contig_labels(ids=current_contig_id),
                        current_contig_start, current_contig_end - 1))
            current_contig_id = contig[:]
            # reset start as one position after end of last
            current_contig_start = current_contig_end
            current_contig_end = current_contig_end + 1
        for col, label in zip(sample_indices, sample_labels):
            if not labelwritten[label]:
                if args.label_type == 'long':
                    tmp_files[label].write("{}{}".format(
                        label[:100], " " * (100 - len(label[:100]))))
                elif args.label_type == 'short':
                    tmp_files[label].write("{}{}".format(
                        label[:20], " " * (20 - len(label[:20]))))
                labelwritten[label] = True
            if mvf.flavor == 'dna':
                base = allelesets[0][col]
                tmp_files[label].write('N' if base == 'X' else base)
                # Advance the running coordinate once per site, not once
                # per sample.
                if label == sample_labels[0]:
                    current_contig_end += 1
            elif ((mvf.flavor == 'codon' and args.output_data == 'prot') or
                  (mvf.flavor == 'prot')):
                tmp_files[label].write(allelesets[0][col])
                if label == sample_labels[0]:
                    current_contig_end += 1
            elif mvf.flavor == 'codon':
                codon = [
                    "N" if allelesets[x][col] == 'X' else allelesets[x][col]
                    for x in (1, 2, 3)
                ]
                tmp_files[label].write(''.join(codon))
                if label == sample_labels[0]:
                    current_contig_end += 3
    totalseqlen = 0
    with open(args.out, 'w') as outfile:
        for nfile, filehandler in enumerate(tmp_files.values()):
            if nfile == 0:
                # read first file to establish sequence length for the
                # phylip header
                filehandler.seek(0, 0)
                buff = filehandler.read(args.buffer)
                while buff != '':
                    if " " in buff:
                        totalseqlen += len(buff.strip().split(" ")[-1])
                    else:
                        totalseqlen += len(buff.strip())
                    buff = filehandler.read(args.buffer)
                outfile.write("{} {}\n".format(
                    len(sample_labels), totalseqlen))
            filehandler.seek(0, 0)
            buff = filehandler.read(args.buffer)
            while buff != '':
                outfile.write(buff)
                buff = filehandler.read(args.buffer)
            outfile.write("\n")
            filehandler.close()
            # filehandler.name already includes args.temp_dir.
            os.remove(filehandler.name)
    if args.partition is True:
        if current_contig_end > current_contig_start:
            partitionfile.write("{}, {} = {}-{}\n".format(
                partprefix, mvf.get_contig_labels(ids=current_contig_id),
                current_contig_start, current_contig_end - 1))
        partitionfile.close()
    return ''
def vcf2mvf(args=None):
    """Convert a VCF file into an MVF file.

    Builds the output contig table (honoring user-supplied
    ``id;vcflabel;mvflabel`` translations in ``args.contig_ids``), sets up
    the sample columns (reference first, then the VCF samples, then any
    ``alleles_from`` labels), writes the MVF header, and streams the VCF
    records into the MVF in blocks of ``args.line_buffer`` entries.

    Args:
        args: parsed command-line namespace; reads, among others, ``vcf``,
            ``out``, ``field_sep``, ``overwrite``, ``contig_ids``,
            ``skip_contig_label_check``, ``ref_label``, ``alleles_from``,
            ``sample_replace``, ``filter_nonref_empty``, ``out_flavor``
            and ``line_buffer``.  ``args.fieldsep`` and (when given)
            ``args.alleles_from`` are overwritten in place.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if a user-supplied contig id or label is not unique,
            or a contig id collides with another contig's label.
        AssertionError: if a VCF contig label named in ``args.contig_ids``
            is not present in the VCF header.
    """
    # DBLSPACE is two spaces; a single space would duplicate SPACE.
    sepchars = dict([("TAB", "\t"), ("SPACE", " "), ("DBLSPACE", "  "),
                     ("COMMA", ","), ("MIXED", None)])
    args.fieldsep = sepchars[args.field_sep]
    # ESTABLISH VCF
    args.qprint("Opening input VCF: {}".format(args.vcf))
    vcf = VariantCallFile(args.vcf, indexcontigs=(not args.no_autoindex))
    # ESTABLISH MVF
    args.qprint("Establishing output MVF: {}".format(args.out))
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    mvf.notes.append(args.command_string)
    mvf.metadata['mvfversion'] = args.versionx
    # PROCESS CONTIG INFO
    args.qprint("Processing VCF headers.")
    vcfcontigs = vcf.metadata['contigs'].copy()
    args.qprint("{} contigs found.".format(len(vcfcontigs)))
    # Maps VCF contig label -> [MVF contig id/index, MVF contig label].
    contig_translate = {}
    if args.contig_ids:
        for cid, cvcf, cmvf in (x.split(';') for x in args.contig_ids):
            try:
                cid = int(cid)
            except ValueError:
                pass
            assert cvcf in [vcfcontigs[x]['label'] for x in vcfcontigs]
            for vid in vcfcontigs:
                if vcfcontigs[vid]['label'] == cvcf:
                    contig_translate[cvcf] = [cid, cmvf]
                    if cid in mvf.metadata['contigs']:
                        raise RuntimeError(
                            'Contig id {} is not unique'.format(cid))
                    mvf.metadata['contigs'][cid] = vcfcontigs[vid].copy()
                    if cmvf in mvf.get_contig_labels():
                        raise RuntimeError(
                            'Contig label {} is not unique'.format(cmvf))
                    mvf.metadata['contigs'][cid]['label'] = cmvf[:]
    mvf.reset_max_contig()
    mvf.max_contig_index -= 1
    args.qprint("Processing contigs.")
    static_contig_ids = list(mvf.get_contig_ids())
    for vcid in vcfcontigs:
        vlabel = vcfcontigs[vcid]['label']
        if vlabel not in static_contig_ids:
            newindex = mvf.get_next_contig_index()
            # Short or integer labels are reused as the contig id;
            # anything else gets the numeric index as its id.
            if ((is_int(vlabel) or len(vlabel) < 3) and
                    vlabel not in static_contig_ids):
                newid = vlabel[:]
            else:
                newid = str(newindex)
            mvf.contig_indices.append(newindex)
            mvf.contig_ids.append(newid)
            mvf.contig_data[newindex] = vcfcontigs[vcid].copy()
            static_contig_ids.append(newid)
            contig_translate[vlabel] = [newindex, vlabel]
    mvf.reset_max_contig()
    new_contigs = [(x, mvf.contig_data[x]['label'])
                   for x in mvf.contig_indices]
    if args.skip_contig_label_check is False:
        args.qprint("Checking contigs for label/id overlap errors.")
        xids = [x[0] for x in new_contigs]
        xlabels = [x[1] for x in new_contigs]
        xintersect = set(xids).intersection(xlabels)
        # Only run the pairwise scan when some id equals some label.
        if xintersect:
            for i, (newid, newlabel) in enumerate(new_contigs):
                if i % 100 == 0:
                    args.qprint("{} contigs processed".format(i))
                if newid in xlabels[:i] or newid in xlabels[i + 1:]:
                    raise RuntimeError("Error contig id {} is the same as"
                                       " the label for another contig"
                                       " ({})".format(
                                           newid, xlabels.index(newid)))
                if newlabel in xids[:i] or newlabel in xids[i + 1:]:
                    raise RuntimeError("Error contig label {} is the same"
                                       "as the id for another contig"
                                       "({})".format(
                                           newlabel, xids.index(newlabel)))
    # PROCESS SAMPLE INFO
    args.qprint("Processing samples.")
    samplelabels = [args.ref_label] + vcf.metadata['samples'][:]
    if args.alleles_from:
        args.alleles_from = args.alleles_from.split(':')
        samplelabels += args.alleles_from
    if args.sample_replace:
        newsample = [
            x.split(':') if ':' in x else (x, x)
            for x in args.sample_replace
        ]
        unmatched = list(enumerate(samplelabels))
        for old, new in newsample:
            # False means "no match"; 0 is a valid matched position, hence
            # the identity test below rather than a truthiness test.
            labelmatched = False
            for j, (i, name) in enumerate(unmatched):
                if old in name:
                    samplelabels[i] = new
                    labelmatched = j
                    break
            if labelmatched is not False:
                del unmatched[labelmatched]
    mvf.sample_indices = list(range(len(samplelabels)))
    mvf.sample_ids = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.sample_data[i] = {'id': label}
    mvf.metadata['ncol'] = len(mvf.sample_ids)
    mvf.max_sample_index = len(mvf.sample_ids)
    mvf.metadata['sourceformat'] = vcf.metadata['sourceformat']
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    with_quality = args.out_flavor in ('dnaqual', )
    args.qprint("Processing VCF entries.")
    for vcfrecord in vcf.iterentries(args):
        mvfstring = ''.join(vcfrecord['genotypes'])
        if args.filter_nonref_empty is True:
            # Skip sites where every non-reference column is empty.
            if all(x in 'Xx-?' for x in mvfstring[1:]):
                continue
        mvf_alleles = encode_mvfstring(mvfstring)
        if with_quality:
            qual_alleles = encode_mvfstring(''.join(vcfrecord['qscores']))
        if mvf_alleles:
            mvfentries.append(
                (contig_translate.get(vcfrecord['contig'])[0],
                 vcfrecord['coord'],
                 ((mvf_alleles, qual_alleles) if with_quality
                  else (mvf_alleles, ))))
            nentry += 1
            if nentry == args.line_buffer:
                mvf.write_entries(mvfentries, encoded=True)
                mvfentries = []
                nentry = 0
    if mvfentries:
        # The leftover entries were built exactly like the buffered ones
        # (already encoded), so they must be written the same way.
        mvf.write_entries(mvfentries, encoded=True)
        mvfentries = []
    return ''
def _store_character_window(data, data_characters, contig, position):
    """Record each sample's per-character counts for one window in *data*."""
    for sample_index, counts in data_characters.items():
        entry = {'contig': contig, 'position': position}
        entry.update(counts)
        data[sample_index][(contig, position)] = entry


def calc_all_character_count_per_sample(args):
    """Count every character seen per sample, per contig window.

    For each selected sample, and for each window of ``args.windowsize``
    along each contig, tallies how often each character occurs.  Output
    is written to ``args.out`` as one ``#<sample>`` line followed by that
    sample's window rows, with ``'0'`` for characters a window never saw.

    Args:
        args: parsed command-line namespace; reads ``mvf``, ``out``,
            ``sample_indices``, ``sample_labels``, ``contig_ids``,
            ``contig_labels``, ``mincoverage`` and ``windowsize``.

    Returns:
        An empty string.
    """
    args.qprint("Running CalcAllCharacterCountPerSample")
    mvf = MultiVariantFile(args.mvf, 'read')
    current_contig = None
    current_position = 0
    data_in_buffer = False
    # Set up sample indices
    sample_labels = mvf.get_sample_ids()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = None
    data = dict((i, {}) for i in sample_indices)
    # Keyed by sample index (not list position) so that a subset such as
    # [2, 5] cannot index out of range or hit the wrong sample.
    data_characters = dict((i, {}) for i in sample_indices)
    for contig, pos, allelesets in mvf.iterentries(decode=False,
                                                   contig_ids=contig_ids):
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, allelesets[0]) is False:
            continue
        if current_contig is None:
            current_contig = contig[:]
            if args.windowsize > 0:
                # Advance to the window that holds the first site.
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            args.qprint("Processing contig {}".format(current_contig))
            _store_character_window(
                data, data_characters, current_contig, current_position)
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            else:
                current_position += (0 if args.windowsize == -1
                                     else args.windowsize)
            data_characters = dict((i, {}) for i in sample_indices)
            data_in_buffer = False
        alleles = allelesets[0]
        if len(alleles) == 1:
            # A one-character allele string applies to every sample.
            for i in sample_indices:
                data_characters[i][alleles[0]] = (
                    data_characters[i].get(alleles[0], 0) + 1)
        else:
            alleles = mvf.decode(alleles)
            for i in sample_indices:
                data_characters[i][alleles[i]] = (
                    data_characters[i].get(alleles[i], 0) + 1)
        data_in_buffer = True
    if data_in_buffer:
        _store_character_window(
            data, data_characters, current_contig, current_position)
    # WRITE OUTPUT
    all_chars = set()
    for windows in data.values():
        for entry in windows.values():
            all_chars.update(
                x for x in entry if x not in ('contig', 'position'))
    headers = ['contig', 'position']
    headers.extend(sorted(all_chars))
    outfile = OutputFile(path=args.out, headers=headers)
    for sampleid in sample_indices:
        outfile.write("#{}\n".format(sample_labels[sampleid]))
        # Windows are emitted in the order they were recorded.
        for entry in data[sampleid].values():
            outfile.write_entry(entry, defaultvalue='0')
    return ''
def calc_dstat_combinations(args):
    """Calculate D-statistics for every sample trio against each outgroup.

    For each site restricted to ``ATGC`` with exactly two alleles, where
    the outgroup allele also occurs among the three samples, the site is
    classified by which single sample shares the outgroup allele:
    sample0 -> ``abba``, sample1 -> ``baba``, sample2 -> ``bbaa``.  Counts
    are kept per contig.  ``D`` is computed from the two smaller counts
    via ``zerodiv``.

    Args:
        args: parsed command-line namespace; reads ``mvf``, ``out``,
            ``outgroup_indices``, ``outgroup_labels``, ``sample_indices``,
            ``sample_labels``, ``contig_ids`` and ``contig_labels``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if no outgroup is specified, or if the sample and
            outgroup column lists overlap.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    sample_labels = mvf.get_sample_ids()
    if args.outgroup_indices is not None:
        outgroup_indices = [
            int(x) for x in args.outgroup_indices[0].split(",")
        ]
    elif args.outgroup_labels is not None:
        outgroup_indices = mvf.get_sample_indices(
            ids=args.outgroup_labels[0].split(","))
    else:
        # Without this branch the name would be unbound below.
        raise RuntimeError("At least one outgroup must be specified.")
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        # Default to every contig; None cannot be used for the membership
        # test or the header loop below.
        contig_ids = mvf.get_contig_ids()
    if any(x in outgroup_indices for x in sample_indices):
        raise RuntimeError("Sample and Outgroup column lists cannot overlap.")
    wanted_contigs = set(contig_ids)
    trios = list(combinations(sample_indices, 3))
    # Bitmask of "sample shares the outgroup allele" -> count slot in
    # [ABBA, BABA, BBAA].  Masks with more than one bit set are skipped.
    pattern_slot = {1: 0, 2: 1, 4: 2}
    for contig, _, allelesets in mvf:
        if contig not in wanted_contigs:
            continue
        alleles = mvf.decode(allelesets[0])
        for i, j, k in trios:
            for outgroup in outgroup_indices:
                subset = [alleles[x] for x in (i, j, k, outgroup)]
                if any(x not in 'ATGC' for x in subset):
                    continue
                if subset[-1] not in subset[:3]:
                    continue
                if len(set(subset)) != 2:
                    continue
                val = (1 * (subset[0] == subset[3]) +
                       2 * (subset[1] == subset[3]) +
                       4 * (subset[2] == subset[3]))
                if val not in pattern_slot:
                    continue
                tetrad = (i, j, k, outgroup)
                counts = data.setdefault(tetrad, {}).setdefault(
                    contig, [0, 0, 0])
                counts[pattern_slot[val]] += 1
    # WRITE OUTPUT
    headers = ['sample0', 'sample1', 'sample2', "outgroup"]
    for xcontig in contig_ids:
        headers.extend([
            '{}:abba'.format(xcontig), '{}:baba'.format(xcontig),
            '{}:bbaa'.format(xcontig), '{}:D'.format(xcontig)
        ])
    outfile = OutputFile(path=args.out, headers=headers)
    for i, j, k in trios:
        for outgroup in outgroup_indices:
            tetrad = (i, j, k, outgroup)
            if tetrad not in data:
                continue
            entry = dict(('sample{}'.format(n), sample_labels[x])
                         for n, x in enumerate(tetrad[:3]))
            entry['outgroup'] = sample_labels[outgroup]
            for contig in contig_ids:
                if contig not in data[tetrad]:
                    entry.update(dict.fromkeys([
                        '{}:abba'.format(contig), '{}:baba'.format(contig),
                        '{}:bbaa'.format(contig), '{}:D'.format(contig)
                    ], '0'))
                else:
                    abba, baba, bbaa = data[tetrad][contig]
                    # D contrasts the two less frequent patterns.
                    if abba > baba and abba > bbaa:
                        dstat = zerodiv(baba - bbaa, baba + bbaa)
                    elif baba > bbaa and baba > abba:
                        dstat = zerodiv(abba - bbaa, abba + bbaa)
                    else:
                        dstat = zerodiv(abba - baba, abba + baba)
                    entry.update([('{}:abba'.format(contig), abba),
                                  ('{}:baba'.format(contig), baba),
                                  ('{}:bbaa'.format(contig), bbaa),
                                  ('{}:D'.format(contig), dstat)])
            outfile.write_entry(entry)
    return ''
def plot_chromoplot(args):
    """Draw one chromoplot image per sample trio and outgroup.

    Every combination of three selected samples is paired with each
    outgroup to form a quartet.  Each site of a quartet is classified as
    ``'gap'``, ``'ambiguous'``, ``'nonpolar'``, ``'triallelic'`` or a
    3-bit integer (bit ``2**(3 - j)`` set when sample ``j`` differs from
    the outgroup) and added to a ``Chromoplot``.  The plot is written to
    ``<out_prefix><labels joined by '_'>.png``.

    Args:
        args: parsed command-line namespace; reads ``mvf``, ``colors``,
            ``quiet``, ``contig_ids``, ``contig_labels``,
            ``sample_indices``, ``sample_labels``, ``outgroup_indices``,
            ``outgroup_labels``, ``out_prefix``, ``windowsize``,
            ``majority``, ``info_track``, ``yscale``, ``xscale`` and
            ``plot_type``.

    Returns:
        An empty string.

    Raises:
        AssertionError: if fewer than three samples or no outgroup are
            selected.
    """
    pallette = Pallette()
    if args.colors is not None:
        pallette.basecolors = args.colors
    # Establish MVF and parse chromosome information
    if args.quiet is False:
        print("Reading MVF...")
    mvf = MultiVariantFile(args.mvf, 'read')
    if args.quiet is False:
        print("Parsing headers...")
    if args.contig_ids is not None:
        contigids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contigids = mvf.get_contig_ids(labels=args.contig_labels[0].split(","))
    else:
        contigids = mvf.get_contig_ids()
    if args.quiet is False:
        print("Plotting chromoplot for contigs: {}".format(
            ",".join(contigids)))
    sample_labels = mvf.get_sample_labels()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    assert len(sample_indices) >= 3
    if args.outgroup_indices is not None:
        outgroup_indices = [
            int(x) for x in args.outgroup_indices[0].split(",")
        ]
    elif args.outgroup_labels is not None:
        outgroup_indices = mvf.get_sample_indices(
            labels=args.outgroup_labels[0].split(","))
    else:
        # Keep the name bound so the check below reports the problem
        # instead of a NameError.
        outgroup_indices = []
    assert len(outgroup_indices) >= 1
    quartets = [(x, y, z, outgroup)
                for x, y, z in combinations(sample_indices, 3)
                for outgroup in outgroup_indices]
    out_prefix = args.out_prefix if args.out_prefix is not None else ''
    # Begin iterations
    for quartet_indices in quartets:
        quartet_labels = [sample_labels[x] for x in quartet_indices]
        if args.quiet is False:
            print("Beginning quartet {}".format(",".join(quartet_labels)))
        params = {
            'contigs': [[
                contigid, mvf.metadata['contigs'][contigid]['label'],
                mvf.metadata['contigs'][contigid]['length']
            ] for contigid in contigids],
            # The prefix is concatenated with the quartet labels so each
            # quartet gets its own file instead of overwriting the last.
            'outpath': out_prefix + '_'.join(quartet_labels) + ".png",
            'labels': quartet_labels,
            'indices': quartet_indices,
            'windowsize': args.windowsize,
            'majority': args.majority,
            'infotrack': args.info_track,
            'yscale': args.yscale,
            'xscale': args.xscale,
            'quiet': args.quiet,
            'plottype': args.plot_type
        }
        chromoplot = Chromoplot(params=params, pallette=pallette)
        current_contig = ''
        for contig, pos, allelesets in mvf.iterentries(
                subset=quartet_indices, decode=True, contigs=contigids):
            if contig != current_contig:
                if args.quiet is False:
                    print("Starting contig {}".format(contig))
                current_contig = contig[:]
            alleles = allelesets[0]
            if '-' in alleles:
                site_code = 'gap'
            elif any(x not in 'ATGCatgc' for x in alleles):
                site_code = 'ambiguous'
            elif alleles[3] not in alleles[:3]:
                site_code = 'nonpolar'
            elif len(set(alleles)) > 2:
                site_code = 'triallelic'
            else:
                site_code = sum(
                    2**(3 - j) * (alleles[j] != alleles[3])
                    for j in range(3))
            chromoplot.add_data(str(contig), int(pos // args.windowsize),
                                site_code)
        if not args.quiet:
            print("Writing image...")
        chromoplot.plot_chromoplot()
        if not args.quiet:
            print("Writing log...")
        chromoplot.write_total_log()
    return ''
def _character_window_entry(contig, position, match_counts, total_counts,
                            all_match, all_total):
    """Build one output row of match/total/proportion values per sample."""
    entry = {'contig': contig, 'position': position}
    for label in match_counts:
        matched = match_counts[label] + all_match
        total = total_counts[label] + all_total
        entry[label + '.match'] = matched
        entry[label + '.total'] = total
        entry[label + '.prop'] = (
            (float(matched) / float(total)) if total > 0 else 0)
    return entry


def calc_character_count(args):
    """Count matching characters per sample along contig windows.

    For each window of ``args.windowsize`` on each contig, and for each
    selected sample, counts the sites whose character is in
    ``args.base_match`` (``.match``) and in ``args.base_total``
    (``.total``), and reports their ratio (``.prop``).  A value of
    ``None`` for either set matches every character.  The special words
    ``dna``, ``dnaambig2``, ``dnaambig3``, ``dnaambigall`` and ``prot``
    are replaced by the corresponding ``MLIB.validchars`` sets.

    Args:
        args: parsed command-line namespace; reads ``mvf``, ``out``,
            ``base_match``, ``base_total``, ``sample_indices``,
            ``sample_labels``, ``contig_ids``, ``contig_labels``,
            ``mincoverage`` and ``windowsize``.  ``args.base_match`` and
            ``args.base_total`` are overwritten with the resolved sets.

    Returns:
        An empty string.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    current_contig = None
    current_position = 0
    all_match = 0
    all_total = 0
    data_in_buffer = False

    # Set up base matching from special words
    special_words = (('dna', 'dna'), ('dnaambig2', 'dna+ambig2'),
                     ('dnaambig3', 'dna+ambig3'),
                     ('dnaambigall', 'dna+ambigall'), ('prot', 'amino'))

    def proc_special_word(argx):
        for word, charset in special_words:
            if argx == word:
                return MLIB.validchars[charset]
        return argx

    args.base_match = proc_special_word(args.base_match)
    args.base_total = proc_special_word(args.base_total)
    # Set up sample indices
    sample_labels = mvf.get_sample_labels()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = None
    selected_labels = [sample_labels[i] for i in sample_indices]
    match_counts = dict.fromkeys(selected_labels, 0)
    total_counts = dict.fromkeys(selected_labels, 0)
    for contig, pos, allelesets in mvf.iterentries(decode=False,
                                                   contigs=contig_ids):
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, allelesets[0]) is False:
            continue
        # Establish first contig
        if current_contig is None:
            current_contig = contig[:]
            if args.windowsize > 0:
                # Advance to the window that holds the first site.
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            data[(current_contig, current_position)] = (
                _character_window_entry(
                    current_contig, current_position, match_counts,
                    total_counts, all_match, all_total))
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            else:
                current_position += (0 if args.windowsize == -1
                                     else args.windowsize)
            match_counts = dict.fromkeys(selected_labels, 0)
            total_counts = dict.fromkeys(selected_labels, 0)
            all_total = 0
            all_match = 0
            data_in_buffer = False
        # The site that closes a window is the first site of the next
        # one, so it is counted here rather than skipped.
        alleles = allelesets[0]
        if len(alleles) == 1:
            # A one-character allele string applies to every sample, so
            # it is tracked in the shared all_* counters.
            if args.base_match is None or alleles in args.base_match:
                all_match += 1
            if args.base_total is None or alleles in args.base_total:
                all_total += 1
        else:
            alleles = mvf.decode(alleles)
            for i in sample_indices:
                label = sample_labels[i]
                if (args.base_match is None or
                        alleles[i] in args.base_match):
                    match_counts[label] += 1
                if (args.base_total is None or
                        alleles[i] in args.base_total):
                    total_counts[label] += 1
        data_in_buffer = True
    if data_in_buffer:
        data[(current_contig, current_position)] = _character_window_entry(
            current_contig, current_position, match_counts, total_counts,
            all_match, all_total)
    # WRITE OUTPUT
    headers = ['contig', 'position']
    for label in sample_labels:
        headers.extend([label + x for x in ('.match', '.total', '.prop')])
    outfile = OutputFile(path=args.out, headers=headers)
    sorted_entries = sorted([(data[k]['contig'], data[k]['position'], k)
                             for k in data])
    for _, _, k in sorted_entries:
        outfile.write_entry(data[k])
    return ''
def translate_mvf(args):
    """Translate a DNA-flavor MVF into protein, codon or DNA-by-gene MVF.

    Without a GFF, each contig is treated as already aligned in coding
    frame and is passed through ``iter_codons``.  With a GFF, each gene
    in ``gene_order`` becomes an output contig: its CDS coordinates are
    concatenated (reversed and complemented on the ``-`` strand), read in
    steps of three, and translated.  Positions absent from the MVF are
    treated as ``'-'``.

    Args:
        args: parsed command-line namespace; reads ``mvf``, ``out``,
            ``gff``, ``overwrite``, ``output_data`` (``'protein'``,
            ``'codon'`` or ``'dna'``), ``command_string``,
            ``line_buffer``, ``retain_contigs`` and ``retain_coords``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if the input MVF is not flavor ``dna``.
    """
    args.qprint("Running TranslateMVF")
    if args.gff:
        args.qprint("Reading and Indexing MVF.")
    else:
        args.qprint("Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff_genes, gene_order = parse_gff_exome(args)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    if args.gff:
        # Only a GFF run has genes to use as output contigs; without one
        # the contig table copied from the input is kept.
        outmvf.contig_data = dict(
            (i, dict((y, z) for (y, z) in gff_genes[x].items()
                     if y not in ('cds', )))
            for (i, x) in enumerate(gene_order))
        outmvf.contig_indices = list(range(len(gene_order)))
        outmvf.contig_ids = [gff_genes[x]['id'] for x in gene_order]
        outmvf.contig_labels = [gff_genes[x]['label'] for x in gene_order]
    outmvf.flavor = args.output_data
    outmvf.metadata.notes.append(args.command_string)
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")
    entrybuffer = []
    nentry = 0
    pos = None
    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
            else:
                # A new contig began: translate everything buffered for
                # the previous one.
                for _, amino_acids, alleles in iter_codons(
                        inputbuffer, mvf):
                    if all([x in '-X' for x in amino_acids]):
                        continue
                    if args.output_data == 'protein':
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids,)))
                    else:
                        entrybuffer.append((
                            current_contig, pos, (
                                amino_acids, alleles[0],
                                alleles[1], alleles[2])))
                    nentry += 1
                    if nentry == args.line_buffer:
                        outmvf.write_entries(entrybuffer)
                        entrybuffer = []
                        nentry = 0
                inputbuffer = [(pos, allelesets)]
                current_contig = contigid[:]
        if inputbuffer:
            for _, amino_acids, alleles in iter_codons(
                    inputbuffer, outmvf):
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append(
                        (current_contig, pos, (amino_acids,)))
                else:
                    entrybuffer.append((
                        current_contig, pos, (
                            amino_acids, alleles[0],
                            alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        running_gene_index = -1
        for igene, gene in enumerate(gene_order):
            # Incremented before any skip so the index stays aligned with
            # the gene's position in the output contig table.
            running_gene_index += 1
            xcontiglabel = gff_genes[gene]['contig']
            xcontig = mvf.get_contig_indices(
                labels=gff_genes[gene]['contig'])
            if xcontig is None:
                print("Warning: contig {} not found".format(
                    gff_genes[gene]['contig']))
                # Nothing can be read for a contig the MVF lacks.
                continue
            xcontigid = mvf.get_contig_ids(indices=xcontig)[0]
            min_gene_coord = gff_genes[gene]['cds'][0][0]
            max_gene_coord = gff_genes[gene]['cds'][-1][1]
            mvf_entries = {}
            if not igene % 100:
                args.qprint("Processing gene {} on {}".format(
                    gene, xcontiglabel))
            for contigid, pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                if pos < min_gene_coord:
                    continue
                if pos > max_gene_coord:
                    break
                mvf_entries[pos] = allelesets[0]
            reverse_strand = gff_genes[gene]['strand'] == '-'
            coords = []
            for elem in gff_genes[gene]['cds']:
                coords.extend(list(range(elem[0], elem[1] + 1)))
            if reverse_strand:
                coords = coords[::-1]
            for codoncoord in range(0, len(coords), 3):
                alleles = tuple(mvf_entries.get(x, '-')
                                for x in coords[codoncoord:codoncoord + 3])
                if len(alleles) < 3:
                    # Pad a trailing partial codon with gaps.
                    alleles = tuple(list(alleles) +
                                    ['-'] * (3 - len(alleles)))
                if all(len(x) == 1 for x in alleles):
                    # All three positions are one character each, so the
                    # codon is translated once without decoding.
                    if reverse_strand:
                        alleles = tuple(
                            MLIB.complement_bases[x] for x in alleles)
                    decoded_alleles = alleles
                    amino_acids = translate_single_codon(''.join(alleles))
                else:
                    if reverse_strand is True:
                        decoded_alleles = tuple(
                            tuple(MLIB.complement_bases[y]
                                  for y in mvf.decode(x))
                            for x in alleles)
                        alleles = tuple(outmvf.encode(''.join(x))
                                        for x in decoded_alleles)
                    else:
                        decoded_alleles = tuple(
                            mvf.decode(x) for x in alleles)
                    # zip(*...) regroups the three decoded positions into
                    # one codon per sample.
                    amino_acids = tuple(
                        translate_single_codon(''.join(x))
                        for x in zip(*decoded_alleles))
                    amino_acids = outmvf.encode(''.join(amino_acids))
                out_contig = (xcontigid if args.retain_contigs
                              else running_gene_index)
                if args.output_data == 'protein':
                    entrybuffer.append((
                        out_contig,
                        (coords[codoncoord] if args.retain_coords
                         else codoncoord),
                        (amino_acids, )))
                elif args.output_data == 'codon':
                    entrybuffer.append((
                        out_contig,
                        (coords[codoncoord] if args.retain_coords
                         else codoncoord),
                        (amino_acids, alleles[0], alleles[1], alleles[2])))
                elif args.output_data == 'dna':
                    for j, elem in enumerate(
                            range(codoncoord,
                                  min(codoncoord + 3, len(coords)))):
                        entrybuffer.append((
                            out_contig,
                            (coords[elem] if args.retain_coords
                             else elem + 1),
                            (alleles[j], )))
                nentry += 1
                if nentry >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''
def legacy_translate_mvf(args):
    """Translate a DNA-flavor MVF using the legacy GFF codon table.

    Without a GFF, each contig is treated as already aligned in coding
    frame and is passed through ``iter_codons``.  With a GFF, the codon
    coordinate tuples returned by ``parse_gff_legacy_translate`` for each
    contig label are looked up in the MVF (missing positions become
    ``'-'``), complemented on the ``-`` strand, and translated.

    Args:
        args: parsed command-line namespace; reads ``mvf``, ``out``,
            ``gff``, ``overwrite``, ``output_data``,
            ``parent_gene_pattern``, ``line_buffer`` and ``verbose``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if the input MVF is not flavor ``dna``.
    """
    args.qprint("Running LegacyTranslateMVF")
    if args.gff:
        args.qprint("Reading and Indexing MVF.")
    else:
        args.qprint("Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff = parse_gff_legacy_translate(
            args.gff, args,
            parent_gene_pattern=args.parent_gene_pattern)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    outmvf.flavor = args.output_data
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")
    # Entries are accumulated here and written in blocks of
    # args.line_buffer; nentry counts entries since the last write.
    entrybuffer = []
    nentry = 0
    pos = None
    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
            else:
                # A new contig began: translate everything buffered for
                # the previous one.
                # NOTE(review): the position yielded by iter_codons is
                # discarded and `pos` (the first site of the new contig)
                # is written for every codon - confirm this is intended.
                for _, amino_acids, alleles in iter_codons(
                        inputbuffer, mvf):
                    # Skip codons that are gap/unknown in every sample.
                    if all([x in '-X' for x in amino_acids]):
                        continue
                    if args.output_data == 'protein':
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids,)))
                    else:
                        entrybuffer.append((
                            current_contig, pos, (
                                amino_acids, alleles[0],
                                alleles[1], alleles[2])))
                    nentry += 1
                    if nentry == args.line_buffer:
                        outmvf.write_entries(entrybuffer)
                        entrybuffer = []
                        nentry = 0
                inputbuffer = [(pos, allelesets)]
                current_contig = contigid[:]
        # Translate whatever is still buffered for the final contig.
        if inputbuffer:
            for _, amino_acids, alleles in iter_codons(
                    inputbuffer, outmvf):
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append(
                        (current_contig, pos, (amino_acids,)))
                else:
                    entrybuffer.append((
                        current_contig, pos, (
                            amino_acids, alleles[0],
                            alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        args.qprint("Indexing GFF gene names.")
        for xcontig in outmvf.get_contig_indices():
            # Maps position -> encoded allele string for this contig.
            mvf_entries = {}
            xcontiglabel = outmvf.get_contig_labels(indices=xcontig)[0]
            xcontigid = outmvf.get_contig_ids(indices=xcontig)[0]
            if xcontiglabel not in gff:
                if args.verbose:
                    print(
                        ("No entries in GFF, "
                         "skipping contig: index:{} id:{} label:{}").format(
                             xcontig, xcontigid, xcontiglabel))
                continue
            if not xcontig % 100:
                args.qprint("Processing contig: {} {}".format(
                    xcontigid, xcontiglabel))
            for contigid, pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                mvf_entries[pos] = allelesets[0]
            # Each `coords` item carries three codon positions followed
            # by the strand symbol at index 3.
            for coords in sorted(gff[xcontiglabel]):
                reverse_strand = coords[3] == '-'
                # On the reverse strand the three positions are read in
                # reverse order.
                alleles = (tuple(mvf_entries.get(x, '-')
                                 for x in coords[2::-1])
                           if reverse_strand is True
                           else tuple(mvf_entries.get(x, '-')
                                      for x in coords[0:3]))
                if all(len(x) == 1 for x in alleles):
                    # All three positions are one character each, so the
                    # codon is translated once without decoding.
                    if reverse_strand:
                        alleles = tuple(
                            MLIB.complement_bases[x] for x in alleles)
                    decoded_alleles = alleles
                    amino_acids = translate_single_codon(''.join(alleles))
                else:
                    if reverse_strand is True:
                        decoded_alleles = tuple(tuple(MLIB.complement_bases[y]
                                                      for y in mvf.decode(x))
                                                for x in alleles)
                        alleles = tuple(outmvf.encode(''.join(x))
                                        for x in decoded_alleles)
                    else:
                        decoded_alleles = tuple(mvf.decode(x) for x in alleles)
                    # zip(*...) regroups the three decoded positions into
                    # one codon per sample.
                    amino_acids = tuple(translate_single_codon(''.join(x))
                                        for x in zip(*decoded_alleles))
                    amino_acids = outmvf.encode(''.join(amino_acids))
                # NOTE(review): the protein branch writes the contig
                # index while the other branch writes the contig id -
                # confirm which one write_entries expects.
                if args.output_data == 'protein':
                    entrybuffer.append((xcontig, coords[0], (amino_acids,)))
                else:
                    entrybuffer.append((
                        xcontigid, coords[0], (
                            amino_acids, alleles[0], alleles[1], alleles[2])))
                nentry += 1
                if nentry >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    # Flush whatever remains after either branch.
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''
def infer_window_tree(args):
    """Infer a tree with RAxML for each window of an MVF file.

    Entries are streamed from ``args.mvf``.  Whenever ``same_window``
    reports that an entry no longer belongs to the current window, the
    alleles gathered so far are passed to ``WindowData.maketree_raxml``
    and the resulting entry is written to ``args.out``.  After the last
    entry, a ranked tally of the topologies seen is written to
    ``args.out + '.counts'``.

    Side effects: creates ``args.temp_dir`` when it does not exist and
    changes the process working directory to it.

    Args:
        args: parsed command-line namespace.  Contigs are selected with
            ``contig_ids``/``contig_labels`` and samples with
            ``sample_indices``/``sample_labels`` (first list item,
            comma-separated); the remaining attributes read here are
            forwarded to the tree builder through ``params``.

    Returns:
        An empty string.
    """
    # ESTABLISH FILE OBJECTS
    mvf = MultiVariantFile(args.mvf, 'read')
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = mvf.get_contig_ids()
    treefile = OutputFile(
        args.out,
        headers=['contig', 'windowstart', 'windowsize', 'tree',
                 'topology', 'topoid',
                 # 'templabels', ### USED FOR DEBUGGING ###
                 'alignlength', 'aligndepth', 'status'])
    topofile = OutputFile(args.out + '.counts',
                          headers=['rank', 'topology', 'count'])
    # Set up sample indices
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # exist_ok avoids the check-then-create race and allows nested paths.
    os.makedirs(args.temp_dir, exist_ok=True)
    os.chdir(args.temp_dir)
    # SETUP PARAMS
    main_labels = mvf.get_sample_labels(sample_indices)
    if args.choose_allele in ['randomboth', 'majorminor']:
        # Two haplotypes per sample: all 'a' labels first, then all 'b'.
        main_labels = [label + x for x in ['a', 'b']
                       for label in main_labels]
    params = {
        'outgroups': args.raxml_outgroups or [],
        'rootwith': (args.root_with.split(',')
                     if args.root_with is not None else None),
        'minsites': args.min_sites,
        'minseqcoverage': args.min_seq_coverage,
        'mindepth': args.min_depth,
        'raxmlpath': args.raxml_path,
        'raxmlopts': args.raxml_opts,
        'duplicateseq': args.duplicate_seq,
        'model': args.raxml_model,
        'bootstrap': args.bootstrap,
        'windowsize': args.windowsize,
        'chooseallele': args.choose_allele,
        'tempdir': args.temp_dir,
        'tempprefix': args.temp_prefix,
    }
    # WINDOW START ITERATION
    verify_raxml(params)
    topo_ids = {}
    topo_counts = {}

    def record_tree(entry):
        """Write one window result; return True if a tree was produced."""
        if entry['status'] != 'ok':
            if args.output_empty:
                treefile.write_entry(entry)
            return False
        topo = entry["topology"]
        topo_counts[topo] = topo_counts.get(topo, 0) + 1
        if topo not in topo_ids:
            # Ids are handed out sequentially from 0, so the next free id
            # is simply the number of topologies already registered.
            topo_ids[topo] = len(topo_ids)
        entry["topoid"] = topo_ids[topo]
        treefile.write_entry(entry)
        return True

    current_contig = ''
    current_position = 0
    window_data = None
    skip_contig = False
    for contig, pos, allelesets in mvf.iterentries(
            contigs=contig_ids, subset=sample_indices, quiet=args.quiet,
            no_invariant=False, no_ambig=False, no_gap=False, decode=True):
        if skip_contig and current_contig == contig:
            continue
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            skip_contig = False
            if window_data is not None:
                tree_ok = record_tree(window_data.maketree_raxml(params))
                # NOTE(review): a rejected window causes the remaining
                # entries of the contig of the *current* entry to be
                # skipped, which may be a different contig from the one
                # whose window failed -- confirm this is intended.
                if not tree_ok and args.windowsize != -1:
                    skip_contig = True
            if contig == current_contig and args.windowsize > 0:
                current_position += args.windowsize
            else:
                current_position = 0
            current_contig = contig
            window_data = WindowData(
                window_params={
                    'contigname': (
                        mvf.get_contig_labels(ids=current_contig)
                        if args.output_contig_labels is not None
                        else current_contig),
                    "windowstart": ('-1' if args.windowsize == -1
                                    else current_position),
                    "windowsize": args.windowsize,
                    "labels": main_labels[:],
                })
        # ADD ALLELES
        if mvf.flavor == 'dna':
            if args.choose_allele != 'none':
                allelesets[0] = hapsplit(allelesets[0], args.choose_allele)
            window_data.append_alleles(allelesets[0],
                                       mindepth=args.min_depth)
    # LAST LOOP: flush the window that was still open at end of input.
    if window_data:
        record_tree(window_data.maketree_raxml(params))
        window_data = None
    # END WINDOW ITERATION
    ranked = sorted(((count, topo) for topo, count in topo_counts.items()),
                    reverse=True)
    for rank, (count, topo) in enumerate(ranked):
        topofile.write_entry({'rank': rank, 'count': count,
                              'topology': topo})
    return ''
def infer_window_tree(args):
    """Infer a tree for each window of an MVF file.

    Entries are streamed from ``args.mvf``.  Whenever ``same_window``
    reports that an entry no longer belongs to the current window, the
    alleles gathered so far are passed to ``WindowData.maketree_raxml``
    and the resulting entry is written to ``args.out``.  After the last
    entry, a ranked tally of the topologies seen is written to
    ``args.out + '.counts'``.

    ``args.windowsize`` of -1 is reported as "whole contig" and 0 as
    "whole genome"; in the latter case a single window named 'all' is
    opened before the first entry is read.

    Side effects: creates ``args.temp_dir`` when it does not exist and
    changes the process working directory to it.  Progress messages go
    through ``args.qprint``.

    Args:
        args: parsed command-line namespace.  Contigs are selected with
            ``contig_ids``/``contig_labels`` and samples with
            ``sample_indices``/``sample_labels`` (first list item,
            comma-separated); the remaining attributes read here are
            forwarded to the tree builder through ``params``.

    Returns:
        An empty string.
    """
    args.qprint("Running InferTree")
    # ESTABLISH FILE OBJECTS
    mvf = MultiVariantFile(args.mvf, 'read')
    args.qprint("Read MVF File: {}".format(args.mvf))
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = mvf.get_contig_ids()
    treefile = OutputFile(
        args.out,
        headers=['contig', 'windowstart', 'windowsize', 'tree',
                 'topology', 'topoid',
                 # 'templabels', ### USED FOR DEBUGGING ###
                 'alignlength', 'aligndepth', 'status'])
    topofile = OutputFile(args.out + '.counts',
                          headers=['rank', 'topology', 'count'])
    # Set up sample indices
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # exist_ok avoids the check-then-create race and allows nested paths.
    os.makedirs(args.temp_dir, exist_ok=True)
    os.chdir(args.temp_dir)
    # SETUP PARAMS
    main_labels = mvf.get_sample_ids(sample_indices)
    if args.choose_allele in ['randomboth', 'majorminor']:
        # Two haplotypes per sample: all 'a' labels first, then all 'b'.
        main_labels = [label + x for x in ['a', 'b']
                       for label in main_labels]
    params = {
        'bootstrap': args.bootstrap,
        'chooseallele': args.choose_allele,
        'collapse_polytomies': args.collapse_polytomies,
        'duplicateseq': args.duplicate_seq,
        'engine': args.engine,
        'engine_path': args.engine_path,
        'engine_opts': args.engine_opts,
        'mindepth': args.min_depth,
        'minseqcoverage': args.min_seq_coverage,
        'minsites': args.min_sites,
        'model': args.model,
        'outgroups': args.raxml_outgroups,
        'rootwith': (args.root_with.split(',')
                     if args.root_with is not None else []),
        'tempdir': args.temp_dir,
        'tempprefix': args.temp_prefix,
        'windowsize': args.windowsize,
    }
    # DEFAULT MODEL (left as None for any other engine)
    if params['model'] is None:
        if params['engine'] == 'raxml':
            params['model'] = 'GTRGAMMA'
        elif params['engine'] == 'raxml-ng':
            params['model'] = "GTR+G"
    # WINDOW START ITERATION
    verify_raxml(params)
    args.qprint("RAxML Found.")
    topo_ids = {}
    topo_counts = {}

    def record_tree(entry):
        """Write one window result; return True if a tree was produced."""
        if entry['status'] != 'ok':
            if args.output_empty:
                treefile.write_entry(entry)
            return False
        topo = entry["topology"]
        topo_counts[topo] = topo_counts.get(topo, 0) + 1
        if topo not in topo_ids:
            # Ids are handed out sequentially from 0, so the next free id
            # is simply the number of topologies already registered.
            topo_ids[topo] = len(topo_ids)
        entry["topoid"] = topo_ids[topo]
        treefile.write_entry(entry)
        return True

    current_contig = None
    current_position = 0
    window_data = None
    args.qprint("Prcocessing Records")
    if args.windowsize == -1:
        windowsizename = "whole contig"
    elif args.windowsize == 0:
        windowsizename = "whole genome"
        # Only the whole-genome mode needs a window before the first
        # entry; the other modes open theirs inside the loop below.
        window_data = WindowData(window_params={
            'contigname': 'all',
            "windowstart": 0,
            "windowsize": 0,
            "labels": main_labels[:]})
    else:
        windowsizename = "window size={}".format(args.windowsize)
    # Allele-set columns that carry nucleotide data for this flavor.
    if mvf.flavor == 'dna':
        allele_columns = (0,)
    elif mvf.flavor == 'codon':
        allele_columns = (1, 2, 3)
    else:
        allele_columns = ()
    for contig, pos, allelesets in mvf.iterentries(
            contig_ids=contig_ids, subset=sample_indices,
            no_invariant=False, no_ambig=False, no_gap=False, decode=True):
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            if window_data is not None:
                args.qprint(("Making tree for {} "
                             "at contig {} position {}").format(
                                 windowsizename, current_contig,
                                 current_position))
                entry = window_data.maketree_raxml(params)
                if record_tree(entry):
                    args.qprint("Tree completed.")
                else:
                    args.qprint(
                        "TREE REJECTED with error code: {} ({})".format(
                            entry['status'], entry.get('comment', "None")))
            if contig == current_contig and args.windowsize > 0:
                current_position += args.windowsize
            else:
                current_position = 0
            current_contig = contig
            window_data = WindowData(window_params={
                'contigname': (
                    mvf.get_contig_labels(ids=current_contig)
                    if args.output_contig_labels is not None
                    else current_contig),
                "windowstart": ('-1' if args.windowsize == -1
                                else current_position),
                "windowsize": args.windowsize,
                "labels": main_labels[:]})
        # ADD ALLELES
        for column in allele_columns:
            alleles = allelesets[column]
            if args.choose_allele != 'none':
                alleles = hapsplit(alleles, args.choose_allele)
            window_data.append_alleles(alleles, mindepth=args.min_depth)
    # LAST LOOP: flush the window that was still open at end of input.
    if window_data:
        record_tree(window_data.maketree_raxml(params))
        window_data = None
    # END WINDOW ITERATION
    ranked = sorted(((count, topo) for topo, count in topo_counts.items()),
                    reverse=True)
    for rank, (count, topo) in enumerate(ranked):
        topofile.write_entry({'rank': rank, 'count': count,
                              'topology': topo})
    return ''
def translate_mvf(args):
    """Translate a DNA-flavor MVF file into amino acids.

    Without ``args.gff`` the entries of each contig are consumed in file
    order and handed to ``iter_codons``.  With ``args.gff`` the codon
    coordinates come from ``parse_gff_translate`` and minus-strand codons
    are complemented via ``MLIB.complement_bases`` before translation.
    Codons whose translation consists only of '-' and 'X' are dropped.

    Output entries hold only the amino acids when ``args.output_data`` is
    'protein'; otherwise the amino acids are followed by the three
    nucleotide allele strings.  Entries are written to ``args.out`` in
    blocks of ``args.line_buffer``.

    Args:
        args: parsed command-line namespace (reads mvf, out, overwrite,
            gff, quiet, output_data and line_buffer).

    Returns:
        An empty string.

    Raises:
        RuntimeError: if the input MVF is not flavor 'dna'.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        gff = parse_gff_translate(args.gff, args)
        if not args.quiet:
            print("gff_processed")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(mvf.metadata)
    outmvf.flavor = args.output_data
    outmvf.write_data(outmvf.get_header())
    protein_only = args.output_data == 'protein'

    def is_blank(amino_acids):
        """Return True when every character is a gap or unknown residue."""
        return all(x in '-X' for x in amino_acids)

    def make_entry(contig, coord, amino_acids, alleles):
        """Build one output entry in the layout of the output flavor."""
        if protein_only:
            return (contig, coord, (amino_acids,))
        return (contig, coord,
                (amino_acids, alleles[0], alleles[1], alleles[2]))

    def codon_entries(contig, buffered):
        """Yield the translated entries of one contig's buffered sites."""
        # NOTE(review): assumes the first item yielded by iter_codons is
        # the coordinate of the codon -- confirm against iter_codons.
        # The previous code ignored it and stamped every codon with the
        # position of whichever entry the outer loop had read last.
        for coord, amino_acids, alleles in iter_codons(buffered, mvf):
            if is_blank(amino_acids):
                continue
            yield make_entry(contig, coord, amino_acids, alleles)

    def frame_entries():
        """Yield entries for input treated as already in coding frame."""
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid
            if contigid != current_contig:
                # Contig changed: translate what was buffered for the
                # previous contig before starting the new one.
                for entry in codon_entries(current_contig, inputbuffer):
                    yield entry
                inputbuffer = []
                current_contig = contigid
            inputbuffer.append((pos, allelesets))
        if inputbuffer:
            for entry in codon_entries(current_contig, inputbuffer):
                yield entry

    def gff_entries():
        """Yield entries for the codons listed in the GFF annotation."""
        mvf_entries = {}
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            mvf_entries.setdefault(contigid, {})[pos] = allelesets[0]
        for contigname in sorted(gff):
            contigid = mvf.get_contig_ids(labels=contigname)[0]
            contig_alleles = mvf_entries.get(contigid)
            if contig_alleles is None:
                # Annotated contig without any MVF entry: nothing to
                # translate (previously a KeyError).
                continue
            for coords in sorted(gff[contigname]):
                reverse_strand = coords[3] == '-'
                # coords[2::-1] walks the three positions back to front.
                positions = coords[2::-1] if reverse_strand else coords[0:3]
                alleles = [contig_alleles.get(x, '-') for x in positions]
                if all(len(x) == 1 for x in alleles):
                    # One character per site: a single codon to translate.
                    if reverse_strand:
                        alleles = [MLIB.complement_bases[x]
                                   for x in alleles]
                    amino_acids = translate(''.join(alleles))[0]
                else:
                    if reverse_strand:
                        decoded = [[MLIB.complement_bases[y]
                                    for y in mvf.decode(x)]
                                   for x in alleles]
                        alleles = [mvf.encode(''.join(x)) for x in decoded]
                    else:
                        decoded = [mvf.decode(x) for x in alleles]
                    # Translate each sample's codon, then re-encode the
                    # per-sample residues as one MVF allele string.
                    amino_acids = mvf.encode(''.join(
                        translate(''.join(x))[0] for x in zip(*decoded)))
                if is_blank(amino_acids):
                    continue
                yield make_entry(contigid, coords[0], amino_acids, alleles)

    entrybuffer = []
    for entry in (gff_entries() if args.gff else frame_entries()):
        entrybuffer.append(entry)
        if len(entrybuffer) == args.line_buffer:
            outmvf.write_entries(entrybuffer)
            entrybuffer = []
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
    return ''
def vcf2mvf(args=None):
    """Convert a VCF file into an MVF file.

    Steps: pick the VCF field separator from ``args.field_sep``, map VCF
    contigs to MVF contig ids/labels (honouring the ``id;vcflabel;mvflabel``
    triples in ``args.contig_ids``), build the sample label list (the
    reference label first, then the VCF samples, then any labels from
    ``args.alleles_from``), apply ``args.sample_replace`` renames, write
    the MVF header and finally stream the encoded records to ``args.out``
    in blocks of ``args.line_buffer``.

    ``args`` is modified in place: ``args.fieldsep`` is set and
    ``args.alleles_from`` is replaced by its ':'-split list.

    Args:
        args: parsed command-line namespace.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if a contig label given in ``args.contig_ids`` is
            not present in the VCF, if a contig id or label is not
            unique, or if a contig id equals another contig's label.
        KeyError: if a record refers to a contig with no id mapping.
    """
    sepchars = {
        "TAB": "\t",
        "SPACE": " ",
        "DBLSPACE": " " * 2,  # was a single space, identical to SPACE
        "COMMA": ",",
        "MIXED": None,
    }
    args.fieldsep = sepchars[args.field_sep]
    # ESTABLISH VCF
    vcf = VariantCallFile(args.vcf, indexcontigs=(not args.no_autoindex))
    # ESTABLISH MVF
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # PROCESS CONTIG INFO
    vcfcontigs = vcf.metadata['contigs'].copy()
    contig_translate = {}
    if args.contig_ids:
        vcf_labels = set(vcfcontigs[x]['label'] for x in vcfcontigs)
        for cid, cvcf, cmvf in (x.split(';') for x in args.contig_ids):
            try:
                cid = int(cid)
            except ValueError:
                pass  # non-numeric ids are kept as strings
            # Explicit raise instead of assert: this validates user input
            # and must not vanish when Python runs with -O.
            if cvcf not in vcf_labels:
                raise RuntimeError(
                    'Contig label {} not found in VCF'.format(cvcf))
            for vid in vcfcontigs:
                if vcfcontigs[vid]['label'] != cvcf:
                    continue
                contig_translate[cvcf] = [cid, cmvf]
                if cid in mvf.metadata['contigs']:
                    raise RuntimeError(
                        'Contig id {} is not unique'.format(cid))
                mvf.metadata['contigs'][cid] = vcfcontigs[vid].copy()
                if cmvf in mvf.get_contig_labels():
                    raise RuntimeError(
                        'Contig label {} is not unique'.format(cmvf))
                mvf.metadata['contigs'][cid]['label'] = cmvf
        mvf.reset_max_contig_id()
    # Give every VCF contig that was not mapped explicitly an id.
    for vcid in vcfcontigs:
        vlabel = vcfcontigs[vcid]['label']
        if vlabel in mvf.get_contig_labels():
            continue
        # Integer-like or very short labels double as the id when free.
        if ((is_int(vlabel) or len(vlabel) < 3)
                and vlabel not in mvf.get_contig_ids()):
            newid = vlabel
        else:
            newid = mvf.get_next_contig_id()
        mvf.metadata['contigs'][newid] = vcfcontigs[vcid].copy()
        contig_translate[vlabel] = [newid, vlabel]
    mvf.reset_max_contig_id()
    # An id may never equal the label of a *different* contig.
    new_contigs = [(x, mvf.metadata['contigs'][x]['label'])
                   for x in mvf.metadata['contigs']]
    for i, (newid, newlabel) in enumerate(new_contigs):
        for j, (xid, xlabel) in enumerate(new_contigs):
            if i == j:
                continue
            if newid == xlabel:
                raise RuntimeError("Error contig id {} is the same as"
                                   " the label for another contig"
                                   " ({} {})".format(newid, xid, xlabel))
            if newlabel == xid:
                raise RuntimeError("Error contig label {} is the same"
                                   " as the id for another contig"
                                   " ({} {})".format(newlabel, xid, xlabel))
    # PROCESS SAMPLE INFO
    samplelabels = [args.ref_label] + vcf.metadata['samples'][:]
    if args.alleles_from:
        args.alleles_from = args.alleles_from.split(':')
        samplelabels += args.alleles_from
    if args.sample_replace:
        # "old:new" renames; a bare value maps to itself.
        newsample = [x.split(':') if ':' in x else (x, x)
                     for x in args.sample_replace]
        unmatched = list(enumerate(samplelabels))
        for old, new in newsample:
            # Substring match against labels not yet renamed; each label
            # is consumed by at most one replacement.
            for j, (i, name) in enumerate(unmatched):
                if old in name:
                    samplelabels[i] = new
                    del unmatched[j]
                    break
    mvf.metadata['labels'] = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.metadata['samples'][i] = {'label': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = vcf.metadata['sourceformat']
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    with_quality = args.out_flavor in ('dnaqual', )
    mvfentries = []
    for vcfrecord in vcf.iterentries(args):
        mvf_alleles = encode_mvfstring(''.join(vcfrecord['genotypes']))
        if not mvf_alleles:
            continue
        if with_quality:
            qual_alleles = encode_mvfstring(''.join(vcfrecord['qscores']))
            allele_columns = (mvf_alleles, qual_alleles)
        else:
            allele_columns = (mvf_alleles, )
        mvfentries.append((contig_translate[vcfrecord['contig']][0],
                           vcfrecord['coord'], allele_columns))
        if len(mvfentries) == args.line_buffer:
            mvf.write_entries(mvfentries, encoded=True)
            mvfentries = []
    if mvfentries:
        # The strings are already encoded, exactly as in the block flush
        # above, so the final partial block needs encoded=True as well.
        mvf.write_entries(mvfentries, encoded=True)
    return ''