def calc_sample_coverage(args):
    """Tally, per contig, the sites at which each sample has a character
    other than 'X', 'x' or '-'.

    One row per contig (a 'contig' column plus one column per selected
    sample) is written through ``OutputFile`` to ``args.out``.

    Arguments:
        args: parsed options; reads ``mvf``, ``out``, ``sample_indices``,
              ``sample_labels``, ``contig_ids`` and ``contig_labels``.

    Returns:
        An empty string.
    """
    mvf = MultiVariantFile(args.mvf, 'read')

    # Work out which sample columns are tallied.
    if args.sample_indices is not None:
        sample_indices = [
            int(token) for token in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    sample_labels = mvf.get_sample_ids(indices=sample_indices)

    # Work out which contigs are scanned (None is passed through as-is).
    contig_indices = None
    if args.contig_ids is not None:
        contig_indices = mvf.get_contig_indices(
            args.contig_ids[0].split(","))
    elif args.contig_labels is not None:
        contig_indices = mvf.get_contig_indices(
            labels=args.contig_labels[0].split(","))

    coverage = {}
    entries = mvf.iterentries(
        contig_indices=contig_indices, subset=sample_indices, decode=True)
    for contig, _, allelesets in entries:
        if contig not in coverage:
            row = dict.fromkeys(sample_labels, 0)
            row['contig'] = contig
            coverage[contig] = row
        row = coverage[contig]
        # NOTE(review): the character is read by position within the
        # subset, but the label is looked up by the original sample
        # index -- confirm this is right when a sample subset is given.
        for column, sample_index in enumerate(sample_indices):
            row[sample_labels[sample_index]] += int(
                allelesets[0][column] not in 'Xx-')

    headers = ["contig"]
    headers.extend([sample_labels[idx] for idx in sample_indices])
    outfile = OutputFile(path=args.out, headers=headers)
    for row in coverage.values():
        outfile.write_entry(row)
    return ''
def mvf2fastagene(args):
    """Write one FASTA alignment file per contig of an MVF file.

    For every contig index reported by the MVF, the selected sample
    columns are collected site by site and, if anything was collected,
    written to ``<output_dir>/<contig label>.fa`` with one record per
    sample label.

    Arguments:
        args: parsed options; reads ``mvf``, ``output_data``,
              ``output_dir``, ``sample_indices``, ``sample_labels``,
              ``choose_allele``, ``ignore_strand`` and calls
              ``args.qprint`` for progress messages.

    Returns:
        An empty string.

    Raises:
        RuntimeError: when ``--output-data`` is missing or incompatible
                      with the flavor of the MVF file.
    """
    args.qprint("Indexing MVF")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=True)
    if (mvf.flavor in ("dna", "rna") and args.output_data == "prot") or (
            mvf.flavor == "prot" and args.output_data in ("dna", "rna")):
        raise RuntimeError(
            "--output-data {} incompatiable with '{}' flavor mvf".format(
                args.output_data, mvf.flavor))
    if args.output_data is None:
        raise RuntimeError("--output-data required")
    sample_labels = mvf.get_sample_ids()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in
                          args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    args.qprint("Beginning Entries.")
    if not os.path.exists(args.output_dir):
        args.qprint("Output Directory Created: {}".format(args.output_dir))
        os.mkdir(args.output_dir)
    else:
        args.qprint("Output Directory Exists Already: {}".format(
            args.output_dir))

    # The flavor/output combination is fixed for the whole run, so decide
    # once which kind of character is emitted per site.
    if mvf.flavor == 'dna':
        mode = 'dna'
    elif mvf.flavor in ('codon', 'prot') and args.output_data == 'prot':
        mode = 'first'
    elif mvf.flavor == 'codon' and args.output_data == 'dna':
        mode = 'codon'
    else:
        mode = None
    random_allele = args.choose_allele == 'random1'
    strand_aware = (mvf.flavor == 'codon' and
                    args.output_data in ('dna', 'prot'))

    for targetcontig in mvf.get_contig_indices():
        contiglabel = mvf.get_contig_labels(indices=targetcontig)[0]
        args.qprint("Reading Contig {}: {}".format(
            targetcontig, contiglabel))
        write_buffer = {label: [] for label in sample_labels}
        data_in_buffer = False
        for _, _, allelesets in mvf.itercontigentries(
                targetcontig, decode=True):
            # NOTE(review): sample_labels is the full label list while
            # sample_indices may be a subset; zip pairs them by position.
            # Confirm the pairing is intended when a subset is selected.
            for col, label in zip(sample_indices, sample_labels):
                if mode == 'dna':
                    base = allelesets[0][col]
                    write_buffer[label].append('N' if base == 'X' else base)
                elif mode == 'first':
                    write_buffer[label].append(allelesets[0][col])
                elif mode == 'codon':
                    codon = []
                    for x in (1, 2, 3):
                        base = allelesets[x][col]
                        if base == 'X':
                            base = 'N'
                        elif (random_allele and
                              base in MLIB.validchars['dnaambig23']):
                            # Resolve an ambiguity code to one nucleotide.
                            base = MLIB.randomnuc(base)
                        codon.append(base)
                    write_buffer[label].append(''.join(codon))
                else:
                    continue
                data_in_buffer = True
        if not data_in_buffer:
            continue
        args.qprint("Writing Align")
        # Codon-flavor contigs on the minus strand are written with their
        # site order reversed unless strand handling is switched off.
        reverse_sites = (
            strand_aware and
            mvf.contig_data[targetcontig].get('strand', '+') == '-' and
            args.ignore_strand is False)
        outpath = os.path.join(args.output_dir, contiglabel + ".fa")
        with open(outpath, 'w') as outfile:
            for label in write_buffer:
                sites = write_buffer[label]
                if reverse_sites:
                    sites = sites[::-1]
                outfile.write(">{}\n{}\n".format(label, ''.join(sites)))
            # The original appended "\b" (a backspace control character)
            # here. Every record already ends with a newline, so nothing
            # further is written.
    return ''
def mvf2fasta(args):
    """Convert the requested regions of an MVF file to a single FASTA file.

    Sequence for each sample label is streamed into its own temporary
    file (header line first), and the temporary files are then
    concatenated into ``args.out``.

    Arguments:
        args: parsed options; reads ``mvf``, ``out``, ``output_data``,
              ``regions``, ``sample_indices``, ``sample_labels``,
              ``label_type``, ``gene_mode``, ``buffer`` and calls
              ``args.qprint`` for progress messages.

    Returns:
        An empty string.

    Raises:
        RuntimeError: when ``--output-data`` is incompatible with the
                      flavor of the MVF file.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    if (mvf.flavor in ("dna", "rna") and args.output_data == "prot") or (
            mvf.flavor == "prot" and args.output_data in ("dna", "rna")):
        raise RuntimeError(
            "--output-data {} incompatiable with '{}' flavor mvf".format(
                args.output_data, mvf.flavor))
    regions, max_region_coord, regionlabel = parse_regions_arg(
        args.regions, mvf.contig_data)
    # NOTE(review): sample_labels is the full label list, but it is later
    # zipped with sample_indices, which may be a subset -- confirm the
    # pairing is intended when a subset is selected.
    sample_labels = mvf.get_sample_ids()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in
                          args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # Contig most recently found to be outside / past the regions.
    skipcontig = None
    # One temporary text file per sample label, opened read/write so it
    # can be rewound and copied into the final output.
    tmp_files = dict(
        (fname, tempfile.NamedTemporaryFile(mode='w+', prefix=fname))
        for fname in sample_labels)
    # Tracks whether the ">" header line was already written per label.
    labelwritten = dict.fromkeys(sample_labels, False)
    # Per-label list of codon strings, used only in gene mode.
    write_buffer = {}
    current_contig = None
    data_written = False
    args.qprint("Regions determined. Reading entries.")
    for contig, pos, allelesets in mvf.iterentries(
            contig_indices=mvf.get_contig_indices(
                ids=list(max_region_coord.keys())),
            decode=True):
        # NOTE(review): this stores the result of get_contig_indices(),
        # while current_contig is later compared against and replaced by
        # the contig value yielded by iterentries -- confirm the two are
        # comparable.
        if current_contig is None:
            current_contig = mvf.get_contig_indices(ids=contig)
        if contig == skipcontig:
            continue
        # Skip the rest of a contig once past its largest requested
        # coordinate (None means no upper bound).
        if (contig not in max_region_coord) or (
                max_region_coord[contig] is not None and
                pos > max_region_coord[contig]):
            skipcontig = contig[:]
            continue
        # Keep the site only if it falls in at least one region; a None
        # start or stop leaves that side unbounded.
        inregion = False
        for rcontig, rstart, rstop, _ in regions[contig]:
            if contig == rcontig:
                if rstart is None or pos >= rstart:
                    if rstop is None or pos <= rstop:
                        inregion = True
                        break
        if inregion is False:
            continue
        for col, label in zip(sample_indices, sample_labels):
            if not labelwritten[label]:
                # NOTE(review): xlabel is only bound for 'long'/'short';
                # any other label_type would fail on the write below.
                if args.label_type == 'long':
                    xlabel = "{} region={}".format(label, regionlabel)
                elif args.label_type == 'short':
                    xlabel = "{}".format(label)
                tmp_files[label].write(">{}\n".format(xlabel))
                labelwritten[label] = True
            if mvf.flavor == 'dna':
                # 'X' is written as 'N'.
                tmp_files[label].write(
                    "N" if allelesets[0][col] == 'X'
                    else allelesets[0][col])
                data_written = True
            elif mvf.flavor in ('codon', 'prot') and (
                    args.output_data == 'prot'):
                tmp_files[label].write(allelesets[0][col])
                data_written = True
            elif mvf.flavor == 'codon' and args.output_data == 'dna':
                # Allelesets 1-3 supply the three codon characters.
                codon = [
                    "N" if allelesets[x][col] == 'X' else
                    allelesets[x][col]
                    for x in (1, 2, 3)
                ]
                if not args.gene_mode:
                    tmp_files[label].write(''.join(codon))
                    data_written = True
                else:
                    # Gene mode: codons are buffered per contig so that a
                    # contig whose 'strand' metadata is '-' can be
                    # written in reverse codon order.
                    # NOTE(review): on a contig change the codon appended
                    # below goes into the buffer that was just flushed
                    # and appears to be discarded when write_buffer is
                    # reset after this loop -- confirm.
                    if contig != current_contig:
                        if mvf.metadata['contigs'][current_contig].get(
                                'strand', "+") == '-':
                            write_buffer[label] = write_buffer[label][::-1]
                        tmp_files[label].write(
                            ''.join(write_buffer[label]))
                        data_written = True
                    if label not in write_buffer:
                        write_buffer[label] = []
                    write_buffer[label].append(''.join(codon))
        if args.gene_mode and current_contig != contig:
            write_buffer = {}
            current_contig = contig[:]
    # Flush whatever is still buffered for the final contig.
    if write_buffer:
        for label in write_buffer:
            if mvf.metadata['contigs'][current_contig].get(
                    'strand', "+") == '-':
                write_buffer[label] = write_buffer[label][::-1]
            tmp_files[label].write(''.join(write_buffer[label]))
            data_written = True
        write_buffer = {}
    if data_written is False:
        print("ERROR NO DATA WRITTEN")
    # Concatenate the per-sample files, args.buffer characters at a
    # time, ending each sample with a newline.
    with open(args.out, 'w') as outfile:
        for filehandler in tmp_files.values():
            filehandler.seek(0, 0)
            buff = filehandler.read(args.buffer)
            while buff:
                outfile.write(buff)
                buff = filehandler.read(args.buffer)
            outfile.write("\n")
            filehandler.close()
    return ''
def calc_character_count(args):
    """Count matching characters per sample along contigs, by window.

    For every window the number of sites whose character is in
    ``args.base_match`` (".match"), the number whose character is in
    ``args.base_total`` (".total") and their ratio (".prop", 0 when the
    total is 0) are reported per sample and written through
    ``OutputFile`` to ``args.out``.  A character set of None counts
    every site.

    Arguments:
        args: parsed options; reads ``mvf``, ``out``, ``base_match``,
              ``base_total``, ``mincoverage``, ``windowsize``,
              ``sample_indices``, ``sample_labels``, ``contig_ids`` and
              ``contig_labels``.  ``base_match`` and ``base_total`` are
              replaced in place by their expanded character sets.

    Returns:
        An empty string.
    """
    mvf = MultiVariantFile(args.mvf, 'read')

    # Shorthand words for character sets, paired with the
    # MLIB.validchars key each one expands to.
    special_words = (
        ('dna', 'dna'),
        ('dnaambig2', 'dna+ambig2'),
        ('dnaambig3', 'dna+ambig3'),
        ('dnaambigall', 'dna+ambigall'),
        ('prot', 'amino'),
    )

    def proc_special_word(argx):
        """Expand a shorthand word; anything else is returned as given."""
        for word, charset_key in special_words:
            if argx == word:
                return MLIB.validchars[charset_key]
        return argx

    args.base_match = proc_special_word(args.base_match)
    args.base_total = proc_special_word(args.base_total)
    base_match = args.base_match
    base_total = args.base_total

    # Set up sample indices
    if args.sample_indices is not None:
        sample_indices = [
            int(token) for token in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    sample_labels = mvf.get_sample_ids(indices=sample_indices)

    # Set up contig ids
    contig_indices = None
    if args.contig_ids is not None:
        contig_indices = mvf.get_contig_indices(
            ids=args.contig_ids[0].split(","))
    elif args.contig_labels is not None:
        contig_indices = mvf.get_contig_indices(
            labels=args.contig_labels[0].split(","))

    def fresh_counts():
        """Return a zeroed per-sample counter dictionary."""
        return dict.fromkeys([sample_labels[i] for i in sample_indices], 0)

    def summarize(contig_name, window_start, matches, totals,
                  shared_match, shared_total):
        """Build the output row for one finished window."""
        row = {'contig': contig_name, 'position': window_start}
        for label in matches:
            n_match = matches[label] + shared_match
            n_total = totals[label] + shared_total
            row[label + '.match'] = n_match
            row[label + '.total'] = n_total
            if n_total > 0:
                row[label + '.prop'] = float(n_match) / float(n_total)
            else:
                row[label + '.prop'] = 0
        return row

    windows = {}
    window_order = []
    match_counts = fresh_counts()
    total_counts = fresh_counts()
    # Single-character entries are tallied once and added to every sample.
    all_match = 0
    all_total = 0
    current_contig = None
    current_position = 0
    data_in_buffer = False

    for contig, pos, allelesets in mvf.iterentries(
            decode=False, contig_indices=contig_indices):
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, allelesets[0]) is False:
            continue
        # Establish first contig and the window holding the first site
        if current_contig is None:
            current_contig = contig[:]
            if args.windowsize > 0:
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
        window_key = (current_contig, current_position)
        if same_window(window_key, (contig, pos), args.windowsize):
            site = allelesets[0]
            if len(site) == 1:
                if base_match is None or site in base_match:
                    all_match += 1
                if base_total is None or site in base_total:
                    all_total += 1
            else:
                decoded = mvf.decode(site)
                for i in sample_indices:
                    label = sample_labels[i]
                    if base_match is None or decoded[i] in base_match:
                        match_counts[label] += 1
                    if base_total is None or decoded[i] in base_total:
                        total_counts[label] += 1
            data_in_buffer = True
            continue
        # The site left the current window: record the window and reset.
        # NOTE(review): the site that closes a window is not tallied
        # itself -- confirm this is intended.
        windows[window_key] = summarize(
            current_contig, current_position, match_counts, total_counts,
            all_match, all_total)
        window_order.append(window_key)
        if contig != current_contig:
            current_contig = contig[:]
            current_position = 0
        elif args.windowsize != -1:
            current_position += args.windowsize
        match_counts = fresh_counts()
        total_counts = fresh_counts()
        all_total = 0
        all_match = 0
        data_in_buffer = False

    # Record the last, still-open window.
    if data_in_buffer:
        window_key = (current_contig, current_position)
        windows[window_key] = summarize(
            current_contig, current_position, match_counts, total_counts,
            all_match, all_total)
        window_order.append(window_key)

    # WRITE OUTPUT
    headers = ['contig', 'position']
    for label in sample_labels:
        for suffix in ('.match', '.total', '.prop'):
            headers.append(label + suffix)
    outfile = OutputFile(path=args.out, headers=headers)
    for window_key in window_order:
        outfile.write_entry(windows[window_key])
    return ''
def translate_mvf(args):
    """Translate a DNA-flavor MVF file into a new MVF file.

    Without a GFF, every contig is treated as already aligned in coding
    frame and is passed to ``iter_codons``.  With a GFF, each gene's CDS
    coordinates are spliced together (reversed and complemented on the
    '-' strand), split into codons and translated.

    Arguments:
        args: parsed options; reads ``mvf``, ``out``, ``overwrite``,
              ``gff``, ``output_data``, ``command_string``,
              ``line_buffer``, ``retain_contigs``, ``retain_coords`` and
              calls ``args.qprint`` for progress messages.

    Returns:
        An empty string.

    Raises:
        RuntimeError: when the input MVF is not flavor 'dna'.
    """
    args.qprint("Running TranslateMVF")
    if args.gff:
        args.qprint("Reading and Indexing MVF.")
    else:
        args.qprint("Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    gff_genes = None
    gene_order = None
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff_genes, gene_order = parse_gff_exome(args)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    if args.gff:
        # Contig metadata is replaced by the GFF genes (minus their CDS
        # coordinate lists).  Without a GFF these names do not exist, so
        # the headers copied from the input are kept.
        outmvf.contig_data = dict(
            (i, dict((key, value)
                     for (key, value) in gff_genes[gene].items()
                     if key != 'cds'))
            for (i, gene) in enumerate(gene_order))
        outmvf.contig_indices = list(range(len(gene_order)))
        outmvf.contig_ids = [gff_genes[x]['id'] for x in gene_order]
        outmvf.contig_labels = [gff_genes[x]['label'] for x in gene_order]
    outmvf.flavor = args.output_data
    outmvf.metadata.notes.append(args.command_string)
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")

    def prealigned_entries(sitebuffer, codec, contig, position):
        """Yield output entries for one contig of in-frame sites."""
        for _, amino_acids, alleles in iter_codons(sitebuffer, codec):
            # Codons that are entirely gap/unknown are dropped.
            if all(x in '-X' for x in amino_acids):
                continue
            if args.output_data == 'protein':
                yield (contig, position, (amino_acids,))
            else:
                yield (contig, position, (
                    amino_acids, alleles[0], alleles[1], alleles[2]))

    def translate_codon(alleles, reverse_strand):
        """Return (amino_acids, alleles) for one codon of MVF strings."""
        if all(len(x) == 1 for x in alleles):
            # Every position is a single character: translate directly.
            if reverse_strand:
                alleles = tuple(MLIB.complement_bases[x] for x in alleles)
            return translate_single_codon(''.join(alleles)), alleles
        if reverse_strand:
            decoded = tuple(
                tuple(MLIB.complement_bases[y] for y in mvf.decode(x))
                for x in alleles)
            alleles = tuple(outmvf.encode(''.join(x)) for x in decoded)
        else:
            decoded = tuple(mvf.decode(x) for x in alleles)
        # zip(*decoded) regroups the three positions into one codon per
        # sample before translating.
        amino_acids = outmvf.encode(''.join(
            translate_single_codon(''.join(x)) for x in zip(*decoded)))
        return amino_acids, alleles

    entrybuffer = []
    nentry = 0
    pos = None
    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
                continue
            # A new contig started: translate the finished one.
            # NOTE(review): entries carry the position of the current
            # input entry, not the one reported by iter_codons -- confirm.
            for entry in prealigned_entries(
                    inputbuffer, mvf, current_contig, pos):
                entrybuffer.append(entry)
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
            inputbuffer = [(pos, allelesets)]
            current_contig = contigid[:]
        if inputbuffer:
            # NOTE(review): the last contig is decoded through outmvf
            # while earlier ones use mvf -- kept as in the original.
            for entry in prealigned_entries(
                    inputbuffer, outmvf, current_contig, pos):
                entrybuffer.append(entry)
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        # igene doubles as the output contig key: the header's
        # contig_data is keyed by the same enumeration of gene_order.
        for igene, gene in enumerate(gene_order):
            xcontiglabel = gff_genes[gene]['contig']
            xcontig = mvf.get_contig_indices(labels=xcontiglabel)
            if xcontig is None:
                print("Warning: contig {} not found".format(xcontiglabel))
                # Skip the gene rather than continue with no contig.
                continue
            xcontigid = mvf.get_contig_ids(indices=xcontig)[0]
            min_gene_coord = gff_genes[gene]['cds'][0][0]
            max_gene_coord = gff_genes[gene]['cds'][-1][1]
            if not igene % 100:
                args.qprint("Processing gene {} on {}".format(
                    gene, xcontiglabel))
            # Collect the encoded sites lying within the gene's span.
            mvf_entries = {}
            for _, pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                if pos < min_gene_coord:
                    continue
                if pos > max_gene_coord:
                    break
                mvf_entries[pos] = allelesets[0]
            reverse_strand = gff_genes[gene]['strand'] == '-'
            # Splice the CDS intervals (inclusive ends) into one list.
            coords = []
            for elem in gff_genes[gene]['cds']:
                coords.extend(range(elem[0], elem[1] + 1))
            if reverse_strand:
                coords = coords[::-1]
            out_contig = xcontigid if args.retain_contigs else igene
            for codoncoord in range(0, len(coords), 3):
                # Sites absent from the MVF are treated as gaps, and a
                # short final codon is padded with gaps.
                alleles = tuple(
                    mvf_entries.get(x, '-')
                    for x in coords[codoncoord:codoncoord + 3])
                if len(alleles) < 3:
                    alleles = alleles + ('-',) * (3 - len(alleles))
                amino_acids, alleles = translate_codon(
                    alleles, reverse_strand)
                out_pos = (coords[codoncoord] if args.retain_coords
                           else codoncoord)
                if args.output_data == 'protein':
                    entrybuffer.append(
                        (out_contig, out_pos, (amino_acids,)))
                elif args.output_data == 'codon':
                    entrybuffer.append((out_contig, out_pos, (
                        amino_acids, alleles[0], alleles[1], alleles[2])))
                elif args.output_data == 'dna':
                    # One entry per real (unpadded) base of the codon.
                    last = min(codoncoord + 3, len(coords))
                    for j, elem in enumerate(range(codoncoord, last)):
                        entrybuffer.append((
                            out_contig,
                            coords[elem] if args.retain_coords
                            else elem + 1,
                            (alleles[j],)))
                nentry += 1
                if nentry >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''
def legacy_translate_mvf(args):
    """Translate a DNA-flavor MVF file using the legacy GFF handling.

    Without a GFF, every contig is treated as already aligned in coding
    frame and is passed to ``iter_codons``.  With a GFF, the codon
    coordinate tuples returned by ``parse_gff_legacy_translate`` are
    looked up per contig and translated one codon at a time.

    Arguments:
        args: parsed options; reads ``mvf``, ``out``, ``overwrite``,
              ``gff``, ``parent_gene_pattern``, ``output_data``,
              ``line_buffer``, ``verbose`` and calls ``args.qprint`` for
              progress messages.

    Returns:
        An empty string.

    Raises:
        RuntimeError: when the input MVF is not flavor 'dna'.
    """
    args.qprint("Running LegacyTranslateMVF")
    if args.gff:
        args.qprint("Reading and Indexing MVF.")
    else:
        args.qprint("Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    gff = None
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff = parse_gff_legacy_translate(
            args.gff, args, parent_gene_pattern=args.parent_gene_pattern)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    outmvf.flavor = args.output_data
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")

    def prealigned_entries(sitebuffer, codec, contig, position):
        """Yield output entries for one contig of in-frame sites."""
        for _, amino_acids, alleles in iter_codons(sitebuffer, codec):
            # Codons that are entirely gap/unknown are dropped.
            if all(x in '-X' for x in amino_acids):
                continue
            if args.output_data == 'protein':
                yield (contig, position, (amino_acids,))
            else:
                yield (contig, position, (
                    amino_acids, alleles[0], alleles[1], alleles[2]))

    def translate_codon(alleles, reverse_strand):
        """Return (amino_acids, alleles) for one codon of MVF strings."""
        if all(len(x) == 1 for x in alleles):
            # Every position is a single character: translate directly.
            if reverse_strand:
                alleles = tuple(MLIB.complement_bases[x] for x in alleles)
            return translate_single_codon(''.join(alleles)), alleles
        if reverse_strand:
            decoded = tuple(
                tuple(MLIB.complement_bases[y] for y in mvf.decode(x))
                for x in alleles)
            alleles = tuple(outmvf.encode(''.join(x)) for x in decoded)
        else:
            decoded = tuple(mvf.decode(x) for x in alleles)
        # zip(*decoded) regroups the three positions into one codon per
        # sample before translating.
        amino_acids = outmvf.encode(''.join(
            translate_single_codon(''.join(x)) for x in zip(*decoded)))
        return amino_acids, alleles

    entrybuffer = []
    nentry = 0
    pos = None
    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
                continue
            # A new contig started: translate the finished one.
            # NOTE(review): entries carry the position of the current
            # input entry, not the one reported by iter_codons -- confirm.
            for entry in prealigned_entries(
                    inputbuffer, mvf, current_contig, pos):
                entrybuffer.append(entry)
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
            inputbuffer = [(pos, allelesets)]
            current_contig = contigid[:]
        if inputbuffer:
            # NOTE(review): the last contig is decoded through outmvf
            # while earlier ones use mvf -- kept as in the original.
            for entry in prealigned_entries(
                    inputbuffer, outmvf, current_contig, pos):
                entrybuffer.append(entry)
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        args.qprint("Indexing GFF gene names.")
        for xcontig in outmvf.get_contig_indices():
            xcontiglabel = outmvf.get_contig_labels(indices=xcontig)[0]
            xcontigid = outmvf.get_contig_ids(indices=xcontig)[0]
            if xcontiglabel not in gff:
                if args.verbose:
                    print(
                        ("No entries in GFF, "
                         "skipping contig: index:{} id:{} label:{}").format(
                             xcontig, xcontigid, xcontiglabel))
                continue
            if not xcontig % 100:
                args.qprint("Processing contig: {} {}".format(
                    xcontigid, xcontiglabel))
            # Load the contig's encoded sites for lookup by position.
            mvf_entries = {}
            for _, pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                mvf_entries[pos] = allelesets[0]
            for coords in sorted(gff[xcontiglabel]):
                # coords[0:3] are the three codon positions and
                # coords[3] is the strand; '-' codons are read backwards.
                reverse_strand = coords[3] == '-'
                codon_positions = (coords[2::-1] if reverse_strand
                                   else coords[0:3])
                # Sites absent from the MVF are treated as gaps.
                alleles = tuple(
                    mvf_entries.get(x, '-') for x in codon_positions)
                amino_acids, alleles = translate_codon(
                    alleles, reverse_strand)
                # Entries use the contig id in both output modes, like
                # the non-GFF path and the header copied from the input.
                # The original tagged protein entries with the contig
                # index instead.
                if args.output_data == 'protein':
                    entrybuffer.append(
                        (xcontigid, coords[0], (amino_acids,)))
                else:
                    entrybuffer.append((
                        xcontigid, coords[0], (
                            amino_acids, alleles[0], alleles[1],
                            alleles[2])))
                nentry += 1
                if nentry >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''