def test_trim_below_abundance(AnyTabletype):
    hi = AnyTabletype(6)

    x = "ATGGCAGTAGCAGTGAGC"
    x_rc = screed.rc(x)
    hi.consume(x_rc[:10])

    print(len(x))
    (y, pos) = hi.trim_below_abundance(x, 0)

    assert pos == len(x) - hi.ksize() + 1
    assert x[:pos] == y
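
# For reference, screed.rc computes a plain reverse complement.  A minimal
# stand-in, for illustration only (upper-case A/C/G/T; screed's own version
# is more permissive about case and ambiguity codes):
_COMPLEMENT = str.maketrans('ACGT', 'TGCA')

def rc(seq):
    return seq.translate(_COMPLEMENT)[::-1]

assert rc('ATGGCAGTAGCAGTGAGC') == screed.rc('ATGGCAGTAGCAGTGAGC')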
import argparse
import sys

import screed

import parse_blastz2   # local helper module assumed alongside this script


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('contigfile')
    parser.add_argument('blastz_alignment')
    parser.add_argument('-l', '--min-length', type=int, default=100)
    parser.add_argument('-b', '--boundary', type=int, default=5)
    args = parser.parse_args()

    # load all contigs, plus their reverse complements, keyed by name
    sequences = {}
    for record in screed.open(args.contigfile):
        sequences[record.name] = record.sequence

        rc_name = record.name + " (reverse complement)"
        sequences[rc_name] = screed.rc(str(record.sequence))

    print('loaded %d sequences from %s' % (len(sequences), args.contigfile),
          file=sys.stderr)

    fp = open(args.blastz_alignment)
    records = parse_blastz2.parse_blastz(fp, args.min_length)
    print('loaded %d records with min length %d' % (len(records),
                                                    args.min_length),
          file=sys.stderr)

    # make things unique by subject match
    uniq_records = [(s_name, s_start, s_end)
                    for (q, s_name, s_start, s_end) in records]
    uniq_records = set(uniq_records)
    print('uniqified down to %d records' % (len(uniq_records),),
          file=sys.stderr)

    for (s_name, s_start, s_end) in uniq_records:
        seq = sequences[s_name]

        # pad the matched interval by --boundary bases, clamped to the ends
        b_start = max(s_start - 1 - args.boundary, 0)
        b_end = min(s_end - 1 + args.boundary, len(seq))

        interval = seq[b_start:b_end]

        s_short = s_name.split()[0]
        if 'reverse complement' in s_name:
            s_short += 'RC'

        print('>%s:%d-%d\n%s' % (s_short, b_start, b_end, interval))
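
# A worked example of the boundary arithmetic above, with made-up numbers
# (this assumes parse_blastz2 reports 1-based subject coordinates):
s_start, s_end, boundary, seqlen = 11, 40, 5, 100
b_start = max(s_start - 1 - boundary, 0)    # -> 5
b_end = min(s_end - 1 + boundary, seqlen)   # -> 44
# the emitted FASTA record covers seq[5:44]: the matched interval plus up
# to `boundary` flanking bases on either side, clamped at the sequence ends.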
import argparse
import collections
import logging
import random

import screed
import sourmash
from Bio import bgzf   # assumed source of the bgzf writer used below

# note: read_bcalm() and contract_degree_two() are assumed to be defined
# elsewhere in this module.


def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('bcalm_unitigs')
    parser.add_argument('gxt_out')
    parser.add_argument('contigs_out')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-d', '--debug', action='store_true')
    parser.add_argument('-P', '--pendants', action="store_true",
                        help="don't remove low abundance pendants")
    parser.add_argument('-a', '--abundance', nargs='?', type=float,
                        default=1.1)
    parser.add_argument('--randomize', action='store_true',
                        help='randomize cDBG order')
    args = parser.parse_args(argv)

    k = args.ksize
    trim = not args.pendants
    trim_cutoff = args.abundance
    unitigs = args.bcalm_unitigs
    debug = args.debug

    if args.debug:
        logging.basicConfig(filename='bcalm_to_gxt.log', filemode='w',
                            level=logging.DEBUG)
    else:
        logging.basicConfig(filename='bcalm_to_gxt.log', filemode='w',
                            level=logging.WARNING)

    logging.debug("starting bcalm_to_gxt run.")

    gxtfp = open(args.gxt_out, 'wt')
    contigsfp = bgzf.open(args.contigs_out, 'wb')
    info_filename = args.contigs_out + '.info.csv'
    info_fp = open(info_filename, 'wt')

    in_mh = sourmash.MinHash(0, 31, scaled=1000)
    out_mh = sourmash.MinHash(0, 31, scaled=1000)

    # load in the basic graph structure from the BCALM output file
    neighbors, sequences, mean_abunds, sizes = read_bcalm(unitigs, debug, k)

    # record input k-mers in a minhash
    for seq in sequences.values():
        in_mh.add_sequence(seq)

    # make the order deterministic: orient each sequence canonically,
    # then sort.
    print('reordering...')

    # first, put sequences in a specific orientation
    sequence_list = []
    for key in neighbors:
        v = sequences[key]

        # pick the lexicographically smaller of forward & reverse complement
        v2 = screed.rc(v)
        if v > v2:
            v = v2

        sequence_list.append((v, key))
        del sequences[key]

    # sort all sequences:
    sequence_list.sort(reverse=True)

    if args.randomize:
        print('(!! randomizing order per --randomize !!)')
        random.shuffle(sequence_list)

    # ok, now remap all the things.
    remapping = {}
    new_sequences = {}

    # remap sequences
    new_key = 0
    while sequence_list:                  # consume the list as we go
        sequence, old_key = sequence_list.pop()
        remapping[old_key] = new_key
        new_sequences[new_key] = sequence
        new_key += 1

    # remap other things
    new_neighbors = collections.defaultdict(set)
    for old_key, vv in neighbors.items():
        new_vv = [remapping[v] for v in vv]
        new_neighbors[remapping[old_key]] = set(new_vv)

    new_mean_abunds = {}
    for old_key, value in mean_abunds.items():
        new_mean_abunds[remapping[old_key]] = value

    new_sizes = {}
    for old_key, value in sizes.items():
        new_sizes[remapping[old_key]] = value

    assert len(sequences) == 0
    print('...done')

    sequences = new_sequences
    mean_abunds = new_mean_abunds
    sizes = new_sizes
    neighbors = new_neighbors

    # If we are removing pendants, we need to relabel the contigs so they
    # are consecutive integers starting from 0.  If not, we create dummy
    # data structures so the interface stays the same elsewhere.
    if trim:
        print('removing pendants...')
        non_pendants = set(v for v, N in neighbors.items()
                           if len(N) > 1 or mean_abunds[v] > trim_cutoff)
        contract_degree_two(non_pendants, neighbors, sequences, mean_abunds,
                            sizes, k)
    else:
        non_pendants = list(neighbors.keys())

    aliases = {x: i for i, x in enumerate(sorted(non_pendants))}
    n = len(aliases)

    # write out sequences & record the byte offset of each contig record
    offsets = {}
    kv_list = sorted(aliases.items(), key=lambda x: x[1])
    for x, i in kv_list:
        offsets[x] = contigsfp.tell()
        contigsfp.write('>{}\n{}\n'.format(i, sequences[x]))
        out_mh.add_sequence(sequences[x])
    contigsfp.close()

    print('... done! {} unitigs'.format(n))

    # start the gxt file by writing the number of nodes (unitigs)
    gxtfp.write('{}\n'.format(n))

    # write out all of the links, in 'from to' format.
    n_edges = 0
    for v, N in sorted(neighbors.items()):
        for u in sorted(N):
            gxtfp.write('{} {}\n'.format(aliases[v], aliases[u]))
            n_edges += 1

    print('{} vertices, {} edges'.format(n, n_edges))

    info_fp.write('contig_id,offset,mean_abund,n_kmers\n')
    for v, i in aliases.items():
        info_fp.write('{},{},{:.3f},{}\n'.format(i, offsets[v],
                                                 mean_abunds[v], sizes[v]))

    # output two sourmash signatures: one for input contigs, one for
    # output contigs.
    in_sig = sourmash.SourmashSignature(in_mh, filename=args.bcalm_unitigs)
    sourmash.save_signatures([in_sig],
                             open(args.bcalm_unitigs + '.sig', 'wt'))

    out_sig = sourmash.SourmashSignature(out_mh, filename=args.contigs_out)
    sourmash.save_signatures([out_sig],
                             open(args.contigs_out + '.sig', 'wt'))
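
# For orientation: the .gxt file written above is plain text, with the node
# count on the first line and one 'from to' pair per edge line.  A minimal
# reader sketch (an illustration, not the spacegraphcats parser):
def load_gxt(path):
    with open(path) as fp:
        num_nodes = int(fp.readline())
        edges = [tuple(map(int, line.split())) for line in fp]
    return num_nodes, edges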
import random
import sys

import screed

# COVERAGE, READLEN, and ERROR_RATE are assumed to be defined earlier in
# the original script.

record = next(iter(screed.open(sys.argv[1])))
genome = record.sequence
len_genome = len(genome)

n_reads = int(len_genome * COVERAGE / float(READLEN))
reads_mut = 0
total_mut = 0

for i in range(n_reads):
    start = random.randint(0, len_genome - READLEN)
    read = genome[start:start + READLEN].upper()

    # reverse complement, with probability 1/2?
    if random.choice([0, 1]) == 0:
        read = screed.rc(read)

    # inject errors: on each of READLEN draws, a 1-in-ERROR_RATE coin
    # mutates a random position (lower-cased so mutations stay visible)
    was_mut = False
    for _ in range(READLEN):
        while random.randint(1, ERROR_RATE) == 1:
            pos = random.randint(1, READLEN) - 1
            read = read[:pos] + random.choice(['a', 'c', 'g', 't']) + \
                read[pos + 1:]
            was_mut = True
            total_mut += 1

    if was_mut:
        reads_mut += 1

    print('>read%d\n%s' % (i, read))
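
# Back-of-envelope check of the error model above: each of the READLEN
# draws starts a geometric burst of mutations with success probability
# 1/ERROR_RATE, so the expected number of mutations per read is
# READLEN * p / (1 - p) with p = 1/ERROR_RATE -- roughly READLEN/ERROR_RATE.
# The parameter values here are illustrative:
READLEN, ERROR_RATE = 100, 100
p = 1 / ERROR_RATE
print('expected mutations per read: %.3f' % (READLEN * p / (1 - p)))  # ~1.010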
def _equals_rc(query, match):
    # true if the sequences match directly or as reverse complements
    return (query == match) or (screed.rc(query) == match)
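
# A quick illustration of the helper's behavior (sequences made up):
assert _equals_rc('ATGG', 'ATGG')        # identical
assert _equals_rc('ATGG', 'CCAT')        # reverse complement match
assert not _equals_rc('ATGG', 'GGTA')    # neither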
# N_READS, READLEN, ERROR_RATE, seqs, and indices are assumed to be
# defined earlier in the original script.
n_reads = N_READS
reads_mut = 0
total_mut = 0

z = []
for i in range(n_reads):
    # pick a sequence uniformly at random, then a uniform start within it
    index = random.choice(indices)
    sequence = seqs[index]

    start = random.randint(0, len(sequence) - READLEN)
    read = sequence[start:start + READLEN].upper()

    # reverse complement, with probability 1/2?
    if random.choice([0, 1]) == 0:
        read = screed.rc(read)

    # inject errors: on each of READLEN draws, a 1-in-ERROR_RATE coin
    # mutates a random position (lower-cased so mutations stay visible)
    was_mut = False
    for _ in range(READLEN):
        while random.randint(1, ERROR_RATE) == 1:
            pos = random.randint(1, READLEN) - 1
            read = read[:pos] + random.choice(['a', 'c', 'g', 't']) + \
                read[pos + 1:]
            was_mut = True
            total_mut += 1

    if was_mut:
        reads_mut += 1

    print('>read%d\n%s' % (i, read))
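
# Note: random.choice(indices) gives every sequence an equal share of reads
# regardless of length, so shorter sequences end up with deeper per-base
# coverage.  If uniform coverage were wanted instead, one option is a
# length-weighted pick (a sketch, reusing the names above):
lengths = [len(seqs[j]) for j in indices]
index = random.choices(indices, weights=lengths, k=1)[0]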
import random
import sys

import screed


def main():
    random.seed(1)   # make this reproducible.

    infname = "NCDScomplete_p_10gxG.fasta"    # infname = sys.argv[1]
    outname = "NCDScomplete_p_10gxG_10X"      # outname = sys.argv[2]

    outfile = '%s.fasta' % outname
    outf = open(outfile, "w+")
    logfile = '%s.log' % outname
    outlog = open(logfile, "w+")

    COVERAGE = 10
    READLEN = 100
    ERROR_RATE = 100

    records = screed.open(infname)

    seq_id = 0
    total_reads = 0
    total_mutated = 0
    total_mutations = 0

    for g in records:
        name = "n%s" % g.name    # "n" prefix marks the noise-added output
        genome = g.sequence
        len_genome = len(genome)

        n_reads = int(len_genome * COVERAGE / float(READLEN))
        reads_mut = 0
        total_mut = 0

        for i in range(n_reads):
            if len_genome < READLEN:
                continue    # too short to sample a full-length read

            start = random.randint(0, len_genome - READLEN)
            read = genome[start:start + READLEN].upper()

            # reverse complement, with probability 1/2?
            if random.choice([0, 1]) == 0:
                read = screed.rc(read)

            # inject errors: on each of READLEN draws, a 1-in-ERROR_RATE
            # coin mutates a random position (lower-cased to stay visible)
            was_mut = False
            for _ in range(READLEN):
                while random.randint(1, ERROR_RATE) == 1:
                    pos = random.randint(1, READLEN) - 1
                    read = read[:pos] + \
                        random.choice(['a', 'c', 'g', 't']) + read[pos + 1:]
                    was_mut = True
                    total_mut += 1

            if was_mut:
                reads_mut += 1

            outf.write('>n%dread%d\n%s\n' % (seq_id, i, read))

        outlog.write("%d of %d reads mutated; %d total mutations "
                     "from sequence %d => %s\n" %
                     (reads_mut, n_reads, total_mut, seq_id, name))

        total_reads += n_reads
        total_mutated += reads_mut
        total_mutations += total_mut
        seq_id += 1

    outlog.write("TOTAL: %d of %d reads mutated; %d total mutations" %
                 (total_mutated, total_reads, total_mutations))
    print("Done!!", file=sys.stderr)