def add_consensi(family, qual_thres):
    """Attach a gapped consensus sequence to each alignment in a family.

    Visits family[order][mate] for every order in ('ab', 'ba') and every mate in (0, 1),
    and stores the result of consensuslib.get_consensus() on that alignment's `consensus`
    attribute.

    Args:
        family: Nested container indexed as family[order][mate]. Each element must expose
            `seqs` and `quals` attributes and accept a `consensus` attribute.
        qual_thres: Integer quality threshold. It is turned into a character by adding
            QUAL_OFFSET before being handed to consensuslib.
    """
    strand_mates = [(order, mate) for order in ('ab', 'ba') for mate in (0, 1)]
    for order, mate in strand_mates:
        alignment = family[order][mate]
        seqs = alignment.seqs
        quals = alignment.quals
        # consensuslib takes the threshold as a character, not an integer score.
        thres_char = chr(qual_thres + QUAL_OFFSET)
        alignment.consensus = consensuslib.get_consensus(
            seqs, quals, qual_thres=thres_char, gapped=True
        )
def make_sscs(family, order, mate, qual_thres, cons_thres, min_cons_reads):
    """Build the single-strand consensus record for one family of reads.

    Args:
        family: Sized collection of reads; each read is indexed by 'seq' and 'qual'.
        order: Strand order label, copied into the result unchanged.
        mate: Mate number, copied into the result unchanged.
        qual_thres: Passed through to consensus.get_consensus() as `qual_thres`.
        cons_thres: Passed through to consensus.get_consensus() as `cons_thres`.
        min_cons_reads: Passed through to consensus.get_consensus() as `min_reads`.

    Returns:
        A dict with keys 'seq' (the consensus), 'order', 'mate', and 'nreads' (the number of
        reads in the family).
    """
    bases = [member['seq'] for member in family]
    scores = [member['qual'] for member in family]
    sscs_seq = consensus.get_consensus(
        bases,
        scores,
        cons_thres=cons_thres,
        min_reads=min_cons_reads,
        qual_thres=qual_thres,
    )
    record = dict(seq=sscs_seq, order=order, mate=mate, nreads=len(family))
    return record
def process_duplex(duplex, barcode, workers=None, stats=None, incl_sscs=False, sscs_fh=None,
                   processes=1, min_reads=1, qual_thres=' '):
    """Build the consensus sequence(s) for one duplex family, or delegate that to a worker.

    When `processes` is greater than 1 the call acts as a controller: it picks a worker
    round-robin (by the running family count) and hands the family over with delegate().
    Otherwise it computes a consensus per strand, optionally writes those to `sscs_fh`, and
    passes the final consensus to print_duplex().

    Args:
        duplex: Mapping of (order, mate) -> family, where a family is a sized collection of
            reads indexed by 'seq' and 'qual'.
        barcode: Barcode identifying this duplex; used in the output.
        workers: Sequence of workers to delegate to. Required when processes > 1.
        stats: Mutable mapping of counters ('families', 'time', 'reads', 'runs'). May be None
            in single-process mode; it is required when processes > 1 because the family
            count drives the round-robin worker choice.
        incl_sscs: If true, also output a consensus when only one strand is present.
        sscs_fh: Optional writable file object that receives each single-strand consensus.
        processes: Number of processes; values above 1 select the controller path.
        min_reads: Strands with fewer reads than this are skipped.
        qual_thres: Quality threshold passed through to consensus.get_consensus().

    Returns:
        None.
    """
    if stats is not None:
        stats['families'] += 1
    # Are we the controller process or a worker?
    if processes > 1:
        i = stats['families'] % len(workers)
        worker = workers[i]
        delegate(worker, duplex, barcode)
        return
    # We're a worker. Actually process the family.
    start = time.time()
    consensi = []
    # The (order, mate) key of each strand we actually built a consensus for, kept in step
    # with `consensi` and `reads_per_strand`. Zipping against duplex.keys() instead would
    # mislabel the output whenever an earlier strand was skipped for having too few reads.
    strand_keys = []
    reads_per_strand = []
    duplex_mate = None
    for (order, mate), family in duplex.items():
        reads = len(family)
        if reads < min_reads:
            continue
        # The mate number for the duplex consensus. It's arbitrary, but all that matters is that the
        # two mates have different numbers. This system ensures that:
        # Mate 1 is from the consensus of ab/1 and ba/2 families, while mate 2 is from ba/1 and ab/2.
        if (order == 'ab' and mate == 1) or (order == 'ba' and mate == 2):
            duplex_mate = 1
        else:
            duplex_mate = 2
        seqs = [read['seq'] for read in family]
        quals = [read['qual'] for read in family]
        consensi.append(consensus.get_consensus(seqs, quals, qual_thres=qual_thres))
        strand_keys.append((order, mate))
        reads_per_strand.append(reads)
    assert len(consensi) <= 2
    if sscs_fh:
        for cons, (order, mate), reads in zip(consensi, strand_keys, reads_per_strand):
            sscs_fh.write('>{bar}.{order}.{mate} {reads}\n'.format(bar=barcode, order=order,
                                                                   mate=mate, reads=reads))
            sscs_fh.write(cons+'\n')
    if len(consensi) == 1 and incl_sscs:
        print_duplex(consensi[0], barcode, duplex_mate, reads_per_strand)
    elif len(consensi) == 2:
        align = swalign.smith_waterman(*consensi)
        #TODO: log error & return if len(align.target) != len(align.query)
        cons = consensus.build_consensus_duplex_simple(align.target, align.query)
        print_duplex(cons, barcode, duplex_mate, reads_per_strand)
    elapsed = time.time() - start
    logging.info('{} sec for {} reads.'.format(elapsed, sum(reads_per_strand)))
    # Only count runs that actually produced at least one consensus.
    if stats and consensi:
        stats['time'] += elapsed
        stats['reads'] += sum(reads_per_strand)
        stats['runs'] += 1
def get_consensus(seq_align, qual_align, qual_thres):
    """Call consensuslib.get_consensus() with arguments suited to the running Python version.

    Under Python 3 the sequences and qualities are encoded to bytes, the threshold is passed
    as an integer (qual_thres + 32), and the returned bytes are decoded back into str. Under
    Python 2 the inputs are passed through unchanged and the threshold is passed as the
    one-character string chr(qual_thres + 32).

    Args:
        seq_align: Aligned sequences (strings).
        qual_align: Aligned quality strings.
        qual_thres: Integer quality threshold.

    Returns:
        The gapped consensus sequence, or None if either alignment is empty/missing.
    """
    if not seq_align or not qual_align:
        return None
    on_py3 = sys.version_info.major == 3
    thres_value = qual_thres + 32
    if on_py3:
        raw_seqs = [bytes(seq, 'utf8') for seq in seq_align]
        raw_quals = [bytes(qual, 'utf8') for qual in qual_align]
        raw_thres = thres_value
    else:
        raw_seqs = seq_align
        raw_quals = qual_align
        raw_thres = chr(thres_value)
    raw_cons = consensuslib.get_consensus(raw_seqs, raw_quals, qual_thres=raw_thres, gapped=True)
    # Hand back the same string type the caller gave us.
    return str(raw_cons, 'utf8') if on_py3 else raw_cons
def get_consensus(seq_align, qual_align, qual_thres):
    """Call consensuslib.get_consensus() with arguments suited to the running Python version.

    Under Python 3 the sequences and qualities are encoded to bytes, the threshold is passed
    as an integer (qual_thres + 32), and the returned bytes are decoded back into str. Under
    Python 2 the inputs are passed through unchanged and the threshold is passed as the
    one-character string chr(qual_thres + 32).

    Args:
        seq_align: Aligned sequences (strings).
        qual_align: Aligned quality strings.
        qual_thres: Integer quality threshold.

    Returns:
        The gapped consensus sequence, or None if either alignment is empty/missing.
    """
    if not seq_align or not qual_align:
        return None
    on_py3 = sys.version_info.major == 3
    thres_value = qual_thres + 32
    if on_py3:
        raw_seqs = [bytes(seq, 'utf8') for seq in seq_align]
        raw_quals = [bytes(qual, 'utf8') for qual in qual_align]
        raw_thres = thres_value
    else:
        raw_seqs = seq_align
        raw_quals = qual_align
        raw_thres = chr(thres_value)
    raw_cons = consensuslib.get_consensus(raw_seqs, raw_quals, qual_thres=raw_thres, gapped=True)
    # Hand back the same string type the caller gave us.
    return str(raw_cons, 'utf8') if on_py3 else raw_cons
def process_barcodes(dict_num, kmer, barcodes):
    """Perform a multiple sequence alignment on a set of barcodes and parse the result.

    Uses MAFFT, which must be on the PATH as `mafft`. The barcodes are written to a temporary
    FASTA file (named by index), aligned, and a consensus is called on the alignment.

    Args:
        dict_num: Opaque value returned unchanged as the first element of the result.
        kmer: Opaque value returned unchanged as the second element of the result.
        barcodes: Sequence of barcode strings.

    Returns:
        A tuple (dict_num, kmer, consensus_seq, barcodes, similarities), where `similarities`
        holds get_similarity(consensus_seq, barcode) for each barcode, in order. With a single
        barcode, that barcode is the consensus and its similarity is 1.0.
        Returns None if MAFFT could not be run or exited with an error.
    """
    # If there's only one barcode, we don't have to do an alignment.
    if len(barcodes) == 1:
        return dict_num, kmer, barcodes[0], barcodes, [1.0]
    family_file = tempfile.NamedTemporaryFile('w', delete=False, prefix='align.msa.')
    try:
        with family_file:
            for i, barcode in enumerate(barcodes):
                family_file.write('>{}\n'.format(i))
                family_file.write(barcode + '\n')
        with open(os.devnull, 'w') as devnull:
            try:
                command = ['mafft', '--nuc', '--quiet', family_file.name]
                output = subprocess.check_output(command, stderr=devnull)
            except (OSError, subprocess.CalledProcessError):
                return None
    finally:
        # delete=False means cleanup is on us; do it on every exit path, including the
        # MAFFT-failure return above, so failed runs don't leave temp files behind.
        os.remove(family_file.name)
    alignment = read_fasta(output, upper=True)
    consensus_seq = consensus.get_consensus(alignment)
    similarities = [get_similarity(consensus_seq, barcode) for barcode in barcodes]
    return dict_num, kmer, consensus_seq, barcodes, similarities
def process_barcodes(dict_num, kmer, barcodes):
    """Perform a multiple sequence alignment on a set of barcodes and parse the result.

    Uses MAFFT, which must be on the PATH as `mafft`. The barcodes are written to a temporary
    FASTA file (named by index), aligned, and a consensus is called on the alignment.

    Args:
        dict_num: Opaque value returned unchanged as the first element of the result.
        kmer: Opaque value returned unchanged as the second element of the result.
        barcodes: Sequence of barcode strings.

    Returns:
        A tuple (dict_num, kmer, consensus_seq, barcodes, similarities), where `similarities`
        holds get_similarity(consensus_seq, barcode) for each barcode, in order. With a single
        barcode, that barcode is the consensus and its similarity is 1.0.
        Returns None if MAFFT could not be run or exited with an error.
    """
    # If there's only one barcode, we don't have to do an alignment.
    if len(barcodes) == 1:
        return dict_num, kmer, barcodes[0], barcodes, [1.0]
    family_file = tempfile.NamedTemporaryFile('w', delete=False, prefix='align.msa.')
    try:
        with family_file:
            for i, barcode in enumerate(barcodes):
                family_file.write('>{}\n'.format(i))
                family_file.write(barcode + '\n')
        with open(os.devnull, 'w') as devnull:
            try:
                command = ['mafft', '--nuc', '--quiet', family_file.name]
                output = subprocess.check_output(command, stderr=devnull)
            except (OSError, subprocess.CalledProcessError):
                return None
    finally:
        # delete=False means cleanup is on us; do it on every exit path, including the
        # MAFFT-failure return above, so failed runs don't leave temp files behind.
        os.remove(family_file.name)
    alignment = read_fasta(output, upper=True)
    consensus_seq = consensus.get_consensus(alignment)
    similarities = [get_similarity(consensus_seq, barcode) for barcode in barcodes]
    return dict_num, kmer, consensus_seq, barcodes, similarities
intens1 = np.array(list(map_1.values())) intens2 = np.array(list(map_2.values())) intens1 = intens1.reshape(1, len(intens1)) intens2 = intens2.reshape(1, len(intens2)) cos_lib = cosine_similarity(intens1, intens2) return cos_lib[0][0] print(len(np.unique(image_UPGMA_pixel1))) cluster2concensus = {} cluster2comparison = {} for cluster in np.unique(image_UPGMA_pixel1): print(cluster) cluster2concensus[cluster] = consensus.get_consensus( cluster, image_UPGMA_pixel1, dist_dot_product, ids, imzMLfile, xs, ys) cluster_ids = consensus.get_cluster_elements(cluster, image_UPGMA_pixel1, parser, xs, ys) tmp = list() for i in cluster_ids: tmp.append( 1 - (get_similarity(cluster2concensus[cluster], consensus.tupel2map(parser.getspectrum(i))))) cluster2comparison[cluster] = tmp consensus_distance = np.zeros( (len(cluster2concensus.keys()), len(cluster2concensus.keys()))) for cluster1 in range(len(cluster2concensus.keys())): for cluster2 in range(cluster1, len(cluster2concensus.keys())): consensus_distance[cluster1, cluster2] = consensus_distance[ cluster2, cluster1] = 1 - get_similarity(
def main(argv):
    """Command-line entry point: align the input sequences and print their consensus.

    Sequences come either from positional arguments or from --input (a file, or "-" for
    stdin), in "plain" or "duplex" format. They are aligned with make_msa(), a consensus is
    called with consensus.get_consensus(), and the lines produced by format_alignment() are
    printed to stdout.

    Args:
        argv: Full argument vector, including the program name at index 0.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.set_defaults(**OPT_DEFAULTS)

    parser.add_argument('seqs', metavar='sequence', nargs='*',
        help='The alignment.')
    parser.add_argument('-i', '--input',
        help='Provide the sequences in this input file instead of as command-line arguments. '
             'Give "-" to use stdin.')
    parser.add_argument('-f', '--format', choices=('plain', 'duplex'),
        help='Input format. "plain" is a simple list of the sequences, one on each line. "duplex" is '
             'the 8-column format of the family-sorted read data from the duplex pipeline. It must be '
             'the read pairs from a single alpha/beta barcode combination (both the alpha-beta and '
             'beta-alpha strands). If "duplex" is given, you must also specify which of the four '
             'possible alignments to output with --mate and --order.')
    parser.add_argument('-m', '--mate', type=int, choices=(1, 2))
    parser.add_argument('-o', '--order', choices=('ab', 'ba'))
    parser.add_argument('-F', '--qual-format', choices=('sanger',))
    parser.add_argument('-q', '--qual', type=int,
        help='Quality threshold: Default: %(default)s')

    args = parser.parse_args(argv[1:])

    # The consensus caller takes the threshold as a character; Sanger qualities are offset by 33.
    qual_thres = ' '
    if args.qual_format == 'sanger':
        qual_thres = chr(args.qual + 33)
    else:
        fail('Error: Unsupported FASTQ quality format "{}".'.format(args.qual_format))
    # Check arguments.
    if not (args.seqs or args.input):
        fail('Error: You must provide sequences either in a file with --input or as arguments.')
    elif args.seqs and args.input:
        fail('Error: You cannot provide sequences in both a file and command-line arguments.')
    if args.format == 'duplex' and not (args.mate and args.order):
        fail('Error: If the --format is duplex, you must specify a --mate and --order.')

    # Read input.
    # NOTE(review): with --input, `seqs` is only assigned when --format is "plain" or "duplex";
    # this presumably relies on OPT_DEFAULTS supplying a default format -- confirm.
    quals = []
    if args.input:
        if args.format == 'plain':
            if args.input == '-':
                seqs = [line.strip() for line in sys.stdin]
            else:
                with open(args.input) as infile:
                    seqs = [line.strip() for line in infile]
        elif args.format == 'duplex':
            if args.input == '-':
                (seqs, quals) = parse_duplex(sys.stdin, args.mate, args.order)
            else:
                with open(args.input) as infile:
                    (seqs, quals) = parse_duplex(infile, args.mate, args.order)
    else:
        seqs = args.seqs

    align = make_msa(seqs)
    if quals:
        # Give the quality strings the same gaps the alignment introduced into the sequences.
        quals = seqtools.transfer_gaps_multi(quals, align, gap_char_out=' ')
    cons = consensus.get_consensus(align, quals, qual_thres=qual_thres, gapped=True)

    output = format_alignment(cons, align, quals, qual_thres=ord(qual_thres))

    for seq in output:
        # Parenthesized single-argument form prints identically on Python 2 and Python 3.
        print(seq)