Example #1
def match_multi(fq1, fq2, primersets):
    for r1, r2 in zip(mp.fastx_read(fq1, read_comment=True),
                      mp.fastx_read(fq2, read_comment=True)):
        r1 = Read(*r1)
        r2 = Read(*r2)
        matches = {}
        for pset in primersets:
            matches[pset.name] = Matched(r1, pset.match(r1.seq), r2,
                                         pset.match(r2.seq))
        yield r1, r2, matches
Example #2
def generate_coverage(read1, read2, mapping, ref, pwid=0.95, ncpu=1, chunk_size=500000, quiet=False):

    if not quiet: print("Building index and data structures...")

    seq_cov = {}
    for name, seq in pyfastx.Fasta(ref, build_index=False):
        seq_cov[name] = np.zeros(len(seq), dtype=int)

    nreads = 0
    read_len = 0
    for r in mp.fastx_read(read1):
        nreads += 1
        read_len += len(r[1])
    read_len /= nreads
    min_chain_score = int(0.9 * read_len)
    min_mis_match = int(read_len - pwid * read_len)  # mismatches allowed at identity pwid

    a = mp.Aligner(ref, preset='sr', n_threads=ncpu, best_n=1000, min_chain_score=min_chain_score)  # load or build index 
    if not a: raise Exception("ERROR: failed to load/build index")

    def mpile(seqs):
        # `seqs` may be None because grouper pads the final chunk
        if seqs is None: return []
        thrbuf = mp.ThreadBuffer()
        hits = []
        chrom = None
        for hit in a.map(seqs[1], buf=thrbuf):
            if (hit.NM <= min_mis_match) and ('S' not in hit.cigar_str) and ('H' not in hit.cigar_str):
                if chrom is None:
                    chrom = mapping[hit.ctg]
                    hits.append((hit.ctg, hit.r_st - 1, hit.r_en))
                elif mapping[hit.ctg] == chrom:
                    hits.append((hit.ctg, hit.r_st - 1, hit.r_en))
                else:
                    break
        return hits

    if not quiet: print("Aligning reads...")
    pool = ThreadPool(ncpu)
    for reads in tqdm(grouper(chain(
        mp.fastx_read(read1),
        mp.fastx_read(read2)), chunk_size), 
        total=int(1+2*nreads/chunk_size), disable=quiet):
        hits = pool.map(mpile, reads)
        for hit in chain.from_iterable(hits):
            if hit is None: continue
            seq_cov[hit[0]][hit[1]:hit[2]] += 1

    #close the pool and wait for the work to finish
    pool.close()
    pool.join()

    return seq_cov
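The generate_coverage function above relies on a grouper helper that the snippet does not define. A minimal sketch, assuming it follows the standard itertools recipe (which pads the final chunk with None, matching the None check in mpile):

from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    # collect items into fixed-length chunks; the last chunk is padded with fillvalue
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)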
Example #3
def main(argv):
	opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:c")
	if len(args) < 2:
		print("Usage: minimap2.py [options] <ref.fa>|<ref.mmi> <query.fq>")
		print("Options:")
		print("  -x STR      preset: sr, map-pb, map-ont, asm5, asm10 or splice")
		print("  -n INT      mininum number of minimizers")
		print("  -m INT      mininum chaining score")
		print("  -k INT      k-mer length")
		print("  -w INT      minimizer window length")
		print("  -r INT      band width")
		print("  -c          output the cs tag")
		sys.exit(1)

	preset = min_cnt = min_sc = k = w = bw = None
	out_cs = False
	for opt, arg in opts:
		if opt == '-x': preset = arg
		elif opt == '-n': min_cnt = int(arg)
		elif opt == '-m': min_sc = int(arg)
		elif opt == '-r': bw = int(arg)
		elif opt == '-k': k = int(arg)
		elif opt == '-w': w = int(arg)
		elif opt == '-c': out_cs = True

	a = mp.Aligner(args[0], preset=preset, min_cnt=min_cnt, min_chain_score=min_sc, k=k, w=w, bw=bw)
	if not a: raise Exception("ERROR: failed to load/build index file '{}'".format(args[0]))
	for name, seq, qual in mp.fastx_read(args[1]): # read one sequence
		for h in a.map(seq, cs=out_cs): # traverse hits
			print('{}\t{}\t{}'.format(name, len(seq), h))
Example #4
def runMapper(referenceIndex, asm2Filename, minQueryLen):
    print("running minimap2 and finding top hit per query sequence\n")
    scaffoldMapList0 = []

    for name, seq, qual in mp.fastx_read(asm2Filename):
        print("... query: %s" % name)
        if len(seq) < minQueryLen:
            print(
                "...... Skipping, query too short (seq len of %i is less than minimum: %i)\n"
                % (len(seq), minQueryLen))
            continue
        hits = []
        for hit in referenceIndex.map(seq):

            hits.append(name + "\t" + str(len(seq)) + "\t" + hit.ctg + "\t" +
                        str(hit.mlen))

        topAln = getTopHitByAlignmentLength(hits)
        print("Top hit: %s\n" % topAln['top_aln_id'])
        scaffoldMapList0.append({
            'queryID': name,
            'query_len': len(seq),
            'refID': topAln['top_aln_id'],
            'alignLen': topAln['top_aln_blen']
        })
    return (scaffoldMapList0)
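runMapper depends on a getTopHitByAlignmentLength helper that is not shown. A minimal sketch under the assumptions visible above (each entry of hits is a tab-joined "query\tquery_len\tref_ctg\tmlen" string, and the caller reads the keys 'top_aln_id' and 'top_aln_blen'); an illustration, not the original implementation:

def getTopHitByAlignmentLength(hits):
    # pick the hit with the most matching bases (hit.mlen, field 4)
    if not hits:
        return {'top_aln_id': None, 'top_aln_blen': 0}
    fields = [h.split('\t') for h in hits]
    top = max(fields, key=lambda f: int(f[3]))
    return {'top_aln_id': top[2], 'top_aln_blen': int(top[3])}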
Example #5
def chunk_process(num_reads, args, blat):
    '''Split the input fasta into chunks and process'''
    if args.blatThreads:
        chunk_size = (num_reads // args.numThreads) + 1
    else:
        chunk_size = args.groupSize
    if chunk_size > num_reads:
        chunk_size = num_reads

    pool = mp.Pool(args.numThreads)
    pbar = tqdm(total=num_reads // chunk_size + 1, desc='Preprocessing')
    iteration, current_num, tmp_reads, target = 1, 0, {}, chunk_size
    for read in mm.fastx_read(args.reads, read_comment=False):
        if len(read[1]) < args.lencutoff:
            continue
        tmp_reads[read[0]] = read[1]
        current_num += 1
        if current_num == target:
            pool.apply_async(process,
                             args=(args, tmp_reads, blat, iteration),
                             callback=lambda _: pbar.update(1))
            iteration += 1
            target = chunk_size * iteration
            if target >= num_reads:
                target = num_reads
            tmp_reads = {}
    if tmp_reads:  # flush the final partial chunk; length filtering can leave `target` unreached
        pool.apply_async(process,
                         args=(args, tmp_reads, blat, iteration),
                         callback=lambda _: pbar.update(1))
    pool.close()
    pool.join()
    pbar.close()

    cat_files(args.out_path, 'pre_tmp_*/tmp_splint_aln.psl',
              args.out_path + 'tmp/splint_to_read_alignments.psl')
    remove_files(args.out_path, 'pre_tmp*')
Example #6
def align_contigs(**kwargs):

    if 'infile_fasta' in kwargs:
        infile = kwargs['infile_fasta']
    if 'out' in kwargs:
        outfile = kwargs['out']
    if 'genome' in kwargs:
        genome = kwargs['genome']
    if 'preset' in kwargs:
        preset = kwargs['preset']
    if 'nthreads' in kwargs:
        nthreads = kwargs['nthreads']

    a = mp.Aligner(str(genome), preset=preset, n_threads=nthreads)

    if not a: raise Exception("ERROR: failed to load/build index")

    outfile = open(outfile, 'w')

    outfile.write(
        "read\tchr\tpos\tr_st\tr_en\tq_st\tq_en\tq_len\tprimary\tstrand\tcs\tcigstr\tcigtup\n"
    )

    for name, seq, qual in mp.fastx_read(infile):
        seq_len = len(seq)
        print(name)
        for hit in a.map(seq, cs=True):
            outfile.write(
                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    name, hit.ctg, hit.r_st, hit.r_st, hit.r_en, hit.q_st,
                    hit.q_en, seq_len, hit.is_primary, hit.strand, hit.cs,
                    hit.cigar_str, hit.cigar))

    outfile.close()
Example #7
def read_subreads(seq_file, chrom_reads):
    for read in mm.fastx_read(seq_file, read_comment=False):
        root_name = read[0].split('_')[0]
        if root_name in chrom_reads:
            # root_name : [(header, seq, qual), ...]
            chrom_reads[root_name].append(read)  # read = (header, seq, qual)
    return chrom_reads
Example #8
def load_reference(fp):
    '''
    only supports a single contig for now

    out: ('EU117116.1', 'AAAATATAAAAACT...')
    '''
    rname, rseq, _ = next(mp.fastx_read(fp))
    return rname, rseq
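The docstring notes that only a single contig is supported. A minimal sketch of a multi-contig variant built on the same generator (a hypothetical helper, not part of the original code):

def load_reference_all(fp):
    # collect every contig into a {name: sequence} dict
    return {name: seq for name, seq, *_ in mp.fastx_read(fp)}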
Example #9
def readFastq(seqFile):
    readDict = {}
    for name, seq, qual in mappy.fastx_read(seqFile):
        root_name = name.split('_')[0]
        if root_name not in readDict:
            readDict[root_name] = []
        readDict[root_name].append((name, seq, qual))
    return readDict
Example #10
def getFlankCrit(flankFa):
    flanks = [seq for _, seq, _ in mp.fastx_read(flankFa)]

    def includesFlanks(rec):
        a = mp.Aligner(seq=rec.query_sequence, preset='sr')
        return np.all([len(list(a.map(f))) == 1 for f in flanks])

    return includesFlanks
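getFlankCrit returns a predicate over records exposing query_sequence, which matches pysam's AlignedSegment. A hedged usage sketch (file names are hypothetical, and pysam is an assumed dependency):

import pysam

includes_flanks = getFlankCrit('flanks.fasta')
with pysam.AlignmentFile('reads.bam', 'rb') as bam:
    kept = [rec for rec in bam if rec.query_sequence and includes_flanks(rec)]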
Example #11
def match_single(fq, primersets):
    for r in mp.fastx_read(fq, read_comment=True):
        r = Read(*r)
        rc = mp.revcomp(r.seq)
        matches = {}
        for pset in primersets:
            matches[pset.name] = Matched(r, pset.match(r.seq), r,
                                         pset.match(rc))
        yield r, matches
Example #12
def read_fasta(inFile, indexes):
    '''Reads in FASTA files, returns a dict of header:sequence'''
    readDict, index_dict = {}, {}
    for read in mm.fastx_read(inFile, read_comment=False):
        readDict[read[0]] = read[1]
        if indexes:
            index_dict[read[1]] = read[0]  # reverse lookup: sequence -> header
    if indexes:
        return readDict, index_dict
    return readDict
Example #13
def main(sequence_fasta, output):
    """For each sequence, deconcatenate and write to output."""
    corrected = []
    for name, seq, _ in mp.fastx_read(sequence_fasta):
        corrected.append([name, deconcatenate(seq)])

    handler = get_output_handler(output)
    for n, s in corrected:
        handler.write(f">{n}\n{s}\n")
    handler.close()
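This main relies on deconcatenate and get_output_handler from the surrounding project. A minimal sketch of what get_output_handler could look like, given only that the returned object must support write and close (an assumption, not the original helper):

import sys

def get_output_handler(output):
    # write to stdout when no path (or '-') is given, else open a file
    if output in (None, '-'):
        return sys.stdout
    return open(output, 'w')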
Example #14
def remove_by_alignment(fq, ref, out, mapq, preset, human_out, threads,
                        logger):

    fout = smart_open(filename=out, mode="w")

    if human_out:
        hout = smart_open(filename=human_out, mode="w")
    else:
        hout = None

    logger.info(f"Starting to map reads against: {ref}")

    logger.info(f"Initiating aligner: {ref}")
    aligner = mp.Aligner(str(ref), preset=preset, n_threads=threads)

    logger.info(f"Opening file handle: {fq}")
    if fq:
        reads = mp.fastx_read(str(fq))
    else:
        raise ValueError("paired-end input is not handled by this function")  # PE

    ref_maps = 0
    total_reads = 0

    logger.info(f"Filtering mapped reads [Q >= {mapq}]")

    human = set()
    not_human = set()
    for name, seq, qual in reads:
        mapped = aligner.map(seq)
        for aln in mapped:
            if aln.mapq >= mapq:
                ref_maps += 1
                if name not in human:
                    human.add(name)
                    # write the read to the human output once, even if several alignments pass
                    if hout is not None:
                        hout.write(f"@{name}\n{seq}\n+\n{qual}\n")

        if name not in human:
            fout.write(f"@{name}\n{seq}\n+\n{qual}\n")
            not_human.add(name)

        total_reads += 1

    fout.close()

    if hout is not None:
        hout.close()

    logger.info(f"Computed {ref_maps} mappings against reference: {ref}")
    logger.info(f"Recovered  {len(not_human)} / {total_reads} reads from {fq}")
Example #15
def create_index(reference_file):
    aligner = mp.Aligner(reference_file, best_n=1)

    # assumes module-level `reference_names` (list) and `reference_lengths` (dict)
    for name, seq, qual in mp.fastx_read(reference_file, read_comment=False):
        reference_names.append(name)
        reference_lengths[name] = len(seq)

    if not aligner:
        raise Exception("ERROR: failed to load/build index file '{}'".format(
            reference_file))

    return aligner
Example #16
def hdf_to_sam(args):
    """Entry point for converting guppy methylcalled fast5s to sam."""
    sys.stdout.write('\t'.join(('@HD', 'VN:1.5', 'SO:unsorted')))
    sys.stdout.write('\n')
    for name, seq, _ in mappy.fastx_read(args.reference, read_comment=False):
        sys.stdout.write('@SQ\tSN:{}\tLN:{}\n'.format(name, len(seq)))

    fast5s = get_fast5_file_list(args.path, recursive=args.recursive)
    worker = functools.partial(hdf_to_sam_worker, args.reference)
    with ProcessPoolExecutor(max_workers=args.workers) as executor:
        for res in executor.map(worker, fast5s):
            for r in res:
                sys.stdout.write('{}\n'.format(r))
Example #17
def write_multiple_hits_to_logPAF(a, S288c_transcripts, outdir, outname, multiple_hits):
    '''
    Write transcripts that match multiple times on the same chromosome to a log file
    in PAF format. Use `paftools view outdir/outname.paf` to visualize those alignments.
    '''
    with open(join(outdir, outname + '_multiple_hits.log.paf'), 'w') as multiple_fh:
        for name, seq, _, comment in mp.fastx_read(S288c_transcripts, read_comment=True):
            if name not in multiple_hits: continue
            chromosome, chr_start, chr_end, strand = location_from_comment(comment)
            for h in a.map(seq, cs=True):  # traverse hits
                _, hit_chromosome = chromosome_from_hit(h.ctg)
                if hit_chromosome != chromosome:
                    continue
                multiple_fh.write(name + '\t' + str(len(seq)) + '\t' + str(h) + '\n')
Example #18
def run():

    a = mp.Aligner(
        "/home/sonhoanghguyen/Projects/readuntil/simulation/npgraph/test/assembly_graph.fasta"
    )  # load or build index
    if not a: raise Exception("ERROR: failed to load/build index")

    with grpc.insecure_channel('localhost:2105') as channel:
        stub = npgraph_service_pb2_grpc.AssemblyGuideStub(channel)
        print("Connected with server at localhost:2105")

        for name, seq, qual in mp.fastx_read(
                "/home/sonhoanghguyen/Projects/readuntil/simulation/npgraph/test/E_coli_K-12_MG1655_good_long.fastq.gz"
        ):
            #1. make request
            request = npgraph_service_pb2.RequestAssembly()
            request.read_id = name
            for hit in a.map(seq):  # traverse alignments
                #print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str))
                request.hits_list.append(
                    npgraph_service_pb2.AlignmentMsg(query_name=name,
                                                     query_length=len(seq),
                                                     query_start=hit.q_st,
                                                     query_end=hit.q_en,
                                                     strand=hit.strand > 0,
                                                     target_name=hit.ctg,
                                                     target_length=hit.ctg_len,
                                                     target_start=hit.r_st,
                                                     target_end=hit.r_en,
                                                     quality=hit.mapq,
                                                     score=hit.mlen))

            #2. get and print response
            if len(request.hits_list) > 0:
                try:
                    start_time = time.time()
                    response = stub.GetAssemblyContribution(request)
                    print("{}: {} in {:.5f} seconds".format(
                        response.read_id, response.usefulness,
                        time.time() - start_time))
                except grpc.RpcError as e:
                    print("{}: errorcode={}".format(request.read_id,
                                                    str(e.code())))
                    continue
            else:
                print("{}: unmapped!".format(request.read_id))
                continue
Example #19
    def mapping(self, query_path, ref_path):
        if os.path.isdir(query_path):
            # os.listdir returns bare names; join them with the directory path
            file_list = [os.path.join(query_path, f) for f in os.listdir(query_path)]
        else:
            file_list = [query_path]
        mapper = mp.Aligner(ref_path, preset="map-ont")  # build the index once, not per file
        for file_name in file_list:
            for name, seq, qual in mp.fastx_read(file_name):
                for hit in mapper.map(seq):
                    self.names.append(name)
                    self.cigar.append(hit.cigar_str)
                    self.r_st.append(hit.r_st)
                    self.r_end.append(hit.r_en)
                    self.q_st.append(hit.q_st)
                    self.q_end.append(hit.q_en)
                    self.section.append(hit.ctg)
                    self.strand.append(hit.strand)
Example #20
def main(parser):
    args = parser.parse_args()

    if args.motifs:
        print('Overriding preset motifs with %s' % args.motifs)
        motifs = args.motifs.split(',')
        label = args.label
    else:
        motifs = repeatPatterns[args.preset]
        label = args.label if args.label else args.preset

    aligner = mp.Aligner(args.target)

    #function to generate output names
    s = args.sample + '.' if args.sample else ''
    l = label + '.' if label else ''
    outfileName = lambda name, ext: '{d}/{s}{l}{n}.{e}'.format(
        d=args.outDir, s=s, l=l, n=name, e=ext)

    #function to write summary
    writeSummary = lambda: summary.to_csv(outfileName('summary', 'csv'))

    print('Mapping and extracting repeat regions')
    repeatRegions = pd.DataFrame({
        'read': rec[0],
        'subsequence': extractRepeat(rec[1], aligner)
    } for rec in mp.fastx_read(args.ccsFastx))

    repeatRegions = repeatRegions.assign(size=repeatRegions.subsequence.map(len))\
                                 .sort_values('size',ascending=False)\
                                 .drop(columns='size')\
                                 .reset_index(drop=True)
    #filter and summarize
    summary, filtered = countAlignments(repeatRegions, reference=args.target)

    print('Counting repeats')
    try:
        motifDfs = [pd.concat(filtered.set_index('read',append=True).subsequence.map(getPositions(motif)).to_dict())\
                      .reset_index(level=2,drop=True)\
                      .reset_index()\
                      .rename(columns={'level_0':'idx','level_1':'readName'})
                    for motif in motifs]
    except ValueError as e:
        writeSummary()
        raise fastRepeatAnalysisReport_Exception('No reads map to target!')
Example #21
def mappyAlign(infile, outfile):
    import mappy as mp
    
    a = mp.Aligner("/mnt/ix1/Resources/10X_resources/refdata-b37-2.1.0/fasta/genome.fa")

    if not a: raise Exception("ERROR: failed to load/build index")

    outfile = open(outfile, 'w')

    outfile.write("read\tchr\tpos\tr_st\tr_en\tq_st\tq_en\tcigstr\tcigtup\n")

    for name, seq, qual in mp.fastx_read(infile): # read a fasta/q sequence

        for hit in a.map(seq): # traverse alignments ##CORE DUMPED### on aji, but fine on tamago
            if hit.ctg.isdigit():  # keep only numeric contig names
                outfile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(name, hit.ctg, hit.r_st, hit.r_st, hit.r_en, hit.q_st, hit.q_en, hit.cigar_str, hit.cigar))

    outfile.close()
Example #22
def hdf_to_sam(args):
    """Entry point for converting guppy methylcalled fast5s to sam."""
    logger = medaka.common.get_named_logger('ModExtract')
    logger.info(
        "NOTE: Mod. base scores are output w.r.t the sequencing direction, "
        "not the aligned read orientation.")
    extractor = Extractor(args.path,
                          recursive=args.recursive,
                          workers=args.io_workers)

    sys.stdout.write('\t'.join(('@HD', 'VN:1.5', 'SO:unsorted')))
    sys.stdout.write('\n')
    sys.stdout.write('\t'.join(
        ('@CO', 'Guppy basecaller mod. base tags are stored w.r.t. the '
         'sequencing direction, they should be reversed for reads '
         'aligning to the reverse strand.\n')))
    if args.reference is None:
        # write unaligned sam
        for read, tags in extractor:
            sam = unaligned_read(read, tags)
            sys.stdout.write('{}\n'.format(sam))
    else:
        for name, seq, _ in mappy.fastx_read(args.reference,
                                             read_comment=False):
            sys.stdout.write('@SQ\tSN:{}\tLN:{}\n'.format(name, len(seq)))
        aligner = Aligner(args.reference,
                          preset='map-ont',
                          n_threads=args.workers)

        def _write(future):
            try:
                sam = future.result()
                if sam is not None:
                    sys.stdout.write('{}\n'.format(sam))
            except Exception:
                pass
            # https://bugs.python.org/issue27144
            future._result = None

        with ThreadPoolExecutor(max_workers=args.workers) as executor:
            for read, tags in extractor:
                future = executor.submit(aligner.map, read, tags)
                future.add_done_callback(_write)
Example #23
def read_quality(read, ref_fasta, result_dict=None):
    """
    Collects alignment statistics for FASTA/Q reads mapped against a reference.

    Args:
        read -- str, path to reads in FASTA or FASTQ format
        ref_fasta -- str, path to FASTA file containing the reference
        result_dict -- dict, optional accumulator; pass the same dict to
            aggregate statistics across several input files

    Returns: dict of alignment statistics
    """
    # a mutable default argument would be shared across calls, so build it here
    if result_dict is None:
        result_dict = {
            'nb_mappings': [],
            'matches': 0,
            'mismatches': 0,
            'deletions': 0,
            'insertions': 0,
            'mapping_quality': []
        }

    aligner = mp.Aligner(ref_fasta)  # constructor that indexes the reference

    for name, seq, qual in mp.fastx_read(read):  # yields (name, seq, qual) from FASTA/Q

        nb_hits = 0
        for hit in aligner.map(seq):  # aligns seq against the index, yielding Alignment objects
            if hit.is_primary:  # usually best and first
                matches_mismatches = sum(
                    c[0] for c in hit.cigar if c[1] == 0)  # CIGAR op 0 (M): matches + mismatches
                result_dict['matches'] += hit.mlen
                result_dict['mismatches'] += matches_mismatches - hit.mlen
                result_dict['insertions'] += sum(
                    [c[0] for c in hit.cigar if c[1] == 1])
                result_dict['deletions'] += sum(
                    [c[0] for c in hit.cigar if c[1] == 2])
                result_dict['mapping_quality'].append(hit.mapq)
            nb_hits += 1
        result_dict['nb_mappings'].append(nb_hits)

    return result_dict
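With the None default above, statistics can still be accumulated across several inputs by threading one dict through the calls (file names here are hypothetical):

stats = None
for fq in ('run1.fastq', 'run2.fastq'):
    stats = read_quality(fq, 'reference.fasta', result_dict=stats)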
Example #24
def run_polyte(reffile, r1name, fname, output_type, cut_site, min_len):
    '''Align filtered fastq to genome
    '''
    reference = mp.Aligner(reffile, preset="sr")
    print("Load in reference...")  # load or build index
    if not reference:
        raise Exception("ERROR: failed to load/build index")
    print("Done")
    output_sam = SAMBAMWriter(fname, reference, output_type)
    print("Running alignment...")
    reads1 = mp.fastx_read(r1name)
    while True:
        try:
            read1 = process_reads2.Read(next(reads1))
            read1.split_read(cut_site, min_len)
            read1.qual_trim(10, 10)
            if read1.seq:
                res = map_te_reads(read1, reference)
                if res:
                    output_sam.process_te_output(res, read1)
        except StopIteration:
            break
Example #25
def read_fastq_file(seq_file, check):
    '''
    Takes a FASTQ file and returns a list of tuples.
    In each tuple:
        name : str, read ID
        seq : str, sequence
        qual : str, quality line
        average_quals : float, average quality of that line
        seq_length : int, length of the sequence
    Has a check mode: if it sees at least one read, it returns True
    '''
    read_list = []
    for read in mm.fastx_read(seq_file, read_comment=False):
        name, seq, qual = read[0], read[1], read[2]
        if check:
            return True
        avg_q = np.average([ord(x) - 33 for x in qual])  # Phred+33 qualities
        s_len = len(seq)
        read_list.append((name, seq, qual, avg_q, s_len))
    return read_list
Example #26
def main(argv):
    """
    Main PAtChER function
    """
    distance = 10
    nthreads = 1
    cut_site = "GATC"
    min_len = 20
    output_type = "SAM"
    debug = False
    myargs = getopts(argv)
    if '-g' in myargs:
        reffile = myargs["-g"]
    else:
        print_help()
    if '-o' in myargs:
        fname = myargs["-o"]
    else:
        print_help()
    if '-r1' in myargs:
        r1name = myargs["-r1"]
        reads1 = mp.fastx_read(r1name)
    else:
        print_help()
    if '-r2' in myargs:
        r2name = myargs["-r2"]
        reads2 = mp.fastx_read(r2name)
    else:
        print_help()
    if '-d' in myargs:
        distance = int(myargs["-d"])
    if '-D' in myargs:
        debug = True
    if '-t' in myargs:
        nthreads = int(myargs["-t"])
    if '-c' in myargs:
        cut_site = myargs["-c"]
    if '-l' in myargs:
        min_len = int(myargs["-l"])
    if '-b' in myargs:
        output_type = "BAM"

    print(f"Using refrence:{reffile}")
    print(f"Using read 1:{r1name}")
    print(f'Using read 2:{r2name}')
    print(f'Using distance +/-:{distance}')
    print(f"Using threads:{nthreads}")
    print(f"Using cutsite:{cut_site}")
    print(f"Writing to:{fname}")
    if nthreads > 3:
        multiproc2.run(reffile, reads1, reads2, fname, distance, nthreads,
                       cut_site, min_len, output_type, debug)
    else:
        if nthreads > 1:
            print(
                "Cannot run multithreading with fewer than 4 threads; defaulting to single-threaded"
            )
        runsingle(reffile, reads1, reads2, fname, distance, cut_site, min_len,
                  output_type)
    reads1.close()
    reads2.close()
    print("Run complete")
Example #27
    def _method_mappy(self, *args, **kwargs):
        with open(self.outfile, "w") as fasta:
            for (name, seq, _) in fastx_read(self.infile):
                fasta.write(">{}\n{}\n".format(name, seq))
Example #28
    # fragment: assumes `outfile`, `isbed`, `assigned_names`, and `headers_keep` are defined earlier in the script
    writer = csv.writer(outfile, delimiter='\t', lineterminator=os.linesep)
    for line in open(sys.argv[2]):  # bed
        line = line.rstrip().split('\t')
        if isbed:
            name = line[3][:line[3].rfind(';')]
        else:
            name = line[9][:line[9].rfind(';')]

        if name not in assigned_names:
            writer.writerow(line)
            headers_keep.add(name)

import mappy as mm

headers_used = set()
for fle in sys.argv[5:]:
    for read in mm.fastx_read(fle):
        header, seq, qual = read
        if header in headers_keep:
            print('>' + header)
            print(seq)
            headers_used.add(header)

diff = len(headers_keep - headers_used)
if diff > 0:
    sys.stderr.write(
        '{} names do not match any names in fastq file(s)\n'.format(diff))
    sys.stderr.write('e.g. {} in bed but not in fastq\n'.format(
        list(headers_keep - headers_used)[0]))
    sys.exit(1)
Example #29
def StrandSim(w, c):
    '''
    Perform first part of strand-seq simulations and re-align to the original haplotype
    '''

    hfa = pyfaidx.Fasta(c.ffile)

    if w.chrom not in hfa.keys():

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Warning] Chromosome ' + w.chrom +
              ' not found in ' + c.ffile + '. Skipped simulation')

    else:

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Preparing simulation from ' + c.ffile +
              '. Haplotype ' + str(c.hapnumber))

        chr_ = hfa[w.chrom]
        seq_ = chr_[w.start - 1:w.end].seq
        tmpfa = os.path.abspath(c.haplodir + '/' + 'htmp.fa')
        region = w.chrom + '_' + str(w.start) + '_' + str(w.end)

        with open(tmpfa,
                  'w') as tmpfout:  #write temporary fa for sampling reads

            tmpfout.write('>' + region + '\n' +
                          '\n'.join(re.findall('.{1,60}', seq_)) + '\n')

        Ns = seq_.count('N')  #normalize coverage on Ns
        Nreads = round(((c.regioncoverage * (len(seq_) - Ns)) / c.length) /
                       2)  #for paired-end sequencing

        mate1h = os.path.abspath(c.haplodir + '/hr1.tmp.fq')
        mate2h = os.path.abspath(c.haplodir + '/hr2.tmp.fq')

        hapcov = Nreads * c.length * 2 / ((w.end - w.start) - Ns)
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now +
              '][Message] Simulated coverage for this region will be ' +
              str(hapcov))

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Simulating')

        wgsim.core(r1=mate1h,
                   r2=mate2h,
                   ref=tmpfa,
                   err_rate=c.error,
                   mut_rate=c.mutation,
                   indel_frac=c.indels,
                   indel_ext=c.extindels,
                   N=Nreads,
                   dist=c.distance,
                   stdev=c.stdev,
                   size_l=c.length,
                   size_r=c.length,
                   max_n=0.05,
                   is_hap=0,
                   is_fixed=0,
                   seed=0)

        os.remove(tmpfa)

        mate1hnew = os.path.abspath(c.haplodir + '/hr1.fq')
        mate2hnew = os.path.abspath(c.haplodir + '/hr2.fq')

        with open(mate1hnew, 'w') as out1, open(mate2hnew, 'w') as out2:

            for (name1, seq1, qual1), (name2, seq2,
                                       qual2) in zip(mp.fastx_read(mate1h),
                                                     mp.fastx_read(mate2h)):

                #change name1/name2

                newname1 = '@c' + str(c.singlecellnum) + 'h' + str(
                    c.hapnumber) + 'fh_' + name1
                newname2 = '@c' + str(c.singlecellnum) + 'h' + str(
                    c.hapnumber) + 'fh_' + name2

                read1 = [newname1, seq1, '+', qual1]
                read2 = [newname2, seq2, '+', qual2]

                out1.write('\n'.join(read1) + '\n')
                out2.write('\n'.join(read2) + '\n')

        os.remove(mate1h)
        os.remove(mate2h)

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print(
            '[' + now +
            '][Message] Mapping simulated reads to the corresponding haplotype'
        )

        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.srt.bam')

        sam_cmd = [
            'minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
            '-t',
            str(c.threads), c.ffile, mate1hnew, mate2hnew
        ]
        bam_cmd = [
            'samtools', 'sort', '-@',
            str(round(c.threads / 2)), '-o', BAM
        ]

        p1 = subprocess.Popen(sam_cmd,
                              stderr=open(os.devnull, 'wb'),
                              stdout=subprocess.PIPE)
        bout = open(BAM, 'wb')
        p2 = subprocess.run(bam_cmd,
                            stdin=p1.stdout,
                            stderr=open(os.devnull, 'wb'),
                            stdout=bout)
        bout.close()

        os.remove(mate1hnew)
        os.remove(mate2hnew)

        #now re-parse BAM file to keep only Watson/Crick reads
        #Watson reads: read1 forward, read2 reverse
        #Crick reads: read2 forward, read1 reverse

        ivf = None

        if len(c.sce_bedregion) != 0:

            sce_string = ''

            for s in c.sce_bedregion:

                if s[3] == c.cellid and s[4] == c.hapid:

                    sce_string += s.chrom + '\t' + str(s.start) + '\t' + str(
                        s.end) + '\n'

            if sce_string != '':

                sce_fromscratch = pybedtools.BedTool(sce_string.rstrip(),
                                                     from_string=True)
                ivf = sce_fromscratch.as_intervalfile(
                )  #intervals where to perform SCE events

                now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                print(
                    '[' + now +
                    '][Message] Detected one or more SCE events for the current cell/haplotype'
                )

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print(
            '[' + now +
            '][Message] Extracting Watson (R1F,R2R) and Crick (R1R,R2F) reads')

        save = pysam.set_verbosity(0)
        bamstrand = pysam.AlignmentFile(
            BAM, 'rb', require_index=False)  #until-eof consumes the bamfile
        pysam.set_verbosity(save)
        Wreads = list(WR(bamstrand, ivf))
        bamstrand.close()

        save = pysam.set_verbosity(0)
        bamstrand = pysam.AlignmentFile(
            BAM, 'rb', require_index=False)  #re-open for second round
        pysam.set_verbosity(save)
        Creads = list(CR(bamstrand, ivf))
        bamstrand.close()

        os.remove(BAM)

        if c.noise > 0:

            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print('[' + now + '][Message] Adding noise to strands')

            CtoW = random.sample(Creads, round(len(Wreads) / 100 * c.noise))
            Wreads += CtoW

            WtoC = random.sample(Wreads, round(len(Creads) / 100 * c.noise))
            Creads += WtoC

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Writing Watson and Crick FASTQ')

        w1 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.w1.fq')
        w2 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.w2.fq')

        c1 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.c1.fq')
        c2 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.c2.fq')

        with open(w1, 'w') as wout1, open(w2, 'w') as wout2:

            for r1, r2 in Wreads:

                if r1.get_tag('OS') == 'W':  #this is true W

                    read1 = [
                        '@' + r1.query_name, r1.query_sequence, '+',
                        '2' * c.length
                    ]
                    read2 = [
                        '@' + r2.query_name,
                        mp.revcomp(r2.query_sequence), '+', '2' * c.length
                    ]

                else:  #write to Watson, but is Crick

                    read1 = [
                        '@' + r1.query_name,
                        mp.revcomp(r1.query_sequence), '+', '2' * c.length
                    ]
                    read2 = [
                        '@' + r2.query_name, r2.query_sequence, '+',
                        '2' * c.length
                    ]

                wout1.write('\n'.join(read1) + '\n')
                wout2.write('\n'.join(read2) + '\n')

        with open(c1, 'w') as cout1, open(c2, 'w') as cout2:

            for r1, r2 in Creads:

                if r1.get_tag('OS') == 'C':  #this is true C

                    read1 = [
                        '@' + r1.query_name,
                        mp.revcomp(r1.query_sequence), '+', '2' * c.length
                    ]
                    read2 = [
                        '@' + r2.query_name, r2.query_sequence, '+',
                        '2' * c.length
                    ]

                else:  #write to Crick, but is Watson

                    read1 = [
                        '@' + r1.query_name, r1.query_sequence, '+',
                        '2' * c.length
                    ]
                    read2 = [
                        '@' + r2.query_name,
                        mp.revcomp(r2.query_sequence), '+', '2' * c.length
                    ]

                cout1.write('\n'.join(read1) + '\n')
                cout2.write('\n'.join(read2) + '\n')

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print(
            '[' + now +
            '][Message] Mapping Watson and Crick reads to the original reference'
        )

        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) +
                              '.W.srt.bam')

        sam_cmd = [
            'minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
            '-t',
            str(c.threads), '-R', '@RG\\tID:illumina\\tSM:strand', c.REF, w1,
            w2
        ]
        bam_cmd = [
            'samtools', 'sort', '-@',
            str(round(c.threads / 2)), '-o', BAM
        ]

        p1 = subprocess.Popen(sam_cmd,
                              stderr=open(os.devnull, 'wb'),
                              stdout=subprocess.PIPE)
        bout = open(BAM, 'wb')
        p2 = subprocess.run(bam_cmd,
                            stdin=p1.stdout,
                            stderr=open(os.devnull, 'wb'),
                            stdout=bout)
        bout.close()

        os.remove(w1)
        os.remove(w2)

        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) +
                              '.C.srt.bam')

        sam_cmd = [
            'minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
            '-t',
            str(c.threads), '-R', '@RG\\tID:illumina\\tSM:strand', c.REF, c1,
            c2
        ]
        bam_cmd = [
            'samtools', 'sort', '-@',
            str(round(c.threads / 2)), '-o', BAM
        ]

        p1 = subprocess.Popen(sam_cmd,
                              stderr=open(os.devnull, 'wb'),
                              stdout=subprocess.PIPE)
        bout = open(BAM, 'wb')
        p2 = subprocess.run(bam_cmd,
                            stdin=p1.stdout,
                            stderr=open(os.devnull, 'wb'),
                            stdout=bout)
        bout.close()

        os.remove(c1)
        os.remove(c2)
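The WR and CR helpers used above are not shown. For orientation only, a minimal sketch of the selection rule stated in the in-code comments (Watson: read1 forward, read2 reverse; Crick: the opposite), ignoring the SCE interval handling the real helpers perform via ivf:

def is_watson(r1, r2):
    # Watson pair: read1 maps forward, read2 maps reverse
    return (not r1.is_reverse) and r2.is_reverse

def is_crick(r1, r2):
    # Crick pair: read1 maps reverse, read2 maps forward
    return r1.is_reverse and (not r2.is_reverse)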
Example #30
def BulkSim(w, c):
    '''
    Perform bulk simulations and re-align to the unmodified reference
    '''

    hfa = pyfaidx.Fasta(c.ffile)

    if w.chrom not in hfa.keys():

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Warning] Chromosome ' + w.chrom +
              ' not found in ' + c.ffile + '. Skipped simulation')

    else:

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Preparing simulation from ' + c.ffile +
              '. Clone ' + str(c.clonenumber) + '. Haplotype ' +
              str(c.hapnumber))

        chr_ = hfa[w.chrom]
        seq_ = chr_[w.start - 1:w.end].seq
        tmpfa = os.path.abspath(c.haplodir + '/' + 'htmp.fa')
        region = w.chrom + '_' + str(w.start) + '_' + str(w.end)

        with open(tmpfa,
                  'w') as tmpfout:  #write temporary fa for sampling reads

            tmpfout.write('>' + region + '\n' +
                          '\n'.join(re.findall('.{1,60}', seq_)) + '\n')

        Ns = seq_.count('N')  #normalize coverage on Ns
        Nreads = round(((c.regioncoverage * (len(seq_) - Ns)) / c.length) /
                       2)  #for paired-end sequencing

        mate1h = os.path.abspath(c.haplodir + '/hr1.tmp.fq')
        mate2h = os.path.abspath(c.haplodir + '/hr2.tmp.fq')

        if float(w[4]) < 100.0:

            tmpref = os.path.abspath(c.haplodir + '/' + 'rtmp.fa')
            seq__ = c.refall[w.chrom][w.start - 1:w.end].seq

            with open(tmpref,
                      'w') as tmpfout:  #write temporary fa for sampling reads

                tmpfout.write('>' + region + '\n' +
                              '\n'.join(re.findall('.{1,60}', seq__)) + '\n')

            #simulate part from reference and part from haplotype

            haploreadsN = round(Nreads / 100 * float(w[4]))

            hapcov = haploreadsN * c.length * 2 / ((w.end - w.start) - Ns)
            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print('[' + now +
                  '][Message] Simulated coverage for this region will be ' +
                  str(hapcov))

            refreadsN = Nreads - haploreadsN
            refcov = refreadsN * c.length * 2 / ((w.end - w.start) - Ns)
            print(
                '[' + now +
                '][Message] Simulated coverage for the corresponding reference region will be '
                + str(refcov))

            mate1r = os.path.abspath(c.haplodir + '/rr1.tmp.fq')
            mate2r = os.path.abspath(c.haplodir + '/rr2.tmp.fq')

            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print('[' + now + '][Message] Simulating')

            wgsim.core(r1=mate1h,
                       r2=mate2h,
                       ref=tmpfa,
                       err_rate=c.error,
                       mut_rate=c.mutation,
                       indel_frac=c.indels,
                       indel_ext=c.extindels,
                       N=haploreadsN,
                       dist=c.distance,
                       stdev=c.stdev,
                       size_l=c.length,
                       size_r=c.length,
                       max_n=0.05,
                       is_hap=0,
                       is_fixed=0,
                       seed=0)
            wgsim.core(r1=mate1r,
                       r2=mate2r,
                       ref=tmpref,
                       err_rate=c.error,
                       mut_rate=c.mutation,
                       indel_frac=c.indels,
                       indel_ext=c.extindels,
                       N=refreadsN,
                       dist=c.distance,
                       stdev=c.stdev,
                       size_l=c.length,
                       size_r=c.length,
                       max_n=0.05,
                       is_hap=0,
                       is_fixed=0,
                       seed=0)

            os.remove(tmpfa)
            os.remove(tmpref)

            mate1hnew = os.path.abspath(c.haplodir + '/hr1.fq')
            mate2hnew = os.path.abspath(c.haplodir + '/hr2.fq')

            with open(mate1hnew, 'w') as out1, open(mate2hnew, 'w') as out2:

                for (name1, seq1,
                     qual1), (name2, seq2,
                              qual2) in zip(mp.fastx_read(mate1h),
                                            mp.fastx_read(mate2h)):

                    #change name1/name2

                    newname1 = '@c' + str(c.clonenumber) + 'h' + str(
                        c.hapnumber) + 'fh_' + name1
                    newname2 = '@c' + str(c.clonenumber) + 'h' + str(
                        c.hapnumber) + 'fh_' + name2

                    read1 = [newname1, seq1, '+', qual1]
                    read2 = [newname2, seq2, '+', qual2]

                    out1.write('\n'.join(read1) + '\n')
                    out2.write('\n'.join(read2) + '\n')

            os.remove(mate1h)
            os.remove(mate2h)

            with open(mate1hnew, 'a') as out1, open(mate2hnew, 'a') as out2:

                for (name1, seq1,
                     qual1), (name2, seq2,
                              qual2) in zip(mp.fastx_read(mate1r),
                                            mp.fastx_read(mate2r)):

                    #change name1/name2

                    newname1 = '@c' + str(c.clonenumber) + 'h' + str(
                        c.hapnumber) + 'fr_' + name1
                    newname2 = '@c' + str(c.clonenumber) + 'h' + str(
                        c.hapnumber) + 'fr_' + name2

                    read1 = [newname1, seq1, '+', qual1]
                    read2 = [newname2, seq2, '+', qual2]

                    out1.write('\n'.join(read1) + '\n')
                    out2.write('\n'.join(read2) + '\n')

            os.remove(mate1r)
            os.remove(mate2r)

            #split in chunks for multiprocessing

        else:

            hapcov = Nreads * c.length * 2 / ((w.end - w.start) - Ns)
            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print('[' + now +
                  '][Message] Simulated coverage for this region will be ' +
                  str(hapcov))

            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print('[' + now + '][Message] Simulating')

            wgsim.core(r1=mate1h,
                       r2=mate2h,
                       ref=tmpfa,
                       err_rate=c.error,
                       mut_rate=c.mutation,
                       indel_frac=c.indels,
                       indel_ext=c.extindels,
                       N=Nreads,
                       dist=c.distance,
                       stdev=c.stdev,
                       size_l=c.length,
                       size_r=c.length,
                       max_n=0.05,
                       is_hap=0,
                       is_fixed=0,
                       seed=0)

            os.remove(tmpfa)

            mate1hnew = os.path.abspath(c.haplodir + '/hr1.fq')
            mate2hnew = os.path.abspath(c.haplodir + '/hr2.fq')

            with open(mate1hnew, 'w') as out1, open(mate2hnew, 'w') as out2:

                for (name1, seq1,
                     qual1), (name2, seq2,
                              qual2) in zip(mp.fastx_read(mate1h),
                                            mp.fastx_read(mate2h)):

                    #change name1/name2

                    newname1 = '@c' + str(c.clonenumber) + 'h' + str(
                        c.hapnumber) + 'fh_' + name1
                    newname2 = '@c' + str(c.clonenumber) + 'h' + str(
                        c.hapnumber) + 'fh_' + name2

                    read1 = [newname1, seq1, '+', qual1]
                    read2 = [newname2, seq2, '+', qual2]

                    out1.write('\n'.join(read1) + '\n')
                    out2.write('\n'.join(read2) + '\n')

            os.remove(mate1h)
            os.remove(mate2h)

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now +
              '][Message] Mapping simulated reads to the reference genome')

        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.srt.bam')

        sam_cmd = [
            'minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
            '-t',
            str(c.threads), '-R', '@RG\\tID:illumina\\tSM:bulk', c.REF,
            mate1hnew, mate2hnew
        ]
        bam_cmd = [
            'samtools', 'sort', '-@',
            str(round(c.threads / 2)), '-o', BAM
        ]

        p1 = subprocess.Popen(sam_cmd,
                              stderr=open(os.devnull, 'wb'),
                              stdout=subprocess.PIPE)
        bout = open(BAM, 'wb')
        p2 = subprocess.run(bam_cmd,
                            stdin=p1.stdout,
                            stderr=open(os.devnull, 'wb'),
                            stdout=bout)
        bout.close()

        os.remove(mate1hnew)
        os.remove(mate2hnew)