Example #1
def diffSeq(line, fnRef, fnSim):
    MaxPrintChars = 100

    fpRef = open(fnRef, 'r')
    fpSim = open(fnSim, 'r')
    words = line.strip().split()

    chrom = words[0]
    refPos0 = int(words[1])
    refPos1 = int(words[2])
    simPos0 = int(words[7])
    simPos1 = int(words[8])

    print(line.strip())
    print('---------')
    if refPos1 - refPos0 + 1 < MaxPrintChars and simPos1 - simPos0 + 1 < MaxPrintChars:
        for name, seq, qual in readfq(fpRef):
            if name == chrom:
                print(seq[refPos0 - 1:refPos1])
                break
        for name, seq, qual in readfq(fpSim):
            if name == chrom:
                print(seq[simPos0 - 1:simPos1])
                break
    fpRef.close()
    fpSim.close()
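
All of the examples on this page assume Heng Li's readfq generator, which yields one (name, seq, qual) tuple per FASTA/FASTQ record (qual is None for FASTA records). For reference, a lightly adapted sketch of that generator:

def readfq(fp):  # generator: yields (name, seq, qual) for each record in fp
    last = None  # buffered header line carried over between records
    while True:
        if not last:  # scan ahead to the next header line
            for l in fp:
                if l[0] in '>@':
                    last = l[:-1]
                    break
        if not last:
            break
        name, seqs, last = last[1:].partition(' ')[0], [], None
        for l in fp:  # collect sequence lines up to the next marker
            if l[0] in '@+>':
                last = l[:-1]
                break
            seqs.append(l[:-1])
        if not last or last[0] != '+':  # FASTA record: no quality string
            yield name, ''.join(seqs), None
            if not last:
                break
        else:  # FASTQ record: read a quality string of matching length
            seq, leng, quals = ''.join(seqs), 0, []
            for l in fp:
                quals.append(l[:-1])
                leng += len(l) - 1
                if leng >= len(seq):
                    last = None
                    yield name, seq, ''.join(quals)
                    break
            if last:  # hit EOF before enough quality: yield as FASTA
                yield name, seq, None
                break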
Example #3
def demultiplex_dual_barcodes():
    parser = argparse.ArgumentParser()
    parser.add_argument('--index1_file', required=True)
    parser.add_argument('--index1_max_distance', type=int, required=True)
    parser.add_argument('--index2_file', required=True)
    parser.add_argument('--index2_max_distance', type=int, required=True)
    parser.add_argument('--read1_fq', required=True)
    parser.add_argument('--read2_fq', required=True)
    parser.add_argument('--output_dir', required=True)
    #parser.add_argument('--output_file_prefix')
    args = parser.parse_args()

    bc = dict()
    bce = dict()
    for i in [1, 2]:
        bc[i] = read_barcode_dict(args.__dict__['index' + str(i) + '_file'])
        bce[i] = expand_barcode_dict(bc[i], args.__dict__['index' + str(i) + '_max_distance'])

    outfiles = dict()

    def output_file(filename):
        if filename not in outfiles:
            outfiles[filename] = gzip.open(filename, 'wt')
        return outfiles[filename]

    f1 = readfq(open(args.read1_fq))
    f2 = readfq(open(args.read2_fq))

    for name1, seq1, qual1 in f1:
        m = re.search('#([ACGTN]+)_([ACGTN]+)/', name1)  # index pair embedded in the read name

        sample_id = dict()
        for i in [1, 2]:
            index_seq = m.group(i)
            if index_seq in bce[i]:
                sample_id[i] = bc[i][bce[i][index_seq]]
            else:
                sample_id[i] = None

        if sample_id[1] is None or sample_id[2] is None:
            filename_suffix = 'Unassigned'
        else:
            filename_suffix = sample_id[1] + '_' + sample_id[2]

        base_output_filename = args.output_dir + '/' + filename_suffix

        f = output_file(base_output_filename + '_R1.fq.gz')
        f.write('\n'.join(['@' + name1, seq1, '+', qual1, '']))

        name2, seq2, qual2 = next(f2)
        f = output_file(base_output_filename + '_R2.fq.gz')
        f.write('\n'.join(['@' + name2, seq2, '+', qual2, '']))
        
        #print(output_filename)


    for f in outfiles.values():
        f.close()
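
read_barcode_dict and expand_barcode_dict are not shown above. A plausible sketch, assuming the barcode file holds one SEQUENCE<TAB>SAMPLE pair per line and that "max distance" means substitutions only (Hamming distance):

import itertools

def read_barcode_dict(path):
    # hypothetical helper: maps barcode sequence -> sample id
    bc = {}
    with open(path) as fh:
        for line in fh:
            seq, sample = line.split()[:2]
            bc[seq] = sample
    return bc

def expand_barcode_dict(bc, max_distance):
    # map every sequence within max_distance substitutions of a barcode
    # back to that barcode, so demultiplexing is a single dict lookup
    expanded = {}
    for barcode in bc:
        for positions in itertools.combinations(range(len(barcode)), max_distance):
            for subs in itertools.product('ACGTN', repeat=max_distance):
                variant = list(barcode)
                for pos, base in zip(positions, subs):
                    variant[pos] = base
                expanded[''.join(variant)] = barcode
    return expanded

Substituting a position with its own base is allowed, so the table covers every distance up to max_distance; variants that fall within range of two different barcodes are silently resolved to whichever barcode is processed last.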
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input",
                        help="path for the fasta file to build bwt",
                        type=str)
    parser.add_argument("save", help="path to save bwt", type=str)

    try:
        args = parser.parse_args()

    except:
        parser.print_help()
        sys.exit(1)

    handle = open(args.input)

    seqs = []
    rc_seqs = []
    names = []
    for name, seq, qual in readfq(handle):
        seqs.append(seq)
        rc_seqs.append(revcomp(seq))
        names.append(name)
    print("build forward index")
    fm_index = FMindex(seqs, names)
    print("finish build")

    print("build reverse index")
    rc_index = FMindex(rc_seqs, names)
    print("finish build")
    index = [fm_index, rc_index]
    with open(args.save, "wb") as fh:
        pickle.dump(index, fh)
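
revcomp is another helper the examples assume; a minimal sketch:

def revcomp(seq):
    # reverse complement, assuming an uppercase DNA alphabet plus N
    return seq.translate(str.maketrans('ACGTN', 'TGCAN'))[::-1]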
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("query", help="path for query sequence pasta file", type=str)
    parser.add_argument("target", help="path for target sequence pasta file", type=str)
    parser.add_argument("threshold", help="threshold to report overlap", type=int)
    parser.add_argument("-align", required=False, action='store_true')

    try:
        args = parser.parse_args()

    except:
        parser.print_help()
        sys.exit(1)

    # build index for all target sequence
    handle = open(args.target)
    target_dict = {}
    seqs = []
    names = []
    for name, seq, qual in readfq(handle):
        seqs.append(seq)
        names.append(name)
        target_dict[name] = seq
    print("build index")
    fm_index = FMindex(seqs, names)
    print("finish build")

    # test overlap
    handle = open(args.query)
    for name, seq, qual in readfq(handle):
        # print(name)

        outputs = fm_index.find_overlaps(seq, name, args.threshold)
        for output in outputs:
            if output[0] != output[3]:
                output.append("fw")
                print("\t".join(output))
                if args.align:
                    print_align(output, seq, target_dict[output[3]])

        outputs = fm_index.find_overlaps(revcomp(seq), name, args.threshold)
        for output in outputs:
            if (output[0] != output[3]):
                output.append("rc")
                print("\t".join(output))
                if args.align:
                    print_align(output, seq, target_dict[output[3]])
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("query", help="path for query sequence pasta file", type=str)
    parser.add_argument("index", help="path for bwt index", type=str)
    parser.add_argument("threshold", help="threshold to report overlap", type=int)

    try:
        args = parser.parse_args()

    except:
        parser.print_help()
        sys.exit(1)

    # build index for all target sequence
    fw_index, rc_index = pickle.load(open(args.index, "rb"))

    # test overlap
    handle = open(args.query)
    for name, seq, qual in readfq(handle):
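        # read as-is vs. forward index: forward-strand suffix overlaps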
        outputs = fw_index.find_overlaps(seq, name, args.threshold)
        for output in outputs:
            if output[0] != output[3]:
                output.append("fw")
                output.append("suffix")
                print("\t".join(output))

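        # reverse-complemented read vs. forward index: reverse-strand suffix overlaps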
        outputs = fw_index.find_overlaps(revcomp(seq), name, args.threshold)
        for output in outputs:
            if (output[0] != output[3]):
                output.append("rc")
                output.append("suffix")
                print("\t".join(output))

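        # reverse-complemented read vs. reverse-complement index: forward-strand prefix overlaps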
        outputs = rc_index.find_overlaps(revcomp(seq), name, args.threshold)
        for output in outputs:
            if (output[0] != output[3]):
                output.append("fw")
                output.append("prefix")
                print("\t".join(output))

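        # read as-is vs. reverse-complement index: reverse-strand prefix overlaps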
        outputs = rc_index.find_overlaps(seq, name, args.threshold)
        for output in outputs:
            if (output[0] != output[3]):
                output.append("rc")
                output.append("prefix")
                print("\t".join(output))
Example #7
        genes_not_trusted,
        genes_not_trusted_conditioned,
        trusted_SVnames,
        threshold_least_exons=nb_exons,
        threshold_most_introns=nb_introns,
        threshold_exon_cov=cov_exons,
        threshold_intron_cov=cov_introns,  # CIC cases are filtered here
        is_verbose=is_verbose,
        is_debug=is_debug)

    SVseqs = {}
    fs = [dir_fasta + _ for _ in os.listdir(dir_fasta) if _.endswith('.fa')]
    qnames = [_.split('@')[1] for _ in PPG_100flk.keys()]
    for f in fs:
        with open(f) as file:
            for qname, seq, qual in readfq(file):
                if qname in qnames:
                    SVseqs[qname] = seq
    print('loaded {0} SV seqs from {1} files'.format(len(SVseqs), len(fs)))

    SVflks = {}
    fs = [dir_flk + _ for _ in os.listdir(dir_flk) if _.endswith('.fa')]
    qnames = [_.split('@')[1] for _ in PPG_100flk.keys()]
    for f in fs:
        with open(f) as file:
            for qname, seq, qual in readfq(file):
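                # flank records are assumed to be named '<qname>_l' / '<qname>_r'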
                if qname[:-2] in qnames:
                    if qname[:-2] not in SVflks:
                        SVflks[qname[:-2]] = {'r': None, 'l': None}
                    SVflks[qname[:-2]][qname[-1]] = seq
    print('loaded {0} SV flankings from {1} files'.format(
Example #8
parser.add_argument('-r', '--ref', metavar='ref.fa', required=True, dest='ref', help='input reference (required)')
parser.add_argument('-o', '--outVCF', metavar='out.vcf', required=True, dest='outVCF', help='output vcf file (required)')
args = parser.parse_args()

# Get all the reference positions
selectedPos = collections.defaultdict(set)
nuclDict = collections.defaultdict(set)
if args.vcfFile:
    vcf_reader = gzip.open(args.vcfFile, 'rt') if args.vcfFile.endswith('.gz') else open(args.vcfFile, 'r')
    for line in vcf_reader:
        if line.startswith('#'):
            continue
        fields = line.split('\t', 2)
        selectedPos[fields[0]].add(int(fields[1]))
        nuclDict[(fields[0], int(fields[1]))] = 'N'
    vcf_reader.close()

# Store the true reference nucleotide
f_in = gzip.open(args.ref, 'rt') if args.ref.endswith('.gz') else open(args.ref)
for seqName, seqNuc, seqQuals in readfq(f_in):
    for pos in selectedPos[seqName]:
        nuclDict[(seqName, pos)] = seqNuc[(pos-1):pos]

# Replace vcf reference allele
if args.vcfFile:
    vcf_reader = vcf.Reader(open(args.vcfFile, 'rb'), compressed=True) if args.vcfFile.endswith('.gz') else vcf.Reader(open(args.vcfFile, 'r'))
    vcf_writer = vcf.Writer(open(args.outVCF, 'w'), vcf_reader, lineterminator='\n')
    for record in vcf_reader:
        record.REF = nuclDict[(record.CHROM, record.POS)]
        vcf_writer.write_record(record)
Example #9
        sys.exit(1)
    fp_in_r1 = open(sys.argv[1], 'r')
    fp_in_r2 = open(sys.argv[2], 'r')
    fp_ann = open(sys.argv[3], 'r')
    fp_out_r1 = open(sys.argv[4] + '1.fq', 'w')
    fp_out_r2 = open(sys.argv[4] + '2.fq', 'w')
    fp_ans = open(sys.argv[4] + '.ans', 'w')
    # Read the annotation file into the position maps
    posRefStart, posSimStart, posRefEnd, posSimEnd = {}, {}, {}, {}
    for line in fp_ann:
        a = read_ann.ann(line)
        createPosMap(a, posRefStart, posRefEnd, posSimStart, posSimEnd)
        #print posRefEnd, posSimEnd
    # Convert read 1: map simulated coordinates back to reference coordinates
    tot_num = 0
    for name, seq, qual in readfq(fp_in_r1):
        words = name.split('_')
        chrom = words[0]
        simPos0 = int(words[1])
        simPos1 = int(words[2])
        refPos0 = simPos2Ref(chrom, simPos0,
                             posRefStart, posRefEnd,
                             posSimStart, posSimEnd, fp_ans)
        refPos1 = simPos2Ref(chrom, simPos1,
                             posRefStart, posRefEnd,
                             posSimStart, posSimEnd, fp_ans)
        words[1], words[2] = str(refPos0), str(refPos1)
        name = '_'.join(words)
Example #10
import re
import sys
from readfq import readfq

complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N':'N'}

if len(sys.argv) < 3:
    sys.exit("error: too few arguments\nusage: find_motif.py infile.fa <motifseq>\n")

seqs = dict()
infile = open(sys.argv[1])
for name, seq, qual in readfq(infile):
    seqs[name] = seq

motif_str = sys.argv[2].upper()
motif_str_rc = "".join([complement[b] for b in motif_str[::-1]])
pat = "(%s|%s)" % (motif_str.replace("N", "[ATCG]"), motif_str_rc.replace("N", "[ATCG]"))
motif = re.compile(pat, flags=re.IGNORECASE)

for name, seq in seqs.items():
    matches = motif.finditer(seq)
    for match in matches:
        if match is not None:
            if len(sys.argv) == 4:
                print("\t".join(map(str, (name, match.start(), match.end(), sys.argv[3]))))
            else:
                print("\t".join(map(str, (name, match.start(), match.end()))))
            #print "\t".join(map(str, (name, match.start(), match.end(), seq[match.start():match.end()])))
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("seed",
                        help="path for query sequence pasta file",
                        type=str)
    parser.add_argument("data", help="path for all reads", type=str)
    parser.add_argument("index", help="path for bwt index", type=str)
    parser.add_argument("threshold",
                        help="threshold to report overlap",
                        type=int)
    parser.add_argument("output", help="path for output fasta", type=str)

    try:
        args = parser.parse_args()

    except:
        parser.print_help()
        sys.exit(1)

    # build index for all target sequence
    seq_dict = SeqIO.index(args.data, 'fasta')
    sets = []
    seeds = set()
    total = set()
    handle = open(args.seed)
    for name, seq, qual in readfq(handle):
        seeds.add(name)
    sets.append(seeds)
    total = total.union(seeds)

    fw_index, rc_index = pickle.load(open(args.index, "rb"))

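    # breadth-first expansion of the overlap graph: each round recruits the
    # reads that overlap the previous frontier, until no new reads are found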
    new_set = seeds
    while len(new_set) > 0:
        prev_set = new_set
        new_set = set()
        for name in prev_set:

            seq = str(seq_dict[name].seq)
            outputs = fw_index.find_overlaps(seq, name, args.threshold)
            for output in outputs:
                if output[3] not in total:
                    new_set.add(output[3])

            outputs = fw_index.find_overlaps(revcomp(seq), name,
                                             args.threshold)
            for output in outputs:
                if output[3] not in total:
                    new_set.add(output[3])

            outputs = rc_index.find_overlaps(revcomp(seq), name,
                                             args.threshold)
            for output in outputs:
                if output[3] not in total:
                    new_set.add(output[3])

            outputs = rc_index.find_overlaps(seq, name, args.threshold)
            for output in outputs:
                if output[3] not in total:
                    new_set.add(output[3])

        sets.append(new_set)
        total = total.union(new_set)
        print("Find one set of reads {}".format(len(new_set)))
        print('Total number of reads is {}'.format(len(total)))

    outputs = []
    for seq_id in total:
        outputs.append(seq_dict[seq_id])

    with open(args.output, 'w') as out_fh:
        SeqIO.write(outputs, out_fh, 'fasta')
Example #12
                    required=True,
                    dest='ref',
                    help='input reference (required)')
args = parser.parse_args()

# Get all the positions
bcf = cyvcf2.VCF(args.bcf)
selectedPos = collections.defaultdict(set)
for record in bcf:
    selectedPos[record.CHROM].add(record.POS)

# Store the prefix and suffix reference nucleotide
nuclPrevDict = collections.defaultdict(set)
nuclPostDict = collections.defaultdict(set)
f_in = gzip.open(args.ref, 'rt') if args.ref.endswith('.gz') else open(args.ref)
for seqName, seqNuc, seqQuals in readfq(f_in):
    for pos in selectedPos[seqName]:
        nuclPrevDict[(seqName, pos)] = seqNuc[(pos - 2):(pos - 1)]
        nuclPostDict[(seqName, pos)] = seqNuc[pos:(pos + 1)]

# Build mutation dictionary
mt = dict()
for i in ['A', 'C', 'G', 'T']:
    for j in ['A', 'C', 'G', 'T']:
        for k in ['A', 'C', 'G', 'T']:
            for l in ['A', 'C', 'G', 'T']:
                if j != k:
                    if (j == 'C') or (j == 'T'):
                        mt[(i, j, k, l)] = (i, j, k, l)
                    else:
                        mt[(i, j, k, l)] = (rev(l), rev(j), rev(k), rev(i))
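
Each key (i, j, k, l) encodes a substitution j>k with 5' neighbour i and 3' neighbour l; purine-reference types are folded onto the reverse-complement strand so every class is pyrimidine-centred, the usual convention for trinucleotide mutation types. A standalone check of one entry (rev here is a stand-in for the elided single-base complement helper):

rev = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}.get
# A[G>T]C folded onto the opposite strand becomes G[C>A]T
assert (rev('C'), rev('G'), rev('T'), rev('A')) == ('G', 'C', 'A', 'T')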
Example #13
            status = 0  # unless new best ref matches last best ref

        out_ls_fh.write('\t'.join([
            central_sample_id,
            best_qc[central_sample_id][2],
            best_qc[central_sample_id][1],
            str(status),
        ]) + '\n')

        # Add this sequence's PAG to the best_published_names set
        best_published_names.add(best_qc[central_sample_id][1])

# Iterate the matched FASTA and print out sequences that have a name in the best_published_names set
seen_best_published_names = set([])
with open(args.fasta) as latest_fasta_fh:
    for name, seq, qual in readfq(latest_fasta_fh):
        # Apparently I write the names out wrong so that's good
        curr_pag = name.split('|')[0].replace('COGUK', 'COG-UK')
        central_sample_id = curr_pag.split('/')[1]
        if curr_pag in best_published_names:
            # Remove deletion chars (https://github.com/COG-UK/dipi-group/issues/38)
            seq = seq.replace('-', '')

            sys.stdout.write('>%s\n%s\n' % (central_sample_id, seq))
            seen_best_published_names.add(curr_pag)
sys.stderr.write("[NOTE] %s best sequences written.\n" %
                 len(seen_best_published_names))
sys.stderr.write("[NOTE] %s best sequences missing.\n" %
                 (len(best_published_names) - len(seen_best_published_names)))

if len(seen_best_published_names) != len(best_published_names):
Example #14
#!/usr/bin/env python
# nuccount.py -- tally nucleotides in a file
import sys
from collections import Counter
from readfq import readfq

IUPAC_BASES = "ACGTRYSWKMBDHVN-."

# initialize counter
counts = Counter()

for name, seq, qual in readfq(sys.stdin):
    # for each sequence entry, add all its bases to the counter
    counts.update(seq.upper())

# print the results
for base in IUPAC_BASES:
    print base + "\t" + str(counts[base])
Example #15
def main(debug=False):
    parser = argparse.ArgumentParser()
    parser.add_argument("path", help="path for sequence pasta file", type=str)
    parser.add_argument("k", help="size of kmer", type=int)
    parser.add_argument("perm", help="number of permutation", type=int)
    parser.add_argument("threshold",
                        help="threshold to report similar",
                        type=float)
    parser.add_argument("-cluster",
                        help="generate clusters instead of find similar reads",
                        required=False,
                        action='store_true')
    parser.add_argument("-debug", required=False, action='store_true')

    try:
        args = parser.parse_args()

    except:
        parser.print_help()
        sys.exit(1)

    handle = open(args.path)

    lsh = BioLSH(threshold=args.threshold, num_perm=args.perm)
    minhash_dict = {}
    minhash_rc_dict = {}

    if args.debug:
        set_dict = {}

    with lsh.insertion_session() as session:
        for name, seq, qual in readfq(handle):
            m_hash = kmer_minhash(seq, args.k, args.perm, debug=args.debug)
            """
            if name == 'r1':
                m_hash.set_hashvalues(np.array([0, 11, 22, 3, 5, 6, 9, 45, 98, 0, 1, 7]))
            elif name == 'r2':
                m_hash.set_hashvalues(np.array([11, 9, 3, 4, 98, 0, 1, 7, 23, 15, 0, 31]))
            """

            session.insert(name, m_hash)
            minhash_dict[name] = m_hash
            minhash_rc_dict[name] = kmer_minhash(seq,
                                                 args.k,
                                                 args.perm,
                                                 rc=True)

            if args.debug:
                set_dict[name] = kmer_set(seq, args.k)

    if args.cluster:
        clusters = lsh.cluster()
        for cluster in clusters:
            print("\t".join(cluster))

    else:

        for id in minhash_dict.keys():
            fw_result = lsh.query(minhash_dict[id])
            rc_result = lsh.query(minhash_rc_dict[id])
            result = list(set(fw_result).union(set(rc_result)))
            for similar_id in result:
                if id != similar_id:
                    print("{}\t{}".format(id, similar_id))

    if args.debug:
        for id in minhash_dict.keys():
            for query_id in minhash_dict.keys():
                print()
                print("query_id: {} target_id: {}".format(query_id, id))
                result = minhash_dict[query_id].jaccard(minhash_dict[id])
                print("MinHash Estimated:", result)
                result = float(
                    len(set_dict[query_id].intersection(
                        set_dict[id]))) / float(
                            len(set_dict[query_id].union(set_dict[id])))
                print("Real Similarity", result)
                result = minhash_dict[query_id].hamming(minhash_dict[id])
                print("Hamming distance of signature vector:", result)
Example #16
import sys
from readfq import readfq
import read_ann

usage = 'map <Ref> <sim_genome> <sim_ann>'

if __name__ == '__main__':
    if len(sys.argv) < 4:
        print(usage, file=sys.stderr)
        sys.exit(1)
    fp_ref = open(sys.argv[1], 'r')
    fp_sim_genome = open(sys.argv[2], 'r')
    fp_ann = open(sys.argv[3], 'r')
    last = None
    sim_reader = readfq(fp_sim_genome)  # create the generator once; re-wrapping it per iteration would drop records
    for chrom_ref, seq_ref, qual_ref in readfq(fp_ref):
        chrom_sim, seq_sim, qual_sim = next(sim_reader)
        if chrom_ref != chrom_sim:
            print('[Error]: Diff chromosome!', file=sys.stderr)
            sys.exit(1)
        last = read_ann.ann(fp_ann.readline())

    fp_ref.close()
    fp_sim_genome.close()
    fp_ann.close()
Example #17
def main(debug=False):
    parser = argparse.ArgumentParser()
    parser.add_argument("path", help="path for sequence pasta file", type=str)
    parser.add_argument("groundtruth",
                        help="path for sequence pasta file",
                        type=str)
    parser.add_argument("k", help="size of kmer", type=int)
    parser.add_argument("perm", help="number of permutation", type=int)

    try:
        args = parser.parse_args()

    except:
        parser.print_help()
        sys.exit(1)

    handle = open(args.path)

    truth = collections.defaultdict(dict)
    with open(args.groundtruth) as f1:
        for line in f1:
            sp = line.split()
            query_id = sp[0]
            target_id = sp[1]
            overlap_size = int(sp[2])
            truth[query_id][target_id] = overlap_size

    minhash_dict = {}
    minhash_rc_dict = {}

    for name, seq, qual in readfq(handle):
        m_hash = kmer_minhash(seq, args.k, args.perm)
        minhash_dict[name] = m_hash
        minhash_rc_dict[name] = kmer_minhash(seq, args.k, args.perm, rc=True)

    identity_true = []
    identity_false = []
    identities = []
    jaccard_true = []
    jaccard_false = []
    jaccards = []
    for id in minhash_dict.keys():
        for query_id in minhash_dict.keys():
            if id != query_id:
                jaccard = minhash_dict[query_id].jaccard(minhash_dict[id])
                identity = minhash_dict[query_id].identity(minhash_dict[id])
                jaccards.append(jaccard)
                identities.append(identity)
                if id in truth[query_id]:
                    jaccard_true.append(jaccard)
                    identity_true.append(identity)
                else:
                    jaccard_false.append(jaccard)
                    identity_false.append(identity)

    cur_dir = os.path.dirname(args.path)
    cur_file = os.path.basename(args.path).split(".")[0]
    jaccard_file = os.path.join(
        cur_dir,
        cur_file + '_jaccard_k_{}_perm_{}.png'.format(args.k, args.perm))
    identity_file = os.path.join(
        cur_dir,
        cur_file + '_identity_k_{}_perm_{}.png'.format(args.k, args.perm))

    plt.figure()
    hist, bins = np.histogram(identities, bins=50)
    width = 0.7 * (bins[1] - bins[0])
    center = (bins[:-1] + bins[1:]) / 2
    hist1, bins = np.histogram(identity_true, bins=bins, density=True)
    center = (bins[:-1] + bins[1:]) / 2
    plt.bar(center,
            hist1,
            align='center',
            width=width,
            label="True",
            alpha=0.7)
    hist1, bins = np.histogram(identity_false, bins=bins, density=True)
    center = (bins[:-1] + bins[1:]) / 2
    plt.bar(center,
            hist1,
            align='center',
            width=width,
            label="False",
            alpha=0.7)
    plt.legend()
    plt.savefig(identity_file)

    plt.figure()
    hist, bins = np.histogram(jaccards, bins=50)
    width = 0.7 * (bins[1] - bins[0])
    center = (bins[:-1] + bins[1:]) / 2
    hist1, bins = np.histogram(jaccard_true, bins=bins, density=True)
    center = (bins[:-1] + bins[1:]) / 2
    plt.bar(center,
            hist1,
            align='center',
            width=width,
            label="True",
            alpha=0.7)
    hist1, bins = np.histogram(jaccard_false, bins=bins, density=True)
    center = (bins[:-1] + bins[1:]) / 2
    plt.bar(center,
            hist1,
            align='center',
            width=width,
            label="False",
            alpha=0.7)
    plt.legend()
    plt.savefig(jaccard_file)
Example #18
sys.stderr.write("[NOTE] %d samples with metadata loaded\n" %
                 len(parsed_metadata))
if len(seen_pags) != len(best_pags):
    missing = set(best_pags.values()) - seen_pags
    for pag in missing:
        sys.stderr.write(
            "[WARN] Best PAG found for %s but not matched to metadata\n" % pag)
    sys.exit(3)

# Load the FASTA, lookup and emit the sample_date and genome sequence
print(','.join([
    "COG-ID",
    "Sample_date",
    "Adm1",
    "Pillar",
    "Published_date",
    "Sequence",
]))
with open(args.fasta) as all_fh:
    for name, seq, qual in readfq(all_fh):
        central_sample_id = name

        print(','.join([
            central_sample_id,
            parsed_metadata[central_sample_id]["collection_or_received_date"],
            parsed_metadata[central_sample_id]["adm1"],
            parsed_metadata[central_sample_id]["collection_pillar"],
            parsed_metadata[central_sample_id]["published_date"],
            seq,
        ]))
Example #19
        print(usage)
        sys.exit(1)

    fp_ref = open(sys.argv[1], 'r')
    fp_var = open(sys.argv[2], 'r')
    fp_sim_genome = open(sys.argv[3] + ".simGenome.fa", 'w')
    fp_sim_ann = open(sys.argv[3] + ".simAnn", 'w')
    sim_genome = ''

    cur_var = None
    line = fp_var.readline().strip()
    if len(line) == 0:
        print("File %s is empty" % sys.argv[2], file=sys.stderr)
        sys.exit(1)
    cur_var = var(line)
    for chrom, seq, qual in readfq(fp_ref):
        sim_genome_len = 0
        sim_genome = ''
        non_variant_region_start = 1
        non_variant_region_end = 1
        sim_start = -1
        sim_end = -1
        while cur_var.chr == chrom:
            non_variant_region_end = cur_var.pos_start

            if cur_var.type == 'SNV' or cur_var.type == 'SNP':
                sim_genome += seq[non_variant_region_start -
                                  1:non_variant_region_end] + cur_var.seq
                sim_genome_len += non_variant_region_end - non_variant_region_start + 1 + 1
                #print >>sys.stderr, '>Ref_%d_%d\n%s'%(non_variant_region_start-1, non_variant_region_end+1, seq[non_variant_region_start-1:non_variant_region_end+1])
                #print >>sys.stderr, '@Sim SNP\n%s'%(seq[non_variant_region_start-1:non_variant_region_end]+cur_var.seq)
Example #21
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--ref", required=True)
    parser.add_argument("--msa", required=True)
    parser.add_argument("-n", type=int, required=True, help="number of sequences to process, will be divided amongst threads")
    parser.add_argument("-t", "--threads", type=int, default=4)
    args = parser.parse_args()

    # Check files exist
    for fpt, fp in ("REF", args.ref), ("MSA", args.msa):
        if not os.path.isfile(fp):
            sys.stderr.write("[FAIL] Could not open %s %s.\n" % (fpt, fp))
            sys.exit(1)
        else:
            sys.stderr.write("[NOTE] %s: %s\n" % (fpt, fp))

    sys.stderr.write("[NOTE] NUM_SEQUENCES: %d\n" % args.n)
    sys.stderr.write("[NOTE] ASKLEPIAN_VARIANT_THREADS: %d\n" % args.threads)

    # Load the ref and assign it to ref_seq
    with open(args.ref) as canon_fh:
        name = seq = None  # guard: readfq yields nothing for an empty file
        for name, seq, qual in readfq(canon_fh):
            break
        if not name:
            sys.stderr.write("[FAIL] Could not read sequence from reference.\n")
            sys.exit(2)
        else:
            ref_seq = seq

    write_q = multiprocessing.Queue()
    processes = []

    writer_process = multiprocessing.Process(
        target=write_worker,
        args=(
            write_q,
            args.threads,
        ),
    )
    processes.append(writer_process)

    window_l = ceil(args.n / float(args.threads))
    for window_i, window_pos in enumerate(range(0, args.n, window_l)):
        start = window_pos
        end = window_pos + window_l - 1  # remove 1 as we don't use gte in worker
        if window_i == (args.threads - 1):
            end = args.n  # in case we've managed to screw the last window and it's too short, just set it to N

        sys.stderr.write("[WORK] Worker %d (%d, %d)\n" % (window_i, start, end))
        p = multiprocessing.Process(
            target=variant_worker,
            args=(
                write_q,
                ref_seq,
                args.msa,
                window_i,
                start,
                end,
            ),
        )
        processes.append(p)

    # Engage
    for p in processes:
        p.start()

    # Block
    for p in processes:
        p.join()

    sys.stderr.write("[DONE] All workers exited, bye!\n")
Example #22
    if not (isinstance(winSize, int) and isinstance(step, int)):
        raise Exception("**ERROR** type(winSize) and type(step) must be int.")
    if step > winSize:
        raise Exception("**ERROR** step must not be larger than winSize.")
    if winSize > len(sequence):
        raise Exception("**ERROR** winSize must not be larger than sequence length.")

    # Pre-compute number of chunks to emit (integer division, so range() gets ints)
    numOfChunks = ((len(sequence) - winSize) // step) + 1

    # Do the work
    for i in range(0, numOfChunks * step, step):
        yield sequence[i:i + winSize]
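
# A quick sanity check of the generator above, assuming the elided
# signature is sliding_window(sequence, winSize, step):
#   list(sliding_window("ACGTA", 3, 1)) -> ["ACG", "CGT", "GTA"]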

with open(reference, 'r') as fh:
    for _, seq, _ in readfq.readfq(fh):
        for i, kmer in enumerate(sliding_window(seq, kmer_size)):
            bloom.add(kmer, i)

i = 0
with open(reference, 'r') as fh:
    for _, seq, _ in readfq.readfq(fh):
        for i, kmer in enumerate(sliding_window(seq, kmer_size)):
            if i % 5 == 0:
                bloom.delete(kmer, i)

bloom.flush()
del bloom

bloom = pydablooms.load_dabloom(capacity=capacity,
                                error_rate=error_rate,
Example #23
def variant_worker(write_q, ref_seq, msa, window_i, start_record_i, end_record_i):
    # Open the MSA, iterate over each sequence and walk the genome to find
    # disagreements with the loaded reference
    # NOTE This particular MSA does not handle insertions
    record_i = -1
    first = True
    with open(msa, 'r') as all_fh:
        for name, seq, qual in readfq(all_fh):
            record_i += 1
            if record_i < start_record_i:
                continue
            if record_i > end_record_i:
                break

            if first:
                sys.stderr.write("[STAT] Worker %d started on record %d\n" % (window_i, record_i))
                first = False

            central_sample_id = name

            query_on_ref_pos = 0
            current_deletion_len = 0

            curr_lines = []
            for qbase in seq:
                if qbase == '-':
                    # Extend the length of the current deletion
                    current_deletion_len += 1
                else:
                    if current_deletion_len > 0:
                        # We've come to the end of a deletion, output it
                        curr_lines.append(','.join([
                            central_sample_id,
                            #str( "%d-%d" % ((query_on_ref_pos-current_deletion_len)+1, query_on_ref_pos) ),
                            str((query_on_ref_pos-current_deletion_len)+1),
                            "",
                            "%dD" % current_deletion_len,
                            "1",
                        ]))
                        current_deletion_len = 0

                # Now deletions are handled, check for single nucleotide variants
                # NOTE This includes missing data such as N
                # NOTE This algorithm does not consider INS against ref
                if qbase != ref_seq[query_on_ref_pos]:
                    if current_deletion_len == 0:
                        # SNV detected and we aren't in an active DEL
                        curr_lines.append(','.join([
                            central_sample_id,
                            str(query_on_ref_pos+1),
                            ref_seq[query_on_ref_pos],
                            qbase,
                            "0",
                        ]))

                # Advance pointer (this is overkill here but a useful starting point
                # for a future algo walking the ref for insertions)
                query_on_ref_pos += 1

            if current_deletion_len > 0:
                # Output the last deletion, if there is one 
                # (this is almost always going to be garbage but we include it for completeness)
                curr_lines.append(','.join([
                    central_sample_id,
                    #str( "%d-%d" % ((query_on_ref_pos-current_deletion_len)+1, query_on_ref_pos) ),
                    str((query_on_ref_pos-current_deletion_len)+1),
                    "",
                    "%dD" % current_deletion_len,
                    "1",
                ]))

            # Push curr lines to writer
            write_q.put( '\n'.join(curr_lines) + '\n' )

        # Break out, send sentinel to queue
        sys.stderr.write("[DONE] Worker %d finished at next record %d\n" % (window_i, record_i))
        write_q.put(None)
Example #24
    fastq_file1 = open(read_files[0], 'r')
    fastq_file2 = open(read_files[1], 'r')

    pool = Pool(processes=int(args.threads))

    read_comparisons = list()
    amplicon_reads = dict()
    amplicon_fastq_file_pairs = dict()
    no_read_amplicons = list()

    num_processed = 0
    num_perfect = 0
    num_mismatched = 0

    sys.stdout.write("Sorting reads into appropriate amplicons\n")
    for result in pool.imap(match_read_primers, zip(readfq.readfq(fastq_file1), readfq.readfq(fastq_file2),
                                                    itertools.repeat(primer_sets)), 100000):
        num_processed += 1
        if (num_processed % 10000) == 0:
            sys.stdout.write("Processed %s reads\n" % num_processed)

        if result['ratio1_amplicon'] != result['ratio2_amplicon']:
            num_mismatched += 1
            read1_filename = "%s-mismatched_R1.fastq" % args.output
            read2_filename = "%s-mismatched_R2.fastq" % args.output
            write_fastq_pairs(read1_filename, read2_filename, result)
            continue

        read1_fastq_name = "%s-%s_R1.fastq" % (args.output, result['ratio1_amplicon'])
        read2_fastq_name = "%s-%s_R2.fastq" % (args.output, result['ratio2_amplicon'])
        write_fastq_pairs(read1_fastq_name, read2_fastq_name, result)