Example #1
def test_process_bam_mismatches():
    tbam = os.path.join(DATA, "tmp.bam")
    bam = os.path.join(DATA, "ordered_umi.bam")
    if os.path.exists(tbam):
        os.remove(tbam)
    with captured_output() as (out, err):
        process_bam(bam, tbam, mismatches=1)
    assert os.path.exists(tbam)
    it = iter(out.getvalue().split("\n"))
    assert it.next().strip() == "1\t9\t10\t4\t2"
    assert it.next().strip() == "1\t11\t12\t2\t1"
    assert it.next().strip() == "1\t29\t30\t2\t1"

    bam_reader = Samfile(tbam)
    it = iter(bam_reader)
    r = it.next()
    assert r.pos == 4
    assert r.qname == "read8:UMI_ATTCAGGG"
    r = it.next()
    assert r.pos == 9
    assert r.qname == "read1:UMI_AAAAAGGG"
    r = it.next()
    assert r.pos == 9
    assert r.qname == "read4:UMI_AAAGGGGG"
    r = it.next()
    assert r.pos == 11
    assert r.qname == "read5:UMI_ATTTAGGG"
    bam_reader.close()
    os.remove(tbam)
Example #2
def get_sorted_aligned_reads(args, header, sequence):
    if args.reference_hash and os.path.exists(args.reference_hash):
        print("Loading index...")
        ref_index = load_hash(args.reference_hash)
    else:
        print("Computing reference index...")
        ref_index = build_hashtable(sequence, args.kmer, args.stride)
        save_hash(*ref_index, file=args.reference_hash)
    print("Verifying hash...")
    for hash_, offset_ in islice(ref_index[0].iteritems(), 20):
        if not verify_hash(sequence, offset_, args.kmer, hash_):
            raise ValueError(
                'Index failed to verify: offset {} has mismatching hashes'.
                format(offset_))
    print("Aligning reads...")
    pair_iterator = read_paired_fasta(args.reads_file)
    sam_iterator = align_pairs(sequence, ref_index, pair_iterator, 'hw2_rg')
    sam_iterator = iter(sorted(sam_iterator, cmp=compsam))
    if args.out_bam:
        outfile = Samfile(args.out_bam,
                          'wb',
                          header=SAM_HEADER(header, sequence))
        for read in sam_iterator:
            outfile.write(read)
        outfile.close()
        infile = Samfile(args.out_bam, 'rb')
        sam_iterator = infile
    return sam_iterator
Example #3
def single_end_sam_parsing(sam_list, cov, identity_threshold):
    match = {}
    to_process = []
    if sam_list[0] is None:
        print "The ene-to-end mapping of SE data produced an error."
    else:
        to_process.append(sam_list[0])
    if sam_list[1] is None:
        print "The local mapping mode of SE data  produced an error."
    else:
        to_process.append(sam_list[1])
    for single_sam in to_process:
        sam = Samfile(single_sam)
        for align in sam:
            if align.tid != -1:
                query_name, query_len, ref_name = align.qname, float(
                    align.rlen), sam.getrname(align.tid)
                if align.cigar is not None:
                    align_len, query_aligned_len = cigar_parsing(align.cigar)
                    nm = -1
                    if (query_aligned_len / query_len) * 100 >= cov:
                        for coppia in align.tags:
                            if coppia[0] == "NM":
                                nm = float(coppia[1])
                    if align_len != 0 and nm >= 0:
                        paired_perc_id = ((align_len - nm) / align_len) * 100
                        if paired_perc_id >= identity_threshold:
                            match.setdefault(query_name, set())
                            match[query_name].add(ref_name)
        sam.close()
    return match
Example #4
def callbase(bamfile, snpsites, out):
    BF = Samfile(bamfile, 'rb') #open your bam file
    SF = open(snpsites, 'r')    #the file containing snp site info
    RF = open(out, 'w')         #result file
    RF.write('ref_name\tpos\tRbase\tAbase\tA\tT\tC\tG\tN\tothers\n')
    for i in SF:
        if i.startswith('#'):
            continue
        else:
            line = ParseSNPsitesLine(i)
            vcf_pos = line.pos-1 #change 1-based to 0-based
            vcf_refname = line.chrom
            print 'processing: %s %s...'%(vcf_refname, str(vcf_pos))
            At, Tt, Ct, Gt, Nt, othert = 0, 0, 0, 0, 0, 0
            for col in BF.pileup(vcf_refname, vcf_pos, vcf_pos+1):
                if col.pos == vcf_pos:
                    vcf_Rbase = line.Rbase
                    vcf_Abase = line.Abase
                    for j in col.pileups:
                        yourbase = j.alignment.seq[j.qpos]
                        if yourbase == 'A': At += 1
                        elif yourbase == 'T': Tt += 1
                        elif yourbase == 'C': Ct += 1
                        elif yourbase == 'G': Gt += 1
                        elif yourbase == 'N': Nt += 1
                        else: othert += 1
        RF.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'%(vcf_refname, \
str(vcf_pos+1), vcf_Rbase, vcf_Abase, str(At), str(Tt), str(Ct), str(Gt), \
str(Nt), str(othert)))
    BF.close()
Example #6
def create_table(half_ext, feature_summit_file_name, bam_names, bam_counts, bam_list, output_file_name):

  # Initialization
  outLoc = "/".join(output_file_name.split("\t")[:-1]) + "/"
  command = "mkdir -p "+outLoc
  os.system(command)

  # Allowed chromosomes
  chrList = ["chr"+str(e) for e in range(1,23)+["X"]]

  # Fetching regions
  featureSummitFile = open(feature_summit_file_name,"r")
  regionList = []
  for line in featureSummitFile:
    ll = line.strip().split("\t")
    if(ll[0] not in chrList): continue
    region = [ll[0], int(ll[1])-half_ext, int(ll[2])+half_ext]
    if(int(region[1]) < 0): continue
    regionList.append(region)
  featureSummitFile.close()

  # Creating table
  matrix = []
  for i in range(0,len(bam_list)):
    inputBamFileName = bam_list[i]
    correctFactor = float(bam_counts[i])/1000000
    extension = inputBamFileName.split(".")[-1]
    if(extension == "bam"):
      bamFile = Samfile(inputBamFileName,"rb")
      vec = []
      for region in regionList:
        try: bamSignal = fetchSignal(bamFile, region) / correctFactor
        except Exception: bamSignal = 0
        vec.append(bamSignal)
    elif(extension == "bw" or extension == "bigwig"):
      bamFile = pyBigWig.open(inputBamFileName)
      vec = []
      for region in regionList:
        try: bamSignal = fetchSignalBw(bamFile, region) / correctFactor
        except Exception: bamSignal = 0
        vec.append(bamSignal)
    else: print("The tool supports only BAM or BIGWIG files.")
    matrix.append(vec)
    bamFile.close()
  outputFile = open(output_file_name,"w")
  outputFile.write("\t".join(bam_names)+"\n")
  for j in range(0,len(matrix[0])):
    vec = []
    for i in range(0,len(matrix)):
      try: vec.append(str(matrix[i][j]))
      except Exception: vec.append("NA")
    outputFile.write("\t".join(vec)+"\n")
  outputFile.close()
Example #7
    def removeEdgeMismatches(self, bamFile, minDistance, minBaseQual):
        startTime = Helper.getTime()
        minDistance = int(minDistance)
        counter = 0
        j = 0
        num_lines = len(self.variantDict)
        Helper.info(
            " [%s] remove Missmatches from the first %s bp from read edges" %
            (startTime.strftime("%c"), str(minDistance)), self.logFile,
            self.textField)

        bamFile = Samfile(bamFile, "rb")

        for varKey in self.variantDict.keys():
            variant = self.variantDict[varKey]

            counter += 1
            if counter % 10000 == 0:
                Helper.status('%s mm parsed ' % counter, self.logFile,
                              self.textField, "grey")

            keepSNP = False
            varPos = variant.position - 1
            iter = bamFile.pileup(variant.chromosome, variant.position - 1,
                                  variant.position)
            #walks up the region which overlaps this position
            for x in iter:
                if x.pos == varPos:
                    for pileupread in x.pileups:  #walk through the single reads
                        if not pileupread.is_del and not pileupread.is_refskip:
                            distance = abs(
                                pileupread.alignment.alen -
                                pileupread.query_position
                            ) if pileupread.alignment.is_reverse else pileupread.query_position
                            if distance >= minDistance:
                                #check readBase and Base Quality
                                if pileupread.alignment.query_sequence[
                                        pileupread.
                                        query_position] == variant.alt and pileupread.alignment.query_qualities[
                                            pileupread.
                                            query_position] >= minBaseQual:
                                    #if pileupread.alignment.query_sequence[pileupread.query_position] == variant.alt:
                                    keepSNP = True

            if keepSNP == False:
                j += 1
                del self.variantDict[varKey]

        Helper.status('%s of %s variants were deleted' % (j, num_lines),
                      self.logFile, self.textField, "black")
        Helper.printTimeDiff(startTime, self.logFile, self.textField)
        bamFile.close()
Example #8
def main(args=None):
    if args is None:
        args = sys.argv[1:]

    f = Samfile(args[0])
    header = f.header
    f.close()

    reflen = header['SQ'][0]['LN']

    BamIO.write(clip(BamIO.parse(args[0]), reflen), args[1], header=header)

    return 0
Example #9
def paired_end_sam_parsing(sam_list, cov, identity_threshold):
    match = {}
    to_process = []
    if sam_list[0] is None:
        print "The ene-to-end mapping of SE data produced an error."
    else:
        to_process.append(sam_list[0])
    if sam_list[1] is None:
        print "The local mapping mode of SE data  produced an error."
    else:
        to_process.append(sam_list[1])
    for paired_sam in to_process:
        r1_match = {}
        r2_match = {}
        sam = Samfile(paired_sam)
        for align in sam:
            if align.tid != -1:
                query_name, query_len, ref_name = align.qname, float(
                    align.rlen), sam.getrname(align.tid)
                if align.cigar is not None:
                    align_len, query_aligned_len = cigar_parsing(align.cigar)
                    # print query_name, align_len, query_aligned_len
                    nm = -1
                    if (query_aligned_len / query_len) * 100 >= cov:
                        for coppia in align.tags:
                            if coppia[0] == "NM":
                                nm = float(coppia[1])
                    if align_len != 0 and nm >= 0:
                        paired_perc_id = ((align_len - nm) / align_len) * 100
                        if paired_perc_id >= 90:
                            if align.is_read1:
                                r1_match.setdefault(query_name, {})
                                r1_match[query_name].setdefault(ref_name, [])
                                r1_match[query_name][ref_name].append(
                                    paired_perc_id)
                            if align.is_read2:
                                r2_match.setdefault(query_name, {})
                                r2_match[query_name].setdefault(ref_name, [])
                                r2_match[query_name][ref_name].append(
                                    paired_perc_id)
        sam.close()
        for query in set(r1_match.keys()).intersection(set(r2_match.keys())):
            for ref in set(r1_match[query].keys()).intersection(
                    r2_match[query].keys()):
                average_perc_id = calcola_media(
                    [max(r1_match[query][ref]),
                     max(r2_match[query][ref])])
                if average_perc_id >= identity_threshold:
                    match.setdefault(query, set())
                    match[query].add(ref)
    return match
Example #10
def subsample(fn, ns=None):
    if ns is None:
        fn, ns = fn
    sample = []
    count = 0
    outdir_base = path.join(path.dirname(fn), 'subset')
    sf = Samfile(fn)
    try:
        i_weight = float(sf.mapped) / max(ns)
        print "Read out ", i_weight
    except ValueError:
        i_weight = 0.0
        for read in sf:
            i_weight += 1
        print "Counted ", i_weight
        i_weight /= float(max(ns))
        sf = Samfile(fn)

    print fn, count, i_weight
    for i, read in enumerate(sf):
        key = random()**i_weight
        if len(sample) < max(ns):
            heappush(sample, (key, read, i + count))
        else:
            heappushpop(sample, (key, read, i + count))

    count += i

    for n in ns:
        if n == min(ns):
            outdir = outdir_base + '_min'
        else:
            outdir = outdir_base + '{:04.1f}M'.format(n / 1e6)
        try:
            makedirs(outdir)
        except OSError:
            pass
        sampN = sorted(sample, reverse=True)[:int(n)]
        print "Kept {: >12,} of {: >12,} reads".format(len(sampN), count)
        print fn, '->', outdir
        stdout.flush()
        of = Samfile(path.join(outdir, 'accepted_hits.bam'),
                     mode='wb',
                     template=sf)
        sample.sort(key=lambda (key, read, pos): (read.tid, read.pos))
        for key, read, pos in sampN:
            of.write(read)
        of.close()
    sf.close()
    return [count for key, read, count in sample]
Example #11
def expression_dict_from_bam(alias_dict, gene_dict, exp_file_name):

    # Fetching expression
    exp_dict = dict()
    exp_file = Samfile(exp_file_name, "rb")
    for k in gene_dict.keys():
        geneVec = gene_dict[k]
        gene = geneVec[3]
        region = [geneVec[0], int(geneVec[1]), int(geneVec[2])]
        exp = fetch_counts(exp_file, region)
        exp_dict[gene] = float(exp) / (region[2] - region[1])
    exp_file.close()

    # Returning objects
    return exp_dict
Example #12
def subsample(fn, ns=None):
    if ns is None:
        fn, ns = fn
    sample = []
    count = 0
    outdir_base = path.join(path.dirname(fn), "subset")
    sf = Samfile(fn)
    try:
        i_weight = float(sf.mapped) / max(ns)
        print "Read out ", i_weight
    except ValueError:
        i_weight = 0.0
        for read in sf:
            i_weight += 1
        print "Counted ", i_weight
        i_weight /= float(max(ns))
        sf = Samfile(fn)

    print fn, count, i_weight
    for i, read in enumerate(sf):
        key = random() ** i_weight
        if len(sample) < max(ns):
            heappush(sample, (key, read, i + count))
        else:
            heappushpop(sample, (key, read, i + count))

    count += i

    for n in ns:
        if n == min(ns):
            outdir = outdir_base + "_min"
        else:
            outdir = outdir_base + "{:04.1f}M".format(n / 1e6)
        try:
            makedirs(outdir)
        except OSError:
            pass
        sampN = sorted(sample, reverse=True)[: int(n)]
        print "Kept {: >12,} of {: >12,} reads".format(len(sampN), count)
        print fn, "->", outdir
        stdout.flush()
        of = Samfile(path.join(outdir, "accepted_hits.bam"), mode="wb", template=sf)
        sample.sort(key=lambda (key, read, pos): (read.tid, read.pos))
        for key, read, pos in sampN:
            of.write(read)
        of.close()
    sf.close()
    return [count for key, read, count in sample]
Example #13
def create_hic_file(chrom_sizes_file_name, ctcf_peaks_file_name,
                    ctcf_motifs_file_name, loops_file_name,
                    loops_hiccups_output_file_name):

    # Parameters
    outLoc = "/".join(loops_hiccups_output_file_name.split("/")[:-1]) + "/"
    command = "mkdir -p " + outLoc
    os.system(command)

    # Chrom sizes
    chrom_list, chrom_dict = read_chromosome_sizes(chrom_sizes_file_name)

    # Reading loop list
    loop_list = read_loop_list(chrom_list, loops_file_name)

    # Hiccups Header
    hic_header = [
        "chr1", "x1", "x2", "chr2", "y1", "y2", "color", "o", "e_bl",
        "e_donut", "e_h", "e_v", "fdr_bl", "fdr_donut", "fdr_h", "fdr_v",
        "num_collapsed", "centroid1", "centroid2", "radius", "motif_x1",
        "motif_x2", "sequence_1", "orientation_1", "uniqueness_1", "motif_y1",
        "motif_y2", "sequence_2", "orientation_2", "uniqueness_2"
    ]

    # Opening CTCF files
    if (os.path.isfile(ctcf_peaks_file_name)
            and os.path.isfile(ctcf_motifs_file_name)):
        ctcf_peaks_file = Samfile(ctcf_peaks_file_name, "rb")
        ctcf_motifs_file = Samfile(ctcf_motifs_file_name, "rb")
    else:
        ctcf_peaks_file = None
        ctcf_motifs_file = None

    # Writing hiccups file
    write_hiccups_file(hic_header, loop_list, ctcf_peaks_file,
                       ctcf_motifs_file, loops_hiccups_output_file_name)

    # Closing bam files
    if (os.path.isfile(ctcf_peaks_file_name)
            and os.path.isfile(ctcf_motifs_file_name)):
        ctcf_peaks_file.close()
        ctcf_motifs_file.close()
Example #14
def get_sorted_aligned_reads(args, header, sequence):
    if args.ref_idx and os.path.exists(args.ref_idx):
        print("Loading index...")
        ref_index = load_hash(args.ref_idx)
    else:
        print("Computing reference index...")
        ref_index = build_hashtable(sequence, args.kmer, args.stride)
    print("Verifying hash...")
    for hash_, offset_ in islice(ref_index[0].iteritems(), 20):
        if not verify_hash(sequence, offset_, args.kmer, hash_):
            raise ValueError(
                'Index failed to verify: offset {} has mismatching hashes'.
                format(offset_))
    print("Aligning reads...")
    pair_iterator = read_paired_fasta(args.reads_file)
    sam_iterator = align_pairs(sequence, ref_index, pair_iterator, 'hw1_rg')
    print('Sorting SAMRecords in memory...')
    sam_iterator = iter(sorted(sam_iterator, cmp=compsam))
    if args.out_bam:
        header = {
            'HD': {
                'VN': '1.0'
            },
            'SQ': [{
                'SN': header[1:],
                'LN': len(sequence)
            }],
            'RG': [{
                'ID': 'hw1_rg',
                'SM': SAMPLE_NAME,
                'PU': 'Unknown',
                'PL': 'Unknown',
                'LB': 'Unknown'
            }]
        }
        outfile = Samfile(args.out_bam, 'wb', header=header)
        for read in sam_iterator:
            outfile.write(read)
        outfile.close()
        infile = Samfile(args.out_bam, 'rb')
        sam_iterator = infile
    return sam_iterator
Example #15
def create_table(downstream_extension, upstream_extension, number_of_bins, number_of_counts, alias_file_name, gene_list_file_name, regions_file_name, signal_file_type, signal_file_name, output_gene_file_name):

  # Fetch alias dictionary
  aliasDict = create_alias_dictionary(alias_file_name)

  # Fetch gene list dict
  if(gene_list_file_name == "."): gene_list_file_name = None
  geneListDict = create_gene_list_dictionary(aliasDict, gene_list_file_name)

  # Fetch genes and enhancers
  geneDict = genes_dictionary(aliasDict, geneListDict, regions_file_name)

  # Opening signal file
  if(signal_file_type == "bam"): 
    signal_file = Samfile(signal_file_name, "rb")
  elif(signal_file_type == "bw"):
    signal_file = pyBigWig.open(signal_file_name)

  # Writing meta signals
  write_meta_signals(downstream_extension, upstream_extension, number_of_bins, number_of_counts, geneDict, signal_file_type, signal_file, output_gene_file_name)
  signal_file.close()
Example #16
 def removeEdgeMismatches(self,bamFile,minDistance, minBaseQual):
     startTime=Helper.getTime()
     minDistance=int(minDistance)
     counter=0;j=0  
     num_lines = len(self.variantDict)
     Helper.info(" [%s] remove Missmatches from the first %s bp from read edges" % (startTime.strftime("%c"),str(minDistance)),self.logFile,self.textField)
     
     bamFile = Samfile(bamFile, "rb")
     
     for varKey in self.variantDict.keys():
         variant = self.variantDict[varKey]
         
         counter+=1
         if counter%10000==0:
             Helper.status('%s mm parsed ' % counter ,self.logFile, self.textField,"grey")
         
         keepSNP=False
         varPos=variant.position-1
         iter = bamFile.pileup(variant.chromosome, variant.position-1, variant.position)
         #walks up the region which overlaps this position
         for x in iter:
             if x.pos == varPos:
                 for pileupread in x.pileups: #walk through the single reads
                     if not pileupread.is_del and not pileupread.is_refskip:
                         distance=abs(pileupread.alignment.alen-pileupread.query_position) if pileupread.alignment.is_reverse else pileupread.query_position
                         if distance >= minDistance:
                             #check readBase and Base Quality
                             if pileupread.alignment.query_sequence[pileupread.query_position] == variant.alt and pileupread.alignment.query_qualities[pileupread.query_position]>=minBaseQual:
                             #if pileupread.alignment.query_sequence[pileupread.query_position] == variant.alt:
                                 keepSNP=True
                                 
         if keepSNP==False:
             j+=1
             del self.variantDict[varKey]
     
     Helper.status('%s of %s variants were deleted' % (j,num_lines), self.logFile, self.textField,"black")
     Helper.printTimeDiff(startTime, self.logFile, self.textField)
     bamFile.close()
Example #17
def get_promoters(promoter_ext, alias_dict, genomic_regions_file_name, stag_regions_file_name):

  # Initialization
  regionFile = open(genomic_regions_file_name, "rU")
  stagRegionsFile = Samfile(stag_regions_file_name, "rb")
  allList = []
  stagDict = dict()

  # Iterating on region file
  for line in regionFile:

    # Fetching data
    ll = line.strip().split("\t")
    nn = ll[3].split(":")
    chrom = ll[0]; start = ll[1]; end = ll[2]; strand = ll[5]; region = nn[0]; name = nn[1]; activity = nn[2]
    if(region != "PROMOTER" or activity == "INACTIVE"): continue

    # Gene name
    try: gene = alias_dict[name.upper()]
    except Exception: gene = name.upper()
    promoterWriteVec = [chrom, start, end, gene, "0", strand]

    # Appending gene to allList
    allList.append(promoterWriteVec)

    # Check whether promoter intersect both stag regions
    promoterRegion = [chrom, max(int(start)-promoter_ext, 0), int(end)+promoter_ext]
    check = check_bam_at_least_one_read(stagRegionsFile, promoterRegion)
    if(not check): continue
    stagDict[gene] = promoterWriteVec

  # Termination
  regionFile.close()
  stagRegionsFile.close()

  # Returning objects
  return allList, stagDict
Example #18
def merge_interregional_bams(input_paths, output_path, regions_dir):
    """Merges regional sorted BAMs
  Should be used for regional BAMs that are sorted and have defined intervals.
  Written to work with the XGAP pipeline only, using the interval files found
  in regions_dir.
  Args:
    input_paths: List of regional BAMs, in proper order
    output_path: Path to output merged BAM
    regions_dir: Path to directory with interval files
  """
    n_regions = len(input_paths)
    sample = Samfile(input_paths[0], 'rb')
    sample_header = dict(sample.header)
    sample.close()
    seq_dict = {}
    for i, seq in enumerate(sample_header["SQ"]):
        seq_dict[seq["SN"]] = i
    #regions_dir = "{}/regions/{}/".format(config['output-dir'], n_regions)
    with Samfile(output_path, 'wb', header=sample_header) as out_file:
        for index, input_path in enumerate(input_paths):
            # Object Info: region['ref_id'] = [lower_bound, upper_bound]
            region = _load_region(index, regions_dir, n_regions, seq_dict)
            started = False
            with Samfile(input_path, 'rb') as in_file:
                for alignment in in_file:
                    pos = alignment.reference_start
                    ref_id = alignment.reference_id
                    if region[ref_id][0] <= pos <= region[ref_id][1]:
                        if not started:
                            started = True
                        out_file.write(alignment)
                    else:
                        # Reached end of interval, move to next file
                        if started:
                            break
Example #19
def split_samfile(sam_file, splits, prefix='', path=''):
    """Take a sam file and split it splits number of times.

    :path:    Where to put the split files.
    :prefix:  A prefix for the outfile names.
    :returns: A tuple of job files.
    """
    # Determine how many reads will be in each split sam file.
    num_lines = count_reads(sam_file)
    num_reads = int(int(num_lines)/splits) + 1

    # Get rid of starting path
    sam_name = os.path.basename(sam_file)

    # Subset the SAM file into X number of jobs
    cnt      = 0
    currjob  = 1
    suffix   = '.split_sam_' + str(currjob).zfill(4)
    run_file = os.path.join(path, prefix + sam_name + suffix)
    rmode    = 'rb' if sam_name.split('.')[-1] == 'bam' else 'r'
    wmode    = 'wb'

    # Actually split the file
    outfiles = [run_file]
    with Samfile(sam_file, rmode) as in_sam:
        sam_split = Samfile(run_file, wmode, template=in_sam)
        for line in in_sam:
            cnt += 1
            if cnt < num_reads:
                sam_split.write(line)
            elif cnt == num_reads:
                # Check if next line is mate-pair. If so, don't split.
                line2     = next(in_sam)
                currjob  += 1
                suffix    = '.split_sam_' + str(currjob).zfill(4)
                run_file  = os.path.join(path, prefix + sam_name + suffix)
                new_sam   = Samfile(run_file, wmode, template=in_sam)
                outfiles.append(run_file)

                if line.qname == line2.qname:
                    sam_split.write(line)
                    sam_split.write(line2)
                    sam_split.close()
                    cnt = 0
                else:
                    sam_split.write(line)
                    sam_split.close()
                    new_sam.write(line2)
                    cnt = 0
                sam_split = new_sam
        sam_split.close()
    return tuple(outfiles)
Example #20
def parse_barcode(bamfile):
	"""parses a sorted and index bam file, removes all cases where rna hits more than one spot in genome
	and writes to a file, create file for mutant and wildtype based on barcodes"""
	samfile = Samfile(bamfile, "rb")
	multi_hit_file = Samfile("MultiHit.bam","wb",template=samfile)
	mutant = Samfile("Mutant.bam","wb",template=samfile)
	wildtype = Samfile("Wildtype.bam","wb",template=samfile)
	for line in samfile.fetch():
		#if line.is_secondary:
		## does this hit to more than one spot in genome
		#	multi_hit_file.write(line)
		if "#GAGT"in line.qname or "#TTAG" in line.qname: 
		## write to mutant file
			mutant.write(line)
		elif "#ACCC" in line.qname or "#CGTA" in line.qname:
		### write to wildtype file
			wildtype.write(line)

	multi_hit_file.close()
	mutant.close()
	wildtype.close()
	samfile.close()
Example #21
  value = float(ll[0])
  if(value < 0): value -= pseudocount
  if(value > 0): value += pseudocount
  eigenVec = [key, value]
  if(value > 0):
    posReadCount += float(fetchTotalReadsBam(dnaseFile, [chromosome, p1, p2]))
    posCount += 1
    if(value > maxPos): maxPos = value
  elif(value < 0):
    negReadCount += float(fetchTotalReadsBam(dnaseFile, [chromosome, p1, p2]))
    negCount += 1
    if(value < minNeg): minNeg = value
  position += resolution
  eigenList.append(eigenVec)
eigenFile.close()
dnaseFile.close()
posReadCount = posReadCount/posCount
negReadCount = negReadCount/negCount
maxPos = round(maxPos * 100,4)
minNeg = round(minNeg * 100,4)

# Checking if signal change is needed
if(negReadCount > posReadCount):
  for i in range(0,len(eigenList)):
    if(eigenList[i][1] == 0):
      eigenList[i][1] = -maxPos
      continue
    eigenList[i][1] = round(-eigenList[i][1] * 100,4)
else:
  for i in range(0,len(eigenList)):
    if(eigenList[i][1] == 0):
Example #22
def _sort_alignments_by_region(bwa_cmd,
                               regions,
                               output_dir,
                               basename,
                               log_output=stdout):
    """Sort bwa alignments into specified regions
  Sorts alignments from bwa command into separate files for each specified
  region. Sorts unmapped and qc fail reads into separate files. Paired reads
  that fall into different regions are added to chrI file.
  Args:
    bwa_cmd: String for full bwa command (Ex. "bwa mem -M -t 1 ref.fa test.fq")
    regions: A dict storing region information.
      regions['seq'] = [list of regions that encompass a part of 'seq']
    output_dir: Directory storing region subdirectories that store alignments
    basename: File prefix for output BAM files.
    log_output: Handle for log output
  Returns:
    output_paths: List of all output files.
  """
    regional_reads, header, seq_dict = {}, {}, {}
    curr_reads = []
    read = None
    unmapped_path = "{}/unmapped/{}.bam".format(output_dir, basename)
    qcfail_path = "{}/qcfail/{}.bam".format(output_dir, basename)
    log_output.write("executing BWA\n")
    start = time()
    bwa_process = Popen(bwa_cmd, stdout=PIPE, stderr=log_output, shell=True)

    for line in bwa_process.stdout:
        line = line.decode("utf-8")
        if line[0] == '@':
            _add_line_to_header(line, header)
            continue
        if not seq_dict:
            for index, seq in enumerate(header["SQ"]):
                header["SQ"][index]["LN"] = int(header["SQ"][index]["LN"])
                seq_dict[seq["SN"]] = index
                seq_dict[index] = seq["SN"]
            unmapped_file = Samfile(unmapped_path, 'wb', header=header)
            qcfail_file = Samfile(qcfail_path, 'wb', header=header)
        read = _string_to_aligned_segment(line, seq_dict, log_output)
        #conditionals to group bwa output by read names and sort one group at a time
        if not curr_reads:
            curr_reads.append(read)
            continue
        if curr_reads[0].query_name == read.query_name:
            curr_reads.append(read)
            continue
        else:
            _read_sorter(curr_reads, unmapped_file, qcfail_file, regions,
                         regional_reads, seq_dict, log_output)
            curr_reads.clear()
            curr_reads.append(read)
    if curr_reads:
        _read_sorter(curr_reads, unmapped_file, qcfail_file, regions,
                     regional_reads, seq_dict, log_output)
        curr_reads.clear()

    _add_line_to_header("@HD\tSO:coordinate", header)
    output_paths = _sort_regional_reads(regional_reads, regions, output_dir,
                                        basename, header, log_output)
    end = time()
    log_output.write("Sorted {} regions in {} "
                     "seconds\nDone\n".format(len(output_paths),
                                              (end - start)))
    log_output.flush()
    fsync(log_output.fileno())
    output_paths.append(unmapped_path)
    output_paths.append(qcfail_path)
    unmapped_file.close()
    qcfail_file.close()
    return output_paths
Example #23
def _sort_regional_reads(regional_reads, regions, output_dir, basename, header,
                         log_output):
    """Sorts reads in dict struct by coordinate and writes to BAM files
  Uses counting sort with the intuition that most positions in a range will
  map to a non-empty bucket, where a bucket holds reads mapped to a certain
  position.
  Args:
    regional_reads: Complicated dict struct
      ex. regional_reads[region][chromosome][position] returns list of reads
          at this position in this chromosome in this region
    regions: Regions dict used to sort reads during alignment. Produced by
      _load_regions()
    output_dir: self-explanatory
    basename: Prefix for output BAM files
    header: Dict representation of SAM header (described in pysam docs)
    log_output: File object for log.
  Returns:
    output_paths: List of output file paths
  """
    output_paths = []
    region_intervals = {}
    sequence_order = [chrom["SN"] for chrom in header["SQ"]]
    region_names = set(["chri"])
    # Get region intervals
    for seq_id in sequence_order:
        for interval in regions[seq_id]:
            region_names.add(interval.name)
            if interval.name in region_intervals:
                region_intervals[interval.name].append(
                    [seq_id, interval.lower_bound, interval.upper_bound])
            else:
                region_intervals[interval.name] = [[
                    seq_id, interval.lower_bound, interval.upper_bound
                ]]
    #for region in regional_reads.keys():
    for region in region_names:
        output_path = "{}/{}/{}.bam".format(output_dir, region, basename)
        output_file = Samfile(output_path, 'wb', header=header)
        if region == "chri":
            for chromosome in sequence_order:
                if chromosome in regional_reads[region]:
                    for position in sorted(
                            regional_reads[region][chromosome].keys()):
                        for read in regional_reads[region][chromosome].pop(
                                position):
                            output_file.write(read)
        elif region in regional_reads:
            for entries in region_intervals[region]:
                chromosome = entries[0]
                low, high = entries[1], entries[2]
                if chromosome in regional_reads[region]:
                    for position in range(low, high + 1):
                        if position in regional_reads[region][chromosome]:
                            for read in regional_reads[region][chromosome].pop(
                                    position):
                                output_file.write(read)
                else:
                    log_output.write("No reads mapping to "
                                     "{} {}\n".format(region, chromosome))
                    log_output.flush()
                    fsync(log_output.fileno())
        output_paths.append(output_path)
        output_file.close()
    return output_paths
Example #24
def estimate_bias_kmer(args):
    # Parameters
    maxDuplicates = 100
    pseudocount = 1.0

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # Initializing dictionaries
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()

    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:

        # Initialization
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies ####################################
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):

            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                ct_reads_f += 1
                try:
                    obsDictF[currStr] += 1
                except Exception:
                    obsDictF[currStr] = 1
            else:
                ct_reads_r += 1
                try:
                    obsDictR[currStr] += 1
                except Exception:
                    obsDictR[currStr] = 1

        # Evaluating expected frequencies ####################################
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)

        # Iterating on each sequence position
        for i in range(0, len(currStr) - args.k_nb):
            ct_kmers += 1
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            try:
                expDictF[s] += 1
            except Exception:
                expDictF[s] = 1

            # Counting k-mer in dictionary for reverse complement
            s = currRevComp[i:i + args.k_nb]
            try:
                expDictR[s] += 1
            except Exception:
                expDictR[s] = 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
        try:
            obsF = obsDictF[kmer] + pseudocount
        except Exception:
            obsF = pseudocount
        try:
            expF = expDictF[kmer] + pseudocount
        except Exception:
            expF = pseudocount
        if ct_reads_f == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
        try:
            obsR = obsDictR[kmer] + pseudocount
        except Exception:
            obsR = pseudocount
        try:
            expR = expDictR[kmer] + pseudocount
        except Exception:
            expR = pseudocount
        if ct_reads_r == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
Example #25
def main():
	parser = OptionParser(usage=usage)
	#parser.add_option("-s", action="store_true", dest="sam_input", default=False,
					  #help="Input is in SAM format instead of BAM format")
	(options, args) = parser.parse_args()
	if len(args) != 4:
		parser.print_help()
		sys.exit(1)
	psl_filename = args[0]
	ref_filename = args[1]
	contigs_filename = args[2]
	bam_filename = args[3]
	liftover_dir = args[1]
	
	references, ref_chromosomes = read_fasta(ref_filename)
	refname_to_id = dict([(name,i) for i,name in enumerate(ref_chromosomes)])
	print('Read', len(ref_chromosomes), 'reference chromosomes:', ','.join(ref_chromosomes), file=sys.stderr)
	contigs, contig_names = read_fasta(contigs_filename)
	print('Read', len(contig_names), 'contigs.', file=sys.stderr)
	bam_header = {'HD': {'VN': '1.0'}, 'SQ': [dict([('LN', len(references[chromosome])), ('SN', chromosome)]) for chromosome in ref_chromosomes] }
	outfile = Samfile(bam_filename, 'wb', header=bam_header)

	line_nr = 0
	header_read = False
	for line in (s.strip() for s in open(psl_filename)):
		line_nr += 1
		if line.startswith('------'): 
			header_read = True
			continue
		if not header_read: continue
		fields = line.split()
		assert len(fields) == 21, 'Error reading PSL file, offending line: %d'%line_nr
		sizes = [int(x) for x in fields[18].strip(',').split(',')]
		contig_starts = [int(x) for x in fields[19].strip(',').split(',')]
		ref_starts = [int(x) for x in fields[20].strip(',').split(',')]
		assert 0 < len(sizes) == len(contig_starts) == len(ref_starts)
		strand = fields[8]
		contig_name = fields[9]
		ref_name = fields[13]
		assert strand in ['-','+']
		assert contig_name in contigs
		assert ref_name in references
		a = AlignedRead()
		a.qname = contig_name
		if strand == '+':
			a.seq = str(contigs[contig_name])
		else:
			a.seq = str(contigs[contig_name].reverse_complement())
		a.flag = (16 if strand == '+' else 0)
		a.rname = refname_to_id[ref_name]
		a.pos = ref_starts[0]
		a.mapq = 255
		qpos = contig_starts[0]
		refpos = ref_starts[0]
		cigar = []
		# soft-clipping at the start?
		if contig_starts[0] > 0:
			cigar.append((4,contig_starts[0]))
		longest_insertion = 0
		longest_deletion = 0
		total_matches = 0
		total_insertion = 0
		total_deletion = 0
		for length, contig_start, ref_start in zip(sizes, contig_starts, ref_starts):
			assert contig_start >= qpos
			assert ref_start >= refpos
			# insertion?
			if contig_start > qpos:
				insertion_length = contig_start - qpos
				longest_insertion = max(longest_insertion, insertion_length)
				total_insertion += insertion_length
				append_to_cigar(cigar, 1, insertion_length)
				qpos = contig_start
			# deletion?
			if ref_start > refpos:
				deletion_length = ref_start - refpos
				longest_deletion = max(longest_deletion, deletion_length)
				total_deletion += deletion_length
				append_to_cigar(cigar, 2, deletion_length)
				refpos = ref_start
			# strech of matches/mismatches
			append_to_cigar(cigar, 0, length)
			refpos += length
			qpos += length
			total_matches += length
		# soft-clipping at the end?
		if len(a.seq) > qpos:
			cigar.append((4,len(a.seq) - qpos))
		a.cigar = tuple(cigar)
		# only use contigs where longest deletion is <= 10000 bp
		if longest_deletion > 10000: continue
		# require at least 200 matching positions
		if total_matches < 200: continue
		# require the matching positions to make up at least 75 percent of the contig
		# (without counting parts of the contig that are insertions).
		if float(total_matches) / (len(a.seq) - total_insertion) < 0.75: continue
		outfile.write(a)
	outfile.close()
Example #26
import os
import sys

from pysam import Samfile


def filter_reads(sam, mapq, osam):
    for aread in sam:
        if aread.is_unmapped:
            continue

        if aread.mapq >= mapq:
            osam.write(aread)


if __name__ == '__main__':
    mapq = int(sys.argv[1])
    for fn in sys.argv[2:]:
        isam = Samfile(fn)
        if fn.endswith('bam'):
            ofn = fn.replace('bam', 'map%s.bam' % mapq)
        elif fn.endswith('sam'):
            ofn = fn.replace('sam', 'map%s.bam' % mapq)
        else:
            ofn = fn + ".mapq%s.bam" % mapq

        if os.path.exists(ofn):
            print("Error:", ofn, "already exists!")
            continue

        osam = Samfile(ofn, 'wb', template=isam)
        filter_reads(isam, mapq, osam)
        osam.close()
Example #27
    """
    ret = []
    for i in re.findall("\d+|\^?[ATCGN]+", md):
        if i.startswith('^'):
            ret.extend(list(i[1:]))
        elif i[0] in ["A", "T", "C", "G", "N"]:
            ret.extend(list(i))
        else:
            ret.extend(['-'] * int(i))

    return ret


if __name__ == '__main__':
    f = Samfile(sys.argv[1])
    out = Samfile(sys.argv[1][:-4] + "_realign.bam", 'wb', template=f)
    count = 0.0
    n = 0.05
    for read in f:
        q, t = expandAlign(read)
        query, target = realign(read)
        replace(read, query, target)
        out.write(read)
        count += 1
        if (count / f.mapped) > n:
            n += 0.05
            print "[%s] -- parsed %d of %d reads (%.2f)" % (
                time.asctime(), int(count), f.mapped, count / f.mapped)

    out.close()
Example #28
    """
    Turns abbreviated MD into a full array
    """
    ret = []
    for i in re.findall("\d+|\^?[ATCGN]+", md):
        if i.startswith('^'):
            ret.extend(list(i[1:]))
        elif i[0] in ["A","T","C","G","N"]:
            ret.extend(list(i))
        else:
            ret.extend(['-']*int(i))

    return ret

if __name__ == '__main__':
    f = Samfile(sys.argv[1])
    out = Samfile(sys.argv[1][:-4]+"_realign.bam",'wb', template=f)
    count = 0.0
    n = 0.05
    for read in f: 
        q,t = expandAlign(read)
        query, target = realign(read)
        replace(read, query, target)
        out.write(read)
        count += 1
        if (count / f.mapped) > n:
            n += 0.05
            print "[%s] -- parsed %d of %d reads (%.2f)" % (time.asctime(), int(count), f.mapped, count/f.mapped )
        
    out.close()
Example #29
    outputFile.write("fixedStep chrom="+chrName+" start="+str(p1+1)+" step=1\n")
    fSum = sum(af[:window]); rSum = sum(ar[:window]);
    fLast = af[0]; rLast = ar[0]
    for i in range((window/2),len(af)-(window/2)):
      nhatf = Nf[i-(window/2)]*(af[i]/fSum)
      nhatr = Nr[i-(window/2)]*(ar[i]/rSum)
      zf = log(nf[i]+1)-log(nhatf+1)
      zr = log(nr[i]+1)-log(nhatr+1)
      outputFile.write(str(round(zf+zr,4))+"\n")
      #print i+p1+1-(window/2), af[i], ar[i], fSum, rSum, Nf[i-(window/2)], Nr[i-(window/2)]
      fSum -= fLast; fSum += af[i+(window/2)]; fLast = af[i-(window/2)+1]
      rSum -= rLast; rSum += ar[i+(window/2)]; rLast = ar[i-(window/2)+1]

    #for i in range(p1, p2):
    #  print i+1, z[i-p1]

  except Exception: continue

# Closing files
bamFile.close()
fastaFile.close()
coordFile.close()
outputFile.close()

# Converting to bigwig
os.system(" ".join(["wigToBigWig",outputFileName,csFileName,outputFileName[:-3]+"bw"]))
os.system(" ".join(["wigToBigWig",outputFileNameRaw,csFileName,outputFileNameRaw[:-3]+"bw"]))
#os.system(" ".join(["rm",outputFileName]))


Example #30
        r21p = 0.27
        r22p = 0.27
        r23p = 0.27
    else:
        r11p = 0.6
        r12p = 0.6
        r13p = 0.6
        r21p = 0.6
        r22p = 0.6
        r23p = 0.6
    signal1 = [
        e * r11p for e in fetchSignalBam(signalFile, region11, bamExt)
    ] + [e * r12p for e in fetchSignalBam(signalFile, region12, bamExt)
         ] + [e * r13p for e in fetchSignalBam(signalFile, region13, bamExt)]
    signal2 = [
        e * r21p for e in fetchSignalBam(signalFile, region21, bamExt)
    ] + [e * r22p for e in fetchSignalBam(signalFile, region22, bamExt)
         ] + [e * r23p for e in fetchSignalBam(signalFile, region23, bamExt)]
    signal = signal1 + signal2

    # Updating vector
    vector = vector + signal

    # Writing vector
    outputFile.write("\t".join([str(e) for e in vector]) + "\n")

# Closing all files
ctcfFile.close()
outputFile.close()
signalFile.close()
Example #31
    def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, shift):
        """ 
        Estimates bias based on HS regions, DNase-seq signal and genomic sequences.

        Keyword arguments:
        regions -- DNase-seq HS regions.
        dnase_file_name -- DNase-seq file name.
        genome_file_name -- Genome to fetch genomic sequences from.
        
        Return:
        bias_table_F, bias_table_R -- Bias tables.
        """

        # Parameters
        maxDuplicates = 100
        pseudocount = 1.0

        # Initializing bam and fasta
        if(dnase_file_name.split(".")[-1].upper() != "BAM"): return None # TODO ERROR
        bamFile = Samfile(dnase_file_name, "rb")
        fastaFile = Fastafile(genome_file_name)

        # Initializing dictionaries
        obsDictF = dict(); obsDictR = dict()
        expDictF = dict(); expDictR = dict()

        ct_reads_r=0
        ct_reads_f=0
        ct_kmers=0

        # Iterating on HS regions
        for region in regions:

            # Initialization
            prevPos = -1
            trueCounter = 0

            # Evaluating observed frequencies ####################################

            # Fetching reads
            for r in bamFile.fetch(region.chrom, region.initial, region.final):

                # Calculating positions
                if(not r.is_reverse): p1 = r.pos - (k_nb/2) - 1 + shift
                else: p1 = r.aend - (k_nb/2) + 1 - shift
                p2 = p1 + k_nb

                # Verifying PCR artifacts
                if(p1 == prevPos): trueCounter += 1
                else:
                    prevPos = p1
                    trueCounter = 0
                if(trueCounter > maxDuplicates): continue

                # Fetching k-mer
                try: currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
                except Exception: continue
                if(r.is_reverse): currStr = AuxiliaryFunctions.revcomp(currStr)

                # Counting k-mer in dictionary
                if(not r.is_reverse):
                    ct_reads_f+=1
                    try: obsDictF[currStr] += 1
                    except Exception: obsDictF[currStr] = 1
                else:
                    ct_reads_r+=1
                    try: obsDictR[currStr] += 1
                    except Exception: obsDictR[currStr] = 1


            # Evaluating expected frequencies ####################################

            # Fetching whole sequence
            try: currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
            except Exception: continue
            currRevComp = AuxiliaryFunctions.revcomp(currStr)

            # Iterating on each sequence position
            for i in range(0,len(currStr)-k_nb):
                ct_kmers+=1
                # Counting k-mer in dictionary
                s = currStr[i:i+k_nb]
                try: expDictF[s] += 1
                except Exception: expDictF[s] = 1

                # Counting k-mer in dictionary for reverse complement
                s = currRevComp[i:i+k_nb]
                try: expDictR[s] += 1
                except Exception: expDictR[s] = 1

        # Closing files
        bamFile.close()
        fastaFile.close()

        # Creating bias dictionary
        alphabet = ["A","C","G","T"]
        kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
        bias_table_F = dict([(e,0.0) for e in kmerComb]) 
        bias_table_R = dict([(e,0.0) for e in kmerComb]) 
        for kmer in kmerComb:
            try: obsF = obsDictF[kmer] + pseudocount
            except Exception: obsF = pseudocount
            try: expF = expDictF[kmer] + pseudocount
            except Exception: expF = pseudocount
            bias_table_F[kmer] = round(float(obsF/ct_reads_f)/float(expF/ct_kmers),6)
            try: obsR = obsDictR[kmer] + pseudocount
            except Exception: obsR = pseudocount
            try: expR = expDictR[kmer] + pseudocount
            except Exception: expR = pseudocount
            bias_table_R[kmer] = round(float(obsR/ct_reads_r)/float(expR/ct_kmers),6)

        # Return
        return [bias_table_F, bias_table_R]
Example #32
                        align.rlen), sam.getrname(align.tid)
                    if align.cigar is not None:
                        align_len, query_aligned_len = cigar_parsing(
                            align.cigar)
                        nm = -1
                        if (query_aligned_len / query_len) * 100 >= coverage:
                            for coppia in align.tags:
                                if coppia[0] == "NM":
                                    nm = float(coppia[1])
                        if align_len != 0 and nm >= 0:
                            paired_perc_id = (
                                (align_len - nm) / align_len) * 100
                            if paired_perc_id >= identity_threshold:
                                match.setdefault(query_name, set())
                                match[query_name].add(ref_name)
            sam.close()
        else:
            print "no mapping data"
            sys.exit()

    if paired_sam is not None:
        if os.path.exists(paired_sam):
            r1_match = {}
            r2_match = {}
            sam = Samfile(paired_sam)
            for align in sam:
                if align.tid != -1:
                    query_name, query_len, ref_name = align.qname, float(
                        align.rlen), sam.getrname(align.tid)
                    if align.cigar is not None:
                        align_len, query_aligned_len = cigar_parsing(
Example #33
def fix_bigwig(chromosome, chromSizesFileName, chromSizesFileEnhName,
               mainBamFileName, toAddBamFileNameList, toRemoveBamFileNameList,
               outWigFileName):

    # Fixed parameters
    GENOME_WINDOW_SIZE = 1000000
    WINDOW_SIZE = 10
    TOTAL_BINS = GENOME_WINDOW_SIZE // WINDOW_SIZE

    # Get chrom sizes
    chrom_list, genome_sizes_dict = get_chrom_sizes(chromSizesFileName)

    # Open bam and wig files
    outWigFile = open(outWigFileName, "w")
    mainBamFile = Samfile(mainBamFileName, "rb")
    toAddBamFileList = [Samfile(e, "rb") for e in toAddBamFileNameList]
    toRemoveBamFileList = [Samfile(e, "rb") for e in toRemoveBamFileNameList]

    # Wig header
    wig_header = "fixedStep chrom=" + chromosome + " start=1 step=" + str(
        WINDOW_SIZE)
    outWigFile.write(wig_header + "\n")

    # Iterating on genomic regions for memory purposes
    for i in range(0, genome_sizes_dict[chromosome], GENOME_WINDOW_SIZE):

        # Region to fetch the signal
        region = [
            chromosome, i,
            min(i + GENOME_WINDOW_SIZE, genome_sizes_dict[chromosome])
        ]

        # Fetch signals
        vector_list_add = []
        vector_list_rm = []
        mainSignal = fetchSignalBam(TOTAL_BINS, region, mainBamFile)
        for j in range(0, len(toAddBamFileList)):
            vector_list_add.append(
                fetchSignalBam(TOTAL_BINS, region, toAddBamFileList[j]))
        for j in range(0, len(toRemoveBamFileList)):
            vector_list_rm.append(
                fetchSignalBam(TOTAL_BINS, region, toRemoveBamFileList[j]))

        # Writing signals
        for j in range(0, TOTAL_BINS):
            vMain = mainSignal[j]
            vToAdd = sum([0.2 * e[j] for e in vector_list_add])
            vToRemove = sum([0.3 * e[j] for e in vector_list_rm])
            outWigFile.write(
                str(max((vMain + vToAdd) - vToRemove, 0.0)) + "\n")

    # Termination
    mainBamFile.close()
    outWigFile.close()
    for e in toAddBamFileList:
        e.close()
    for e in toRemoveBamFileList:
        e.close()
    convert_to_bigwig(outWigFileName,
                      chromSizesFileEnhName,
                      ".".join(outWigFileName.split(".")[:-1] + ["bw"]),
                      remove_original=False)
Example #34
stagPeakFile = Samfile(stagPeakFileName, "rb")
outputActiveFile = open(outputActiveFileName, "w")
outputInactiveFile = open(outputInactiveFileName, "w")

# Iterating on region file
for line in regionFile:

  ll = line.strip().split("\t")
  
  chromosome = ll[0]; start = ll[1]; end = ll[2]; regionList = ll[3].split(":"); score = ll[4]; strand = ll[5]
  if(regionList[0] != "PROMOTER"): continue

  check = check_bam_at_least_one_read(stagPeakFile, [chromosome, int(start)-peakExt, int(end)+peakExt])
  if(not check): continue

  try: gene = aliasDict[regionList[1].upper()]
  except Exception: gene = regionList[1].upper()
  try: exp = expDict[gene]
  except Exception: continue

  if(regionList[2] == "ACTIVE"): outputActiveFile.write("\t".join([gene, exp])+"\n")
  elif(regionList[2] == "INACTIVE"): outputInactiveFile.write("\t".join([gene, exp])+"\n")

# Termination
regionFile.close()
stagPeakFile.close()
outputActiveFile.close()
outputInactiveFile.close()


Example #35
def subsample(fn, ns=None, paired=False):
    if ns is None:
        fn, ns = fn
    sample = []
    count = 0
    outdir_base = path.join(path.dirname(fn), 'subset')
    sf = Samfile(fn)
    try:
        i_weight = float(sf.mapped)/max(ns)
        print("Read out ", i_weight)
    except ValueError:
        i_weight = 0.0
        for read in sf:
            i_weight += 1
        print("Counted ", i_weight)
        i_weight /= float(max(ns))
        sf = Samfile(fn)

    if paired:
        read_2s = {}
    print(fn, count, i_weight)
    for i, read in enumerate(sf):
        key = random()**i_weight
        if not paired or read.is_read1:
            if len(sample) < max(ns):
                heappush(sample, (key, i+count, read))
            else:
                dropped = heappushpop(sample, (key, i+count, read))
                if paired:
                    read_2s.pop(dropped[-1].qname, None)
        elif paired:
            read_2s[read.qname] = read
        else:
            raise ValueError("I don't know how we got here")


    count += i

    for n in ns:
        outdir = outdir_base + '{:04.1f}M'.format(n/1e6)
        try:
            makedirs(outdir)
        except OSError:
            pass
        sampN = sorted(sample, reverse=True)[:int(n)]
        print("Kept {: >12,} of {: >12,} reads".format(len(sampN), count))
        print(fn, '->', outdir)
        stdout.flush()
        of = Samfile(path.join(outdir, 'accepted_hits.bam'),
                     mode='wb', template=sf)
        sampN.sort(key=lambda heap_item: (heap_item[-1].tid, heap_item[-1].pos))
        missing_mates = 0
        for key, pos, read in sampN:
            of.write(read)
            if paired and read.is_proper_pair:
                if read.qname not in read_2s:
                    missing_mates += 1
                    continue
                of.write(read_2s[read.qname])
        of.close()
    sf.close()
    print(missing_mates)
    return [count for key, read, count in sample]
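A hedged usage sketch for subsample; the input path and target depths are illustrative only. Each subset is written to a subset<depth>M/accepted_hits.bam directory created inside the input file's directory.

# Illustrative call; path and depths are placeholders.
kept = subsample("sample1/accepted_hits.bam",
                 ns=[1e6, 5e6, 10e6],    # target read counts, reservoir-sampled
                 paired=True)            # also keep mates of sampled first reads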
Example #36
            try:
                bbInter += ((treatInterCount / (len(ttadList) - 1)) /
                            (contrInterCount / (len(ctadList) - 1)))
            except Exception:
                pass

    abIntraDict["AA"].append(aaIntra)
    abIntraDict["AB"].append(abIntra)
    abIntraDict["BA"].append(baIntra)
    abIntraDict["BB"].append(bbIntra)
    abInterDict["AA"].append(aaInter)
    abInterDict["AB"].append(abInter)
    abInterDict["BA"].append(baInter)
    abInterDict["BB"].append(bbInter)

treatTadFile.close()
controlTadFile.close()

# Writing output
maxV = max([max(len(abIntraDict[e]), len(abInterDict[e])) for e in abList])
outputIntraFile = open(outputIntraFileName, "w")
outputInterFile = open(outputInterFileName, "w")
outputIntraFile.write("\t".join(abList) + "\n")
outputInterFile.write("\t".join(abList) + "\n")
for i in range(0, maxV):
    vecIntra = []
    vecInter = []
    for ab in abList:
        try:
            vecIntra.append(str(abIntraDict[ab][i]))
        except Exception:
            vecIntra.append("NA")
        try:
            vecInter.append(str(abInterDict[ab][i]))
        except Exception:
            vecInter.append("NA")
    outputIntraFile.write("\t".join(vecIntra) + "\n")
    outputInterFile.write("\t".join(vecInter) + "\n")
Example #37
def print_reads(reads_to_print, ref_name, header):
    output_name = "{0}_{1}.bam".format(args.output_base, ref_name)
    output_samfile = Samfile(output_name, "wb", header=header)
    for aln in reads_to_print:
        output_samfile.write(aln)
    output_samfile.close()
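print_reads relies on a module-level args.output_base to name its output files. A hedged driver sketch, assuming an indexed input BAM that is split into one output BAM per reference; the path is a placeholder.

from pysam import Samfile

in_bam = Samfile("input.bam", "rb")          # placeholder path; BAM must be indexed
header = in_bam.header
for ref_name in in_bam.references:
    reads = list(in_bam.fetch(ref_name))     # alignments mapped to this reference
    print_reads(reads, ref_name, header)     # writes <args.output_base>_<ref_name>.bam
in_bam.close()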
Example #38
def main():
    """
    what to do if we execute the module as a script
    (not intended for use by the user)
    """
    parser = ArgumentParser(description=__doc__)
    parser.add_argument('infile', default=stdin,
                        help='BAM/SAM input file (default: stdin)')
    parser.add_argument('--gzip', action='store_true', default=False, 
                        help='Compress paired-end output files')
    parser.add_argument('outfile1', help='Output file for first mate / single reads (default: stdout)')
    parser.add_argument('outfile2', help='(required for paired-end files) output filename for second mate')
    args = parser.parse_args()
    context = vars(args)
    outfile1 = context['outfile1']
    outfile2 = context['outfile2']
    f = Samfile(context['infile'])
    incomplete_pairs = []
    if context['gzip']:
        if PATH_TO_GZIP is not None:
            open_func = gzip_class_factory(PATH_TO_GZIP)
            fh1 = open_func(outfile1, 'w')
            fh2 = open_func(outfile2, 'w')
        else:
            fh1 = GzipFile(outfile1, 'wb')
            fh2 = GzipFile(outfile2, 'wb')
        is_paired = False
        gzwrite = gzwriter(fh1, fh2)
        for aread in f:
            is_paired = False
            qname = aread.qname
            for i in xrange(len(incomplete_pairs)):
                if incomplete_pairs[i].qname == qname:
                    mate_read = incomplete_pairs.pop(i)
                    # figure out order
                    if aread.flag & 0x4 == 0x4:
                        gzwrite(aread, mate_read)
                    else:
                        gzwrite(mate_read, aread)
                    is_paired = True
                    break
            if not is_paired: incomplete_pairs.append(aread)
        unpaired = len(incomplete_pairs)
        fh1.close()
        fh2.close()
        f.close()
    else:
        if not exists(outfile1): os.mknod(outfile1)
        if outfile2 is not None:
            if not exists(outfile2): os.mknod(outfile2)
        out1 = os.open(outfile1, os.O_WRONLY|os.O_NONBLOCK)
        out2 = os.open(outfile2, os.O_WRONLY|os.O_NONBLOCK)
        is_paired = False
        write = pair_writer(out1, out2)
        for aread in f:
            is_paired = False
            qname = aread.qname
            for i in xrange(len(incomplete_pairs)):
                if incomplete_pairs[i].qname == qname:
                    mate_read = incomplete_pairs.pop(i)
                    # figure out order
                    if aread.flag & 0x4 == 0x4:
                        write(aread, mate_read)
                    else:
                        write(mate_read, aread)
                    is_paired = True
                    break
            if not is_paired: incomplete_pairs.append(aread)
        unpaired = len(incomplete_pairs)
        os.close(out1)
        os.close(out2)
        f.close()
    if not unpaired == 0:
        raise RuntimeError('%d unpaired reads remaining' % unpaired) 
Example #39
    p2 = ll[2]
    name = ll[3]
    score = ll[4]
    strand = ll[5]
    if (chrom not in chrList): continue
    bestMotifList = getBestMotifList(ctcfBamFile, [chrom, int(p1), int(p2)])
    for motif in bestMotifList:
        allCtcfMinusPromFile.write("\t".join(
            [motif[0], motif[1], motif[2], name, motif[4], motif[5]]) + "\n")

# Closing files
featureFile.close()
allCtcfAllPromFile.close()
allCtcfPlusPromFile.close()
allCtcfMinusPromFile.close()
ctcfBamFile.close()

# Cat
tempCatFileName = tempLoc + "tempCatFileName.bed"
command = "cat " + allCtcfPlusPromFileName + " " + allCtcfMinusPromFileName + " > " + tempCatFileName
os.system(command)

# Sorting
tempSortFileName = tempLoc + "tempSortFileName.bed"
command = "sort -k1,1 -k2,2n " + tempCatFileName + " > " + tempSortFileName
os.system(command)

# Merge
command = "mergeBed -c 4,5,6 -o first,mean,first -i " + tempSortFileName + " > " + allCtcfAllPromFileName
os.system(command)
Example #40
def estimate_bias_pwm(args):
    # Parameters
    max_duplicates = 100

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    obs_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    obs_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prev_pos = -1
        true_counter = 0

        # Evaluating observed frequencies
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prev_pos:
                true_counter += 1
            else:
                prev_pos = p1
                true_counter = 0
            if true_counter > max_duplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                for i in range(0, len(currStr)):
                    obs_f_pwm_dict[currStr[i]][i] += 1
            else:
                for i in range(0, len(currStr)):
                    obs_r_pwm_dict[currStr[i]][i] += 1

        # Evaluating expected frequencies
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue

        # Iterating on each sequence position
        s = None
        for i in range(0, len(currStr) - args.k_nb):
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            for j in range(0, len(s)):
                exp_f_pwm_dict[s[j]][j] += 1

            # Counting k-mer in dictionary for reverse complement
            s = AuxiliaryFunctions.revcomp(s)
            for j in range(0, len(s)):
                exp_r_pwm_dict[s[j]][j] += 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Output pwms
    os.system("mkdir -p " + os.path.join(args.output_location, "pfm"))
    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_file_list = []
    pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(args.k_nb)))
    pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(args.k_nb)))
    pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(args.k_nb)))
    pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(args.k_nb)))

    pwm_file_list.append(pwm_obs_f)
    pwm_file_list.append(pwm_obs_r)
    pwm_file_list.append(pwm_exp_f)
    pwm_file_list.append(pwm_exp_r)

    for i in range(len(pwm_dict_list)):
        with open(pwm_file_list[i], "w") as pwm_file:
            for e in ["A", "C", "G", "T"]:
                pwm_file.write(" ".join([str(int(f)) for f in pwm_dict_list[i][e]]) + "\n")

    motif_obs_f = motifs.read(open(pwm_obs_f), "pfm")
    motif_obs_r = motifs.read(open(pwm_obs_r), "pfm")
    motif_exp_f = motifs.read(open(pwm_exp_f), "pfm")
    motif_exp_r = motifs.read(open(pwm_exp_r), "pfm")

    # Output logos
    os.system("mkdir -p " + os.path.join(args.output_location, "logo"))
    logo_obs_f = os.path.join(args.output_location, "logo", "obs_{}_f.pdf".format(str(args.k_nb)))
    logo_obs_r = os.path.join(args.output_location, "logo", "obs_{}_r.pdf".format(str(args.k_nb)))
    logo_exp_f = os.path.join(args.output_location, "logo", "exp_{}_f.pdf".format(str(args.k_nb)))
    logo_exp_r = os.path.join(args.output_location, "logo", "exp_{}_r.pdf".format(str(args.k_nb)))

    motif_obs_f.weblogo(logo_obs_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_obs_r.weblogo(logo_obs_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_exp_f.weblogo(logo_exp_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)
    motif_exp_r.weblogo(logo_exp_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, args.k_nb)
        exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, args.k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, args.k_nb)
        exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, args.k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
Example #41
def estimate_bias_kmer(args):
    # Parameters
    maxDuplicates = 100
    pseudocount = 1.0

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # Initializing dictionaries
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()

    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:

        # Initialization
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies ####################################
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):

            # Calculating positions
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates: continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse: currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                ct_reads_f += 1
                try:
                    obsDictF[currStr] += 1
                except Exception:
                    obsDictF[currStr] = 1
            else:
                ct_reads_r += 1
                try:
                    obsDictR[currStr] += 1
                except Exception:
                    obsDictR[currStr] = 1

        # Evaluating expected frequencies ####################################
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)

        # Iterating on each sequence position
        for i in range(0, len(currStr) - args.k_nb):
            ct_kmers += 1
            # Counting k-mer in dictionary
            s = currStr[i:i + args.k_nb]
            try:
                expDictF[s] += 1
            except Exception:
                expDictF[s] = 1

            # Counting k-mer in dictionary for reverse complement
            s = currRevComp[i:i + args.k_nb]
            try:
                expDictR[s] += 1
            except Exception:
                expDictR[s] = 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Creating bias dictionary
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
        try:
            obsF = obsDictF[kmer] + pseudocount
        except Exception:
            obsF = pseudocount
        try:
            expF = expDictF[kmer] + pseudocount
        except Exception:
            expF = pseudocount
        if ct_reads_f == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
        try:
            obsR = obsDictR[kmer] + pseudocount
        except Exception:
            obsR = pseudocount
        try:
            expR = expDictR[kmer] + pseudocount
        except Exception:
            expR = pseudocount
        if ct_reads_r == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
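Both bias estimators (this one and the PWM variant above) read their settings from an args namespace. A sketch of the attributes they actually touch; every value is a placeholder.

from argparse import Namespace

args = Namespace(reads_file="atac.bam",          # placeholder BAM of cut sites
                 regions_file="hotspots.bed",    # placeholder hypersensitive regions
                 organism="hg38",
                 k_nb=6,                         # k-mer length
                 forward_shift=5,                # forward-strand cut-site shift
                 reverse_shift=-4,               # reverse-strand cut-site shift
                 output_location="bias_out",
                 output_prefix="sample1")
estimate_bias_kmer(args)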
Example #43
def main(argv=None):
    """Main script."""
    ##########################
    # COMMAND-LINE ARGUMENTS #
    ##########################

    # Get myself
    program_name = sys.argv[0]

    if not argv:
        argv = sys.argv[1:]

    # Get the cluster type used to control arguments
    cluster_type = cluster.get_cluster_environment()

    parser  = argparse.ArgumentParser(
        description=__doc__, add_help=False,
        epilog=EPILOG, formatter_class=run.CustomFormatter)

    req = parser.add_argument_group('Required arguments')
    req.add_argument('-m', '--mode',
                     help='Operation mode', choices=['single', 'multi'],
                     required=True, metavar='mode')
    req.add_argument('-s', '--snps',
                     help='SNP BED file', required=True, metavar='<BED>')
    req.add_argument('-r', '--reads',
                     help='Mapped reads file [sam or bam]',
                     required=True, metavar='<[S/B]AM>')

    uni = parser.add_argument_group('Universal optional arguments')
    uni.add_argument('-p', '--prefix',
                     help='Prefix for temp files and output', default='TEST',
                     metavar='')
    uni.add_argument('-b', '--bam', action='store_true', dest='bam',
                     help='Mapped read file type is bam (auto-detected if *.bam)')
    uni.add_argument('-n', '--noclean', action='store_true',
                     help='Do not delete intermediate files (for debugging)')
    uni.add_argument('-R', '--random-seed', default=None, type=int,
                     help='Set the state of the randomizer (for testing)')
    uni.add_argument('-h', '--help', action='help',
                     help='show this help message and exit')

    mult = parser.add_argument_group('Multi(plex) mode arguments')
    mult.add_argument('-j', '--jobs', type=int,
                      help='Divide into # of jobs', default=100, metavar='')
    if cluster_type == 'slurm' or cluster_type == 'torque':
        mult.add_argument('-w', '--walltime',
                          help='Walltime for each job', default='3:00:00',
                          metavar='')
        mult.add_argument('-k', '--mem', dest='memory', metavar='',
                          help='Memory for each job', default='5000MB')
        mult.add_argument('--queue',
                          help='Queue to submit jobs to', default='batch',
                          metavar='')
        mult.add_argument('--cluster', choices=['torque', 'slurm', 'normal'],
                          help='Which cluster to use, normal uses threads ' +
                          'on this machine', default=cluster_type)
    mult.add_argument('--threads', type=int, metavar='', default=cpu_count(),
                      help='Max number of threads to run at a time ' +
                      '(normal mode only).')

    single = parser.add_argument_group('Single mode arguments')
    single.add_argument('-f', '--suffix', default='', metavar='',
                        help='Suffix for multiplexing [set automatically]')

    logging = parser.add_argument_group('Logging options')
    logging.add_argument('-q', '--quiet', action='store_true',
                         help="Quiet mode, only prints warnings.")
    logging.add_argument('-v', '--verbose', action='store_true',
                         help="Verbose mode, prints debug info too.")
    logging.add_argument('--logfile',
                         help='Logfile to write messages to, default is ' +
                         'STDERR')

    args = parser.parse_args()
    if args.random_seed is not None:
        random.seed(args.random_seed)
        print("Seed: ", args.random_seed, random.getstate()[1][:10])

    ###########################################################################
    #                            File Preparations                            #
    ###########################################################################

    # Take care of logging
    if args.logfile:
        logme.LOGFILE = args.logfile
    if args.quiet:
        logme.MIN_LEVEL = 'warn'
    elif args.verbose:
        logme.MIN_LEVEL = 'debug'

    # Initialize variables
    prefix = args.prefix + '_'

    # Make sure we can run ourselves
    if not run.is_exe(program_name):
        program_name = run.which(parser.prog)

    # Set the cluster type if we are in multi mode
    if args.mode == 'multi' and (cluster_type == 'slurm'
                                 or cluster_type == 'torque'):
        cluster.QUEUE = args.cluster

    # Check if the read file is sam or bam
    file_check = args.reads.split('.')
    file_check[-1] = file_check[-1].lower()
    sam_path, sam_file = os.path.split(args.reads)

    if args.reads.endswith('bam') or args.bam:
        mode = 'rb'
    else:
        mode = 'r'

    ##################
    # MULTIPLEX MODE #
    ##################

    # If we're running in multiplex mode
    if args.mode == 'multi':
        logme.log('Splitting sam file {} into {} files.'.format(sam_file,
                                                                args.jobs))
        reads_files = split_samfile(os.path.join(sam_path, sam_file),
                                    args.jobs, prefix)
        logme.log('Splitting complete.')

        # Create PBS scripts and submit jobs to the cluster
        subnoclean = ' --noclean' if args.noclean else ''
        logme.log('Submitting split files to cluster')
        jobs = []  # Hold job info for later checking
        for reads_file in reads_files:
            suffix = reads_file[-4:]

            command = ("python2 " + program_name + " --mode single --snps " +
                       args.snps + " --reads " + reads_file + " --suffix " +
                       suffix + " --prefix " + args.prefix + subnoclean +
                       ' --bam')

            if cluster_type == 'normal':
                jobs.append(cluster.submit(command, name=prefix + suffix,
                                           threads=args.threads))
            else:
                jobs.append(cluster.submit(command, name=prefix + suffix,
                                           time=args.walltime, cores=1,
                                           mem=args.memory,
                                           partition=args.queue))
            sleep(2)    # Pause for two seconds to make sure job is submitted

        # Now wait and check for all jobs to complete every so long
        logme.log('Submission done, waiting for jobs to complete.')

        # First wait for jobs in queue to complete
        cluster.wait(jobs)
        sleep(1)

        # Next, check if any jobs failed
        failed = []
        for i in range(1, args.jobs+1):
            suffix = str(i).zfill(4)
            if not os.path.isfile(prefix + suffix + '_done'):
                failed.append(prefix + suffix)

        # If any jobs failed, terminate
        if failed:
            logme.log('Some jobs failed!', 'critical')
            return -1

        logme.log('Jobs completed.')
        # Remove 'done' files in case we want to run again.
        os.system('rm {prefix}*_done'.format(prefix=prefix))

        # Once the jobs are done, concatenate all of the counts into one file.
        # Initialize dictionaries

        tot_pos_counts = {}
        tot_neg_counts = {}
        tot_tot_counts = {}
        tot_sum_pos = {}
        tot_sum_neg = {}

        for i in range(1, args.jobs+1):
            suffix = str(i).zfill(4)
            in_counts = prefix + 'SNP_COUNTS_' + suffix

            # Parse the line to add it to the total file
            with run.open_zipped(in_counts, 'r') as in_counts:
                for line in in_counts:
                    line = line.rstrip('\n')
                    line_t = line.split('\t')

                    if 'CHR' in line:
                        continue

                    pos = line_t[0] + '|' + line_t[1]

                    pos_split = line_t[2].split('|')
                    neg_split = line_t[3].split('|')

                    if pos in tot_pos_counts or pos in tot_neg_counts or pos in tot_tot_counts:
                        for j in range(len(pos_split)):
                            tot_pos_counts[pos][j] += int(pos_split[j])
                            tot_neg_counts[pos][j] += int(neg_split[j])
                        tot_sum_pos[pos] += int(line_t[4])
                        tot_sum_neg[pos] += int(line_t[5])
                        tot_tot_counts[pos] += int(line_t[6])

                    else:
                        tot_pos_counts[pos] = [0, 0, 0, 0]
                        tot_neg_counts[pos] = [0, 0, 0, 0]
                        tot_tot_counts[pos] = 0
                        tot_sum_pos[pos] = 0
                        tot_sum_neg[pos] = 0
                        for j in range(len(pos_split)):
                            tot_pos_counts[pos][j] += int(pos_split[j])
                            tot_neg_counts[pos][j] += int(neg_split[j])
                        tot_sum_pos[pos] += int(line_t[4])
                        tot_sum_neg[pos] += int(line_t[5])
                        tot_tot_counts[pos] += int(line_t[6])

        # Write out the final concatenated file
        with run.open_zipped(prefix + 'SNP_COUNTS.txt', 'w') as final_counts:
            final_counts.write('CHR\tPOSITION\tPOS_A|C|G|T\tNEG_A|C|G|T\t' +
                               'SUM_POS_READS\tSUM_NEG_READS\tSUM_READS\n')

            keys = sorted(tot_pos_counts.keys())

            for key in keys:
                pos = key.split('|')
                pos_fix = [str(x) for x in tot_pos_counts[key]]
                neg_fix = [str(x) for x in tot_neg_counts[key]]
                pos_out = '|'.join(pos_fix)
                neg_out = '|'.join(neg_fix)
                final_counts.write(str(pos[0]) + '\t' + str(pos[1]) + '\t' +
                                   pos_out + '\t' + neg_out + '\t' +
                                   str(tot_sum_pos[key]) + '\t' +
                                   str(tot_sum_neg[key]) + '\t' +
                                   str(tot_tot_counts[key]) + '\n')

        # Sort the file numerically
        os.system('sort -k1,2 -n ' + prefix + 'SNP_COUNTS.txt ' + ' -o ' +
                  prefix + 'SNP_COUNTS.txt')

        # Clean up intermediate files.
        if args.noclean is False:
            cluster.clean()
            os.system('rm {prefix}*COUNTS_* {prefix}*split_sam_*'.format(
                prefix=prefix))

    ###############
    # SINGLE MODE #
    ###############

    # If we're running in single mode (each job submitted by multiplex mode
    # will be running in single mode)
    elif args.mode == 'single':

        # First read in the information on the SNPs that we're interested in.
        snps = {}    # Initialize a dictionary of SNP positions

        with run.open_zipped(args.snps) as snp_file:
            for line in snp_file:
                line = line.rstrip('\n')
                line_t = line.split('\t')

                pos = chrom_to_num(line_t[0]) + '|' + str(line_t[2])
                snps[pos] = line_t[3]

        # This is the dictionary of potential SNPs for each read.
        potsnp_dict = {}

        # Now parse the SAM file to extract only reads overlapping SNPs.
        in_sam     = Samfile(args.reads, mode)
        references = in_sam.references  # Faster to make a copy of references.

        # Trackers to count how many reads are lost at each step
        indel_skip = 0
        nosnp_skip = 0
        count      = 0
        snp_count  = 0
        ryo_filter = 0

        for line in in_sam:
            count += 1

            # Skip lines that overlap indels OR don't match Ns
            cigarstring = line.cigarstring

            if 'D' in cigarstring or 'I' in cigarstring:
                indel_skip += 1
                continue

            # Split the tags to find the MD tag:
            tags = line.tags
            for tagname, tagval in tags:
                if tagname == 'MD' and 'N' in tagval:
                    # Remember that, for now, we're not allowing reads that
                    # overlap insertions/deletions.

                    chrom = references[line.rname]
                    pos   = line.pos
                    read  = line.seq

                    # We're assuming
                    # correct mapping such that FIRST MATES on the NEGATIVE
                    # STRAND are NEGATIVE, while SECOND MATES on the NEGATIVE
                    # STRAND are POSITIVE.

                    if line.is_reverse:
                        orientation = '-'
                    else:
                        orientation = '+'

                    # Parse the CIGAR string
                    cigar_types, cigar_vals = split_CIGAR(cigarstring)

                    if cigar_types[0] == 'S':
                        MD_start = int(cigar_vals[0])
                    else:
                        MD_start = 0

                    # Get the genomic positions corresponding to each base-pair
                    # of the read
                    read_genomic_positions = CIGAR_to_Genomic_Positions(
                        cigar_types, cigar_vals, line.pos+1)

                    # Get the tag data
                    MD_split = re.findall(r'\d+|\D+', tagval)

                    genome_start = 0

                    # The snp_pos dictionary will store the 1-base position
                    # => allele
                    snp_pos = {}
                    for i in MD_split:
                        if re.match(r'\^', i):
                            pass
                        elif i.isalpha():
                            if i == 'N':
                                snp_pos[read_genomic_positions[genome_start]] = read[MD_start]
                                MD_start += 1
                                genome_start += 1
                            else:
                                MD_start += 1
                                genome_start += 1
                        else:
                            MD_start += int(i)
                            genome_start += int(i)

                    for i in snp_pos:
                        snp_count += 1

                        # RYO: START EDIT - Implemented Filter
                        posVal = line.reference_name + '|' + str(i)
                        if posVal not in snps:
                            nosnp_skip += 1
                            continue
                        # RYO: END EDIT - Implemented Filter

                        snp = '{chr}|{i}\t{snp_pos}\t{orientation}'.format(
                            chr=chrom, i=i, snp_pos=snp_pos[i],
                            orientation=orientation)
                        if line.qname in potsnp_dict:
                            if snp not in potsnp_dict[line.qname]:
                                # RYO EDIT HERE - added conditional so that
                                # pairs of reads are not considered twice if
                                # they both overlap the same snp.
                                potsnp_dict[line.qname].append(snp)
                            else:
                                ryo_filter += 1
                        else:
                            potsnp_dict[line.qname] = []
                            potsnp_dict[line.qname].append(snp)

        in_sam.close()

        # Log all of the skipped reads
        logme.log('Total reads: {}'.format(count), 'debug')
        logme.log('Reads skipped for indels: {}'.format(indel_skip), 'debug')
        logme.log('Total SNPs checked: {}'.format(snp_count), 'debug')
        logme.log('SNPs not in SNP list: {}'.format(nosnp_skip), 'debug')
        logme.log('Ryo filter: {}'.format(ryo_filter), 'debug')

        # Initialize the counting dictionaries
        pos_counts = {}
        neg_counts = {}

        # Go through the potential SNP dictionary and choose one SNP at random
        # for those overlapping multiple SNPs
        if args.random_seed is not None:  # Dictionaries are unordered, so must sort for consistent random seed output.
            keys = sorted(list(potsnp_dict.keys()))
        else:  # Because sorting is slow, only do it when a random seed is set; the slowdown is about 0.1s per 1 million reads.
            keys = list(potsnp_dict.keys())
        for key in keys:
            snp = random.choice(potsnp_dict[key]).split('\t')

            if snp[0] in snps:
                if snp[0] in pos_counts or snp[0] in neg_counts:
                    if snp[2] == '+':
                        if snp[1] == 'A':
                            pos_counts[snp[0]][0] += 1
                        if snp[1] == 'C':
                            pos_counts[snp[0]][1] += 1
                        if snp[1] == 'G':
                            pos_counts[snp[0]][2] += 1
                        if snp[1] == 'T':
                            pos_counts[snp[0]][3] += 1

                    elif snp[2] == '-':
                        if snp[1] == 'A':
                            neg_counts[snp[0]][0] += 1
                        if snp[1] == 'C':
                            neg_counts[snp[0]][1] += 1
                        if snp[1] == 'G':
                            neg_counts[snp[0]][2] += 1
                        if snp[1] == 'T':
                            neg_counts[snp[0]][3] += 1

                else:
                    pos_counts[snp[0]] = [0, 0, 0, 0]
                    neg_counts[snp[0]] = [0, 0, 0, 0]
                    if snp[2] == '+':
                        if snp[1] == 'A':
                            pos_counts[snp[0]][0] += 1
                        if snp[1] == 'C':
                            pos_counts[snp[0]][1] += 1
                        if snp[1] == 'G':
                            pos_counts[snp[0]][2] += 1
                        if snp[1] == 'T':
                            pos_counts[snp[0]][3] += 1

                    elif snp[2] == '-':
                        if snp[1] == 'A':
                            neg_counts[snp[0]][0] += 1
                        if snp[1] == 'C':
                            neg_counts[snp[0]][1] += 1
                        if snp[1] == 'G':
                            neg_counts[snp[0]][2] += 1
                        if snp[1] == 'T':
                            neg_counts[snp[0]][3] += 1

        # Open the output file and write the SNP counts to it

        out_counts = prefix + 'SNP_COUNTS_' + args.suffix if args.suffix \
            else prefix + 'SNP_COUNTS.txt'

        with open(out_counts, 'w') as out_counts:
            # Write header
            out_counts.write('CHR\tPOSITION\tPOS_A|C|G|T\tNEG_A|C|G|T\t' +
                             'SUM_POS_READS\tSUM_NEG_READS\tSUM_READS\n')

            # Sort SNP positions and write them
            keys = sorted(pos_counts.keys())

            for key in keys:
                pos = key.split('|')
                sum_pos = sum(pos_counts[key])
                sum_neg = sum(neg_counts[key])
                tot_sum = sum(pos_counts[key]) + sum(neg_counts[key])
                pos_fix = [str(x) for x in pos_counts[key]]
                neg_fix = [str(x) for x in neg_counts[key]]
                positive = '|'.join(pos_fix)
                negative = '|'.join(neg_fix)

                out_counts.write(pos[0] + '\t' + pos[1] + '\t' + positive +
                                 '\t' + negative + '\t' + str(sum_pos) + '\t' +
                                 str(sum_neg) + '\t' + str(tot_sum) + '\n')

        if args.suffix:
            os.system('touch ' + prefix + args.suffix + '_done')
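This script depends on helpers the excerpt omits, among them split_CIGAR and CIGAR_to_Genomic_Positions. A minimal sketch of what split_CIGAR must return, inferred only from how it is used above; the original implementation may differ.

import re

def split_CIGAR(cigarstring):
    # Split e.g. '3S47M1D20M' into (['S', 'M', 'D', 'M'], [3, 47, 1, 20]).
    cigar_types = re.findall(r'[A-Z=]', cigarstring)
    cigar_vals = [int(x) for x in re.findall(r'\d+', cigarstring)]
    return cigar_types, cigar_vals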
Example #44
def main():
    parser = OptionParser(usage=usage)
    #parser.add_option("-s", action="store_true", dest="sam_input", default=False,
    #help="Input is in SAM format instead of BAM format")
    (options, args) = parser.parse_args()
    if len(args) != 4:
        parser.print_help()
        sys.exit(1)
    psl_filename = args[0]
    ref_filename = args[1]
    contigs_filename = args[2]
    bam_filename = args[3]
    liftover_dir = args[1]

    references, ref_chromosomes = read_fasta(ref_filename)
    refname_to_id = dict([(name, i) for i, name in enumerate(ref_chromosomes)])
    print('Read',
          len(ref_chromosomes),
          'reference chromosomes:',
          ','.join(ref_chromosomes),
          file=sys.stderr)
    contigs, contig_names = read_fasta(contigs_filename)
    print('Read', len(contig_names), 'contigs.', file=sys.stderr)
    bam_header = {
        'HD': {
            'VN': '1.0'
        },
        'SQ': [
            dict([('LN', len(references[chromosome])), ('SN', chromosome)])
            for chromosome in ref_chromosomes
        ]
    }
    outfile = Samfile(bam_filename, 'wb', header=bam_header)

    line_nr = 0
    header_read = False
    for line in (s.strip() for s in open(psl_filename)):
        line_nr += 1
        if line.startswith('------'):
            header_read = True
            continue
        if not header_read: continue
        fields = line.split()
        assert len(
            fields
        ) == 21, 'Error reading PSL file, offending line: %d' % line_nr
        sizes = [int(x) for x in fields[18].strip(',').split(',')]
        contig_starts = [int(x) for x in fields[19].strip(',').split(',')]
        ref_starts = [int(x) for x in fields[20].strip(',').split(',')]
        assert 0 < len(sizes) == len(contig_starts) == len(ref_starts)
        strand = fields[8]
        contig_name = fields[9]
        ref_name = fields[13]
        assert strand in ['-', '+']
        assert contig_name in contigs
        assert ref_name in references
        a = AlignedRead()
        a.qname = contig_name
        if strand == '+':
            a.seq = str(contigs[contig_name])
        else:
            a.seq = str(contigs[contig_name].reverse_complement())
        a.flag = (16 if strand == '+' else 0)
        a.rname = refname_to_id[ref_name]
        a.pos = ref_starts[0]
        a.mapq = 255
        qpos = contig_starts[0]
        refpos = ref_starts[0]
        cigar = []
        # soft-clipping at the start?
        if contig_starts[0] > 0:
            cigar.append((4, contig_starts[0]))
        longest_insertion = 0
        longest_deletion = 0
        total_matches = 0
        total_insertion = 0
        total_deletion = 0
        for length, contig_start, ref_start in zip(sizes, contig_starts,
                                                   ref_starts):
            assert contig_start >= qpos
            assert ref_start >= refpos
            # insertion?
            if contig_start > qpos:
                insertion_length = contig_start - qpos
                longest_insertion = max(longest_insertion, insertion_length)
                total_insertion += insertion_length
                append_to_cigar(cigar, 1, insertion_length)
                qpos = contig_start
            # deletion?
            if ref_start > refpos:
                deletion_length = ref_start - refpos
                longest_deletion = max(longest_deletion, deletion_length)
                total_deletion += deletion_length
                append_to_cigar(cigar, 2, deletion_length)
                refpos = ref_start
            # stretch of matches/mismatches
            append_to_cigar(cigar, 0, length)
            refpos += length
            qpos += length
            total_matches += length
        # soft-clipping at the end?
        if len(a.seq) > qpos:
            cigar.append((4, len(a.seq) - qpos))
        a.cigar = tuple(cigar)
        # only use contigs where longest deletion is <= 10000 bp
        if longest_deletion > 10000: continue
        # require at least 200 matching positions
        if total_matches < 200: continue
        # require the matching positions to make up at least 75 percent of the contig
        # (without counting parts of the contig that are insertions).
        if float(total_matches) / (len(a.seq) - total_insertion) < 0.75: continue
        outfile.write(a)
    outfile.close()
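The PSL-to-BAM converter above calls two helpers not shown here, read_fasta and append_to_cigar. A hedged sketch of append_to_cigar, inferred from how the CIGAR list is assembled; the original may differ.

def append_to_cigar(cigar, op, length):
    # Append (op, length) to the CIGAR list, merging with the previous
    # entry when it uses the same operation code.
    if cigar and cigar[-1][0] == op:
        cigar[-1] = (op, cigar[-1][1] + length)
    else:
        cigar.append((op, length))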
Example #45
    if(counter == len(resVec)): break

  # Overlap and Writing output
  if(maxValue != globalMin):
    # t1 = MPBS
    t1 = [0,0]
    t2Write = [0,0]
    if(maxPos >= 0): t1 = [p1+maxPos, p1+maxPos+maxMotifLen]
    else: t1 = [p1-maxPos, p1-maxPos+maxMotifLen]
    maxOverlap = 0
    for f in footprints:
      # t2 = footprint
      t2 = [f.pos,f.aend]
      overlapN = overlap(t1, t2)
      if(overlapN > maxOverlap):
        maxOverlap = overlapN
        t2Write[0] = t2[0]
        t2Write[1] = t2[1]
    resVec = [maxValue, t1[0], t1[1], maxOverlap, t2Write[0], t2Write[1]]
    writeOutput(ll,regionTagCount,resVec,outFile)
  else: writeOutput(ll,regionTagCount,resVec,outFile)

# Termination
bedFile.close()
outFile.close()
genomeFile.close()
dnaseBam.close()
fpBam.close()
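The fragment above uses an overlap helper that is not shown. A minimal sketch under the assumption that it returns the length of the intersection of two half-open intervals:

def overlap(t1, t2):
    # Overlap length between intervals t1 = [start1, end1] and t2 = [start2, end2].
    return max(0, min(t1[1], t2[1]) - max(t1[0], t2[0]))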


Example #46
def snp_workflow(ex, job, assembly, minsnp=40., mincov=5, path_to_ref=None, via='local',
                 logfile=sys.stdout, debugfile=sys.stderr):
    """Main function of the workflow"""
    ref_genome = assembly.fasta_by_chrom
    sample_names = [job.groups[gid]['name'] for gid in sorted(job.files.keys())]

    logfile.write("\n* Generate vcfs for each chrom/group\n"); logfile.flush()
    vcfs = dict((chrom,{}) for chrom in ref_genome.keys()) # {chr: {}}
    bams = {}
    # Launch the jobs
    for gid in sorted(job.files.keys()):
        # Merge all bams belonging to the same group
        runs = [r['bam'] for r in job.files[gid].itervalues()]
        bam = Samfile(runs[0])
        header = bam.header
        headerfile = unique_filename_in()
        for h in header["SQ"]:
            if h["SN"] in assembly.chrmeta:
                h["SN"] = assembly.chrmeta[h["SN"]]["ac"]
        head = Samfile( headerfile, "wh", header=header )
        head.close()
        if len(runs) > 1:
            _b = merge_bam(ex,runs)
            index_bam(ex,_b)
            bams[gid] = _b
        else:
            bams[gid] = runs[0]
        # Samtools mpileup + bcftools + vcfutils.pl
        for chrom,ref in ref_genome.iteritems():
            vcf = unique_filename_in()
            vcfs[chrom][gid] = (vcf,
                                pileup.nonblocking(ex, bams[gid], ref, header=headerfile,
                                                   via=via, stdout=vcf))
        logfile.write("  ...Group %s running.\n" %job.groups[gid]['name']); logfile.flush()
    # Wait for vcfs to finish and store them in *vcfs[chrom][gid]*
    for gid in sorted(job.files.keys()):
        for chrom,ref in ref_genome.iteritems():
            vcfs[chrom][gid][1].wait()
            vcfs[chrom][gid] = vcfs[chrom][gid][0]
        logfile.write("  ...Group %s done.\n" %job.groups[gid]['name']); logfile.flush()
    # Targz the pileup files (vcf)
    tarname = unique_filename_in()
    tarfh = tarfile.open(tarname, "w:gz")
    for chrom,v in vcfs.iteritems():
        for gid,vcf in v.iteritems():
            tarfh.add(vcf, arcname="%s_%s.vcf" % (job.groups[gid]['name'],chrom))
    tarfh.close()
    ex.add( tarname, description=set_file_descr("vcfs_files.tar.gz",step="pileup",type="tar",view='admin') )

    logfile.write("\n* Merge info from vcf files\n"); logfile.flush()
    outall = unique_filename_in()
    outexons = unique_filename_in()
    with open(outall,"w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+ \
                                 ['gene','location_type','distance'])+'\n')
    with open(outexons,"w") as fout:
        fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+['exon','strand','ref_aa'] \
                                  + ['new_aa_'+s for s in sample_names])+'\n')
    msa_table = dict((s,'') for s in [assembly.name]+sample_names)
    for chrom,v in vcfs.iteritems():
        logfile.write("  > Chromosome '%s'\n" % chrom); logfile.flush()
    # Put together info from all vcf files
        logfile.write("  - All SNPs\n"); logfile.flush()
        allsnps = all_snps(ex,chrom,vcfs[chrom],bams,outall,assembly,
                           sample_names,mincov,float(minsnp),logfile,debugfile)
    # Annotate SNPs and check synonymy
        logfile.write("  - Exonic SNPs\n"); logfile.flush()
        exon_snps(chrom,outexons,allsnps,assembly,sample_names,ref_genome,logfile,debugfile)
        for snprow in allsnps:
            for n,k in enumerate([assembly.name]+sample_names):
                msa_table[k] += snprow[3+n][0]
    description = set_file_descr("allSNP.txt",step="SNPs",type="txt")
    ex.add(outall,description=description)
    description = set_file_descr("exonsSNP.txt",step="SNPs",type="txt")
    ex.add(outexons,description=description)
    msafile = unique_filename_in()
    with open(msafile,"w") as msa:
        msa.write(" %i %i\n"%(len(msa_table),len(msa_table.values()[0])))
        for name,seq in msa_table.iteritems():
            msa.write("%s\t%s\n" %(name,seq))
    msa_table = {}
    description = set_file_descr("SNPalignment.txt",step="SNPs",type="txt")
    ex.add(msafile,description=description)
    # Create UCSC bed tracks
    logfile.write("\n* Create tracks\n"); logfile.flush()
    create_tracks(ex,outall,sample_names,assembly)
    # Create quantitative tracks
    logfile.write("\n* Create heteroz. and quality tracks\n"); logfile.flush()

    def _process_pileup(pileups, seq, startpos, endpos):
        atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        vectors = ([],[],[])
        for pileupcolumn in pileups:
            position = pileupcolumn.pos
            if position < startpos: continue
            if position >= endpos: break
            coverage = pileupcolumn.n
            ref_symbol = seq[position-startpos]
            ref = atoi.get(ref_symbol, 4)
            symbols = [0,0,0,0,0]
            quality = 0
            for pileupread in pileupcolumn.pileups:
                symbols[atoi.get(pileupread.alignment.seq[pileupread.qpos], 4)] += 1
                quality += ord(pileupread.alignment.qual[pileupread.qpos])-33
            quality = float(quality)/coverage
            info = heterozygosity(ref, symbols[0:4])
            if coverage > 0: vectors[0].append((position, position+1, coverage))
            if info > 0: vectors[1].append((position, position+1, info))
            if quality > 0: vectors[2].append((position, position+1, quality))
#            yield (position, position+1, coverage, info, quality)
        return vectors

    if job.options.get('make_bigwigs',False):
        _descr = {'groupId':0,'step':"tracks",'type':"bigWig",'ucsc':'1'}
        for gid,bamfile in bams.iteritems():
            _descr['groupId'] = gid
            bamtr = track(bamfile,format="bam")
            covname = unique_filename_in()+".bw"
            out_cov = track(covname, chrmeta=assembly.chrmeta)
            hetname = unique_filename_in()+".bw"
            out_het = track(hetname, chrmeta=assembly.chrmeta)
            qualname = unique_filename_in()+".bw"
            out_qual = track(qualname, chrmeta=assembly.chrmeta)
            for chrom, cinfo in assembly.chrmeta.iteritems():
                fasta = Fastafile(ref_genome[chrom])
                #process fasta and bam by 10Mb chunks
                for chunk in range(0,cinfo["length"],10**7):
                    fastaseq = fasta.fetch(cinfo['ac'], chunk, chunk+10**7)
                    vecs = _process_pileup(bamtr.pileup(chrom, chunk, chunk+10**7), fastaseq, chunk, chunk+10**7)
                    out_cov.write(vecs[0], fields=['start','end','score'], chrom=chrom)
                    out_het.write(vecs[1], fields=['start','end','score'], chrom=chrom)
                    out_qual.write(vecs[2], fields=['start','end','score'], chrom=chrom)
            out_cov.close()
            out_het.close()
            out_qual.close()
            description = set_file_descr(job.groups[gid]['name']+"_coverage.bw",**_descr)
            ex.add(covname,description=description)
            description = set_file_descr(job.groups[gid]['name']+"_heterozygosity.bw",**_descr)
            ex.add(hetname,description=description)
            description = set_file_descr(job.groups[gid]['name']+"_quality.bw",**_descr)
            ex.add(qualname,description=description)

    return 0
Example #47
outputFile4.write("\t".join(header4) + "\n")
for i in range(0, maxV):
    vec = []
    for j in range(0, len(vectorTable4)):
        try:
            vec.append(vectorTable4[j][i])
        except Exception:
            vec.append("NA")
        try:
            vec.append(vectorTable5[j][i])
        except Exception:
            vec.append("NA")
    outputFile4.write("\t".join(vec) + "\n")

stagFile.close()
outputFile1.close()
outputFile2.close()
outputFile3.close()
outputFile4.close()
genomeFile.close()
regionsFile.close()
#chrommHmmFile.close()
enhancersFile.close()
[e.close() for e in signalFileList]
[e.close() for e in controlFileList]
[e.close() for e in motifFileList]

# Removing all files
command = "rm -rf " + tempLocation
os.system(command)
Example #48
samfile = Samfile(args.path)
for segment in samfile.fetch(until_eof=True):
	num = segment.query_name.split("|")[0]
	for etype, eset in errors.iteritems():
		if(num in eset):
			errors2segments[etype][num].append(segment);
			break;
		
		
additional = defaultdict(list);
for fname in args.additional:
	tsamfile = Samfile(fname);
	for segment in tsamfile.fetch(until_eof=True):
		num = segment.query_name.split("|")[0]
		additional[num].append(ArWrapper(segment, tsamfile.getrname(segment.tid)))
	tsamfile.close();
		
		
		
		
for etype, d in errors2segments.iteritems():
	with open(os.path.join(args.outdir, "%s_%s_error.txt" % etype), 'w') as f:
		for num, segments in d.iteritems():
			if(segments[0].is_reverse):
				seq = reverse_complement(segments[0].seq);
			else:	
				seq = segments[0].seq
			
			f.write("%s\nnumber of read:\t%s\n\nSequence:\t%s\n\nSegments:\n\n" % ("_"*140, num, seq))
			for segment in segments:
				f.write("%s\t%s\t%d\t%s\t%d\t%s\n\n" % (segment.query_name.split("|")[2], samfile.getrname(segment.tid), segment.reference_start, segment.cigarstring, segment.get_tag("AS"), segment.query_name));