Example #1
def _get(relative_path, genome=None):
    """
    :param relative_path: relative path of the file inside the repository
    :param genome: genome name. Can contain a chromosome name after a dash, like hg19-chr20;
                   in the case of a BED file, the returned BedTool will have a chromosome filter applied.
    :return: BedTool object if it's a BED file, otherwise the file path
    """
    chrom = None
    if genome:
        if '-chr' in genome:
            genome, chrom = genome.split('-')
        check_genome(genome)
        relative_path = relative_path.format(genome=genome)

    path = abspath(join(dirname(__file__), relative_path))
    if not isfile(path) and isfile(path + '.gz'):
        path += '.gz'

    if path.endswith('.bed') or path.endswith('.bed.gz'):
        if path.endswith('.bed.gz'):
            bedtools = which('bedtools')
            if not bedtools:
                critical('bedtools not found in PATH: ' + str(os.environ['PATH']))
            debug('BED is compressed, creating BedTool')
            bed = BedTool(path)
        else:
            debug('BED is uncompressed, creating BedTool')
            bed = BedTool(path)

        if chrom:
            debug('Filtering BEDTool for chrom ' + chrom)
            bed = bed.filter(lambda r: r.chrom == chrom)
        return bed
    else:
        return path
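A minimal usage sketch for _get; the repository-relative path and genome build below are illustrative placeholders, not from the source. With a "-chr" suffix the returned BedTool is filtered to that chromosome:

# hypothetical repository path containing a {genome} placeholder
capture = _get('beds/{genome}/capture.bed', genome='hg19-chr20')
for interval in capture:
    print('%s:%d-%d' % (interval.chrom, interval.start, interval.end))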
Example #2
def annotate_peaks(notsif, beds, names):
    """Takes notsif, transforms to bed, and outputs annotation of where the 
    miRNA seed is interrogating via Cytoscape edge attribute file.
    """
    strand = find_strand_from_filename(notsif)

    mirna_bed = BedTool(notsif_to_bed(notsif, strand), from_string=True)

    # create the reference beds
    reference = {}
    for name, bed in izip(names, beds):
        reference[name] = BedTool(bed)

    for name in names:

        # intersect the mirna bed with the reference annotations
        for hit in mirna_bed.intersect(reference[name], s=True, stream=True):

            # name field returned from notsif_to_bed is delimited by "|"
            mirna_name = hit.name.split("|")[0]
            gene_name = hit.name.split("|")[1]
            # Cytoscape formatting
            seed_length = "(%s)" % hit.score
            fields = (mirna_name, seed_length, gene_name, "=", name)
            print " ".join(map(str, fields))
Example #3
def gene_regions(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    
    # first establish all the columns in the annotation file
    cols = set(f[4] for f in feats)

    results = {}

    intersection = v.intersect(feats, wb=True)

    if len(intersection) > 0:
        #sort_cmd1 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$9"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s 1<>%s' % (intersection.fn, intersection.fn)
        #call(sort_cmd1, shell=True)
        tempfile1 = tempfile.mktemp()
        sort_cmd2 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$9"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s > %s' % (intersection.fn, tempfile1)
        call(sort_cmd2, shell=True)
        intersection = BedTool(tempfile1)   
        annots = intersection.groupby(g=[1,2,3,4,5], c=6, ops='collapse')

        for entry in annots:
            regions = {}
            regions[entry[4]] = entry[5]

            results[entry.name] = Series(regions)

    df = DataFrame(results, index = cols)

    return df.T.fillna(0)
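A usage sketch for gene_regions (and the similarly shaped segmentations/repeats helpers further down), assuming the variant BED carries a name in column 4 and the annotation BED names the region type in column 5; file names are placeholders:

df = gene_regions('variants.bed', 'regions_annotation.bed')
print(df.head())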
Example #4
def xstream(a, b, distance, updown, out):
    """
    find all things in b that are within
    distance of a in the given direction
    (up or down-stream)
    """
    direction = dict(u="l", d="r")[updown[0]]
    kwargs = {'sw':True, direction: distance}

    if "l" in kwargs: kwargs["r"] = 0
    else: kwargs["l"] = 0
    a = BedTool(a).saveas()

    kwargs['stream'] = True
    c = a.window(b, **kwargs)
    afields = a.field_count()

    seen = collections.defaultdict(set)
    for feat in c:
        key = "\t".join(feat[:afields])
        # keep track of all the feature names that overlap this one
        seen[key].update((feat[afields + 3],))

    # the entries that did appear in the window
    for row in seen:
        out.write(row + "\t" + ",".join(sorted(seen[row])) + "\n")

    # write the entries that did not appear in the window'ed Bed
    for row in a:
        key = "\t".join(row[:afields])
        if key in seen: continue
        out.write(str(row) + "\t.\n")
    out.flush()
    assert len(BedTool(out.name)) == len(a)
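A minimal sketch of driving xstream on its own, assuming plain BED inputs where b.bed has a name column (paths are illustrative); out must be a named, writable file because the function re-reads out.name for its final length check:

with open('a.upstream5kb.bed', 'w') as out:
    xstream('a.bed', 'b.bed', distance=5000, updown='upstream', out=out)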
Example #5
def sequence_from_bedfile(fastafile, features=None, bedfile=None, pad5=0, pad3=0):
    """Fasta sequences from set of genomic features in a bed file
        Args:
            fastafile: fasta file with genomic sequence
            features: dataframe of features/coords with bed file col names
            bedfile: optionally provide a bed file instead
            pad5,pad3: flanking sequence at 5' or 3' ends
        Returns:
            a pandas dataframe with name, sequence and coord columns"""

    from pybedtools import BedTool
    if bedfile != None:
        features = utils.bed_to_dataframe(bedfile)
    new = []
    for n,r in features.iterrows():
        if r.strand == '+':
            coords = (r.chr,r.chromStart-pad5,r.chromEnd+pad3)
            seq = str(BedTool.seq(coords, fastafile))
        else: #reverse strand
            coords = (r.chr,r.chromStart-pad3,r.chromEnd+pad5)
            seq = str(BedTool.seq(coords, fastafile))
            seq = HTSeq.Sequence(seq).get_reverse_complement()
        #print n, coords, r['name']
        new.append([r['name'],str(seq),coords])
    new = pd.DataFrame(new, columns=['name','seq','coords'])
    return new
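A usage sketch, assuming a genome FASTA and a BED file with the column names utils.bed_to_dataframe produces (chr, chromStart, chromEnd, name, strand); the paths are placeholders:

seqs = sequence_from_bedfile('genome.fa', bedfile='peaks.bed', pad5=20, pad3=20)
print(seqs[['name', 'coords']].head())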
Example #6
File: reformat.py  Project: rrane/jcvi
def calculate_ovl(nbedfile, obedfile, opts, scoresfile):
    nbedtool = BedTool(nbedfile)
    obedtool = BedTool(obedfile)

    ab = nbedtool.intersect(obedtool, wao=True, f=opts.f, r=opts.r, s=opts.s)
    cmd = """cut -f4,5,10,13 | awk -F $'\t' 'BEGIN { OFS = FS } ($3 != "."){ print $1,$3,$2,$4; }'"""
    sh(cmd, infile=ab.fn, outfile=scoresfile)
Example #7
def segmentations(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    results = {}
    intersection = v.intersect(feats, wb=True)
    if len(intersection) > 0:
        sort_cmd1 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$8"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s 1<>%s' % (intersection.fn, intersection.fn)
        call(sort_cmd1, shell=True)
        annots = intersection.groupby(g=[1,2,3,4,5], c=6, ops='collapse')
        for entry in annots: 
            regions = {}
            regions[entry[4]] = entry[5]

            results[entry.name] = Series(regions)

    names = {
        'CTCF': 'CTCF_REG', 
        'E':    'ENH', 
        'PF':   'TSS_FLANK', 
        'R':    'REP', 
        'T':    'TRAN', 
        'TSS':  'TSS', 
        'WE':   'WEAK_ENH'
    }

    return DataFrame(results, index=names.keys()).T.rename(columns=names)   
Example #8
def generate_bed_file_annotations(bed_directory, output_directory, loci):
    """
        Generates the annotation file for every bed file in the bed_directory folder
    """
    
    # Loop over the bed files in the bed directory.
    bed_file_list = glob.glob(os.path.join(bed_directory, "*.bed"))
    logging.info("Start to generate BED file annotations")
    logging.info("Writing annotation to: {0}/".format(output_directory))
    for locus in loci:
        zscore = os.path.join(output_directory, locus) 
        bed_lines, rsids = _bed_from_zscore(zscore)
        tmp_bed = open("tmp.bed","w").writelines(bed_lines)
        snps = BedTool("tmp.bed")
        no_snps = _get_line_number(zscore)
        a_matrix= AnnotateLociMatrix(len(bed_file_list), no_snps)
        logging.info("Annotating locus: {0}, using VCF file {1}".format(locus, zscore))
        for beds in bed_file_list:
            test_annotation = BedTool(beds)
            inter = snps.intersect(test_annotation)
            idxs = []
            for inte in inter:
                idxs.append(rsids.index(inte.name))
            zeroes = np.zeros(len(rsids))
            for idx in idxs:
                zeroes[idx] = 1
            a_matrix.add_annotation(zeroes, beds)
        annotations_file = os.path.join(output_directory, locus + ".annotations")
        logging.info("Writing annotation matrix to: {0}".format(annotations_file))
        a_matrix.write_annotations(annotations_file)
        os.remove("tmp.bed")
Example #9
def calc_origin_bkgd_freqs(bedtool, strand, fasta_filename, verbose):

    # add strand to bedtool
    if strand == 'pos':
        strand_char = '+'
    elif strand == 'neg':
        strand_char = '-'

    intervals = []
    for row in bedtool:
        # input is BED6, output needs BED6
        row.strand = strand_char
        intervals.append(row)

    stranded_bedtool = BedTool(intervals)

    fastatool = stranded_bedtool.sequence(fi=fasta_filename, s=True)

    kwargs = {'region_size_min':1,
              'region_size_max':1,
              'ignore_chroms':[],
              'only_chroms':[],
              'verbose':verbose}

    if verbose:
        print >>sys.stderr, ">> calculating background freqs ..."

    result = calc_bkgd_counts(fastatool.seqfn, **kwargs)

    return result
Example #10
def gene_regions(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    
    # first establish all the columns in the annotation file
    cols = set(f[4] for f in feats)

    results = {}

    intersection = v.intersect(feats, wb=True)

    if len(intersection) > 0:
        annots = intersection.groupby(g=[1,2,3,4], c=9, ops='collapse')

        for entry in annots:
            regions = {}
            for region in entry[4].split(','):  
                if region in regions:
                    regions[region] += 1
                else:
                    regions[region] = 1

            results[entry.name] = Series(regions)

    df = DataFrame(results, index = cols)

    return df.T.fillna(0)
Example #11
def main():
    p = argparse.ArgumentParser(description=__doc__,
            formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('bed', help='bed with miRNA as name')
    p.add_argument('--reference-beds', dest='reference', nargs='+', 
        help='reference beds for each feature to annotate')
    p.add_argument('--names', nargs='+', 
        help='names corresponding to reference files')
    args = p.parse_args()
    if not args.names and not args.reference:
        sys.exit(p.print_help())
    
    bed = BedTool(args.bed)
    
    # create the reference beds
    reference = {}
    for refname, refbed in izip(args.names, args.reference):
        reference[refname] = BedTool(refbed)
    
    for refname in args.names:
    
        # intersect the mirna bed with the reference annotations
        for b in bed.intersect(reference[refname], s=True, stream=True):
            # Cytoscape formatting
            fields = (b.name, "=", refname)
            print " ".join(map(str, fields))
Example #12
File: driver.py  Project: Tay2510/PyCorn
def getNegativeDatasetFASTA(config):
	try:
		coordinates = BedTool(config['negativesBedFile'])
		genome = BedTool(config['maize_genome_filepath'])
		dataset = coordinates.sequence(fi=genome, fo=config['negative_dataset_output'])
	except ValueError:
		print 'getNegativeDatasetFASTA; File ', config['maize_genome_filepath'], ' not found'
Example #13
File: snps.py  Project: rmagoglia/ASEr
def filter_bed(bedfile, snp_list, outfile=sys.stdout):
    """Filter a bedfile to only include snps in snp_list, print to outfile.

    :bedfile:  A bed file of all the SNPs, can be gzipped.
    :snp_list: List/tuple/set/frozenset of snp names.
    :outfile:  Something .bed or .bed.gz, default STDOUT.
    :returns:  0 on success, 1 on failure

    """
    try:
        from pybedtools import BedTool
    except ImportError:
        logme.log('pybedtools is not installed.\n' +
                  'Please install and try again. You can get it from here:\n' +
                  'https://github.com/daler/pybedtools',
                  level='error')
        return -1

    if not isinstance(snp_list, (tuple, list, set, frozenset)):
        raise Exception('snp_list must be tuple/list/set/frozenset ' +
                        'it is: {}'.format(type(snp_list)))

    bed      = BedTool(bedfile)
    filtered = bed.filter(lambda a: a.name in snp_list)

    with open_zipped(outfile, 'w') as fout:
        fout.write(str(filtered))
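A minimal sketch of filter_bed, assuming a (possibly gzipped) BED of SNPs and a small set of SNP names to keep; the paths are illustrative:

keep = frozenset(['rs123', 'rs456'])
filter_bed('all_snps.bed.gz', keep, outfile='kept_snps.bed')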
Example #14
def get_coverage(bed_prefix, directory, file_prefix, bam):
    """
    Coverage at all positions is calculated. This is then used for coverage analysis and to determine read depth at any
    false negative sites
    :param bed_prefix: all regions in the bed files submitted are in a file generated during intersections
    :param directory: location of patient results
    :param file_prefix: prefix used for all files in pipeline i.e. worklist-patient
    :param bam: BAM file used to calculate per-base depth
    :return out: filename for coverage stats
    """
    #TODO change BAM path so filename is not required
    print 'Generating coverage stats.'
    whole_bed = '/results/Analysis/MiSeq/MasterBED/GIAB/' + bed_prefix + '.whole.bed'
    out = directory + '/giab_results/whole_bed_coverage.txt'
    command = '/results/Pipeline/program/sambamba/build/sambamba depth base --min-coverage=0 -q29 -m -L ' + whole_bed + \
              ' ' + bam + ' > ' + out + '.tmp'
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)
    print 'Sambamba complete.'
    #issue with sambamba that leaves out regions that have 0 coverage - intersect regions to find missing and add
    # them to the file at coverage 0
    temp_bed = out.replace('.txt', '.bed.tmp')
    command = 'awk \'{print($1"\\t"$2"\\t"$2+1"\\t"$3)}\' ' + out + '.tmp | grep -v "COV" > ' + temp_bed
    print command
    try:
        subprocess.check_call(command, shell=True)
        print 'BED coordinates extracted.'
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)


    coverage_bed = BedTool(temp_bed)
    print 'BED tool created'
    whole_bedtool = BedTool(whole_bed)
    print 'Intersecting'
    missing_regions = whole_bedtool.intersect(coverage_bed, v=True)
    missing_file = directory + '/giab_results/regions_missing'
    missing_regions.moveto(missing_file)
    print 'Generating file'
    sample_split = file_prefix.split('-')
    sample = sample_split[1] + '-' + sample_split[2]
    command = '''while read i; do start=`echo "$i"|cut -f2`; end=`echo "$i"|cut -f3`; chr=`echo "$i"|cut -f1`; end_true=`echo "${end} - 1" | bc`; for j in $(seq $start $end_true); do new_end=`echo -e "${j} + 1" | bc`; echo -e "$chr\\t${j}\\t0\\t0\\t0\\t0\\t0\\t0\\t0\\t''' + sample + '''";done;done < ''' + missing_file + '> ' + directory + '/to_add'
    print command
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)

    command = 'cat ' + out + '.tmp ' + directory + '/to_add > ' + out
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)
    print 'fix complete.'
    return out
Example #15
def main():
    p = argparse.ArgumentParser(description=__doc__,
            formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('peaks', help='peaks bed')
    p.add_argument('exons', help='refseq exons from UCSC')
    p.add_argument('gtf', help='refseq gtf with feature of interest')
    p.add_argument('feature', help='feature of interest in the gtf')
    p.add_argument('-v', '--verbose', action="store_true", help='maximum verbosity')
    args = p.parse_args()
    
    if args.verbose: sys.stderr.write(">> building exon library...\n")
    exon_lib = make_exon_lib(args.exons)
    
    peaks = BedTool(args.peaks)
    exons = BedTool(args.exons)
    full_ref = BedTool(args.gtf)
    
    if args.verbose: sys.stderr.write(">> filtering for feature...\n")
    filtered_ref = full_ref.filter(lambda gtf: gtf[2] == args.feature)
    
    if args.verbose: sys.stderr.write(">> selecting exonic peaks...\n")
    exonic_peaks = peaks.intersect(exons, wo=True)
    
    if args.verbose: sys.stderr.write(">> calculating distance fractions...\n")
    # D for distance (returns negative if upstream)
    for peak in exonic_peaks.closest(filtered_ref, D="a"):
        try:
            p = ComplexLine(peak)
            corrected_distance = 0.0
            total_exon_length = 0.0
            # parse gtf attrs
            gene_id = p.gtfattrs.split(';')[0].rstrip('"').lstrip('gene_id "')

            # looking downstream wrt peak
            if p.gtfdistance > 0:
                # exon with peak
                corrected_distance = p.exonstop - p.peakstop
                for exon in exon_lib[p.exoninfo.name]:
                    # add downstream exon lengths
                    if exon > p.exoninfo.number:
                        corrected_distance += exon_lib[p.exoninfo.name][exon]
                        
            # looking upstream wrt peak
            else:
                # exon with peak
                corrected_distance = p.peakstart - p.exonstart
                for exon in exon_lib[p.exoninfo.name]:
                    # add upstream exon lengths
                    if exon < p.exoninfo.number:
                        corrected_distance += exon_lib[p.exoninfo.name][exon]
            
            for exon in exon_lib[p.exoninfo.name]:
                total_exon_length += exon_lib[p.exoninfo.name][exon]
            
            # fraction
            print (corrected_distance / total_exon_length)
        
        except ValueError:
            continue
Example #16
File: driver.py  Project: Tay2510/PyCorn
def getPositiveDatasetFASTA(config):
	if (not os.path.isfile(config['positive_dataset_output'])):
		try:
			coordinates = BedTool(config['bed_file_post'])
			genome = BedTool(config['maize_genome_filepath'])
			dataset = coordinates.sequence(fi=genome, fo=config['positive_dataset_output'])
		except ValueError:
			print 'getPositiveDatasetFASTA; File ', config['maize_genome_filepath'], ' not found'
Example #17
def cpg_islands(vf, af):
    print "inside cpg_islands"
    v = BedTool(vf)
    cpg = BedTool(af)
    overlap = v.intersect(cpg, wb=True)
    results = dict([ (r.name, 1) for r in overlap ])
    print "exit cpg_islands"
    return Series(results, name="cpg_island")
Example #18
def motifs(vf, af):
    print "inside motif"
    v = BedTool(vf)
    cpg = BedTool(af)
    overlap = v.intersect(cpg, wb=True)
    results = dict([ (r.name, 1) for r in overlap ])
    print "exit motif"
    return Series(results, name="pwm")
Example #19
def feat_dist(vf, af, name):
    print "inside feat_dist"
    v = BedTool(vf)
    a = BedTool(af)
    closest = v.closest(a, D="b")
    results = dict([ (r.name, int(r[len(r.fields)-1])) for r in closest ])
    print "exiting feat_dist"
    return Series(results, name=name)
Example #20
def gc_content(vf, fa, flank=50):
    print "inside gc_content"
    v = BedTool(vf)
    flanks = v.slop(g=pybedtools.chromsizes('hg19'), b=flank)
    nc = flanks.nucleotide_content(fi=fa)
    results = dict([ (r.name, float(r[5])) for r in nc ])
    print "exiting gc_content"
    return Series(results, name="GC") 
Example #21
File: degenerate.py  Project: adamjorr/zypy
def getCDSs(bedfilename, reffilename, strand):
    """
    return iterator of coding sequences
    """
    bed = BedTool(bedfilename)
    bed = bed.filter(lambda x: x.strand == strand)
    fasta = reffilename
    bed = bed.sequence(fi=fasta, s=True)
    return SeqIO.parse(bed.seqfn, "fasta")
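A usage sketch, assuming a BED of CDS intervals and the matching reference FASTA (paths are placeholders); getCDSs yields Biopython SeqRecord objects for one strand at a time:

for rec in getCDSs('cds.bed', 'reference.fa', strand='+'):
    print('%s\t%d' % (rec.id, len(rec.seq)))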
Example #22
def build_vcf_intervals(reads, vcf_recs, bam_handle):
    """
    Find if any of these reads match a known SUN/indel by simple bedtools intersections
    """
    vcf_bed_recs = [ChromosomeInterval(x.CHROM, x.start, x.end, None) for x in vcf_recs]
    vcf_bed = BedTool(vcf_bed_recs)
    reads_bed_recs = [(bam_handle.getrname(x.tid), x.positions[0], x.positions[-1]) for x in reads if len(x.positions) > 2]
    reads_bed = BedTool(reads_bed_recs)
    return list(vcf_bed.intersect(reads_bed))
Example #23
def calc_signals(bam_filename, region_bed_filename, signal_colnum,
                 region_type, normalize, verbose):

    ''' generator to calculate signals from BED regions mapped onto positive and
    negative strand data.'''

    region_bedtool = BedTool(region_bed_filename)

    # bedtools.map operations
    operations = ('sum','count')

    signal_type = 'raw'
    if normalize:
        signal_type = 'norm'

    for signal_strand in STRANDS:

        signal_bedtool = load_coverage(bam_filename, strand=signal_strand,
                                       verbose=verbose)
        for oper in operations:

            map_bedtool = region_bedtool.map(signal_bedtool, o=oper,
                                             c=signal_colnum, null=0)

            for region_row, signal_row in izip(region_bedtool, map_bedtool):
 
                try:
                    region_name = region_row[3]
                    region_score = region_row[4]
                    region_strand = region_row[5]

                except IndexError:
                    region_name = '%s-%s-%d-%d' % (region_type,
                                                   region_row.chrom,
                                                   region_row.start,
                                                   region_row.end)
                    region_score = 0
                    # default
                    region_strand = 'none'

                if region_strand == '+':
                    region_strand = 'pos'
                elif region_strand == '-':
                    region_strand = 'neg'

                # last field is the calculated signal
                signal = float(signal_row[-1])

                if normalize and signal != 0:
                    region_size = float(region_row.end - region_row.start)
                    signal = signal / region_size

                result = (region_name, region_score, 'region-'+region_strand,
                          region_type, 'signal-'+signal_strand,
                          oper, signal, signal_type)

                yield result
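A sketch of consuming the calc_signals generator, assuming STRANDS and load_coverage are defined alongside it and that column 4 of the coverage BED holds the signal value; the arguments are illustrative:

for row in calc_signals('sample.bam', 'regions.bed', signal_colnum=4,
                        region_type='exon', normalize=True, verbose=False):
    print('\t'.join(map(str, row)))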
Example #24
def calc_intersection(bedtools, verbose):

    intersect_tool = BedTool()

    if verbose:
        print >>sys.stderr, ">> generating intersection ... "

    result = intersect_tool.multi_intersect(i=[bt.fn for bt in bedtools])

    return result
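A minimal sketch for calc_intersection, assuming a few replicate BED files (names are placeholders):

bedtools_list = [BedTool('rep1.bed'), BedTool('rep2.bed'), BedTool('rep3.bed')]
combined = calc_intersection(bedtools_list, verbose=False)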
Example #25
    def check_bed(self, bed_file, stream):
        bed = BedTool(bed_file, from_string=stream)
        try:
            sorted_bed = bed.sort()
            merged_bed = sorted_bed.merge(c="4", o="distinct")

            return merged_bed

        except Exception as exception:
            print ("ERROR: " + str(exception))
Example #26
def folding_analysis(bedfilename, fastafilename, verbose):

    bedtool = BedTool(bedfilename)
    for region in bedtool:

        # pull this region's sequence directly from the fasta
        region_seq = str(BedTool.seq((region.chrom, region.start, region.stop), fastafilename))

        struct, mfe = RNA.fold(region_seq) 

        for pos, nuc  in enumerate(region_seq):
            struct_char = struct[pos]
Example #27
def filterReadsByLength(inbam, minlength, maxlength):
    '''
    Takes a bam file and selects intervals that are within the defined lengths.
    Input: bam file and min/max lengths
    Output: bedTool
    '''
    # convert bam to bed
    intervals = BedTool(inbam).bam_to_bed()
    filt = intervals.filter(lambda x: len(x) > minlength and len(x) < maxlength).saveas()
    # print filt
    return filt
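A minimal sketch, assuming an indexed BAM (the path is illustrative); the returned BedTool is already materialised with saveas, so it can simply be written to a final location:

kept = filterReadsByLength('sample.bam', minlength=100, maxlength=300)
kept.saveas('reads_100_300bp.bed')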
Example #28
    def transcripts_list_to_bed6(self, file_name=None, save_in_file=False ):
        bed6_trans = [trans_to_b6.get_bed6() for trans_to_b6 in  self.transcripts_list()]
        # some filtering system should be applied here
        bed6_trans = BedTool('\n'.join(bed6_trans), from_string=True).sort()

        if not save_in_file:
            return bed6_trans
        else:
            if file_name:
                return bed6_trans.saveas(fn=file_name, trackline="track name='Transcripts {}' color=128,0,0".format(file_name.split('/')[-1]))
            else:
                raise IOError('\nthe file_name argument needs a file name or a complete file path.\n')
Example #29
def repeats(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    intersection = v.intersect(feats, wb=True)
    results = {}
    if len(intersection) > 0:
        annots = intersection.groupby(g=[1,2,3,4], c=8, ops='collapse')
        for entry in annots:
            types = entry[4].split(',') 
            results[entry.name] = len(types)

    return Series(results, name='repeat')
Example #30
def main():
    p = optparse.OptionParser(__doc__)
    p.add_option("-a", dest="a", help="file to annotate. first 3 columns are "
                                      "chrom start stop")
    p.add_option("-b", dest="b", help="superbed to annotate with")

    p.add_option("--header", dest="header", help="a file has a header",
                    action="store_true", default=False)
    p.add_option("-N","--no-near", dest="no_near",
            help="dont find the nearest gene, just the up/downstream",
                    action="store_true", default=False)

    p.add_option("--upstream", dest="upstream", type=int, default=None,
                   help="distance upstream of [a] to look for [b]")
    p.add_option("--downstream", dest="downstream", type=int, default=None,
                   help="distance downstream of [a] to look for [b]")
    p.add_option("--transcripts", dest="transcripts", action="store_true",
            default=False, help="use transcript names in output as well as"
            " gene name. default is just gene name")

    opts, args = p.parse_args()
    if (opts.a is None or opts.b is None):
        sys.exit(not p.print_help())

    b = opts.b
    if not opts.transcripts:
        b = remove_transcripts(b)

    if not (opts.upstream or opts.downstream):
        superanno(opts.a, b, opts.header, opts.no_near, sys.stdout)

    else:
        out = open(BedTool._tmp(), "w")
        superanno(opts.a, b, opts.header, opts.no_near, out)
        out.close()

        new_header = []
        out_fh = open(out.name)
        new_header = [out_fh.readline().rstrip("\r\n")] if opts.header else []
        for xdir in ("upstream", "downstream"):
            dist = getattr(opts, xdir)
            if dist is None: continue
            new_out = open(BedTool._tmp(), "w")
            xstream(out_fh, b, dist, xdir, new_out)
            new_header.append("%s_%i" % (xdir, dist))
            new_out.close()
            out_fh = open(new_out.name)

        if opts.header:
            print "\t".join(new_header)
        for line in open(out_fh.name):
            sys.stdout.write(line)
Example #31
def getFlank():
    """
        Change the positions in the tabbed file in order to
        include the flank region. For a vcf file, we need to
        generate a new temporary tabbed file which contains
        the ID, start position and stop position. If the
        argument --notempfile is passed, it creates the file
        [tabbed_file_name]_out.[ext] in the directory
        ./result.
    """
    lenChr = parseFa()
    if (args.typeA != None and ext == "gff3") or change:
        fileTab2 = BedTool(tabO)
    else:
        fileTab2 = BedTool(args.tabinput)
    if args.verbose != 0 and args.notempf:
        print("\n ----- Creating file '" + tabOP + "'. ----- ")
    if (fileTab2.file_type == "bed"):
        with open(lenChr.name, "r") as lenC:
            res = ""
            for feature in fileTab2:
                fjoin = ("__".join(str(feature).split("\t"))).replace(
                    " ", "\\s")
                for line in lenC:
                    lenghtC = re.search(feature.chrom + "\t(\d+)", line)
                    if lenghtC:
                        break
                lenC.seek(0)
                if feature.stop + args.flank > int(lenghtC.group(
                        1)):  # TODO : unreadable, gotta change that
                    stop = int(lenghtC.group(1))
                else:
                    stop = feature.stop + args.flank
                if feature.start - args.flank < 0:
                    start = 0
                else:
                    start = feature.start - args.flank
                res += feature.chrom + "\t" + str(start) + "\t" + str(
                    stop) + "\t" + fjoin + "\n"
        BedTool(res, from_string=True, deli="\t").saveas(tabOP)
        #fileTab2.slop(b=args.flank, g=lenChr.name ,output=tabOP, header=True)
    if (fileTab2.file_type == "gff"):
        with open(lenChr.name, "r") as lenC:
            res = ""
            countCds = 0
            for feature in fileTab2:
                taline = str(feature).split("\t")
                if taline[2] == "CDS":
                    countCds += 1
                    taline[-1] = taline[-1][0:-1] + ";Note=" + str(countCds)
                else:
                    countCds = 0
                fjoin = ("__".join(taline)).replace(" ", "\\s")
                for line in lenC:
                    lenghtC = re.search(feature.chrom + "\t(\d+)", line)
                    if lenghtC:
                        break
                lenC.seek(0)
                if feature.stop + args.flank > int(lenghtC.group(
                        1)):  # TODO : unreadable, gotta change that
                    stop = int(lenghtC.group(1))
                else:
                    stop = feature.stop + args.flank
                if feature.start - args.flank < 0:
                    start = 0
                else:
                    start = feature.start + 1 - args.flank
                res += feature.chrom + "\t" + str(start) + "\t" + str(
                    stop) + "\t" + fjoin + "\n"
        BedTool(res, from_string=True, deli="\t").saveas(tabOP)
        #fileTab2.slop(b=args.flank, g=lenChr.name ,output=tabOP, header=True)
    elif (fileTab2.file_type == "vcf"):
        with open(lenChr.name, "r") as lenC:
            res = ""
            for feature in fileTab2:
                fjoin = ("__".join(str(feature).split("\t"))).replace(
                    " ", "\\s")
                for line in lenC:
                    lenghtC = re.search(feature.chrom + "\t(\d+)", line)
                    if lenghtC:
                        break
                lenC.seek(0)
                if feature.stop + args.flank - 1 + (len(feature[3]) - 1) > int(
                        lenghtC.group(
                            1)):  # TODO : unreadable, gotta change that
                    stop = int(lenghtC.group(1))
                else:
                    stop = feature.stop + args.flank - 1 + (len(feature[3]) -
                                                            1)
                if feature.start - args.flank - 1 < 0:
                    start = 0
                else:
                    start = feature.start - args.flank - 1
                res += feature.chrom + "\t" + str(start) + "\t" + str(
                    stop) + "\t" + fjoin + "\n"
        BedTool(res, from_string=True, deli="\t").saveas(tabOP)
    return (tabOP)
Example #32
def make_features_multiTask(positive_windows, y_positive, nonnegative_regions_bed, 
                            bigwig_files, bigwig_names, genome, epochs, valid_chroms, test_chroms):
    chroms, chroms_sizes, genome_bed = get_genome_bed()
    train_chroms = chroms
    for chrom in valid_chroms + test_chroms:
        train_chroms.remove(chrom)
    genome_bed_train, genome_bed_valid, genome_bed_test = \
        [subset_chroms(chroms_set, genome_bed) for chroms_set in
         (train_chroms, valid_chroms, test_chroms)]

    positive_windows_train = []
    positive_windows_valid = []
    positive_windows_test = []
    positive_data_train = []
    positive_data_valid = []
    positive_data_test = []
    
    import pdb
    print('Splitting positive windows into training, validation, and testing sets')
    for positive_window, target_array in itertools.izip(positive_windows, y_positive):
        if len(positive_window.chrom) > 8:
            pdb.set_trace()
        chrom = positive_window.chrom
        start = int(positive_window.start)
        stop = int(positive_window.stop)
        if chrom in test_chroms:
            positive_windows_test.append(positive_window)
            positive_data_test.append((chrom, start, stop, shift_size, bigwig_files, [], target_array))
        elif chrom in valid_chroms:
            positive_windows_valid.append(positive_window)
            positive_data_valid.append((chrom, start, stop, shift_size, bigwig_files, [], target_array))
        else:
            positive_windows_train.append(positive_window)
            positive_data_train.append((chrom, start, stop, shift_size, bigwig_files, [], target_array))
    
    positive_windows_train = BedTool(positive_windows_train)
    positive_windows_valid = BedTool(positive_windows_valid)
    positive_windows_test = BedTool(positive_windows_test)

    import pdb
    print('Getting negative training examples')
    negative_windows_train = BedTool.cat(*(epochs*[positive_windows]), postmerge=False)
    #negative_windows_train = BedTool.cat(*(10*[positive_windows]), postmerge=False)
    #pdb.set_trace()
    negative_windows_train = negative_windows_train.shuffle(g=genome_sizes_file,
                                                            incl=genome_bed_train.fn,
                                                            excl=nonnegative_regions_bed.fn,
                                                            noOverlapping=False,
                                                            seed=np.random.randint(-214783648, 2147483647))
                                                            #seed=np.random.randint(-21478364, 21474836))
    print('Getting negative validation examples')
    negative_windows_valid = positive_windows_valid.shuffle(g=genome_sizes_file,
                                                            incl=genome_bed_valid.fn,
                                                            excl=nonnegative_regions_bed.fn,
                                                            noOverlapping=False,
                                                            seed=np.random.randint(-214783648, 2147483647))
                                                            #seed=np.random.randint(-21478364, 21474836))
    print('Getting negative testing examples')
    negative_windows_test = positive_windows_test.shuffle(g=genome_sizes_file,
                                                            incl=genome_bed_test.fn,
                                                            excl=nonnegative_regions_bed.fn,
                                                            noOverlapping=False,
                                                            seed=np.random.randint(-214783648, 2147483647))
                                                            #seed=np.random.randint(-21478364, 21474836))

    # Train
    print('Extracting data from negative training BEDs')
    negative_targets = np.zeros(y_positive.shape[1])
    negative_data_train = [(window.chrom, window.start, window.stop, shift_size, bigwig_files, [], negative_targets)
                           for window in negative_windows_train]

    # Validation
    print('Extracting data from negative validation BEDs')
    negative_data_valid = [(window.chrom, window.start, window.stop, shift_size, bigwig_files, [], negative_targets)
                           for window in negative_windows_valid]
    
    # Test
    print('Extracting data from negative testing BEDs')
    negative_data_test = [(window.chrom, window.start, window.stop, shift_size, bigwig_files, [], negative_targets)
                           for window in negative_windows_test]

    num_positive_train_windows = len(positive_data_train)
    
    data_valid = negative_data_valid + positive_data_valid
    data_test = negative_data_test + positive_data_test

    print('Shuffling training data')
    data_train = []
    for i in xrange(epochs):
        epoch_data = []
        epoch_data.extend(positive_data_train)
        epoch_data.extend(negative_data_train[i*num_positive_train_windows:(i+1)*num_positive_train_windows])
        np.random.shuffle(epoch_data)
        data_train.extend(epoch_data)

    print('Generating data iterators')
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    datagen_train = DataIterator(data_train, genome, batch_size, L, bigwig_rc_order)
    datagen_valid = DataIterator(data_valid, genome, batch_size, L, bigwig_rc_order)
    datagen_test = DataIterator(data_test, genome, batch_size, L, bigwig_rc_order)

    print(len(datagen_train), 'training samples')
    print(len(datagen_valid), 'validation samples')
    print(len(datagen_test), 'test samples')
    return datagen_train, datagen_valid, datagen_test, data_valid,data_test
Example #33
def overlap(bed_file, gtf_file, result_file):
    variants_bed = BedTool(bed_file)
    gtf = BedTool(gtf_file)

    variants_bed.intersect(gtf, wb=True).moveto(result_file)
Example #34
def radloci(bedgz):
    '''
    Use pybedtools to get estimated
    callable RAD loci positions. Return
    number of loci.
    '''
    bedcov = BedTool(bedgz)
    filtcov = []
    locidic = {}
    chrsize = {}
    chrset = set()
    for line in bedcov:
        mychr = line[0]
        chrset.add(mychr)
        # Locus must have minimum coverage of 4
        if int(line[3]) > 3:
            bedrow = (line[0], int(line[1]), int(line[2]))
            filtcov.append(bedrow)
        endpoint = int(line[2])
        if mychr in chrsize:
            endpoints = chrsize[mychr]
            endpoints.append(endpoint)
            chrsize[mychr] = endpoints
        else:
            chrsize[mychr] = [endpoint]
    # Replace list of endpoints with max chr position
    for key, value in chrsize.items():
        maxpos = max(value)
        chrsize[key] = maxpos
    # The chrsize dict now contains chr lengths

    filtbed = BedTool(filtcov)
    # Merge regions to get loci
    # Distance of 100 should merge
    # properly paired reads with insert size < 300
    loci = filtbed.merge(d=100)
    # Counter list for all loci
    locicnt = 0
    # Counter list for chr loci
    chrcnts = []
    # Dic to hold chr loci stats
    for l in loci:
        locicnt = locicnt + 1
        scaff = l[0]
        start = int(l[1])
        stop = int(l[2])
        locmid = int((start + stop) / 2)
        if scaff in locidic:
            scaffcnt = locidic[scaff][0] + 1
            locpos = locidic[scaff][1]
            locpos.append(locmid)
            locidic[scaff] = [scaffcnt, locpos]
        else:
            locidic[scaff] = [1, [locmid]]

    for key, value in locidic.items():
        # Append loci per chr to list
        # Ignore unplaced and scaff
        if 'npla' not in key and 'scaff' not in key:
            chrcnts.append(value[0])
        # Get chr len from dict
        chrlen = chrsize[key]
        # Calculate max bin size with chr len
        toprange = math.ceil(chrlen / 100000.0) * 100000.0
        # Number of 100kb bins
        binnum = int(toprange / 100000.0)
        # Calculate histogram
        hist, bins = np.histogram(value[1], bins=binnum, range=(0, toprange))
        hist = np.ndarray.tolist(hist)
        mhist = round(statistics.mean(hist), 4)
        sdhist = round(statistics.stdev(hist), 4)
        lchrdic = {}
        lchrdic['Loci'] = value[0]
        lchrdic['Mean loci per 100kb'] = mhist
        lchrdic['StDev loci per 100kb'] = sdhist
        locidic[key] = lchrdic
    try:
        lchrdic = {}
        lchrdic['Loci'] = locicnt
        lchrdic['Mean loci per chr'] = round(statistics.mean(chrcnts), 4)
        lchrdic['StDev loci per chr'] = round(statistics.stdev(chrcnts), 4)
        locidic['total'] = lchrdic
    except:
        lchrdic['Loci'] = 'NA'
        lchrdic['Mean loci per chr'] = 'NA'
        lchrdic['StDev loci per chr'] = 'NA'
        locidic['total'] = lchrdic

    for chrom in chrset:
        if chrom not in locidic:
            emptydic = {}
            emptydic['Loci'] = 0
            emptydic['Mean loci per 100kb'] = 0
            emptydic['StDev loci per 100kb'] = 0
            locidic[chrom] = emptydic
    return locidic
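A usage sketch, assuming a bgzipped per-base coverage BED such as the output of "bedtools genomecov -bga" (the path is a placeholder):

loci_stats = radloci('sample.coverage.bed.gz')
print(loci_stats['total'])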
Example #35
    if len(lstGenes) == 0:
        exit("[Nk_makeBED] Any gene found in genes list file `" + pathGenes +
             "`")
    # Search genes id
    # copy so that removing found genes below does not mutate lstGenes
    lstMissingGene = list(lstGenes)
    for gene in gffutilsDB.features_of_type("gene"):
        if gene.attributes["Name"][0] in lstGenes:
            setGene.add(gene)
            lstMissingGene.remove(gene.attributes["Name"][0])
    # Error if any gene not found
    if len(lstMissingGene) > 0:
        exit("[Nk_makeBED] Unable to find following gene(s): `" +
             ",".join(lstMissingGene) + "`")
#3rd case: bed & no genes => search intersected gene identifiers
else:
    bed = BedTool(pathBed)
    genes = BedTool(pathGff)
    # Search gff exons intersection
    for intersect_elem in genes + bed:
        if intersect_elem.fields[2] == "exon":
            exon = gffutilsDB[intersect_elem.attrs["ID"]]
            # retrieve the corresponding transcript
            for rna in gffutilsDB.parents(exon, order_by='start'):
                for gene in gffutilsDB.parents(rna,
                                               featuretype='gene',
                                               order_by='start'):
                    setGene.add(gene)
    # delete created temp file
    cleanup(remove_all=True)

#***** CONSTRUCT ANNOTATED REGIONS BED FILE *****#
Example #36
    for s in range(0, len(genesnps), 1):
        if genesnps[s] in snptable:
            tempsnp = genesnps[s]
            ac_sum = ac_sum + int(snptable[tempsnp][1])
            hom_sum = hom_sum + int(snptable[tempsnp][2])
    return [ac_sum, hom_sum]


#Make list of all SNPs across all genes present in snpfile
allsnplist = makesnplist(options.snpfilename)

#Make a hashtable with keys as each SNP, and stores a list of indices of carriers for that SNP
count_table = {}

#Open vcf file
vcffile = BedTool(options.vcffilename)
if options.bedfilename is not None:
    bed = BedTool(options.bedfilename)
    vcffile_temp = vcffile.intersect(bed)
else:
    if chrformat == "chr":
        dummy_bed = BedTool('chr1000 100000000 100000001', from_string=True)
    else:
        dummy_bed = BedTool('1000 100000000 100000001', from_string=True)
    vcffile_temp = vcffile.subtract(dummy_bed)

for line_vcf1 in open(vcffile_temp.fn):
    line_vcf = line_vcf1.rstrip().split('\t')
    if line_vcf[0][0] != "#" and ("," not in line_vcf[4]):
        if not (options.passfilter and line_vcf[6] != "PASS"):
            if options.snpformat == "VCFID":
Example #37
def overlap_pe(variants_bedpe_file, gtf_file, result_file):
    variants_bed = BedTool(variants_bedpe_file)
    gtf = BedTool(gtf_file)
    olaps = variants_bed.pair_to_bed(gtf, stream=True).moveto(result_file)
Example #38
def get_muts_tracks_info(muts_input_file,
                         tracks_dir,
                         muts_dir_out,
                         split_muts_file_by_chr=True):

    muts_tracks_files = []

    tracks_files = [x for x in os.listdir(tracks_dir) if x.endswith('.bed')]
    if split_muts_file_by_chr:
        muts_files = []
        chr_ext = "." + tracks_files[0].split('.')[-1]
        if os.path.exists(muts_dir_out):
            muts_tracks_files = [
                muts_dir_out + '/' + x for x in os.listdir(muts_dir_out)
                if x.endswith('_overlapping_tracks.bed10')
            ]
            if len(muts_tracks_files) > 0:
                return muts_tracks_files
        else:
            os.mkdir(muts_dir_out)
            muts_files = [
                muts_dir_out + '/' + x for x in os.listdir(muts_dir_out)
                if x.endswith(chr_ext)
            ]
            if len(muts_files) <= 0:
                os.system(
                    """awk '{{print $0 >> "{muts_dir}/"$1"{chr_ext}"}}' {muts_file} """
                    .format(muts_dir=muts_dir_out,
                            chr_ext=chr_ext,
                            muts_file=muts_input_file))
                muts_files = [
                    muts_dir_out + '/' + x for x in os.listdir(muts_dir_out)
                    if x.endswith(chr_ext)
                ]

        print('muts_files:  ', muts_files)
        print('tracks_files: ', tracks_files)
        for muts_file in muts_files:
            if muts_file.split('/')[-1] in tracks_files:
                muts_tracks_file = muts_file + "_overlapping_tracks.bed10"
                if not os.path.exists(muts_tracks_file):
                    print("Intersecting and Grouping: ", muts_tracks_file)
                    BedTool(muts_file).intersect(
                        BedTool(tracks_dir + '/' + tracks_files[
                            tracks_files.index(muts_file.split('/')[-1])]),
                        wo=True,
                        loj=True).groupby(g=[1, 2, 3, 4, 5, 6, 7, 8, 9],
                                          c=13,
                                          o=['collapse'
                                             ]).saveas(muts_tracks_file)
                muts_tracks_files.append(muts_tracks_file)
    else:
        for tracks_file in tracks_files:
            if not os.path.exists(muts_dir_out):
                os.mkdir(muts_dir_out)
            muts_tracks_file = muts_dir_out + '/' + tracks_file + "_overlapping_tracks.bed10"
            if not os.path.exists(muts_tracks_file):
                print("Intersecting and Grouping: ", muts_tracks_file)
                BedTool(muts_input_file).intersect(
                    BedTool(tracks_dir + '/' + tracks_file), wo=True,
                    loj=True).groupby(g=[1, 2, 3, 4, 5, 6, 7, 8, 9],
                                      c=13,
                                      o=['collapse']).saveas(muts_tracks_file)
            muts_tracks_files.append(muts_tracks_file)
    print('muts_tracks_files: ', muts_tracks_files)

    return muts_tracks_files
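A sketch of calling get_muts_tracks_info, assuming a 9-column mutations file and a directory of per-chromosome track BEDs named like the split mutation files (paths are placeholders):

overlap_files = get_muts_tracks_info('mutations.bed9',
                                     tracks_dir='tracks',
                                     muts_dir_out='muts_by_chr')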
Example #39
def get_data(df):

    Popen('mkdir -p ./' + args.outfile + ".datamatrix/temp/", shell=True)

    bedtool = BedTool.from_dataframe(df).sort().saveas(
        args.outfile + '.datamatrix/bedtool_df.bed')
    a = nuc_cont(bedtool)

    if args.var_files:
        var_files_list = list(args.var_files)
        for i in range(len(var_files_list)):
            var = get_var_counts(bedtool, var_files_list[i])
            a = a.merge(var, on='name')
    elif not args.var_files:
        pass

    if args.bw_files:
        bw_files_list = list(args.bw_files)
        for i in range(len(bw_files_list)):
            bw = get_bigwig_scores(bw_files_list[i], df)
            a = a.merge(bw, on='name')
    elif not args.con_files:
        pass

    if args.kmer_list:
        print()
        print("Starting K-mer counting")
        get_kmer_counts(args.kmer_list)
    elif not args.kmer_list:
        pass

    if args.rnafold == True:
        print()
        print("Starting RNAfold for MFE scoring")
        get_MFE_scores()
    elif args.rnafold == False:
        pass

    if args.qgrs_mapper == True:
        print()
        print("Starting QGRS Mapper for G-Quadruplex scoring")
        get_QGRS_scores()
    elif args.qgrs_mapper == False:
        pass

    if str(args.nuc_info) == 'full':
        z = a.drop(
            ['length', 'seqname', 'start', 'end', 'score', 'strand', 'seq'], 1)
    else:
        z = a.drop('seq', 1)

    if (args.rnafold == True) or (args.qgrs_mapper == True) or (args.kmer_list
                                                                is not False):
        z.to_csv(args.outfile + ".datamatrix/temp/data_generic_results.csv",
                 index=False)

        temp_files = glob.glob(args.outfile + ".datamatrix/temp/*.csv")

        z_list = []

        for i in range(len(temp_files)):
            df = pd.read_csv(temp_files[i], index_col=0)
            if ((args.nuc_info == 'full') &
                ("data_generic_results.csv" in temp_files[i])):
                df = df.set_index('name')
            else:
                pass
            z_list.append(df)

        #with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
        #    z = executor.submit(pd.concat, z_list, axis=1, join='outer').result()

        z = pd.concat(z_list, axis=1, join='outer', sort=False)
        z = z.reset_index().rename(columns={'index': 'name'}).fillna(0)
    else:
        pass

    z = filter_columns(z)

    return z.drop_duplicates()
Example #40
        sys.exit(1)
    if args.version:
        getVersion()
        sys.exit(1)
    if args.warn:
        warnings.filterwarnings("ignore")

    if args.fasta1 == None:
        sys.exit("ERROR : Argument --fasta1 (-f1) is missing.")
    if args.fasta2 == None:
        sys.exit("ERROR : Argument --fasta2 (-f2) is missing.")
    if args.tabinput == None:
        sys.exit("ERROR : Argument --tabinput (-ti) is missing.")

    global ext
    ext = BedTool(args.tabinput).file_type
    if ext == "gff":
        ext = "gff3"
    if args.directory[-1] == "/":
        args.directory = args.directory[:-1]
    try:
        os.mkdir(args.directory)
        if args.verbose != 0:
            print("\n ----- Creating directory '" + args.directory +
                  "/'. -----")
    except:
        pass

    if args.mismatch == None:
        if args.percentage:
            warnings.warn(
Example #41
def main():
    """
    Start the TeddyPi pipeline: load the main configuration file, collect input files, and parse the TE/SV caller-specific configuration.

    This module returns filtered and integrated datasets for tpi_ortho.py.
    """
    options = parse_args(argv[1:])
    modulename = "TeddyPi"
    print u"TeddyPi - Transposable Element detection and discovery for Phylogenetic Inference"
    print u"---------------------------------------------------------------------------------\n"

    print u"[ {} ] Initialize configuration from {}...".format(modulename, options.config),

    # Load main configuration
    with open(options.config) as fin:
        config = yaml.load(fin)
        programs = config['programs']
    print u"done."

    tpi_helpers.create_out_path(config['out_dir'])  # Create output directory

    transposons = config['refte']  # Load reference TE file

    # 1. Filter operations for each program and species

    filtered_files = defaultdict(dict)
    for samplename in config['samples']:
        print u"[ {} ] Loading data for sample {}; ".format(modulename, samplename)
        print u"[ {} ] Config has info on these TE/SV callers: {}".format(modulename,
                                                                          ",".join([elem['name'] for elem in programs]))

        per_sample_files = (fname for fname in os.listdir(config['data_dir']) if
                            fname.startswith(samplename) and fname.endswith(
                                ".vcf"))  # TODO avoid reloading processed files

        for sample_file in per_sample_files:
            # print "%s, " % sample_file
            per_sample_vcf = tpi_filter.LoadVCF(data_dir=config['data_dir'],
                                                out_dir=config['out_dir'],
                                                fname=sample_file,
                                                sname=samplename)
            simple_source = per_sample_vcf.vcf_source.split(" ")[0].lower()

            if config['programs'] == "auto" or simple_source in [elem['name'] for elem in programs]:
                per_sample_vcf.skip = False  # flag to skip filtering
                per_sample_vcf.filter_variants()

                print u"[ {} ] Filtered variants written to: {}\n".format(modulename, per_sample_vcf.out_fname)
                filtered_files[samplename][simple_source] = per_sample_vcf.out_fname
            else:
                print u"[ {} ] Error: Auto-detection of TE/SV callers disabled and VCF-source {} not mentioned in " \
                      u"config.\nskipping...".format(modulename, simple_source)

    # 2. Integrate SV-deletions and convert to Ref+ TE calls
    # tpi_svintegration.py

    if 'call_operations' in config.keys():
        print u"[ {} ] Call operations found in configfile".format(modulename)
        for op, sources in config['call_operations'].iteritems():
            try:
                assert (set([elem['name'] for elem in programs]) >= (set(sources)))
            except AssertionError:
                print u"VCF sources for operations have not been parsed."
                print u"[ {} ] For operation {}, sources {} were not parsed. Check \' programs \' parameter in {}" \
                    .format(modulename, op, ",".join(sources), options.config)

                continue

            if op == "non_redundant":
                print u"[ {} ] Starting operation {} on sources {} over all samples,".format(modulename, op,
                                                                                             ",".join(sources))
                for sample in filtered_files.keys():
                    print u"[ %s ] %s " % (op, sample)
                    sets = (BedTool(os.path.join(config['out_dir'], filtered_files[sample][src])) for src in sources)

                    nr = nonredundant_2_sets(sets)
                    nr_set_outfile = "{s}.{t}.nr.bed".format(s=sample, t="DEL")
                    nr_set_outfile = os.path.join(config['out_dir'], nr_set_outfile)
                    nr.saveas(nr_set_outfile)
                    print u"[ {} ] non_redundant set saved to {}".format(op, nr_set_outfile)

                    te_isect_outfile = "{s}.{t}.bed".format(s=sample, t="TE")
                    sv_set = nr.window(transposons, w=config['ortho_merge_distance']).saveas(
                        os.path.join(config['out_dir'], te_isect_outfile))
                    print u"[ {} ] TE intersected set saved to {}".format(op, os.path.join(config['out_dir'],
                                                                                           te_isect_outfile))

                    te_cls_outfile = "{s}.{t}.cls.bed".format(s=sample, t="TE")
                    sv_set = BedTool(cluster_calls(sv_set)).saveas(os.path.join(config['out_dir'], te_cls_outfile))
                    print u"[ {} ] clustered set saved to {}".format(op,
                                                                     os.path.join(config['out_dir'], te_cls_outfile))

            elif op == "intersection":
                print u"[ {} ] Starting operation {} on sources {} over all samples,".format(modulename, op,
                                                                                             ",".join(sources))
                for sample in filtered_files.keys():
                    print u"[ %s ] %s " % (op, sample)
                    sets = (BedTool(os.path.join(config['out_dir'], filtered_files[sample][src])) for src in sources)

                    isect = sets.next().window(sets.next(), w=100, u=True).sort()
                    isect_set_outfile = "{s}.{t}.is.vcf".format(s=sample, t="NONREF_ISEC")
                    isect.saveas(os.path.join(config['out_dir'], isect_set_outfile))

                    print u"[ {} ] intersected set saved to {}".format(op, os.path.join(config['out_dir'],
                                                                                        isect_set_outfile))
            elif op == "te_intersect":
                print u"[ {} ] Starting operation {} on sources {} over all samples,".format(modulename, op,
                                                                                             ",".join(sources))
                for sample in filtered_files.keys():
                    print u"[ %s ] %s " % (op, sample)
                    assert len(sources) == 1
                    src = sources[0]
                    bt_set = tpi_helpers.make_BED_fromVCF(os.path.join(config['out_dir'], filtered_files[sample][src]))

                    te_isect_outfile = "{s}.{t}.bed".format(s=sample, t="TE")
                    sv_set = bt_set.window(transposons, w=50).saveas(
                        os.path.join(config['out_dir'], te_isect_outfile))
                    print u"[ {} ] TE intersected set saved to {}".format(op, os.path.join(config['out_dir'],
                                                                                           te_isect_outfile))

                    te_cls_outfile = "{s}.{t}.cls.bed".format(s=sample, t="TE")
                    sv_set = BedTool(cluster_calls(sv_set)).saveas(os.path.join(config['out_dir'], te_cls_outfile))
                    print u"[ {} ] clustered set saved to {}".format(op,
                                                                     os.path.join(config['out_dir'], te_cls_outfile))
            else:
                print u"[ {} ] Operation '{}' not known. Nothing will be done. Check the configuration file.".format(
                    modulename, op)

    return 1
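# --- illustrative sketch, not part of the pipeline above ---
# Minimal example of the window-then-cluster pattern used in the branches
# above: report SV calls within w bp of an annotated transposon, then group
# nearby hits. File names are hypothetical, and bedtools' own cluster tool is
# used here in place of the project-specific cluster_calls() helper.
from pybedtools import BedTool

svs = BedTool("sample.DEL.nr.bed")
transposons = BedTool("transposons.bed")

near_te = svs.window(transposons, w=50).saveas("sample.TE.bed")
clustered = near_te.sort().cluster().saveas("sample.TE.cls.bed")
print(len(clustered))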
예제 #42
0
def main():

    args = check_options(get_options())

    # jellyfish parameters
    jfsize = '100M'

    # build the bwa index if it is not already present
    bwaindexfile = os.path.basename(args.genome)

    tmpfolder = args.tmp

    bwatestindex = os.path.join(tmpfolder, bwaindexfile + '.sa')

    bwaindex = os.path.join(tmpfolder, bwaindexfile)

    bwabuild = True

    if os.path.isfile(bwatestindex):

        bwabuild = False

    if bwabuild:

        # build bwa index
        bwa.bwaindex(args.bwa, args.genome, tmpfolder)

        print("bwa index build finished ...")

    else:

        print("Use", bwatestindex)

    sampleinfor = dict()

    names = args.names.split(',')

    reads1 = args.reads1.split(',')

    reads2 = args.reads2.split(',')

    cnsfile = os.path.join(args.saved, '_'.join(names) + '_cns_probe.csv')

    print(cnsfile)

    cnsio = open(cnsfile, 'w')

    for i in range(len(names)):

        name = names[i]

        read1 = reads1[i]

        read2 = reads2[i]

        bamfile = os.path.join(tmpfolder, name + '.bam')

        bcffile = os.path.join(tmpfolder, name + '.bcf')

        jffile = os.path.join(tmpfolder, name + '.jf')

        cnsprobe = os.path.join(args.saved, name + '_probe.txt')

        # newly added: indel probe output file
        indelNprobe = os.path.join(args.saved, name + '_indel_probe.txt')

        mindepth = os.path.join(tmpfolder, name + '_mindepth.bed')

        if name in sampleinfor:

            print("error same name:", name)

        else:

            sampleinfor[name] = dict()

            sampleinfor[name]['read1'] = read1

            sampleinfor[name]['read2'] = read2

            sampleinfor[name]['bamfile'] = bamfile

            sampleinfor[name]['bcffile'] = bcffile

            sampleinfor[name]['jffile'] = jffile

            # sampleinfor[name]['kmerscore'] = kmerscore
            #
            # sampleinfor[name]['kmerscoreio'] = open(kmerscore, 'w')

            sampleinfor[name]['cnsprobe'] = cnsprobe

            sampleinfor[name]['cnsprobeio'] = open(cnsprobe, 'w')

            # newly added: indel probe bookkeeping
            sampleinfor[name]['indelNprobelist'] = list()
            sampleinfor[name]['indelNprobeio'] = open(indelNprobe, 'w')

            sampleinfor[name]['mindepth'] = mindepth

            # run bwa mem
            bwa.bwamem_paired(bwabin=args.bwa,
                              samtoolsbin=args.samtools,
                              reffile=bwaindex,
                              outfile=bamfile,
                              inputfile1=read1,
                              inputfile2=read2,
                              samplename=name,
                              threadnumber=args.threads)

            print("bwa mem", name, 'finished')

            # get min depth bed file
            bamdepth.bamdepthtobed(bamfile=bamfile,
                                   outbed=mindepth,
                                   mindepth=args.mindepth,
                                   minlength=200)

            print(mindepth, 'done')

            # generate bcf file from bam file
            bcftools.bamtobcf(bcfbin=args.bcftools,
                              reffile=bwaindex,
                              bamfile=bamfile,
                              outbcf=bcffile)

            print(bcffile, "done")

            # generate jf file

            jellyfish.makegenerator(filenames=[read1, read2],
                                    type='gz',
                                    generators='generators')

            jellyfish.jfgeneratorscount(jfpath=args.jellyfish,
                                        mer=args.length,
                                        output=jffile,
                                        generators='generators',
                                        threads=args.threads,
                                        size=jfsize)

            print(jffile, "done")

    probe = BedTool(args.probe).sort()

    for name in sampleinfor:

        nowprobe = BedTool(sampleinfor[name]['mindepth']).sort()

        probe = probe.intersect(nowprobe, wa=True, u=True)

    # consensus probes: for each sample, build the consensus sequence of every retained probe region

    for name in sampleinfor:

        bcfpool = Pool(args.threads)

        bcfrunerlist = list()

        consensusprobelist = list()

        for i in probe:

            probestr = str(i).rstrip()

            bcfconsensusruner = bcftools.BcfConsensusRuner(
                probestr=probestr,
                bcftoolspath=args.bcftools,
                bcffile=sampleinfor[name]['bcffile'],
                sample=name)

            bcfrunerlist.append(bcfconsensusruner)
            # consensusprobe = bcftools.probestrtoconsensus(bcfconsensusruner)
            #
            # print(probestr, consensusprobe, sep='\t')

        reslist = list()

        for res in bcfpool.imap_unordered(bcftools.probestrtoconsensus,
                                          bcfrunerlist):

            # print(res['probestr'], name, res['consensusprobe'], sep='\t', file=sampleinfor[name]['cnsprobeio'])

            if len(res['consensusprobe']) != args.length:

                sampleinfor[name]['indelNprobelist'].append(res)

            elif 'N' in res['consensusprobe']:

                continue

            else:

                consensusprobelist.append(res['consensusprobe'])
                # consensusprobelist.append(res)
                reslist.append(res)

        bcfpool.close()

        consensusprobekmerscore = jellyfish.jfquerylist(
            jfkmerfile=sampleinfor[name]['jffile'],
            jfpath=args.jellyfish,
            seqlist=consensusprobelist)

        kmerscoredict = dict()

        kmerscorelist = list()

        for score in consensusprobekmerscore:

            # print(score, file=sampleinfor[name]['kmerscoreio'])
            (subseq, kmerscore) = score.split(',')

            if 'N' not in subseq:

                kmerscoredict[subseq] = int(kmerscore)

                kmerscorelist.append(int(kmerscore))

        maxkmer = pd.Series(kmerscorelist).quantile(0.9)

        minkmer = args.minkmer

        for consensusprobe in reslist:

            probestr = consensusprobe['probestr']

            consensusprobeseq = consensusprobe['consensusprobe']

            if consensusprobeseq in kmerscoredict:

                if kmerscoredict[consensusprobeseq] <= maxkmer:

                    if kmerscoredict[consensusprobeseq] >= minkmer:

                        print(probestr,
                              consensusprobeseq,
                              kmerscoredict[consensusprobeseq],
                              sep='\t',
                              file=sampleinfor[name]['cnsprobeio'])

    for name in sampleinfor:

        sampleinfor[name]['cnsprobeio'].close()
        # sampleinfor[name]['kmerscoreio'].close()
        # print(sampleinfor)

        for res in sampleinfor[name]['indelNprobelist']:

            print(res['probestr'],
                  name,
                  res['consensusprobe'],
                  sep='\t',
                  file=sampleinfor[name]['indelNprobeio'])

        sampleinfor[name]['indelNprobeio'].close()

    probdict = dict()

    for name in sampleinfor:

        with open(sampleinfor[name]['cnsprobe']) as inio:

            for infor in inio:

                infor = infor.rstrip()

                inforlist = infor.split('\t')

                orgprb = inforlist[3]

                if orgprb in probdict:

                    probdict[orgprb][name] = infor

                else:

                    probdict[orgprb] = dict()

                    probdict[orgprb][name] = infor

    print('chrom',
          'start',
          'end',
          'refseq',
          ','.join(sampleinfor),
          'consensusprobe',
          'consensusscore',
          'consensussite',
          'consensusdiff',
          sep=',',
          file=cnsio)

    for orgprb in probdict:

        sharecount = len(probdict[orgprb])

        values_view = probdict[orgprb].values()
        value_iterator = iter(values_view)
        first_value = next(value_iterator).split('\t')

        outinfo = first_value[0:3]

        if len(sampleinfor) == sharecount:
            #         print(sampleinfor, sharecount)
            # print(orgprb, len(probdict[orgprb]))
            probelist = list()
            namelist = list()
            namelist.append('refseq')
            probelist.append(orgprb)

            for name in sampleinfor:

                infor = probdict[orgprb][name].split('\t')

                speciesprobe = infor[-2]

                namelist.append(name)
                if len(speciesprobe) == len(orgprb):
                    probelist.append(speciesprobe)

            if len(namelist) == len(probelist):
                #             print(namelist, probelist)

                res = probecompare.getconsensusprobe(probelist)
                outinfo.extend(probelist)
                print(','.join(outinfo),
                      res['consensusprobe'],
                      res['consensusscore'],
                      res['consensussite'],
                      res['consensusdiff'],
                      sep=',',
                      file=cnsio)

    cnsio.close()

    print("finished")
예제 #43
0
    def __init__(self,
                 intervals_file,
                 fasta_file,
                 dnase_file,
                 cell_line=None,
                 RNAseq_PC_file=None,
                 mappability_file=None,
                 GENCODE_dir=None,
                 use_linecache=True):

        # intervals
        if use_linecache:
            linecache.clearcache()
            BT = BedToolLinecache
        else:
            BT = BedTool

        self.bt = BT(intervals_file)

        # Fasta
        self.fasta_file = fasta_file
        self.fasta_extractor = None  # initialize later
        # DNase
        self.dnase_file = dnase_file
        self.dnase_extractor = None
        # mappability
        if mappability_file is None:
            # download the mappability file if not existing
            common_dl_dir = os.path.join(this_dir, "../../template/downloaded/dataloader_files")
            makedir_exist_ok(common_dl_dir)
            rf = RemoteFile(url="http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig",
                            md5="1d15ddafe2c8df51cf08495db96679e7")
            mappability_file = os.path.join(common_dl_dir, "wgEncodeDukeMapabilityUniqueness35bp.bigWig")
            if not os.path.exists(mappability_file) or not rf.validate(mappability_file):
                # download the path
                rf.get_file(mappability_file)
        self.mappability_file = mappability_file
        self.mappability_extractor = None
        # Gencode features
        if GENCODE_dir is None:
            gp = os.path.join(this_dir, "../../template/downloaded/dataloader_files/gencode_features/")
        else:
            gp = GENCODE_dir

        download_gencode_dir(gp)  # download files
        self.gencode_beds = [
            ("cpg", BedTool(gp + '/cpgisland.bed.gz')),
            ("cds", BedTool(gp + '/wgEncodeGencodeBasicV19.cds.merged.bed.gz')),
            ("intron", BedTool(gp + '/wgEncodeGencodeBasicV19.intron.merged.bed.gz')),
            ("promoter", BedTool(gp + '/wgEncodeGencodeBasicV19.promoter.merged.bed.gz')),
            ("utr5", BedTool(gp + '/wgEncodeGencodeBasicV19.utr5.merged.bed.gz')),
            ("utr3", BedTool(gp + '/wgEncodeGencodeBasicV19.utr3.merged.bed.gz')),
        ]
        # Overlap beds - could be done incrementally
        print("Overlapping all the bed-files")
        # The BT() and .fn are there in order to leverage BedToolLinecache
        self.overlap_beds = [(b, BT(self.bt.intersect(v, wa=True, c=True).fn))
                             for b, v in self.gencode_beds]
        print("Assesing the file")
        assert len(self.overlap_beds[1][1]) == len(self.bt)
        # Get the metadata features
        if cell_line is None:
            if RNAseq_PC_file is None:
                raise ValueError("RNAseq_PC_file has to be specified when cell_line=None")
            assert os.path.exists(RNAseq_PC_file)
        else:
            # Using the pre-defined cell-line
            output_dir = os.path.join(this_dir, "../../template/downloaded/dataloader_files/RNAseq_features/")
            makedir_exist_ok(output_dir)
            RNAseq_PC_file = os.path.join(output_dir, cell_line, "meta.txt")
            url_template = ('https://s3.eu-central-1.amazonaws.com/kipoi-models/dataloader_files/'
                            'FactorNet/dataloader_files/RNAseq_features/{}/meta.txt')
            # rf = RemoteFile(url=url_template.format(cell_line))
            if not os.path.exists(RNAseq_PC_file):  # or not rf.validate(mappability_file):
                # download the path
                download_url(url_template.format(cell_line), os.path.join(output_dir, cell_line), "meta.txt")
                # rf.get_file(RNAseq_PC_file)

        self.meta_feat = pd.read_csv(RNAseq_PC_file,
                                     sep="\t", header=None)[0].values
예제 #44
0
def samToTab():
    """
        In order to get the result of alignment usable for
        further  use, we create a new tabbed file in the
        same format as the inputted tabbed file. This file
        is the final output of this program.
        First of all, we extract the start and stop position
        and remove the flank region to get the initial
        annotation length. Then we generate the output file
        with these new position and keep all the information
        of the original tabbed file.
        The name of output file is by defaut
        "[fasta2_name]_out.[ext]" or the name that user
        specify with argument --output (but still with [ext]
        as extension). It return the name of the file created.
    """
    tabou = ""
    if (ext == "gff3" and args.typeA != None) or change:
        args.tabinput = tabO
    with open(args.tabinput, "r") as tabi, open(alnN, "r") as sam:
        for i in tabi:  # tabi = tabbed file after all modification
            line = i.split("\t")
            if line[0][0] == "#":
                tabou += i
                continue
            tabou += "# File generated the " + datetime.datetime.now(
            ).strftime(
                "%d %b %Y"
            ) + " with following command line : \n" + "# " + " ".join(
                sys.argv) + "\n"
            break
        for f in sam:  # samf = alignment file inside Bedtools object
            isMd = True
            samf = f.split("\t")
            if samf[0][0] == "@":
                continue
            if int(samf[1]) != 0:  # keep only primary forward-strand alignments (flag 0); skip secondary/supplementary/reverse hits
                continue
            leng = 0
            res = re.findall("\d+\w", samf[5])
            for i in res:
                if i[-1] in ["M", "=", "X", "I", "S"]:
                    leng += int(i[:-1])
            if leng > args.flank:
                leng = leng - args.flank
            else:
                leng = 0
            countM = mdParser(
                f)  # countM is the number of perfectly matching bases in this alignment
            if args.mismatch != None:
                if not args.percentage:
                    if countM >= leng - args.mismatch:
                        isMd = True
                    else:
                        isMd = False
                else:
                    if countM >= leng - ((args.mismatch * leng) / 100):
                        isMd = True
                    else:
                        isMd = False
            tab = samf[0].replace("\\s", " ").split("__")
            if ext == "gff3" and isMd:
                tab[3] = int(samf[3]) + args.flank
                tab[4] = int(tab[3]) + leng - args.flank
                tabou += "\s".join(map(str, tab)) + "\n"
            elif ext == "bed" and isMd:
                tab[1] = int(samf[3]) + args.flank
                tab[2] = int(tab[1]) + leng - args.flank
                tabou += "\s".join(map(str, tab)) + "\n"
            elif ext == "vcf" and isMd:  #samf[5]==str(len(tab[3])+(args.flank*2))+"M" :
                tab[1] = int(samf[3]) + args.flank
                tabou += "\s".join(map(str, tab)) + "\n"
            #if f[11][-1]!="0" and f[5]=="101M":     # show ID of sequence which contain a missmatch
            #    print(f[0].split(":")[1]+"\t"+f[12])
            #if ext == "vcf" and samf[5]=="101M" and samf[5]==samf[12].split(":")[-1]+"M" and okw : # perfect match only for snp
            #if ext == "vcf" and samf[5]==str(len(samf[0].split("__")[3])+(args.flank*2))+"M" and okw :
        if args.verbose != 0:
            print(" ----- Creating file '" + args.out + "'. ----- \n")
        BedTool(tabou, from_string=True).saveas(args.out)
    return (args.out)
예제 #45
0
    os.makedirs(model_dir)
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
if not os.path.exists(srv_dir):
    os.makedirs(srv_dir)

# Train/val/test intervals
DATA_DIR = '/srv/scratch/jesikmin'
train_dir, val_dir, test_dir = os.path.join(DATA_DIR, 'train_interval'),\
                               os.path.join(DATA_DIR, 'val_interval'),\
                               os.path.join(DATA_DIR, 'test_interval')

print train_dir, val_dir, test_dir

# Get train/val/test intervals
train_intervals = list(BedTool(train_dir))
val_intervals = list(BedTool(val_dir))
test_intervals = list(BedTool(test_dir))
print '# of Train Intervals: {}'.format(len(train_intervals))
print '# of Val Intervals: {}'.format(len(val_intervals))
print '# of Test Intervals: {}'.format(len(test_intervals))

# Get input/output data directories
data = Data_Directories()
print data.intervals.keys()
print data.input_atac[day].keys()
print data.output_histone[day].keys()

# Extract input candidates
# Create an ArrayExtractor for ATAC-seq of a given day and specified fragment length
input_candidates = ArrayExtractor(data.input_atac[day][frag])
예제 #46
0
def isComplete(samtotabOut):
    """
        This function call the function getPosCds(tab) with
        both inputted tabbed file and newly generated tabbed
        file and check if CDS in the newly generated file
        are in the same position within the mRNA (or gene).
        It take the name of the tabbed file newly generated
        in argument.
    """
    if ext == "gff3":
        dicoPos1 = getPosCds(args.tabinput)
        dicoPos2 = getPosCds(samtotabOut)
        outTab = samtotabOut.split("/")[-1]
        geneInt = []
        #lastG=0
        geneOk = 0
        ok = 0
        countG = 0
        selectable = False
        filtered = "# File generated the " + datetime.datetime.now().strftime(
            "%d %b %Y"
        ) + " with following command line : \n" + "# " + " ".join(
            sys.argv) + "\n"
        for key1 in dicoPos1.keys():
            for key2 in dicoPos2.keys():
                if key2[0] == key1[0]:
                    if len(dicoPos1[key1]) == len(dicoPos2[key2]):
                        geneInt.append(key2[1])
                    # for v in range (0,len(dicoPos1[key1])) :
                    #     print(dicoPos1[key1][v])
                    #     if dicoPos1[key1][v] == dicoPos2[key2][v] : # TODO: this comparison is broken and needs a rewrite.
                    #         geneOk+=1
                    # print(geneOk)
                    # if geneOk >= len(dicoPos1[key1]) : # here we can add/rm condition to accept or not the mRNA/gene
                    #    # add the mRNA/gene number to the list of "acceptable mRNA/gene to select"
                    #     geneOk = 0
                    # else :
                    #     geneOk = 0
        if "gene" in typeAclean:
            typeC = "gene"
        elif "mrna" in typeAclean:
            typeC = "mrna"
        with open(samtotabOut, "r") as tabou:
            for line in tabou:
                if line[0] == "#":
                    continue
                lineS = line.strip().split("\t")
                if lineS[2].lower() == typeC:  # TODO : unreadable
                    resTag = re.search("ID=(\w+)", lineS[-1])
                    if resTag:
                        geneId = resTag.group(1)
                if lineS[2] == "CDS":
                    resTagCds = re.search("Parent=(\w+)", lineS[-1])
                    if resTagCds:
                        cdsId = resTagCds.group(1)
                if lineS[2].lower() == typeC:
                    countG += 1
                    if countG in geneInt:
                        selectable = True
                    else:
                        selectable = False
                if lineS[2] == "CDS" and geneId != cdsId:
                    selectable = False
                if selectable:
                    filtered += ("\s".join(lineS)) + "\n"
            countG = 0
        if args.verbose != 0:
            print(" ----- Generating filtered GFF file '" +
                  (args.directory + "/filtered_" + outTab) + "'. -----\n")
        BedTool(filtered, from_string=True).saveas(args.directory +
                                                   "/filtered_" + outTab)
    return
예제 #47
0
    detected = [(int(x.name), float(x.attrs['topcoverage']))
                for x in BedTool(path)]
    detected.sort(key=lambda x: x[0])
    recovery = find_closest(detected, original)

    true_total = len(original)
    discovered_total = len(detected)

    true_positive = len([x for x in recovery if x[2] <= maxd])
    print(true_positive)
    false_positive = discovered_total - true_positive

    return true_positive / true_total, 1 - false_positive / discovered_total  # sensitivity, precision (fraction of detected calls that are true)


original = [(int(x.name), float(x.score)) for x in BedTool(args.original)]
original.sort(key=lambda x: x[0])
#process_detected(args.detected, original, args.maxd)

name2stat = []
for path in [x for x in get_only_files(args.detected) if 'annotated' in x]:
    name = get_name(path, args.mode)
    print(name)
    sens, spec = process_detected(path, original, args.maxd)
    name2stat.append((name, sens * 100, spec * 100))

name2stat.sort(key=lambda x: int(x[0]))

data = [x[1] for x in name2stat], [x[2] for x in name2stat]
labels = [x[0] for x in name2stat]
fontsize = 24
예제 #48
0
def load_data(n_row=None, cleaned=True):
    # https://lncipedia.org/download
    data_dict = {
        'id': [],
        'name': [],
        'length': [],
        'ratio_g': [],
        'ratio_t': [],
        'ratio_c': [],
        'ratio_a': [],
        'number_exons': [],
        'chromosom': [],
        'start_pos': [],
        'end_pos': [],
        'length_from_pos': [],
        'number_introns': [],
        'mean_exon_length': [],
        'mfe': []
    }
    fasta_data = SeqIO.parse("data/lncipedia_5_2.fasta", "fasta")
    bed_raw_data = BedTool('data/lncipedia.bed')
    examiner = GFFExaminer()
    in_handle = open('data/lncipedia_5_2_hg38.gff')
    annotation_data = {}
    for i, rec in enumerate(GFF.parse(in_handle)):
        # chromosom e.g. chr1
        for feature in rec.features:
            # lncRNA eg. LNC1725
            if not feature.type == 'lnc_RNA':
                break

            exon_locations = []
            lnc_id = feature.id
            for sub_feature in feature.sub_features:
                if sub_feature.type == 'exon':
                    exon = (sub_feature.location.start,
                            sub_feature.location.end)
                    exon_locations.append(exon)

            annotation_data[lnc_id] = exon_locations

    in_handle.close()
    bed_data = {}

    for record in bed_raw_data:
        bed_data[record.name] = {
            'number_exons': int(record.fields[9]),
            'chromosom': record.fields[0],
            'start_pos':
            int(record.fields[1]),  # BED start is 0-based (-1 compared with the GFF and the online record)
            'end_pos': int(record.fields[2])
        }

    for i, record in enumerate(fasta_data):
        length = len(record.seq)
        data_dict['length'].append(length)
        data_dict['id'].append(record.id)
        data_dict['name'].append(record.name)
        if record.name in bed_data:
            for bed_feature in [
                    'number_exons', 'chromosom', 'start_pos', 'end_pos'
            ]:
                data_dict[bed_feature].append(
                    bed_data[record.name][bed_feature])

            end_pos = bed_data[record.name]['end_pos']
            start_pos = bed_data[record.name]['start_pos']
            exon_locations = annotation_data[record.id]
            data_dict['length_from_pos'].append(end_pos - start_pos)
            data_dict['number_introns'].append(
                calc_number_introns(start_pos, end_pos, exon_locations))
            data_dict['mean_exon_length'].append(
                calc_mean_exon_length(exon_locations))
        else:
            for feature in [
                    'number_exons', 'chromosom', 'start_pos', 'end_pos',
                    'length_from_pos', 'number_introns', 'mean_exon_length'
            ]:
                data_dict[feature].append(-1)

        count_g = 0
        count_a = 0
        count_t = 0
        count_c = 0

        for c in record.seq:
            if c == 'G':
                count_g += 1
            elif c == 'T':
                count_t += 1
            elif c == 'C':
                count_c += 1
            elif c == 'A':
                count_a += 1

        data_dict['ratio_g'].append(count_g / length * 100)
        data_dict['ratio_t'].append(count_t / length * 100)
        data_dict['ratio_c'].append(count_c / length * 100)
        data_dict['ratio_a'].append(count_a / length * 100)

        if n_row:
            if i == n_row:
                break

    list_of_lmfes = pickle.load(open("data/list_of_mfes2.pickle", "rb"))
    data_dict['mfe'].extend(list_of_lmfes)

    df = pd.DataFrame.from_dict(data_dict)
    # run only for rows where we have valid chromosomes
    df['chromosom'].loc[df['chromosom'] != -1] = df['chromosom'].loc[
        df['chromosom'] != -1].apply(lambda x: x.split('chr')[1])
    if cleaned:
        df = df[(df['chromosom'] != 'X') & (df['chromosom'] != 'Y')]
        df['chromosom'] = pd.to_numeric(df['chromosom'])
        # Also remove rows with invalid mfe and chromosomes
        df = df.loc[df['chromosom'] != -1].loc[
            df['mfe'] != -1].iloc[:, 2:].apply(lambda x:
                                               (x - x.mean()) / x.std(),
                                               axis=0)

    return df
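# --- illustrative sketch ---
# calc_number_introns() and calc_mean_exon_length() are not shown in this
# snippet. Hypothetical implementations, assuming exon_locations is a list of
# (start, end) tuples and introns are simply the gaps between exons:
def calc_number_introns(start_pos, end_pos, exon_locations):
    # with n exons inside the transcript there are at most n - 1 introns
    return max(len(exon_locations) - 1, 0)

def calc_mean_exon_length(exon_locations):
    if not exon_locations:
        return 0
    return sum(int(end) - int(start) for start, end in exon_locations) / float(len(exon_locations))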
예제 #49
0
            # bedtools map -a 3chase.bed -b 3chase.bedGraph -c 4 -o mean
            bed_annotated_int_comp = bed_annotated_int_bt.complement(
                g=chr_file)

            bed_annotated_int_bt.map(bedGraph_var_bt, c=4, o=statistic,
                                     null=0).saveas('values_' + id_worm + '_' +
                                                    var_traj + '.txt')
            bed_annotated_int_comp.map(bedGraph_var_bt,
                                       c=4,
                                       o=statistic,
                                       null=0).saveas('values_' + id_worm +
                                                      '_' + var_traj +
                                                      '.comp.txt')

            bedGraph_var_bt.intersect(bed_annotated_int_bt).saveas('values_' +
                                                                   id_worm +
                                                                   '_' +
                                                                   var_traj +
                                                                   '.bedGraph')
            bedGraph_var_bt.intersect(bed_annotated_int_comp).saveas(
                'values_' + id_worm + '_' + var_traj + '.comp.bedGraph')

    ## if no annotations are present, return the bedGraph for plotting and an empty bedGraph for the annotated regions
    else:
        bedGraph_var_bt.saveas('values_' + id_worm + '_' + var_traj +
                               '.comp.bedGraph')
        bed_no_intervals = BedTool(list_no_intervals).saveas('values_' +
                                                             id_worm + '_' +
                                                             var_traj +
                                                             '.bedGraph')

# In[ ]:


from pybedtools import genome_registry
from pygtftk.gtf_interface import GTF


# In[ ]:


from pybedtools import BedTool
grch38gff='/home/drew/Desktop/IPyNB-Variant-Analysis/data/cuffcmp.combined.gtf'
#snps = BedTool('snps.bed.gz')  # [1]
genes = BedTool(grch38gff)    # [1]


# In[ ]:


get_ipython().run_cell_magic('bash', '', 'ln -P /home/drew/Desktop/IPyNB-Variant-Analysis/data\nln -P /media/drew/easystore/ReferenceGenomes/GCA_000001405.15_GRCh38_no_alt_analysis_set/\nln -P /media/drew/easystore/ReferenceGenomes/GRCh38/')


# In[ ]:


intergenic_snps = snps.subtract(genes)                       # [2]
nearby = genes.closest(intergenic_snps, d=True, stream=True) # [2, 3]

for gene in nearby:             # [4]
    if int(gene[-1]) < 5000:    # [5] distance column appended by closest(d=True); the 5 kb cutoff is illustrative
        print(gene.name)
예제 #51
0
File: SV.py  Project: xlwuHIT/Sven
 def expand(self, svs, nbp):
     return BedTool(svs).slop(b=nbp, g='config/' + self._gt + '.genome')
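# --- illustrative sketch of the slop() call in expand() above ---
# slop() pads each interval on both sides; g must point to a chromosome-sizes
# ("genome") file. Paths here are hypothetical.
from pybedtools import BedTool

svs = BedTool("svs.bed")
padded = svs.slop(b=100, g="config/hg19.genome")  # extend each SV by 100 bp on both sides
padded.saveas("svs.padded.bed")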
window_size = 5001
process_all = False
sample_num = 1000

# In[3]:

# retrieve data
data = Data_Directories()
print data.intervals.keys()
print data.input_atac['day0'].keys()
print data.output_histone['day0'].keys()

# In[4]:

# get intervals for day0 data
day0_intervals = list(BedTool(data.intervals['day0']))
print '# of Intervals Extracted for day0: {}'.format(len(day0_intervals))

# In[5]:

# create an ArrayExtractor for ATAC-seq for day0 with 140 base pairs
bw_140bp_day0 = ArrayExtractor(data.input_atac['day0']['140'])
print 'Finished extracting bigwig for day0, 140bp'

# In[6]:

# create a BigwigExtractor for histone mark 'H3K27ac' for day0
bw_histone_mark_day0 = BigwigExtractor(data.output_histone['day0']['H3K27ac'])
print 'Finished extracting bigwig for day0, H3K27ac'

# In[7]:
예제 #53
0
def load_beddata(genome, bed_file, use_meta, use_gencode, input_dir, is_sorted, chrom=None):
    bed = BedTool(bed_file)
    if not is_sorted:
        print('Sorting BED file')
        bed = bed.sort()
        is_sorted = True
    blacklist = make_blacklist()
    print('Determining which windows are valid')
    bed_intersect_blacklist_count = bed.intersect(blacklist, wa=True, c=True, sorted=is_sorted)
    if chrom:
        nonblacklist_bools = np.array([i.chrom==chrom and i.count==0 for i in bed_intersect_blacklist_count])
    else:
        nonblacklist_bools = np.array([i.count==0 for i in bed_intersect_blacklist_count])
    print('Filtering away blacklisted windows')
    bed_filtered = bed.intersect(blacklist, wa=True, v=True, sorted=is_sorted)
    if chrom:
        print('Filtering away windows not in chromosome:', chrom)
        bed_filtered = subset_chroms([chrom], bed_filtered)
    print('Generating test data iterator')
    bigwig_names, bigwig_files_list = load_bigwigs([input_dir])
    bigwig_files = bigwig_files_list[0]
    if use_meta:
        meta_names, meta_list = load_meta([input_dir])
        meta = meta_list[0]
    else:
        meta = []
        meta_names = None
    
    shift = 0
    
    if use_gencode:
        cpg_bed = BedTool('resources/cpgisland.bed.gz')
        cds_bed = BedTool('resources/wgEncodeGencodeBasicV19.cds.merged.bed.gz')
        intron_bed = BedTool('resources/wgEncodeGencodeBasicV19.intron.merged.bed.gz')
        promoter_bed = BedTool('resources/wgEncodeGencodeBasicV19.promoter.merged.bed.gz')
        utr5_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr5.merged.bed.gz')
        utr3_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr3.merged.bed.gz')

        peaks_cpg_bedgraph = bed_filtered.intersect(cpg_bed, wa=True, c=True)
        peaks_cds_bedgraph = bed_filtered.intersect(cds_bed, wa=True, c=True)
        peaks_intron_bedgraph = bed_filtered.intersect(intron_bed, wa=True, c=True)
        peaks_promoter_bedgraph = bed_filtered.intersect(promoter_bed, wa=True, c=True)
        peaks_utr5_bedgraph = bed_filtered.intersect(utr5_bed, wa=True, c=True)
        peaks_utr3_bedgraph = bed_filtered.intersect(utr3_bed, wa=True, c=True)

        data_bed = [(window.chrom, window.start, window.stop, 0, bigwig_files, np.append(meta, np.array([cpg.count, cds.count, intron.count, promoter.count, utr5.count, utr3.count], dtype=bool)))
                    for window, cpg, cds, intron, promoter, utr5, utr3 in 
                    itertools.izip(bed_filtered, peaks_cpg_bedgraph,peaks_cds_bedgraph,peaks_intron_bedgraph,peaks_promoter_bedgraph,peaks_utr5_bedgraph,peaks_utr3_bedgraph)]
    else:
        data_bed = [(window.chrom, window.start, window.stop, shift, bigwig_files, meta)
                    for window in bed_filtered]
    #from data_iter import DataIterator
    from data_iter import DataIterator
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    datagen_bed = DataIterator(data_bed, genome, 100, L, bigwig_rc_order, shuffle=False)
    return bigwig_names, meta_names, datagen_bed, nonblacklist_bools
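# --- illustrative sketch, not part of load_beddata above ---
# Minimal version of the blacklist filtering step: v=True keeps only windows
# that do not overlap any blacklisted region. File names are hypothetical.
from pybedtools import BedTool

windows = BedTool("windows.bed").sort()
blacklist = BedTool("blacklist.bed").sort()

clean_windows = windows.intersect(blacklist, wa=True, v=True, sorted=True)
clean_windows.saveas("windows.noblacklist.bed")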
예제 #54
0
def download_and_unify_datasets(cell_name, assay_type, assay_info_dict, target_cellinfo_dirs_path, 
                                number_of_votes_from_highquality_datasets=1, 
                                number_of_votes_from_lowquality_datasets=2, 
                                number_of_files_to_consider_from_highquality_datasets='all', 
                                number_of_files_to_consider_from_lowquality_datasets='all', 
                                dont_consider_low_quality_datasets_when_highquality_datasets_available=True, 
                                consider_peak_score_from_peak_file = True, peak_score_index=6):
    
    current_dir = os.getcwd()
    final_dataset_of_this_assay_cell = cell_name+"_"+assay_type+".bed4"
    final_datasets_of_this_assay_cell = []
    if not os.path.exists(target_cellinfo_dirs_path+'/'+cell_name):
        os.mkdir(target_cellinfo_dirs_path+'/'+cell_name)
    if not os.path.exists(target_cellinfo_dirs_path+'/'+cell_name+'/'+assay_type):
        os.mkdir(target_cellinfo_dirs_path+'/'+cell_name+'/'+assay_type)
    os.chdir(target_cellinfo_dirs_path+'/'+cell_name+'/'+assay_type)
    print(target_cellinfo_dirs_path+'/'+cell_name+'/'+assay_type)
    if not os.path.exists(final_dataset_of_this_assay_cell):
        for factor in assay_info_dict.keys():
            peak_score_from_peak_file_exists = True
            final_dataset = factor+".bed4"
            if os.path.exists(final_dataset): #if the final merged file of this factor was already available then no need to do any more operations
                final_datasets_of_this_assay_cell.append(final_dataset)
                continue
            list_of_high_quality_datasets_from_this_factor = []
            list_of_low_quality_datasets_from_this_factor = []
            print(assay_info_dict[factor])
            if 'high' in assay_info_dict[factor]:
                if number_of_files_to_consider_from_highquality_datasets=='all':
                    list_of_high_quality_datasets_from_this_factor=assay_info_dict[factor][assay_info_dict[factor].index("high")+1]
                else:
                    for i in range(0, number_of_files_to_consider_from_highquality_datasets):
                        if i < len(assay_info_dict[factor][assay_info_dict[factor].index("high")+1]):
                            list_of_high_quality_datasets_from_this_factor.append(assay_info_dict[factor][assay_info_dict[factor].index("high")+1][i])
                        else:
                            break
            if 'low' in assay_info_dict[factor]:
                if 'high' in assay_info_dict[factor] and dont_consider_low_quality_datasets_when_highquality_datasets_available:
                    pass
                #if no high-quality dataset is available, or inclusion is explicitly requested even when high-quality datasets exist, use the low-quality datasets as well
                else:
                    if number_of_files_to_consider_from_lowquality_datasets=='all':
                        list_of_low_quality_datasets_from_this_factor=assay_info_dict[factor][assay_info_dict[factor].index("low")+1]
                    else:
                        for i in range(0, number_of_files_to_consider_from_lowquality_datasets):
                            if i < len(assay_info_dict[factor][assay_info_dict[factor].index("low")+1]):
                                list_of_low_quality_datasets_from_this_factor.append(assay_info_dict[factor][assay_info_dict[factor].index("low")+1][i])
                            else:
                                break
            
            #process the datasets from the high quality list
            list_of_high_quality_peakfiles_from_this_factor = []
            final_dataset_high_quality = factor+"_high" + ".bed"
            final_dataset_high_quality_name = open(final_dataset_high_quality, 'w')
            for dataset in list_of_high_quality_datasets_from_this_factor:
                dataset_name = ""
                if "ENCFF" in dataset:
                    dataset_path = "https://www.encodeproject.org/files/"+dataset+"/@@download/"+dataset+".bed.gz"
                    dataset_name = factor+"_"+dataset+".bed"
                    if not os.path.exists(dataset_name):
                        if not os.path.exists(dataset_name+".gz"):
                            downloaded_obj = urlopen(dataset_path)
                            print("downloading.... " + dataset_path)
                            with open(os.path.basename(dataset_name+".gz"), 'wb') as local_file:
                                local_file.write(downloaded_obj.read())
                        with gzip.open(dataset_name+".gz", 'rb') as dataset_name_zip, open(dataset_name, 'w') as dataset_name_unzipped:
                            dataset_name_unzipped.write(dataset_name_zip.read())
                        #os.system("gunzip " + dataset_name+".gz")
                elif dataset.startswith("http://") or dataset.startswith("ftp://"):
                    dataset_path = dataset
                    dataset_name = factor+"_"+dataset.strip().split('/')[-1]
                    dataset_name_unzipped = dataset_name
                    if "." in dataset_name: 
                        if dataset_name.split('.')[-1]=="gz":
                            dataset_name_unzipped = '.'.join(dataset_name.split('.')[0:-1])
                            
                    if os.path.exists(dataset_name_unzipped):#this could be the gzip or the unzipped file
                        dataset_name = '.'.join(dataset_name.split('.')[0:-1])
                    else:
                        if not os.path.exists(dataset_name):
                            downloaded_obj = urlopen(dataset_path)
                            print("downloading.... " + dataset_path)
                            with open(dataset_name, 'wb') as local_file:
                                local_file.write(downloaded_obj.read())
                        if "." in dataset_name: 
                            if dataset_name.split('.')[-1]=="gz":
                                with gzip.open(dataset_name, 'rb') as dataset_name_unzip_read, open(dataset_name_unzipped, 'wb') as dataset_name_unzipped_write:
                                    dataset_name_unzipped_write.write(dataset_name_unzip_read.read())
                                #os.system("gunzip " + dataset_name)
                                dataset_name = '.'.join(dataset_name.split('.')[0:-1])
                else:#path to a local file
                    dataset_path = dataset
                    dataset_name = factor+"_"+dataset.strip().split('/')[-1]
                    if not os.path.exists(dataset_name):
                        shutil.copy(dataset_name, "./")
                        if "." in dataset_name:
                            if dataset_name.split('.')[-1]=="gz":
                                with gzip.open(dataset_name, 'rb') as dataset_name_unzip_read, open('.'.join(dataset_name.split('.')[0:-1]), 'wb') as dataset_name_unzip_write:
                                    dataset_name_unzip_write.write(dataset_name_unzip_read.read())
                                #os.system("gunzip " + dataset_name)
                                dataset_name = '.'.join(dataset_name.split('.')[0:-1])
                                
                if dataset_name!="":
                    dataset_sort_bedtools = pybedtools.BedTool(dataset_name)
                    sorting_result = dataset_sort_bedtools.sort() 
                    list_of_high_quality_peakfiles_from_this_factor.append(sorting_result.fn)
            #Combine all high quality peak files into one
            peak_score_from_peak_file_exists = True
            if len(list_of_high_quality_peakfiles_from_this_factor)!=0:
                print(cell_name + ": high: " + assay_type + ":" + factor + ": "  + ','.join(list_of_high_quality_peakfiles_from_this_factor))
                #merge the high quality datasets
                if len(list_of_high_quality_peakfiles_from_this_factor)==1:
                    if assay_type == "ChromatinStates":
                        final_dataset_high_quality = list_of_high_quality_peakfiles_from_this_factor[0]
                    else:
                        merged_output = open(list_of_high_quality_peakfiles_from_this_factor[0], 'r').readlines()
                        try:#if the line lacks the score column, or its value cannot be converted to float, assume no peak score is available
                            peak_score = float(merged_output[0][peak_score_index])
                        except (IndexError, ValueError) as e:
                            repr( e )
                            peak_score_from_peak_file_exists = False
                        if peak_score_from_peak_file_exists and consider_peak_score_from_peak_file:
                            for line in merged_output:
                                final_dataset_high_quality_name.write('\t'.join(line.strip().split('\t')[0:3]) + '\t' + line.strip().split('\t')[peak_score_index] +"\n")
                        else:
                            for line in merged_output:
                                final_dataset_high_quality_name.write('\t'.join(line.strip().split('\t')[0:3]) +"\n")
                        final_dataset_high_quality_name.close()
                elif len(list_of_high_quality_peakfiles_from_this_factor)>1:
                    if assay_type == "ChromatinStates":
                        #write all the files into one
                        with open(final_dataset_high_quality, 'w') as concatenated_file_write: 
                            for file_name in list_of_high_quality_peakfiles_from_this_factor:
                                with open(file_name, 'r') as infile: 
                                    concatenated_file_write.write(infile.read())
                    else:
                        bedTools_obj = BedTool()
                        merging_all = bedTools_obj.multi_intersect(i=list_of_high_quality_peakfiles_from_this_factor).filter(lambda x: int(x[3]) >= number_of_votes_from_highquality_datasets).sort().merge()
                        
                        #if a line lacks the score column, or its value cannot be converted to float, assume no peak score is available
                        for file_i in  list_of_high_quality_peakfiles_from_this_factor:#check if all the files have peak scores
                            with open(file_i) as read_file_i:
                                try:
                                    h = read_file_i.readline()
                                    peak_score = float(h.strip().split('\t')[peak_score_index])
                                except (IndexError, ValueError) as e:
                                    repr( e )
                                    peak_score_from_peak_file_exists = False
                        if peak_score_from_peak_file_exists and consider_peak_score_from_peak_file:
                            #tmp_dir = './tmp_dir_to_remove_{}'.format(list_of_high_quality_peakfiles_from_this_factor[0].split('/')[-1])
                            #os.makedirs(tmp_dir)
                            list_of_high_quality_peakfiles_from_this_factor_updated = []
                            for i_file in list_of_high_quality_peakfiles_from_this_factor:
                                with open(i_file, 'r') as ifile, open(i_file + "_tmp", 'w') as ofile:
                                    for line in ifile.readlines():
                                        ofile.write('\t'.join(line.strip().split('\t')[0:3]) + '\t{}\n'.format(line.strip().split('\t')[peak_score_index]))
                                list_of_high_quality_peakfiles_from_this_factor_updated.append(i_file + "_tmp")
                            peak_score_index_updated = 3
                            #os.system('cp ' + merging_all.fn + ' . ' )
                            merging_all = merging_all.intersect(list_of_high_quality_peakfiles_from_this_factor_updated, wo=True).sort().groupby(g=[1,2,3], c=peak_score_index_updated+1+4, o=['mean'])#4 cols from the mergeBed and one extra from the intersection then it follows the cols from each file
                            #os.system('cp ' + merging_all.fn + ' . ' )
                            for l in list_of_high_quality_peakfiles_from_this_factor_updated:
                                os.remove(l)
                        merged_output = open(merging_all.fn, 'r').readlines()
                        for line in merged_output:
                            final_dataset_high_quality_name.write('\t'.join(line.strip().split('\t')[0::]) +"\n")
                        final_dataset_high_quality_name.close()
            #handling peak files from low quality datasets    
            list_of_low_quality_peakfiles_from_this_factor = []
            final_dataset_low_quality = factor+"_low" + ".bed"
            final_dataset_low_quality_name = open(final_dataset_low_quality, 'w')
            for dataset in list_of_low_quality_datasets_from_this_factor:
                dataset_name = ""
                if "ENCFF" in dataset:
                    dataset_path = "https://www.encodeproject.org/files/"+dataset+"/@@download/"+dataset+".bed.gz"
                    dataset_name = factor+"_"+dataset+".bed"
                    if not os.path.exists(dataset_name):
                        if not os.path.exists(dataset_name+".gz"):
                            downloaded_obj = urlopen(dataset_path)
                            print("downloading.... " + dataset_path)
                            with open(os.path.basename(dataset_name+".gz"), 'wb') as local_file:
                                local_file.write(downloaded_obj.read())
                        with gzip.open(dataset_name+".gz", 'rb') as dataset_name_zip, open(dataset_name, 'w') as dataset_name_unzipped:
                            dataset_name_unzipped.write(dataset_name_zip.read())
                        #os.system("gunzip " + dataset_name+".gz")
                elif dataset.startswith("http://") or dataset.startswith("ftp://"):
                    dataset_path = dataset
                    dataset_name = factor+"_"+dataset.strip().split('/')[-1]
                    dataset_name_unzipped = dataset_name
                    if "." in dataset_name: 
                        if dataset_name.split('.')[-1]=="gz":
                            dataset_name_unzipped = '.'.join(dataset_name.split('.')[0:-1])
                            
                    if os.path.exists(dataset_name_unzipped):#this could be the gzip or the unzipped file
                        dataset_name = '.'.join(dataset_name.split('.')[0:-1])
                    else:
                        if not os.path.exists(dataset_name):
                            downloaded_obj = urlopen(dataset_path)
                            print("downloading.... " + dataset_path)
                            with open(dataset_name, 'wb') as local_file:
                                local_file.write(downloaded_obj.read())
                        if "." in dataset_name: 
                            if dataset_name.split('.')[-1]=="gz":
                                with gzip.open(dataset_name, 'rb') as dataset_name_unzip_read, open(dataset_name_unzipped, 'wb') as dataset_name_unzipped_write:
                                    dataset_name_unzipped_write.write(dataset_name_unzip_read.read())
                                #os.system("gunzip " + dataset_name)
                                dataset_name = '.'.join(dataset_name.split('.')[0:-1])
                else:#path to a local file
                    dataset_path = dataset
                    dataset_name = factor+"_"+dataset.strip().split('/')[-1]
                    if not os.path.exists(dataset_name):
                        shutil.copy(dataset, "./"+dataset_name)
                        if "." in dataset_name:
                            if dataset_name.split('.')[-1]=="gz":
                                with gzip.open(dataset_name, 'rb') as dataset_name_unzip_read, open('.'.join(dataset_name.split('.')[0:-1]), 'wb') as dataset_name_unzip_write:
                                    dataset_name_unzip_write.write(dataset_name_unzip_read.read())
                                #os.system("gunzip " + dataset_name)
                                dataset_name = '.'.join(dataset_name.split('.')[0:-1])
                                
                if dataset_name!="":
                    dataset_sort_bedtools = pybedtools.BedTool(dataset_name)
                    sorting_result = dataset_sort_bedtools.sort() 
                    list_of_low_quality_peakfiles_from_this_factor.append(sorting_result.fn)
            #Combine all low quality peak files into one
            peak_score_from_peak_file_exists = True
            if len(list_of_low_quality_peakfiles_from_this_factor)!=0:
                print(cell_name + ": low: " + assay_type + ":" + factor + ": "  + ','.join(list_of_low_quality_peakfiles_from_this_factor))
                #merge the low quality datasets
                if len(list_of_low_quality_peakfiles_from_this_factor)==1:
                    if assay_type == "ChromatinStates":
                        final_dataset_low_quality = list_of_low_quality_peakfiles_from_this_factor[0]
                    else:
                        merged_output = open(list_of_low_quality_peakfiles_from_this_factor[0], 'r').readlines()
                        try:#if the line lacks the score column, or its value cannot be converted to float, assume no peak score is available
                            peak_score = float(merged_output[0][peak_score_index])
                        except (IndexError, ValueError) as e:
                            repr( e )
                            peak_score_from_peak_file_exists = False
                        if peak_score_from_peak_file_exists and consider_peak_score_from_peak_file:
                            for line in merged_output:
                                final_dataset_low_quality_name.write('\t'.join(line.strip().split('\t')[0:3]) + '\t' + line.strip().split('\t')[peak_score_index] + "\n")
                        else:
                            for line in merged_output:
                                final_dataset_low_quality_name.write('\t'.join(line.strip().split('\t')[0:3]) + "\n")
                        final_dataset_low_quality_name.close()
                
                elif len(list_of_low_quality_peakfiles_from_this_factor)>1:
                    if assay_type == "ChromatinStates":
                        #write all the files into one
                        with open(final_dataset_low_quality, 'w') as concatenated_file_write: 
                            for file_name in list_of_low_quality_peakfiles_from_this_factor:
                                with open(file_name, 'r') as infile: 
                                    concatenated_file_write.write(infile.read())
                    else:
                        bedTools_obj = BedTool()
                        merging_all = bedTools_obj.multi_intersect(i=list_of_low_quality_peakfiles_from_this_factor).filter(lambda x: int(x[3]) >= number_of_votes_from_lowquality_datasets).sort().merge()
                        for file_i in  list_of_low_quality_peakfiles_from_this_factor:#check if all the files have peak scores
                            with open(file_i) as read_file_i:
                                try:
                                    h = read_file_i.readline()
                                    peak_score = float(h.strip().split('\t')[peak_score_index]) 
                                except (IndexError, ValueError) as e:
                                    repr( e )
                                    peak_score_from_peak_file_exists = False
                        if peak_score_from_peak_file_exists and consider_peak_score_from_peak_file:
                            list_of_low_quality_datasets_from_this_factor_updated = []
                            for i_file in list_of_low_quality_peakfiles_from_this_factor:
                                with open(i_file, 'r') as ifile, open(i_file + "_tmp", 'w') as ofile:
                                    for line in ifile.readlines():
                                        ofile.write('\t'.join(line.strip().split('\t')[0:3]) + '\t{}\n'.format(line.strip().split('\t')[peak_score_index]))
                                list_of_low_quality_datasets_from_this_factor_updated.append(i_file + "_tmp")
                            peak_score_index_updated = 3
                            merging_all = merging_all.intersect(list_of_low_quality_datasets_from_this_factor_updated, wo=True).sort().groupby(g=[1,2,3], c=peak_score_index_updated+1+4, o=['mean'])#4 cols from the mergeBed and one extra from the intersection then it follows the cols from each file
                            for l in list_of_low_quality_datasets_from_this_factor_updated:
                                os.remove(l)
                        merged_output = open(merging_all.fn, 'r').readlines()
                        for line in merged_output:
                            final_dataset_low_quality_name.write('\t'.join(line.strip().split('\t')[0::]) +"\n")
                        final_dataset_low_quality_name.close()
            
            #Combine results of low and high quality peak files and merge them with adding the factor name
            peak_score_from_peak_file_exists = True
            final_file = ""
            if os.stat(final_dataset_high_quality).st_size==0 and os.stat(final_dataset_low_quality).st_size==0:
                continue
            else:
                merge_final_lines = []
                highlow_combined  = "highlow_combined"
                os.system("cat " + final_dataset_high_quality + " " + final_dataset_low_quality + " > " + highlow_combined)
                if assay_type == "ChromatinStates":#because the chromatinstates are defined for all genome bins merging them would cause create 25 regions only since all the bins are starting consequentively 
                    final_file = highlow_combined
                else:
                    with open(highlow_combined) as read_file_i:
                        try:
                            h = read_file_i.readline()
                            peak_score = float(h.strip().split('\t')[3])
                        except (IndexError, ValueError) as e:
                            repr( e )
                            peak_score_from_peak_file_exists = False
                    highlow_combined_obj = BedTool(highlow_combined)
                    merge_final = ""
                    if peak_score_from_peak_file_exists and consider_peak_score_from_peak_file:
                        merge_final = highlow_combined_obj.sort().merge(c=4, o='mean')
                    else:
                        merge_final = highlow_combined_obj.sort().merge()
                    final_file = merge_final.fn
                with open(final_file, 'r') as merge_final_read:
                    merge_final_lines = merge_final_read.readlines()
                    with open(final_dataset, 'w') as final_dataset_writer:  
                        if assay_type == "ChromatinStates":
                            for line in merge_final_lines:
                                final_dataset_writer.write('\t'.join(line.strip().split('\t')[0:3]) + '\t' + cell_name+"#ChromHMM#"+line.strip().split('\t')[3].replace(" ", "-") + '\n')
                        elif assay_type == "ChIP-seq":
                            if peak_score_from_peak_file_exists and consider_peak_score_from_peak_file:
                                for line in merge_final_lines:
                                    peak_score = "#"+str(line.strip().split('\t')[3])
                                    final_dataset_writer.write('\t'.join(line.strip().split('\t')[0:3]) + '\t' + cell_name+"#TFBinding#"+factor.replace(" ", "-")+peak_score + '\n')
                            else:
                                peak_score = ""
                                for line in merge_final_lines:
                                    final_dataset_writer.write('\t'.join(line.strip().split('\t')[0:3]) + '\t' + cell_name+"#TFBinding#"+factor.replace(" ", "-")+peak_score + '\n')
                        else:
                            if peak_score_from_peak_file_exists and consider_peak_score_from_peak_file:
                                for line in merge_final_lines:
                                    peak_score = "#"+str(line.strip().split('\t')[3])
                                    final_dataset_writer.write('\t'.join(line.strip().split('\t')[0:3]) + '\t' + cell_name+"#"+factor.replace(" ", "-")+peak_score + '\n')
                            else:
                                peak_score = ""
                                for line in merge_final_lines:
                                    final_dataset_writer.write('\t'.join(line.strip().split('\t')[0:3]) + '\t' + cell_name+"#"+factor.replace(" ", "-")+peak_score + '\n')
                final_datasets_of_this_assay_cell.append(final_dataset)
                os.remove(highlow_combined)
            os.remove(factor+"_high" + ".bed")
            os.remove(factor+"_low" + ".bed")
        #combine peak files of all the factors into one
        with open(final_dataset_of_this_assay_cell, 'w') as final_dataset_of_this_cell_out:
            for peak_file in final_datasets_of_this_assay_cell:
                with open(peak_file, 'r') as infile:
                    final_dataset_of_this_cell_out.write(infile.read())
    
    os.chdir(current_dir)
    return final_dataset_of_this_assay_cell, final_datasets_of_this_assay_cell
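The merge step above either keeps an averaged score column (merge(c=4, o='mean')) or collapses to coordinates only (merge()). A minimal sketch of that difference on throwaway in-memory data, not the pipeline's real files:

from pybedtools import BedTool

# two overlapping toy peaks with scores 5 and 7
toy = BedTool("chr1\t10\t100\t5\nchr1\t50\t200\t7\n", from_string=True).sort()

# with a score column: overlaps are collapsed and column 4 is averaged
print(toy.merge(c=4, o='mean'))   # chr1  10  200  6
# without scores: only the merged coordinates are kept
print(toy.merge())                # chr1  10  200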
Example #55
def nonnegative_wrapper(a, bl_file):
    bl = BedTool(bl_file)
    a_slop = a.slop(g=genome_sizes_file, b=genome_window_size)
    return bl.cat(a_slop).fn
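This helper pads each interval of `a` by genome_window_size on both sides, concatenates the result with the blacklist, and returns the file name of the combined set; genome_sizes_file and genome_window_size are module-level globals not shown in this excerpt. A hedged usage sketch with placeholder file names (the "exclusion set for negative sampling" reading is an assumption, not stated in the source):

from pybedtools import BedTool

genome_sizes_file = 'hg19.chrom.sizes'   # placeholder
genome_window_size = 200                 # placeholder

positives = BedTool('positive_peaks.bed')
exclude_fn = nonnegative_wrapper(positives, 'blacklist.bed')

# candidate windows that avoid the combined set could then serve as negatives
negatives = BedTool('candidate_windows.bed').intersect(BedTool(exclude_fn), v=True)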
Example #56
    newvals = [x for x in np.arange(0, max(yvals), scale)]
    newpos = np.arange(0, step * len(newvals), step)

    #print(sum(ylabels))
    ax.set_yticks(newpos)
    ax.set_yticklabels(["%d" % (x * 100) for x in newvals])


def check(region, mincov):
    """Keep a region only if at least two of its first three 'topcoverage' values exceed mincov."""
    return len([
        x for x in region.attrs['topcoverage'].split(",")[:3]
        if float(x) > mincov
    ]) > 1


annpeaks = [x for x in BedTool(args.path) if check(x, args.mincov)]

fontsize = 24
linewidth = 5
scores = [float(x.attrs['tss']) for x in annpeaks if x.attrs['tss'] != 'nan']
scores.sort()
selected_scores = [x for x in scores if -100 <= x <= 300]
#print(min(scores))

fig, axes = plt.subplots(ncols=2, figsize=(22, 7), frameon=False)
fig.tight_layout(rect=[0.05, 0.1, 1, 1])
fig.subplots_adjust(wspace=0.2)
for data, ax in zip([scores, selected_scores], axes):
    _, bins, _ = ax.hist(data, bins=20, density=True)

    ax.set_xlabel('TSS distance', fontsize=fontsize)
Example #57
def intersect_count(chip_bed, windows_file):
    windows = BedTool(windows_file)
    # count, for each window, the ChIP peaks that cover more than half of the window (fraction f of its length); both inputs are assumed coordinate-sorted
    chip_bedgraph = windows.intersect(chip_bed, wa=True, c=True, f=1.0*(genome_window_size/2+1)/genome_window_size, sorted=True)
    bed_counts = [i.count for i in chip_bedgraph]
    return bed_counts
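With c=True the intersect call returns one overlap count per window, and the f fraction only counts peaks covering at least genome_window_size/2 + 1 bases of a window. A sketch of how this might be called; the paths and the genome_window_size value are placeholders, not taken from the source:

from pybedtools import BedTool

genome_window_size = 200  # placeholder; defined elsewhere in the original code

chip_peaks = BedTool('chip_peaks.bed').sort()
counts = intersect_count(chip_peaks, 'genome_windows_sorted.bed')

# a window counts as bound when at least one peak covers more than half of it
labels = [1 if c > 0 else 0 for c in counts]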
Example #58
def consolidate(nbedfile, obedfile, cbedfile):
    from pybedtools import BedTool

    nbedtool = BedTool(nbedfile)
    obedtool = BedTool(obedfile)

    ab = nbedtool.intersect(obedtool, s=True, u=True)
    ba = obedtool.intersect(nbedtool, s=True, u=True)

    cmd = "cat {0} {1} | sort -k1,1 -k2,2n".format(ab.fn, ba.fn)
    fp = popen(cmd)
    ovl = BedTool(fp.readlines())

    abmerge = ovl.merge(s=True, nms=True, scores="mean").sort()  # nms/scores are flags of older bedtools merge; newer versions use c=/o=
    cmd = "cat {0}".format(abmerge.fn)
    fp = popen(cmd, debug=False)
    ovl = BedTool(fp.readlines())

    notovl = nbedtool.intersect(ovl.sort(), s=True, v=True)

    infile = "{0} {1}".format(notovl.fn, ovl.fn)
    tmpfile = "/tmp/reformat.{0}.bed".format(os.getpid())
    cmd = "sort -k1,1 -k2,2n"
    sh(cmd, infile=infile, outfile=tmpfile)

    fp = open(cbedfile, "w")
    bed = Bed(tmpfile)
    for b in bed:
        if ";" in b.accn:
            accns = set()
            for accn in b.accn.split(";"):
                accns.add(accn)
            b.accn = ";".join(accns)
        print(b, file=fp)
    fp.close()
    os.remove(tmpfile)

    sort([cbedfile, "-i"])
Example #59
            print_compiled(current_selection, size)
            stat_counts.append(len(current_selection))
            current_selection = [m]
    else:
        print_compiled(current_selection, size)
        stat_counts.append(len(current_selection))

    return stat_counts


########################################################################################################
### Execution Section

file_dict = gather_files(args.path, args.replicates, args.name)
for dname, files in file_dict.items():
    blist = [BedTool(x) for x in files]
    size = len(blist)
    res_total, stat_total_counts = find_shared_peaks(blist, args.maxd)

    with open(os.path.join(args.outdir, "%s.gff" % dname), 'w') as f:

        f.write("# %s\n" %
                ",".join([os.path.basename(x).split(".")[0] for x in files]))
        for compiled in res_total:
            f.write(print_compiled(compiled, size))

    sys.stderr.write("\n%s\n" % dname)
    sys.stderr.write(shared_peaks_stat_to_string(stat_total_counts, size))

#import argparse
#import os
Example #60
if args.debug:
    print()
    print("Running in debug mode. Only the first " + str(args.debug) +
          " entries will be used.")

print()
print("Starting datamatrix assembly process")

# create the output directory and wait for the shell call, so the directory exists before files are written into it below
Popen('mkdir ' + args.outfile + '.datamatrix', shell=True).wait()

print()
print("Sorting input bed file.")

input_bed = BedTool(
    args.input_file).sort().saveas(args.outfile + '.datamatrix/input_list.bed')

# peek at the first record: to_dataframe() only names a 'strand' column when the BED has at least six fields
if 'strand' in list(BedTool(input_bed[0:1]).saveas().to_dataframe().columns):
    print("Strand information found in input file. Running in stranded mode.")
    print()
    strd = True
else:
    print(
        "Strand information NOT found in input file. Running in unstranded mode."
    )
    print()
    strd = False

##Load the genome file that matches the version of the GTF you are using. Pysam will be used to build an index of
##the FASTA file.
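In case it helps, a minimal sketch of the indexing step this comment describes; the argument name args.genome_fasta is a guess, not taken from the source. Opening a FASTA with pysam.FastaFile builds the .fai index on first use if it is missing.

import pysam

genome = pysam.FastaFile(args.genome_fasta)  # hypothetical argument name
print("Loaded %d contigs from the genome FASTA" % genome.nreferences)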