Example #1
def test_output_bed_predict_denseout(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    # generate loss
    #
    # resolution < stepsize
    inputs = Array("x", numpy.random.random((7, 10)))
    outputs = Array('y',
                    numpy.random.random((7, 4)),
                    conditions=['c1', 'c2', 'c3', 'c4'])

    bwm = get_janggu(inputs, outputs)
    data_path = pkg_resources.resource_filename('janggu',
                                                'resources/10regions.bed')

    gi = GenomicIndexer.create_from_file(data_path, binsize=200, stepsize=200)

    dummy_eval = Scorer('pred',
                        lambda p: [0.1] * len(p),
                        exporter=ExportBed(gindexer=gi, resolution=200),
                        conditions=['c1', 'c2', 'c3', 'c4'])

    bwm.predict(inputs, callbacks=[dummy_eval])

    file_ = os.path.join(tmpdir.strpath, 'evaluation', bwm.name, 'pred.{}.bed')

    for cond in ['c1', 'c2', 'c3', 'c4']:
        assert os.path.exists(file_.format(cond))

    bed = BedTool(file_.format('c1'))

    nreg = 0
    for reg in bed:
        numpy.testing.assert_equal(float(reg.score), 0.1)
        nreg += 1

    assert nreg == 7, 'There should be 7 regions in the bed file.'
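A small follow-up sketch (not part of the original test) showing how the per-condition BED files written by ExportBed above could be gathered into one pandas DataFrame; the model name 'my_model' is an assumption, the condition names mirror the test.

import os
import pandas as pd
from pybedtools import BedTool

# Hypothetical post-processing of the ExportBed output produced above.
pred_dir = os.path.join(os.environ['JANGGU_OUTPUT'], 'evaluation', 'my_model')  # model name is assumed
scores = {cond: [float(region.score)
                 for region in BedTool(os.path.join(pred_dir, 'pred.{}.bed'.format(cond)))]
          for cond in ['c1', 'c2', 'c3', 'c4']}
pred_df = pd.DataFrame(scores)
print(pred_df.describe())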
Example #2
    def process(self):
        all_sites = pd.read_csv(self.sites_file, usecols=['chr', 'coordinate'])
        all_sites = get_winid.convert_chr_to_num(all_sites)
        chrs = np.sort(all_sites['chr'].unique())
        all_sites_closest = []
        for chr in chrs:
            print('processing sites on chr ' + str(chr))
            chr_file = self.data_dir + 'chr' + str(chr) + '.tsv'
            if not os.path.exists(self.data_dir + 'chr1.tsv'):
                self.split_by_chr()
            chr_sites = all_sites.query('chr==@chr')
            chr_sites['coordinate'] = chr_sites['coordinate'].astype('i8')
            chr_sites['end'] = chr_sites['coordinate'] + 1
            chr_sites = BedTool([tuple(x[1]) for x in chr_sites.iterrows()])
            chr_sites_closest = chr_sites.closest(chr_file, d=True, nonamecheck=True)
            for row in chr_sites_closest:
                all_sites_closest.extend([[row[0], row[1], row[6], row[7]]])
            del chr_sites_closest
            del chr_sites
            gc.collect()
        all_sites_closest = pd.DataFrame(
            all_sites_closest,
            columns=['chr', 'coordinate', 'score', 'distance_to_nearest_DANN'])
        all_sites_closest = all_sites_closest.groupby(
            ['chr', 'coordinate']).apply(self.mean_max).reset_index()
        with pd.HDFStore(self.additional_feature_file, 'a') as h5s:
            h5s['DANN'] = all_sites_closest
Example #3
def cell_scaling_factors_fragments(fragmentfile, selected_barcodes=None):
    """ Generates pseudo-bulk tracks.

    Parameters
    ----------
    fragmentfile : str
       Input fragments file.

    Returns
    -------
    pd.Series
       Series containing the barcode counts per barcode.

    """

    barcodecount = Counter()
    bed = BedTool(fragmentfile)
    for region in bed:
        bct = region.name
        if selected_barcodes is not None:
            if bct not in selected_barcodes:
                continue
        barcodecount[bct] += 1
    return pd.Series(barcodecount)
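A brief usage sketch, assuming a 10x-style fragments file; the file path and whitelist barcodes are placeholders.

# Hypothetical call: count fragments only for whitelisted barcodes.
whitelist = {'AAACGAAAGACTCGGA-1', 'AAACGAACAGGTATGG-1'}  # placeholder barcodes
counts = cell_scaling_factors_fragments('fragments.tsv.gz', selected_barcodes=whitelist)
print(counts.sort_values(ascending=False).head())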
Example #4
def vcf_to_df_worker(arg):
    """ Convert CANVAS vcf to a dict, single thread
    """
    canvasvcf, exonbed, i = arg
    logging.debug("Working on job {}: {}".format(i, canvasvcf))
    samplekey = op.basename(canvasvcf).split(".")[0].rsplit('_', 1)[0]
    d = {'SampleKey': samplekey}

    exons = BedTool(exonbed)
    cn = parse_segments(canvasvcf)
    overlaps = exons.intersect(cn, wao=True)
    gcn_store = {}
    for ov in overlaps:
        # Example of ov.fields:
        # [u'chr1', u'11868', u'12227', u'ENSG00000223972.5',
        # u'ENST00000456328.2', u'transcribed_unprocessed_pseudogene',
        # u'DDX11L1', u'.', u'-1', u'-1', u'.', u'0']
        gene_name = "|".join((ov.fields[6], ov.fields[3], ov.fields[5]))
        if gene_name not in gcn_store:
            gcn_store[gene_name] = defaultdict(int)

        cn = ov.fields[-2]
        if cn == ".":
            continue
        cn = int(cn)
        if cn > 10:
            cn = 10
        amt = int(ov.fields[-1])
        gcn_store[gene_name][cn] += amt

    for k, v in sorted(gcn_store.items()):
        v_mean, v_median = counter_mean_and_median(v)
        d[k + ".avgcn"] = v_mean
        d[k + ".medcn"] = v_median
    cleanup()
    return d
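A possible driver for the worker above (not from the source), fanning it out over several CANVAS VCFs with multiprocessing; the file paths are placeholders.

from multiprocessing import Pool

import pandas as pd

# Hypothetical inputs: one CANVAS VCF per sample plus a shared exon BED.
canvas_vcfs = ['sampleA_canvas.vcf.gz', 'sampleB_canvas.vcf.gz']  # placeholder paths
exon_bed = 'exons.bed'                                            # placeholder path
jobs = [(vcf, exon_bed, i) for i, vcf in enumerate(canvas_vcfs)]

with Pool(processes=2) as pool:
    records = pool.map(vcf_to_df_worker, jobs)

cn_df = pd.DataFrame(records).set_index('SampleKey')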
Example #5
def Cluster2ExonSkipping(Cluster):
    '''Yield skipped-exon (SE) events from clusters of three junctions,
    labelled by whether the skipped region overlaps an m6A peak.'''
    for k1, v1 in Cluster.items():
        if len(v1) == 3:
            bed3list = []
            for k2, v2 in v1.items():
                bed3list.append(Bed(k2))
            bed3list.sort(key=sortbycoordinate)
            [bed1, bed2, bed3] = bed3list
            longer = bed2
            longer = longer.chr + "\t" + str(longer.start) + "\t" + str(
                longer.end)
            if bed1.start == bed2.start and bed2.end == bed3.end and bed1.end < bed3.start:
                alternative = BedTool("\t".join([
                    bed1.chr,
                    str(bed1.end),
                    str(bed3.start), k1 + "alternative", "0", bed1.strand
                ]),
                                      from_string=True)
                if len(alternative.intersect(m6A_bed)) >= 1:
                    yield "SE", "m6A", k1, v1[longer]
                else:
                    yield "SE", "nom6A", k1, v1[longer]
Example #6
File: peaks.py Project: xjyx/afp
def find_shared_peaks(multipath, maxd):
    #Split by chromosome

    chr2bedtools = defaultdict(list)
    for intervals in [BedTool(x) for x in multipath]:

        temp_d = defaultdict(list)
        for interval in intervals:
            temp_d[interval.chrom].append((interval))
        for chrom, local_intervals in temp_d.items():
            chr2bedtools[chrom].append(local_intervals)

    bedtools_list = [
        x[1] for x in sorted(chr2bedtools.items(), key=lambda x: x[0])
    ]

    stat_total_counts = []
    res_total = []
    for bedtools_chr in bedtools_list:
        res, stat_counts = _find_shared_peaks_chromosome(bedtools_chr, maxd)
        res_total.extend(res)
        stat_total_counts.extend(stat_counts)

    return res_total, stat_total_counts
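A minimal usage sketch, under the assumption that maxd is the maximum distance (in bp) tolerated between peaks from different files; the file names are placeholders.

# Hypothetical call: find peaks shared between three replicate BED files.
peak_files = ['rep1_peaks.bed', 'rep2_peaks.bed', 'rep3_peaks.bed']  # placeholder paths
shared_peaks, per_chrom_counts = find_shared_peaks(peak_files, maxd=50)
print(len(shared_peaks), 'shared peak groups')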
Example #7
def make_annot_files(args, bed_for_annot):
    print('making annot file')
    df_bim = pd.read_csv(args.bimfile,
                         delim_whitespace=True,
                         usecols=[0, 1, 2, 3],
                         names=['CHR', 'SNP', 'CM', 'BP'])
    iter_bim = [['chr' + str(x1), x2 - 1, x2]
                for (x1, x2) in np.array(df_bim[['CHR', 'BP']])]
    bimbed = BedTool(iter_bim)
    annotbed = bimbed.intersect(bed_for_annot)
    bp = [x.start + 1 for x in annotbed]
    df_int = pd.DataFrame({'BP': bp, args.annot_name: 1})
    df_annot = pd.merge(df_bim, df_int, how='left', on='BP')
    df_annot.fillna(0, inplace=True)
    df_annot = df_annot[[args.annot_name]].astype(int)
    if args.annot_file.endswith('.gz'):
        with gzip.open(args.annot_file, 'wb') as f:
            df_annot.to_csv(f, sep="\t", index=False)
    else:
        df_annot.to_csv(args.annot_file, sep="\t", index=False)
Example #8
def makeBamFilterShortRNA(inFile, outprefix, genome):
    '''
    bed file as input first sort then bedtobam
    index the bam file finally
    '''
    bamFolder = '/'.join(outprefix.split('/')[:-1]) + '/bamFiles'
    tempBam = bamFolder + '/' + outprefix.split(
        '/')[-1] + '.filtered.sorted.bam'
    index_name = tempBam + '.bai'
    small_RNA = '/Users/wckdouglas/plasmaDNA/reference/smallRNA.bed'
    small_RNA_bed = Tool(small_RNA)
    if not os.path.isfile(tempBam):
        print 'Making %s ' % tempBam
        BedTool(inFile)\
            .sort()\
            .to_bam(g=genome)\
            .intersect(b=small_RNA_bed, v=True, f=0.8,r=True,s=True) \
            .saveas(tempBam)
        if os.path.isfile(index_name):
            os.remove(index_name)
        index = pysam.index(tempBam)
    else:
        print 'Used existing bamfile: %s' % tempBam
    return tempBam
Example #9
def main():
    """
    annotate a file with the nearest features in another.
    """
    p = argparse.ArgumentParser(description=__doc__, prog=sys.argv[0])
    p.add_argument("-a", dest="a", help="file to annotate")
    p.add_argument("-b", dest="b", help="file with annotations")
    p.add_argument("--upstream",
                   dest="upstream",
                   type=int,
                   default=None,
                   help="distance upstream of [a] to look for [b]")
    p.add_argument("--downstream",
                   dest="downstream",
                   type=int,
                   default=None,
                   help="distance downstream of [a] to look for [b]")
    p.add_argument("--report-distance",
                   dest="report_distance",
                   default=False,
                   help="report the distance, not just the genes",
                   action="store_true")
    args = p.parse_args()
    if (args.a is None or args.b is None):
        sys.exit(not p.print_help())

    c = add_closest(args.a, args.b)
    b = BedTool(args.b)
    # TODO: support --report-distance for up/downstream.
    if args.upstream:
        c = add_xstream(c, b, args.upstream, "up", args.report_distance)
    if args.downstream:
        c = add_xstream(c, b, args.downstream, "down", args.report_distance)

    for row in c.sort():
        print(row)
Example #10
def main():
    usage_text = """usage: %(prog)s [options] PopulationCoveredROI VCF ...
    takes a population covered ROI file and a vcf file containing the mutation calls 
    from all the samples in the population to calculate the mutation rate for all ROIs"""

    parser = argparse.ArgumentParser(description=usage_text)
    parser.add_argument('--version', action='version', version='%(prog)s 0.1')
    parser.add_argument("-o",
                        dest="outFile",
                        default="stdout",
                        help="output file name")
    parser.add_argument(dest='ROI',
                        metavar='ROI_file',
                        type=str,
                        help='population covered ROI file name')
    parser.add_argument(
        dest='vcf',
        metavar='VCF_file',
        type=str,
        help=
        'VCF file containing mutation calls for all the samples in the population'
    )
    # parser.add_argument( "--bed", dest="bedFiles", metavar='sample1.bed', type=str, nargs='+', help="bed filenames" )

    args = parser.parse_args()

    ROIBed = BedTool(args.ROI)

    result = intersectMutationRegardlessOfMutationType(args.vcf)

    mutation_count = intersectBedsAndRetNumMutations(result[0], ROIBed,
                                                     result[1])
    num_samples = result[2]
    track_length = ROIBed.sort().total_coverage()
    print("\n\n" + str(float(mutation_count / (num_samples * track_length))) +
          "\n\n")
Example #11
def load_bed_data_sc(genome, positive_windows, use_meta, use_gencode, input_dir, is_sorted, big_wig_list, num_pos, input_scATAC_dir, chrom=None):
    bed_filtered = positive_windows
    print 'Generating test data iterator'
    bigwig_names, bigwig_files_list  = load_bigwigs_sc([input_dir],num_pos,big_wig_list,input_scATAC_dir)
    bigwig_files = bigwig_files_list[0]
    if use_meta:
        meta_names, meta_list = load_meta([input_dir])
        meta = meta_list[0]
    else:
        meta = []
        meta_names = None

    shift = 0

    if use_gencode:
        cpg_bed = BedTool('resources/cpgisland.bed.gz')
        cds_bed = BedTool('resources/wgEncodeGencodeBasicV19.cds.merged.bed.gz')
        intron_bed = BedTool('resources/wgEncodeGencodeBasicV19.intron.merged.bed.gz')
        promoter_bed = BedTool('resources/wgEncodeGencodeBasicV19.promoter.merged.bed.gz')
        utr5_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr5.merged.bed.gz')
        utr3_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr3.merged.bed.gz')

        peaks_cpg_bedgraph = bed_filtered.intersect(cpg_bed, wa=True, c=True)
        peaks_cds_bedgraph = bed_filtered.intersect(cds_bed, wa=True, c=True)
        peaks_intron_bedgraph = bed_filtered.intersect(intron_bed, wa=True, c=True)
        peaks_promoter_bedgraph = bed_filtered.intersect(promoter_bed, wa=True, c=True)
        peaks_utr5_bedgraph = bed_filtered.intersect(utr5_bed, wa=True, c=True)
        peaks_utr3_bedgraph = bed_filtered.intersect(utr3_bed, wa=True, c=True)

        data_bed = [(window.chrom, window.start, window.stop, 0, bigwig_files, np.append(meta, np.array([cpg.count, cds.count, intron.count, promoter.count, utr5.count, utr3.count], dtype=bool)))
                    for window, cpg, cds, intron, promoter, utr5, utr3 in
                    itertools.izip(bed_filtered, peaks_cpg_bedgraph,peaks_cds_bedgraph,peaks_intron_bedgraph,peaks_promoter_bedgraph,peaks_utr5_bedgraph,peaks_utr3_bedgraph)]
    else:
        data_bed = [(window.chrom, window.start, window.stop, shift, bigwig_files, meta)
                    for window in bed_filtered]
    #from data_iter import DataIterator
    from data_iter_scFAN import DataIterator
    #pdb.set_trace()
    #tmpp = bed_filtered.saveas('/data1/fly/FactorNet/draw_plot/heatmap_plot/merge_heatmap/H1_bed/%d_bed.bed'%(num_pos))
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    datagen_bed = DataIterator(data_bed, genome, 100, L, bigwig_rc_order, shuffle=False)
    return bigwig_names, datagen_bed
Example #12
if not os.path.exists(args.output):  # We will create a new directory for the TF
    os.makedirs(args.output)

# Format the TF argument to only have the TF name, in case the full ID was given as input
tf_list = []
for tf in args.transcription_factors:
    if 'var.' in tf:
        tf_list.append('.'.join(tf.replace(' ', '').split('.')[-2:]))
    else:
        tf_list.append(tf.replace(' ', '').split('.')[-1])

# Get the motif hits of the TF of interest
motif_hits = [
    x.split('\t')[:4]
    for x in open(args.motif_hit_file).read().strip().split('\n')
    if x.split('\t')[3].split('.')[-1] in tf_list
]
print(len(motif_hits), 'motif hits found')

# Intersect the filtered motif hits with the differential peaks
motif_hits_bed = BedTool('\n'.join(['\t'.join(x) for x in motif_hits]),
                         from_string=True)
diff_peak_filtered = motif_hits_bed.intersect(args.differential_peaks)
print(len(str(diff_peak_filtered).strip().split('\n')),
      'of motif hits intersect with a differential peak')

open(args.output + '/' + '_'.join(tf_list) + '_Hits.bed',
     'w').write(str(diff_peak_filtered))
Example #13
from snakemake import shell
from pybedtools import BedTool

# truncate the sorted output files to chromosome so no ends are out of bounds.
spp_trunc = snakemake.output.spp_sorted_tr
spp_sorted = snakemake.input.spp_sorted
macs2_trunc = snakemake.output.macs2_sorted_tr
macs2_sorted = snakemake.input.macs2_sorted
spp2_trunc = snakemake.output.spp2_sorted_tr
spp2_sorted = snakemake.input.spp2_sorted

genome = snakemake.input.genome

BedTool(spp_sorted).truncate_to_chrom('dm6').saveas(spp_trunc)
BedTool(spp2_sorted).truncate_to_chrom('dm6').saveas(spp2_trunc)
BedTool(macs2_sorted).truncate_to_chrom('dm6').saveas(macs2_trunc)


Example #14
def generateBackgroundForRegionTest(rna):
    target = BedTool('../H3K27me3/peaks_for_tdf/plus_' + rna + '_peaks.bed')
    hg19 = BedTool('../hg19/allChr.bed')
    hg19.subtract(target, output='../H3K27me3/region_test/bg_' + rna + ".bed")
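A one-line usage sketch; the RNA name is a placeholder and the hard-coded paths above must exist.

# Hypothetical call: writes ../H3K27me3/region_test/bg_NEAT1.bed containing
# all of hg19 minus the plus_NEAT1 peak regions.
generateBackgroundForRegionTest('NEAT1')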
Example #15
                    help="Length of the segment right upstream")
parser.add_argument('--outdir',
                    nargs='?',
                    required=True,
                    type=str,
                    help="Path to the output directory")
parser.add_argument(
    '--ylim',
    nargs='?',
    default=False,
    const=True,
    type=bool,
    help="If set, plots will be cut from the bottom up to the lowest bar")
args = parser.parse_args()

transcripts = BedTool(args.transcripts)
phages = BedTool(args.phages)
phaged_transcripts = [
    transcripts.intersect(b=phages, u=True, f=0.5),
    transcripts.intersect(b=phages, v=True, f=0.5)
]

############################################################################################################
### TSS count section

tss_counts_list = []
for ptr in phaged_transcripts:
    temp_dict = defaultdict(list)
    for transcript in ptr:
        temp_dict[transcript.name].append(int(
            transcript.attrs['tss_variants']))
Example #16
def make_bed_from_gff(gff: str,
                      up_offset: int = 2000,
                      valid_ids: List[str] = None,
                      flavour: str = 'body'):
    """Create pybedtools object for genes from a GFF file. Gene coordinates are promoter extended"""
    try:
        from pybedtools import BedTool
    except ImportError:
        raise ImportError(
            "pybedtools is not installed. Check out this link to install"
            " https://daler.github.io/pybedtools/main.html#install-via-conda")
    out = []
    ignored_genes = 0
    unknown_ids = 0
    if valid_ids is not None:
        valid_ids = {x: None for x in valid_ids}
    with open(gff) as h:
        # Testing whether first 5 lines are comment lines
        for i in range(5):
            l = next(h)
            if l[0] != '#':
                logger.warning(f"line num {i} is not a comment line")
        for l in tqdm(h):
            c = l.split('\t')
            if c[2] != 'gene':
                continue
            a = [x.split(' ') for x in c[8].rstrip('\n').split('; ')]
            a = {x: y.strip('"') for x, y in a}
            if 'gene_id' not in a:
                unknown_ids += 1
                continue
            if valid_ids is not None and a['gene_id'] not in valid_ids:
                ignored_genes += 1
                continue
            # Fetch start and end coordinate
            s, e = int(c[3]), int(c[4])
            if flavour == 'body':
                if c[6] == '+':
                    s = s - up_offset
                    s = max(s, 0)
                elif c[6] == '-':
                    e = e + up_offset
                else:
                    raise ValueError('ERROR: Unsupported symbol for strand')
            elif flavour == 'promoter':
                if c[6] == '+':
                    e = s + up_offset
                    s = s - up_offset
                    s = max(s, 0)
                elif c[6] == '-':
                    s = e - up_offset
                    s = max(s, 0)
                    e = e + up_offset
                else:
                    raise ValueError('ERROR: Unsupported symbol for strand')
            else:
                raise ValueError(
                    'ERROR: `flavour` can either be `body` or `promoter`')
            if c[0].startswith('chr'):
                chrom = c[0]
            else:
                chrom = f'chr{c[0]}'
            if 'gene_name' in a:
                gn = a['gene_name']
            else:
                gn = a['gene_id']
            o = '\t'.join([chrom, str(s), str(e), a['gene_id'], gn, c[6]])
            out.append(o)
    logger.info(f"{len(out)} genes found in the GFF file")
    logger.info(
        f"{ignored_genes} genes were ignored as they were not present in the valid_ids"
    )
    logger.info(
        f"{unknown_ids} genes were ignored as they did not have gene_id column"
    )
    return BedTool('\n'.join(out), from_string=True)
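A short usage sketch; the annotation path and gene ID are placeholders, and the attribute parsing above assumes GTF-style attribute pairs.

# Hypothetical call: 2 kb promoter windows for a single gene of interest.
promoters = make_bed_from_gff('annotation.gtf',              # placeholder path
                              up_offset=2000,
                              valid_ids=['ENSG00000141510'],  # placeholder gene id
                              flavour='promoter')
promoters.saveas('promoters.bed')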
Example #17
# fp = '/Users/phi/data_local/databases/annotree/pfam_archaea/*'
fp = '/Users/phi/data_local/databases/annotree/pfam_bacteria/*'
tagged_data = []
corpus = {}

with open('/Users/phi/tmp/corpus.annotree.train.txt', 'w+') as out:
    for file in tqdm(glob(fp)):

        # TODO: skip overlap removal and deduplication for now, we need strand
        # info
        # for this
        # Rich Hickey would be proud ...
        # dom = deduplicate(remove_overlap(load_domains(file, fmt='pfamscan')))

        result = load_domains(file, fmt='pfamscan')
        dom = BedTool(list(result.values()))

        # make sure Pfam ID is truncated: PF00815.1 -> PF00815
        seq = create_domain_sequence(dom,
                                     keep_unknown=True,
                                     fmt_fn=lambda x: x.split('.')[0])

        text = list(seq.values())
        genome = os.path.basename(file).replace('_pfam.tsv', '')
        # e.g. ...
        # UBA9934_pfam.tsv
        # GB_GCA_001790445.1_pfam.tsv
        # RS_GCF_000012865.1_pfam.tsv
        if not 'UBA' in genome:
            genome = '_'.join(genome.split('_')[1:])
Example #18
parser.add_argument('--outdir',
                    required=True,
                    nargs='?',
                    type=str,
                    help="Path to the output directory")
args = parser.parse_args()


def check_interval(interval, mincov):
    return all(
        [float(x) > mincov for x in interval.attrs['topcoverage'].split(",")])


for path in get_only_files(args.path):
    if (path.endswith('gff') or path.endswith('bed')):
        bedtool = BedTool(path)
        if (len(bedtool)):
            with open(os.path.join(args.outdir, os.path.basename(path)),
                      'w') as f:
                for interval in bedtool:
                    if (check_interval(interval, args.mincov)):
                        center = int(interval.name)
                        f.write(
                            str(
                                Interval(interval.chrom,
                                         center - args.flank,
                                         center + args.flank,
                                         name=interval.name,
                                         strand=interval.strand,
                                         score=interval.attrs['topcoverage'])))
Example #19
    def generate_rdf_content(self):
        """Generate RDF content of the BED file

        Yields
        ------
        Graph
            RDF content
        """
        bedfile = BedTool(self.path)

        count = 0
        attribute_list = []

        total_lines = sum(1 for line in open(self.path))
        row_number = 0

        entity_type = self.namespace_data[self.format_uri(self.entity_name,
                                                          remove_space=True)]

        for feature in bedfile:

            # Percent
            row_number += 1
            self.graph_chunk.percent = row_number * 100 / total_lines

            # Entity
            if feature.name != '.':
                entity_label = feature.name
            else:
                entity_label = "{}_{}".format(self.entity_name, str(count))
            count += 1
            entity = self.namespace_entity[self.format_uri(entity_label)]

            self.graph_chunk.add((entity, rdflib.RDF.type, entity_type))
            self.graph_chunk.add(
                (entity, rdflib.RDFS.label, rdflib.Literal(entity_label)))

            # Faldo
            faldo_reference = None
            faldo_strand = None
            faldo_start = None
            faldo_end = None

            # Chromosome
            self.category_values["reference"] = {
                feature.chrom,
            }
            relation = self.namespace_data[self.format_uri("reference")]
            attribute = self.namespace_data[self.format_uri(feature.chrom)]
            faldo_reference = attribute
            self.faldo_abstraction["reference"] = relation
            self.graph_chunk.add((entity, relation, attribute))

            if "reference" not in attribute_list:
                attribute_list.append("reference")
                self.attribute_abstraction.append({
                    "uri":
                    self.namespace_data[self.format_uri("reference")],
                    "label":
                    rdflib.Literal("reference"),
                    "type": [
                        self.namespace_internal[self.format_uri(
                            "AskomicsCategory")], rdflib.OWL.ObjectProperty
                    ],
                    "domain":
                    entity_type,
                    "range":
                    self.namespace_data[self.format_uri(
                        "{}Category".format("reference"))],
                    "values": [feature.chrom]
                })
            else:
                # add the value
                for at in self.attribute_abstraction:
                    if at["uri"] == self.namespace_data[self.format_uri(
                            "reference"
                    )] and at[
                            "domain"] == entity_type and feature.chrom not in at[
                                "values"]:
                        at["values"].append(feature.chrom)

            # Start
            relation = self.namespace_data[self.format_uri("start")]
            attribute = rdflib.Literal(
                self.convert_type(feature.start +
                                  1))  # +1 because bed is 0 based
            faldo_start = attribute
            self.faldo_abstraction["start"] = relation
            self.graph_chunk.add((entity, relation, attribute))

            if "start" not in attribute_list:
                attribute_list.append("start")
                self.attribute_abstraction.append({
                    "uri":
                    self.namespace_data[self.format_uri("start")],
                    "label":
                    rdflib.Literal("start"),
                    "type": [rdflib.OWL.DatatypeProperty],
                    "domain":
                    entity_type,
                    "range":
                    rdflib.XSD.decimal
                })

            # End
            relation = self.namespace_data[self.format_uri("end")]
            attribute = rdflib.Literal(self.convert_type(feature.end))
            faldo_end = attribute
            self.faldo_abstraction["end"] = relation
            self.graph_chunk.add((entity, relation, attribute))

            if "end" not in attribute_list:
                attribute_list.append("end")
                self.attribute_abstraction.append({
                    "uri":
                    self.namespace_data[self.format_uri("end")],
                    "label":
                    rdflib.Literal("end"),
                    "type": [rdflib.OWL.DatatypeProperty],
                    "domain":
                    entity_type,
                    "range":
                    rdflib.XSD.decimal
                })

            # Strand
            strand = False
            strand_type = None
            if feature.strand == "+":
                self.category_values["strand"] = {
                    "+",
                }
                relation = self.namespace_data[self.format_uri("strand")]
                attribute = self.namespace_data[self.format_uri("+")]
                faldo_strand = self.get_faldo_strand("+")
                self.faldo_abstraction["strand"] = relation
                self.graph_chunk.add((entity, relation, attribute))
                strand = True
                strand_type = "+"
            elif feature.strand == "-":
                self.category_values["strand"] = {
                    "-",
                }
                relation = self.namespace_data[self.format_uri("strand")]
                attribute = self.namespace_data[self.format_uri("-")]
                faldo_strand = self.get_faldo_strand("-")
                self.faldo_abstraction["strand"] = relation
                self.graph_chunk.add((entity, relation, attribute))
                strand = True
                strand_type = "-"
            else:
                self.category_values["strand"] = {
                    ".",
                }
                relation = self.namespace_data[self.format_uri("strand")]
                attribute = self.namespace_data[self.format_uri(".")]
                faldo_strand = self.get_faldo_strand(".")
                self.faldo_abstraction["strand"] = relation
                self.graph_chunk.add((entity, relation, attribute))
                strand = True
                strand_type = "."

            if strand:
                if ("strand", strand_type) not in attribute_list:
                    attribute_list.append(("strand", strand_type))
                    self.attribute_abstraction.append({
                        "uri":
                        self.namespace_data[self.format_uri("strand")],
                        "label":
                        rdflib.Literal("strand"),
                        "type": [
                            self.namespace_internal[self.format_uri(
                                "AskomicsCategory")], rdflib.OWL.ObjectProperty
                        ],
                        "domain":
                        entity_type,
                        "range":
                        self.namespace_data[self.format_uri(
                            "{}Category".format("strand"))],
                        "values": [strand_type]
                    })

            # Score
            if feature.score != '.':
                relation = self.namespace_data[self.format_uri("score")]
                attribute = rdflib.Literal(self.convert_type(feature.score))
                self.graph_chunk.add((entity, relation, attribute))

                if "score" not in attribute_list:
                    attribute_list.append("score")
                    self.attribute_abstraction.append({
                        "uri":
                        self.namespace_data[self.format_uri("score")],
                        "label":
                        rdflib.Literal("score"),
                        "type": [rdflib.OWL.DatatypeProperty],
                        "domain":
                        entity_type,
                        "range":
                        rdflib.XSD.decimal
                    })

            location = BNode()
            begin = BNode()
            end = BNode()

            self.graph_chunk.add((entity, self.faldo.location, location))

            self.graph_chunk.add(
                (location, rdflib.RDF.type, self.faldo.region))
            self.graph_chunk.add((location, self.faldo.begin, begin))
            self.graph_chunk.add((location, self.faldo.end, end))

            self.graph_chunk.add(
                (begin, rdflib.RDF.type, self.faldo.ExactPosition))
            self.graph_chunk.add((begin, self.faldo.position, faldo_start))

            self.graph_chunk.add(
                (end, rdflib.RDF.type, self.faldo.ExactPosition))
            self.graph_chunk.add((end, self.faldo.position, faldo_end))

            self.graph_chunk.add(
                (begin, self.faldo.reference, faldo_reference))
            self.graph_chunk.add((end, self.faldo.reference, faldo_reference))

            if faldo_strand:
                self.graph_chunk.add((begin, rdflib.RDF.type, faldo_strand))
                self.graph_chunk.add((end, rdflib.RDF.type, faldo_strand))

            yield
Example #20
def getListOfBlackZones(chrom):
    blackList = BedTool('../wgEncodeDacMapabilityConsensusExcludable.bed')
    blackListChrom = blackList.filter(lambda b: b.chrom == chrom)
    return [(i.start, i.end) for i in blackListChrom]
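A small usage sketch (assuming the blacklist BED path above is valid) that drops candidate positions falling inside blacklisted regions; the positions are made up.

# Hypothetical filtering step using the blacklist intervals returned above.
black_zones = getListOfBlackZones('chr1')
candidate_positions = [10500, 2000000, 50000000]  # placeholder coordinates
kept = [pos for pos in candidate_positions
        if not any(start <= pos < end for start, end in black_zones)]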
Example #21
    '''
    return g.filter(featuretype_filter, featuretype).saveas().fn


def get_attribute(g, attribute):
    genes = []
    for feature in g:
        try:
            genes.append(feature[attribute])
        except AttributeError:
            genes.append('')
    return genes


print('loading bedfile and extracting exons...')
g = BedTool(tx_info)
exons = BedTool(subset_featuretypes(g, 'exon'))

print('validating and sorting records...')
exons = exons.remove_invalid().sort()

print('extracting attributes...')
exon_pd = pd.DataFrame([(e['chrom'], e['start'], e['end'], e['strand'])
                        for e in exons],
                       columns=['chrom', 'exonStarts', 'exonEnds', 'strand'])
exon_pd['exonStarts'] = exon_pd['exonStarts'].map(str)
exon_pd['exonEnds'] = exon_pd['exonEnds'].map(str)
exon_pd['transcript'] = get_attribute(exons, 'transcript_id')
exon_pd['gene'] = get_attribute(exons, 'gene_name')
exon_pd = exon_pd[exon_pd.gene != '']
Example #22
    def _make_target_bed(self,
                         bed_fpath,
                         work_dir,
                         output_dir,
                         is_debug,
                         padding=None,
                         fai_fpath=None,
                         genome=None,
                         reannotate=False):
        clean_target_bed_fpath = intermediate_fname(work_dir, bed_fpath,
                                                    'clean')
        if not can_reuse(clean_target_bed_fpath, bed_fpath):
            debug()
            debug('Cleaning target BED file...')
            bed = BedTool(bed_fpath)
            if bed.field_count() > 4:
                bed = bed.cut(range(4))
            bed = bed\
                .filter(lambda x: x.chrom and not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))\
                .remove_invalid()
            with file_transaction(work_dir, clean_target_bed_fpath) as tx:
                bed.saveas(tx)
            debug('Saved to ' + clean_target_bed_fpath)
            verify_file(clean_target_bed_fpath, is_critical=True)

        sort_target_bed_fpath = intermediate_fname(work_dir,
                                                   clean_target_bed_fpath,
                                                   'sorted')
        if not can_reuse(sort_target_bed_fpath, clean_target_bed_fpath):
            debug()
            debug('Sorting target BED file...')
            sort_target_bed_fpath = sort_bed(
                clean_target_bed_fpath,
                output_bed_fpath=sort_target_bed_fpath,
                fai_fpath=fai_fpath)
            debug('Saved to ' + sort_target_bed_fpath)
            verify_file(sort_target_bed_fpath, is_critical=True)

        if genome in ebl.SUPPORTED_GENOMES:
            ann_target_bed_fpath = intermediate_fname(work_dir,
                                                      sort_target_bed_fpath,
                                                      'ann_plus_features')
            if not can_reuse(ann_target_bed_fpath, sort_target_bed_fpath):
                debug()
                if BedTool(sort_target_bed_fpath).field_count(
                ) == 3 or reannotate:
                    debug(
                        'Annotating target BED file and collecting overlapping genome features'
                    )
                    overlap_with_features(sort_target_bed_fpath,
                                          ann_target_bed_fpath,
                                          work_dir=work_dir,
                                          genome=genome,
                                          extended=True,
                                          reannotate=reannotate,
                                          only_canonical=True)
                else:
                    debug('Overlapping with genomic features:')
                    overlap_with_features(sort_target_bed_fpath,
                                          ann_target_bed_fpath,
                                          work_dir=work_dir,
                                          genome=genome,
                                          extended=True,
                                          only_canonical=True)
                debug('Saved to ' + ann_target_bed_fpath)
                verify_file(ann_target_bed_fpath, is_critical=True)
        else:
            ann_target_bed_fpath = sort_target_bed_fpath

        final_clean_target_bed_fpath = intermediate_fname(
            work_dir, ann_target_bed_fpath, 'clean')
        if not can_reuse(final_clean_target_bed_fpath, ann_target_bed_fpath):
            bed = BedTool(ann_target_bed_fpath).remove_invalid()
            with file_transaction(work_dir,
                                  final_clean_target_bed_fpath) as tx:
                bed.saveas(tx)
                pass
            verify_file(final_clean_target_bed_fpath, is_critical=True)

        self.bed_fpath = final_clean_target_bed_fpath
        self.bed = BedTool(self.bed_fpath)

        self.capture_bed_fpath = add_suffix(
            join(output_dir, basename(bed_fpath)), 'clean_sorted_ann')
        if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
            with file_transaction(work_dir, self.capture_bed_fpath) as tx:
                self.get_capture_bed().saveas(tx)

        gene_key_set, gene_key_list = get_genes_from_bed(bed_fpath)
        self.gene_keys_set = gene_key_set
        self.gene_keys_list = gene_key_list
        self.regions_num = self.get_capture_bed().count()

        self._make_qualimap_bed(work_dir)
        if padding:
            self._make_padded_bed(work_dir, fai_fpath, padding)
Example #23
def build_capsules(capsule_choice,
                   overlap,
                   bin_len,
                   ma,
                   include_last,
                   min_capsule_len,
                   custom_capsule_file,
                   gsea_superset,
                   tissue,
                   gene_context,
                   use_set,
                   number_sets,
                   limited_capsule_names_file,
                   cpg_arr=None,
                   sort_caps=True):
    capsules, finalcpgs, capsule_names = [], [], []
    annotation_file = annotations450
    if 'genomic_binned' in capsule_choice:
        overlap = int(overlap * bin_len)
        genome_file = hg19
        gname = os.path.basename(genome_file).split('.')[0]
        overlap_file = '{}.{}.overlap.{}.bed'.format(gname, bin_len, overlap)
        if not os.path.exists(overlap_file):
            BedTool(genome_file).makewindows(
                g=genome_file, w=bin_len,
                s=bin_len - overlap).saveas('{}.{}.overlap.{}.bed'.format(
                    gname, bin_len, overlap))  #.to_dataframe().shape
        print(annotation_file, overlap_file)
        final_modules, modulecpgs, module_names = get_binned_modules(
            ma=ma,
            a=annotation_file,
            b=overlap_file,
            include_last=include_last,
            min_capsule_len=min_capsule_len)
        print('LEN_MODULES', len(final_modules))
        capsules.extend(final_modules)
        finalcpgs.extend(modulecpgs)
        capsule_names.extend(module_names)

    if 'custom_bed' in capsule_choice:
        final_modules, modulecpgs, module_names = get_binned_modules(
            ma=ma,
            a=annotation_file,
            b=custom_capsule_file,
            include_last=include_last,
            min_capsule_len=min_capsule_len)
        capsules.extend(final_modules)
        finalcpgs.extend(modulecpgs)
        capsule_names.extend(module_names)

    if 'custom_set' in capsule_choice:
        final_modules, modulecpgs, module_names = return_custom_capsules(
            ma=ma,
            capsule_file=custom_capsule_file,
            capsule_sets=['all'],
            min_capsule_len=min_capsule_len,
            include_last=include_last)
        capsules.extend(final_modules)
        finalcpgs.extend(modulecpgs)
        capsule_names.extend(module_names)

    if np.intersect1d(CAPSULES, capsule_choice).tolist() or isinstance(
            cpg_arr, pd.DataFrame):
        final_modules, modulecpgs, module_names = return_final_capsules(
            methyl_array=ma,
            capsule_choice=capsule_choice
            if not isinstance(cpg_arr, pd.DataFrame) else None,
            min_capsule_len=min_capsule_len,
            collection=gsea_superset,
            tissue=tissue,
            n_top_sets=number_sets,
            limited_capsule_names_file=limited_capsule_names_file,
            gsea_superset=gsea_superset,
            cpg_arr=cpg_arr,
            sort_caps=sort_caps)
        capsules.extend(final_modules)
        finalcpgs.extend(modulecpgs)
        capsule_names.extend(module_names)

    # if 0:
    #
    # 	selected_sets=np.intersect1d(['UCSC_RefGene_Name','UCSC_RefGene_Accession', 'UCSC_RefGene_Group', 'UCSC_CpG_Islands_Name', 'Relation_to_UCSC_CpG_Island', 'Phantom', 'DMR', 'Enhancer', 'HMM_Island', 'Regulatory_Feature_Name', 'Regulatory_Feature_Group', 'DHS'],capsule_choice).tolist()
    # 	if selected_sets:
    # 		final_modules,modulecpgs,module_names=return_custom_capsules(ma=ma,capsule_file=selected_caps_file, capsule_sets=selected_sets, min_capsule_len=min_capsule_len, include_last=include_last, limited_capsule_names_file=limited_capsule_names_file)
    # 		capsules.extend(final_modules)
    # 		finalcpgs.extend(modulecpgs)
    # 		capsule_names.extend(module_names)
    #
    # 	gsea_bool=(("GSEA" in capsule_choice and gsea_superset) or 'all_gene_sets' in capsule_choice)
    #
    # 	if gsea_bool:
    # 		final_modules,modulecpgs,module_names=return_gsea_capsules(ma=ma,tissue=tissue,context_on=gene_context,use_set=use_set,gsea_superset=gsea_superset,n_top_sets=number_sets,min_capsule_len=min_capsule_len, all_genes=('all_gene_sets' in capsule_choice), limited_capsule_names_file=limited_capsule_names_file)
    # 		capsules.extend(final_modules)
    # 		finalcpgs.extend(modulecpgs)
    # 		capsule_names.extend(module_names)

    final_modules = capsules
    modulecpgs = list(set(finalcpgs))
    module_names = capsule_names

    # if limited_capsule_names_file and not (selected_sets or gsea_bool):
    # 	with open(limited_capsule_names_file) as f:
    # 		limited_capsule_names=f.read().replace('\n',' ').split()
    # 	capsules=[]
    # 	capsule_names=[]
    # 	for i in range(len(module_names)):
    # 		if module_names[i] in limited_capsule_names:
    # 			capsule_names.append(module_names[i])
    # 			capsules.append(final_modules[i])
    #
    # 	modulecpgs=list(set(list(reduce(lambda x,y: x+y,capsules))))
    # 	final_modules=capsules
    # 	module_names=capsule_names

    print("{} modules, {} cpgs, {} module names, {} missing".format(
        len(final_modules), len(modulecpgs), len(module_names),
        ma.beta.isnull().sum().sum()))

    return final_modules, modulecpgs, module_names
Example #24
        ]))
    return [
        len(het_index_unique),
        len(ch_index_unique),
        len(hom_index_unique), total_ac
    ]


#Make list of all SNPs across all genes present in snpfile
allsnplist = makesnplist(options.snpfilename)

#Make a hashtable with keys as each SNP, and stores a list of indices of carriers for that SNP
count_table = {}

#Open vcf file
vcffile = BedTool(options.vcffilename)
if options.bedfilename is not None:
    bed = BedTool(options.bedfilename)
    vcffile_temp = vcffile.intersect(bed)
else:
    if chrformat == "chr":
        dummy_bed = BedTool('chr1000 100000000 100000001', from_string=True)
    else:
        dummy_bed = BedTool('1000 100000000 100000001', from_string=True)
    vcffile_temp = vcffile.subtract(dummy_bed)

for line_vcf1 in open(vcffile_temp.fn):
    line_vcf = line_vcf1.rstrip().split('\t')
    if line_vcf[0][0] != "#" and ("," not in line_vcf[4]):
        if not (options.passfilter and line_vcf[6] != "PASS"):
            if options.snpformat == "VCFID":
Example #25
    def predict_variant_effect(self,  # pylint: disable=too-many-locals
                               bioseq,
                               variants,
                               conditions,
                               output_folder,
                               condition_filter=None,
                               batch_size=None):
        """Evaluates the performance.

        Parameters
        ----------
        bioseq : :code:`Bioseq`
            Input sequence containing the reference genome.
        variants : str
            File name of a VCF file containing the variants under study.
        conditions : list(str)
            Condition labels for each output prediction.
        output_folder : str
            The method produces an hdf5 and a bed file as output.
            The bed-file contains the variant positions while the
            hdf5 file contains the reference and alternative variant scores
            for each output feature.
        condition_filter : str or None
            Regular expression filter on which conditions should be evaluated.
            If None, all output conditions will be returned.
        batch_size : int or None
            Batch size. If None, a batch_size of 128 is used.


        Returns
        -------
        tuple:
            Tuple containing the output filenames: an hdf5 and a bed file.

        Examples
        --------

        .. code-block:: python

          # Evaluate all variants and all conditions (outputs)
          model.predict_variant_effect(DATA, VARIANTS, CONDITIONS,
                                       'vcfoutput')

          # Evaluate all variants and a subset of conditions (Ctcf output labels)
          model.predict_variant_effect(DATA, VARIANTS, CONDITIONS,
                                       'vcfoutput_subset',
                                       condition_filter='Ctcf')

        """
        if batch_size is None:
            batch_size = 128

        if len(self.kerasmodel.inputs) > 1:
            raise ValueError('Only one input layer supported for this operation.')
        binsize = self.kerasmodel.layers[0].input_shape[1] + bioseq.garray.order - 1

        if not bioseq.garray._full_genome_stored:
            raise ValueError('Incompatible Bioseq: '
                             'Bioseq must be loaded with store_whole_genome=True.')
        # The network might produce arbitrarily many outputs.
        # With the filter option it is possible to
        # restrict the analysis to certain features.
        if condition_filter is None:
            conditions = [(idx, cond) for idx, cond in enumerate(conditions)]
        else:
            conditions = [(idx, cond) for idx, cond in enumerate(conditions) \
                          if hasattr(re.search(condition_filter, cond), 'start')]


        icond = [el[0] for el in conditions]

        local_model = self.kerasmodel

        if len(conditions) != self.kerasmodel.output_shape[-1]:
            raise ValueError("The number of conditions does not match with the "
                             "number of network output units.")

        # get number of variants
        variantsstream = VariantStreamer(bioseq, variants, binsize, batch_size)

        nvariants = variantsstream.get_variant_count()

        h5file = h5py.File(os.path.join(output_folder, 'scores.hdf5'), 'w')

        h5file.create_dataset('labels', (len(conditions),),
                              dtype=h5py.special_dtype(vlen=str),
                              data=np.array([c[-1] for c in conditions],
                                            dtype=h5py.special_dtype(vlen=str)))

        refscore = h5file.create_dataset('refscore', (nvariants, len(conditions)),
                                         dtype='float16')
        altscore = h5file.create_dataset('altscore', (nvariants, len(conditions)),
                                         dtype='float16')
        diffscore = h5file.create_dataset('diffscore', (nvariants, len(conditions)),
                                          dtype='float16')
        logodds = h5file.create_dataset('logoddsscore', (nvariants, len(conditions)),
                                        dtype='float16')


        bar = Bar('Parsing {}: '.format(variants), max=int(np.ceil(nvariants/float(batch_size))))

        chromlist = []
        poslist = []
        vnamelist = []
        reflist = []
        altlist = []
        ibatch = 0

        # read variants file
        for names, chroms, poss, ra, aa, reference, alternative in variantsstream.flow():
            bar.next()

            if reference.shape[0] <= 0:
                # reached the end of the file
                break

            ref_score = local_model.predict_on_batch(reference)
            alt_score = local_model.predict_on_batch(alternative)

            chromlist += chroms
            poslist += poss
            vnamelist += names
            reflist += ra
            altlist += aa

            refscore[ibatch:(ibatch + ref_score.shape[0])] = ref_score[:, icond].astype('float16')
            altscore[ibatch:(ibatch + ref_score.shape[0])] = alt_score[:, icond].astype('float16')
            diffscore[ibatch:(ibatch + ref_score.shape[0])] = \
                alt_score[:, icond].astype('float16') - ref_score[:, icond].astype('float16')
            logodds[ibatch:(ibatch + ref_score.shape[0])] = \
                np.log(alt_score[:, icond].astype('float16')/
                       ref_score[:, icond].astype('float16') + 1e-7)
            ibatch += ref_score.shape[0]

        #form large string
        BedTool('\n'.join(['{} {} {} {}_{}>{}'.format(chrom, start, start+1, name, ref, alt)
                           for chrom, start, name, ref, alt in zip(chromlist,
                                                                   poslist,
                                                                   vnamelist, reflist, altlist)]),
                from_string=True).saveas(os.path.join(output_folder, 'snps.bed.gz'))

        bar.finish()
        h5file.close()

        return (os.path.join(output_folder, 'scores.hdf5'),
                os.path.join(output_folder, 'snps.bed.gz'))
Example #26
            tempd[a[0]] = float(a[3])
    return tempd


names2diff = defaultdict(list)
for path in args.diff:
    for k, v in readdiff(path).items():
        names2diff[k].append(v)
names2diff = dict([x for x in names2diff.items() if len(x[1]) == ldiff])

#print(names2diff)

###READ peak intensities

names2intensities = {}
for interval in BedTool(args.path):
    distance = int(interval.attrs['start_gene_distance'])
    if (distance < args.distance):
        genename = interval.attrs['start_gene']
        intensity = [
            float(x) if x != 'None' else 0
            for x in interval.attrs['maxcov'].split(",")
        ]
        lint = len(intensity)
        names2intensities[genename] = intensity

###CORRELATE peak intensities to differential gene expression
diff2timepoints = {0: '0h', 1: '0.5h', 2: '4h'}
int2timepoints = {
    0: 'pre',
    1: '0h',
Example #27
print 'Current working directory is:' + os.getcwd()
print '\n'

#generate bed file names
bed_names = [f.replace('bam', 'bed') for f in datafiles]
size_selected_small = [f.replace('bam', 'small.bed') for f in datafiles]
size_selected_med = [f.replace('bam', 'med.bed') for f in datafiles]
size_selected_big = [f.replace('bam', 'big.bed') for f in datafiles]

#generate file names for length analysis
lengths_names = [f.replace('bam', 'lengths') for f in datafiles]

#generate bed files with bam_to_bed tool (makes bed12 format)
for i in range(len(datafiles)):
    temp_bed = BedTool(datafiles[i]).bam_to_bed(bedpe=True).to_dataframe()

    #need to strip out start and end position of whole insert (bed12 is both reads)
    #column names actually represent <chrom>, <start of insert>, <end of insert>
    temp_bed_stripped = temp_bed.iloc[:, [0, 1, 5]].sort_values(
        by=['chrom', 'start', 'strand'])

    #calculate insert size as column 4 and save file with bed_name
    temp_bed_stripped[
        'length'] = temp_bed_stripped['strand'] - temp_bed_stripped['start']

    temp_bed_stripped.to_csv(bed_names[i], sep="\t", header=False, index=False)

    #analyze lengths of inserts
    temp_lengths = temp_bed_stripped.groupby(by=['length'])['length'].count()
Example #28
def test_dna_dims_order_1_from_subset_dataframe(tmpdir):
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    order = 1
    data_path = pkg_resources.resource_filename('janggu', 'resources/')
    bed_merged = os.path.join(data_path, 'sample.gtf')
    refgenome = os.path.join(data_path, 'sample_genome.fa')

    roi = pandas.read_csv(
        bed_merged,
        sep='\t',
        header=None,
        usecols=[0, 2, 3, 4, 5, 6],
        skiprows=2,
        names=['chrom', 'name', 'start', 'end', 'score', 'strand'])
    roi.start -= 1
    print(roi)

    data = Bioseq.create_from_refgenome('train',
                                        refgenome=refgenome,
                                        roi=roi,
                                        storage='ndarray',
                                        store_whole_genome=True,
                                        order=order)

    np.testing.assert_equal(data[0], data[data.gindexer[0]])
    assert len(data.garray.handle) == 2

    # for order 1
    assert len(data) == 2
    assert data.shape == (2, 10000, 1, 4)
    assert data[:].sum() == 20000

    roi = BedTool(bed_merged)
    data = Bioseq.create_from_refgenome('train',
                                        refgenome=refgenome,
                                        roi=roi,
                                        storage='ndarray',
                                        store_whole_genome=True,
                                        order=order)

    np.testing.assert_equal(data[0], data[data.gindexer[0]])
    assert len(data.garray.handle) == 2

    # for order 1
    assert len(data) == 2
    assert data.shape == (2, 10000, 1, 4)
    assert data[:].sum() == 20000

    roi = [iv for iv in BedTool(bed_merged)]
    data = Bioseq.create_from_refgenome('train',
                                        refgenome=refgenome,
                                        roi=roi,
                                        storage='ndarray',
                                        store_whole_genome=True,
                                        order=order)

    np.testing.assert_equal(data[0], data[data.gindexer[0]])
    assert len(data.garray.handle) == 2

    # for order 1
    assert len(data) == 2
    assert data.shape == (2, 10000, 1, 4)
    assert data[:].sum() == 20000
Example #29
def main(ME_centric,
         bed12,
         U2_GTAG_5_file,
         U2_GTAG_3_file,
         phylop,
         ME_len,
         ME_DB=False):

    n = 100

    min_intron_lenght = 80

    if phylop != "NA":
        phylop_bw = pyBigWig.open(phylop)

    U2_GTAG_5 = PWM_to_dict(U2_GTAG_5_file)
    U2_GTAG_3 = PWM_to_dict(U2_GTAG_3_file)

    U2_GTAG_5_max_score = 0
    U2_GTAG_3_max_score = 0

    for index in range(13):
        U2_GTAG_5_max_score += max(U2_GTAG_5['A'][index],
                                   U2_GTAG_5['C'][index],
                                   U2_GTAG_5['T'][index],
                                   U2_GTAG_5['G'][index])

    for index in range(17):
        U2_GTAG_3_max_score += max(U2_GTAG_3['A'][index],
                                   U2_GTAG_3['C'][index],
                                   U2_GTAG_3['T'][index],
                                   U2_GTAG_3['G'][index])

    TOTAL_U2_max_score = U2_GTAG_5_max_score + U2_GTAG_3_max_score

    found_ME = set([])
    ME_chroms = set([])

    for row in csv.reader(open(ME_centric), delimiter='\t'):

        ME, transcript, sum_total_coverage, total_SJs, total_coverages, len_micro_exon_seq_found, micro_exon_seq_found, total_number_of_micro_exons_matches, U2_scores, mean_conservations, P_MEs, total_ME = row

        ME_strand, ME_start, ME_end = ME.split("_")[-3:]
        ME_chrom = "_".join(ME.split("_")[:-3])

        found_ME.add(ME)
        ME_chroms.add(ME_chrom)

    introns = set([])

    # A microexon can be derived from more than one transcript; the idea is to
    # collapse the transcripts per microexon.
    non_detected_ME = defaultdict(list)

    SJ_start_seqs = {}
    SJ_end_seqs = {}

    for row in csv.reader(open(bed12), delimiter='\t'):

        blocksizes = list(map(int, row[10].strip(",").split(",")))
        qstarts = list(map(int, row[11].strip(",").split(",")))

        start = int(row[1])
        end = int(row[2])
        strand = row[5]
        bn = int(row[9])
        chrom = row[0]
        transcript = row[3]

        f_seq = ""
        r_seq = ""

        if chrom in Genome:

            for q1, q2, b1 in zip(qstarts, qstarts[1:], blocksizes):

                istart = start + q1 + b1
                iend = start + q2

                SJ_ID = transcript + str(istart)

                intron = " ".join(
                    [chrom, str(istart),
                     str(iend), "SJ", "0", strand])

                #if chrom in ME_chroms:

                introns.add(intron)

                # Indexing tag library

                estart = start + q1
                eend = start + q1 + b1

                f_seq += str(Genome[chrom][estart:eend])

                if (chrom, eend) in SJ_start_seqs:

                    # keep the longest upstream flanking sequence seen so far
                    if len(f_seq[-100:]) > len(SJ_start_seqs[(chrom, eend)]):

                        SJ_start_seqs[(chrom, eend)] = f_seq[-100:]

                else:

                    SJ_start_seqs[(chrom, eend)] = f_seq[-100:]

            for q1, b1 in zip(qstarts[::-1], blocksizes[::-1]):

                estart = start + q1
                eend = start + q1 + b1

                r_seq = str(Genome[chrom][estart:eend]) + r_seq

                if (chrom, estart) in SJ_end_seqs:

                    # keep the longest downstream flanking sequence seen so far
                    if len(r_seq[:100]) > len(SJ_end_seqs[(chrom, estart)]):

                        SJ_end_seqs[(chrom, estart)] = r_seq[:100]

                else:

                    SJ_end_seqs[(chrom, estart)] = r_seq[:100]

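            # Slide over consecutive exon triplets: the middle exon is a
            # microexon candidate if it is short enough and flanked by the
            # canonical AG..GT splice signals.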
            for q1, q2, q3, b1, b2, b3 in zip(qstarts, qstarts[1:],
                                              qstarts[2:], blocksizes,
                                              blocksizes[1:], blocksizes[2:]):

                estart = start + q2
                eend = start + q2 + b2
                elength = eend - estart
                exon = "_".join([chrom, strand, str(estart), str(eend)])

                SJ_start = start + q1 + b1
                SJ_end = start + q3
                ME_intron = " ".join(
                    [chrom,
                     str(SJ_start),
                     str(SJ_end), "SJ", "0", strand])

                dn = Genome[chrom][(estart -
                                    2):estart] + Genome[chrom][eend:(eend + 2)]

                if strand == "-":
                    dn = dn.reverse_complement()

                dn = str(dn).upper()

                if elength <= ME_len and dn == "AGGT" and exon not in found_ME:

                    #if chrom in ME_chroms:

                    introns.add(ME_intron)

                    non_detected_ME[(chrom, estart, eend, strand,
                                     elength)].append(transcript)

    ##### Microexon database ######

    if ME_DB:

        for row in csv.reader(open(ME_DB), delimiter='\t'):

            if len(row) == 12:

                blocksizes = list(map(int, row[10].strip(",").split(",")))
                qstarts = list(map(int, row[11].strip(",").split(",")))

                start = int(row[1])
                end = int(row[2])
                strand = row[5]
                bn = int(row[9])
                chrom = row[0]

                if chrom in Genome:

                    for q1, q2, q3, b1, b2, b3 in zip(qstarts, qstarts[1:],
                                                      qstarts[2:], blocksizes,
                                                      blocksizes[1:],
                                                      blocksizes[2:]):

                        estart = start + q2
                        eend = start + q2 + b2
                        elength = eend - estart
                        exon = "_".join(
                            [chrom, strand,
                             str(estart),
                             str(eend)])
                        transcript = row[3]

                        SJ_start = start + q1 + b1
                        SJ_end = start + q3
                        ME_intron = " ".join([
                            chrom,
                            str(SJ_start),
                            str(SJ_end), "SJ", "0", strand
                        ])

                        dn = Genome[chrom][
                            (estart - 2):estart] + Genome[chrom][eend:(eend +
                                                                       2)]

                        if strand == "-":
                            dn = dn.reverse_complement()

                        dn = str(dn).upper()

                        if elength <= ME_len and dn == "AGGT" and exon not in found_ME:

                            #introns.add(ME_intron)

                            non_detected_ME[(chrom, estart, eend, strand,
                                             elength)].append(transcript)

    introns_str = "\n".join(list(introns))

    intron_bed = BedTool(introns_str, from_string=True)
    intron_bed = intron_bed.sort()

    TOTAL_SJ_starts = set([])
    TOTAL_SJ_ends = set([])

    with open('data/ME_canonical_SJ_tags.DB.fa',
              'w') as out_tags, open('data/DB.ME_centric',
                                     'w') as out_ME_centric:

        for ME_info, transcripts in non_detected_ME.items():

            chrom, estart, eend, strand, elength = ME_info

            transcript = transcripts[0]

            #ME = "_".join([chrom, str(estart), strand, str(eend)])
            ME = "_".join([chrom, strand, str(estart), str(eend)])

            # the AG..GT boundary and novelty checks were already applied when
            # populating non_detected_ME, so only the length filter is repeated here
            if elength <= ME_len:

                if phylop == "NA":

                    mean_conservation = 0

                else:

                    try:
                        mean_conservation = phylop_bw.stats(chrom,
                                                            estart - 2,
                                                            eend + 2,
                                                            type="mean")[0]
                    except RuntimeError:
                        mean_conservation = 0

                    if mean_conservation is None:
                        mean_conservation = 0

                ME5 = str(Genome[chrom][estart - 14:estart + 3]).upper()
                ME3 = str(Genome[chrom][eend - 3:eend + 10]).upper()

                micro_exon_seq_found = str(Genome[chrom][estart:eend]).upper()

                if strand == "-":

                    ME5 = str(Genome[chrom][eend - 3:eend +
                                            14].reverse_complement()).upper()
                    ME3 = str(Genome[chrom][estart - 10:estart +
                                            3].reverse_complement()).upper()

                    micro_exon_seq_found = str(
                        Genome[chrom]
                        [estart:eend].reverse_complement()).upper()

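                # Score the acceptor-side (17 nt) and donor-side (13 nt)
                # contexts against the U2 PWMs; the total is expressed as a
                # percentage of the maximum attainable score.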
                U2_score = 0

                for i, N in enumerate(ME5):
                    U2_score += U2_GTAG_3[N][i]

                for i, N in enumerate(ME3):
                    U2_score += U2_GTAG_5[N][i]

                U2_score = percent(U2_score, TOTAL_U2_max_score)

                ME_bed = BedTool(" ".join(
                    [chrom,
                     str(estart),
                     str(eend - 1), "ME", "0", strand]),
                                 from_string=True)

                SJs_bed = intron_bed.intersect(ME_bed,
                                               wa=True,
                                               s=True,
                                               F=1,
                                               nonamecheck=True)

                SJs = set([])

                SJ_starts = []
                SJ_ends = []

                if len(SJs_bed) != 0:

                    for sj in SJs_bed:

                        SJ_chrom, SJ_start, SJ_end, ID, score, SJ_strand = str(
                            sj).strip("\n").split("\t")
                        SJ = SJ_chrom + ":" + SJ_start + SJ_strand + SJ_end

                        SJ_starts.append(int(SJ_start))
                        SJ_ends.append(int(SJ_end))

                        SJs.add(SJ)

                        TOTAL_SJ_starts.add((chrom, SJ_start))
                        TOTAL_SJ_ends.add((chrom, SJ_end))

                        ### TAG creation
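                        # Tag = up to 100 bp of upstream exon + microexon +
                        # up to 100 bp of downstream exon; tag_pos records the
                        # flank lengths around the microexon sequence (order
                        # reversed on the minus strand).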

                        UP_TAG = SJ_start_seqs[(SJ_chrom, int(SJ_start))]
                        DOWN_TAG = SJ_end_seqs[(SJ_chrom, int(SJ_end))]

                        ME_TAG = UP_TAG + Genome[chrom][estart:eend] + DOWN_TAG

                        tag_pos = "_".join(
                            map(str, [
                                len(UP_TAG), micro_exon_seq_found,
                                len(DOWN_TAG)
                            ]))

                        if strand == "-":

                            ME_TAG = ME_TAG.reverse_complement()

                            tag_pos = "_".join(
                                map(str, [
                                    len(UP_TAG), micro_exon_seq_found,
                                    len(DOWN_TAG)
                                ][::-1]))

                        ME_TAG = str(ME_TAG).upper()

                        ME_TAG_ID = chrom + ":" + "".join(
                            [str(estart), strand,
                             str(eend)])

                        out_tags.write(">" +
                                       "|".join([SJ, transcript, tag_pos]) +
                                       "\n")
                        out_tags.write(ME_TAG + "\n")

                        # print ">" + "|".join([ ME_TAG_ID, transcript,  tag_pos ])
                        # print ME_TAG

                    total_SJs = ",".join(SJs)

                    min_intron_seq = str(
                        Genome[chrom][max(SJ_starts):min(SJ_ends)]).upper()

                    if strand == "-":

                        min_intron_seq = str(
                            Genome[chrom][max(SJ_starts):min(SJ_ends)].
                            reverse_complement()).upper()

                    total_number_of_micro_exons_matches = min_intron_seq.count(
                        "AG" + micro_exon_seq_found + "GT")

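                    # P_ME approximates the probability of finding
                    # AG + microexon + GT at least once by chance within the
                    # minimal intron, treating each position as an independent
                    # trial.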
                    P_ME = 1 - (
                        1 -
                        (float(1) / float(4**len(micro_exon_seq_found) + 4))
                    )**(len(min_intron_seq) - (len(micro_exon_seq_found) + 4))

                    info = ME, transcript, 0, total_SJs, 0, elength, micro_exon_seq_found, total_number_of_micro_exons_matches, U2_score, mean_conservation, P_ME, "|".join(
                        map(str, [ME, U2_score, mean_conservation]))

                    out_ME_centric.write("\t".join(map(str, info)) + "\n")
Exemplo n.º 30
0
def test_janggu_variant_streamer_order_1_revcomp(tmpdir):
    """Test that a minus-strand annotation yields reverse-complemented
    variant encodings (order 1)."""
    os.environ['JANGGU_OUTPUT'] = tmpdir.strpath
    data_path = pkg_resources.resource_filename('janggu', 'resources/')

    order = 1

    refgenome = os.path.join(data_path, 'sample_genome.fa')
    vcffile = os.path.join(data_path, 'sample.vcf')

    dna = Bioseq.create_from_refgenome('dna',
                                       refgenome=refgenome,
                                       storage='ndarray',
                                       binsize=50,
                                       store_whole_genome=True,
                                       order=order)

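    # Minus-strand annotation; the second streamer below should report
    # overlapping variants as the reverse complement of the first run.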
    annot = BedTool([Interval('chr2', 110, 130, '-')])

    # even binsize
    vcf = VariantStreamer(dna, vcffile, binsize=10, batch_size=1)
    it_vcf = iter(vcf.flow())
    next(it_vcf)
    # C to T
    #print(names, chroms, poss, ra, aa)
    #print(reference)
    #print(alternative)
    #assert names[0] == 'refmismatch'
    #np.testing.assert_equal(reference, alternative)
    #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))

    next(it_vcf)
    # C to T
    #print(names, chroms, poss, ra, aa)
    #print(reference)
    #print(alternative)
    #np.testing.assert_equal(reference[0,4,0,:], np.array([0,1,0,0]))
    #np.testing.assert_equal(alternative[0,4,0,:], np.array([0,0,0,1]))

    names, chroms, poss, ra, aa, reference, alternative = next(it_vcf)
    # T to C
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
    #    np.testing.assert_equal(reference[0,4,0,:], np.array([0,0,0,1]))
    #    np.testing.assert_equal(alternative[0,4,0,:], np.array([0,1,0,0]))

    # even binsize
    vcf = VariantStreamer(dna,
                          vcffile,
                          binsize=10,
                          batch_size=1,
                          annotation=annot)
    it_vcf = iter(vcf.flow())
    next(it_vcf)
    # C to T

    next(it_vcf)
    # C to T

    names, chroms, poss, ra, aa, reference2, alternative2 = next(it_vcf)
    # T to C
    print(names, chroms, poss, ra, aa)
    print(reference)
    print(alternative)
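    # With the minus-strand annotation the one-hot encodings should be the
    # reverse complement of the unannotated run: flip both the sequence axis
    # and the nucleotide channel axis.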
    np.testing.assert_equal(reference, reference2[:, ::-1, :, ::-1])
    np.testing.assert_equal(alternative, alternative2[:, ::-1, :, ::-1])