Example #1
from pysam import TabixFile

# tf_peak_fnames and RMID_term_name_mapping are module-level lookup tables
# defined elsewhere in the source project.
def classify_peak(peak, sample, motifs):
    # Windows around the peak summit: +/-300 bp (proximal) and +/-2 kb (distal)
    pc_peak = (peak.contig,
               peak.start + peak.summit - 300,
               peak.start + peak.summit + 300)
    nc_peak = (peak.contig,
               peak.start + peak.summit - 2000,
               peak.start + peak.summit + 2000)
    status = []
    for motif in motifs:
        fname = tf_peak_fnames[
            (motif.tf_name, RMID_term_name_mapping[sample])][0]
        fp = TabixFile(fname)
        if peak.contig not in fp.contigs:
            status.append(0)
            continue
        # 1: a TF peak lies within 300 bp of the summit
        pc_peaks = list(fp.fetch(*pc_peak))
        if len(pc_peaks) > 0:
            status.append(1)
            continue
        # -1: nothing within 2 kb either; 0: a peak nearby but not proximal
        nc_peaks = list(fp.fetch(*nc_peak))
        if len(nc_peaks) == 0:
            status.append(-1)
        else:
            status.append(0)
    return status
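
The contigs check above is the usual guard against fetch raising a ValueError for a chromosome missing from the index; a minimal standalone sketch of the same pattern, with a hypothetical file name:

from pysam import TabixFile

fp = TabixFile('tf_peaks.bed.gz')  # hypothetical bgzipped, tabix-indexed BED
region = ('chr1', 1_000_000, 1_001_000)
hits = list(fp.fetch(*region)) if region[0] in fp.contigs else []
fp.close()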
Example #2
import numpy as np
from pysam import TabixFile


def get_interval_data(genes, INT):
    '''
    Get interval data for each gene from the input files listed in INT
    (pairs of (filename, track name)).
    '''
    for fn, name in INT:
        tb = TabixFile(fn)

        for g in genes:

            # Get region for searching replication timing data
            g_len = g.total_length
            midp = round((g.start + g.stop) / 2)
            min_width = 10e3  # search region of at least 10 kb
            if g_len < min_width:
                start = midp - round(min_width / 2)
                stop = midp + round(min_width / 2)
                gstr = '%s:%d-%d' % (g.chrom, start, stop)
            else:
                gstr = '%s:%d-%d' % (g.chrom, g.start, g.stop)

            # Call tabix to get data from the bedGraph
            try:
                it_genes = tb.fetch(gstr)
            except ValueError:  # handle regions where no interval can be made
                g.intervalData[name] = None
                continue
            intData = []
            for itr in it_genes:
                if itr == '': continue
                itr = itr.split('\t')
                intData.append(float(itr[-1]))
            if len(intData) > 0:
                g.intervalData[name] = np.mean(intData)
                continue
            else:
                # Extend the search if no value was found, trying successively
                # wider windows (only those wider than the gene itself)
                extends0 = [50e3, 100e3, 500e3, 1e6]
                extends = [e for e in extends0 if e > g_len]
                found = False
                for e in extends:
                    start = max(1, midp - round(e / 2))
                    stop = midp + round(e / 2)
                    gstr = '%s:%d-%d' % (g.chrom, start, stop)

                    it_genes = tb.fetch(gstr)
                    for itr in it_genes:
                        if itr == '': continue
                        itr = itr.split('\t')
                        intData.append(float(itr[-1]))
                        found = True
                    if found:
                        g.intervalData[name] = np.mean(intData)
                        break

                if not found:
                    g.intervalData[name] = None

    return genes
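
A hedged usage sketch for the function above; the file name and the attribute layout of the gene objects are assumptions:

# Each INT entry pairs a bgzipped, tabix-indexed bedGraph with a track name.
INT = [('replication_timing.bedGraph.gz', 'rep_timing')]
genes = get_interval_data(genes, INT)
print(genes[0].intervalData['rep_timing'])  # mean track value, or None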
Example #3
from pysam import TabixFile, VariantFile

# BUILD_TO_VCF, BUILD_TO_RSID, COORD_REGEX, RSID_REGEX, parse_arguments and
# chrom_to_hgvs are defined elsewhere in the source module.


def main():
    args = parse_arguments()
    print(VariantFile(BUILD_TO_VCF[args.reference_build]).header)
    vcf_file = TabixFile(BUILD_TO_VCF[args.reference_build])
    rsid_file = TabixFile(BUILD_TO_RSID[args.reference_build],
                          index=f'{BUILD_TO_RSID[args.reference_build]}.csi')

    def rsid_to_coordinates(rsid):
        # The lookup table is indexed under a synthetic contig named 'rs' with
        # the rs number as the position, so an rsID resolves via a point query.
        rs_number = int(rsid.replace('rs', ''))
        for row in rsid_file.fetch('rs', rs_number - 1, rs_number):
            chrom, pos = row.split()[2:]
            yield chrom, int(pos)

    for variant in args.variants:
        if COORD_REGEX.match(variant):
            chrom, pos = variant.split(':')
            chrom = chrom_to_hgvs(chrom, reference_build=args.reference_build)
            pos = int(pos)
            for row in vcf_file.fetch(chrom, pos - 1, pos):
                print(row)
        elif RSID_REGEX.match(variant):
            for chrom, pos in rsid_to_coordinates(variant):
                for row in vcf_file.fetch(chrom, pos - 1, pos):
                    print(row)
        else:
            raise RuntimeError('Improperly formatted query')
Example #4
import numpy as np
from pysam import TabixFile


def get_interval_data(regions, INT):
    '''
    Get interval data for each region from the input files listed in INT.
    Computes the mean track value in a window of at least 10 kb around the
    region midpoint, widening the window when no data is found.
    '''
    for fn, name in INT:
        tb = TabixFile(fn)

        for r in regions:

            # Get region for searching replication timing data
            r_len = r.length
            midp = round((r.start + r.stop) / 2)
            min_width = 10e3  # search region of at least 10 kb
            if r_len < min_width:
                start = midp - round(min_width / 2)
                stop = midp + round(min_width / 2)
                rstr = '%s:%d-%d' % (r.chrom, start, stop)
            else:
                rstr = r.region_string

            try:
                it_regions = tb.fetch(rstr)
            except ValueError:  # handle regions where no interval can be made
                r.intervalData[name] = None
                continue
            intData = []
            for rtr in it_regions:
                if rtr == '': continue
                intData.append(float(rtr.split('\t')[-1]))

            if len(intData) > 0:
                r.intervalData[name] = np.mean(intData)
                continue
            else:
                # Extend the search with successively wider windows
                extends = [50e3, 100e3, 500e3, 1e6]
                found = False
                for e in extends:
                    start = max(1, midp - round(e / 2))
                    stop = midp + round(e / 2)
                    rstr = '%s:%d-%d' % (r.chrom, start, stop)

                    it_regions = tb.fetch(rstr)
                    for rtr in it_regions:
                        if rtr == '': continue
                        intData.append(float(rtr.split('\t')[-1]))
                        found = True
                    if found:
                        r.intervalData[name] = np.mean(intData)
                        break

                if not found:
                    r.intervalData[name] = None

    return regions
Example #5
    def __process_chromosome(self, chromosome_queue,
                             tabix_reader: pysam.TabixFile):
        vcf = VCF()  # VCF and SNP are helper classes from the source project
        samples = vcf.get_sample_names(self.vcf_file)
        while True:
            try:
                # non-blocking get so the queue.Empty branch can fire while
                # the producer is still filling the queue
                chromosome, size = chromosome_queue.get_nowait()
            except queue.Empty:
                time.sleep(0.1)
                continue
            if chromosome is None:
                break
            write_header = True

            window_writer = open(
                os.path.join(self.binned_output_folder,
                             chromosome + "_window.csv"), 'w')
            chunks = self.sliding_window_generator(size)
            print("\nScreening: {}".format(chromosome))
            for start_pos, end_pos in chunks:
                records = tabix_reader.fetch(chromosome,
                                             start_pos,
                                             end_pos,
                                             multiple_iterators=True)
                vcf_arr = [SNP(line, samples) for line in list(records)]
                alleles_window_sample_dict = self.determine_alleles(
                    vcf_arr, samples)
                self.__write_window_to_file(alleles_window_sample_dict,
                                            window_writer, chromosome,
                                            start_pos, end_pos, write_header)
                write_header = False
            window_writer.close()
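
A hedged sketch of the producer side this worker expects: chromosome names and sizes are queued, followed by a (None, None) sentinel per worker (the chromosome list here is an assumption):

import multiprocessing

chromosome_queue = multiprocessing.Queue()
for chromosome, size in [('chr1', 248_956_422), ('chr2', 242_193_529)]:
    chromosome_queue.put((chromosome, size))
chromosome_queue.put((None, None))  # sentinel: tells the worker to stop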
Example #6
from pysam import TabixFile


class _ALLC:
    def __init__(self, path, region):
        self.f = TabixFile(path)
        try:
            self.f_region = self.f.fetch(region)
        except ValueError:
            # region is absent from the index; fall back to an empty iterator
            self.f_region = iter(())

    def readline(self):
        return next(self.f_region)

    def close(self):
        self.f.close()
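
A minimal usage sketch, assuming an ALLC-format table compressed with bgzip and indexed with tabix (the path is hypothetical):

allc = _ALLC('sample.allc.tsv.gz', 'chr1')
try:
    line = allc.readline()  # one tab-separated record from the region
except StopIteration:
    line = None  # the region was empty
allc.close()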
Example #7
    def _match_clinvar_one_variant(
            variant: Variant, tabix: TabixFile,
            cols: List[str]) -> Optional[Dict[str, Any]]:
        """Match the variant to the given ClinVar tabix table.

        Args:
            variant: Variant to be matched
            tabix: Tabix-indexed ClinVar table
            cols: All ClinVar columns in the table

        Returns:
            None if no ClinVar match. When matched, returns a `dict` of the clinvar record,
            where the key ``final_clinical_significance`` stores the final clinical significance type
            in :class:`ClinicalSignificance`.
        """
        try:
            # TabixFile.fetch will raise ValueError if the given region is out of bound
            row_iter = tabix.fetch(
                region=f"{variant.chrom}:{variant.start_pos}-{variant.end_pos}"
            )
        except ValueError as e:
            # Do nothing if it's querying for a chromosome not in the ClinVar table
            if "could not create iterator for region" not in e.args[0]:
                logger.opt(
                    exception=e).debug(f"Tabix fetch ClinVar failed: {e}")
            return None

        for row in row_iter:
            record = dict(zip(cols, row.split("\t")))
            if (int(record["start"]) == variant.start_pos
                    and int(record["stop"]) == variant.end_pos
                    and record["alt"] == variant.alt_allele):
                if record["ref"] != variant.ref_allele:
                    logger.warning(
                        f"{variant!r} got a clinvar match but their reference alleles are different: "
                        f"{variant.ref_allele!r} != {record['ref']!r}")
                # Parse the clinical significance of the record
                record[
                    "final_clinical_significance"] = ClinicalSignificance.parse_clinvar_record(
                        record)
                return record
        return None
Example #8
import re

import pandas as pd
from pysam import TabixFile


class ExploreGnomad:
    def __init__(self, gnomad_file, frequency_table):
        self.gnomad = TabixFile(gnomad_file)
        self.frequencies = pd.read_csv(frequency_table, sep="\t", header=None)
        self.frequencies.columns = ["CHR:POS", "REF", "ALT", "AF"]
        self.frequencies[["CHR",
                          "POS"]] = self.frequencies["CHR:POS"].str.split(
                              ":", expand=True)
        self.frequencies["POS"] = self.frequencies["POS"].astype(int)

    def search_position(self, chrom, pos, ref, alt):
        query_lines = self.gnomad.fetch(chrom, pos - 1, pos)
        for variant in query_lines:
            variant_split = variant.split("\t")
            var_ref, var_alt = variant_split[3:5]
            if ref == var_ref and alt == var_alt:
                # INFO is the last column of a sites-only gnomAD VCF line
                info_line = variant_split[-1]
                # note: the pattern needs ';' on both sides, so an AF_nfe
                # entry that is the first or last INFO field will be missed
                match = re.search(";AF_nfe=([0-9.e+\\-]+);", info_line)
                if match:
                    return match.group(1)
        return None

    def search_all(self, output_path):
        nfe_AF = [None] * len(self.frequencies)
        for i, row in self.frequencies.iterrows():
            if i % 1000 == 0:
                print(f"{round(100*i/len(self.frequencies))} % Done")
            nfe_AF[i] = self.search_position(row["CHR"], row["POS"],
                                             row["REF"], row["ALT"])

        self.frequencies["nfe_AF"] = nfe_AF
        self.frequencies.to_csv(
            output_path,
            sep="\t",
            index=False,
            columns=["CHR", "POS", "REF", "ALT", "AF", "nfe_AF"])
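
A hedged driver for the class above; both paths are placeholders, and the frequency table is assumed to be a headerless, tab-separated file with CHR:POS, REF, ALT and AF columns:

explorer = ExploreGnomad('gnomad.genomes.sites.vcf.bgz', 'frequencies.tsv')
explorer.search_all('frequencies_with_nfe.tsv')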
Example #9
            snps[row['SNP']] = row

    with open('DrugInfo.csv') as src:
        drug_info = {row['SNP']: row for row in csv.DictReader(src)}

    with open('okg.ped') as pop_src:
        # mapping: sample id -> population id
        populations = {
            indiv['Individual ID']: indiv['Population']
            for indiv in csv.DictReader(pop_src, delimiter='\t')
        }

    print('Determining genomic coordinates for sequences.')
    f = TabixFile('snps.sorted.txt.gz', parser=asTuple())
    snp_table = {}
    for row in f.fetch():
        _, snp, chrom, pos = row
        if snp in snps or snp in drug_info:
            snp_table[snp] = {'chromosome': chrom, 'pos': int(pos)}
    with open('snps.py', 'w') as dump:
        dump.write(WARNING)
        dump.write('COORDINATES = %s\n' % snp_table)
        dump.write('DATA = %s\n' % snps)
        dump.write('DRUG_INFO = %s\n' % drug_info)
    print('Data written to snps.py')
    print('Determining allele frequencies (using data from 1000 Genomes)')
    genotypes = {snp: snp_data['Code'] for snp, snp_data in snps.items()}
    variants = list(
        ga4gh.search_variants(genotypes, dataset=ga4gh.OKG, repo_id='google'))
    # determine allele frequencies for different populations
    freqs = {
Example #10
import pysam  # for the asGTF parser used below
from pysam import VariantFile
from pysam import TabixFile
from pyfaidx import Fasta

# data files
reference_file = 'S_lycopersicum_chromosomes.2.40.fa'
annotation_file = 'gene_models.gff.gz'
variant_file = 'tomato_snps.bcf'

# load reference
reference = Fasta(reference_file)

# load annotations
annotations = TabixFile(annotation_file)

# load variants
variants = VariantFile(variant_file)

# regions to query
region1 = ("SL2.40ch01", 15000, 21000)
region2 = ("SL2.40ch01", 20000, 70000)

region1_reference = reference[region1[0]][region1[1]: region1[2]]
region1_annotations = [a for a in annotations.fetch(*region1, parser=pysam.asGTF())]
region1_variants = [a for a in variants.fetch(*region1)]

region2_reference = reference[region2[0]][region2[1]: region2[2]]
region2_annotations = [a for a in annotations.fetch(*region2, parser=pysam.asGTF())]
region2_variants = [a for a in variants.fetch(*region2)]
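
Each element yielded through the asGTF parser is a pysam GTFProxy, so fields of the annotation records can be read as attributes; a short example over the results above:

for a in region1_annotations:
    # contig, feature, start and end are standard GTFProxy attributes
    print(a.contig, a.feature, a.start, a.end)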
Example #11
import numpy as np
import pandas as pd
from intake.source.base import DataSource, Schema
from pysam import TabixFile, asTuple


class IndexedBedFile(DataSource):
    name = "indexed_bedfile"
    version = "0.1.0"
    container = "dataframe"
    partition_access = False
    description = "A bgzipped and indexed bedfile"

    def __init__(self, urlpath, include_unmapped=True, metadata=None):
        self._urlpath = urlpath
        self._include_unmapped = include_unmapped
        self._dataset = None
        self._dtype = None
        self._chroms = None
        super(IndexedBedFile, self).__init__(metadata=metadata)

    def _open_dataset(self):
        self._dataset = TabixFile(self._urlpath)

    def _get_schema(self):
        if self._dataset is None:
            self._open_dataset()
        self._chroms = list(self._dataset.contigs)

        rec = next(self._dataset.fetch(self._chroms[0], parser=asTuple()))
        num_fields = len(rec)

        chrom_coord_dtype = np.int64
        dtypes = {
            "chrom": pd.CategorialDtype(self._chroms + ["NULL"], ordered=True),
            "start": chrom_coord_dtype,
            "end": chrom_coord_dtype,
            "name": str,
            "score": np.float32,
            "strand": bool,
        }
        self._dtype = {
            key: dtypes[key]
            for key in list(dtypes.keys())[:num_fields]
        }
        return Schema(
            datashape=None,
            dtype=self._dtype,
            shape=(None, len(self._dtype)),
            npartitions=len(self._chroms),
            extra_metadata={},
        )

    def _get_partition(self, i):
        chrom = self._chroms[i]
        columns = list(self._dtype.keys())
        return pd.DataFrame(list(self._dataset.fetch(chrom, parser=asTuple())),
                            columns=columns).astype(self._dtype)

    def read(self):
        self._load_metadata()
        return pd.concat(
            [self.read_partition(i) for i in range(self.npartitions)],
            ignore_index=True)

    def _close(self):
        # close any files, sockets, etc
        if self._dataset is not None:
            self._dataset.close()
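
A hedged usage sketch, assuming a bgzipped, tabix-indexed BED file at the (hypothetical) path:

src = IndexedBedFile('regions.bed.gz')
df = src.read()  # one partition per contig, concatenated into one DataFrame
print(df.dtypes)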