Example #1
def load_annos(args):
    """
    Populate a dictionary of Tabixfile handles for
    each annotation file.  Other modules can then
    access a given handle and fetch data from it
    as follows:

    dbsnp_handle = annotations.annos['dbsnp']
    hits = dbsnp_handle.fetch(chrom, start, end)
    """
    anno_files = get_anno_files(args)
    for anno in anno_files:
        try:
            # .gz denotes Tabix files.
            if anno_files[anno].endswith(".gz"):
                if anno == "clinvar":
                    annos[anno] = pysam.Tabixfile(anno_files[anno],
                                                  encoding='utf8')
                else:
                    annos[anno] = pysam.Tabixfile(anno_files[anno])
            # .bw denotes BigWig files.
            elif anno_files[anno].endswith(".bw"):
                from bx.bbi.bigwig_file import BigWigFile
                annos[anno] = BigWigFile(open(anno_files[anno]))

        except IOError:
            raise IOError("Gemini cannot open this annotation file: %s. \n"
                          "Have you installed the annotation files?  If so, "
                          "have they been moved or deleted? Exiting...\n\n"
                          "For more details:\n\t"
                          "http://gemini.readthedocs.org/en/latest/content/"
                          "#installation.html\#installing-annotation-files\n" %
                          anno_files[anno])
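A minimal consumer sketch, assuming `load_annos(args)` has already run and the module above is importable as `annotations` with a module-level `annos` dict (as the docstring describes):

# Hypothetical consumer of the handles loaded above.
dbsnp_handle = annotations.annos['dbsnp']
for hit in dbsnp_handle.fetch('chr1', 10000, 20000):
    print(hit)  # raw tab-delimited annotation line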
Example #2
    def __init__(self,
                 src_file,
                 title='COSMIC',
                 version=None,
                 gpp_tabix_file=None):
        self.title = title

        super(Cosmic, self).__init__(src_file,
                                     title=self.title,
                                     version=version)

        if gpp_tabix_file is None:
            raise ValueError(
                "A second index by gene protein position must be specified.")

        self.db_genomePos = pysam.Tabixfile(src_file)
        header = next(iter(self.db_genomePos.header))
        self.src_headers = header.lstrip('#').strip().split('\t')

        self.db_geneProteinPos = pysam.Tabixfile(gpp_tabix_file)
        gppHeader = next(iter(self.db_geneProteinPos.header))
        self.gpp_headers = gppHeader.lstrip('#').strip().split('\t')

        self.output_headers = [
            'COSMIC_n_overlapping_mutations',
            'COSMIC_overlapping_mutation_AAs',
            'COSMIC_overlapping_mutation_descriptions',
            'COSMIC_overlapping_primary_sites'
        ]
        self.logger = logging.getLogger(__name__)
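A hedged construction sketch; both paths are placeholders for tabix-indexed COSMIC dumps, since the real files are configured elsewhere:

# Illustrative only: file names are hypothetical.
cosmic = Cosmic('cosmic_by_genome_position.tsv.gz',
                version='v68',
                gpp_tabix_file='cosmic_by_gene_protein_position.tsv.gz')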
Example #3
    def testFetchAll(self):
        remote_file = pysam.Tabixfile(self.url, "r")
        remote_result = list(remote_file.fetch())
        local_file = pysam.Tabixfile(self.local, "r")
        local_result = list(local_file.fetch())

        self.assertEqual(len(remote_result), len(local_result))
        for x, y in zip(remote_result, local_result):
            self.assertEqual(x, y)
Example #4
    def __init__(self, format_fn, exomes=True):

        self.exomes = exomes

        self.tabix_files = {}
        if exomes:
            self.tabix_files['_'] = pysam.Tabixfile(format_fn)
        else:
            for chrom in [str(c) for c in range(1, 23)] + ['X']:
                self.tabix_files[chrom] = pysam.Tabixfile(
                    format_fn.format(chrom))
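Usage sketch; the enclosing class name is not shown in this excerpt, so `FormatSource` below is a stand-in. In per-chromosome mode the path must contain a '{}' placeholder for `str.format`:

# Single exome-wide tabix file:
src = FormatSource('scores.exome.tsv.gz')
# One tabix file per chromosome (scores.chr1.tsv.gz ... scores.chrX.tsv.gz):
src = FormatSource('scores.chr{}.tsv.gz', exomes=False)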
Example #5
    def __init__(self):
        # project_path is assumed to be defined at module level.
        ref_path = '/stor/work/Lambowitz/ref/hg19_ref/genes'
        exons = ref_path + '/gencode.exon.bed.gz'
        self.exons = pysam.Tabixfile(exons)
        transcriptome_peaks = project_path + '/transcriptome/macs2/unfragmented.fwd_peaks_genomics.narrowPeak.gz'
        self.transcriptome_peaks = pysam.Tabixfile(transcriptome_peaks)
        self.bam = pysam.Samfile(
            project_path +
            '/merged_bam/dedup/unfragmented.chrM_filter.dedup.bam')
        self.bed = pysam.Tabixfile(project_path +
                                   '/bed_files/merged_bed/unfragmented.bed.gz')
Example #6
    def __init__(self, vcffile=None):

        self.vcffile = vcffile

        self.filename = os.path.splitext(os.path.basename(str(vcffile)))[0]

        self.header = open('%s/scripts/header.vcf' %
                           (settings.BASE_DIR)).readlines()

        # create the merge folder if it doesn't exist
        if not os.path.exists('merge'):
            os.makedirs('merge')
        # enter inside folder
        os.chdir('merge')

        self.annotation_files = OrderedDict()

        pysam.tabix_index('../snpeff/snpeff.output.vcf', preset='vcf')

        self.annotation_files['snpeff'] = {
            'info': 'EFF',
            'file': pysam.Tabixfile('../snpeff/snpeff.output.vcf.gz',
                                    'r', encoding="utf-8")
        }

        pysam.tabix_index('../vep/vep.output.sorted.vcf', preset='vcf')

        self.annotation_files['vep'] = {
            'info': 'CSQ',
            'file': pysam.Tabixfile('../vep/vep.output.sorted.vcf.gz',
                                    'r', encoding="utf-8")
        }

        pysam.tabix_index('../snpsift/snpsift.final.vcf', preset='vcf')

        self.annotation_files['vartype'] = {
            'info': 'VARTYPE,SNP,MNP,INS,DEL,MIXED,HOM,HET',
            'file': pysam.Tabixfile('../snpsift/snpsift.final.vcf.gz',
                                    'r', encoding="utf-8")
        }

        self.dbsnp = pysam.Tabixfile(settings.dbsnp, 'r', encoding="utf-8")
Example #7
    def testFetchAll(self):
        shutil.copyfile(self.filename_src, self.filename_dst)
        shutil.copyfile(self.index_src, self.index_dst)
        same_basename_file = pysam.Tabixfile(self.filename_src,
                                             "r",
                                             index=self.index_src)
        same_basename_results = list(same_basename_file.fetch())
        diff_index_file = pysam.Tabixfile(self.filename_dst,
                                          "r",
                                          index=self.index_dst)
        diff_index_result = list(diff_index_file.fetch())

        self.assertEqual(len(same_basename_results), len(diff_index_result))
        for x, y in zip(same_basename_results, diff_index_result):
            self.assertEqual(x, y)
Example #8
    def __init__(self, fname=None, fileobj=None, region=None):
        self._bins = {}
        self._bin_list = []
        self._cur_bin_idx = 0
        self._cur_bin_pos = 0
        self._tellpos = 0
        self._total = 0
        self._length = 0
        self.__tabix = None

        self.filename = fname

        if os.path.exists('%s.tbi' % fname):
            self.__tabix = pysam.Tabixfile(fname)

        if fileobj:
            self.__readfile(fileobj)
        elif fname:
            with ngsutils.support.ngs_utils.gzip_opener(fname) as fobj:
                self.__readfile(fobj)
        elif region:
            chrom, startend = region.split(':')
            if '-' in startend:
                start, end = [int(x) for x in startend.split('-')]
            else:
                start = int(startend)
                end = start
            start -= 1

            self.__add_region(BedRegion(chrom, start, end))
        else:
            raise ValueError(
                "Must specify either filename, fileobj, or region")
Example #9
    def plot_genes(self, fn_genes):

        #genes = regions_from_bed(fn_genes,names=True)
        #locs,vals = genes.get_locations_over_interval(self.chr,self.start,self.end)

        max_loc = -1
        y = 0
        last_start, last_end = 0, 0
        #for i in xrange(locs.shape[0]):

        count = 0
        max_count = 10
        #for gene_line in tabix.Tabix(fn_genes).fetch(self.chr,self.start,self.end):
        tbx_genes = pysam.Tabixfile(fn_genes)
        for gene_line in tbx_genes.fetch(self.chr, self.start, self.end):
            sline = gene_line.split()
            start, end, name = int(sline[4]), int(sline[5]), sline[12]
            if start == last_start and end == last_end: continue
            #y=start<max_loc and y-1.5 or 0
            if count == max_count:
                y = 0
                count = 0
            else:
                y -= 1
                count += 1

            f_size = 8
            max_loc = max(max_loc, end + f_size * 1.1 * len(name))
            self.gene_ax.plot([start, end], [y, y], 'g', linewidth=4, alpha=.6)
            self.gene_ax.annotate(name, (end + 100, y),
                                  fontsize=f_size,
                                  horizontalalignment='left')

            last_start, last_end = start, end
Example #10
def getphastcons(kmerpos, phastconsbed):
    #kmerpos = {} # {age : {location : [[chrm, kmerstart, kmerstop, strand]]}}
    phastconsdict = {}  # {age : {location : {binnumber : [phastconsvalue1, phastconsvalue2, ...]}}}
    #Bin1 is 100 bp upstream of kmerstart. For a 4mer, the kmer would be bins 101-104, and bins 105-205 would be 100 bp downstream of kmerstop.
    phastconstabix = pysam.Tabixfile(phastconsbed)
    for age in kmerpos:
        phastconsdict[age] = {}
        for location in kmerpos[age]:
            phastconsdict[age][location] = {}
            for kmer in kmerpos[age][location]:
                chrm = kmer[0]
                kmerstart = int(kmer[1])
                kmerstop = int(kmer[2])
                strand = kmer[3]
                phastconsscores = {}  # {windowbin : score}
                if strand == '+':
                    windowstart = kmerstart - 100
                    windowend = kmerstop + 100
                    try:
                        for bed in phastconstabix.fetch(chrm,
                                                        windowstart,
                                                        windowend,
                                                        parser=pysam.asBed()):
                            windowbin = str(int(bed.start) - windowstart)
                            phastconsscore = float(bed.name)
                            phastconsscores[windowbin] = phastconsscore
                    except ValueError:
                        print('WARNING: problem with {0}:{1}-{2}:{3}.'.format(
                            chrm, kmerstart, kmerstop, strand))

                elif strand == '-':
                    windowstart = kmerstart - 100
                    windowend = kmerstop + 100
                    try:
                        for bed in phastconstabix.fetch(chrm,
                                                        windowstart,
                                                        windowend,
                                                        parser=pysam.asBed()):
                            windowbin = str(windowend - int(bed.start))
                            phastconsscore = float(bed.name)
                            phastconsscores[windowbin] = phastconsscore
                    except ValueError:
                        print('WARNING: problem with {0}:{1}-{2}:{3}.'.format(
                            chrm, kmerstart, kmerstop, strand))

                if len(phastconsscores) > 0:  # any bases in the UTR had phastcons scores
                    for windowbin in phastconsscores:
                        if windowbin not in phastconsdict[age][location]:
                            phastconsdict[age][location][windowbin] = [
                                phastconsscores[windowbin]
                            ]
                        else:
                            phastconsdict[age][location][windowbin].append(
                                phastconsscores[windowbin])

    return phastconsdict
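A minimal invocation sketch for `getphastcons`, using the nested dict layout described in the comments above (the path and coordinates are placeholders):

# {age : {location : [[chrm, kmerstart, kmerstop, strand]]}}
kmerpos = {'ancient': {'UTR3': [['chr1', '1000', '1004', '+']]}}
scores = getphastcons(kmerpos, 'phastcons.bed.gz')  # bgzipped + tabix-indexed BED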
Example #11
def annotate_variants_bool(args, select_cursor, update_cursor):
    """
    Populate a new, user-defined column in the variants
    table with a BOOLEAN indicating whether or not
    overlaps were detected between the variant and the 
    annotation file.
    """
    add_requested_column(args.col_name, update_cursor)

    # For each, use Tabix to detect overlaps with the user-defined
    # annotation file.  Update the variant row with T/F if overlaps found.
    annos = pysam.Tabixfile(args.anno_file)
    select_cursor.execute("SELECT chrom, start, end, variant_id FROM variants")
    for row in select_cursor:
        has_hit = False
        for hit in annos.fetch(str(row['chrom']), int(row['start']), int(row['end'])):
            has_hit = True
            break
        if has_hit:
            update_qry = "UPDATE variants SET " + args.col_name + " = 1 " + \
                         " WHERE variant_id = " + str(row['variant_id'])
        else:
            update_qry = "UPDATE variants SET " + args.col_name + " = 0 " + \
                         " WHERE variant_id = " + str(row['variant_id'])
        update_cursor.execute(update_qry)
Example #12
    def get(self, chrom, position, ref, alt):
        if self.has_chr_prefix and not chrom.startswith('chr'):
            chrom = 'chr' + chrom
        elif not self.has_chr_prefix and chrom.startswith('chr'):
            chrom = chrom[3:]
        if not self.overlaps(chrom, position):
            self.chrom = chrom
            self.start = position
            self.stop = position + self.step_bp
            self.data = dict()
            for f in self.files:
                with pysam.Tabixfile(f, 'r') as tabix:
                    for row in tabix.fetch(self.chrom,
                                           self.start - 1,
                                           self.stop + 1,
                                           parser=pysam.asTuple()):
                        name = ':'.join(row[:4])
                        cadd_raw, cadd_phred = map(float, row[4:6])
                        if name in self.data:
                            if self.data[name][1] < cadd_phred:
                                self.data[name] = (cadd_raw, cadd_phred)
                        else:
                            self.data[name] = (cadd_raw, cadd_phred)
        return self.data.get(':'.join((chrom, str(position), ref, alt)),
                             (None, None))
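Usage sketch, assuming this `get` lives on a window-caching CADD scorer (called `reader` here) whose `files`, `step_bp`, `has_chr_prefix`, and `overlaps` are set up elsewhere in the class:

raw, phred = reader.get('1', 69511, 'A', 'G')
if phred is not None:
    print('CADD raw/phred:', raw, phred)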
Example #13
    def testBasicFetching(self):
        '''Test fetching records with only a gene protein position from an existing tabix file.'''
        genePPFilename = "testdata/small_cosmic_with_gp_and_gpp/small_cosmic_trimmed_for_sorting.txt.tbi.byAA.sorted.tsv.gz"
        tabixFile = pysam.Tabixfile(genePPFilename)
        headers = [
            "Gene_name", "HGNC_ID", "Sample_name", "Primary_site",
            "Site_subtype", "Primary_histology", "Histology_subtype",
            "Mutation_ID", "Mutation_CDS", "Mutation_AA",
            "Mutation_Description", "Mutation_zygosity",
            "Mutation_NCBI36_genome_position",
            "Mutation_GRCh37_genome_position", "Pubmed_PMID", "startAA",
            "endAA"
        ]

        results = tabixFile.fetch(reference="EGFR", start=748, end=749)
        resultDicts = []
        for result in results:
            resultDicts.append(dict(zip(headers, result.strip().split('\t'))))
        self.assertTrue(
            len(resultDicts) == 2,
            "Should have only had two entries, but found: " + str(resultDicts))

        #A1BG	5	ME024T	NS	NS	malignant_melanoma	NS	226401	c.1132G>A	p.D378N	Substitution - Missense	unk		19:58861796-58861796	22622578
        results = tabixFile.fetch(reference="A1BG", start=377, end=378)
        resultDicts = []
        for result in results:
            resultDicts.append(dict(zip(headers, result.strip().split('\t'))))
        self.assertTrue(
            len(resultDicts) == 1,
            "Should have only had one entry, but found: " + str(resultDicts))
Example #14
def load_chromosome(db, chromosome):
    ''' load records for one chromosome from a tabix-indexed database into a dict '''
    import pysam
    db_dict = defaultdict(list)

    f = pysam.Tabixfile(db)
    for line in f.fetch(chromosome):

        chrom, pos = line.split('\t')[0:2]

        # resolve position when record is a deletion
        if len(line.split()[3]) > len(line.split()[4]):
            pos = resolve_db_position(args.input_type, int(pos))

        chrom = chrom.replace('chr', '')
        key = ','.join([chrom, str(pos)])

        if args.flag_with_id:
            value = line.split('\t')[2]
        else:
            value = 'T'

        if value not in db_dict[key]:
            db_dict[key].append(value)

    f.close()
    return db_dict
Example #15
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--genotype",
                        help="Tabix indexed pileup file.",
                        required=True)
    parser.add_argument("--intervals", help="BED file.", required=True)
    parser.add_argument("--padding",
                        type=int,
                        default=10,
                        help="Number of bases to expand intervals, when "
                        "filtering based on adjacent indels [%default]")
    parser.add_argument("--min-distance-to-indels",
                        type=int,
                        default=5,
                        help="Variants closer than this distance from indels "
                        "are filtered [%default].")
    args = parser.parse_args(argv)

    genotype = pysam.Tabixfile(args.genotype)
    with open(args.intervals) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, pysam.asBed())

    for (_, beds) in sorted(intervals.items()):
        for (name, sequence) in build_genes(args, genotype, beds):
            FASTA(name, None, sequence).write(sys.stdout)

    return 0
Example #16
    def setUp(self):

        self.tabix = pysam.Tabixfile(self.filename)
        self.compare = [
            x[:-1].split("\t") for x in gzip.open(self.filename, "rt")
            if not x.startswith("#")
        ]
Example #17
def coverage(bam_paths,
             gtf_path,
             transcript_ids=None,
             verbose=False,
             agg_func=None):

    # Setup record iterator from gtf file.
    gtf_file = pysam.Tabixfile(gtf_path, parser=pysam.asGTF())
    gtf_records = (rec for rec in gtf_file.fetch() if rec.feature == 'exon')

    if transcript_ids is not None:
        transcript_ids = set(transcript_ids)
        gtf_records = (rec for rec in gtf_records
                       if rec['transcript_id'] in transcript_ids)

    if verbose:
        gtf_records = tqdm(gtf_records, leave=False)

    # Build frame.
    rows = _coverage_gen(bam_paths, gtf_records, agg_func=agg_func)
    index_names = ['transcript_id', 'chr', 'start', 'end', 'strand']

    result = pd.DataFrame.from_records(rows,
                                       columns=index_names + list(bam_paths))
    result = result.set_index(index_names)

    return result
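A hedged call sketch for `coverage`; the paths are placeholders, and the GTF must be bgzipped and tabix-indexed because it is opened with `pysam.Tabixfile`:

df = coverage(['sample1.bam', 'sample2.bam'],
              'annotation.gtf.gz',
              transcript_ids=['ENST00000357654'],
              verbose=True)
print(df.head())  # one row per exon, one column per BAM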
Example #18
def parse_tabix_file_subset(tabix_filenames, subset_i, subset_n, record_parser):
    """
    Returns a generator of parsed record objects (as returned by record_parser) for the i'th of n subsets of records
    across all the given tabix file(s). The records are split by file and by contig within each file, with 1/n of all
    contigs from all files being assigned to the i'th subset.

    Args:
        tabix_filenames: a list of one or more tabix-indexed files. These will be opened using pysam.Tabixfile
        subset_i: zero-based number
        subset_n: total number of subsets
        record_parser: a function that takes a file-like object and returns a generator of parsed records
    """
    start_time = time.time()
    open_tabix_files = [pysam.Tabixfile(tabix_filename) for tabix_filename in tabix_filenames]
    tabix_file_contig_pairs = [(tabix_file, contig) for tabix_file in open_tabix_files for contig in tabix_file.contigs]
    tabix_file_contig_subset = tabix_file_contig_pairs[subset_i : : subset_n]  # get every n'th tabix_file/contig pair
    short_filenames = ", ".join(map(os.path.basename, tabix_filenames))
    num_file_contig_pairs = len(tabix_file_contig_subset)
    print(("Loading subset %(subset_i)s of %(subset_n)s total: %(num_file_contig_pairs)s contigs from "
           "%(short_filenames)s") % locals())
    counter = 0
    for tabix_file, contig in tabix_file_contig_subset:
        header_iterator = tabix_file.header
        records_iterator = tabix_file.fetch(contig, 0, 10**9, multiple_iterators=True)
        for parsed_record in record_parser(itertools.chain(header_iterator, records_iterator)):
            counter += 1
            yield parsed_record

            if counter % 100000 == 0:
                seconds_elapsed = int(time.time()-start_time)
                print(("Loaded %(counter)s records from subset %(subset_i)s of %(subset_n)s from %(short_filenames)s "
                       "(%(seconds_elapsed)s seconds)") % locals())

    print("Finished loading subset %(subset_i)s from  %(short_filenames)s (%(counter)s records)" % locals())
Example #19
    def __init__(self, tabix_file_name, **kwargs):
        '''
        wrapped in DBI.init(filename, "tabix")
        '''
        self.tabix_file_name = tabix_file_name
        self.dict = kwargs
        try:
            self.data = pysam.Tabixfile(tabix_file_name)
        except Exception:
            print("WARNING: Can't init the tabix file", tabix_file_name,
                  file=sys.stderr)
        self.header = None
        if kwargs.get("header") is True:
            f = TableIO.parse(tabix_file_name)
            h = next(f)
            self.header = [x.strip() for x in h]
            f.close()
        elif isinstance(kwargs.get("header"), list):
            self.header = kwargs["header"]
        elif isinstance(kwargs.get("header"), str):
            fh = TableIO.parse(kwargs["header"])
            self.header = next(fh)
            # print(self.header, file=sys.stderr)
        self.tabix_format = "simple"
        if "tabix" in self.dict:
            self.tabix_format = self.dict["tabix"]
Example #20
def _annotate_variants(args, conn, metadata, get_val_fn, col_names=None, col_types=None, col_ops=None):
    """Generalized annotation of variants with a new column.

    get_val_fn takes a list of annotations in a region and returns
    the value for that region to update the database with.

    Separates selection and identification of values from update,
    to avoid concurrent database access errors from sqlite, especially on
    NFS systems. The retained to_update list is small, but batching
    could help if memory issues emerge.
    """
    # For each variant, use Tabix to detect overlaps with the user-defined
    # annotation file, then update the variant row with the derived value(s).
    anno = pysam.Tabixfile(args.anno_file)
    naming = guess_contig_naming(anno)
    cursor = conn.bind.connect()
    add_requested_columns(args, cursor, col_names, col_types)
    conn.commit()
    cursor.close()

    conn, metadata = database.get_session_metadata(str(conn.bind.url))
    cursor = conn.bind.connect()

    last_id = 0
    current_id = 0
    total = 0
    CHUNK_SIZE = 100000
    to_update = []

    select_res = cursor.execution_options(stream_results=True).execute('''SELECT chrom, start, end, ref, alt, variant_id FROM variants''')
    while True:
        for row in select_res.fetchmany(CHUNK_SIZE):

            # update_data starts out as a list of the values that should
            # be used to populate the new columns for the current row.
            # Prefer no pysam parsing over tuple parsing to work around bug in pysam 0.8.0
            # https://github.com/pysam-developers/pysam/pull/44
            if args.anno_file.endswith(('.vcf', '.vcf.gz')):
                update_data = get_val_fn(annotations_in_vcf(row, anno, None, naming, args.region_only, True))
            else:
                update_data = get_val_fn(annotations_in_region(row, anno, None, naming))
            #update_data = get_val_fn(annotations_in_region(row, anno, "tuple", naming))
            # were there any hits for this row?
            if len(update_data) > 0:
                # we add the primary key to update_data for the
                # where clause in the SQL UPDATE statement.
                update_data.append(str(row["variant_id"]))
                to_update.append(tuple(update_data))

            current_id = row["variant_id"]

        if current_id <= last_id:
            break
        elif len(to_update) > 0:
            _update_variants(metadata, to_update, col_names, cursor)

            total += len(to_update)
            print("updated", total, "variants")
            last_id = current_id
        to_update = []
Example #21
def annotate_variants_list(args, select_cursor, update_cursor):
    """
    Populate a new, user-defined column in the variants
    table with a INTEGER indicating the count of overlaps
    between the variant and the 
    annotation file.
    """
    add_requested_column(args.col_name, update_cursor)

    # For each, use Tabix to count overlaps with the user-defined
    # annotation file.  Update the variant row with the count.
    annos = pysam.Tabixfile(args.anno_file)
    select_cursor.execute("SELECT chrom, start, end, variant_id FROM variants")
    for row in select_cursor:
        hit_list = []
        for hit in annos.fetch(str(row['chrom']), int(row['start']), int(row['end']),
                               parser=pysam.asTuple()):
            try:
                hit_list.append(hit[int(args.col_extract) - 1])
            except IndexError:
                sys.exit("Column " + args.col_extract + " exceeds the number "
                         "of columns in your annotation file. Exiting.")

        hits = ",".join(hit_list)
        if len(hit_list):
            update_qry = "UPDATE variants SET " + args.col_name + " = '" + hits + \
                        "' WHERE variant_id = " + str(row['variant_id'])
        else:
            update_qry = "UPDATE variants SET " + args.col_name + " = NULL" + \
                        " WHERE variant_id = " + str(row['variant_id'])
        update_cursor.execute(update_qry)
Example #22
def SNP_chunhe(inputItem):
    global SNPFileName
    snpMAFvalue = []  # MAF values of the SNP sites falling within this CBS region
    snpNum = 0  # total number of SNP sites
    MAFNum = 0  # number of sites with MAF >= 0.8
    MAFratio = 0  # MAFNum / snpNum

    tb = pysam.Tabixfile(SNPFileName)
    for item in tb.fetch(inputItem[1], int(inputItem[2]), int(inputItem[3])):
        snplist = item.strip().split("\t")
        if int(snplist[1]) in range(int(inputItem[2]), int(inputItem[3]) + 1):
            snpMAFvalue.append(snplist[16])  # column 17 of the pup file holds the MAF value
            if float(snplist[16]) >= 0.8:
                MAFNum += 1
    snpNum = len(snpMAFvalue)
    if snpNum == 0:
        MAFvaluemean = 0
        snpNum = 0
        inputItem.append(str(0))  # append MAF-value-mean
        inputItem.append(str(0))  # append All-SNP-num
        inputItem.append("None")  # append MAF-value-num (>= 0.8)
        inputItem.append("None")  # append MAF-ratio (= MAF-value-num / All-SNP-num)
    else:
        MAFvalue = sum(map(lambda x: float(x), snpMAFvalue))  # sum of all MAF values
        MAFvaluemean = round(MAFvalue / snpNum, 2)  # mean MAF
        MAFratio = round(MAFNum / snpNum, 2)  # fraction of MAF values >= 0.8
        inputItem.append(str(MAFvaluemean))  # append MAF-value-mean
        inputItem.append(str(snpNum))  # append All-SNP-num
        inputItem.append(str(MAFNum))  # append MAF-value-num (>= 0.8)
        inputItem.append(
            str(MAFratio))  # append MAF-ratio (= MAF-value-num / All-SNP-num)
    # print(inputItem)
    return inputItem
Example #23
def ensureIndexed(bedPath, preset="bed", trySorting=True):
    if not bedPath.endswith(".gz"):
        if not os.path.exists(bedPath + ".gz"):
            logging.info("bgzf compressing {}".format(bedPath))
            pysam.tabix_compress(bedPath, bedPath + ".gz")
            if not os.path.exists(bedPath + ".gz"):
                raise Exception(
                    "Failed to create compress {preset} file for {file}; make sure the {preset} file is "
                    "sorted and the directory is writeable".format(
                        preset=preset, file=bedPath))
        bedPath += ".gz"
    if not os.path.exists(bedPath + ".tbi"):
        logging.info("creating tabix index for {}".format(bedPath))
        pysam.tabix_index(bedPath, preset=preset)
        if not os.path.exists(bedPath + ".tbi"):
            raise Exception(
                "Failed to create tabix index file for {file}; make sure the {preset} file is "
                "sorted and the directory is writeable".format(preset=preset,
                                                               file=bedPath))

    line = next(pysam.Tabixfile(bedPath).fetch())
    if len(line.strip().split("\t")) < 6 and preset == "bed":
        raise AnnotationError(
            "BED files need to have at least 6 (tab-delimited) fields (including "
            "chrom, start, end, name, score, strand; score is unused)")
    if len(line.strip().split("\t")) < 9 and preset == "gff":
        raise AnnotationError(
            "GFF/GTF files need to have at least 9 tab-delimited fields")

    return bedPath
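Typical calls, assuming sorted input files in a writable directory (file names are placeholders):

bed_gz = ensureIndexed('regions.bed')             # creates regions.bed.gz and .tbi if missing
gff_gz = ensureIndexed('genes.gff', preset='gff')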
Example #24
def load_contig(contig):
    '''save cadd contig into mongodb collection.
       should be an iterable.
    '''
    # if CADD_INPUT == "exome":
    # CADD_INPUT = exome
    tabix = pysam.Tabixfile(whole_genome)
    src_db = get_src_db()
    target_coll = src_db["cadd"]
    t0 = time.time()
    cnt = 0
    docs = (doc for doc in fetch_generator(tabix, contig))
    doc_list = []
    for doc in docs:
        doc_list.append(doc)
        cnt += 1
        if len(doc_list) == 100:
            target_coll.insert(doc_list,
                               manipulate=False,
                               check_keys=False,
                               w=0)
            doc_list = []
        if cnt % 100000 == 0:
            print(cnt, timesofar(t0))
    if doc_list:
        target_coll.insert(doc_list, manipulate=False, check_keys=False, w=0)
    print("successfully loaded cadd chromosome %s into mongodb" % contig)
    print("total docs: {}; total time: {}".format(cnt, timesofar(t0)))
Example #25
def main(args):
    tbx = pysam.Tabixfile(args.tabix)

    with open(args.table) as l1seq:
        for line in l1seq:

            if line.startswith('UUID'):
                header = line.strip().split()
                header.append(args.name)
                print('\t'.join(header))
                continue

            c = line.strip().split()
            chrom = c[1]
            start = int(c[2])
            end = int(c[3])

            annotations = []

            if chrom in tbx.contigs:
                for rec in tbx.fetch(chrom, start, end):
                    if args.nonref:
                        sfam = c[6]
                        annot = rec.strip().split()
                        if annot[3] == sfam:
                            annotations.append('|'.join(annot))
                    else:
                        annotations.append('|'.join(rec.strip().split()))

            annotations = list(set(annotations))  # uniqify

            if len(annotations) == 0: annotations.append('NA')

            print(line.strip() + '\t' + ','.join(annotations))
Example #26
def main(argv):
    parser = argparse.ArgumentParser(prog="paleomix vcf_to_fasta")
    parser.add_argument("--genotype", help="Tabix indexed VCF file.",
                        required=True)
    parser.add_argument("--intervals", help="BED file.", required=True)
    parser.add_argument("--padding", type=int, default=10,
                        help="Number of bases to expand intervals, when "
                             "checking for adjacent indels [%default]")
    parser.add_argument("--whole-codon-indels-only",
                        action="store_true", default=False,
                        help="If true, only indels where (length % 3) == 0 "
                             "are retained [%default]")
    parser.add_argument("--ignore-indels",
                        action="store_true", default=False,
                        help="Do not include indels generated FASTA "
                             "sequence [%default].")
    opts = parser.parse_args(argv)

    print("Running buildRegions.py", end="", file=sys.stderr)
    if opts.whole_codon_indels_only:
        print(", assuming sequences represents CDS", end="", file=sys.stderr)
    print(" ...", file=sys.stderr)

    genotype = pysam.Tabixfile(opts.genotype)
    intervals = read_intervals(opts.intervals)
    if intervals is None:
        return 1

    return genotype_genes(opts, intervals, genotype)
Example #27
    def fetch(self, chrom, start, end=None):
        """ fetch records from a Tabix indexed VCF, requires pysam
            if start and end are specified, return iterator over positions
            if end not specified, return individual ``_Call`` at start or None
        """
        if not pysam:
            raise Exception('pysam not available, try "pip install pysam"?')

        if not self.filename:
            raise Exception('Please provide a filename (or a "normal" fsock)')

        if not self._tabix:
            self._tabix = pysam.Tabixfile(self.filename)

        if self._prepend_chr and chrom[:3] == 'chr':
            chrom = chrom[3:]

        # pysam/tabix use 0-based, half-open coordinates, so shift the
        # 1-based start down by one
        start = start - 1

        if end is None:
            self.reader = self._tabix.fetch(chrom, start, start + 1)
            try:
                return self.next()
            except StopIteration:
                return None

        self.reader = self._tabix.fetch(chrom, start, end)
        return self
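A usage sketch based on the docstring above, assuming the PyVCF-style Reader this method belongs to; the file name is a placeholder:

import vcf  # PyVCF

reader = vcf.Reader(filename='calls.vcf.gz')
record = reader.fetch('20', 1110696)                  # single position: one record or None
for record in reader.fetch('20', 1110695, 1230237):   # start and end: iterate positions
    print(record)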
Example #28
class VcfReader:
    """
    Read conveniently from VCF-style files, with the main focus on the chrom, start, ref and alt fields.
    Note that this API uses 1-based coordinates with both start and end included in the interval.
    The pysam API uses 0-based half-open intervals, so we have to convert internally.
    """
    def __init__(self, input_file):
        self.filename = input_file
        self.indexed = False

        if input_file.strip() == "-":
            ifile = sys.stdin
        elif input_file.endswith(".bz2"):
            try:
                ifile = bz2file.BZ2File(input_file, "r", buffering=0)
            except Exception as e:
                raise e
        elif input_file.endswith(".gz") or input_file.endswith(".bgz"):
            # try to open the file with Tabix
            try:
                ifile = pysam.Tabixfile(input_file, parser=pysam.asVCF())
                self.indexed = True
            except Exception:
                try:
                    ifile = gzip.GzipFile(input_file, "r")
                except Exception as e:
                    raise e
Example #29
def process_vcf_slice(tabix_file, chrm, start, stop, position_data):

    tbx = pysam.Tabixfile(tabix_file)
    tbx_lines = tbx.fetch(chrm, start, stop)

    numb_of_seqs = len(position_data._fields[9:])
    alignment = np.zeros((stop - start, numb_of_seqs), np.bytes_)  # np.string0 was an alias of np.bytes_

    # This 'error handling' needs to be rewritten.
    current_data = []
    current_base = None  # guard against an empty fetch below
    if tbx_lines is None:
        return 'error'

    for line in tbx_lines:
        current_base = position_data._make(line.strip().split("\t"))
        base_calls = callSNPs(current_base, numb_of_seqs)
        current_data.append(base_calls.copy())

    alignment = np.array(current_data)
    inform_sites = count_informative_sites(alignment)
    
    if current_base is None:
        return 'error'
    else:
        taxa = current_base._fields[9:]
        info = "tree 'chrm={0},start={1},stop={2},inform_sites={3}':".format(current_base.CHROM, start, stop, inform_sites)
        oneliner = array2OnelinerAlignment(info, taxa, alignment.transpose())

    if ":" in oneliner and oneliner[-1] == ';': # this prevents bad alignments from getting printed
        return oneliner
    else:
        return 'error'
Example #30
def get_vcf_lines(refvcf, pos_buffer, chrom, pos):
    startpos = int(pos) - int(pos_buffer)
    endpos = int(pos) + int(pos_buffer)
    tabixfile = pysam.Tabixfile(refvcf)
    vcfline_generator = tabixfile.fetch(chrom, startpos, endpos)
    lines = list(vcfline_generator)
    return lines
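Illustrative call; the VCF must be bgzipped and tabix-indexed, and the path and coordinates are placeholders:

lines = get_vcf_lines('dbsnp.vcf.gz', 50, 'chr7', 55249071)
print(len(lines), 'records within +/- 50 bp')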