Exemplo n.º 1
0
    atacBam2Bed.py (-h | --help)
    
Options:
    
    --minMapQ=<minMapQ>  Minimum mapping quality of reads [default: 20]
    --size=<size>        Size of open region around insertion [default: 50]
    --rmDup              Flag to remove duplicate reads
    --help               Output this message
    
"""
import pysam
import collections
from general_python import docopt
from ngs_python.bam import pysamfunc
# Parse the command line against the usage text in the module docstring
args = docopt.docopt(__doc__, version='v1')
# Validate --size: it must parse as an integer and be even
sizeMessage = 'size argument must be an integer divisible by 2'
try:
    args['--size'] = int(args['--size'])
except ValueError:
    raise IOError(sizeMessage)
if args['--size'] % 2 != 0:
    raise IOError(sizeMessage)
# Validate --minMapQ: it must parse as an integer
try:
    args['--minMapQ'] = int(args['--minMapQ'])
except ValueError:
    raise IOError('minMapQ argument must be an integer')
# Tally of processing metrics; keys not yet seen start at zero
counter = collections.defaultdict(int)
# Open input and output files
Exemplo n.º 2
0
Usage:
    meanCoverageIntervals.py <intervals> <outfile> <bam>... 
        [--minmap=<mm>] [--rmdup] [--onebased] [--header]

Options:
    --minmap=<mm>  Minimum mapping quality for read [default: 0].
    --rmdup        Skip duplicate reads in calculating coverage.
    --onebased     Intervals have a one-based start. Otherwise a
        zero-based start is presumed.

'''
# Load required modules
from general_python import docopt
from ngs_python.bam import pysam_coverage
# Extract arguments
args = docopt.docopt(__doc__, version='v1')
# The mapping quality threshold is used numerically downstream
args['--minmap'] = int(args['--minmap'])
# Open interval list file and extract data: the first three tab-separated
# columns of every line are taken as chromosome, start and end
intervalList = []
with open(args['<intervals>']) as intervalFile:
    for lineNo, line in enumerate(intervalFile, 1):
        fields = line.strip().split('\t')
        # Skip blank lines (e.g. a trailing newline at the end of the file)
        # rather than failing on the unpack below
        if fields == ['']:
            continue
        # Report malformed lines with their position instead of an opaque
        # unpacking error
        if len(fields) < 3:
            raise ValueError(
                'line %d of %s has fewer than 3 tab-separated columns' % (
                    lineNo, args['<intervals>']))
        chrom, start, end = fields[:3]
        intervalList.append((chrom, int(start), int(end)))
# Adjust intervals if they are one based: shift each start down by one so
# that all intervals share a zero-based start
if args['--onebased']:
    intervalList = [(x[0], x[1] - 1, x[2]) for x in intervalList]
# Extract mean coverage for intervals across all supplied BAM files
# NOTE(review): the result is presumably a data frame (hence outDF) -
# confirm against pysam_coverage
bamCov = pysam_coverage.multiple_coverage(args['<bam>'])
outDF = bamCov.mean_coverage(intervals=intervalList,
                             map_quality=args['--minmap'],
                             remove_dup=args['--rmdup'])
Exemplo n.º 3
0
'''bam2bedgraph.py

Usage:
    
    bam2bed.py <bam> <bed>

'''
# Import required modules
import pysam
from general_python import docopt
# Parse the command line against the usage text in the module docstring
args = docopt.docopt(__doc__, version='1.0')
# Open the BAM input for reading and the BED output for writing
bamFile = pysam.AlignmentFile(args['<bam>'])
bedFile = open(args['<bed>'], 'w')
# Map the target id reported by the BAM file for each reference back to the
# reference name
chromDict = dict(
    (bamFile.gettid(chrom), chrom) for chrom in bamFile.references
)
# Boolean lookup tables used when formatting output lines: strand symbol
# and read-name suffix (the suffix is selected with read.is_read1)
strandDict = {False: '+', True: '-'}
readDict = {True: '/1', False: '/2'}
# Create output bed
for read in bamFile:
    if read.is_unmapped:
        continue
    bedFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' %(
        chromDict[read.reference_id],
        read.reference_start,
        read.reference_end,
        read.query_name + readDict[read.is_read1],
'''ensembl_name_gtf.py

Usage:
    
    ensembl_name_gtf.py <gtffile> <outfile>

'''
# Import required modules
import collections
import re
from general_python import docopt
# Extract arguments
args = docopt.docopt(__doc__, version='1.0')
# Create regular expressions. Raw strings keep '\s' as a regex escape; in a
# plain string literal it is an invalid string escape that newer Python
# versions warn about.
eRE = re.compile(r'gene_id\s+"(.*?)";')
bRE = re.compile(r'gene_biotype\s+"(.*?)";')
# Read in input file to gene dictionary: gene id -> set of biotypes seen
geneDict = collections.defaultdict(set)
with open(args['<gtffile>'], 'r') as inFile:
    for line in inFile:
        # Lines starting with '#' are skipped
        if line.startswith('#'):
            continue
        # The ninth tab-separated column holds the attribute string
        data = line.strip().split('\t')[8]
        # Both attributes are required: a record lacking either one makes
        # search() return None and the .group call raise AttributeError
        ensembl = eRE.search(data).group(1)
        biotype = bRE.search(data).group(1)
        geneDict[ensembl].add(biotype)
# Create output file
# counter holds three running tallies; only the first, incremented once per
# gene id below, is used in the visible code.
# NOTE(review): the meaning of the other two slots is not visible here -
# confirm against the full script.
counter = [0, 0, 0]
# Mode 'w' creates the output file or truncates an existing one
with open(args['<outfile>'], 'w') as outFile:
    # geneDict maps each gene id to the set of biotypes collected for it, so
    # 'biotype' here is that whole set
    for ensembl, biotype in geneDict.items():
        counter[0] += 1
    --minGene=<minGene>  Minimum significant genes in GO set [default: 3].
    --log2Col=<log2Col>  Column for log2 fold change data. Supplying this
        results in positive and negative fold change genes being considered
        separately.
    --includeCombined    Include combined geneset alongside positive and
        negative gene sets. Only effective with --log2Col argument.
    --onlyAnno           Only consider genes with annotation.
    --noHeader           Results file has no header.
    
"""
# Import required modules
from ngs_python.gtf import gene_conversion
from general_python import docopt

# Parse the command line against the usage text in the module docstring
args = docopt.docopt(__doc__, version="v1")
# Cast the numeric arguments in place, one after another in this order
for argName, argType in (
    ("<geneCol>", int),
    ("<statCol>", int),
    ("<statMax>", float),
    ("--minGO", int),
    ("--maxGO", int),
):
    args[argName] = argType(args[argName])
# --log2Col is optional: it stays None unless a column was supplied
if not (args["--log2Col"] is None):
    args["--log2Col"] = int(args["--log2Col"])
# Load the gene annotation from the supplied gmt file
geneAnno = gene_conversion.parse_gmt(args["<gmt>"])
if isinstance(args["--log2Col"], int):
    # Extract gene list
    allGenes, posGenes, negGenes = gene_conversion.extract_gene_results_posneg(
        results=args["<results>"],
        geneCol=args["<geneCol>"],
        log2Col=args["--log2Col"],