Example #1
def set_1bp_list():
	# Build one list per chromosome with an empty-string slot for every base
	# position (indices 0..length inclusive), so positions can be used 1-based.
	chromLens = genutils.read_chrom_len(chromLenFile)
	counts = {}
	for chromName in chromLens:
		counts[chromName] = []
#		if chromName != "chr1":
#			continue
		for i in range(0, chromLens[chromName] + 1):
			counts[chromName].append("")
	return counts
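# Usage sketch (illustration only, not part of the original script): assumes
# chromLenFile points to a valid .fai index and that genutils.read_chrom_len
# returns a {chromosome name: length} dict.
if __name__ == '__main__':
	counts = set_1bp_list()
	counts['chr20'][1000000] = '0|1'  # tag a 1-based position, e.g. with a phased genotype
	print 'tagged positions on chr20:', sum(1 for x in counts['chr20'] if x != '')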
Example #2
def set_1bp_list():
    chromLens = genutils.read_chrom_len(chromLenFile)
    counts = {}
    for chromName in chromLens:
        counts[chromName] = []
        # if chromName != "chr1":
        #     continue
        for i in range(0, chromLens[chromName] + 1):
            counts[chromName].append("")
    return counts
Example #3
# Used for NA19240

import argparse
import os
import signal
import glob
import pandas as pd
import numpy as np
import math
from NGS_utils import *
import pickle
import subprocess
import genutils

chromLenFile = '/home/jmkidd/kidd-lab/genomes/hg19/hg19-EBV-fos-ecoli/hg19.fa.fai'
chromLens = genutils.read_chrom_len(chromLenFile)
chromOrder = get_chromOrder("human")


def create_snp_list(all_var_file):
    # Read a tab-separated variant file; the first column is the chromosome name.
    f = open(all_var_file, "r")
    snp_pos = []
    snp_geno = []
    snp_phase = []
    for line in f:
        line = line.strip().split("\t")
        chr = line[0]
        # if chr != "chr1" and chr != "chr2":
        #     break
        # CHROM is expected to be set at module level (e.g. parsed from the
        # command line with argparse); skip variants on chromosomes that precede it.
        if chr != CHROM and chromOrder[chr] < chromOrder[CHROM]:
            continue
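# Illustration only (the original function is truncated here): a minimal sketch
# of the chromosome-order filter used above, assuming chromOrder maps each
# chromosome name to its rank in the reference index, e.g. {"chr1": 0, "chr2": 1, ...}.
def keep_variant_line(fields, target_chrom, chrom_order):
    # fields is a tab-split variant line whose first column is the chromosome name
    chrom = fields[0]
    return chrom_order[chrom] >= chrom_order[target_chrom]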
Example #4
		c = l.split()[0]
		chromOrder[c] = i
	return chromOrder

def get_chromList():
	# Return chromosome names in the order they appear in the reference .fai index
#	chromLenFile = '/home/jmkidd/kidd-lab/genomes/hg19/hg19-EBV-fos-ecoli/hg19.fa.fai'
	chromLenFile = '/home/jmkidd/kidd-lab/genomes/hg18/genome-index/hg18.fa.fai'
	chr_list=[]
	inFile = open(chromLenFile,'r')
	for i,l in enumerate(inFile):
		l = l.rstrip()
		c = l.split()[0]
		chr_list.append(c)
	return chr_list
chromLenFile = '/home/jmkidd/kidd-lab/genomes/hg19/hg19-EBV-fos-ecoli/hg19.fa.fai'
chromLens = genutils.read_chrom_len(chromLenFile)

class BedIterator:
	# Iterate over a BED-style file line by line, yielding the whitespace-split
	# fields of each line as a tuple; requires `import gzip` at the top of the script.
	def __init__(self, filename):
		if filename[-2:] == 'gz':
			self.file = gzip.open(filename, "r")
		else:
			self.file = open(filename, "r")
	def __iter__(self):
		return self
	def __next__(self):
		return self.next()
	def next(self):
		line = next(self.file)
		fields = line.rstrip().split()
		return tuple(fields)
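# Usage sketch (illustration only; bedFileName is a caller-supplied path):
# iterate a BED-style file with BedIterator and unpack the first three columns.
def print_bed_intervals(bedFileName):
	for fields in BedIterator(bedFileName):
		chrom, start, end = fields[0], int(fields[1]), int(fields[2])
		print chrom, start, end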
def find_clone_cov(pool_name, sample_name):
	# For each genome window (1000 unmasked bp), tally how many selected fosmid
	# clones overlap it (counts) and the sum of their depth values (coverage).
	windowFile = '/home/jmkidd/kidd-lab-scratch/shiya-projects/indian_Jacob/script/new_genome_window.txt'
	windowSize = 1000.0  # assume each window is 1000 unmasked bp
	chromLenFile = '/home/jmkidd/kidd-lab/genomes/hg19/hg19-EBV-fos-ecoli/hg19.fa.fai'
	chromLens = genutils.read_chrom_len(chromLenFile)
	coverage = {}
	counts = {}
	pos1 = {}
	pos2 = {}
	print 'Initializing chrom lists'
	for chromName in chromLens:
		counts[chromName] = []
		coverage[chromName] = []
		pos1[chromName]=[]
		pos2[chromName]=[]
	f=open(windowFile,'r')
	for line in f:
		line = line.rstrip()
		line = line.split("\t")
		c = line[0]
		if c=="EBV" or c=="eColiK12" or c=="pCC1FOS":
			continue
		b = int(line[1])
		e = int(line[2])
		counts[c].append(0)
		coverage[c].append(0)
		pos1[c].append(b)
		pos2[c].append(e)
	outDirBase= '/home/jmkidd/kidd-lab/jmkidd-projects/additional-fosmid-pools/results/pools/%s/' %(sample_name)
	poolNames = pool_name.keys()
	poolNames.sort()
	minLen = 10000
	minDP = 0.25
	for pool in poolNames:
		baseDir = outDirBase+pool + '/'
		cloneFileName = baseDir + pool + '.markdup.clone'
		inFileName = cloneFileName + '.sel.%i.%.2f' % (minLen,minDP)
		f = open(inFileName,'r')
		for line in f:
			line = line.rstrip()
			line = line.split()
			if line[0] == 'track':
				continue
			chr = line[0]
			start = int(line[1])
			end = int(line[2])
			cov = float(line[3])
			size = int(line[4])/1000
			# find_pos (defined elsewhere) returns the index of the window containing a position
			find1 = find_pos(start - 1, pos1[chr])
			find2 = find_pos(end, pos2[chr])
			if find1>=0:
				for j in range(find1,find2+1):
					if j < len(counts[chr]):
						counts[chr][j]+=1
						coverage[chr][j]+=cov
					else:
						print line,find1,find2,len(counts[chr])
			else:
				print 'not found',chr,start
	
	'''
	f=open(windowFile,'r')
	i = 0
	outFile = open("wgs_clone_coverage.txt","w")
	old_c="chr1"
	for line in f:
		line = line.rstrip()
		line = line.split("\t")
		c = line[0]
		if c!=old_c:
			i = 0
			old_c = c
		if c=="EBV" or c=="eColiK12" or c=="pCC1FOS":
			continue
		b = int(line[1]) + 1 #make them all 1 based
		e = int(line[2])
		outFile.write('%s\t%i\t%i\t%f\t%i\n' % (c,b,e,coverage[c][i],counts[c][i]))
		i +=1
	dbfile=open('NA19240_wgs_clone_coverage_pickle','wb')
	pickle.dump(counts,dbfile)
	pickle.dump(coverage,dbfile)
	'''
	return counts,coverage
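# Usage sketch (illustration only; the pool names and dict values below are
# hypothetical): pool_name only needs to be keyed by the fosmid pool directory
# names under the sample's results directory, and the returned dicts are
# indexed by chromosome name and window number.
if __name__ == '__main__':
	pools = {'pool_01': 1, 'pool_02': 1}
	counts, coverage = find_clone_cov(pools, 'NA19240')
	print 'chr20 windows with clone support:', sum(1 for n in counts['chr20'] if n > 0)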