import sys

from vcfDict import vcfDict

if __name__ == "__main__":
    vcfLoc, bpFile, bpOut = sys.argv[1], sys.argv[2], sys.argv[3]

    outFile = open(bpOut, "w")
    vcf = vcfDict(vcfLoc)
    vcf.loadDict()

    with open(bpFile) as f:
        for line in f:
            if line[0] == "p":
                sline = line.split(",")
                windowID = sline[0]
                if vcf.getLine(windowID) != "":  #Site is present
                    outFile.write(line)
                else:
                    outFile.write(line)

    outFile.close()
import sys

from vcfDict import vcfDict


def isDataLine(line):
    """
    Determines if line contains site data
    """
    if len(line) > 1:
        return line[0] != "#"
    return False


if __name__ == "__main__":
    """
    Writes the header of the descendant vcf to the merge file, then writes the
    descendant record for every site that is also present in the ancestor vcf.
    """
    anc_loc, desc_loc, merge_loc = sys.argv[1], sys.argv[2], sys.argv[3]

    desc = vcfDict(desc_loc)
    desc.loadDict()

    merge_file = open(merge_loc, "w")

    #Write header information to merge file
    with open(desc_loc) as desc_file:
        for desc_line in desc_file:
            if not isDataLine(desc_line):
                merge_file.write(desc_line)
            else:
                #data lines start
                break

    with open(anc_loc) as anc_file:
        for anc_line in anc_file:
            if isDataLine(anc_line):
                #Key the ancestor site by "CHROM:POS" and write the matching
                #descendant record when one exists
                anc_line_col = str.split(anc_line)
                anc_line_key = anc_line_col[0] + ":" + anc_line_col[1]
                desc_line = desc.getLine(anc_line_key)
                if desc_line != "":
                    merge_file.write(desc_line)

    merge_file.close()
import sys

from vcfDict import vcfDict


def isDataLine(line):
    """
    Determines if line contains site data
    """
    if len(line) > 1:
        return line[0] != "#"
    return False


if __name__ == "__main__":
    """
    Given a .txt file of locations ("CHROM:POS") and a vcf file, creates a new
    vcf file of sites that match the given locations.
    """
    locations_dir, vcf_dir, out_loc = sys.argv[1], sys.argv[2], sys.argv[3]

    vcf_dict = vcfDict(vcf_dir)
    vcf_dict.loadDict()

    out_file = open(out_loc, "w")

    #Write header information to the output file
    with open(vcf_dir) as vcf_file:
        for vcf_line in vcf_file:
            if not isDataLine(vcf_line):
                out_file.write(vcf_line)
            else:
                #data lines start
                break

    with open(locations_dir) as loc_file:
        for loc in loc_file:
            loc = loc.replace(" ", "")
            loc = loc.strip()  #drop the trailing newline so the key matches
            vcf_line = vcf_dict.getLine(loc)
            if vcf_line != "":
                out_file.write(vcf_line)

    out_file.close()
import sys

from vcfDict import vcfDict


def isDataLine(line):
    """
    Determines if line contains site data
    """
    if len(line) > 1:
        return line[0] != "#"
    return False


if __name__ == "__main__":
    """
    Writes the GP2 record for every site that also appears in the CC3 vcf,
    producing a vcf of the common sites.
    """
    CC3_loc, GP2_loc, common_sites_loc = sys.argv[1], sys.argv[2], sys.argv[3]

    desc = vcfDict(GP2_loc)
    desc.loadDict()

    merge_file = open(common_sites_loc, "w")

    with open(CC3_loc) as anc_file:
        for anc_line in anc_file:
            if isDataLine(anc_line):
                anc_line_col = str.split(anc_line)
                anc_line_key = anc_line_col[0] + ":" + anc_line_col[1]
                desc_line = desc.getLine(anc_line_key)
                if desc_line != "":
                    merge_file.write(desc_line)

    merge_file.close()
import sys

from vcfDict import vcfDict


def sameAltBase(line_a, line_b):
    """
    Returns whether the sites have the same alternate base
    """
    altIdx = 4
    sline_a = str.split(line_a)
    sline_b = str.split(line_b)
    return sline_a[altIdx] == sline_b[altIdx]


if __name__ == "__main__":
    #Get location of both .vcf files and load vcfDicts for each
    file_a, file_b = sys.argv[1], sys.argv[2]
    dict_a = vcfDict(file_a)
    dict_b = vcfDict(file_b)
    dict_a.loadDict()
    dict_b.loadDict()

    #size information for each file before processing
    aSize = dict_a.getSize()
    bSize = dict_b.getSize()

    a_keys = dict_a.getKeys()
    bAnda = 0
    same_alt = 0

    #Check every key in dict_a against dict_b
    #if a site is present .getLine() removes the site after
    for key in a_keys:
        line_b = dict_b.getLine(key)
        if line_b != "":
            bAnda += 1
            if sameAltBase(dict_a.getLine(key), line_b):
                same_alt += 1

    #Report the overlap between the two files
    print("sites in a: " + str(aSize))
    print("sites in b: " + str(bSize))
    print("sites in both: " + str(bAnda))
    print("sites in both with same alt base: " + str(same_alt))
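#The scripts in this collection all depend on the vcfDict helper, which is not
#included here. The sketch below is a minimal reconstruction that is consistent
#with how the scripts call it: sites keyed by "CHROM:POS", getLine() returning
#"" for unknown sites and removing a site once it has been returned (as the
#comment in the overlap-counting script above notes). Treat every detail as an
#assumption about the real class, not as its actual implementation.


class vcfDict(object):
    """Assumed sketch of the vcfDict helper used throughout these scripts."""

    def __init__(self, fileName):
        self.fileName = fileName
        self.sites = {}

    def loadDict(self):
        #Key every data line by "CHROM:POS" (the first two vcf columns)
        with open(self.fileName) as vcf:
            for line in vcf:
                if len(line) > 1 and line[0] != "#":
                    cols = line.split()
                    self.sites[cols[0] + ":" + cols[1]] = line

    def getFileName(self):
        return self.fileName

    def getSize(self):
        return len(self.sites)

    def getKeys(self):
        #Return a copy so callers can keep iterating while getLine() pops sites
        return list(self.sites.keys())

    def siteExists(self, key):
        return key in self.sites

    def getLine(self, key):
        #"" signals an unknown site; found sites are removed after retrieval
        return self.sites.pop(key, "")


#Keying each vcf once and looking sites up by "CHROM:POS" keeps every comparison
#script at one dictionary lookup per site instead of rescanning the vcf per query.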
""" Determines in line contains site data """ if len(line) > 1: return line[0] != "#" return False if __name__ == "__main__": """ """ known_sites,ccm = sys.argv[1],sys.argv[2] sites = vcfDict(known_sites) sites.loadDict() with open(ccm) as ccm_vcf: for line in ccm_vcf: if isDataLine(line): key = str.split(line)[0] + ":"+ str.split(line)[1] siteLine = sites.getLine(key) if siteLine != "": print siteLine else: pass # print key
import sys
import traceback

from vcfDict import vcfDict
from vcfLine import vcfLine
from vcfSample import vcfSample
from mpileLine import mpileLine

if __name__ == "__main__":
    mpile_in, vcf_in, csv_out = sys.argv[1], sys.argv[2], sys.argv[3]

    #Load dictionary of sites in vcf_in file
    vcfSites = vcfDict(vcf_in)
    vcfSites.loadDict()

    #Create output file for csv_out
    outFile = open(csv_out, "w")

    with open(mpile_in) as pileFile:
        for line in pileFile:
            try:
                pile_line = mpileLine(line)
                print(pile_line.repr())
                if vcfSites.siteExists(pile_line.siteID):
                    vcf_line = vcfLine(vcfSites.getLine(pile_line.siteID))
            except Exception as e:
                print(sys.exc_info()[0])
                print(traceback.format_exc())
                print(line)

    outFile.close()
import sys

from vcfDict import vcfDict


def isDataLine(line):
    """
    Determines if line contains site data
    """
    if len(line) > 1:
        return line[0] != "#"
    return False


if __name__ == "__main__":
    """
    Writes records from the target vcf whose sites are not in the common-sites vcf.
    """
    common_sites_loc, target_file, output_file = sys.argv[1], sys.argv[2], sys.argv[3]

    desc = vcfDict(common_sites_loc)
    desc.loadDict()

    outFile = open(output_file, "w")

    with open(target_file) as t_file:
        for t_line in t_file:
            if isDataLine(t_line):
                t_line_col = str.split(t_line)
                t_line_key = t_line_col[0] + ":" + t_line_col[1]
                desc_line = desc.getLine(t_line_key)
                if desc_line != "":
                    #outFile.write(desc_line)
                    #do not write line
                    pass
                else:
                    #Site is not in the common set, so keep it
                    outFile.write(t_line)

    outFile.close()
import sys

from vcfDict import vcfDict

if __name__ == "__main__":
    mpile_in, vcf_sites_loc, mpile_out_loc = sys.argv[1], sys.argv[2], sys.argv[3]

    vcfSites = vcfDict(vcf_sites_loc)
    vcfSites.loadDict()

    mpile_out = open(mpile_out_loc, "w")

    with open(mpile_in) as mpile_file:
        for line in mpile_file:
            sline = line.split("\t")
            lineID = sline[0] + ":" + sline[1]
            if vcfSites.siteExists(lineID):
                mpile_out.write(line)

    mpile_out.close()
import sys

from vcfDict import vcfDict

if __name__ == "__main__":
    vcfFile, bpFile, outFile_loc = sys.argv[1], sys.argv[2], sys.argv[3]

    vcfSites = vcfDict(vcfFile)
    vcfSites.loadDict()

    outFile = open(outFile_loc, "w")

    #Copy the vcf header lines to the output file
    with open(vcfFile) as vcf_f:
        for line in vcf_f:
            if line[0] == "#":
                outFile.write(line)

    with open(bpFile) as bp:
        for line in bp:
            if not line.startswith('""'):  #skip rows whose first field is empty
                sline = line.split(",")
                site_id = sline[1][1:-1]   #strip the surrounding quotes
                resp = vcfSites.getLine(site_id)
                if resp != "":
                    outFile.write(resp)

    outFile.close()
from vcfDict import vcfDict


def removeOtherSampleSites(samples, bSetIdx, targetDict, intAB):
    #NOTE: function name and signature are assumed; reconstructed from how the body
    #uses its arguments.
    #Removes sites from intAB present in other samples except for sample B and sample A
    for i in range(0, len(samples)):
        if i != bSetIdx and (samples[i].getFileName() != targetDict.getFileName()):
            for ele in samples[i].getKeys():
                try:
                    intAB.remove(ele)
                except:
                    pass
    return intAB


if __name__ == "__main__":
    """
    """
    GPB = vcfDict("../data/compareSNPs/tmp/GP2-3_B_m0.vcf")
    GPC = vcfDict("../data/compareSNPs/tmp/GP2-3_C_m0.vcf")
    GPD = vcfDict("../data/compareSNPs/tmp/GP2-3_D_m0.vcf")
    GPE = vcfDict("../data/compareSNPs/tmp/GP2-3_E_m0.vcf")
    GPF = vcfDict("../data/compareSNPs/tmp/GP2-3_F_m0.vcf")
    GPG = vcfDict("../data/compareSNPs/tmp/GP2-3_G_m0.vcf")
    GPH = vcfDict("../data/compareSNPs/tmp/GP2-3_H_m0.vcf")
    GPI = vcfDict("../data/compareSNPs/tmp/GP2-3_I_m0.vcf")
    GPJ = vcfDict("../data/compareSNPs/tmp/GP2-3_J_m0.vcf")
    GPK = vcfDict("../data/compareSNPs/tmp/GP2-3_K_m0.vcf")
    GPL = vcfDict("../data/compareSNPs/tmp/GP2-3_L_m0.vcf")
    GPM = vcfDict("../data/compareSNPs/tmp/GP2-3_M_m0.vcf")
    GPN = vcfDict("../data/compareSNPs/tmp/GP2-3_N_m0.vcf")
    GPO = vcfDict("../data/compareSNPs/tmp/GP2-3_O_m0.vcf")
    CCB = vcfDict("../data/compareSNPs/tmp/CC_B_m0.vcf")
import sys

from vcfDict import vcfDict
from vcfLine import vcfLine


def isTargetSite(lineVcf, targetSitesDict):
    """
    Return whether the given site is one of the target sites
    """
    #NOTE: function name and signature are assumed; reconstructed from the body.
    lineID = lineVcf.chrom + ":" + lineVcf.pos
    return targetSitesDict.getLine(lineID) != ""


if __name__ == "__main__":
    vcf_unfilt, vcf_targetSites, out_loc = sys.argv[1], sys.argv[2], sys.argv[3]

    outFile = open(out_loc, "w")
    outFile.write("siteID,start_chrom,start_pos,stop_chrom,stop_pos,altSiteCount,refSiteCount,avgDepth,avgRefReads,avgAltReads,avgOtherReads" + "\n")

    targetSitesDict = vcfDict(vcf_targetSites)
    targetSitesDict.loadDict()

    window = []
    with open(vcf_unfilt) as unfilt:
        for line in unfilt:
            lineVcf = vcfLine(line)
            if lineVcf.isDataLine:
                #Add data lines from unfiltered file to window
                window.append(lineVcf)
                if len(window) == WINDOW_SIZE + 1:
                    #Remove oldest site seen keep size == WINDOW_SIZE
                    window.pop(0)
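#The last two scripts also construct vcfLine and mpileLine objects whose
#definitions are not part of this section. The sketch below provides only the
#attributes the code above actually touches (chrom, pos and isDataLine on
#vcfLine; siteID and repr() on mpileLine), using the standard VCF and samtools
#mpileup column order. Everything else about the real classes is an assumption.


class vcfLine(object):
    """Assumed sketch: wraps a single line of a vcf file."""

    def __init__(self, line):
        self.line = line
        self.isDataLine = len(line) > 1 and line[0] != "#"
        cols = line.split()
        #CHROM and POS are the first two columns of a vcf data line
        self.chrom = cols[0] if self.isDataLine else ""
        self.pos = cols[1] if self.isDataLine else ""


class mpileLine(object):
    """Assumed sketch: wraps a single line of samtools mpileup output."""

    def __init__(self, line):
        cols = line.split("\t")
        self.chrom = cols[0]
        self.pos = cols[1]
        #Same "CHROM:POS" key format that vcfDict uses
        self.siteID = self.chrom + ":" + self.pos

    def repr(self):
        return self.siteID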