def call_MEI_candidate(VCF):
    '''
    Split the insertion calls of a VCF into MEI candidates and non-candidates.

    Only variants whose INFO field SVTYPE equals 'INS' are considered. An insertion
    is a candidate when search4polyA or search4polyT reports a hit on its inserted
    sequence (variant.alt); otherwise it goes to the filtered set. Variants that are
    not insertions (or that carry no SVTYPE at all) end up in neither output.

    Input:
        1. VCF: VCF object exposing 'header' and an iterable 'variants'

    Output:
        1. candidateVCF: VCF object with the insertions having a poly(A) or poly(T) hit
        2. filteredVCF: VCF object with the insertions having neither

    Note: both outputs reference the same header object as the input VCF (no copy
    is made here), so a later in-place change to one header is visible through all three.
    '''
    ## Create VCF with candidate MEI calls
    candidateVCF = formats.VCF()
    candidateVCF.header = VCF.header

    ## Create VCF with non-candidate MEI
    filteredVCF = formats.VCF()
    filteredVCF.header = VCF.header

    ## For each variant
    for variant in VCF.variants:

        ## Skip calls distinct from INS. Records lacking SVTYPE are skipped as well
        ## instead of aborting the whole run with a KeyError
        if variant.info.get('SVTYPE') != 'INS':
            continue

        ## Search for poly(A) / poly(T) at the inserted sequence.
        ## Only the first returned element is needed here; the second one is discarded
        polyA, _ = search4polyA(variant.alt)
        polyT, _ = search4polyT(variant.alt)

        ## a) Candidate: poly(A) or poly(T) found
        if polyA or polyT:
            candidateVCF.add(variant)

        ## b) Filtered: neither poly(A) nor poly(T) found
        else:
            filteredVCF.add(variant)

    return candidateVCF, filteredVCF
print "fastaFile: ", fastaFile print "refDir: ", refDir print "fileName: ", fileName print "outDir: ", outDir print print "***** Executing ", scriptName, ".... *****" print ## Start ## #### 1. Read fasta file containing supporting reads fastaObj = fasta() fastaObj.fasta_reader(fastaFile) #### 2. Read VCF file VCFObj = formats.VCF() VCFObj.read_VCF(inputVCF) ## For each MEI for VCFlineObj in VCFObj.lineList: ### Define insertion Id chrom = VCFlineObj.chrom pos = str(VCFlineObj.pos) insertionType = VCFlineObj.infoDict["TYPE"] family = VCFlineObj.infoDict[ "CLASS"] if "CLASS" in VCFlineObj.infoDict else 'NA' ## Initialize subFamily as unknown percDiv = "NA" subFamily = "NA"
#print "test: ", donorId, tumorType #print "donorIdProjectCodeDict: ", donorIdProjectCodeDict #### 2. Compute the allele count of each source element in EOPC-DE ################################################################### ## EOPC-DE is the tumor type with available samples for the validation of L1 source elements. # Initialize a dictionary with the following structure: # - dict1: key(sourceElementId) -> dict2: key1("alleleCount") -> value1(alleleCount value) # key2("donorIdList") -> list of donor ids containing the insertion # sourceElementId: chr:beg-end header("2. Compute the allele count of each source element in EOPC-DE") VCFObj = formats.VCF() donorIdList = VCFObj.read_VCF_multiSample(sourceElementGt) alleleCountsDict = {} ## For each MEI: for MEIObj in VCFObj.lineList: end = (MEIObj.infoDict["BKPB"] if "BKPB" in MEIObj.infoDict else "UNK") sourceElementId = MEIObj.chrom + ':' + str(MEIObj.pos) + '-' + str(end) print "** source element ** ", sourceElementId ## Initialize source element dictionary alleleCountsDict[sourceElementId] = {} alleleCountsDict[sourceElementId]["alleleCount"] = 0
print "***** ", scriptName, " configuration *****" print "VCF: ", VCFPaths print "sampleId: ", sampleId print "outDir: ", outDir print print "***** Executing ", scriptName, ".... *****" print ## Start ## #### 1. Create VCF object and read input VCF header("1. Process input VCFs") paths = open(VCFPaths, 'r') # Make merged VCF object completeVCFObj = formats.VCF() ## Read one VCF per iteration and add the variants to the merged VCF for VCFfile in paths: VCFfile = VCFfile.rstrip('\n\r') VCFObj = formats.VCF() VCFObj.read_VCF(VCFfile) # Add variant objects for lineObj in VCFObj.lineList: completeVCFObj.addLine(lineObj) # Create header if completeVCFObj.header == "":
targetDonorsList = targetDonorsList + donorIdList ## Open output file outFilePath = outDir + '/' + ancestryCode + '_donorIdList.tsv' outFile = open(outFilePath, 'w') ## Write each donorId in the output file. One id per row for donorId in donorIdList: row = donorId + '\n' outFile.write(row) #### 2. Read input multi-sample VCF and generate a VCF object ############################################################### header("2. Read input multi-sample VCF and generate a VCF object") VCFObj = formats.VCF() VCFObj4Fst = formats.VCF() VCFObj.read_VCF_multiSample(inputVCF) #### 3. Select target donors and source elements ################################################## header("3. Select target donors and source elements") ## target source elements are rare elements with a MAF < 1% targetSourceList = [ "1p35.2", "1q23.3", "2q21.3", "3p24.1", "3q26.1", "5q13.1", "7p12.3", "7q31.2", "8p23.1f", "9q22.33", "10q25.1", "11p11.2", "11q14.2", "21q21.1" ] ## For each MEI:
print "***** ", scriptName, " configuration *****" print "VCF1KGP: ", VCF1KGP print "VCFPCAWG: ", VCFPCAWG print "outDir: ", outDir print print "***** Executing ", scriptName, ".... *****" print ## Start ## #### 1. Read input VCFs and generate VCF objects ############################################################# header("1. Process input VCFs ") ## 1000 genomes multi-sample VCF VCFObj1KGP = formats.VCF() donorIdList1KGP = VCFObj1KGP.read_VCF_multiSample(VCF1KGP) ## PCAWG VCF VCFObjPCAWG = formats.VCF() VCFObjPCAWG.read_VCF(VCFPCAWG) #### 2. Select common MEI from 1000 genomes and PCAWG ############################################################ header("2. Select common MEI from 1000 genomes and PCAWG") outVCFObj = formats.VCF() counter = 1 ## For each PCAWG MEI
## Start ## #### 1. Create database with all MEI events ############################################ ## First, make list with all the identified MEI across all the provided samples allMEIlist = [] with open(VCFs) as VCFs: ### Process a VCF in each iteration for VCF in VCFs: VCF = VCF.rstrip('\n') ## 1. Generate VCF object VCFObj = formats.VCF() VCFObj.read_VCF(VCF) VCFheader = VCFObj.header ## Select insertions passing all the filters for MEIObj in VCFObj.lineList: if (MEIObj.filter == "PASS"): allMEIlist.append(MEIObj) ## Then organize them into a dictionary MEIDict = organizeMEI(allMEIlist) #### 2. Generate a consensus VCF with a non-redundant list of MEI events ##########################################################################
## A) Normal matched VCF not provided or file does not exist if (germlineVCF == False): msg = "Matched normal VCF not provided" log("WARNING", msg) germlineMEIDict = False ## B) Normal VCF provided but does not exist elif not (os.path.isfile(germlineVCF)): msg = "Matched normal VCF does not exist" log("WARNING", msg) germlineMEIDict = False ## C) Normal VCF provided -> Organize germline MEI into a dictionary else: germlineVCFObj = formats.VCF() germlineVCFObj.read_VCF(germlineVCF) germlineMEIDict = organizeMEI(germlineVCFObj.lineList) #### 1. Create somatic VCF object and read input VCF VCFObj = formats.VCF() VCFObj.read_VCF(inputVCF) #### 2. Find somatic duplicated insertions # Duplicated filtering flag provided if "DUP" in filterList: dupList = findDuplicates(VCFObj.lineList) print "number_duplicates: ", len(dupList), dupList #### 3. Organize somatic MEI into a dictionary.
def call_NUMT(vcf, mtGenome, outDir):
    '''
    Call nuclear mitochondrial insertions (NUMTs) among the insertions of a VCF.

    The inserted sequences are written to a fasta file, aligned with minimap2
    against the mitochondrial genome, and the alignments of each insertion are
    chained. An insertion is called as NUMT when its chain covers at least 60%
    of the inserted sequence.

    Input:
        1. vcf: VCF object exposing 'header' and an iterable 'variants'
        2. mtGenome: mitochondrial genome handed over to alignment.index_minimap2
                     (presumably a path to a fasta file - confirm against the alignment module)
        3. outDir: output directory. Intermediate files are written to '<outDir>/tmp',
                   which is NOT removed at the end (cleanup is disabled)

    Output:
        1. outVCF: VCF object containing a deep copy of every variant called as NUMT,
                   with 'ITYPE' and 'MT_COORD' added to its info. Its header is an
                   independent copy of the input header extended with the MEI/NUMT
                   INFO fields, so the input VCF header is left untouched
    '''
    ## 0. Create temporary folder
    tmpDir = outDir + '/tmp'
    unix.mkdir(tmpDir)

    ## 1. Write inserted sequences into fasta file
    fastaPath = tmpDir + '/insertions.fa'
    fasta = ins2fasta(vcf, tmpDir)
    fasta.write(fastaPath)

    ## 2. Create index for the mitochondrial genome
    fileName = 'mtGenome'
    mtIndex = alignment.index_minimap2(mtGenome, fileName, tmpDir)

    ## 3. Align inserted sequences against the mitochondrial genome
    # NOTE(review): the 4th argument (1) is presumably the number of threads - confirm
    PAF_path = alignment.alignment_minimap2(fastaPath, mtIndex, 'hits2mt', 1, tmpDir)
    PAF_mt = formats.PAF()
    PAF_mt.read(PAF_path)

    ## 4. Generate single PAF objects per inserted sequence:
    PAFs_mt = group_alignments(PAF_mt)

    ## 5. Make NUMTs calls
    NUMTs = {}

    for insId in PAFs_mt:
        # NOTE(review): meaning of the chaining parameters (20, 50) is defined by
        # the PAF chain method - confirm there before tuning
        chain = PAFs_mt[insId].chain(20, 50)

        # Not enough % of the inserted sequence resolved -> no call
        if chain.perc_query_covered() < 60:
            continue

        coords = chain.interval_template()
        NUMTs[insId] = {
            'ITYPE': 'NUMT',
            'MT_COORD': str(coords[0]) + '-' + str(coords[1]),
        }

    ## 6. Generate output VCF containing NUMT calls
    ## Create header for output VCF.
    # Work on a copy: the INFO definitions below are added in place, and the
    # input header may be shared with other VCF objects
    outVCF = formats.VCF()
    outVCF.header = copy.deepcopy(vcf.header)

    ## Add MEI specific fields to the VCF header
    # Same definitions as the ones used for MEI calls, including 'MT_COORD',
    # which is the field actually attached to every NUMT call above
    info2add = {
        'ITYPE': ['.', 'String', 'Type of insertion (solo, partnered, orphan or NUMT)'],
        '3PRIME': ['0', 'Flag', 'Partnered 3-prime transduction'],
        '5PRIME': ['0', 'Flag', 'Partnered 5-prime transduction'],
        'FAM': ['.', 'String', 'Repeat family'],
        'CYTOID': ['.', 'String', 'Source element cytoband identifier'],
        'RETRO_LEN': ['1', 'Integer', 'Inserted retrotransposon length'],
        'TRUNCATION_5_LEN': ['1', 'Integer', 'Size of 5prime truncation'],
        'TRUNCATION_3_LEN': ['1', 'Integer', 'Size of 3prime truncation'],
        'INVERSION_LEN': ['1', 'Integer', '5-inversion length'],
        'RETRO_COORD': ['.', 'String', 'Coordinates for inserted retrotransposon piece of sequence'],
        'IS_FULL': ['0', 'Flag', 'Full length mobile element'],
        'ORF1': ['0', 'Flag', 'ORF1 identified'],
        'ORF2': ['0', 'Flag', 'ORF2 identified'],
        'COMPETENT': ['0', 'Flag', 'Potential competent full L1 with intact ORFs'],
        'TDCOORD_5PRIME': ['1', 'Integer', '5-prime transduced sequence coordinates'],
        'TDCOORD_3PRIME': ['1', 'Integer', '3-prime transduced sequence coordinates'],
        'TDLEN_5PRIME': ['1', 'Integer', '5-prime transduction length'],
        'TDLEN_3PRIME': ['1', 'Integer', '3-prime transduction length'],
        'STRAND': ['.', 'String', 'Insertion DNA strand (+ or -)'],
        'MT_COORD': ['.', 'String', 'Coordinates for the piece of MT genome integrated'],
    }
    outVCF.header.info.update(info2add)

    ## Select INS corresponding to NUMT calls and update their info field with NUMT features
    for variant in vcf.variants:
        # Variants are matched to the calls through a 'chrom:pos' identifier
        insId = variant.chrom + ':' + str(variant.pos)

        # Discard inserted sequences without NUMT call
        if insId not in NUMTs:
            continue

        # Copy before annotating so the input variant is not modified
        variant2add = copy.deepcopy(variant)
        variant2add.info.update(NUMTs[insId])
        outVCF.add(variant2add)

    ## 7. Do cleanup (currently disabled: temporary folder is kept)
    #unix.rm([tmpDir])

    return outVCF
def call_MEI(vcf, consensus, reference, sourceDb, outDir):
    '''
    Call mobile element insertions (MEI) among the insertions of a VCF.

    The inserted sequences are written to a fasta file and aligned with minimap2
    against the retrotransposon consensus sequences. The structure of every
    insertion with hits is resolved with MEI_structure and then refined by the
    3' and 5' partnered transduction steps. Insertions whose structure has
    'PASS' set to False, or without any hit, are not reported.

    Input:
        1. vcf: VCF object exposing 'header' and an iterable 'variants'
        2. consensus: consensus sequences handed over to alignment.index_minimap2
                      (presumably a path to a fasta file - confirm against the alignment module)
        3. reference: reference genome handed over to the transduction steps
        4. sourceDb: source elements database handed over to resolve_partnered_3prime
        5. outDir: output directory. Intermediate files are written to '<outDir>/tmp',
                   which is NOT removed at the end (cleanup is disabled)

    Output:
        1. outVCF: VCF object containing a deep copy of every variant called as MEI,
                   with the resolved structure merged into its info. Its header is an
                   independent copy of the input header extended with the MEI INFO
                   fields, so the input VCF header is left untouched
    '''
    import os

    ## 0. Create temporary folder
    tmpDir = outDir + '/tmp'
    unix.mkdir(tmpDir)

    ## 1. Write inserted sequences into fasta file
    fastaPath = tmpDir + '/MEI_candidate.fa'
    fasta = ins2fasta(vcf, tmpDir)
    fasta.write(fastaPath)

    ## 2. Create index for consensus sequences
    fileName = 'consensus'
    consensusIndex = alignment.index_minimap2(consensus, fileName, tmpDir)

    ## 3. Align inserted sequences against consensus:
    # NOTE(review): the 4th argument (1) is presumably the number of threads - confirm
    PAF_path = alignment.alignment_minimap2(fastaPath, consensusIndex, 'hits2consensus', 1, tmpDir)
    PAF_consensus = formats.PAF()
    PAF_consensus.read(PAF_path)

    ## Temporary: alignment against a small RNA index.
    # NOTE(review): developer-local absolute path and the result is not used by
    # any later step. Only run it where the index actually exists, so the function
    # stays usable on other machines. Should become a parameter or be removed.
    smallRNAindex = "/Users/brodriguez/Research/References/Annotations/H.sapiens/hg38/Repetitive_dna/smallRNAs.mmi"

    if os.path.isfile(smallRNAindex):
        alignment.alignment_minimap2(fastaPath, smallRNAindex, 'hits2small_MEI', 1, tmpDir)

    ## Align inserted sequences against the reference genome
    ## (disabled, only needed for the orphan transduction search below)
    #SAM_path = alignment.alignment_bwa(fastaPath, reference, 'hits2genome', 1, tmpDir)
    #PAF_path = alignment.sam2paf(SAM_path, 'hits2genome', tmpDir)
    #PAF_genome = formats.PAF()
    #PAF_genome.read(PAF_path)

    ## 4. Generate single PAF objects per inserted sequence:
    PAFs_consensus = group_alignments(PAF_consensus)
    #PAFs_genome = group_alignments(PAF_genome)

    ## 5. Resolve structure for each insertion with matches on retrotransposon consensus sequences
    structures = {}

    for insId in PAFs_consensus:
        structures[insId] = MEI_structure(PAFs_consensus[insId], fasta.seqDict[insId])

    ## 6. Resolve 3' partnered transductions
    structures = resolve_partnered_3prime(structures, fasta, reference, sourceDb, tmpDir)

    ## 7. Search for 5' partnered transductions
    structures = search4partnered_5prime(structures, fasta, reference, tmpDir)

    ## 8. Search for orphan transductions
    ## (disabled) Remove resolved insertions
    #for insId in structures:
    #    if structures[insId]['PASS']:
    #        del PAFs_genome[insId]

    ## (disabled) Do orphan transduction search
    #search4orphan(PAFs_genome, sourceDb, fasta) # TO FINISH LATER (Only two L1 orphan transductions so far..)

    ## 9. Generate output VCF containing MEI calls
    ## Create header for output VCF.
    # Work on a copy: the INFO definitions below are added in place, and the
    # input header may be shared with other VCF objects
    outVCF = formats.VCF()
    outVCF.header = copy.deepcopy(vcf.header)

    ## Add MEI specific fields to the VCF header
    info2add = {
        'ITYPE': ['.', 'String', 'Type of insertion (solo, partnered, orphan or NUMT)'],
        '3PRIME': ['0', 'Flag', 'Partnered 3-prime transduction'],
        '5PRIME': ['0', 'Flag', 'Partnered 5-prime transduction'],
        'FAM': ['.', 'String', 'Repeat family'],
        'CYTOID': ['.', 'String', 'Source element cytoband identifier'],
        'RETRO_LEN': ['1', 'Integer', 'Inserted retrotransposon length'],
        'TRUNCATION_5_LEN': ['1', 'Integer', 'Size of 5prime truncation'],
        'TRUNCATION_3_LEN': ['1', 'Integer', 'Size of 3prime truncation'],
        'INVERSION_LEN': ['1', 'Integer', '5-inversion length'],
        'RETRO_COORD': ['.', 'String', 'Coordinates for inserted retrotransposon piece of sequence'],
        'IS_FULL': ['0', 'Flag', 'Full length mobile element'],
        'ORF1': ['0', 'Flag', 'ORF1 identified'],
        'ORF2': ['0', 'Flag', 'ORF2 identified'],
        'COMPETENT': ['0', 'Flag', 'Potential competent full L1 with intact ORFs'],
        'TDCOORD_5PRIME': ['1', 'Integer', '5-prime transduced sequence coordinates'],
        'TDCOORD_3PRIME': ['1', 'Integer', '3-prime transduced sequence coordinates'],
        'TDLEN_5PRIME': ['1', 'Integer', '5-prime transduction length'],
        'TDLEN_3PRIME': ['1', 'Integer', '3-prime transduction length'],
        'STRAND': ['.', 'String', 'Insertion DNA strand (+ or -)'],
        'MT_COORD': ['.', 'String', 'Coordinates for the piece of MT genome integrated'],
    }
    outVCF.header.info.update(info2add)

    ## Select INS corresponding to MEI calls and update their info field with MEI features
    for variant in vcf.variants:
        # Variants are matched to the resolved structures through a 'chrom:pos' identifier
        insId = variant.chrom + ':' + str(variant.pos)

        # Discard unresolved inserted sequences: no hit on consensus, or structure flagged as not passing
        if insId not in structures or structures[insId]['PASS'] is False:
            continue

        # Copy before annotating so the input variant is not modified
        variant2add = copy.deepcopy(variant)
        variant2add.info.update(structures[insId])
        outVCF.add(variant2add)

    ## 10. Do cleanup (currently disabled: temporary folder is kept)
    #unix.rm([tmpDir])

    return outVCF
print() print('***** ', scriptName, 'configuration *****') print('vcf: ', vcf) print('consensus: ', consensus) print('reference: ', reference) print('mtGenome: ', mtGenome) print('fileName: ', fileName) print('outDir: ', outDir, "\n") ########## ## MAIN ## ########## ## Note: NUMT detection is disabled ## 1. Read VCF VCF = formats.VCF() VCF.read(vcf) ## 2. Load source elements database annotDir = '/Users/brodriguez/Research/Projects/HGSVC2/Analysis/Source_L1/V2/data/' annotations = annotation.load_annotations(['TRANSDUCTIONS'], VCF.header.refLengths, annotDir, None, 1, outDir) ## 2. Filter VCF by selecting retrotransposition insertion candidates # (inserted sequences with polyA/T tails at their ends) candidateVCF, filteredVCF = call_MEI_candidate(VCF) ## 3. Search for NUMTs #NUMT_VCF = call_NUMT(filteredVCF, mtGenome, outDir)
print "outDir: ", outDir print print "***** Executing ", scriptName, ".... *****" print ## Start ## #### 1. Read input VCFs and generate VCF objects ################################################### # Important requirement: the two VCF must have the same MEIs sorted also in the same order header("1. Process input VCFs ") ## Normal genome multi-sample VCF VCFObjNormal = formats.VCF() donorIdListNormal = VCFObjNormal.read_VCF_multiSample(VCFnormal) ## Tumor genome multi-sample VCF VCFObjTumor = formats.VCF() donorIdListTumor = VCFObjTumor.read_VCF_multiSample(VCFtumor) #### 2. Identify MEI that are blood/normal specific singletons ############################################################### # These cases are potential blood somatic events... header("2. Identify MEI that are blood/normal specific singletons") # Make VCF object with all the candidate blood somatic MEI VCFObjBloodSomatic = formats.VCF() ### For each MEI