import os
import copy
import subprocess

# Project-internal helper modules (assumed importable within this repo)
import unix
import log
import formats
import alignment
import bamtools
import bkp
import databases


def BAM2BED(BAM, outDir):
    '''
    Convert BAM file into BED using bedtools

    Input:
        1. BAM: Path to BAM file
        2. outDir: Output directory

    Output:
        1. BED_path: Path to BED file
    '''
    ## 0. Create logs directory
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    ## 1. Convert BAM into BED
    BED_path = outDir + '/alignments.bed'
    err = open(logDir + '/BAM2BED.err', 'w')
    command = 'bedtools bamtobed -split -i ' + BAM + ' > ' + BED_path
    status = subprocess.call(command, stderr=err, shell=True)
    err.close()

    if status != 0:
        step = 'BAM2BED'
        msg = 'BAM to BED conversion failed'
        log.step(step, msg)

    ## 2. Add header to BED file
    header = '#ref\tbeg\tend\tname\tscore\tstrand\n'

    with open(BED_path, 'r') as original:
        data = original.read()

    with open(BED_path, 'w') as modified:
        modified.write(header + data)

    return BED_path
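
# Usage sketch (hypothetical paths; assumes bedtools on PATH and the repo's
# unix/log helpers):
#
#   bedPath = BAM2BED('/data/sample.bam', '/data/out')
#   # -> '/data/out/alignments.bed' with header '#ref\tbeg\tend\tname\tscore\tstrand'
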
def gene_annotation_lighter(metaclustersList, annovarDir, outDir):
    '''
    Perform gene-based annotation for a list of input events

    Input:
        1. metaclustersList: List containing input events to be annotated. Events should be
           objects containing ref, beg and end attributes
        2. annovarDir: Directory containing the two files used by ANNOVAR to perform gene-based annotation:
            a) build_annot.txt - Text file containing annotated transcript coordinates
            b) build_annotMrna.fa - Fasta containing annotated transcript sequences
        3. outDir: Output directory

    Output:
        New 'geneAnnot' attribute set for each input event. 'geneAnnot' is a tuple (region, gene)
    '''
    ## 1. Create output directory
    unix.mkdir(outDir)

    ## 2. Create input file containing events intervals for ANNOVAR
    create_annovar_input_lighter(metaclustersList, 'events.annovar', outDir)
    annovarInput = outDir + '/events.annovar'

    ## 3. Annotate events intervals with ANNOVAR
    out1, out2 = run_annovar(annovarInput, annovarDir, outDir)

    ## 4. Add gene annotation info to the events
    addGnAnnot2events_lighter(metaclustersList, out1)

    ## Do cleanup
    unix.rm([annovarInput, out1, out2])
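
# Usage sketch (hypothetical inputs; annovarDir must hold the build_annot.txt /
# build_annotMrna.fa pair described above):
#
#   gene_annotation_lighter(metaclusters, '/refs/annovar/hg38', '/data/out/annot')
#   # -> each metacluster gains a geneAnnot attribute: a (region, gene) tuple
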
def komplexityFilter(komplexityThreshold, inFasta, outFasta, outDir):
    '''
    Filter fasta file using komplexity tool

    Input:
        1. komplexityThreshold: Complexity threshold filter
        2. inFasta: input FASTA file name
        3. outFasta: output FASTA file name
        4. outDir: input AND output directory (it must be the same)

    Output:
        1. allFastas: Filtered FASTA file complete path
    '''
    # Set input and output files
    allFastas_all = outDir + '/' + inFasta
    allFastas = outDir + '/' + outFasta

    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    command = 'kz --filter --threshold ' + str(komplexityThreshold) + ' --fasta < ' + allFastas_all + ' > ' + allFastas
    err = open(logDir + '/komplexity.err', 'w')
    status = subprocess.call(command, stderr=err, shell=True)
    err.close()

    if status != 0:
        step = 'KOMPLEXITY'
        msg = 'Komplexity filter failed. PID: ' + str(os.getpid())
        log.step(step, msg)

    return allFastas
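
# Usage sketch (hypothetical paths and threshold; assumes the `kz` binary from
# the komplexity tool is on PATH):
#
#   filtered = komplexityFilter(0.55, 'reads.fa', 'reads.filtered.fa', '/data/out')
#   # -> '/data/out/reads.filtered.fa', low-complexity sequences removed
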
def load_annotations(annotations2load, refLengths, annotationsDir, germlineMEI, threads, outDir):
    '''
    Load a set of annotation files in bed format into a bin database

    Input:
        1. annotations2load: list of annotations to load. Annotations available:
           REPEATS, REPEATS-L1, TRANSDUCTIONS, EXONS and GERMLINE-MEI
        2. refLengths: Dictionary containing reference ids as keys and as values the length for each reference
        3. annotationsDir: Directory containing annotation files
        4. germlineMEI: Bed file containing set of known germline MEI. None if not available
        5. threads: number of threads used to parallelize the bin database creation
        6. outDir: Output directory

    Output:
        1. annotations: dictionary containing one key per type of annotation loaded and bin
           databases containing annotated features as values (None for those annotations not loaded)
    '''
    ## 0. Initialize dictionary
    annotations = {}
    annotations['REPEATS'] = None
    annotations['TRANSDUCTIONS'] = None
    annotations['EXONS'] = None
    annotations['GERMLINE-MEI'] = None

    ## Create output directory
    unix.mkdir(outDir)

    ## 1A. Load annotated repeats into a bin database
    if 'REPEATS' in annotations2load:
        repeatsBed = annotationsDir + '/repeats.bed'
        annotations['REPEATS'] = formats.bed2binDb(repeatsBed, refLengths, threads)

    ## 1B. Load L1 repeats and pA into a bin database
    elif 'REPEATS-L1' in annotations2load:
        repeatsBed = annotationsDir + '/repeats.L1.pA.bed'
        annotations['REPEATS'] = formats.bed2binDb(repeatsBed, refLengths, threads)

    ## 2. Create transduced regions database
    if 'TRANSDUCTIONS' in annotations2load:

        ## Create bed file containing transduced regions
        sourceBed = annotationsDir + '/srcElements.bed'

        # buffer equals -150 to avoid the end of the src element
        transducedPath = databases.create_transduced_bed(sourceBed, 10000, -150, outDir)

        ## Load transduced regions into a bin database
        annotations['TRANSDUCTIONS'] = formats.bed2binDb(transducedPath, refLengths, threads)

    ## 3. Create exons database
    if 'EXONS' in annotations2load:
        exonsBed = annotationsDir + '/exons.bed'
        annotations['EXONS'] = formats.bed2binDb(exonsBed, refLengths, threads)

    ## 4. Create germline MEI database
    if 'GERMLINE-MEI' in annotations2load:
        annotations['GERMLINE-MEI'] = formats.bed2binDb(germlineMEI, refLengths, threads)

    return annotations
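
# Usage sketch (hypothetical inputs; refLengths would typically come from the
# reference .fai index, e.g. {'chr1': 248956422, ...}):
#
#   annotDb = load_annotations(['REPEATS', 'EXONS'], refLengths,
#                              '/refs/annotations', None, 4, '/data/out/db')
#   # -> annotDb['REPEATS'] and annotDb['EXONS'] hold bin databases;
#   #    annotDb['TRANSDUCTIONS'] and annotDb['GERMLINE-MEI'] stay None
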
def targeted_alignment_minimap2(FASTA, targetInterval, reference, outDir, outFormat):
    '''
    Align a set of sequences into a reference target region. Useful for doing local
    realignment of reads around SV breakpoints. Much faster than whole genome realignment

    Input:
        1. FASTA: Path to FASTA file with sequences to align
        2. targetInterval: Reference genome interval where sequences will be aligned.
           The interval must be provided as chr:beg-end
        3. reference: Path to the reference sequences in fasta format. An index of the
           reference generated with samtools faidx must be located in the same directory
        4. outDir: Output directory
        5. outFormat: BAM or SAM

    Output:
        1. BAM: Path to sorted BAM file containing input sequences alignments (or path
           to the SAM file if outFormat is 'SAM'). 'None' if realignment failed
    '''
    ## 0. Create logs directory
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    ## 1. Extract the reference target region prior to alignment
    target = outDir + '/target.fa'
    err = open(logDir + '/target.err', 'w')
    command = 'samtools faidx ' + reference + ' ' + targetInterval + ' > ' + target
    status = subprocess.call(command, stderr=err, shell=True)
    err.close()

    if status != 0:
        step = 'TARGET'
        msg = 'Extraction of reference target region failed'
        log.step(step, msg)
        return None

    ## 2. Align the sequences into the target region
    # Use -Y to get soft clippings for supplementary alignments
    SAM = outDir + '/alignments.sam'
    err = open(logDir + '/align.err', 'w')
    command = 'minimap2 -Y -a ' + target + ' ' + FASTA + ' > ' + SAM
    status = subprocess.call(command, stderr=err, shell=True)
    err.close()

    if status != 0:
        step = 'ALIGN'
        msg = 'Local alignment failed'
        log.step(step, msg)
        return None

    # Return raw SAM if requested
    if outFormat == 'SAM':
        return SAM

    ## 3. Convert SAM to sorted BAM
    BAM = bamtools.SAM2BAM(SAM, outDir)

    ## 4. Do cleanup
    unix.rm([target, SAM])

    return BAM
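
# Usage sketch (hypothetical paths; assumes minimap2 and samtools on PATH and a
# faidx index next to the reference):
#
#   bam = targeted_alignment_minimap2('/data/reads.fa', 'chr5:1000000-1010000',
#                                     '/refs/hg38.fa', '/data/out', 'BAM')
#   # -> sorted BAM with local alignments (coordinates relative to the extracted
#   #    target region), or None if any step failed
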
def SAM2BAM(SAM, outDir):
    '''
    Convert SAM file into sorted BAM and make BAM index

    Input:
        1. SAM: File containing alignments in SAM format
        2. outDir: Output directory

    Output:
        1. BAM_sorted: Sorted and indexed BAM file. BAM index located in the same
           directory with the extension '.bai'
    '''
    ## 0. Create logs directory
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    ## 1. Convert SAM into BAM
    BAM = outDir + '/alignments.bam'
    err = open(logDir + '/SAM2BAM.err', 'w')
    command = 'samtools view -Sb ' + SAM + ' > ' + BAM
    status = subprocess.call(command, stderr=err, shell=True)
    err.close()

    if status != 0:
        step = 'SAM2BAM'
        msg = 'SAM to BAM conversion failed'
        log.step(step, msg)

    ## 2. Sort bam
    BAM_sorted = outDir + '/alignments.sorted.bam'
    err = open(logDir + '/sort.err', 'w')
    command = 'samtools sort ' + BAM + ' > ' + BAM_sorted
    status = subprocess.call(command, stderr=err, shell=True)
    err.close()

    if status != 0:
        step = 'SORT'
        msg = 'BAM sorting failed'
        log.step(step, msg)

    ## 3. Index bam
    # samtools index writes the index next to the BAM ('.bai' extension) by default,
    # so no output redirection is needed
    err = open(logDir + '/index.err', 'w')
    command = 'samtools index ' + BAM_sorted
    status = subprocess.call(command, stderr=err, shell=True)
    err.close()

    if status != 0:
        step = 'INDEX'
        msg = 'BAM indexing failed'
        log.step(step, msg)

    return BAM_sorted
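
# Usage sketch (hypothetical path; assumes samtools on PATH):
#
#   sortedBam = SAM2BAM('/data/out/alignments.sam', '/data/out')
#   # -> '/data/out/alignments.sorted.bam' plus its .bai index
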
def samtools_index_bam(BAM, outDir):
    '''
    Index bam file using samtools

    Input:
        1. BAM: Input bam file complete path
        2. outDir: Output directory

    Output:
        1. Doesn't return anything. Creates bam index file next to the input BAM
    '''
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    command = 'samtools index ' + BAM
    err = open(logDir + '/samtools_index_bam.err', 'w')
    status = subprocess.call(command, stderr=err, shell=True)
    err.close()

    if status != 0:
        step = 'INDEX'
        msg = 'BAM indexing failed'
        log.step(step, msg)

    return
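
# Usage sketch (hypothetical path):
#
#   samtools_index_bam('/data/out/alignments.sorted.bam', '/data/out')
#   # -> creates '/data/out/alignments.sorted.bam.bai'; errors logged under Logs/
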
def create_targeted_fasta(targetIntervalList, reference, outDir):
    '''
    Extract regions of interest from a fasta file

    Input:
        1. targetIntervalList: Reference genome list of intervals to be extracted.
           The intervals must be provided as chr:beg-end
        2. reference: Path to fasta file. An index of the reference generated with
           samtools faidx must be located in the same directory
        3. outDir: Output directory

    Output:
        1. target: Path to fasta file with sequences extracted from intervals
    '''
    ## 0. Create logs directory
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    ## 1. Extract the reference target regions
    target = outDir + '/targetRegions.fa'
    err = open(logDir + '/target.err', 'w')

    # Write one interval per line, as expected by samtools faidx -r
    targetRegionsPath = outDir + '/targetRegions.txt'
    targetRegions = open(targetRegionsPath, 'w')

    for targetInterval in targetIntervalList:
        targetRegions.write(targetInterval + '\n')

    targetRegions.close()

    command = 'samtools faidx ' + reference + ' -r ' + targetRegionsPath + ' -o ' + target
    status = subprocess.call(command, stderr=err, shell=True)
    err.close()

    if status != 0:
        step = 'TARGET'
        msg = 'Extraction of reference target region failed'
        log.step(step, msg)
        return None

    ## 2. Do cleanup
    unix.rm([targetRegionsPath])

    return target
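
# Usage sketch (hypothetical inputs; intervals use samtools region syntax):
#
#   fa = create_targeted_fasta(['chr1:100000-101000', 'chrX:5000-6000'],
#                              '/refs/hg38.fa', '/data/out')
#   # -> '/data/out/targetRegions.fa' with one record per interval
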
def identity_metaclusters_retrotest(metaclusters, bam, outDir):
    '''
    Determine retrotest metaclusters identity. If there is only one cluster and it
    contains a polyA tail, it will be classified as a partnered event. Otherwise,
    it will be an orphan transduction.

    Partnered:
        --------->
            ----AAAAAA>
        --------->
          ------AAAA>

    Orphan:
        --------->
            ----ACGTCA>
        --------->
          ------ACG>

    Input:
        1. metaclusters: List of retrotest metaclusters
        2. bam: Bam file
        3. outDir: output directory

    Output:
        Fill metacluster identity attribute with 'partnered' or 'orphan'
    '''
    # set new confDict parameters to search for clippings
    newconfDict = {}
    newconfDict['targetEvents'] = ['CLIPPING']
    newconfDict['minMAPQ'] = 30
    newconfDict['minCLIPPINGlen'] = 8
    newconfDict['overhang'] = 0
    newconfDict['filterDuplicates'] = True
    newconfDict['readFilters'] = ['mateUnmap', 'insertSize', 'SMS']

    # for each metacluster
    for metacluster in metaclusters:

        # if there are no reciprocal clusters
        if metacluster.orientation != 'RECIPROCAL':

            ## 1. Collect clippings in region
            eventsDict = bamtools.collectSV(metacluster.ref, metacluster.refLeftBkp - 100,
                                            metacluster.refRightBkp + 100, bam, newconfDict,
                                            None, supplementary=False)

            ## 2. Create clipping consensus
            # create bkp dir
            bkpDir = outDir + '/BKP'
            unix.mkdir(bkpDir)

            # initialize variable
            clipConsensus = None

            # if cluster orientation is plus
            if metacluster.orientation == 'PLUS':

                # if there is only one clipping event
                if len(eventsDict['RIGHT-CLIPPING']) == 1:
                    clipConsensus = eventsDict['RIGHT-CLIPPING'][0].clipped_seq()

                # if there is more than one clipping event
                elif len(eventsDict['RIGHT-CLIPPING']) > 1:
                    clipConsensusPath, clipConsensus = bkp.makeConsSeqs(eventsDict['RIGHT-CLIPPING'], 'INT', bkpDir)

            # if cluster orientation is minus
            elif metacluster.orientation == 'MINUS':

                # if there is only one clipping event
                if len(eventsDict['LEFT-CLIPPING']) == 1:
                    clipConsensus = eventsDict['LEFT-CLIPPING'][0].clipped_seq()

                # if there is more than one clipping event
                elif len(eventsDict['LEFT-CLIPPING']) > 1:
                    clipConsensusPath, clipConsensus = bkp.makeConsSeqs(eventsDict['LEFT-CLIPPING'], 'INT', bkpDir)

            ## 3. polyA search if there is a consensus
            if clipConsensus:

                # set metacluster identity to partnered if there is a polyA/polyT tail in the consensus seq
                if has_polyA_illumina(clipConsensus):
                    metacluster.identity = 'partnered'

        # set metacluster identity to orphan if metacluster not partnered
        if metacluster.identity != 'partnered':
            metacluster.identity = 'orphan'
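
# Usage sketch (hypothetical inputs; metaclusters must expose ref, refLeftBkp,
# refRightBkp, orientation and identity attributes):
#
#   identity_metaclusters_retrotest(metaclusters, '/data/sample.bam', '/data/out')
#   # -> each metacluster.identity is set to 'partnered' or 'orphan'
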
def retrotransposon_structure(FASTA_file, index, outDir):
    '''
    Infer the insertion size, structure, poly-A, target site duplication length
    and other insertion structural features

    Input:
        1. FASTA_file: Path to FASTA file containing the sequence
        2. index: Minimap2 index for consensus retrotransposon sequences database
        3. outDir: Output directory

    Output:
        1. structure: dictionary containing insertion structure information
    '''
    structure = {}

    ## 0. Create logs directory ##
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    ## 1. Align the sequence into the retrotransposon sequences database ##
    PAF_file = alignment.alignment_minimap2(FASTA_file, index, 'alignment2consensus', 1, outDir)

    ## 2. Read PAF alignments ##
    PAF = formats.PAF()
    PAF.read(PAF_file)

    # Exit function if no hit on the retrotransposons database
    if not PAF.alignments:
        return structure

    ## 3. Chain complementary alignments ##
    chain = PAF.chain(100, 20)

    ## 4. Infer insertion features ##
    ## Retrieve inserted seq
    FASTA = formats.FASTA()
    FASTA.read(FASTA_file)
    sequence = list(FASTA.seqDict.values())[0]

    ## 4.1 Insertion type
    structure['INS_TYPE'], structure['FAMILY'], structure['CYTOBAND'] = insertion_type(chain)

    ## 4.2 Insertion strand
    structure['STRAND'], structure['POLYA'] = infer_strand(structure['INS_TYPE'], sequence, chain)

    ## 4.3 Sequence lengths
    lengths = infer_lengths(structure['INS_TYPE'], chain, structure['STRAND'])
    structure.update(lengths)

    ## 4.4 Insertion mechanism (TPRT or EI)
    structure['MECHANISM'] = infer_integration_mechanism(chain, structure['TRUNCATION_3_LEN'], structure['POLYA'])

    ## 4.5 Target site duplication (TO DO LATER...)
    #search4tsd()

    ## 4.6 Percentage resolved
    structure['PERC_RESOLVED'] = chain.perc_query_covered()

    return structure
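
# Usage sketch (hypothetical paths; the index is a minimap2 .mmi built from the
# retrotransposon consensus database):
#
#   structure = retrotransposon_structure('/data/ins.fa', '/refs/consensus.mmi', '/data/out')
#   # -> e.g. structure['INS_TYPE'], structure['STRAND'], structure['PERC_RESOLVED'];
#   #    empty dict if the sequence has no hit on the consensus database
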
def call_NUMT(vcf, mtGenome, outDir):
    '''
    Call NUMTs (nuclear integrations of mitochondrial DNA) by aligning the inserted
    sequences from a set of INS calls against the mitochondrial genome

    Input:
        1. vcf: VCF object containing INS calls
        2. mtGenome: Path to the mitochondrial genome in fasta format
        3. outDir: Output directory

    Output:
        1. outVCF: VCF object containing NUMT calls
    '''
    ## 0. Create temporary folder
    tmpDir = outDir + '/tmp'
    unix.mkdir(tmpDir)

    ## 1. Write inserted sequences into fasta file
    fastaPath = tmpDir + '/insertions.fa'
    fasta = ins2fasta(vcf, tmpDir)
    fasta.write(fastaPath)

    ## 2. Create index for the mitochondrial genome
    fileName = 'mtGenome'
    mtIndex = alignment.index_minimap2(mtGenome, fileName, tmpDir)

    ## 3. Align inserted sequences against the mitochondrial genome
    PAF_path = alignment.alignment_minimap2(fastaPath, mtIndex, 'hits2mt', 1, tmpDir)
    PAF_mt = formats.PAF()
    PAF_mt.read(PAF_path)

    ## 4. Generate single PAF objects per inserted sequence
    PAFs_mt = group_alignments(PAF_mt)

    ## 5. Make NUMT calls
    NUMTs = {}

    for insId in PAFs_mt:
        chain = PAFs_mt[insId].chain(20, 50)

        # Make NUMT call if enough % of sequence resolved
        if chain.perc_query_covered() >= 60:
            coords = chain.interval_template()
            NUMT = {}
            NUMT['ITYPE'] = 'NUMT'
            NUMT['MT_COORD'] = str(coords[0]) + '-' + str(coords[1])
            NUMTs[insId] = NUMT

    ## 6. Generate output VCF containing NUMT calls
    ## Create header for output VCF
    outVCF = formats.VCF()
    outVCF.header = vcf.header

    ## Add insertion specific fields to the VCF header
    info2add = {'ITYPE': ['.', 'String', 'Type of insertion (solo, partnered, orphan or NUMT)'], \
                '3PRIME': ['0', 'Flag', 'Partnered 3-prime transduction'], \
                '5PRIME': ['0', 'Flag', 'Partnered 5-prime transduction'], \
                'FAM': ['.', 'String', 'Repeat family'], \
                'CYTOID': ['.', 'String', 'Source element cytoband identifier'], \
                'RETRO_LEN': ['1', 'Integer', 'Inserted retrotransposon length'], \
                'TRUNCATION_5_LEN': ['1', 'Integer', 'Size of 5prime truncation'], \
                'TRUNCATION_3_LEN': ['1', 'Integer', 'Size of 3prime truncation'], \
                'INVERSION_LEN': ['1', 'Integer', '5-inversion length'], \
                'RETRO_COORD': ['.', 'String', 'Coordinates for inserted retrotransposon piece of sequence'], \
                'IS_FULL': ['0', 'Flag', 'Full length mobile element'], \
                'ORF1': ['0', 'Flag', 'ORF1 identified'], \
                'ORF2': ['0', 'Flag', 'ORF2 identified'], \
                'COMPETENT': ['0', 'Flag', 'Potential competent full L1 with intact ORFs'], \
                'TDCOORD_5PRIME': ['1', 'Integer', '5-prime transduced sequence coordinates'], \
                'TDCOORD_3PRIME': ['1', 'Integer', '3-prime transduced sequence coordinates'], \
                'TDLEN_5PRIME': ['1', 'Integer', '5-prime transduction length'], \
                'TDLEN_3PRIME': ['1', 'Integer', '3-prime transduction length'], \
                'STRAND': ['.', 'String', 'Insertion DNA strand (+ or -)'], \
                'MT_COORD': ['.', 'String', 'Coordinates for the piece of MT genome integrated']}

    outVCF.header.info.update(info2add)

    ## Select INS corresponding to NUMT calls and update info field with NUMT features
    for variant in vcf.variants:
        insId = variant.chrom + ':' + str(variant.pos)

        # Discard unresolved inserted sequences
        if insId not in NUMTs:
            continue

        variant2add = copy.deepcopy(variant)
        variant2add.info.update(NUMTs[insId])
        outVCF.add(variant2add)

    ## 7. Do cleanup
    #unix.rm([tmpDir])

    return outVCF
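
# Usage sketch (hypothetical inputs; vcf is a formats.VCF object with INS calls
# and mtGenome a fasta holding the mitochondrial sequence):
#
#   numtVCF = call_NUMT(vcf, '/refs/chrM.fa', '/data/out')
#   # -> VCF restricted to insertions resolved as NUMTs (>= 60% of query covered)
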
def call_MEI(vcf, consensus, reference, sourceDb, outDir):
    '''
    Call MEIs (mobile element insertions) by aligning the inserted sequences from a
    set of INS calls against retrotransposon consensus sequences and resolving their
    structure

    Input:
        1. vcf: VCF object containing INS calls
        2. consensus: Path to fasta containing retrotransposon consensus sequences
        3. reference: Path to the reference genome in fasta format
        4. sourceDb: Database of known source elements
        5. outDir: Output directory

    Output:
        1. outVCF: VCF object containing MEI calls
    '''
    ## 0. Create temporary folder
    tmpDir = outDir + '/tmp'
    unix.mkdir(tmpDir)

    ## 1. Write inserted sequences into fasta file
    fastaPath = tmpDir + '/MEI_candidate.fa'
    fasta = ins2fasta(vcf, tmpDir)
    fasta.write(fastaPath)

    ## 2. Create index for consensus sequences
    fileName = 'consensus'
    consensusIndex = alignment.index_minimap2(consensus, fileName, tmpDir)

    ## 3. Align inserted sequences against consensus
    PAF_path = alignment.alignment_minimap2(fastaPath, consensusIndex, 'hits2consensus', 1, tmpDir)
    PAF_consensus = formats.PAF()
    PAF_consensus.read(PAF_path)

    ## Temporary
    index = "/Users/brodriguez/Research/References/Annotations/H.sapiens/hg38/Repetitive_dna/smallRNAs.mmi"
    PAF_path = alignment.alignment_minimap2(fastaPath, index, 'hits2small_MEI', 1, tmpDir)

    ## Align inserted sequences against the reference genome
    #SAM_path = alignment.alignment_bwa(fastaPath, reference, 'hits2genome', 1, tmpDir)
    #PAF_path = alignment.sam2paf(SAM_path, 'hits2genome', tmpDir)
    #PAF_genome = formats.PAF()
    #PAF_genome.read(PAF_path)

    ## 4. Generate single PAF objects per inserted sequence
    PAFs_consensus = group_alignments(PAF_consensus)
    #PAFs_genome = group_alignments(PAF_genome)

    ## 5. Resolve structure for each insertion with matches on retrotransposon consensus sequences
    structures = {}

    for insId in PAFs_consensus:
        structures[insId] = MEI_structure(PAFs_consensus[insId], fasta.seqDict[insId])
        seqBeg, seqEnd = structures[insId]['CHAIN'].interval()

    ## 6. Resolve 3' partnered transductions
    structures = resolve_partnered_3prime(structures, fasta, reference, sourceDb, tmpDir)

    ## 7. Search for 5' partnered transductions
    structures = search4partnered_5prime(structures, fasta, reference, tmpDir)

    ## 8. Search for orphan transductions
    ## Remove resolved insertions
    #for insId in structures:
    #    if structures[insId]['PASS']:
    #        del PAFs_genome[insId]

    ## Do orphan transduction search
    #search4orphan(PAFs_genome, sourceDb, fasta)
    # TO FINISH LATER (Only two L1 orphan transductions so far...)

    ## 9. Generate output VCF containing MEI calls
    ## Create header for output VCF
    outVCF = formats.VCF()
    outVCF.header = vcf.header

    ## Add MEI specific fields to the VCF header
    info2add = {'ITYPE': ['.', 'String', 'Type of insertion (solo, partnered, orphan or NUMT)'], \
                '3PRIME': ['0', 'Flag', 'Partnered 3-prime transduction'], \
                '5PRIME': ['0', 'Flag', 'Partnered 5-prime transduction'], \
                'FAM': ['.', 'String', 'Repeat family'], \
                'CYTOID': ['.', 'String', 'Source element cytoband identifier'], \
                'RETRO_LEN': ['1', 'Integer', 'Inserted retrotransposon length'], \
                'TRUNCATION_5_LEN': ['1', 'Integer', 'Size of 5prime truncation'], \
                'TRUNCATION_3_LEN': ['1', 'Integer', 'Size of 3prime truncation'], \
                'INVERSION_LEN': ['1', 'Integer', '5-inversion length'], \
                'RETRO_COORD': ['.', 'String', 'Coordinates for inserted retrotransposon piece of sequence'], \
                'IS_FULL': ['0', 'Flag', 'Full length mobile element'], \
                'ORF1': ['0', 'Flag', 'ORF1 identified'], \
                'ORF2': ['0', 'Flag', 'ORF2 identified'], \
                'COMPETENT': ['0', 'Flag', 'Potential competent full L1 with intact ORFs'], \
                'TDCOORD_5PRIME': ['1', 'Integer', '5-prime transduced sequence coordinates'], \
                'TDCOORD_3PRIME': ['1', 'Integer', '3-prime transduced sequence coordinates'], \
                'TDLEN_5PRIME': ['1', 'Integer', '5-prime transduction length'], \
                'TDLEN_3PRIME': ['1', 'Integer', '3-prime transduction length'], \
                'STRAND': ['.', 'String', 'Insertion DNA strand (+ or -)'], \
                'MT_COORD': ['.', 'String', 'Coordinates for the piece of MT genome integrated']}

    outVCF.header.info.update(info2add)

    ## Select INS corresponding to MEI calls and update info field with MEI features
    for variant in vcf.variants:
        insId = variant.chrom + ':' + str(variant.pos)

        # Discard unresolved inserted sequences
        if (insId not in structures) or (structures[insId]['PASS'] is False):
            continue

        variant2add = copy.deepcopy(variant)
        variant2add.info.update(structures[insId])
        outVCF.add(variant2add)

    ## 10. Do cleanup
    #unix.rm([tmpDir])

    return outVCF
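
# Usage sketch (hypothetical inputs; consensus is a fasta of retrotransposon
# consensus sequences and sourceDb a database of known source elements):
#
#   meiVCF = call_MEI(vcf, '/refs/consensus.fa', '/refs/hg38.fa', sourceDb, '/data/out')
#   # -> VCF restricted to insertions resolved as MEIs, with ITYPE/FAM/STRAND/... INFO fields
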