Example No. 1
def BAM2BED(BAM, outDir):
    '''
    Convert BAM file into BED using bedtools

    Input:
        1. BAM: Path to BAM file
        2. outDir: Output directory

    Output:
        1. BED: Path to BED file
    '''
    ## 0. Create logs directory
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    ## 1. Convert BAM into BED
    BED_path = outDir + '/alignments.bed'
    err = open(logDir + '/BAM2BED.err', 'w')
    command = 'bedtools bamtobed -split -i ' + BAM + ' > ' + BED_path
    status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'BAM2BED'
        msg = 'BAM to BED conversion failed'
        log.step(step, msg)

    ## 2. Add header to BED file
    header = "#ref \t beg \t end \t name \t score \t strand \n"
    with open(BED_path, 'r') as original:
        data = original.read()
    with open(BED_path, 'w') as modified:
        modified.write(header + data)

    return BED_path
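
A minimal usage sketch (hypothetical paths; assumes bedtools is on the PATH and that the unix and log helper modules used above are importable):

# Hypothetical call: the BAM path and output directory are illustrative
bedPath = BAM2BED('/data/sample.bam', '/results/bed')
print(bedPath)   # -> /results/bed/alignments.bed
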
Example No. 2
def gene_annotation_lighter(metaclustersList, annovarDir, outDir):
    '''
    Perform gene-based annotation for a list of input events

    Input:
        1. metaclustersList: List containing input events to be annotated. Events should be objects containing ref, beg and end attributes.
        2. annovarDir: Directory containing the two files used by ANNOVAR to perform gene-based annotation:
                a) build_annot.txt     - Text file containing annotated transcript coordinates
                b) build_annotMrna.fa  - Fasta containing annotated transcript sequences
        3. outDir: Output directory

    Output:
        New 'geneAnnot' attribute set for each input event.
        'geneAnnot' is a tuple (region, gene)
    '''

    ## 1. Create output directory
    unix.mkdir(outDir)

    ## 2. Create input file containing events intervals for ANNOVAR 
    create_annovar_input_lighter(metaclustersList, 'events.annovar', outDir)
    annovarInput = outDir + '/events.annovar'

    ## 3. Annotate events intervals with ANNOVAR 
    out1, out2 = run_annovar(annovarInput, annovarDir, outDir)

    ## 4. Add gene annotation info to the events
    addGnAnnot2events_lighter(metaclustersList, out1)
    
    ## 5. Do cleanup
    unix.rm([annovarInput, out1, out2])
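
A usage sketch under stated assumptions: metaclusters is a list of event objects with ref, beg and end attributes built elsewhere in the pipeline, and annovarDir holds the two ANNOVAR files described above:

# Hypothetical call; all inputs are illustrative
gene_annotation_lighter(metaclusters, '/annotations/annovar', '/results/geneAnnot')

# afterwards each event carries a (region, gene) tuple
for metacluster in metaclusters:
    print(metacluster.geneAnnot)
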
Example No. 3
def komplexityFilter(komplexityThreshold, inFasta, outFasta, outDir):
    '''
    Filter fasta file using the komplexity tool

    Input:
        1. komplexityThreshold: Complexity threshold for filtering.
        2. inFasta: Input FASTA file name
        3. outFasta: Output FASTA file name
        4. outDir: Input and output directory (must be the same)

    Output:
        1. allFastas: Complete path to the filtered FASTA file.
    '''

    # Set input and output files
    allFastas_all = outDir + '/' + inFasta
    allFastas = outDir + '/' + outFasta

    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    command = 'kz --filter --threshold ' + str(komplexityThreshold) + ' --fasta < ' + allFastas_all + ' > ' + allFastas
    err = open(logDir + '/komplexity.err', 'w')
    status = subprocess.call(command, stderr=err, shell=True)
    if status != 0:
        step = 'KOMPLEXITY'
        msg = 'Komplexity filter failed. PID: ' + str(os.getpid())
        log.step(step, msg)

    return allFastas
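
A usage sketch, assuming the kz binary from the komplexity tool is installed; the threshold value and file names are illustrative:

# Hypothetical call; input fasta must already sit in the output directory
filtered = komplexityFilter(0.55, 'candidates.fa', 'candidates.filtered.fa', '/results')
# filtered == '/results/candidates.filtered.fa'
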
Example No. 4
def load_annotations(annotations2load, refLengths, annotationsDir, germlineMEI, threads, outDir):
    '''
    Load a set of annotation files in bed formats into a bin database

    Input:
        1. annotations2load: list of annotations to load. Annotations available: REPEATS, REPEATS-L1, TRANSDUCTIONS, EXONS and GERMLINE-MEI
        2. refLengths: Dictionary containing reference ids as keys and as values the length for each reference  
        3. annotationsDir: Directory containing annotation files
        4. germlineMEI: Bed file containing set of known germline MEI. None if not available
        5. threads: number of threads used to parallelize the bin database creation
        6. outDir: Output directory
    
    Output:
        1. annotations: dictionary containing one key per type of annotation loaded and bin databases containing annotated features as values (None for those annotations not loaded)
    '''
    ## 0. Initialize dictionary
    annotations = {}
    annotations['REPEATS'] = None
    annotations['TRANSDUCTIONS'] = None
    annotations['EXONS'] = None
    annotations['GERMLINE-MEI'] = None

    ## Create output directory
    unix.mkdir(outDir)

    ## 1A. Load annotated repeats into a bin database
    if 'REPEATS' in annotations2load:

        repeatsBed = annotationsDir + '/repeats.bed'
        annotations['REPEATS'] = formats.bed2binDb(repeatsBed, refLengths, threads)

    ## 1B. Load L1 repeats and pA into a bin database
    elif 'REPEATS-L1' in annotations2load:
        
        repeatsBed = annotationsDir + '/repeats.L1.pA.bed'
        annotations['REPEATS'] = formats.bed2binDb(repeatsBed, refLengths, threads)
    
    ## 2. Create transduced regions database
    if 'TRANSDUCTIONS' in annotations2load:

        ## Create bed file containing transduced regions
        sourceBed = annotationsDir + '/srcElements.bed'
        # buffer equals -150 to avoid the end of the src element
        transducedPath = databases.create_transduced_bed(sourceBed, 10000, -150, outDir)
        
        ## Load transduced regions into a bin database
        annotations['TRANSDUCTIONS'] = formats.bed2binDb(transducedPath, refLengths, threads)

    ## 3. Create exons database
    if 'EXONS' in annotations2load:

        exonsBed = annotationsDir + '/exons.bed'
        annotations['EXONS'] = formats.bed2binDb(exonsBed, refLengths, threads)

    ## 4. Create germline MEI database
    if 'GERMLINE-MEI' in annotations2load:
        annotations['GERMLINE-MEI'] = formats.bed2binDb(germlineMEI, refLengths, threads)

    return annotations
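
A usage sketch; refLengths would normally be derived from a fasta .fai index, and the annotation file names under annotationsDir must match those hardcoded above:

# Illustrative reference lengths (normally parsed from a .fai index)
refLengths = {'chr1': 248956422, 'chr2': 242193529}

annotations = load_annotations(['REPEATS', 'EXONS'], refLengths,
                               '/annotations', None, 4, '/results/annot')
# annotations['TRANSDUCTIONS'] and annotations['GERMLINE-MEI'] remain None
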
Example No. 5
def targeted_alignment_minimap2(FASTA, targetInterval, reference, outDir,
                                outFormat):
    # NOTE 2020: in the 2020 version the signature was:
    # def targeted_alignment_minimap2(FASTA, targetInterval, reference, outDir):
    '''
    Align a set of sequences against a reference target region.

    Useful for local realignment of reads around SV breakpoints. Much faster than whole-genome realignment.

    Input:
        1. FASTA: Path to FASTA file with sequences to align
        2. targetInterval: Reference genome interval where sequences will be aligned. The interval must be provided as chr:beg-end.
        3. reference: Path to the reference sequences in fasta format. An index of the reference generated with samtools faidx must be located in the same directory
        4. outDir: Output directory
        5. outFormat: BAM or SAM

    Output:
        1. BAM: Path to sorted BAM file containing input sequences alignments or 'None' if realignment failed 
    '''
    ## 0. Create logs directory
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    ## 1. Extract the reference target region prior to alignment
    target = outDir + '/target.fa'
    err = open(logDir + '/target.err', 'w')
    command = 'samtools faidx ' + reference + ' ' + targetInterval + ' > ' + target
    status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'TARGET'
        msg = 'Extraction of reference target region failed'
        log.step(step, msg)
        return None

    ## 2. Align the sequences into the target region
    # Use -Y to get soft clippings for supplementary alignments
    SAM = outDir + '/alignments.sam'
    err = open(logDir + '/align.err', 'w')
    command = 'minimap2 -Y -a ' + target + ' ' + FASTA + ' > ' + SAM
    status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'ALIGN'
        msg = 'Local alignment failed'
        log.step(step, msg)
        return None

    # NOTE 2020: the following 2 lines were removed in the 2020 version:
    if outFormat == "SAM":
        return SAM

    ## 3. Convert SAM to sorted BAM
    BAM = bamtools.SAM2BAM(SAM, outDir)

    ## 4. Do cleanup
    unix.rm([target, SAM])

    return BAM
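
A usage sketch, assuming samtools and minimap2 are on the PATH and the reference has a .fai index; the interval and paths are illustrative:

BAM = targeted_alignment_minimap2('/results/reads.fa', 'chr1:1000000-1010000',
                                  '/references/genome.fa', '/results/realign', 'BAM')
if BAM is None:
    print('realignment failed')
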
Example No. 6
def SAM2BAM(SAM, outDir):
    '''
    Convert SAM file into sorted BAM and make BAM index

    Input:
        1. SAM: File containing alignments in SAM format
        2. outDir: Output directory

    Output:
        1. BAM_sorted: Sorted and indexed BAM file. BAM index located in the same directory with the extension '.bai'
    '''
    ## 0. Create logs directory
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    ## 1. Convert SAM into BAM
    BAM = outDir + '/alignments.bam'
    err = open(logDir + '/SAM2BAM.err', 'w')
    command = 'samtools view -Sb ' + SAM + ' > ' + BAM
    status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'SAM2BAM'
        msg = 'SAM to BAM conversion failed'
        log.step(step, msg)

    ## 2. Sort bam
    BAM_sorted = outDir + '/alignments.sorted.bam'
    err = open(logDir + '/sort.err', 'w')
    command = 'samtools sort ' + BAM + ' > ' + BAM_sorted
    status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'SORT'
        msg = 'BAM sorting failed'
        log.step(step, msg)

    ## 3. Index bam
    BAM_index = outDir + '/alignments.sorted.bam.bai'
    err = open(logDir + '/index.err', 'w')
    # samtools index writes the index file itself; pass the output path as an argument rather than redirecting stdout
    command = 'samtools index ' + BAM_sorted + ' ' + BAM_index
    status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'INDEX'
        msg = 'BAM indexing failed'
        log.step(step, msg)

    return BAM_sorted
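
A usage sketch (hypothetical paths; assumes samtools is on the PATH):

sortedBAM = SAM2BAM('/results/alignments.sam', '/results')
# sortedBAM == '/results/alignments.sorted.bam', with the .bai index next to it
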
Example No. 7
def samtools_index_bam(BAM, outDir):
    '''
    Index bam file using samtools

    Input:
        1. BAM: Input bam file complete path.
        2. outDir: Output directory

    Output:
        1. Doesn't return anything. Creates the bam index file.
    '''
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    command = 'samtools index ' + BAM
    err = open(logDir + '/samtools_index_bam.err', 'w')
    status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'INDEX'
        msg = 'BAM indexing failed'
        log.step(step, msg)

    return
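
A usage sketch (hypothetical path; the .bai is written next to the BAM):

samtools_index_bam('/results/alignments.sorted.bam', '/results')
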
Example No. 8
def create_targeted_fasta(targetIntervalList, reference, outDir):
    '''
    Extract regions of interest from a fasta file.
    
    Input:
        1. targetIntervalList: Reference genome list of intervals to be extracted. The intervals must be provided as chr:beg-end.
        2. reference: Path to fasta file. An index of the reference generated with samtools faidx must be located in the same directory
        3. outDir: Output directory

    Output:
        1. target: Path to fasta file with sequences extracted from the intervals.
    '''
    ## 0. Create logs directory
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    ## 1. Extract the reference target regions
    target = outDir + '/targetRegions.fa'
    err = open(logDir + '/target.err', 'w')
    targetRegionsPath = outDir + '/targetRegions.txt'
    targetRegions = open(targetRegionsPath, 'w')

    for targetInterval in targetIntervalList:
        targetRegions.write(targetInterval + '\n')
    targetRegions.close()

    command = 'samtools faidx ' + reference + ' -r ' + targetRegionsPath + ' -o ' + target
    status = subprocess.call(command, stderr=err, shell=True)

    if status != 0:
        step = 'TARGET'
        msg = 'Extraction of reference target region failed'
        log.step(step, msg)
        return None

    # TODO: remove targetRegionsPath file

    return target
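
A usage sketch; intervals follow the chr:beg-end convention required above:

intervals = ['chr1:10000-20000', 'chr5:500000-501000']
target = create_targeted_fasta(intervals, '/references/genome.fa', '/results')
# target == '/results/targetRegions.fa' (or None on failure)
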
Example No. 9
def identity_metaclusters_retrotest(metaclusters, bam, outDir):
    '''
    Determine retrotest metaclusters identity. If there is only one cluster and it contains a polyA tail,
    it will be classified as a partnered event. Otherwise, it will be an orphan transduction.
    
    Partnered:
    --------->
             ----AAAAAA>
       --------->
           ------AAAA>

    Orphan:
    --------->
             ----ACGTCA>
       --------->
           ------ACG>
    
    Input:
    1. metaclusters: List of retrotest metaclusters
    2. bam: Bam file
    3. outDir: output directory
    
    Output:
    Fill metacluster identity attribute with 'partnered' or 'orphan'    
    '''

    # set new confDict parameters to search for clippings
    newconfDict = {}
    newconfDict['targetEvents'] = ['CLIPPING']
    newconfDict['minMAPQ'] = 30
    newconfDict['minCLIPPINGlen'] = 8
    newconfDict['overhang'] = 0
    newconfDict['filterDuplicates'] = True
    newconfDict['readFilters'] = ['mateUnmap', 'insertSize', 'SMS']

    # for each metacluster
    for metacluster in metaclusters:

        # if there are no reciprocal clusters
        if metacluster.orientation != 'RECIPROCAL':

            ## 1. Collect clippings in region
            eventsDict = bamtools.collectSV(metacluster.ref,
                                            metacluster.refLeftBkp - 100,
                                            metacluster.refRightBkp + 100,
                                            bam,
                                            newconfDict,
                                            None,
                                            supplementary=False)

            ## 2. Create clipping consensus
            # create bkp dir
            bkpDir = outDir + '/BKP'
            unix.mkdir(bkpDir)

            # initialize variable
            clipConsensus = None

            # if cluster orientation is plus
            if metacluster.orientation == 'PLUS':

                # if there is only one clipping event
                if len(eventsDict['RIGHT-CLIPPING']) == 1:
                    clipConsensus = eventsDict['RIGHT-CLIPPING'][0].clipped_seq()

                # if there is more than one clipping event
                elif len(eventsDict['RIGHT-CLIPPING']) > 1:
                    clipConsensusPath, clipConsensus = bkp.makeConsSeqs(
                        eventsDict['RIGHT-CLIPPING'], 'INT', bkpDir)

            # if cluster orientation is minus
            elif metacluster.orientation == 'MINUS':

                # if there is only one clipping event
                if len(eventsDict['LEFT-CLIPPING']) == 1:
                    clipConsensus = eventsDict['LEFT-CLIPPING'][0].clipped_seq()

                # if there is more than one clipping event
                elif len(eventsDict['LEFT-CLIPPING']) > 1:
                    clipConsensusPath, clipConsensus = bkp.makeConsSeqs(
                        eventsDict['LEFT-CLIPPING'], 'INT', bkpDir)

            ## 3. polyA search if there is a consensus
            if clipConsensus:

                # set metacluster identity to partnered if there is polyA/polyT tail in consensus seq
                if has_polyA_illumina(clipConsensus):
                    metacluster.identity = 'partnered'

        # set metacluster identity to orphan if metacluster not partnered
        if metacluster.identity != 'partnered':
            metacluster.identity = 'orphan'
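
A usage sketch under stated assumptions: metaclusters come from an upstream clustering step and expose ref, refLeftBkp, refRightBkp, orientation and identity attributes:

identity_metaclusters_retrotest(metaclusters, '/data/sample.bam', '/results')

for metacluster in metaclusters:
    print(metacluster.identity)   # 'partnered' or 'orphan'
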
Example No. 10
def retrotransposon_structure(FASTA_file, index, outDir):
    '''    
    Infer the insertion size, structure, poly-A, target site duplication length and other insertion structural features

    Input:
        1. FASTA_file: Path to FASTA file containing the sequence
        2. index: Minimap2 index for consensus retrotransposon sequences database
        3. outDir: Output directory
        
    Output:
        1. structure: dictionary containing insertion structure information
    '''
    structure = {}

    ## 0. Create logs directory ##
    logDir = outDir + '/Logs'
    unix.mkdir(logDir)

    ## 1. Align the sequence into the retrotransposon sequences database ##
    PAF_file = alignment.alignment_minimap2(FASTA_file, index,
                                            'alignment2consensus', 1, outDir)

    ## 2. Read PAF alignments ##
    PAF = formats.PAF()
    PAF.read(PAF_file)

    # Exit function if no hit on the retrotransposons database
    if not PAF.alignments:
        return structure

    ## 3. Chain complementary alignments ##
    chain = PAF.chain(100, 20)

    ## 4. Infer insertion features ##
    ## Retrieve inserted seq
    FASTA = formats.FASTA()
    FASTA.read(FASTA_file)
    sequence = list(FASTA.seqDict.values())[0]

    ## 4.1 Insertion type
    structure['INS_TYPE'], structure['FAMILY'], structure['CYTOBAND'] = insertion_type(chain)

    ## 4.2 Insertion strand
    structure['STRAND'], structure['POLYA'] = infer_strand(
        structure['INS_TYPE'], sequence, chain)

    ## 4.3 Sequence lengths
    lengths = infer_lengths(structure['INS_TYPE'], chain, structure['STRAND'])
    structure.update(lengths)

    ## 4.4 Insertion mechanism (TPRT or EI)
    structure['MECHANISM'] = infer_integration_mechanism(
        chain, structure['TRUNCATION_3_LEN'], structure['POLYA'])

    ## 4.5 Target site duplication (TO DO LATER...)
    #search4tsd()

    ## 4.6 Percentage resolved
    structure['PERC_RESOLVED'] = chain.perc_query_covered()

    return structure
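
A usage sketch; the .mmi index would be built beforehand (e.g. with alignment.index_minimap2, as in the call_NUMT example below), and the keys shown are among those set above:

structure = retrotransposon_structure('/results/insertion.fa',
                                      '/references/consensus.mmi', '/results')
if structure:
    print(structure['INS_TYPE'], structure['STRAND'], structure['PERC_RESOLVED'])
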
Example No. 11
def call_NUMT(vcf, mtGenome, outDir):
    '''
    Call NUMTs (nuclear insertions of mitochondrial DNA) from a set of inserted sequences

    Input:
        1. vcf: VCF object containing INS calls
        2. mtGenome: Path to the mitochondrial genome in fasta format
        3. outDir: Output directory

    Output:
        1. outVCF: VCF object containing NUMT calls
    '''
    ## 0. Create temporary folder
    tmpDir = outDir + '/tmp'
    unix.mkdir(tmpDir)

    ## 1. Write inserted sequences into fasta file
    fastaPath = tmpDir + '/insertions.fa'
    fasta = ins2fasta(vcf, tmpDir)
    fasta.write(fastaPath)

    ## 2. Create index for the mitochondrial genome
    fileName = 'mtGenome'
    mtIndex = alignment.index_minimap2(mtGenome, fileName, tmpDir)

    ## 3. Align inserted sequences against the mitochondrial genome
    PAF_path = alignment.alignment_minimap2(fastaPath, mtIndex, 'hits2mt', 1,
                                            tmpDir)
    PAF_mt = formats.PAF()
    PAF_mt.read(PAF_path)

    ## 4. Generate single PAF objects per inserted sequence:
    PAFs_mt = group_alignments(PAF_mt)

    ## 5. Make NUMTs calls
    NUMTs = {}

    for insId in PAFs_mt:
        chain = PAFs_mt[insId].chain(20, 50)

        # Make NUMT call if enough % of sequence resolved
        if chain.perc_query_covered() >= 60:

            coords = chain.interval_template()

            NUMT = {}
            NUMT['ITYPE'] = 'NUMT'
            NUMT['MT_COORD'] = str(coords[0]) + '-' + str(coords[1])
            NUMTs[insId] = NUMT

    ## 6. Generate output VCF containing NUMT calls
    ## Create header for output dictionary
    outVCF = formats.VCF()
    outVCF.header = vcf.header

    ## Add MEI specific fields to the VCF header
    info2add = {'ITYPE': ['.', 'String', 'Type of insertion (solo, partnered or orphan)'], \
                '3PRIME': ['0', 'Flag', 'Partnered 3-prime transduction'], \
                '5PRIME': ['0', 'Flag', 'Partnered 5-prime transduction'], \
                'FAM': ['.', 'String', 'Repeat family'], \
                'CYTOID': ['.', 'String', 'Source element cytoband identifier'], \
                'RETRO_LEN': ['1', 'Integer', 'Inserted retrotransposon length'], \
                'TRUNCATION_5_LEN': ['1', 'Integer', 'Size of 5prime truncation'], \
                'TRUNCATION_3_LEN': ['1', 'Integer', 'Size of 3prime truncation'], \
                'INVERSION_LEN': ['1', 'Integer', '5-inversion length'], \
                'RETRO_COORD': ['.', 'String', 'Coordinates for inserted retrotransposon piece of sequence'], \
                'IS_FULL': ['0', 'Flag', 'Full length mobile element'], \
                'ORF1': ['0', 'Flag', 'ORF1 identified'], \
                'ORF2': ['0', 'Flag', 'ORF2 identified'], \
                'COMPETENT': ['0', 'Flag', 'Potential competent full L1 with intact ORFs'], \
                'TDCOORD_5PRIME': ['1', 'Integer', '5-prime transduced sequence coordinates'], \
                'TDCOORD_3PRIME': ['1', 'Integer', '3-prime transduced sequence coordinates'], \
                'TDLEN_5PRIME': ['1', 'Integer', '5-prime transduction length'], \
                'TDLEN_3PRIME': ['1', 'Integer', '3-prime transduction length'], \
                'STRAND': ['.', 'String', 'Insertion DNA strand (+ or -)'],
                }
    outVCF.header.info.update(info2add)

    ## Select INS corresponding to NUMT calls and update the info field with NUMT features
    for variant in vcf.variants:
        insId = variant.chrom + ':' + str(variant.pos)

        # Discard unresolved inserted sequences
        if insId not in NUMTs:
            continue

        variant2add = copy.deepcopy(variant)
        variant2add.info.update(NUMTs[insId])
        outVCF.add(variant2add)

    ## 7. Do cleanup
    #unix.rm([tmpDir])

    return outVCF
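
A usage sketch; the VCF reader call is an assumption, mirroring the formats.PAF().read(...) API shown above:

vcf = formats.VCF()
vcf.read('/results/insertions.vcf')   # assumed reader API, mirroring formats.PAF
numtVCF = call_NUMT(vcf, '/references/chrM.fa', '/results')
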
Example No. 12
def call_MEI(vcf, consensus, reference, sourceDb, outDir):
    '''
    Call mobile element insertions (MEI) from a set of inserted sequences

    Input:
        1. vcf: VCF object containing INS calls
        2. consensus: Path to consensus retrotransposon sequences in fasta format
        3. reference: Path to the reference genome in fasta format
        4. sourceDb: Database of source elements used for transduction calling
        5. outDir: Output directory

    Output:
        1. outVCF: VCF object containing MEI calls
    '''
    ## 0. Create temporary folder
    tmpDir = outDir + '/tmp'
    unix.mkdir(tmpDir)

    ## 1. Write inserted sequences into fasta file
    fastaPath = tmpDir + '/MEI_candidate.fa'
    fasta = ins2fasta(vcf, tmpDir)
    fasta.write(fastaPath)

    ## 2. Create index for consensus sequences
    fileName = 'consensus'
    consensusIndex = alignment.index_minimap2(consensus, fileName, tmpDir)

    ## 3. Align inserted sequences against consensus:
    PAF_path = alignment.alignment_minimap2(fastaPath, consensusIndex,
                                            'hits2consensus', 1, tmpDir)
    PAF_consensus = formats.PAF()
    PAF_consensus.read(PAF_path)

    ## Temporary
    index = "/Users/brodriguez/Research/References/Annotations/H.sapiens/hg38/Repetitive_dna/smallRNAs.mmi"
    PAF_path = alignment.alignment_minimap2(fastaPath, index, 'hits2small_MEI',
                                            1, tmpDir)

    ## Align inserted sequences against the reference genome
    #SAM_path = alignment.alignment_bwa(fastaPath, reference, 'hits2genome', 1, tmpDir)
    #PAF_path = alignment.sam2paf(SAM_path, 'hits2genome', tmpDir)
    #PAF_genome = formats.PAF()
    #PAF_genome.read(PAF_path)

    ## 4. Generate single PAF objects per inserted sequence:
    PAFs_consensus = group_alignments(PAF_consensus)
    #PAFs_genome = group_alignments(PAF_genome)

    ## 5. Resolve structure for each insertion with matches on retrotransposon consensus sequences
    structures = {}

    for insId in PAFs_consensus:
        structures[insId] = MEI_structure(PAFs_consensus[insId],
                                          fasta.seqDict[insId])
        seqBeg, seqEnd = structures[insId]['CHAIN'].interval()

    ## 6. Resolve 3' partnered transductions
    structures = resolve_partnered_3prime(structures, fasta, reference,
                                          sourceDb, tmpDir)

    ## 7. Search for 5' partnered transductions
    structures = search4partnered_5prime(structures, fasta, reference, tmpDir)

    ## 8. Search for orphan transductions
    ## Remove resolved insertions
    #for insId in structures:
    #    if structures[insId]['PASS']:
    #        del PAFs_genome[insId]

    ## Do orphan transduction search
    #search4orphan(PAFs_genome, sourceDb, fasta) # TO FINISH LATER (Only two L1 orphan transductions so far..)

    ## 9. Generate output VCF containing MEI calls
    ## Create header for output dictionary
    outVCF = formats.VCF()
    outVCF.header = vcf.header

    ## Add MEI specific fields to the VCF header
    info2add = {'ITYPE': ['.', 'String', 'Type of insertion (solo, partnered,  orphan or NUMT)'], \
                '3PRIME': ['0', 'Flag', 'Partnered 3-prime transduction'], \
                '5PRIME': ['0', 'Flag', 'Partnered 5-prime transduction'], \
                'FAM': ['.', 'String', 'Repeat family'], \
                'CYTOID': ['.', 'String', 'Source element cytoband identifier'], \
                'RETRO_LEN': ['1', 'Integer', 'Inserted retrotransposon length'], \
                'TRUNCATION_5_LEN': ['1', 'Integer', 'Size of 5prime truncation'], \
                'TRUNCATION_3_LEN': ['1', 'Integer', 'Size of 3prime truncation'], \
                'INVERSION_LEN': ['1', 'Integer', '5-inversion length'], \
                'RETRO_COORD': ['.', 'String', 'Coordinates for inserted retrotransposon piece of sequence'], \
                'IS_FULL': ['0', 'Flag', 'Full length mobile element'], \
                'ORF1': ['0', 'Flag', 'ORF1 identified'], \
                'ORF2': ['0', 'Flag', 'ORF2 identified'], \
                'COMPETENT': ['0', 'Flag', 'Potential competent full L1 with intact ORFs'], \
                'TDCOORD_5PRIME': ['1', 'Integer', '5-prime transduced sequence coordinates'], \
                'TDCOORD_3PRIME': ['1', 'Integer', '3-prime transduced sequence coordinates'], \
                'TDLEN_5PRIME': ['1', 'Integer', '5-prime transduction length'], \
                'TDLEN_3PRIME': ['1', 'Integer', '3-prime transduction length'], \
                'STRAND': ['.', 'String', 'Insertion DNA strand (+ or -)'], \
                'MT_COORD': ['.', 'String', 'Coordinates for the piece of MT genome integrated']
                }

    outVCF.header.info.update(info2add)

    ## Select INS corresponding to MEI calls and update the info field with MEI features
    for variant in vcf.variants:
        insId = variant.chrom + ':' + str(variant.pos)

        # Discard unresolved inserted sequences
        if insId not in structures or structures[insId]['PASS'] is False:
            continue

        variant2add = copy.deepcopy(variant)
        variant2add.info.update(structures[insId])
        outVCF.add(variant2add)

    ## 10. Do cleanup
    #unix.rm([tmpDir])

    return outVCF
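
A usage sketch; sourceDb would be the source element database consumed by resolve_partnered_3prime, and the VCF loading call is the same assumption as in the call_NUMT example:

vcf = formats.VCF()
vcf.read('/results/insertions.vcf')   # assumed reader API
meiVCF = call_MEI(vcf, '/references/consensus.fa',
                  '/references/genome.fa', sourceDb, '/results')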