Example #1
from pymummer import coords_file, nucmer


def run_nucmer(ref, query, output, min_percent_id=95, run_promer=False):
    '''Run nucmer (or promer, if run_promer is True) and return a list of alignment objects.'''
    runner = nucmer.Runner(
        ref,
        query,
        output,
        min_id=min_percent_id,
        coords_header=False,
        maxmatch=True,
        simplify=False,
        promer=run_promer)  # nucmer default break length is 200
    runner.run()
    # Materialise the coords iterator into a stable list of alignment objects
    alignments = list(coords_file.reader(output))
    return alignments
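A minimal usage sketch (file names here are hypothetical; assumes MUMmer is on the PATH and pymummer is installed):

hits = run_nucmer('reference.fasta', 'assembly.fasta', 'assembly.coords')
for hit in hits:
    # ref_name, ref_start, ref_end and percent_identity are attributes of
    # pymummer alignment objects
    print(hit.ref_name, hit.ref_start, hit.ref_end, hit.percent_identity)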
Example #2
import os

from pymummer import coords_file, nucmer

# Note: cleanID, manageTemp, makeBlast and run_blast are helper functions
# defined elsewhere in the source module.


def getTIRs(elements=None,
            flankdist=10,
            minid=80,
            minterm=10,
            minseed=5,
            diagfactor=0.3,
            mites=False,
            report='split',
            temp=None,
            keeptemp=False,
            alignTool='nucmer',
            verbose=False):
    """
    Align elements to self and attempt to identify TIRs.
    Optionally attempt to construct synthetic MITEs from TIRs.
    """
    # Set temp directory to cwd if none.
    if not temp:
        temp = os.getcwd()
    # For each candidate LTR element
    for rec in elements:
        # Create temp paths for single element fasta and alignment coords
        tempFasta = os.path.join(temp, cleanID(rec.id) + '.fasta')
        tempCoords = tempFasta + '.coords'
        # Write current element to single fasta
        manageTemp(record=rec, tempPath=tempFasta, scrub=False)
        # Align to self with nucmer
        if alignTool == 'nucmer':
            # Compose Nucmer script for current element vs self
            runner = nucmer.Runner(tempFasta,
                                   tempFasta,
                                   tempCoords,
                                   min_id=minid,
                                   min_length=minseed,
                                   diagfactor=diagfactor,
                                   mincluster=minterm,
                                   breaklen=200,
                                   maxmatch=True,
                                   simplify=False)
            # Execute nucmer
            runner.run()
        elif alignTool == 'blastn':
            # Alternatively, use blastn as search tool and write nucmer.coords-like output.
            cmds = makeBlast(seq=tempFasta, outfile=tempCoords, pid=minid)
            run_blast(cmds, verbose=verbose)
        # Import coords file to iterator object
        file_reader = coords_file.reader(tempCoords)
        # Exclude hits to self. Also converts iterator output to stable list
        alignments = [hit for hit in file_reader if not hit.is_self_hit()]
        # Filter out hits shorter than the minimum length (done internally by nucmer, not blastn)
        alignments = [
            hit for hit in alignments if hit.ref_end - hit.ref_start >= minterm
        ]
        # Exclude hits on the same strand (tandem repeats / LTRs); TIRs align as inverted repeats
        alignments = [hit for hit in alignments if not hit.on_same_strand()]
        # Keep 5' repeats which begin within flankdist bases of the element start
        alignments = [hit for hit in alignments if hit.ref_start <= flankdist]
        # Scrub overlapping ref / query segments, and also complementary 3' to 5' flank hits
        alignments = [hit for hit in alignments if hit.ref_end < hit.qry_end]
        # Sort largest to smallest dist between end of ref (subject) and start of query (hit)
        # x.qry_start - x.ref_end = length of internal segment
        alignments = sorted(alignments,
                            key=lambda x: (x.qry_end - x.ref_end),
                            reverse=True)
        # If alignments exist after filtering, report features using the alignment
        # pair with the largest internal segment, i.e. the first element in the sorted list.
        if alignments:
            if verbose:
                for hit in alignments:
                    print(hit)
            if report == 'all':
                yield rec
            if report in ['split', 'external']:
                # yield TIR slice - append "_TIR"
                extSeg = rec[alignments[0].ref_start:alignments[0].ref_end + 1]
                extSeg.id = extSeg.id + "_TIR"
                extSeg.name = extSeg.id
                extSeg.description = "[" + rec.id + " TIR segment]"
                yield extSeg
            if report in ['split', 'internal']:
                # yield internal slice - append "_I"
                intSeg = rec[alignments[0].ref_end:alignments[0].qry_end + 1]
                intSeg.id = intSeg.id + "_I"
                intSeg.name = intSeg.id
                intSeg.description = "[" + rec.id + " internal segment]"
                yield intSeg
            if mites:
                # Assemble TIRs into hypothetical MITEs
                synMITE = (
                    rec[alignments[0].ref_start:alignments[0].ref_end + 1]
                    + rec[alignments[0].qry_end:alignments[0].qry_start + 1])
                synMITE.id = synMITE.id + "_synMITE"
                synMITE.name = synMITE.id
                synMITE.description = "[Synthetic MITE constructed from " + rec.id + " TIRs]"
                yield synMITE
        else:
            # If alignment list empty after filtering print alert and continue
            print('No TIRs found for candidate element: %s' % rec.id)
        # Scrub single fasta and coords file for current element.
        if not keeptemp:
            manageTemp(tempPath=tempFasta, scrub=True)
            manageTemp(tempPath=tempCoords, scrub=True)
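A sketch of how this generator might be driven, assuming the input elements are Biopython SeqRecord objects (file names are hypothetical):

from Bio import SeqIO

elements = SeqIO.parse('candidate_elements.fasta', 'fasta')
with open('tir_features.fasta', 'w') as handle:
    for feature in getTIRs(elements, report='split', mites=True):
        SeqIO.write(feature, handle, 'fasta')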
Example #3
        else:
            contigs.append(x)
            #print("long", x.id)

    for pathname in glob.glob("*.fasta"):
        basename = os.path.basename(pathname)

        for x in short_contigs:

            if x.id in basename:
                # Self-align the matching fasta and write a coords file
                runner = nucmer.Runner(basename,
                                       basename,
                                       "{}_out.coords".format(x.id),
                                       maxmatch=True,
                                       simplify=False,
                                       mincluster=2000,
                                       min_id=99,
                                       min_length=2000,
                                       coords_header=True)
                runner.run()

# Uncomment the lines below to save fasta files of the contigs, if desired
#SeqIO.write(short_contigs , "short_contigs.fasta", "fasta")
#SeqIO.write(lin_contigs , "lin_contigs.fasta", "fasta")

# The block below can be used to visually check which files are repetitive
'''
for pathname in glob.glob("*.coords"):
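The commented-out checking loop above is truncated in this excerpt. A sketch of the kind of check it gestures at, flagging coords files whose self-alignment contains non-self hits (illustrative only; assumes pymummer's reader can parse the generated coords files):

import glob

from pymummer import coords_file

for pathname in glob.glob("*.coords"):
    # Contigs with non-self hits in their self-alignment look repetitive
    hits = [h for h in coords_file.reader(pathname) if not h.is_self_hit()]
    if hits:
        print(pathname, 'contains', len(hits), 'repeat-like self hits')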
Example #4
# Check that a valid identity (-i) value was provided (isdigit accepts integers only)
if not args.minidentity.isdigit():
    print('Error: Identity (-i) threshold must be a numeric value')
    exit()
# Set a default header if a user-defined header name is absent
if args.header is None:
    args.header = 'reordered_contig'
print(args.header)

# Load the query file into a faidx index (Fasta here presumably comes from pyfaidx)
contigfile = Fasta(args.query)

# Run nucmer and write the coords output file
runner = nucmer.Runner(args.reference,
                       args.query,
                       args.coordinates,
                       min_id=args.minidentity,
                       coords_header=False)
runner.run()

#open output files
coordsfile = open(args.coordinates)
outfile = open(args.output, 'w')

#reorder the sequences based on the reference genome coordinates
reordered = ''
#print(contigfile[0].name)
for line in coordsfile:
    fields = line.replace('[BEGIN]', '').rstrip('\n').split('\t')[:-1]
    #print(fields)
    start = int(fields[2])
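The reordering loop is cut off in this excerpt. As an aside, the manual field splitting could also be done with pymummer's own parser, which yields alignment objects rather than raw columns (a sketch, assuming the coords file is in a format coords_file.reader understands):

from pymummer import coords_file

for aln in coords_file.reader(args.coordinates):
    print(aln.ref_name, aln.ref_start, aln.ref_end, aln.qry_name, aln.qry_start)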
Example #5
import argparse
import csv
import math
import os
from collections import defaultdict

from pymummer import coords_file, nucmer

# Note: find_genome_filename, copy_and_gunzip_genome and remove_contigs are
# helper functions defined elsewhere in the source script.


def main():
    p = argparse.ArgumentParser()
    p.add_argument('oddities_csv')
    p.add_argument('genomes_dir', help='fastani database dir')
    p.add_argument('--percent-threshold', type=float,
                   default=95.0)
    p.add_argument('--length-threshold', type=int, default=0)
    p.add_argument('-v', '--verbose', action='store_true')
    p.add_argument('--genome-extension', default='', type=str)
    args = p.parse_args()

    print('loading', args.oddities_csv)
    print('getting genomes from:', args.genomes_dir)
    print('length threshold for alignments (bp):', args.length_threshold)
    print('lower cutoff for identity (%):', args.percent_threshold)

    prefix = args.oddities_csv
    assert prefix.endswith('.csv')
    prefix = prefix[:-4]

    fp = open(args.oddities_csv, 'rt')
    r = csv.DictReader(fp)

    alignments_dir = prefix + '.alignments'
    print('putting alignments in:', alignments_dir)
    try:
        os.mkdir(alignments_dir)
    except FileExistsError:
        print('warning: directory already exists!')

    print('----')

    for row in r:
        cluster_name = row['cluster']
        ident1 = os.path.basename(row['ident1'])
        ident2 = os.path.basename(row['ident2'])

        if args.verbose:
            print(cluster_name, ident1, ident2)

        # copy & name genome files "clusterx.y.IDENT.fa"; gunzip if necessary,
        # since nucmer doesn't handle gzip.
        fn1 = find_genome_filename(args.genomes_dir, ident1, args.genome_extension)
        genome1 = os.path.join(alignments_dir, '{}.{}.fa'.format(cluster_name, ident1))
        copy_and_gunzip_genome(fn1, genome1)

        fn2 = find_genome_filename(args.genomes_dir, ident2, args.genome_extension)
        genome2 = os.path.join(alignments_dir, '{}.{}.fa'.format(cluster_name, ident2))
        copy_and_gunzip_genome(fn2, genome2)

        nucmer_output_name = os.path.join(alignments_dir, cluster_name + '.a')

        if not os.path.exists(nucmer_output_name):
            print('running {} alignments...'.format(cluster_name))
            runner = nucmer.Runner(genome1, genome2, nucmer_output_name)
            runner.run()
            print('...done!')
        else:
            if args.verbose:
                print('using cached alignments file', nucmer_output_name)

        file_reader = coords_file.reader(nucmer_output_name)
        alignments = [coord for coord in file_reader if not coord.is_self_hit()]

        # alignment obj:
        # 'frame', 'hit_length_qry', 'hit_length_ref', 'intersects_variant', 'is_self_hit', 'on_same_strand', 'percent_identity', 'qry_coords', 'qry_coords_from_ref_coord', 'qry_end', 'qry_length', 'qry_name', 'qry_start', 'ref_coords', 'ref_coords_from_qry_coord', 'ref_end', 'ref_length', 'ref_name', 'ref_start', 'reverse_query', 'reverse_reference', 'to_msp_crunch']

        # sort alignments by length of hit
        alignments.sort(key=lambda x: -x.hit_length_qry)

        # track alignments over a particular threshold
        keep_alignments = []
        all_bp = 0
        aligned_bp = 0
        weighted_percent_identity = 0.
        skipped_bp = 0
        skipped_aln = 0

        for alignment in alignments:
            weighted_percent_identity += alignment.percent_identity * alignment.hit_length_qry
            all_bp += alignment.hit_length_qry

            # do we pass the length and percent identity thresholds? if so,
            # keep!
            if alignment.hit_length_qry >= args.length_threshold and \
               alignment.percent_identity >= args.percent_threshold:
                aligned_bp += alignment.hit_length_qry
                keep_alignments.append(alignment)
            else:
                skipped_bp += alignment.hit_length_qry
                skipped_aln += 1

        # ditch if no alignments
        if not keep_alignments:
            print('** FLAG: no kept alignments for {}, punting.'.format(cluster_name))
            print('')
            continue

        # set up the printed out info
        lca_name = "(root)"     # if empty lca, => root of taxonomy.
        if row['lca']:
            lca_name = row['lca']

        shared_kmers = int(row['shared_kmers'])
        ksize = int(row['ksize'])

        # nice output! with some flags.
        print('{}: {:.0f}kb aln ({:.0f}k {}-mers) across {}; longest contig: {:.0f} kb'.format(cluster_name, aligned_bp / 1000, shared_kmers / 1000, ksize, lca_name, keep_alignments[0].hit_length_qry / 1000))
        print('weighted percent identity across alignments: {:.1f}%'.format(weighted_percent_identity / all_bp))
        print('skipped {:.0f} kb of alignments in {} alignments (< {} bp or < {:.0f}% identity)'.format(skipped_bp / 1000, skipped_aln, args.length_threshold, args.percent_threshold))
        # flag when shared k-mers and aligned bp differ by more than a factor
        # of e, i.e. |log(shared_kmers / aligned_bp)| > 1
        if abs(math.log(shared_kmers / aligned_bp)) > 1:
            print('** FLAG, oddly too little or too many aligned bp vs k-mers')

        ### track & remove contigs from query genome (genome2)

        keep_d = defaultdict(set)
        for aln in keep_alignments:
            keep_d[aln.qry_name].add(aln)

        bp_removed = remove_contigs(ident2, genome2, keep_d)

        flag_2 = 0
        if bp_removed > 2.5 * aligned_bp:
            flag_2 = 1

            # reset to rm kept, and removed is empty.
            os.unlink(genome2 + '.kept.fa')
            with open(genome2 + '.removed.fa', 'wt') as fp:
                pass

        ### track & remove contigs from ref genome (genome1)

        keep_d = defaultdict(set)
        for aln in keep_alignments:
            keep_d[aln.ref_name].add(aln)

        bp_removed = remove_contigs(ident1, genome1, keep_d)

        flag_1 = 0
        if bp_removed > 2.5 * aligned_bp:
            flag_1 = 1
            # reset to rm kept, and removed is empty.
            os.unlink(genome1 + '.kept.fa')
            with open(genome1 + '.removed.fa', 'wt') as fp:
                pass

        # output summary of flags

        if flag_1 and flag_2:
            print('** FLAGFLAG, too much removed from both!')
        elif flag_1 and not flag_2:
            print('** FLAG, {} is probably contaminated (too much rm from {})'.format(ident2, ident1))
        elif flag_2 and not flag_1:
            print('** FLAG, {} is probably contaminated (too much rm from {})'.format(ident1, ident2))

        print('')
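The excerpt defines main() but stops before the usual entry-point guard; appending one makes the snippet runnable as a script:

if __name__ == '__main__':
    main()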