Пример #1
0
def generateFastas(input, index, Contigs, query):
    """Split a FASTA file into one query file and one reference file.

    The input is streamed exactly once. The record whose identifier equals
    ``query`` is written to ``query_<index>.fa``. Records whose identifier is
    found in ``Contigs[index + 1:]`` or in the module-level ``keepers`` list
    are written to ``reference_<index>.fa``. Every other record is skipped.

    Args:
        input: Path of the FASTA file to read.
        index: Position of the query within ``Contigs``; also used to name
            the two output files.
        Contigs: Sequence of contig identifiers; only the entries after
            ``index`` are used as references.
        query: Identifier of the record to write to the query file.
    """
    # A set gives O(1) membership per record; the original list forced a
    # linear scan of all reference names for every sequence in the file.
    references = set(Contigs[index + 1:] + keepers)
    with open('query_{}.fa'.format(index), 'w') as qFasta, \
            open('reference_{}.fa'.format(index), 'w') as rFasta, \
            open(input, 'r') as infile:
        for Id, Sequence in SimpleFastaParser(infile):
            if Id == query:
                target = qFasta
            elif Id in references:
                target = rFasta
            else:
                continue
            target.write('>%s\n%s\n' % (Id, softwrap(Sequence)))
Пример #2
0
def main(args):
    """Print the proteins of a GFF3 annotation to stdout in FASTA format.

    Parses ``args`` (a list of command-line tokens), loads the annotation with
    ``lib.gff2dict`` and, for every entry whose ``'type'`` is ``'mRNA'``,
    writes one FASTA record per transcript ID. With ``--no_stop`` trailing
    ``*`` characters are stripped from each protein.
    """
    # widen the help column so long option names stay on one line
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    cli = argparse.ArgumentParser(
        prog='gff2prot.py',
        description='''Script to convert GFF3 and FASTA proteins.''',
        epilog="""Written by Jon Palmer (2018) [email protected]""",
        formatter_class=MyFormatter)
    cli.add_argument('-g', '--gff3', required=True,
                     help='Genome annotation GFF3 format')
    cli.add_argument('-f', '--fasta', required=True,
                     help='Genome in FASTA format')
    cli.add_argument('--no_stop', action='store_true',
                     help='Dont print stop codon')
    opts = cli.parse_args(args)

    # load the annotation into a dictionary keyed by gene
    annotation = lib.gff2dict(opts.gff3, opts.fasta, {})

    emit = sys.stdout.write
    for gene_id, model in natsorted(annotation.items()):
        if model['type'] != 'mRNA':
            continue
        for idx, transcript_id in enumerate(model['ids']):
            protein = model['protein'][idx]
            if opts.no_stop:
                protein = protein.rstrip('*')
            emit('>%s %s\n%s\n' %
                 (transcript_id, gene_id, lib.softwrap(protein)))
Пример #3
0
def SortRenameHeaders(input, basename, output, minlen=False):
    """Write the sequences of a FASTA file sorted by length and renamed.

    Records are ordered longest first and renamed ``<basename>_<n>`` with
    ``n`` counting from 1 over the records actually written.

    Args:
        input: Path of the FASTA file to read.
        basename: Prefix used for the new sequence names.
        output: Path of the FASTA file to write.
        minlen: If truthy, sequences shorter than ``int(minlen)`` are
            dropped; otherwise every sequence is written.

    Exits with status 1 (after printing a message) when a name that is about
    to be written exceeds 16 characters.
    """
    with open(input, 'r') as infile:
        Seqs = [(header, len(sequence), sequence)
                for header, sequence in SimpleFastaParser(infile)]
    # sort by length, longest first
    sortedSeqs = sorted(Seqs, key=lambda x: x[1], reverse=True)
    threshold = int(minlen) if minlen else None
    counter = 1
    with open(output, 'w') as outfile:
        for _, length, seq in sortedSeqs:
            if threshold is not None and length < threshold:
                # Sorted longest first, so every remaining record is too
                # short as well. Filtering before the name check avoids
                # aborting on a name that would never have been written.
                break
            newName = '{:}_{:}'.format(basename, counter)
            if len(newName) > 16:
                print(
                    'Error. {:} fasta header too long.  Choose a different --base name. NCBI/GenBank max is 16 characters'
                    .format(newName))
                sys.exit(1)
            outfile.write('>{:}\n{:}\n'.format(newName, softwrap(seq)))
            counter += 1
Пример #4
0
def create_partitions(fasta,
                      genes,
                      partition_list,
                      proteins=False,
                      transcripts=False,
                      repeats=False,
                      num=50,
                      tmpdir='.',
                      interval=2000,
                      partitions=True,
                      debug=False):
    """Create EVM partition intervals that do not split genes.

    Reads the ``gene`` rows of ``genes``, writes them as a sorted BED-like
    file in ``tmpdir`` and then, per contig that carries genes, writes a
    FASTA file plus the evidence subsets produced by ``RangeFinder`` into a
    per-contig (or per-partition) directory. One line per written unit is
    appended to ``partition_list``.

    Args:
        fasta: Path of the genome FASTA; indexed with ``SeqIO.index``.
        genes: Path of the tab-delimited gene prediction file. Columns
            1, 3, 4, 5, 6, 7 and 9 are read, so GFF3 layout is presumed.
        partition_list: Path of the tab-delimited listing to write. Lines
            are ``contig, contig_dir, 'N'`` for whole contigs and
            ``contig, contig_dir, 'Y', partition_dir`` for partitions.
        proteins: Optional path of protein evidence; skipped when falsy.
        transcripts: Optional path of transcript evidence; skipped when falsy.
        repeats: Optional path of repeat evidence; skipped when falsy.
        num: A contig is split only when it has more than ``num`` merged
            loci; ``num`` is also the target number of loci per chunk.
        tmpdir: Directory for all output; created when missing.
        interval: Passed as ``gap`` to ``getBreakPoint``; one third of it is
            subtracted from the previous partition end to get the next start.
        partitions: When False no splitting is attempted and every contig
            with genes is written whole.
        debug: Not used in this function.

    Returns:
        dict mapping a contig name or ``<contig>_<start>-<end>`` partition
        name to a small dict with key ``'n'`` (and ``'chr'`` for partitions).
    """
    if not os.path.isdir(tmpdir):
        os.makedirs(tmpdir)
    SeqRecords = SeqIO.index(fasta, 'fasta')
    # the process ID keeps the intermediate file names unique per run
    PID = os.getpid()
    bedGenes = os.path.join(tmpdir, 'genes.{}.bed'.format(PID))
    superGenes = os.path.join(tmpdir, 'genes.{}.supergenes.bed'.format(PID))
    # lookup structures handed to RangeFinder below
    # (presumably interval indexes per contig -- confirm in the helpers)
    interGenes = gene_blocks_to_interlap(genes)
    if proteins:
        interProteins = exonerate_blocks_to_interlap(proteins)
    if transcripts:
        interTranscripts = blocks_to_interlap(transcripts)
    if repeats:
        interRepeats = blocks_to_interlap(repeats)
    # collect [contig, start, end, attributes, score, strand] per gene row
    Results = []
    with open(genes, 'r') as infile:
        for line in infile:
            if line.startswith('#') or line.startswith('\n'):
                continue
            line = line.rstrip()
            cols = line.split('\t')
            if cols[2] == 'gene':
                Results.append([
                    cols[0],
                    int(cols[3]),
                    int(cols[4]), cols[8], cols[5], cols[6]
                ])
    # sort the results by contig and position
    ChrGeneCounts = {}
    totalGeneCount = 0
    sortedResults = natsorted(Results, key=lambda x: (x[0], x[1]))
    with open(bedGenes, 'w') as outfile:
        for x in sortedResults:
            totalGeneCount += 1
            # NOTE(review): start is written exactly as read from the input;
            # no coordinate shift is applied for the BED-style file
            outfile.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                x[0], x[1], x[2], x[3], x[4], x[5]))
            if not x[0] in ChrGeneCounts:
                ChrGeneCounts[x[0]] = 1
            else:
                ChrGeneCounts[x[0]] += 1
    ChrNoGenes = len(SeqRecords) - len(ChrGeneCounts)
    superGeneCount = 0
    lib.log.debug(
        '{:,} total contigs; skipping {:,} contigs with no genes'.format(
            len(SeqRecords), ChrNoGenes))
    if partitions:
        # now merge overlapping genes [strand] to get conservative locus boundaries
        cmd = ['bedtools', 'merge', '-s', '-i', bedGenes]
        # contig -> list of (start, end, distance from the previous locus end)
        merged = {}
        with open(superGenes, 'w') as outfile:
            for line in lib.execute(cmd):
                # counted before validation, so malformed lines also
                # consume a SuperGene number
                superGeneCount += 1
                line = line.rstrip()
                if line.count('\t') != 2:
                    lib.log.debug(
                        'Error parsing bedtools merge line:\n{}'.format(line))
                    continue
                chr, start, end = line.split('\t')
                outfile.write('{}\t{}\t{}\tSuperGene_{}\n'.format(
                    chr, start, end, superGeneCount))
                if chr not in merged:
                    # first locus on this contig has no predecessor: use -1
                    merged[chr] = [(int(start), int(end), -1)]
                else:
                    diff = int(start) - merged[chr][-1][1]
                    merged[chr].append((int(start), int(end), diff))
        lib.log.debug(
            'Merged {} genes into {} supergenes with bedtools'.format(
                totalGeneCount, superGeneCount))
        # parse Results and get coordinates to partitions
        Partitions = {}
        Commands = {}
        for k, v in natsorted(merged.items()):
            if not k in ChrGeneCounts:  # no genes, so can safely skip
                continue
            Partitions[k] = []
            if len(v) > num:
                # split into roughly equal chunks of at most ~num loci each
                chunks = math.ceil(len(v) / num)
                num_genes = int(round(len(v) / chunks))
                chunks = int(chunks)
                for i in range(chunks):
                    # a failed breakpoint search registered the whole contig
                    # (see below); skip the remaining chunks in that case
                    # NOTE(review): partitions appended before the failure
                    # stay in Partitions[k] -- confirm this is intended
                    if k in Commands:
                        continue
                    i = i + 1
                    if i == 1:
                        start = 1
                    else:
                        # start a third of `interval` before the previous
                        # partition end so neighbouring partitions overlap
                        phase = int(round(interval / 3))
                        if len(Partitions[k]) > 0:
                            start = Partitions[k][-1][1] - phase
                        else:
                            start = 1
                    loc = i * num_genes
                    if i == chunks:
                        # last chunk always runs to the end of the contig
                        end = len(SeqRecords[k])
                    else:
                        if loc >= len(v):
                            end = len(SeqRecords[k])
                        else:
                            # try the 'reverse' direction first and fall
                            # back to 'forward' when that returns nothing
                            end = getBreakPoint(v,
                                                loc,
                                                direction='reverse',
                                                gap=interval)
                            if not end:
                                end = getBreakPoint(v,
                                                    loc,
                                                    direction='forward',
                                                    gap=interval)
                    if not end:
                        # no usable breakpoint: register the whole contig
                        Commands[k] = {'n': len(v)}
                        lib.log.debug('{} --> {} bp'.format(
                            k, len(SeqRecords[k])))
                    else:
                        partLen = end - start
                        # partitions shorter than 10 kb are not emitted
                        if partLen < 10000:
                            continue
                        Partitions[k].append((start, end))
                        partName = '{}_{}-{}'.format(k, start, end)
                        Commands[partName] = {'n': num_genes, 'chr': k}
                        lib.log.debug('{} --> {} bp'.format(partName, partLen))
            else:
                # few enough loci: keep the contig whole
                Commands[k] = {'n': len(v)}
                lib.log.debug('{} --> {} bp'.format(k, len(SeqRecords[k])))
        # now loop through partitions and write files for EVM
        with open(partition_list, 'w') as partout:
            for chr, p in natsorted(Partitions.items()):
                chrDir = os.path.join(tmpdir, chr)
                if not os.path.isdir(chrDir):
                    os.makedirs(chrDir)
                if len(p) == 0:
                    # unpartitioned contig: full sequence and full range
                    partout.write('{}\t{}\t{}\n'.format(
                        chr, os.path.abspath(chrDir), 'N'))
                    chrFasta = os.path.join(chrDir, os.path.basename(fasta))
                    with open(chrFasta, 'w') as fastaout:
                        fastaout.write('>{}\n{}\n'.format(
                            chr, lib.softwrap(str(SeqRecords[chr].seq))))
                    genePred = os.path.join(chrDir, os.path.basename(genes))
                    RangeFinder(interGenes, chr, 1, len(SeqRecords[chr]),
                                genePred)
                    if proteins:
                        protPred = os.path.join(chrDir,
                                                os.path.basename(proteins))
                        RangeFinder(interProteins, chr, 1,
                                    len(SeqRecords[chr]), protPred)
                    if transcripts:
                        tranPred = os.path.join(chrDir,
                                                os.path.basename(transcripts))
                        RangeFinder(interTranscripts, chr, 1,
                                    len(SeqRecords[chr]), tranPred)
                    if repeats:
                        repPred = os.path.join(chrDir,
                                               os.path.basename(repeats))
                        RangeFinder(interRepeats, chr, 1, len(SeqRecords[chr]),
                                    repPred)
                else:
                    for coords in p:
                        partDir = os.path.join(
                            chrDir, '{}_{}-{}'.format(chr, coords[0],
                                                      coords[1]))
                        if not os.path.isdir(partDir):
                            os.makedirs(partDir)
                        partout.write('{}\t{}\t{}\t{}\n'.format(
                            chr, os.path.abspath(chrDir), 'Y',
                            os.path.abspath(partDir)))
                        partFasta = os.path.join(partDir,
                                                 os.path.basename(fasta))
                        # coords are 1-based inclusive, hence the -1 on the
                        # slice start; the record keeps the contig name
                        with open(partFasta, 'w') as fastaout:
                            fastaout.write('>{}\n{}\n'.format(
                                chr,
                                lib.softwrap(
                                    str(SeqRecords[chr].seq[coords[0] -
                                                            1:coords[1]]))))
                        # split genes GFF3
                        # NOTE(review): fixed file name here, whereas the
                        # whole-contig branches use os.path.basename(genes)
                        genePred = os.path.join(partDir,
                                                'gene_predictions.gff3')
                        RangeFinder(interGenes, chr, coords[0], coords[1],
                                    genePred)
                        if proteins:
                            protPred = os.path.join(partDir,
                                                    os.path.basename(proteins))
                            RangeFinder(interProteins, chr, coords[0],
                                        coords[1], protPred)
                        if transcripts:
                            tranPred = os.path.join(
                                partDir, os.path.basename(transcripts))
                            RangeFinder(interTranscripts, chr, coords[0],
                                        coords[1], tranPred)
                        if repeats:
                            repPred = os.path.join(partDir,
                                                   os.path.basename(repeats))
                            RangeFinder(interRepeats, chr, coords[0],
                                        coords[1], repPred)
    else:
        # no partitioning requested: write every contig with genes whole
        Commands = {}
        with open(partition_list, 'w') as partout:
            for chr in SeqRecords:
                if not chr in ChrGeneCounts:  # no genes so skip
                    continue
                # NOTE(review): 'n' is the sequence length here, but a locus
                # count in the partitioned branch -- confirm with callers
                Commands[chr] = {'n': len(SeqRecords[chr])}
                chrDir = os.path.join(tmpdir, chr)
                if not os.path.isdir(chrDir):
                    os.makedirs(chrDir)
                partout.write('{}\t{}\t{}\n'.format(chr,
                                                    os.path.abspath(chrDir),
                                                    'N'))
                chrFasta = os.path.join(chrDir, os.path.basename(fasta))
                with open(chrFasta, 'w') as fastaout:
                    fastaout.write('>{}\n{}\n'.format(
                        chr, lib.softwrap(str(SeqRecords[chr].seq))))
                genePred = os.path.join(chrDir, os.path.basename(genes))
                RangeFinder(interGenes, chr, 1, len(SeqRecords[chr]), genePred)
                if proteins:
                    protPred = os.path.join(chrDir, os.path.basename(proteins))
                    RangeFinder(interProteins, chr, 1, len(SeqRecords[chr]),
                                protPred)
                if transcripts:
                    tranPred = os.path.join(chrDir,
                                            os.path.basename(transcripts))
                    RangeFinder(interTranscripts, chr, 1, len(SeqRecords[chr]),
                                tranPred)
                if repeats:
                    repPred = os.path.join(chrDir, os.path.basename(repeats))
                    RangeFinder(interRepeats, chr, 1, len(SeqRecords[chr]),
                                repPred)

    return Commands
Пример #5
0
def main(args):
    """Remove duplicated short contigs from a multi-FASTA assembly.

    Parses ``args`` (a list of command-line tokens), publishes the run
    settings as module-level globals for the worker functions, checks every
    candidate contig for duplication via ``multithread_aligning`` and writes
    the contigs that are kept to ``--out``.

    Unless ``--exhaustive`` is given, the N50 is passed to ``Sortbysize`` as
    the size cutoff for the contigs to check.
    """
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='contig_cleaner.py',
        usage="%(prog)s [options] -i genome.fa -o cleaned.fa",
        description=
        '''Script that removes short scaffolds that are duplicated elsewhere.''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        required=True,
                        help='Multi-fasta genome file')
    parser.add_argument('-o',
                        '--out',
                        required=True,
                        help='Cleaned output (FASTA)')
    parser.add_argument('-p',
                        '--pident',
                        type=int,
                        default=95,
                        help='percent identity of contig')
    parser.add_argument('-c',
                        '--cov',
                        type=int,
                        default=95,
                        help='coverage of contig')
    parser.add_argument('-m',
                        '--minlen',
                        type=int,
                        default=500,
                        help='Minimum length of contig')
    parser.add_argument('--cpus',
                        default=2,
                        type=int,
                        help='Number of CPUs to use')
    parser.add_argument('--exhaustive',
                        action='store_true',
                        help='Compute every contig, else stop at N50')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Debug the output')
    args = parser.parse_args(args)

    # setup some global variables used in functions above
    global GENOME, CPUS, PIDENT, COV, keepers, repeats
    GENOME = args.input
    CPUS = args.cpus
    PIDENT = args.pident
    COV = args.cov
    # Two distinct lists: `([], ) * 2` would bind both names to the SAME
    # list object, so appending to one would silently change the other.
    keepers, repeats = [], []

    # run some checks of dependencies first
    programs = ['minimap2']
    CheckDependencies(programs)

    # calculate N50 of assembly
    n50 = calcN50(args.input)

    # now get list of scaffolds, shortest->largest
    cutoff = False if args.exhaustive else n50
    scaffolds, keepers = Sortbysize(args.input, cutoff, minlen=args.minlen)

    # the input file does not change during the run, so count it only once
    total_contigs = countfasta(args.input)

    print("-----------------------------------------------")
    PassSize = len(scaffolds) + len(keepers)
    print("{:,} input contigs, {:,} larger than {:,} bp, N50 is {:,} bp".format(
        total_contigs, PassSize, args.minlen, n50))
    if args.exhaustive:
        print("Checking duplication of {:,} contigs".format(len(scaffolds)))
    else:
        print("Checking duplication of {:,} contigs shorter than N50".format(
            len(scaffolds)))
    print("-----------------------------------------------")

    # now generate pool and parallel process the list
    mp_output = multithread_aligning(scaffolds)

    for contig, garbage in mp_output:
        if not garbage:
            keepers.append(contig)
        else:
            repeats.append(contig)

    print("-----------------------------------------------")
    print(
        "{:,} input contigs; {:,} larger than {:} bp; {:,} duplicated; {:,} written to file"
        .format(total_contigs, PassSize, args.minlen, len(repeats),
                len(keepers)))
    if args.debug:
        print("\nDuplicated contigs are:\n{:}\n".format(', '.join(repeats)))
        print("Contigs to keep are:\n{:}\n".format(', '.join(keepers)))

    # finally write a new reference based on list of keepers; a set makes
    # the per-record membership test O(1) instead of a list scan
    keep = set(keepers)
    with open(args.out, 'w') as outfile, open(args.input, 'r') as infile:
        for title, sequence in SimpleFastaParser(infile):
            if title in keep:
                outfile.write('>{}\n{}\n'.format(title, softwrap(sequence)))