def generatePSP(positives, negatives, outfile):
    ''' generate a discriminative PSP file from
    the positives and negatives that can be used
    to run discriminative MEME '''

    psp_options = PARAMS["psp_options"]

    nseqs_pos = int(FastaIterator.count(positives))
    nseqs_neg = int(FastaIterator.count(negatives))

    if nseqs_pos < 2 or nseqs_neg < 2:
        E.warn("%s: input files do not have sufficent sequences"
               "to run psp-gen, skipping" % outfile)
        P.touch(outfile)
        return

    # get appropriate options from meme options
    if PARAMS.get("meme_revcomp", True):
        psp_options += " -revcomp"

    statement = '''psp-gen -pos %(positives)s
                           -neg %(negatives)s
                           %(psp_options)s
                   > %(outfile)s '''

    P.run(statement)
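All of the snippets on this page lean on the CGAT FastaIterator interface: iterate() yields records with title and sequence attributes, and count() returns the number of entries in a file. A minimal stand-in sketch of that interface, useful for reading the examples in isolation (the real module also handles compressed input and more):

import collections

# minimal stand-in for the interface assumed below; not the real module
FastaRecord = collections.namedtuple("FastaRecord", "title sequence")

def iterate(infile):
    '''yield FastaRecord(title, sequence) for each entry in *infile*.'''
    title, seqs = None, []
    for line in infile:
        line = line.strip()
        if line.startswith(">"):
            if title is not None:
                yield FastaRecord(title, "".join(seqs))
            title, seqs = line[1:], []
        elif line:
            seqs.append(line)
    if title is not None:
        yield FastaRecord(title, "".join(seqs))

def count(filename):
    '''return the number of fasta entries in *filename*.'''
    with open(filename) as inf:
        return sum(1 for line in inf if line.startswith(">"))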
Example #2
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''
    given a set of interval names that are present in a
    fasta file, write a table of per-interval CpG content
    to *outfile*
    '''
    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''cat %(inf)s | python %(scriptsdir)s/fasta2table.py
                   -s cpg -s length
                   --log=%(outfile)s.log > %(outfile)s'''
    P.run()
Example #3
File: Masker.py Project: BioXiao/cgat
    def maskSequences(self, sequences):
        '''mask a collection of sequences.'''

        # mkstemp returns (fd, path): write via the fd, then pass
        # the path to the masking command as its input file
        outfile, infile = tempfile.mkstemp()

        for x, s in enumerate(sequences):
            os.write(outfile, ">%i\n%s\n" % (x, s))

        os.close(outfile)

        statement = self.mCommand % locals()

        E.debug("statement: %s" % statement)

        s = subprocess.Popen(statement,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             close_fds=True)

        (out, err) = s.communicate()

        if s.returncode != 0:
            raise RuntimeError(
                "Error in running %s \n%s\nTemporary directory" %
                (statement, err))

        result = [
            x.sequence for x in FastaIterator.iterate(StringIO.StringIO(out))]

        os.remove(infile)

        return result
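The method above assumes the enclosing class (named Masker here, per the file name) supplies mCommand, a shell template interpolated with locals(), so %(infile)s receives the temporary fasta path. A hypothetical usage sketch; the subclass and its command line are assumptions, not part of the original:

class DustMasker(Masker):
    # hypothetical subclass: the command reads the temp fasta written
    # by maskSequences and prints masked fasta on stdout
    mCommand = "dustmasker -in %(infile)s -outfmt fasta"

masked = DustMasker().maskSequences(["ACGTACGTACGT", "TTTTTTTTTT"])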
Example #4
def buildInputFiles(infile, outfiles):
    '''
    build input file based on parameters and fasta sequences
    that primers are to be designed for
    '''
    PARAMS["constraints_primer_mispriming_library"] = glob.glob("mispriming.dir/*.lib")[0]

    fasta, identifiers = infile[0], "identifiers.tsv"
    inf = IOTools.openFile(fasta)
    
    E.info("Reading ids for primer design")
    ids = readIdentifiers(identifiers)
    
    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title in ids:
            outf = IOTools.openFile(os.path.join(
                "input.dir",f.title.replace(" ", "_").replace("/","_") + ".input").replace('"', ''), "w")
            seq = f.sequence
            outf.write("SEQUENCE_ID=%s\n" % f.title)
            for key, value in PARAMS.iteritems():
                if "constraints" in key:
                    outf.write("%s=%s\n" % (key.replace("constraints_", "").upper(), value))
            outf.write("SEQUENCE_TEMPLATE=%s\n=\n" % seq)
            outf.close()
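readIdentifiers is not shown in this listing; a minimal sketch consistent with how it is called here (fasta titles are tested for membership), assuming one identifier per line:

def readIdentifiers(infile):
    # minimal sketch: one identifier per line; the pipeline's real
    # helper may skip a header line or parse extra columns
    with IOTools.openFile(infile) as inf:
        return set(line.strip() for line in inf)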
Example #5
def segmentWithCpG(infile, with_contig_sizes=False):
    '''segment a fasta file, output locations of CpG.'''

    ninput, nskipped, noutput = 0, 0, 0

    iterator = FastaIterator.FastaIterator(infile)

    segments, contig_sizes = [], collections.OrderedDict()

    for cur_record in iterator:
        ninput += 1
        contig = re.sub("\s.*", "", cur_record.title)
        last = None
        contig_sizes[contig] = (0, len(cur_record.sequence))
        for pos, this in enumerate(cur_record.sequence.upper()):
            if last == "C" and this == "G":
                segments.append((contig, pos - 1, pos + 1, 1.0))
            last = this

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    if with_contig_sizes:
        return segments, contig_sizes

    return segments
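A short usage sketch: the returned segments are (contig, start, end, value) tuples covering each CpG dinucleotide, which map directly onto BED fields (the file names here are hypothetical):

with open("genome.fasta") as inf, open("cpg.bed", "w") as bed:
    for contig, start, end, value in segmentWithCpG(inf):
        bed.write("%s\t%i\t%i\t%.1f\n" % (contig, start, end, value))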
Example #6
def countMotifs(infile, motifs):
    '''find regular expression *motifs* in
    sequences within fasta formatted *infile*.
    '''

    it = FastaIterator.FastaIterator(infile)
    positions = []
    while 1:
        try:
            seq = next(it)
        except StopIteration:
            break
        if not seq:
            break

        rseq = Genomics.complement(seq.sequence)
        lsequence = len(seq.sequence)
        pos = []
        for motif, pattern in motifs:

            for x in pattern.finditer(seq.sequence):
                pos.append((motif, "+", x.start(), x.end()))
            for x in pattern.finditer(rseq):
                pos.append(
                    (motif, "-", lsequence - x.end(), lsequence - x.start()))

        positions.append((seq.title, pos))

    return positions
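countMotifs expects *motifs* as (name, compiled-pattern) pairs; second-strand hits are mapped back into forward coordinates. A hypothetical usage sketch:

import re

motifs = [("TATA-box", re.compile("TATA[AT]A")),
          ("GC-box", re.compile("GGGCGG"))]
with open("sequences.fasta") as inf:
    for title, hits in countMotifs(inf, motifs):
        for name, strand, start, end in hits:
            print("%s\t%s\t%s\t%i\t%i" % (title, name, strand, start, end))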
Example #7
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                   usage=globals()["__doc__"])

    parser.add_option("-b", "--bamfile", dest="bamfile", type="string",
                      help="supply bam file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # read in contigs
    E.info("reading in contig file")
    contigs = {}
    for fasta in FastaIterator.iterate(options.stdin):
        contigs[fasta.title] = (1, len(fasta.sequence) - 1)
    E.info("read %i contigs" % len(contigs.keys()))

    # read in bamfile
    E.info("reading bam file")
    samfile = pysam.Samfile(options.bamfile)

    E.info("iterating over contigs")
    c = 0
    for contig, coords in contigs.iteritems():
        coords = list(coords)

        #################################
        # NB this is specific for my data!
        contig = contig.split(" ")[0]
        #################################

        species_counts = collections.defaultdict(int)
        for alignment in samfile.fetch(contig, coords[0], coords[1]):
            species_id = alignment.qname.split("|")[1]
            species_counts[species_id] += 1

        # at the moment ignore if there are no counts
        if len(species_counts.values()) == 0:
            E.warn("no reads map to %s" % contig)
            continue

        for species, count in species_counts.iteritems():
            if species_counts[species] == max(species_counts.values()):
                top_dog = species
                c += 1
                break
        E.info("species %s assigned to contig number %i" % (top_dog, c))
        options.stdout.write("%s\t%s\n" % (contig, top_dog))

    # write footer and output benchmark information.
    E.Stop()
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''
    given a set of interval names that are present in a
    fasta file, write a table of per-interval CpG content
    to *outfile*
    '''
    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''
    cat %(inf)s | cgat fasta2table
    -s na -s cpg -s length
    --log=%(outfile)s.log > %(outfile)s'''

    P.run()
Example #9
    def maskSequences(self, sequences):
        '''mask a collection of sequences.'''

        with tempfile.NamedTemporaryFile(mode="w+t", delete=False) as outf:
            for x, s in enumerate(sequences):
                outf.write(">%i\n%s\n" % (x, s))

        infile = outf.name
        statement = self.mCommand % locals()

        E.debug("statement: %s" % statement)

        s = subprocess.Popen(statement,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             close_fds=True)

        (out, err) = s.communicate()

        if s.returncode != 0:
            raise RuntimeError(
                "Error in running %s \n%s\nTemporary directory" %
                (statement, err))

        result = [
            x.sequence for x in FastaIterator.iterate(StringIO(out.decode()))
        ]

        os.remove(infile)

        return result
Example #10
    def maskSequences(self, sequences):
        '''mask a collection of sequences.'''

        # mkstemp returns (fd, path): write via the fd, then pass
        # the path to the masking command as its input file
        outfile, infile = tempfile.mkstemp()

        for x, s in enumerate(sequences):
            os.write(outfile, ">%i\n%s\n" % (x, s))

        os.close(outfile)

        statement = self.mCommand % locals()

        E.debug("statement: %s" % statement)

        s = subprocess.Popen(statement,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             close_fds=True)

        (out, err) = s.communicate()

        if s.returncode != 0:
            raise RuntimeError(
                "Error in running %s \n%s\nTemporary directory" %
                (statement, err))

        result = [x.sequence for x in FastaIterator.iterate(StringIO(out))]

        os.remove(infile)

        return result
def countCompleteGenes(infile, outfile):
    '''
    count the number of genes that are classed
    as complete based on having a start and stop codon
    '''
    start = "ATG"
    stop = ["TAG", "TAA", "TGA"]

    ntotal = 0
    nstart = 0
    nstop = 0
    nstart_nstop = 0
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        ntotal += 1
        if fasta.sequence.startswith(start):
            nstart += 1
        if fasta.sequence[-3:] in stop:
            nstop += 1
        if fasta.sequence.startswith(start) and \
                fasta.sequence[-3:] in stop:
            nstart_nstop += 1
    outf = open(outfile, "w")
    outf.write("total_genes\tpstart\tpstop\tpstart_stop\n")
    outf.write("\t".join(
        map(str, [
            ntotal,
            float(nstart) / ntotal,
            float(nstop) / ntotal,
            float(nstart_nstop) / ntotal
        ])) + "\n")
    outf.close()
Example #12
def removeContaminants(infiles, outfile):
    '''remove adaptor contamination from fastq files.
    
    This method uses cutadapt.
    '''

    infile, contaminant_file = infiles

    adaptors = []
    for entry in FastaIterator.FastaIterator(
            IOTools.openFile(contaminant_file)):
        adaptors.append("-a %s" % entry.sequence)

    adaptors = " ".join(adaptors)
    to_cluster = True

    statement = '''
    cutadapt 
    %(adaptors)s
    --overlap=%(contamination_min_overlap_length)i
    --format=fastq
    %(contamination_options)s
    <( zcat < %(infile)s )
    2> %(outfile)s.log
    | gzip > %(outfile)s
    '''
    P.run()
Example #13
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                   usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-dir", dest="genome_dir", type="string",
                      help="supply help")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    contigs_map = {}
    for genome in glob.glob(os.path.join(options.genome_dir, "*")):
        for fasta in FastaIterator.iterate(IOTools.openFile(genome)):
            identifier = fasta.title.split("|")
            gi = identifier[1]
            contigs_map[gi] = fasta.title

    for line in options.stdin.readlines():
        data = line[:-1].split("\t")
        gi = data[1]
        assert gi in contigs_map, "cannot find genome with id gi|%s in genomes directory" % gi

        options.stdout.write("%s\t%s\n" % (data[0], contigs_map[gi]))

    # write footer and output benchmark information.
    E.Stop()
def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"], dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile), ".coverage.load"):
            statement = """SELECT contig_id ave FROM
                           (SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print contigs
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
def createOTUs(infiles, outfile):
    '''Make OTUs from the template sequences of all samples that passed
    filtering'''

    fasta_files = []
    for infile in infiles:
        fasta_file = P.snip(infile, '_true_snps.tsv', strip_path=True)\
                     + '_template.fasta'
        fasta_file = os.path.join('05_mapped_fastas.dir', fasta_file)
        assert os.path.exists(fasta_file)
        fasta_files.append(fasta_file)

    tmpf = P.getTempFilename('.')
    with IOTools.openFile(tmpf, 'w') as outf:
        for fasta_file in fasta_files:
            for fasta in FI.FastaIterator(IOTools.openFile(fasta_file)):
                outf.write('>' + fasta.title + ';size=1\n' + fasta.sequence +
                           '\n')

    out_up = P.snip(outfile, '.fasta') + '_up.txt'
    statement = ("usearch"
                 "  -cluster_otus %(tmpf)s"
                 "  -otus %(outfile)s"
                 "  -uparseout %(out_up)s"
                 "  -otu_radius_pct 1.0"
                 "  -uparse_break -999"
                 "  -relabel OTU_"
                 " &> %(outfile)s.log")
    to_cluster = False
    P.run()
Example #17
def buildInputFiles(infile, outfiles):
    '''
    build input file based on parameters and fasta sequences
    that primers are to be designed for
    '''
    PARAMS["constraints_primer_mispriming_library"] = glob.glob(
        "mispriming.dir/*.lib")[0]

    fasta, identifiers = infile[0], "identifiers.tsv"
    inf = IOTools.openFile(fasta)

    E.info("Reading ids for primer design")
    ids = readIdentifiers(identifiers)

    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title in ids:
            outf = IOTools.openFile(
                os.path.join(
                    "input.dir",
                    f.title.replace(" ", "_").replace("/", "_") +
                    ".input").replace('"', ''), "w")
            seq = f.sequence
            outf.write("SEQUENCE_ID=%s\n" % f.title)
            for key, value in PARAMS.iteritems():
                if "constraints" in key:
                    outf.write(
                        "%s=%s\n" %
                        (key.replace("constraints_", "").upper(), value))
            outf.write("SEQUENCE_TEMPLATE=%s\n=\n" % seq)
            outf.close()
Example #18
def segmentUngapped(infile, gap_char, min_gap_size=0):

    iterator = FastaIterator.FastaIterator(infile)

    while 1:
        try:
            cur_record = iterator.next()
        except StopIteration:
            break

        if cur_record is None:
            break
        contig = re.sub("\s.*", "", cur_record.title)
        size = len(cur_record.sequence)

        last_end = 0
        for start, end in gapped_regions(cur_record.sequence, gap_char):
            if end - start < min_gap_size:
                continue

            if last_end != 0:
                yield(contig, last_end, start, 0)
            last_end = end

        if last_end < size:
            yield(contig, last_end, size, 0)
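segmentUngapped (and segmentGaps further below) call a gapped_regions helper that this listing does not include; a minimal sketch consistent with the call sites, yielding (start, end) for each run of gap characters:

import re

def gapped_regions(sequence, gap_char):
    # minimal sketch; the CGAT original may differ in detail
    pattern = re.compile("[%s]+" % re.escape(gap_char))
    for match in pattern.finditer(sequence):
        yield match.start(), match.end()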
Example #19
def listAdaptors(infile):
    adaptors = []
    for entry in FastaIterator.FastaIterator(IOTools.openFile(infile)):
        adaptors.append("%s %s" %
                        (PARAMS["contamination_trim_type"], entry.sequence))
    adaptors = " ".join(adaptors)

    return adaptors
Example #20
def iterate_double_fasta ( fn1, fn2 ):
    iterator = FastaIterator.iterate_together( fn1, fn2 )
    for seq1, seq2 in iterator:
        yield AlignedPairs.UnalignedPair( 
            token1 = seq1.title,
            sequence1 = seq1.sequence,
            token2 = seq2.title,
            sequence2 = seq2.sequence )
Example #21
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id$", 
                             usage = globals()["__doc__"] )

    parser.add_option("-n", dest="N", type="int",
                      help="e.g N50 - the length at which 50% of contigs are equal or above")
    parser.add_option("-f", "--filter-length", dest="filter_length", type="int",
                      help="calculate stats on contigs longer than -f")

    parser.set_defaults(N = 50,
                        filter_length = 0)

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )
    
    f = options.filter_length

    # iterate over the contigs/scaffolds and return stats                                                                                                                                                                                              
    number_of_contigs = 0

    N = options.N
    contig_lengths = []

    for record in FastaIterator.iterate(options.stdin):
        contig_length = len(list(record.sequence))
        if contig_length >= f:
            number_of_contigs += 1
            contig_lengths.append(contig_length)

    # mean, median and max contig/scaffold lengths
    mean_length = np.mean(contig_lengths)
    median_length = np.median(contig_lengths)
    max_length = max(contig_lengths)

    # iterate over contigs/scaffolds sorted by longest
    # and calculate the NX
    index = 0
    cum_length = 0
    total_length = sum(contig_lengths)
    for length in sorted(contig_lengths, reverse = True):
        cum_length += length
        if cum_length >= total_length * (float(N) / 100):
            break
        index += 1

    # output the results                                                                                                                                                                                                                     
    options.stdout.write("nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n" % N)
    options.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (number_of_contigs, total_length, sorted(contig_lengths, reverse = True)[index], str(median_length), str(mean_length), str(max_length)))

    ## write footer and output benchmark information.
    E.Stop()
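A quick worked check of the NX loop above: for contig lengths [100, 80, 60, 40, 20] the total is 300, the 50% threshold is 150, and the cumulative sums 100, 180 first cross it at the second contig, so N50 = 80:

contig_lengths = [100, 80, 60, 40, 20]
N, total_length = 50, sum(contig_lengths)
index, cum_length = 0, 0
for length in sorted(contig_lengths, reverse=True):
    cum_length += length
    if cum_length >= total_length * (float(N) / 100):
        break
    index += 1
assert sorted(contig_lengths, reverse=True)[index] == 80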
Example #22
def iterate_single_fasta ( fn1 ):
    # pairs are consecutive entries; assumes an even number of records
    iterator = FastaIterator.FastaIterator( fn1 )
    while 1:
        seq1, seq2 = iterator.next(), iterator.next()
        yield AlignedPairs.UnalignedPair( 
            token1 = seq1.title,
            sequence1 = seq1.sequence,
            token2 = seq2.title,
            sequence2 = seq2.sequence )
Example #23
def filterContigs(infile, outfile, length):
    '''
    filter contigs by length
    '''
    outf = open(outfile, "w")
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        seq_length = len(fasta.sequence)
        if seq_length < length: continue
        outf.write(">%s\n%s\n" % (fasta.title, fasta.sequence))
    outf.close()
Example #24
def filterContigs(infile, outfile, length):
    '''
    filter contigs by length
    '''
    outf = open(outfile, "w")
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        seq_length = len(fasta.sequence)
        if seq_length < length: continue
        outf.write(">%s\n%s\n" % (fasta.title, fasta.sequence))
    outf.close()
def extract16SSequences(infile, outfile):
    '''Extract any RNA fasta entry with the term 16S in the title'''

    outf = IOTools.openFile(outfile, 'w')

    for fasta in FastaIterator.FastaIterator(IOTools.openFile(infile)):
        if re.search('16S', fasta.title, re.IGNORECASE):
            outf.write('\n'.join(['>' + fasta.title, fasta.sequence]) + '\n')

    outf.close()
def contig_to_stats(contigs_file, stats_file, params):
    """
    calculate descriptive stats for a set
    of contigs / scaffolds
    """

    PARAMS = params

    if PARAMS["filter"]:
        f = PARAMS["filter"]
    else:
        f = 0

    # iterate over the contigs/scaffolds and return stats
    number_of_scaffolds = 0

    N = PARAMS["scaffold_n"]
    scaffold_lengths = []

    inf = open(contigs_file)
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(list(record.sequence))
        if scaffold_length >= f:
            number_of_scaffolds += 1
            scaffold_lengths.append(scaffold_length)

    # mean, median and max contig/scaffold lengths
    mean_length = np.mean(scaffold_lengths)
    median_length = np.median(scaffold_lengths)
    max_length = max(scaffold_lengths)

    # iterate over contigs/scaffolds sorted by longest
    # and calculate the NX
    index = 0
    cum_length = 0
    total_length = sum(scaffold_lengths)
    for length in sorted(scaffold_lengths, reverse=True):
        cum_length += length
        if cum_length >= total_length * (float(N) / 100):
            break
        index += 1

    # output the results
    outf = open(stats_file, "w")
    outf.write("nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n" % N)
    outf.write(
        "%s\t%s\t%s\t%s\t%s\t%s\n"
        % (
            number_of_scaffolds,
            total_length,
            sorted(scaffold_lengths, reverse=True)[index],
            str(median_length),
            str(mean_length),
            str(max_length),
        )
    )
Example #27
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: sequence2alignment.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    (options, args) = E.Start(parser)

    iterator = FastaIterator.FastaIterator(sys.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    options.stdout.write(
        "query\tsbjct\tquery_from\tquery_to\tsbjct_from\tsbjct_to\tquery_starts\tsbjct_starts\tblock_sizes\n"
    )

    while 1:
        try:
            cur_record = iterator.next()
        except StopIteration:
            break

        ninput += 1

        sequence = re.sub(" ", "", cur_record.sequence)
        l = len(sequence)

        map_sequence2mali = alignlib_lite.py_makeAlignmentVector()

        alignlib_lite.py_AlignmentFormatExplicit(0, sequence, 0, "X" *
                                                 l).copy(map_sequence2mali)

        options.stdout.write("\t".join(
            (cur_record.title, "ref",
             str(alignlib_lite.py_AlignmentFormatBlocks(map_sequence2mali)))) +
                             "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i.\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
Example #28
def build_scaffold_lengths(contigs_file, outfile, params):
    '''
    output the distribution of scaffold lengths
    '''
    inf = open(contigs_file)
    outf = open(outfile, "w")
    outf.write("scaffold_name\tlength\n")
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(list(record.sequence))
        outf.write("%s\t%i\n" % (record.title, scaffold_length))
    outf.close()
Example #30
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: fastas2fasta.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    (options, args) = E.Start(parser)

    if len(args) < 2:
        raise ValueError(
            "please supply at least two filenames to concatenate.")

    iterators = []
    for a in args:
        iterators.append(FastaIterator.FastaIterator(IOTools.openFile(a, "r")))

    ninput, noutput, nerrors = 0, 0, 0

    while 1:

        sequences = []
        ids = []

        for iterator in iterators:
            try:
                cur_record = iterator.next()
            except StopIteration:
                break

            sequences.append(re.sub(" ", "", cur_record.sequence))
            ids.append(cur_record.title)

        if not sequences:
            break
        ninput += 1

        if len(sequences) != len(iterators):
            raise "unequal number of sequences in files."

        noutput += 1

        options.stdout.write(">%s\n%s\n" % (ids[0], "".join(sequences)))

    E.info("ninput=%i, noutput=%i, nerrors=%i" % (ninput, noutput, nerrors))

    E.Stop()
Example #31
def collectGenomeSizes(infile, outfile):
    '''
    output the genome sizes for each genome
    '''
    to_cluster = True
    outf = open(outfile, "w")
    outf.write("genome\tlength\n")
    # assume single fasta entry
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        name = P.snip(os.path.basename(infile), ".fna")
        length = len(list(fasta.sequence))
        outf.write("%s\t%s\n" % (name, str(length)))
    outf.close()
def contig_to_stats(contigs_file, stats_file, params):
    '''
    calculate descriptive stats for a set
    of contigs / scaffolds
    '''

    PARAMS = params

    if PARAMS["filter"]:
        f = PARAMS["filter"]
    else:
        f = 0

    # iterate over the contigs/scaffolds and return stats
    number_of_scaffolds = 0

    N = PARAMS["scaffold_n"]
    scaffold_lengths = []

    inf = open(contigs_file)
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(list(record.sequence))
        if scaffold_length >= f:
            number_of_scaffolds += 1
            scaffold_lengths.append(scaffold_length)

    # mean, median and max contig/scaffold lengths
    mean_length = np.mean(scaffold_lengths)
    median_length = np.median(scaffold_lengths)
    max_length = max(scaffold_lengths)

    # iterate over contigs/scaffolds sorted by longest
    # and calculate the NX
    index = 0
    cum_length = 0
    total_length = sum(scaffold_lengths)
    for length in sorted(scaffold_lengths, reverse=True):
        cum_length += length
        if cum_length >= total_length * (float(N) / 100):
            break
        index += 1

    # output the results
    outf = open(stats_file, "w")
    outf.write(
        "nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n"
        % N)
    outf.write("%s\t%s\t%s\t%s\t%s\t%s\n" %
               (number_of_scaffolds, total_length,
                sorted(scaffold_lengths, reverse=True)[index],
                str(median_length), str(mean_length), str(max_length)))
def runDREME(infile, outfile, neg_file="", options=""):
    ''' Run DREME on fasta file. If a neg_file is passed
    then DREME will use this as the negative set, otherwise
    the default is to shuffle the input '''

    nseqs_pos = int(FastaIterator.count(infile))
    if nseqs_pos < 2:
        E.warn("%s: less than 2 sequences - dreme skipped" % outfile)
        P.touch(outfile)
        return

    if neg_file:
        nseqs_neg = int(FastaIterator.count(neg_file))
        if nseqs_neg < 2:
            E.warn(
                "%s: less than 2 sequences in negatives file - dreme skipped" %
                outfile)
            P.touch(outfile)
            return
        else:
            neg_file = "-n %s" % neg_file

    logfile = outfile + ".log"
    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.get_temp_dir(".")

    statement = '''
    dreme -p %(infile)s %(neg_file)s -png
        -oc %(tmpdir)s
            %(dreme_options)s
            %(options)s
       > %(logfile)s
    '''

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile, method="dreme")
Example #35
File: fasta2bed.py Project: yangjl/cgat
def segmentGaps(infile, gap_char):

    iterator = FastaIterator.FastaIterator(infile)

    while 1:
        try:
            cur_record = iterator.next()
        except StopIteration:
            break

        if cur_record is None: break
        contig = re.sub("\s.*", "", cur_record.title)

        for start, end in gapped_regions(cur_record.sequence, gap_char):
            yield (contig, start, end, 0)
Example #36
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: analyze_sequences.py 2865 2010-03-03 10:18:28Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--species",
                      dest="species",
                      type="string",
                      help="species to use.")

    parser.add_option("-p",
                      "--prefix",
                      dest="prefix",
                      type="string",
                      help="prefix to use for temporary files.")

    parser.set_defaults(filename_map=None, )

    (options, args) = E.Start(parser, add_mysql_options=True)

    iterator = FastaIterator.FastaIterator(options.stdin)

    print "id\t" + SequenceProperties().GetHeader()

    while 1:
        cur_record = iterator.next()

        if cur_record is None:
            break

        sequence = re.sub(" ", "", cur_record.sequence)

        if len(sequence) % 3:
            raise ValueError("sequence %s is not a multiple of 3: length=%i!" % (
                cur_record.title, len(sequence)))

        s = SequenceProperties()
        s.Load(sequence)

        print cur_record.title + "\t" + str(s)

    E.Stop()
Example #37
def buildMisprimingLib(infiles, outfile):
    '''
    build fasta file of sequences to check for mispriming
    '''
    fasta, identifiers = infiles
    inf = IOTools.openFile(fasta)

    E.info("reading ids for sequences to keep")
    ids = readIdentifiers(identifiers)

    outf = IOTools.openFile(outfile, "w")
    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title not in ids:
            outf.write(">%s\n%s\n" % (f.title, f.sequence))
    outf.close()
Example #38
def buildMisprimingLib(infiles, outfile):
    '''
    build fasta file of sequences to check for mispriming
    '''
    fasta, identifiers = infiles
    inf = IOTools.openFile(fasta)
    
    E.info("reading ids for sequences to keep")
    ids = readIdentifiers(identifiers)

    outf = IOTools.openFile(outfile, "w")
    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title not in ids:
            outf.write(">%s\n%s\n" % (f.title, f.sequence))
    outf.close()
Example #39
def segmentFixedWidthWindows(infile, window_size, window_shift):
    """return a list of fixed contig sizes."""

    ninput, nskipped, noutput = 0, 0, 0

    iterator = FastaIterator.FastaIterator(infile)
    # the window_shift argument is ignored: windows are non-overlapping
    window_shift = window_size
    # at most 50% can be gap
    gap_cutoff = int(window_size // 2)
    segments = []

    while 1:
        try:
            cur_record = iterator.next()
        except StopIteration:
            break

        ninput += 1

        if cur_record is None:
            break
        contig = re.sub("\s.*", "", cur_record.title)
        seq = cur_record.sequence
        size = len(cur_record.sequence)

        for x in range(0, size, window_shift):
            s = seq[x:x + window_size].upper()
            gc, at = 0, 0
            for c in s:
                if c in "GC":
                    gc += 1
                elif c in "AT":
                    at += 1

            # skip segments containing mostly gaps
            if window_size - (gc + at) > gap_cutoff:
                nskipped += 1
                continue

            segments.append(
                (contig, x, x + window_size, float(gc) / (gc + at)))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped_windows=%i" %
           (ninput, noutput, nskipped))

    return segments
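A hypothetical usage sketch: non-overlapping 1 kb windows and their GC fraction (gap-rich windows are dropped inside the function, and the shift argument is ignored as noted above):

with open("genome.fasta") as inf:
    for contig, start, end, gc in segmentFixedWidthWindows(inf, 1000, 1000):
        print("%s\t%i\t%i\t%.3f" % (contig, start, end, gc))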
Example #40
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-pm",
                      "--profilematrix",
                      dest="matrixfile",
                      type="string",
                      help="name of profile file you want to convert")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)
    #outf = IOTools.openFile("my_output", "w")
    for line in options.matrixfile:
        line = line.strip()
        fields = line.split()
        total = sum([float(col) for col in fields[1:]])
        if total == 0:
            continue
        else:
            for i, col in enumerate(fields):
                if i == 0: continue
                fields[i] = col / total
    options.stdout.write("\t".join(map(str, fields)))

    for fasta_read in FastaIterator.iterate(IOTools.openFile(
            options.fastafile)):
        read_sequence = fasta_read.sequence
        read_name = fasta_read.title
        quals = '.' * len(read_sequence)

        new_fastq = Fastq.Record(identifier=read_name,
                                 seq=read_sequence,
                                 quals=quals)
        new_fastq.fromPhred([30] * len(read_sequence), format='illumina-1.8')
        options.stdout.write(str(new_fastq) + "\n")
    # write footer and output benchmark information.
    E.Stop()
Example #41
def build_scaffold_lengths(contigs_file, outfile, params):
    '''
    output the distribution of scaffold lengths
    '''
    PARAMS = params

    if PARAMS["filter"]:
        f = PARAMS["filter"]
    else:
        f = 0
    inf = open(contigs_file)
    outf = open(outfile, "w")
    outf.write("scaffold_name\tlength\n")
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(list(record.sequence))
        if scaffold_length > f:
            # rename sequences if they have a space in them
            outf.write("%s\t%i\n" % (record.title.replace(" ", "_"), scaffold_length))
    outf.close()
def filter16SGenes(infiles, outfile):
    '''Drop anything with gaps and drop anything with sequence length
    below 1400bp'''

    fasta_file, stat_file = infiles

    # fetch a list of those entries to be filtered
    to_drop = set()
    df = pd.read_table(stat_file, index_col=0)
    to_drop.update(df[df['ngap_regions'] > 0].index.tolist())
    to_drop.update(df[df['length'] < 1400].index.tolist())

    # drop from fasta file
    with IOTools.openFile(outfile, 'w') as outf:
        for fasta in FastaIterator.FastaIterator(IOTools.openFile(fasta_file)):
            if fasta.title in to_drop:
                continue
            else:
                outf.write('\n'.join(['>' + fasta.title, fasta.sequence]) + '\n')   
Example #43
def runMEMEOnSequences(infile, outfile):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the 
    top 10% of intervals (peakval) are used. 
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker
    '''
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "meme", outfile)
    tmpdir = P.getTempDir(".")

    statement = '''
        meme %(infile)s -dna -revcomp 
                        -mod %(meme_model)s 
                        -nmotifs %(meme_nmotifs)s 
                        -oc %(tmpdir)s 
                        -maxsize %(motifs_max_size)s 
                        %(meme_options)s 
       > %(outfile)s.log
    '''

    P.run()

    collectMEMEResults(tmpdir, target_path, outfile)
def subsampleNReadsFromFasta(infile, outfile, nreads, logfile=""):

    checkParams()

    nseqs = FastaIterator.count(infile)

    if nreads > nseqs:
        prop = 1
    else:    
        prop = float(nreads)/float(nseqs)

    if logfile:
        logfile = "-L %s" % logfile

    statement = ''' python %(scriptsdir)s/fasta2fasta.py 
                     -I %(infile)s
                     %(logfile)s
                     -m sample
                     --sample-proportion=%(prop)s
                     -S %(outfile)s '''

    P.run()
def countCompleteGenes(infile, outfile):
    '''
    count the number of genes that are classed
    as complete based on having a start and stop codon
    '''
    start = "ATG"
    stop = ["TAG", "TAA", "TGA"]

    ntotal = 0
    nstart = 0
    nstop = 0
    nstart_nstop = 0
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        ntotal += 1
        if fasta.sequence.startswith(start):
            nstart += 1
        if fasta.sequence[-3:] in stop:
            nstop += 1
        if fasta.sequence.startswith(start) and \
                fasta.sequence[-3:] in stop:
            nstart_nstop += 1
    outf = open(outfile, "w")
    outf.write("total_genes\tpstart\tpstop\tpstart_stop\n")
    outf.write("\t".join(map(str, [
        ntotal,
        float(nstart) / ntotal,
        float(nstop) / ntotal,
        float(nstart_nstop) / ntotal])) + "\n")
    outf.close()
Example #46
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: split_fasta.py 1714 2007-12-11 16:51:12Z andreas $")

    parser.add_option("-f", "--file", dest="input_filename", type="string",
                      help="input filename. If not given, stdin is used.",
                      metavar="FILE")

    parser.add_option("-i", "--input-pattern", dest="input_pattern", type="string",
                      help="input pattern. Parses description line in order to extract id.")

    parser.add_option("-o", "--output-pattern", dest="output_pattern", type="string",
                      help="output pattern. Gives filename for a given sequence.")

    parser.add_option("-n", "--num-sequences", dest="num_sequences", type="int",
                      help="split by number of sequences (not implemented yet).")

    parser.add_option("-m", "--map", dest="map_filename", type="string",
                      help="map filename. Map identifiers to filenames",
                      metavar="FILE")

    parser.add_option("-s", "--skip-identifiers", dest="skip_identifiers", action="store_true",
                      help="do not write identifiers.",
                      metavar="FILE")

    parser.add_option("--min-size", dest="min_size", type="int",
                      help="minimum cluster size.")

    parser.set_defaults(
        input_filename=None,
        map_filename=None,
        skip_identifiers=False,
        input_pattern="^(\S+)",
        min_size=0,
        num_sequences=None,
        output_pattern="%s")

    (options, args) = E.Start(parser)

    if options.input_filename:
        infile = IOTools.openFile(options.input_filename, "r")
    else:
        infile = sys.stdin

    if options.map_filename:
        map_id2filename = IOTools.ReadMap(open(options.map_filename, "r"))
    else:
        map_id2filename = {}

    if options.num_sequences:
        files = FilesChunks(chunk_size=options.num_sequences,
                            output_pattern=options.output_pattern,
                            skip_identifiers=options.skip_identifiers)

    else:
        files = Files(output_pattern=options.output_pattern,
                      skip_identifiers=options.skip_identifiers)

    if options.input_pattern:
        rx = re.compile(options.input_pattern)
    else:
        rx = None

    ninput = 0
    noutput = 0
    identifier = None
    chunk = 0

    for seq in FastaIterator.iterate(infile):

        ninput += 1

        if rx:
            try:
                identifier = rx.search(seq.title).groups()[0]
            except AttributeError:
                print "# parsing error in description line %s" % (seq.title)
        else:
            identifier = seq.title

        if map_id2filename:
            if identifier in map_id2filename:
                identifier = map_id2filename[identifier]
            else:
                continue

        files.Write(identifier, seq)
        noutput += 1

    if options.input_filename:
        infile.close()

    # delete all clusters below a minimum size
    # Note: this has to be done at the end, because
    # clusters sizes are only available once both the fasta
    # file and the map has been parsed.
    if options.min_size:
        ndeleted = files.DeleteFiles(min_size=options.min_size)
    else:
        ndeleted = 0

    if options.loglevel >= 1:
        print "# input=%i, output=%i, ndeleted=%i" % (ninput, noutput, ndeleted)

    E.Stop()
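Files and FilesChunks are helpers this listing omits; a minimal sketch of Files consistent with how main() uses it (Write() appends records to per-identifier files, DeleteFiles() drops small clusters). The original CGAT classes may differ:

import collections
import os

class Files:
    def __init__(self, output_pattern="%s", skip_identifiers=False):
        self.output_pattern = output_pattern
        self.skip_identifiers = skip_identifiers
        self.sizes = collections.defaultdict(int)

    def Write(self, identifier, seq):
        '''append *seq* to the file named by output_pattern % identifier.'''
        filename = self.output_pattern % identifier
        with open(filename, "a") as outf:
            if self.skip_identifiers:
                outf.write("%s\n" % seq.sequence)
            else:
                outf.write(">%s\n%s\n" % (seq.title, seq.sequence))
        self.sizes[filename] += 1

    def DeleteFiles(self, min_size=0):
        '''delete all files with fewer than *min_size* records.'''
        ndeleted = 0
        for filename, size in self.sizes.items():
            if size < min_size:
                os.remove(filename)
                ndeleted += 1
        return ndeleted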
Example #47
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-s", "--correct-gap-shift", dest="correct_shift",
        action="store_true",
        help="correct gap length shifts in alignments. "
        "Requires alignlib_lite.py [%default]")

    parser.add_option(
        "-1", "--pattern1", dest="pattern1", type="string",
        help="pattern to extract identifier from in identifiers1. "
        "[%default]")

    parser.add_option(
        "-2", "--pattern2", dest="pattern2", type="string",
        help="pattern to extract identifier from in identifiers2. "
        "[%default]")

    parser.add_option(
        "-o", "--output-section", dest="output", type="choice",
        action="append",
        choices=("diff", "missed", "seqdiff"),
        help="what to output [%default]")

    parser.set_defaults(correct_shift=False,
                        pattern1="(\S+)",
                        pattern2="(\S+)",
                        output=[])

    (options, args) = E.Start(parser)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-shift requires alignlib_lite.py_ "
                "but alignlib not found")

    seqs1 = dict([
        (x.title, x.sequence) for x in FastaIterator.iterate(
            IOTools.openFile(args[0], "r"))])
    seqs2 = dict([
        (x.title, x.sequence) for x in FastaIterator.iterate(
            IOTools.openFile(args[1], "r"))])

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    found2 = {}

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    write_diff = "diff" in options.output or write_seqdiff

    for k in sorted(seqs1):
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2[k] = 1

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
        else:
            status = "other"

            ndiff += 1

            if s1[1:] == s2[1:]:
                ndiff_first += 1
                status = "first"
            elif s1[:m] == s2[:m]:
                ndiff_prefix += 1
                status = "prefix"
            elif s1[:-1] == s2[:-1]:
                ndiff_last += 1
                status = "last"
            else:
                if len(s1) == len(s2):
                    # get all differences: the first and last residues
                    # can be different for peptide sequences when
                    # comparing my translations with ensembl peptides.
                    differences = []
                    for x in range(1, len(s1) - 1):
                        if s1[x] != s2[x]:
                            differences.append((s1[x], s2[x]))

                    l = len(differences)
                    # check for Selenocysteins
                    if len([x for x in differences if x[0] == "U" or x[1] == "U"]) == l:
                        ndiff_selenocysteine += 1
                        status = "selenocysteine"

                    # check for masked residues
                    elif len([x for x in differences if x[0] in "NX" or x[1] in "NX"]) == l:
                        ndiff_masked += 1
                        status = "masked"

            # correct for different gap lengths
            if options.correct_shift:

                map_a2b = alignlib_lite.py_makeAlignmentVector()

                a, b = 0, 0
                keep = False

                x = 0
                while x < m and not (a == len(s1) and b == len(s2)):
                    try:
                        if s1[a] != s2[b]:
                            while s1[a] == "N" and s2[b] != "N":
                                a += 1
                            while s1[a] != "N" and s2[b] == "N":
                                b += 1

                            if s1[a] != s2[b]:
                                break
                    except IndexError:
                        print("# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i" % (k, x, a, b, len(s1), len(s2)))
                        break

                    a += 1
                    b += 1
                    map_a2b.addPairExplicit(a, b, 0.0)
                    # check if we have reached the end:
                else:
                    keep = True
                    nfixed += 1
                    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                    print("fix\t%s\t%s" % (k, str(f)))

                if not keep:
                    print("# warning: not fixable: %s" % k)

            if write_diff:
                options.stdout.write("---- %s ---- %s\n" % (k, status))

            if write_seqdiff:
                options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in sorted(list(seqs2.keys())):
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    options.stdlog.write("""# Legend:
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i" %
        (ndiff, ndiff_first, ndiff_last, ndiff_prefix,
         ndiff_selenocysteine, ndiff_masked, nfixed,
         ndiff - ndiff_first - ndiff_last - ndiff_prefix -
         ndiff_selenocysteine - ndiff_masked - nfixed))

    E.Stop()
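MapIdentifiers is not part of this listing; a sketch inferred from the call sites above, re-keying each title-to-sequence dict in place by the first group of the supplied pattern (the CGAT original may differ):

import re

def MapIdentifiers(seqs, pattern):
    rx = re.compile(pattern)
    for title in list(seqs.keys()):
        match = rx.search(title)
        if match:
            seqs[match.groups()[0]] = seqs.pop(title)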
def main( argv = None ):

    parser = E.OptionParser( version = "%prog version: $Id: analyze_codonbias_shannon.py 2864 2010-03-03 10:18:16Z andreas $",
                                    usage = globals()["__doc__"] )

    parser.add_option( "-c", "--is-cds", dest="is_cds", action="store_true",
                       help = "input are cds (nucleotide) sequences [%default]" )
    
    parser.set_defaults(
        is_cds = False,
        )
    
    (options, args) = E.Start( parser, argv = argv )

    options.stdout.write( "snpid\tidentifier\tpos\treference\tvariant\tcounts\tweight\n" )

    alphabet = "ACDEFGHIKLMNPQRSTVWY"
    
    snpid = 0

    for entry in FastaIterator.iterate( options.stdin ):
        identifier = entry.title

        if options.is_cds:
            cds_sequence = entry.sequence.upper()
            assert len(cds_sequence) % 3 == 0, \
                "length of sequence '%s' is not a multiple of 3" % entry.title

            sequence = Genomics.translate( cds_sequence )
            weights = []
            for pos, cds_pos in enumerate(range( 0, len(cds_sequence), 3)):
                codon = cds_sequence[cds_pos:cds_pos+3]
                counts = collections.defaultdict(int)
                for x in range(0,3):
                    rna = codon[x]
                    for na in "ACGT":
                        if na == rna: continue
                        taa = Genomics.translate(codon[:x] + na + codon[x+1:])
                        counts[taa] += 1
                weights.append( counts )

        else:
            sequence = entry.sequence.upper()
            counts = {}
            for x in alphabet: counts[x] = 1
            weights = [counts] * len(sequence)

        for pos, ref in enumerate( sequence ):

            if ref not in alphabet: continue
            w = weights[pos]
            t = float(sum(w.values()))
            for variant in alphabet:
                if variant == ref: continue
                snpid +=1
                options.stdout.write( 
                    "%s\n" % "\t".join(
                        ( "%010i" % snpid,
                          identifier,
                          str(pos+1),
                          ref, 
                          variant,
                          "%i" % w[variant],
                          "%6.4f" % (w[variant] / t),
                          )))
    
    E.Stop()
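A standalone check of the per-codon weighting in the cds branch, assuming Genomics.translate implements the standard genetic code: the nine single-base variants of ATG (Met) translate to two L, one V, one K, one T, one R and three I, so the Met-to-Ile row is written with weight 3/9:

import collections

CODE = {"CTG": "L", "GTG": "V", "TTG": "L",
        "AAG": "K", "ACG": "T", "AGG": "R",
        "ATA": "I", "ATC": "I", "ATT": "I"}

codon = "ATG"
counts = collections.defaultdict(int)
for x in range(3):
    for na in "ACGT":
        if na == codon[x]:
            continue
        counts[CODE[codon[:x] + na + codon[x + 1:]]] += 1

assert counts == {"L": 2, "V": 1, "K": 1, "T": 1, "R": 1, "I": 3}
assert float(counts["I"]) / sum(counts.values()) == 3.0 / 9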
Example #49
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"])

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int", help="supply kmer length")

    parser.add_option(
        "-p",
        "--output-proportion",
        dest="proportion",
        action="store_true",
        help="output proportions - overides the default output",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # do not allow greater than octonucleotide
    assert options.kmer <= 8, "cannot handle kmer of length %i" % options.kmer

    # how we deal with the nucleotides depends on the kmer length
    nucleotides = []
    for nucleotide in ["A", "C", "T", "G"]:
        nucleotides = nucleotides + [x for x in itertools.repeat(nucleotide, options.kmer)]

    E.info("retrieving %imer sequences" % options.kmer)
    # get all kmer sequences to query
    kmers = set()
    for kmer in itertools.permutations(nucleotides, options.kmer):
        kmers.add(kmer)

    E.info("matching %imers in file" % options.kmer)
    # count the number of kmers in each sequence

    result = {}

    # NB assume that non fasta files are caught by FastaIterator
    total_entries = 0
    for fasta in FastaIterator.iterate(options.stdin):
        total_entries += 1
        result[fasta.title] = {}
        for kmer in kmers:
            counts = [m.start() for m in re.finditer("".join(kmer), fasta.sequence)]
            result[fasta.title][kmer] = len(counts)

    E.info("writing results")
    # write out the results
    headers = result.keys()
    rows = set()
    for kmer_counts in result.values():
        for kmer, count in kmer_counts.iteritems():
            rows.add("".join(kmer))

    # write header row
    options.stdout.write("kmer\t" + "\t".join(headers) + "\n")

    # output proportions if required - normalises by the
    # total kmer count in each sequence
    E.info("computing total counts")
    totals = {}
    for header in headers:
        totals[header] = sum([result[header][tuple(row)] for row in rows])

    for row in rows:
        if options.proportion:
            options.stdout.write(
                "\t".join([row] + [str(float(result[header][tuple(row)]) / totals[header]) for header in headers])
                + "\n"
            )
        else:
            options.stdout.write("\t".join([row] + [str(result[header][tuple(row)]) for header in headers]) + "\n")

    E.info("written kmer counts for %i contigs" % total_entries)
    # write footer and output benchmark information.
    E.Stop()
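The enumeration above builds a pool of repeated nucleotides and takes permutations of it, relying on the set() to collapse the many duplicate tuples. itertools.product yields the same 4**k kmers directly; a minimal sketch of the equivalent enumeration:

import itertools


def all_kmers(k):
    '''yield every DNA kmer of length k as a string.'''
    for tup in itertools.product("ACGT", repeat=k):
        yield "".join(tup)

# sanity check: there are 4**3 == 64 trinucleotides
assert len(list(all_kmers(3))) == 64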
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id: contigs2random_sample.py 2871 2010-03-03 10:20:44Z nicki $",
                                   usage=globals()["__doc__"])

    parser.add_option("-m", "--species-map", dest="species_map", type="string",
                      help="text file specifying the mapping between contig and genome")

    parser.add_option("-g", "--genome-dir", dest="genome_dir", type="string",
                      help="specify directory where genome / genomes are stored")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # read in contig lengths into dictionary
    E.info("reading contigs file")
    c_contigs = 0
    contigs_lengths = {}
    for fasta in FastaIterator.iterate(options.stdin):
        c_contigs += 1

        # titles of fasta records must be single strings with no special
        # characters
        contigs_lengths[fasta.title.split(" ")[0]] = len(fasta.sequence)

    E.info("read %i contigs" % c_contigs)

    # read in mapping between species and contigs
    species_map = {}
    for line in open(options.species_map).readlines():
        data = line[:-1].split("\t")
        contig, species = data[0], data[1]
        species_map[contig] = species

    # read genomes into memory
    # NB this may need optimising if using large
    # genomes or many genomes
    E.info("reading genomes from %s" % options.genome_dir)

    # The directory must ONLY contain genome files!!
    genomes_sequences = {}
    c_genomes = 0
    for genome_file in glob.glob(os.path.join(options.genome_dir, "*")):
        c_genomes += 1
        for fasta in FastaIterator.iterate(IOTools.openFile(genome_file)):
            genomes_sequences[fasta.title] = fasta.sequence
    E.info("read %i genomes from %s" % (c_genomes, options.genome_dir))

    # iterate over the contigs and sample from the respective genome
    E.info("iterating over contigs")
    c_contigs_output = 0
    for contig, length in contigs_lengths.iteritems():
        if contig not in species_map:
            E.warn("contig %s not in species map file" % contig)
        else:
            c_contigs_output += 1
            genome = species_map[contig]
            genome_length = len(genomes_sequences[genome])

            # get the start position from which to sample; bound it
            # so that the sampled window fits inside the genome
            start = random.randint(0, max(0, genome_length - length))
            end = start + length

            sampled_seq = genomes_sequences[genome][start:end]
            options.stdout.write(
                ">%s_random\n%s\n" % (contig + "_%s" % species_map[contig], sampled_seq))

    E.info("written %i contigs" % c_contigs_output)
    # write footer and output benchmark information.
    E.Stop()
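The sampling step above has to keep the window inside the genome. A bounds-safe sketch of that step in isolation, assuming the window length never exceeds the genome length:

import random


def sample_window(genome_sequence, length):
    '''return a uniformly placed substring of the given length.'''
    # the last valid start position keeps the full window inside
    # the sequence
    start = random.randint(0, len(genome_sequence) - length)
    return genome_sequence[start:start + length]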
Example #51
        
    else:
        files = Files( output_pattern = options.output_pattern,
                       skip_identifiers = options.skip_identifiers )

    if options.input_pattern:
        rx = re.compile( options.input_pattern )
    else:
        rx = None

    ninput = 0
    noutput = 0
    identifier = None
    chunk = 0
    
    for seq in FastaIterator.iterate( infile ):
        
        ninput += 1

        if rx:
            try:
                identifier = rx.search(seq.title).groups()[0]
            except AttributeError:
                print "# parsing error in description line %s" % (seq.title)
        else:
            identifier = seq.title

        if map_id2filename:
            if identifier in map_id2filename:
                identifier = map_id2filename[identifier]
            else:
def findTATABox(infiles, outfile):
    '''find TATA box in promoters. There are several matrices to choose from:

    M00216 V$TATA_C Retroviral TATA box
    M00252 V$TATA_01 cellular and viral TATA box elements
    M00311 V$ATATA_B Avian C-type TATA box
    M00320 V$MTATA_B Muscle TATA box
    '''

    # 1. create fasta file - look for TATA box
    #
    bedfile, genomefile = infiles

    statement = '''
    slopBed -i %(bedfile)s
            -l %(tata_search_upstream)i
            -r %(tata_search_downstream)i
            -s
            -g %(genomefile)s
    | cgat bed2fasta 
       --use-strand
       --genome=%(genome_dir)s/%(genome)s
       --log=%(outfile)s.log
    > %(outfile)s.fasta
    '''

    P.run()

    match_executable = '/ifs/data/biobase/transfac/match/bin/match_linux64'
    match_matrix = '/ifs/data/biobase/transfac/dat/matrix.dat'
    # write a custom profile next to the output file (supersedes the
    # stock minFP_good.prf profile)
    match_profile = outfile + ".prf"

    prf = '''tata.prf
prf to minimize sum of both errors - derived from minSUM.prf
 MIN_LENGTH 300
0.0
 1.000 0.716 0.780 M00216 V$TATA_C
 1.000 0.738 0.856 M00252 V$TATA_01
 1.000 0.717 0.934 M00311 V$ATATA_B
 1.000 0.711 0.784 M00320 V$MTATA_B
//
'''

    with IOTools.openFile(match_profile, "w") as outf:
        outf.write(prf)

    # -u : uniq - only one best match per sequence
    statement = '''
         %(match_executable)s
         %(match_matrix)s
         %(outfile)s.fasta
         %(outfile)s.match
         %(match_profile)s
         -u
    >> %(outfile)s.log
    '''
    P.run()

    transcript2pos = {}
    for entry in FastaIterator.iterate(IOTools.openFile(outfile + ".fasta")):
        transcript_id, contig, start, end, strand = re.match(
            "(\S+)\s+(\S+):(\d+)\.\.(\d+)\s+\((\S)\)", entry.title).groups()
        transcript2pos[transcript_id] = (contig, int(start), int(end), strand)

    MATCH = collections.namedtuple(
        "MATCH", "pid transfac_id pos strand core_similarity matrix_similarity sequence")

    def _grouper(infile):
        r = []
        keep = False
        pid = None
        for line in infile:
            if line.startswith("Inspecting sequence ID"):
                keep = True
                if r:
                    yield pid, r
                r = []
                pid = re.match(
                    "Inspecting sequence ID\s+(\S+)", line).groups()[0]
                continue
            elif line.startswith(" Total"):
                break

            if not keep:
                continue
            if line[:-1].strip() == "":
                continue
            transfac_id, v, core_similarity, matrix_similarity, sequence = [
                x.strip() for x in line[:-1].split("|")]
            pos, strand = re.match("(\d+) \((\S)\)", v).groups()
            r.append(MATCH._make((pid, transfac_id, int(pos), strand,
                                  float(core_similarity), float(matrix_similarity), sequence)))

        if pid is not None:
            yield pid, r

    offset = PARAMS["tata_search_upstream"]

    outf = IOTools.openFile(outfile + ".table.gz", "w")
    outf.write("\t".join(("transcript_id", "strand",
                          "start", "end",
                          "relative_start", "relative_end",
                          "transfac_id",
                          "core_similarity",
                          "matrix_similarity",
                          "sequence")) + "\n")

    bedf = IOTools.openFile(outfile, "w")

    c = E.Counter()
    found = set()
    for transcript_id, matches in _grouper(IOTools.openFile(outfile + ".match")):
        contig, seq_start, seq_end, strand = transcript2pos[transcript_id]
        c.promotor_with_matches += 1
        nmatches = 0
        found.add(transcript_id)
        for match in matches:

            c.matches_total += 1
            lmatch = len(match.sequence)
            if match.strand == "-":
                c.matches_wrong_strand += 1
                continue

            # get genomic location of match
            if strand == "+":
                genome_start = seq_start + match.pos
            else:
                genome_start = seq_end - match.pos - lmatch

            genome_end = genome_start + lmatch

            # get relative location of match
            if strand == "+":
                tss_start = seq_start + offset
                relative_start = genome_start - tss_start
            else:
                tss_start = seq_end - offset
                relative_start = tss_start - genome_end

            relative_end = relative_start + lmatch

            outf.write("\t".join(map(str, (
                transcript_id, strand,
                genome_start, genome_end,
                relative_start, relative_end,
                match.transfac_id,
                match.core_similarity,
                match.matrix_similarity,
                match.sequence))) + "\n")
            c.matches_output += 1
            nmatches += 1

            bedf.write("\t".join(map(
                str,
                (contig, genome_start, genome_end, transcript_id, strand,
                 match.matrix_similarity))) + "\n")

        if nmatches == 0:
            c.promotor_filtered += 1
        else:
            c.promotor_output += 1

    c.promotor_total = len(transcript2pos)
    c.promotor_without_matches = len(
        set(transcript2pos.keys()).difference(found))

    outf.close()
    bedf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
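On both strands the relative coordinates computed above reduce to match.pos - offset. A small check of that arithmetic with toy coordinates (hypothetical values, not real data):

def relative_start(strand, seq_start, seq_end, pos, lmatch, offset):
    '''mirror the strand arithmetic used in findTATABox.'''
    if strand == "+":
        genome_start = seq_start + pos
        return genome_start - (seq_start + offset)
    else:
        genome_end = (seq_end - pos - lmatch) + lmatch
        return (seq_end - offset) - genome_end

# both strands give pos - offset = 20
assert relative_start("+", 1000, 1300, 120, 8, 100) == 20
assert relative_start("-", 1000, 1300, 120, 8, 100) == 20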
Example #53
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--input-fasta", dest="fasta", type="str",
                      help="name of fasta infile")

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int",
                      help="supply kmer length")

    parser.add_option("--subset", dest="subset", type="int",
                      help="only analyse the first x entries")

    parser.set_defaults(
        fasta=None,
        kmer=10,
        subset=None)

    (options, args) = E.Start(parser)

    E.info("%s\n" % using("start"))

    assert options.fasta, "must provide a fasta filename (--input-fasta=)"

    k = KmerCounter()

    Iterator = FastaIterator.iterate(IOTools.openFile(options.fasta))

    # total entries also acts as the index for the entry_id
    total_entries = 0

    options.stdout.write("%s\n" % "\t".join((
        "id", "unique_kmers", "non_unique_kmers", "fraction_unique")))

    # iterate transcripts, shred and identify unique kmers
    E.info("shredding fasta to identify unique kmers")
    for entry in Iterator:
        if total_entries % 1000 == 0:
            E.info("1st shred complete for %i entries" % total_entries)

        if options.subset and total_entries >= options.subset:
            break

        k.shred(entry.sequence.upper(), options.kmer)
        total_entries += 1

    total_entries = 0
    Iterator = FastaIterator.iterate(IOTools.openFile(options.fasta))

    # iterate transcripts, shred and count unique kmers
    E.info("re-shredding fasta to count unique kmers")

    for entry in Iterator:
        if total_entries % 1000 == 0:
            E.info("2nd shred complete for %i entries" % total_entries)

        transcript_id = entry.title.split()[0]

        if options.subset and total_entries >= options.subset:
            break

        total_entries += 1

        unique, non_unique = k.countUniqueKmers(
            entry.sequence.upper(), options.kmer)

        fraction = np.divide(float(unique), (unique + non_unique))

        options.stdout.write("%s\n" % "\t".join(
            map(str, (transcript_id, unique, non_unique, fraction))))

    E.info("found %i kmers" % len(k.kmer2entry))
    E.info("written kmer counts for %i contigs" % total_entries)
    # write footer and output benchmark information.
    E.info("%s\n" % using("end"))

    E.Stop()
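KmerCounter is imported from elsewhere in the codebase and not shown. A minimal sketch consistent with how it is used here, where shred() registers the kmers of one entry and countUniqueKmers() reports how many of a sequence's kmers were seen in exactly one entry (note the gene-level variant in Example #57 passes lists of sequences instead of single strings):

import collections


class KmerCounter(object):

    def __init__(self):
        # kmer -> set of entry indices the kmer was seen in
        self.kmer2entry = collections.defaultdict(set)
        self.n_entries = 0

    def shred(self, sequence, k):
        '''register all kmers of one entry.'''
        self.n_entries += 1
        for i in range(len(sequence) - k + 1):
            self.kmer2entry[sequence[i:i + k]].add(self.n_entries)

    def countUniqueKmers(self, sequence, k):
        '''return (unique, non_unique) kmer counts for one sequence.'''
        unique = non_unique = 0
        for i in range(len(sequence) - k + 1):
            if len(self.kmer2entry[sequence[i:i + k]]) == 1:
                unique += 1
            else:
                non_unique += 1
        return unique, non_unique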
Example #54
    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        method = None,
        parameters = "",
        gop = -10.0,
        gep = -1.0,
        alignment_method = "sw",
        )

    (options, args) = E.Start( parser )

    options.parameters = options.parameters.split(",")    

    iterator = FastaIterator.iterate( sys.stdin )

    if options.method == "add":
        
        mali = Mali.Mali()

        mali.readFromFile( open(options.parameters[0], "r"), format = options.input_format )
        del options.parameters[0]

        old_length = mali.getLength()
        
        new_mali = convertMali2Mali( mali )

        if options.alignment_method == "sw":
            alignator = alignlib.makeAlignatorFullDP( options.gop, options.gep )
        else:
Example #55
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--output-quality-format", dest="q_format", type="int",
        help="sequence quality format, e.g 33 = +33/Sanger"
        "[default=%default].")

    parser.add_option(
        "--output-paired-end", dest="paired", action="store_true",
        help="generate paired end reads [default = %default].")

    parser.add_option(
        "--insert-length-mean", dest="insert_mean", type="float",
        help="mean insert length [default = %default].")

    parser.add_option(
        "--insert-length-sd", dest="insert_sd", type="float",
        help="insert length standard deviation [default = %default].")

    parser.add_option(
        "--counts-method", dest="counts_method", type="choice",
        choices=("reads", "copies"),
        help="simulate a ground truth number of reads per entry or"
        "copies per entry [default = %default].")

    parser.add_option(
        "--counts-min", dest="counts_min", type="float",
        help="minimum number of reads/read pairs per fasta entry"
        "or copies per entry [default = %default].")

    parser.add_option(
        "--counts-max", dest="counts_max", type="float",
        help="maximum number of reads/read pairs per fasta entry "
        "or copies per entry [default = %default].")

    parser.add_option(
        "--output-read-length", dest="read_length", type="int",
        help="read length [default = %default].")

    parser.add_option(
        "--sequence-error-phred", dest="phred", type="int",
        help="phred quality score [default = %default].")

    parser.add_option(
        "--output-counts", dest="output_counts", type="string",
        help="name for counts outfile [default=%default].")

    parser.add_option(
        "--output-fastq2", dest="fastq2_out", type="string",
        help="filename for second fastq outfile [default=%default].")

    parser.add_option(
        "--premrna-fraction", dest="premrna_fraction", type="float",
        help="the fraction of reads to simulate from pre-mRNA"
        "[default= % default].")

    parser.add_option(
        "--infile-premrna-fasta", dest="premrna_fasta", type="string",
        help="filename for pre-mRNA fasta[default=%default].")

    parser.set_defaults(
        q_format=33,
        paired=False,
        insert_mean=0,
        insert_sd=1,
        counts_method="reads",
        counts_min=1,
        counts_max=1,
        read_length=50,
        fastq2_out=None,
        output_counts=None,
        phred=30,
        premrna_fraction=0,
        premrna_fasta=None
    )

    (options, args) = E.Start(parser)

    if options.paired:
        assert options.fastq2_out, ("must specify a second fastq outfile for "
                                    "paired end (--output-fastq2)")
        outf2 = IOTools.openFile(options.fastq2_out, "w")

    if options.premrna_fraction:
        assert options.premrna_fasta, ("must specify the location of the "
                                       "fasta file for the pre-mRNA")

    # the sequence quality string will always be the same so define here
    sequence_quality = chr(options.q_format + options.phred)
    qual = "".join([sequence_quality] * options.read_length)

    if options.premrna_fraction:
        iterator = FastaIterator.iterate_together(
            options.stdin, IOTools.openFile(options.premrna_fasta))
    else:
        iterator = FastaIterator.FastaIterator(options.stdin)

    # set a cut off of twice the read/pair length for short entries
    if options.paired:
        minimum_entry_length = (
            2 * ((options.read_length * 2) + options.insert_mean))
    else:
        minimum_entry_length = 2 * options.read_length

    c = collections.Counter()
    counts = collections.Counter()
    copies = collections.Counter()

    for f_entry in iterator:

        if options.premrna_fraction:

            assert getTitle(f_entry[0]) == getTitle(f_entry[1]), (
                "entry ids do not match: %s != %s" % (
                    f_entry[0].title, f_entry[1].title))
            entry = f_entry[0]
            pre_entry = f_entry[1]

        else:
            entry = f_entry

        # reject short fasta entries
        if len(entry.sequence) < minimum_entry_length:
            E.info("skipping short transcript: %s length=%i"
                   % (entry.title, len(entry.sequence)))
            c['skipped'] += 1
            continue

        else:
            c['not_skipped'] += 1

        if options.paired:
            fragment_length = (
                (2 * options.read_length) + options.insert_mean)
        else:
            fragment_length = options.read_length

        reads_per_entry = float(len(entry.sequence)) / fragment_length

        if options.counts_method == "reads":
            n_reads = random.randint(options.counts_min,
                                     options.counts_max + 1)

            n_copies = float(n_reads) / reads_per_entry

            if options.premrna_fraction:
                n_reads_pre = int(round(n_reads * options.premrna_fraction))

        elif options.counts_method == "copies":

            # random float [0-1]
            rand = np.random.random_sample()
            n_copies = (options.counts_min +
                        (rand * (options.counts_max - options.counts_min)))

            n_reads = int(round(n_copies * reads_per_entry, 0))

            # as n_reads must be rounded to int, need to redefine n_copies
            n_copies = float(n_reads) / reads_per_entry

            if options.premrna_fraction:
                reads_per_pre_entry = (float(len(pre_entry.sequence)) /
                                       fragment_length)
                n_copies_pre = n_copies * options.premrna_fraction
                n_reads_pre = int(round(n_copies_pre * reads_per_pre_entry, 0))
                # as n_reads_pre must be rounded to int, need to
                # redefine n_copies_pre
                n_copies_pre = float(n_reads_pre) / reads_per_pre_entry

        entry_id = getTitle(entry)

        counts[entry_id] = n_reads
        copies[entry_id] = n_copies

        if "N" in entry.sequence.upper():
            E.warn("fasta entry %s contains unknown bases ('N')" % entry_id)

        for i in range(0, n_reads):

            read = generateRead(entry=entry.sequence.upper(),
                                read_length=options.read_length,
                                error_rate=options.phred,
                                paired=options.paired,
                                insert_mean=options.insert_mean,
                                insert_sd=options.insert_sd)

            if options.paired:
                r1, r2 = read
                h1 = "@%s_%i/1" % (entry_id, i)
                h2 = "@%s_%i/2" % (entry_id, i)

                options.stdout.write("\n".join((h1, r1, "+", qual)) + "\n")
                outf2.write("\n".join((h2, r2, "+", qual)) + "\n")

            else:
                h = "@%s_%i/1" % (entry_id, i)

                options.stdout.write("\n".join((h, read, "+", qual)) + "\n")

        if options.premrna_fraction:
            c['pre_counts'] += n_reads_pre
            c['pre_copies'] += n_copies_pre

            for i in range(0, n_reads_pre):

                read = generateRead(entry=pre_entry.sequence.upper(),
                                    read_length=options.read_length,
                                    error_rate=options.phred,
                                    paired=options.paired,
                                    insert_mean=options.insert_mean,
                                    insert_sd=options.insert_sd)

                if options.paired:
                    r1, r2 = read
                    h1 = "@%s_pre-mRNA_%i/1" % (entry_id, i)
                    h2 = "@%s_pre-mRNA_%i/2" % (entry_id, i)

                    options.stdout.write("\n".join((h1, r1, "+", qual)) + "\n")
                    outf2.write("\n".join((h2, r2, "+", qual)) + "\n")

                else:
                    h = "@%s_pre-mRNA_%i/1" % (entry_id, i)

                    options.stdout.write("\n".join((h, read, "+", qual)) + "\n")

    if options.paired:
        outf2.close()

    with IOTools.openFile(options.output_counts, "w") as counts_out:

        counts_out.write("%s\n" % "\t".join(("id", "read_count", "tpm")))

        sum_copies = sum(copies.values())
        sum_counts = sum(counts.values())

        for entry_id, count in counts.items():
            tpm = 1000000 * (float(copies[entry_id]) / sum_copies)
            counts_out.write(
                "%s\n" % "\t".join(map(str, (entry_id, count, tpm))))

    E.info("Reads simulated for %i fasta entries, %i entries skipped"
           % (c['not_skipped'], c['skipped']))

    E.info("Simulated: %i reads (%i mRNA, %i pre-mRNA), "
           "%f transcripts (%f mRNA, %f pre-mRNA)" % (
               sum_counts + c['pre_counts'], sum_counts, c['pre_counts'],
               sum_copies + c['pre_copies'], sum_copies, c['pre_copies']))

    E.Stop()
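generateRead and getTitle are defined elsewhere in the script and not shown. A minimal sketch matching the call signatures used above (uniform fragment placement, mate reverse-complemented; the error_rate argument is accepted but sequencing errors are not simulated in this sketch):

import random


def getTitle(entry):
    '''first whitespace-separated token of a fasta title (assumed).'''
    return entry.title.split()[0]


def generateRead(entry, read_length=50, error_rate=30, paired=False,
                 insert_mean=0, insert_sd=1.0):
    '''sample a read (or read pair) from a sequence string.'''
    comp = {"A": "T", "C": "G", "G": "C", "T": "A", "N": "N"}
    if paired:
        fragment = int(round(2 * read_length +
                             random.normalvariate(insert_mean, insert_sd)))
        # clamp so the fragment always fits and holds both mates
        fragment = max(2 * read_length, min(fragment, len(entry)))
        start = random.randint(0, len(entry) - fragment)
        r1 = entry[start:start + read_length]
        mate = entry[start + fragment - read_length:start + fragment]
        r2 = "".join(comp[b] for b in reversed(mate))
        return r1, r2
    start = random.randint(0, len(entry) - read_length)
    return entry[start:start + read_length]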
Example #56
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: run_nubiscan.py 2861 2010-02-23 17:36:32Z andreas $", usage = globals()["__doc__"] )

    parser.add_option("-i", "--iterations", dest="iterations", type="int",
                      help="number of iterations for sampling [default=%default]."  )

    parser.add_option("-q", "--qvalue", dest="qvalue_threshold", type="float",
                      help="qvalue threshold [default=%default]."  )

    parser.add_option("--without-combine", dest="combine", action = "store_false",
                      help="combine overlapping motifs [default=%default]."  )

    parser.add_option("-f", "--fdr-control", dest="fdr_control", type="choice",
                      choices = ("per-sequence", "all", "xall"),
                      help="qvalue threshold [default=%default]."  )

    parser.add_option("-m", "--motif", dest="motif", type="choice",
                      choices=("rxrvdr", "rxrvdr1", "rxrvdr2", "nr"),
                      help="qvalue threshold [default=%default]."  )

    parser.add_option("-a", "--arrangements", dest="arrangements", type="string",
                      help ="',' separated list of repeat arrangements [default=%default]")

    parser.add_option("-x", "--mask", dest="mask", type="choice",
                      choices=("dust","repeatmasker"),
                      help ="mask sequences before scanning [default=%default]")

    parser.add_option("--output-stats", dest="output_stats", action = "store_true",
                      help="output stats [default=%default]."  )

    parser.add_option("--add-sequence", dest="add_sequence", action = "store_true",
                      help="add sequence information [default=%default]."  )

    parser.set_defaults(
        iterations = 100,
        qvalue_threshold = 0.05,
        motif = "rxrvdr",
        fdr_control = "all",
        combine = True,
        arrangements = None,
        mask = None,
        output_stats = False,
        add_sequence = False,
        )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv, add_output_options = True )

    ## do sth
    ninput, nskipped, noutput = 0, 0, 0

    if options.arrangements is None:
        options.arrangements = [ "DR%s" % x for x in range(0,15) ] + [ "ER%s" % x for x in range(0,15) ]
    else:
        options.arrangements = options.arrangements.split(",")
        
    options.stdout.write( "%s" % "\t".join(Nubiscan.NubiscanMatch._fields) )
    if options.add_sequence: options.stdout.write( "\tsequence" )
    options.stdout.write("\n")

    if options.motif == 'nr': sense_matrix = NR
    elif options.motif == "rxrvdr": sense_matrix = RXRVDR
    elif options.motif == "rxrvdr1": sense_matrix = RXRVDR1
    elif options.motif == "rxrvdr2": sense_matrix = RXRVDR2
    else:
        raise ValueError("unknown matrix %s" % options.motif)

    if options.fdr_control == "all":

        seqs = list(FastaIterator.iterate(options.stdin))

        if options.mask:
            masked_seqs = maskSequences( [x.sequence for x in seqs], options.mask )
        else:
            masked_seqs = [x.sequence for x in seqs]
            
        ninput = len(seqs)
        map_id2title = dict( enumerate( [re.sub("\s.*", "", x.title) for x in seqs] ) )
        matcher = Nubiscan.MatcherRandomisationSequences( sense_matrix,
                                                          samples = options.iterations )
        
        results = matcher.run( masked_seqs,
                               options.arrangements,
                               qvalue_threshold = options.qvalue_threshold )

        if options.combine:
            results =  Nubiscan.combineMotifs( results )
        
        for r in results:

            if r.alternatives:
                alternatives = ",".join( [x.arrangement for x in r.alternatives ] )
            else:
                alternatives = ""

            options.stdout.write( "\t".join( ( 
                map_id2title[r.id],
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue,
                alternatives) ) )

            if options.add_sequence:
                s = masked_seqs[int(r.id)][r.start:r.end]
                if r.strand == "-": s = Genomics.complement( s )
                s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper()
                options.stdout.write( "\t%s" % s )
                
            options.stdout.write("\n")
            noutput += 1

        # output stats
        if options.output_stats:
            outfile = E.openOutputFile( "fdr" )
            outfile.write("bin\thist\tnobserved\n" )
            for bin, hist, nobs in zip(matcher.bin_edges, matcher.hist, matcher.nobservations):
                outfile.write( "%f\t%f\t%f\n" % (bin, hist, nobs))
            outfile.close()


    elif options.fdr_control == "xall":

        matcher = Nubiscan.MatcherRandomisationSequence( sense_matrix,
                                                         samples = options.iterations )
    

        # collect all results
        matches = []
        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            mm = matcher.run( seq.sequence,
                              options.arrangements,
                              qvalue_threshold = None )
            for m in mm:
                matches.append( m._replace( sequence = seq.title ) )

        # estimate qvalues for all matches across all sequences
        pvalues = [ x.pvalue for x in matches ]
        fdr = Stats.doFDR( pvalues )
        qvalues = fdr.mQValues
        results = []
        for m, qvalue in zip(matches, qvalues):
            if qvalue > options.qvalue_threshold: continue
            results.append( m._replace( qvalue = qvalue ) )

        if options.combine:            
            results =  Nubiscan.combineMotifs( results )

        # output
        for r in results:
            options.stdout.write( "\t".join( ( 
                r.id,
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue ) ) + "\n" )

            noutput += 1

    elif options.fdr_control == "per-sequence":
        matcher = Nubiscan.MatcherRandomisationSequence( sense_matrix,
                                                         samples = options.iterations )
    

        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            result = matcher.run( seq.sequence,
                                  options.arrangements,
                                  qvalue_threshold = options.qvalue_threshold )
            
            if options.combine:
                result =  Nubiscan.combineMotifs( result )

            t = re.sub(" .*","",  seq.title)
            for r in result:
                options.stdout.write( "\t".join( ( 
                    t,
                    "%i" % r.start,
                    "%i" % r.end,
                    r.strand,
                    r.arrangement,
                    "%6.4f" % r.score,
                    "%6.4f" % r.zscore,
                    "%f" % r.pvalue,
                    "%f" % r.qvalue ) ) + "\n" )

            noutput += 1
    
    E.info( "ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput,nskipped) )

    ## write footer and output benchmark information.
    E.Stop()
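Stats.doFDR supplies the qvalues consumed above. A minimal Benjamini-Hochberg sketch of that computation (an assumption about the method doFDR implements):

def bh_qvalues(pvalues):
    '''Benjamini-Hochberg qvalues, preserving input order.'''
    n = len(pvalues)
    order = sorted(range(n), key=lambda i: pvalues[i])
    qvalues = [0.0] * n
    prev = 1.0
    # walk from the largest to the smallest pvalue, enforcing monotonicity
    for rank, i in enumerate(reversed(order)):
        k = n - rank  # 1-based rank of pvalues[i] among the sorted pvalues
        prev = min(prev, pvalues[i] * n / k)
        qvalues[i] = prev
    return qvalues

# bh_qvalues([0.01, 0.04, 0.03]) -> [0.03, 0.04, 0.04]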
Example #57
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"])

    parser.add_option("--input-fasta", dest="fasta", type="str", help="name of fasta infile")

    parser.add_option(
        "--method",
        dest="method",
        type="choice",
        choices=("transcript", "gene"),
        help="count unique kmers per transcript or gene",
    )

    parser.add_option("--genemap", dest="genemap", type="str", help="file mapping transcripts to genes")

    parser.add_option("-k", "--kmer-size", dest="kmer", type="int", help="supply kmer length")

    parser.add_option("--subset", dest="subset", type="int", help="only analyse the first x entries")

    parser.set_defaults(fasta=None, method="transcript", genemap=None, kmer=10, subset=None)

    (options, args) = E.Start(parser)

    E.info("%s\n" % using("start"))

    assert options.fasta, "must provide a fasta filename (--input-fasta=)"

    k = KmerCounter()

    Iterator = FastaIterator.iterate(IOTools.openFile(options.fasta))

    # total entries also acts as the index for the entry_id
    total_entries = 0

    options.stdout.write("%s\n" % "\t".join(("id", "unique_kmers", "non_unique_kmers", "fraction_unique")))

    # iterate fasta entries, shred and identify kmers

    if options.method == "gene":
        E.info("shredding genes to identify unique kmers")

        assert options.genemap, (
            "to perform a gene-level unique kmer count, " "you must supply a transcript2gene map (--genemap)"
        )
        t2g = {}
        with IOTools.openFile(options.genemap, "r") as inf:
            for line in inf:
                transcript, gene = line.strip().split("\t")
                t2g[transcript] = gene

        genes = set()
        current_gene = None
        sequences = []

        for entry in Iterator:

            if options.subset and total_entries >= options.subset:
                break

            transcript_id = entry.title.split()[0]
            gene_id = t2g[transcript_id]

            if gene_id != current_gene:
                if not current_gene:
                    # first gene: start collecting its sequences
                    current_gene = gene_id
                    sequences = [entry.sequence.upper()]
                    continue

                # check that this is the first time we have seen this gene
                assert current_gene not in genes, (
                    "the fasta does not appear to be sorted in gene order, the"
                    " same gene is observed in non-consecutive positions!"
                )

                genes.add(current_gene)

                k.shred(sequences, options.kmer)

                if total_entries % 1000 == 0:
                    E.info("1st shred complete for %i genes" % total_entries)

                total_entries += 1

                sequences = [entry.sequence.upper()]
                current_gene = gene_id

            else:
                sequences.append(entry.sequence.upper())

        # catch the last gene
        if not options.subset or total_entries < options.subset:
            k.shred(sequences, options.kmer)

        E.info("1st shred complete for %i genes" % total_entries)

    elif options.method == "transcript":
        E.info("shredding transcripts to identify unique kmers")
        for entry in Iterator:
            if total_entries % 1000 == 0:
                E.info("1st shred complete for %i transcripts" % total_entries)

            if options.subset and total_entries >= options.subset:
                break

            k.shred([entry.sequence.upper()], options.kmer)
            total_entries += 1
        E.info("1st shred complete for %i transcripts" % total_entries)

    total_entries = 0
    Iterator = FastaIterator.iterate(IOTools.openFile(options.fasta))

    # iterate fasta entries, shred and count unique kmers
    if options.method == "gene":
        E.info("re-shredding fasta to count gene unique kmers")

        genes = set()
        current_gene = None
        sequences = []

        for entry in Iterator:

            if options.subset and total_entries >= options.subset:
                break

            transcript_id = entry.title.split()[0]
            gene_id = t2g[transcript_id]

            if gene_id != current_gene:
                if not current_gene:
                    # first gene: start collecting its sequences
                    current_gene = gene_id
                    sequences = [entry.sequence.upper()]
                    continue

                # check that this is the first time we have seen this gene
                assert current_gene not in genes, (
                    "the fasta does not appear to be sorted in gene order, the"
                    " same gene is observed in non-consecutive positions!"
                )
                genes.add(current_gene)

                unique, non_unique = k.countUniqueKmers(sequences, options.kmer)

                fraction = np.divide(float(unique), (unique + non_unique))

                options.stdout.write("%s\n" % "\t".join(map(str, (current_gene, unique, non_unique, fraction))))

                if total_entries % 1000 == 0:
                    E.info("2nd shred complete for %i genes" % total_entries)

                total_entries += 1

                sequences = [entry.sequence.upper()]
                current_gene = gene_id

            else:
                sequences.append(entry.sequence.upper())

        # catch the last gene
        if not options.subset or total_entries < options.subset:
            unique, non_unique = k.countUniqueKmers(sequences, options.kmer)

            fraction = np.divide(float(unique), (unique + non_unique))

            options.stdout.write("%s\n" % "\t".join(map(str, (gene_id, unique, non_unique, fraction))))

    if options.method == "transcript":
        E.info("re-shredding fasta to count transcript unique kmers")
        for entry in Iterator:

            if total_entries % 1000 == 0:
                E.info("2nd shred complete for %i transcripts" % total_entries)

            if options.subset and total_entries >= options.subset:
                break

            transcript_id = entry.title.split()[0]

            total_entries += 1

            unique, non_unique = k.countUniqueKmers([entry.sequence.upper()], options.kmer)

            fraction = np.divide(float(unique), (unique + non_unique))

            options.stdout.write("%s\n" % "\t".join(map(str, (transcript_id, unique, non_unique, fraction))))

    E.info("found %i kmers" % len(k.kmer2entry))
    E.info("written kmer counts for %i contigs" % total_entries)
    # write footer and output benchmark information.
    E.info("%s\n" % using("end"))

    E.Stop()
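The --genemap file is a plain two-column, tab-separated transcript-to-gene mapping, and the fasta must list each gene's transcripts consecutively. A hypothetical example (ids invented):

transcript1	gene1
transcript2	gene1
transcript3	gene2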
Example #58
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: sequences2mali.py 2782 2009-09-10 11:40:29Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-i", "--input-format", dest="input_format", type="choice",
                      choices=(
                          "plain", "fasta", "clustal", "stockholm", "phylip"),
                      help="input format of multiple alignment")

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=("plain", "fasta", "stockholm", "phylip"),
                      help="output format of multiple alignment")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("add",),
                      help="""method to use to build multiple alignment.""")

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameter stack for methods that require one.")

    parser.add_option("-a", "--alignment-method", dest="alignment_method", type="choice",
                      choices=("sw", "nw"),
                      help="alignment_method [%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        method=None,
        parameters="",
        gop=-10.0,
        gep=-1.0,
        alignment_method="sw",
    )

    (options, args) = E.Start(parser)

    options.parameters = options.parameters.split(",")

    iterator = FastaIterator.iterate(sys.stdin)

    if options.method == "add":

        mali = Mali.Mali()

        mali.readFromFile(
            open(options.parameters[0], "r"), format=options.input_format)
        del options.parameters[0]

        old_length = mali.getLength()

        new_mali = convertMali2Mali(mali)

        if options.alignment_method == "sw":
            alignator = alignlib_lite.py_makeAlignatorFullDP(
                options.gop, options.gep)
        else:
            alignator = alignlib_lite.py_makeAlignatorFullDPGlobal(
                options.gop, options.gep)

        while 1:
            # next() with a default also works when the generator is
            # exhausted, where iterator.next() would raise StopIteration
            cur_record = next(iterator, None)
            if cur_record is None:
                break

            map_mali2seq = alignlib_lite.py_makeAlignataVector()

            sequence = alignlib_lite.py_makeSequence(cur_record.sequence)
            profile = alignlib_lite.py_makeProfileFromMali(new_mali)

            if options.loglevel >= 4:
                options.stdlog.write(profile.Write())

            alignator.Align(profile, sequence, map_mali2seq)

            if options.loglevel >= 3:
                options.stdlog.write(map_mali2seq.Write())

            # add sequence to mali
            a = alignlib_lite.py_makeAlignatumFromString(cur_record.sequence)
            a.thisown = 0

            new_mali.addAlignatum(a, map_mali2seq, 1, 1, 1, 1, 1)

            id = cur_record.title
            mali.mIdentifiers.append(id)
            mali.mMali[id] = Mali.AlignedString(id, 0, len(
                cur_record.sequence), new_mali.getRow(new_mali.getWidth() - 1).getString())

        # substitute
        for x in range(old_length):
            mali.mMali[mali.mIdentifiers[x]].mString = new_mali.getRow(
                x).getString()

        mali.writeToFile(sys.stdout, format=options.output_format)

    E.Stop()