def main():
    args = get_args()
    conn = sqlite3.connect(args.db)
    c = conn.cursor()
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.read(args.config)
    organisms = get_names_from_config(config, args.group)
    excludes = get_names_from_config(config, 'Excludes')
    if excludes:
        organisms = [org for org in organisms if org not in excludes]
    args.output.write("org\tcontigs\tavg len\n")
    for org in organisms:
        # skip extended data, which are typically from genome-enabled orgs,
        # not capture data
        if not org.endswith('*'):
            # get the uce-matching node names from the db
            matching_nodes = get_matching_node_names(c, org)
            # parse the contig file for the organism, and return contig
            # lengths
            f = os.path.join(args.fasta, "{0}.contigs.fasta".format(org.replace('_', '-')))
            records = fasta.FastaReader(f)
            contig_lens = [len(seq) for seq in records 
                if '_'.join(seq.identifier.strip('>').split('_')[0:2]) in matching_nodes]
            # write the average contig length of contigs matching UCEs
            args.output.write("{0}\t{1}\t{2}\n".format(org, len(contig_lens), float(sum(contig_lens))/len(contig_lens)))
def main():
    args = get_args()
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.read(args.config)
    conn = sqlite3.connect(args.db)
    c = conn.cursor()
    if args.extend_db:
        query = "ATTACH DATABASE '{0}' AS extended".format(args.extend_db)
        c.execute(query)
    organisms = get_names_from_config(config, "Organisms")
    uces = get_names_from_config(config, "Loci")
    # pdb.set_trace()
    uce_fasta_out = fasta.FastaWriter(args.output)
    regex = re.compile("[N,n]{1,21}")
    for organism in organisms:
        print "Getting {0} reads...".format(organism)
        written = []
        # going to need to do something more generic w/ suffixes
        # pdb.set_trace()
        name = organism.replace("_", "-")
        if args.notstrict:
            if not organism.endswith("*"):
                reads = find_file(args.contigs, name)
                node_dict, missing = get_nodes_for_uces(c, organism, uces, extend=False, notstrict=True)
            elif args.extend_dir:
                # remove the asterisk
                name = name.rstrip("*")
                reads = find_file(args.extend_dir, name)
                node_dict, missing = get_nodes_for_uces(c, organism.rstrip("*"), uces, extend=True, notstrict=True)
        else:
            if not name.endswith("*"):
                reads = find_file(args.contigs, name)
                node_dict, missing = get_nodes_for_uces(c, organism, uces)
            elif name.endswith("*") and args.extend_dir:
                # remove the asterisk
                name = name.rstrip("*")
                reads = find_file(args.extend_dir, name)
                node_dict, missing = get_nodes_for_uces(c, organism.rstrip("*"), uces, extend=True)
        for read in fasta.FastaReader(reads):
            name = get_name(read.identifier).lower()
            coverage = get_coverage(read.identifier)
            if name in node_dict:
                uce_seq = fasta.FastaSequence()
                uce_seq.identifier = ">{0}_{1} |{0}|{2}".format(node_dict[name][0], organism.rstrip("*"), coverage)
                # deal with strandedness because aligners don't, which
                # is annoying
                if node_dict[name][1] == "-":
                    uce_seq.sequence = transform.DNA_reverse_complement(read.sequence)
                else:
                    uce_seq.sequence = read.sequence
                # remove runs of ambiguous bases (Ns) from the sequence;
                # these should gap out during alignment
                if regex.search(uce_seq.sequence):
                    uce_seq.sequence = re.sub(regex, "", uce_seq.sequence)
                    print "\tReplaced < 20 ambiguous bases in {0}".format(uce_seq.identifier.split(" ")[0])
                # Remove any leading/trailing lowercase bases from velvet
                # assemblies. Lowercase bases indicate low coverage, and these
                # have been problematic in downstream alignments.
                uce_seq.sequence = re.sub("^[acgtn]+", "", uce_seq.sequence)
                uce_seq.sequence = re.sub("[acgtn]+$", "", uce_seq.sequence)
                uce_fasta_out.write(uce_seq)
                written.append(str(node_dict[name][0]))
            else:
                pass
        # pdb.set_trace()
        if args.notstrict and missing:
            args.notstrict.write("[{0}]\n".format(organism))
            for name in missing:
                args.notstrict.write("{0}\n".format(name))
                written.append(name)
        assert set(written) == set(uces), "UCE names do not match"
        # assert set(written) == set(uces), pdb.set_trace()
    uce_fasta_out.close()
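# NOTE: get_name() and get_coverage() are assumed helpers (not shown) that
# parse velvet-style FASTA identifiers such as ">NODE_7_length_4563_cov_5.3".
# A hypothetical sketch under that assumption, mirroring the field-splitting
# used on contig identifiers in the first main() above:
def get_name(identifier):
    """Return the 'NODE_7' portion of a velvet-style identifier."""
    return "_".join(identifier.lstrip(">").split("_")[0:2])

def get_coverage(identifier):
    """Return the trailing 'cov_5.3' portion of a velvet-style identifier."""
    return "_".join(identifier.lstrip(">").split("_")[-2:])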
def main():
    args = get_args()
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.read(args.config)
    conn = sqlite3.connect(args.db)
    c = conn.cursor()
    if args.extend_db:
        query = "ATTACH DATABASE '{0}' AS extended".format(args.extend_db)
        c.execute(query)
    organisms = get_names_from_config(config, 'Organisms')
    uces = get_names_from_config(config, 'Loci')
    #pdb.set_trace()
    uce_fasta_out = fasta.FastaWriter(args.output)
    regex = re.compile("[N,n]{1,21}")
    for organism in organisms:
        print "Getting {0} reads...".format(organism)
        written = []
        # going to need to do something more generic w/ suffixes
        #pdb.set_trace()
        name = organism.replace('_', '-')
        if args.notstrict:
            if not organism.endswith('*'):
                reads = find_file(args.contigs, name)
                node_dict, missing = get_nodes_for_uces(c, organism, uces, extend=False, notstrict=True)
            elif args.extend_dir:
                # remove the asterisk
                name = name.rstrip('*')
                reads = find_file(args.extend_dir, name)
                node_dict, missing = get_nodes_for_uces(c, organism.rstrip('*'), uces, extend=True, notstrict=True)
        else:
            if not name.endswith('*'):
                reads = find_file(args.contigs, name)
                node_dict, missing = get_nodes_for_uces(c, organism, uces)
            elif name.endswith('*') and args.extend_dir:
                # remove the asterisk
                name = name.rstrip('*')
                reads = find_file(args.extend_dir, name)
                node_dict, missing = get_nodes_for_uces(c, organism.rstrip('*'), uces, extend=True)
        for read in fasta.FastaReader(reads):
            name = get_name(read.identifier).lower()
            coverage = get_coverage(read.identifier)
            if name in node_dict:
                uce_seq = fasta.FastaSequence()
                uce_seq.identifier = ">{0}_{1} |{0}|{2}".format(node_dict[name][0], organism, coverage)
                # deal with strandedness because aligners don't, which
                # is annoying
                if node_dict[name][1] == '-':
                    uce_seq.sequence = transform.DNA_reverse_complement(read.sequence)
                else:
                    uce_seq.sequence = read.sequence
                # remove runs of ambiguous bases (Ns); these should gap out
                # during alignment
                if regex.search(uce_seq.sequence):
                    uce_seq.sequence = re.sub(regex, "", uce_seq.sequence)
                    print "\tReplaced < 20 ambiguous bases in {0}".format(uce_seq.identifier.split(' ')[0])
                uce_fasta_out.write(uce_seq)
                written.append(str(node_dict[name][0]))
            else:
                pass
        #pdb.set_trace()
        if args.notstrict and missing:
            args.notstrict.write("[{0}]\n".format(organism))
            for name in missing:
                args.notstrict.write("{0}\n".format(name))
                written.append(name)
        assert set(written) == set(uces), "UCE names do not match"
        #assert set(written) == set(uces), pdb.set_trace()
    uce_fasta_out.close()
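# NOTE: find_file() is assumed (not shown).  A minimal sketch, assuming it
# globs the contig directory for exactly one FASTA file whose basename starts
# with the dash-formatted organism name:
import glob
import os

def find_file(contigs_dir, name):
    """Return the single contig file matching `name`; raise if 0 or >1 match."""
    matches = []
    for pattern in ("{0}*.fasta", "{0}*.fa"):
        matches.extend(glob.glob(os.path.join(contigs_dir, pattern.format(name))))
    if len(matches) != 1:
        raise ValueError("Expected one contig file for {0}, found {1}".format(name, len(matches)))
    return matches[0]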
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # parse the config file - allowing no values (e.g. no ":" in config file)
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.optionxform = str
    config.read(args.config)
    # connect to the database
    conn = sqlite3.connect(args.locus_db)
    c = conn.cursor()
    # attach to external database, if passed as option
    organisms = get_names_from_config(config, "Organisms")
    log.info(
        "There are {} taxa in the match-count-config file named {}".format(
            len(organisms), os.path.basename(args.config)
        )
    )
    exons = get_names_from_config(config, "Loci")
    log.info("There are {} exon loci in the matrix".format(len(exons)))
    regex = re.compile("[N,n]{1,21}")
    out_dir = "/".join(args.output.split("/")[:-1])
    temp_conf = os.path.join(out_dir, "config_extended")
    incomplete_outf = open(temp_conf, "w")
    with open(args.output, "w") as exon_fasta_out:
        for organism in organisms:
            text = "Getting exon loci for {0}".format(organism)
            log.info(text.center(65, "-"))
            written = []
            # going to need to do something more generic w/ suffixes
            name = organism.replace("_", "-")
            if not organism.endswith("*"):
                reads = find_file(args.contigs, name)
                node_dict, missing = get_nodes_for_exons(c, organism, exons, extend=False, notstrict=True)
            count = 0
            log.info("There are {} exon loci for {}".format(len(node_dict), organism))
            log.info("Parsing and renaming contigs for {}".format(organism))
            for seq in SeqIO.parse(open(reads, "rU"), "fasta"):
                name = get_contig_name(seq.id).lower()
                # print "name:", name
                # print node_dict.keys()

                if name in node_dict:
                    seq.id = "{0}_{1} |{0}".format(node_dict[name][0], organism.rstrip("*"))
                    seq.name = ""
                    seq.description = ""
                    # deal with strandedness because aligners sometimes don't,
                    # which is annoying
                    if node_dict[name][1] == "-":
                        seq.seq = seq.seq.reverse_complement()
                    # Remove runs of ambiguous bases (Ns) from the sequence;
                    # these should gap out during alignment. Also, strip
                    # leading/trailing lowercase bases from velvet assemblies:
                    # lowercase bases indicate low coverage, and these have
                    # been problematic in downstream alignments.
                    seq, count = replace_and_remove_bases(regex, seq, count)
                    exon_fasta_out.write(seq.format("fasta"))
                    # print "node_dict:", node_dict[name][0]
                    written.append(str(node_dict[name][0]))
                else:
                    pass
            if count > 0:
                log.info("Replaced <20 ambiguous bases (N) in {} contigs for {}".format(count, organism))
            if missing:
                log.info("Writing missing locus information to {}".format(temp_conf))
                incomplete_outf.write("[{0}]\n".format(organism))
                for name in missing:
                    incomplete_outf.write("{0}\n".format(name))
                    written.append(name)
            # print written
            # print exons
            assert set(written) == set(exons), "exon names do not match"
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main(args):
    #args = get_args()
    # setup logging
    #log, my_name = setup_logging(args)
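    # NOTE: `log` is used below but is never created in this variant (the
    # setup_logging() call is commented out), so a module-level logger is
    # assumed to exist, e.g. log = logging.getLogger(__name__).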
    # parse the config file - allowing no values (e.g. no ":" in config file)
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.optionxform = str
    config.read(args.config)
    # connect to the database
    conn = sqlite3.connect(args.locus_db)
    c = conn.cursor()
    # attach to external database, if passed as option
    organisms = get_names_from_config(config, 'Organisms')
    log.info("There are {} taxa in the match-count-config file named {}".format(
        len(organisms),
        os.path.basename(args.config)
    ))
    exons = get_names_from_config(config, 'Loci')

    dupefile = None
    dupe_config = ConfigParser.RawConfigParser(allow_no_value=True)
    dupe_config.optionxform = str
    if args.include_duplicates is not None:
        dupefile = args.include_duplicates
        dupe_config.read(dupefile)

    log.info("There are {} exon loci in the matrix".format(len(exons)))
    regex = re.compile("[N,n]{1,21}")
    out_dir = os.path.dirname(args.output)
    temp_conf = os.path.join(out_dir, 'config_extended')
    incomplete_outf = open(temp_conf, 'w')
    with open(args.output, 'w') as exon_fasta_out:
        for organism in organisms:
            organism_dupe_dict = {}
            organism_orientation_dict = {}
            if args.include_duplicates is not None:
                dupes = dupe_config.items('%s - contigs hitting multiple probes' % organism)
                for element in dupes:
                    organism_dupe_dict.setdefault(element[0], element[1])
                locus_orientation = dupe_config.items('%s - contig orientation' % organism)
                for element in locus_orientation:
                    organism_orientation_dict.setdefault(element[0], element[1])
            #print (organism_dupe_dict)
            text = "Getting exon loci for {0}".format(organism)
            log.info(text.center(65, "-"))
            written = []
            # going to need to do something more generic w/ suffixes
            name = organism.replace('_', '-')
            if not organism.endswith('*'):
                reads = find_file(args.contigs, name)
                node_dict, missing = get_nodes_for_exons(c, organism, exons, args, organism_dupe_dict, organism_orientation_dict, extend=False, notstrict=True)
            count = 0
            log.info("There are {} exon loci for {}".format(len(node_dict), organism))
            log.info("Parsing and renaming contigs for {}".format(organism))
            for seq in SeqIO.parse(open(reads, 'rU'), 'fasta'):
                name = get_contig_name(seq.id,args).lower()
                #print "name:", name
                #print node_dict.keys()
                if name in node_dict:
                    seq.id = "{0}_{1} |{0}".format(node_dict[name][0], organism.rstrip('*'))
                    seq.name = ''
                    seq.description = ''
                    # deal with strandedness because aligners sometimes don't,
                    # which is annoying
                    if node_dict[name][1] == '-':
                        seq.seq = seq.seq.reverse_complement()
                    # Remove runs of ambiguous bases (Ns) from the sequence;
                    # these should gap out during alignment. Also, strip
                    # leading/trailing lowercase bases from velvet assemblies:
                    # lowercase bases indicate low coverage, and these have
                    # been problematic in downstream alignments.
                    seq, count = replace_and_remove_bases(regex, seq, count)
                    exon_fasta_out.write(seq.format('fasta'))
                    #print "node_dict:", node_dict[name][0]
                    written.append(str(node_dict[name][0]))
                else:
                    pass
            if count > 0:
                log.info("Replaced <20 ambiguous bases (N) in {} contigs for {}".format(count, organism))
            if missing:
                log.info("Writing missing locus information to {}".format(temp_conf))
                incomplete_outf.write("[{0}]\n".format(organism))
                for name in missing:
                    incomplete_outf.write("{0}\n".format(name))
                    written.append(name)
            #print written
            #print exons
            # This test will result in an error if duplicates are included
            #assert set(written) == set(exons), "exon names do not match"
    text = " Completed! "
    log.info(text.center(65, "="))
def main():
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # parse the config file - allowing no values (e.g. no ":" in config file)
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.optionxform = str
    config.read(args.match_count_output)
    # connect to the database
    conn = sqlite3.connect(args.locus_db)
    c = conn.cursor()
    # attach to external database, if passed as option
    if args.extend_locus_db:
        log.info("Attaching extended database {}".format(os.path.basename(args.extend_locus_db)))
        query = "ATTACH DATABASE '{0}' AS extended".format(args.extend_locus_db)
        c.execute(query)
    organisms = get_names_from_config(config, 'Organisms')
    log.info("There are {} taxa in the match-count-config file named {}".format(
        len(organisms),
        os.path.basename(args.match_count_output)
    ))
    uces = get_names_from_config(config, 'Loci')
    if not args.incomplete_matrix:
        log.info("There are {} shared UCE loci in a COMPLETE matrix".format(len(uces)))
    else:
        log.info("There are {} UCE loci in an INCOMPLETE matrix".format(len(uces)))
    regex = re.compile("[N,n]{1,21}")
    if args.incomplete_matrix:
        incomplete_outf = open(args.incomplete_matrix, 'w')
    with open(args.output, 'w') as uce_fasta_out:
        for organism in organisms:
            text = "Getting UCE loci for {0}".format(organism)
            log.info(text.center(65, "-"))
            written = []
            # going to need to do something more generic w/ suffixes
            name = organism.replace('_', '-')
            if args.incomplete_matrix:
                if not organism.endswith('*'):
                    reads = find_file(args.contigs, name)
                    node_dict, missing = get_nodes_for_uces(c, organism, uces, extend=False, notstrict=True)
                elif args.extend_locus_contigs:
                    # remove the asterisk
                    name = name.rstrip('*')
                    reads = find_file(args.extend_locus_contigs, name)
                    node_dict, missing = get_nodes_for_uces(c, organism.rstrip('*'), uces, extend=True, notstrict=True)
            else:
                if not name.endswith('*'):
                    reads = find_file(args.contigs, name)
                    node_dict, missing = get_nodes_for_uces(c, organism, uces)
                elif name.endswith('*') and args.extend_locus_contigs:
                    # remove the asterisk
                    name = name.rstrip('*')
                    reads = find_file(args.extend_locus_contigs, name)
                    node_dict, missing = get_nodes_for_uces(c, organism.rstrip('*'), uces, extend=True)
            count = 0
            log.info("There are {} UCE loci for {}".format(len(node_dict), organism))
            log.info("Parsing and renaming contigs for {}".format(organism))
            for seq in SeqIO.parse(open(reads, 'rU'), 'fasta'):
                name = get_contig_name(seq.id).lower()
                if name in node_dict:
                    seq.id = "{0}_{1} |{0}".format(node_dict[name][0], organism.rstrip('*'))
                    seq.name = ''
                    seq.description = ''
                    # deal with strandedness because aligners sometimes don't,
                    # which is annoying
                    if node_dict[name][1] == '-':
                        seq.seq = seq.seq.reverse_complement()
                    # Remove runs of ambiguous bases (Ns) from the sequence;
                    # these should gap out during alignment. Also, strip
                    # leading/trailing lowercase bases from velvet assemblies:
                    # lowercase bases indicate low coverage, and these have
                    # been problematic in downstream alignments.
                    seq, count = replace_and_remove_bases(regex, seq, count)
                    uce_fasta_out.write(seq.format('fasta'))
                    written.append(str(node_dict[name][0]))
                else:
                    pass
            if count > 0:
                log.info("Replaced <20 ambiguous bases (N) in {} contigs for {}".format(count, organism))
            if args.incomplete_matrix and missing:
                log.info("Writing missing locus information to {}".format(args.incomplete_matrix))
                incomplete_outf.write("[{0}]\n".format(organism))
                for name in missing:
                    incomplete_outf.write("{0}\n".format(name))
                    written.append(name)
            assert set(written) == set(uces), "UCE names do not match"
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))