예제 #1
0
    def test_ids_from_fasta_lines(self):
        """ ids_from_fasta_lines should return ids"""

        fasta_lines = \
            [">hsa:8355  HIST1H3G; histone cluster 1, H3g ; K11253 histone H3",
             "atggcccgcaccaagcagactgcacgcaagtccaccggtggcaaagcgccgcgcaagcagctgg",
             "ccactaaggcggctcggaaaagcgcgccggccaccggcggcgtgaagaaacctcatcgctaccg",
             "tcccggcaccgtggctctgcgcgagattcgccgctatcagaagtcgactgagctgctgatccgc",
             "aagttgcctttccaacgcctggtgcgagaaatcgctcaggacttcaagacagatctgcgctttc",
             "agagttccgcggtgatggccctgcaggaggcctgcgaggcctacttggtggggctctttgagga",
             "taccaacctgtgtgccatccatgctaagcgagtgactatcatgcccaaggacattcagctcgct",
             "cgccgcattcgtggggagagagcgtag",
             ">hsa:9081  PRY; PTPN13-like, Y-linked",
             "atgggagccactgggcttggctttctactttcctggagacaagacaatttgaatggcact"]
        exp_ids = ["hsa:8355", "hsa:9081"]
        obs_ids = ids_from_fasta_lines(fasta_lines)

        self.assertEqual(obs_ids, exp_ids)
    def test_ids_from_fasta_lines(self):
        """ ids_from_fasta_lines should return ids"""

        fasta_lines = \
            [">hsa:8355  HIST1H3G; histone cluster 1, H3g ; K11253 histone H3",
             "atggcccgcaccaagcagactgcacgcaagtccaccggtggcaaagcgccgcgcaagcagctgg",
             "ccactaaggcggctcggaaaagcgcgccggccaccggcggcgtgaagaaacctcatcgctaccg",
             "tcccggcaccgtggctctgcgcgagattcgccgctatcagaagtcgactgagctgctgatccgc",
             "aagttgcctttccaacgcctggtgcgagaaatcgctcaggacttcaagacagatctgcgctttc",
             "agagttccgcggtgatggccctgcaggaggcctgcgaggcctacttggtggggctctttgagga",
             "taccaacctgtgtgccatccatgctaagcgagtgactatcatgcccaaggacattcagctcgct",
             "cgccgcattcgtggggagagagcgtag",
             ">hsa:9081  PRY; PTPN13-like, Y-linked",
             "atgggagccactgggcttggctttctactttcctggagacaagacaatttgaatggcact"]
        exp_ids = ["hsa:8355", "hsa:9081"]
        obs_ids = ids_from_fasta_lines(fasta_lines)

        self.assertEqual(obs_ids, exp_ids)
예제 #3
0
def main():
    option_parser, options, args = parse_command_line_parameters(**script_info)
    DEBUG = options.verbose
    check_options(option_parser, options)
    start_time = time()
    option_lines = format_options_as_lines(options)
    if DEBUG:
        print FORMAT_BAR
        print "Running with options:"
        for line in sorted(option_lines):
            print line
        print FORMAT_BAR

    # because the blast app controller uses absolute paths, make sure subject
    # db path is fully specified

    subject_db = options.subjectdb
    if not subject_db.startswith('/'):
        subject_db = join(getcwd(), subject_db)
    if not options.no_format_db:

        # initialize object
        inpath = FilePath(abspath(options.subjectdb))
        subject_dir, subj_file = split(inpath)

        fdb = FormatDb(WorkingDir=subject_dir)
        # Currently we do not support protein blasts, but
        # this would be easy to add in the future...
        fdb.Parameters['-p'].on('F')

        # Create indices for record lookup
        fdb.Parameters['-o'].on('T')

        # Set input database
        fdb.Parameters['-i'].on(subject_db)

        formatdb_cmd = fdb.BaseCommand

        if DEBUG:
            print "Formatting db with command: %s" % formatdb_cmd

        app_result = fdb(subject_db)
        formatdb_filepaths = []
        for v in app_result.values():
            try:
                formatdb_filepaths.append(v.name)
            except AttributeError:
                # not a file object, so no path to return
                pass

        db_format_time = time() - start_time

        if DEBUG:
            print "Formatting subject db took: %2.f seconds" % db_format_time
            print "formatdb log file written to: %s" % app_result['log']
            print FORMAT_BAR
    else:
        db_format_time = time() - start_time
        formatdb_cmd = "None (formatdb not called)"
        # Check that User-Supplied subjectdb is valid
        db_ext = [".nhr", ".nin", ".nsd", ".nsi", ".nsq"]
        formatdb_filepaths = [subject_db + ext for ext in db_ext]

        if DEBUG:
            print "Checking that pre-existing formatdb files exist and can be read."
            print "Files to be checked:"
            for fp in formatdb_filepaths:
                print fp
            print FORMAT_BAR

        try:
            formatdb_files = [open(db_f, "U") for db_f in formatdb_filepaths]
            [f.close() for f in formatdb_files]
        except IOError:
            if DEBUG:
                print "Cannot open user-supplied database file:", db_f
            option_parser.error(
                """Problem with -d and --no_format_db option combination: Cannot open the following user-supplied database file: %s. Consider running without --no_format_db to let formatdb generate these required files""" %
                db_f)

        if DEBUG:
            print "OK: BLAST Database files exist and can be read."
            print FORMAT_BAR

    # Perform BLAST search
    blast_results, hit_ids, removed_hit_ids = find_homologs(options.querydb,
                                                            subject_db, options.e_value, options.max_hits,
                                                            options.working_dir, options.blastmatroot, options.wordsize,
                                                            options.percent_aligned, DEBUG=DEBUG)

    blast_time = (time() - start_time) - db_format_time

    if DEBUG:
        print "BLAST search took: %2.f minute(s)" % (blast_time / 60.0)
        print FORMAT_BAR

    # Create output folder
    outputdir = options.outputdir
    try:
        makedirs(outputdir)
    except OSError:
        pass

    # Record raw blast results
    raw_blast_results_path = join(outputdir, "raw_blast_results.txt")
    f = open(raw_blast_results_path, 'w')
    f.writelines(blast_results)
    f.close()

    # Record excluded seqs
    excluded_seqs_path = join(outputdir, "matching.fna")
    ids_to_seq_file(hit_ids, options.querydb, excluded_seqs_path, "")

    # Record included (screened) seqs
    included_seqs_path = join(outputdir, "non-matching.fna")
    all_ids = ids_from_fasta_lines(open(options.querydb))
    included_ids = set(all_ids) - hit_ids
    ids_to_seq_file(included_ids, options.querydb, included_seqs_path, "")

    log_lines = compose_logfile_lines(start_time, db_format_time, blast_time,
                                      option_lines, formatdb_cmd,
                                      blast_results, options, all_ids,
                                      hit_ids, removed_hit_ids,
                                      included_ids, DEBUG)

    log_path = join(outputdir, "sequence_exclusion.log")
    if DEBUG:
        print "Writing summary to: %s" % log_path

    f = open(log_path, 'w')
    f.writelines(log_lines)
    f.close()

    if not options.no_clean:
        if DEBUG:

            print FORMAT_BAR
            print "|                           Cleanup                        |"
            print FORMAT_BAR

        if not options.no_format_db:
            if options.verbose:
                print "Cleaning up formatdb files:", formatdb_filepaths
            remove_files(formatdb_filepaths)
        else:
            if options.verbose:
                print "Formatdb not run...nothing to clean"
예제 #4
0
def main():
    option_parser, options, args = parse_command_line_parameters(**script_info)
    DEBUG = options.verbose
    check_options(option_parser, options)
    start_time = time()
    option_lines = format_options_as_lines(options)
    if DEBUG:
        print FORMAT_BAR
        print "Running with options:"
        for line in sorted(option_lines):
            print line
        print FORMAT_BAR

    #because the blast app controller uses absolute paths, make sure subject
    #db path is fully specified

    subject_db = options.subjectdb
    if not subject_db.startswith('/'):
        subject_db = join(getcwd(), subject_db)
    if not options.no_format_db:

        #initialize object
        inpath = FilePath(abspath(options.subjectdb))
        subject_dir, subj_file = split(inpath)

        fdb = FormatDb(WorkingDir=subject_dir)
        # Currently we do not support protein blasts, but
        # this would be easy to add in the future...
        fdb.Parameters['-p'].on('F')

        # Create indices for record lookup
        fdb.Parameters['-o'].on('T')

        # Set input database
        fdb.Parameters['-i'].on(subject_db)

        formatdb_cmd = fdb.BaseCommand

        if DEBUG:
            print "Formatting db with command: %s" % formatdb_cmd

        app_result = fdb(subject_db)
        formatdb_filepaths = []
        for v in app_result.values():
            try:
                formatdb_filepaths.append(v.name)
            except AttributeError:
                # not a file object, so no path to return
                pass

        db_format_time = time() - start_time

        if DEBUG:
            print "Formatting subject db took: %2.f seconds" % db_format_time
            print "formatdb log file written to: %s" % app_result['log']
            print FORMAT_BAR
    else:
        db_format_time = time() - start_time
        formatdb_cmd = "None (formatdb not called)"
        # Check that User-Supplied subjectdb is valid
        db_ext = [".nhr", ".nin", ".nsd", ".nsi", ".nsq"]
        formatdb_filepaths = [subject_db + ext for ext in db_ext]

        if DEBUG:
            print "Checking that pre-existing formatdb files exist and can be read."
            print "Files to be checked:"
            for fp in formatdb_filepaths:
                print fp
            print FORMAT_BAR

        try:
            formatdb_files = [open(db_f, "U") for db_f in formatdb_filepaths]
            [f.close() for f in formatdb_files]
        except IOError:
            if DEBUG:
                print "Cannot open user-supplied database file:", db_f
            option_parser.error(
                """Problem with -d and --no_format_db option combination: Cannot open the following user-supplied database file: %s. Consider running without --no_format_db to let formatdb generate these required files"""
                % db_f)

        if DEBUG:
            print "OK: BLAST Database files exist and can be read."
            print FORMAT_BAR

    # Perform BLAST search
    blast_results,hit_ids, removed_hit_ids = find_homologs(options.querydb,\
        subject_db, options.e_value,options.max_hits,\
        options.working_dir,options.blastmatroot, options.wordsize,\
                            options.percent_aligned, DEBUG=DEBUG)

    blast_time = (time() - start_time) - db_format_time

    if DEBUG:
        print "BLAST search took: %2.f minute(s)" % (blast_time / 60.0)
        print FORMAT_BAR

    #Create output folder
    outputdir = options.outputdir
    try:
        makedirs(outputdir)
    except OSError:
        pass

    #Record raw blast results
    raw_blast_results_path = join(outputdir, "raw_blast_results.txt")
    f = open(raw_blast_results_path, 'w')
    f.writelines(blast_results)
    f.close()

    #Record excluded seqs
    excluded_seqs_path = join(outputdir, "matching.fna")
    ids_to_seq_file(hit_ids, options.querydb, excluded_seqs_path, "")

    #Record included (screened) seqs
    included_seqs_path = join(outputdir, "non-matching.fna")
    all_ids = ids_from_fasta_lines(open(options.querydb))
    included_ids = set(all_ids) - hit_ids
    ids_to_seq_file(included_ids, options.querydb, included_seqs_path, "")

    log_lines = compose_logfile_lines(start_time, db_format_time, blast_time,\
                                                   option_lines,formatdb_cmd,\
                                               blast_results,options,all_ids,\
                                                     hit_ids,removed_hit_ids,\
                                                          included_ids,DEBUG)

    log_path = join(outputdir, "sequence_exclusion.log")
    if DEBUG:
        print "Writing summary to: %s" % log_path

    f = open(log_path, 'w')
    f.writelines(log_lines)
    f.close()

    if not options.no_clean:
        if DEBUG:

            print FORMAT_BAR
            print "|                           Cleanup                        |"
            print FORMAT_BAR

        if not options.no_format_db:
            if options.verbose:
                print "Cleaning up formatdb files:", formatdb_filepaths
            remove_files(formatdb_filepaths)
        else:
            if options.verbose:
                print "Formatdb not run...nothing to clean"