Пример #1
0
    def setUp(self):
        """Build a fresh ScrollPy object backed by a scratch directory.

        Creates ``<data_dir>/ss-tmp`` if it is missing, loads the config
        file, seeds the ``ARGS`` config section with the values set below,
        and stores a ScrollPy instance for the test input file on
        ``self.sp``.
        """
        self.tmpdir = os.path.join(data_dir, 'ss-tmp')
        # exist_ok tolerates a directory left over from an earlier run, but
        # still raises if the path is occupied by something that is not a
        # directory instead of silently carrying on.
        os.makedirs(self.tmpdir, exist_ok=True)

        # Populate ARGS values of config file
        load_config_file()
        try:
            config.add_section('ARGS')
        except DuplicateSectionError:
            pass  # Section is already present; reuse it
        # Now provide sufficient arg defaults
        config['ARGS']['filter'] = 'False'
        config['ARGS']['filter_method'] = 'zscore'
        config['ARGS']['dist_matrix'] = 'LG'
        config['ARGS']['no_clobber'] = 'True'

        # CHANGE ME TO CHANGE TEST
        #######################################
        self.infile = 'Hsap_AP1G_FourSeqs.fa' #
        #######################################

        # Everything before the first '.' in the file name
        self.infile_base = self.infile.split('.')[0]
        self.inpath = os.path.join(data_dir, self.infile)

        self.sp = ScrollPy(
                self.tmpdir, # target_dir
                'Mafft', # align_method
                'RAxML', # dist_method
                (self.inpath,),  # one-element tuple of input paths
                )
Пример #2
0
 def test_filter_less(self):
     """Check that _filter returns a shortened sequence list.

     With the 'number' option set to 2, the list nested in the first
     entry of the result must hold exactly two items.
     """
     # Mock user input; the section must exist before it can be written to
     if not config.has_section('ARGS'):
         config.add_section('ARGS')
     config['ARGS']['number'] = '2'
     # Result is nested -> [(x, [])]; inspect the inner list of entry 0
     filtered = self.writer._filter()
     first_entry_seqs = filtered[0][1]
     self.assertEqual(len(first_entry_seqs), 2)
Пример #3
0
 def test_filter_more(self):
     """Check that _filter copes with N above the available count.

     With the 'number' option set to 6, the list nested in the first
     entry of the result is expected to hold four items.
     """
     # Mock user input; the section must exist before it can be written to
     if not config.has_section('ARGS'):
         config.add_section('ARGS')
     config['ARGS']['number'] = '6'
     # Result is nested -> [(x, [])]; inspect the inner list of entry 0
     filtered = self.writer._filter()
     first_entry_seqs = filtered[0][1]
     self.assertEqual(len(first_entry_seqs), 4)
Пример #4
0
 def test_get_filepath(self):
     """Check the output path the writer builds for a group.

     Sets the ARGS options used for this check (no_clobber, filesep,
     suffix, seqfmt) and expects ``_get_filepath("group")`` to return
     ``<tmpdir>/group_sequences_awesome.fa``.
     """
     # Mock user input
     try:
         config.add_section('ARGS')
     except DuplicateSectionError:
         pass  # Section is already present; reuse it
     # The key is spelled with an underscore like the other multi-word
     # ARGS keys ('filter_method', 'dist_matrix'). A hyphenated
     # 'no-clobber' would create a separate option and leave any existing
     # 'no_clobber' value untouched.
     config.set('ARGS', 'no_clobber', 'False')
     config.set('ARGS', 'filesep', '_')
     config.set('ARGS', 'suffix', 'awesome')
     config.set('ARGS', 'seqfmt', 'fasta')
     # Call and test
     outpath = self.writer._get_filepath("group")
     expected = os.path.join(self.tmpdir, 'group_sequences_awesome.fa')
     self.assertEqual(outpath, expected)
Пример #5
0
 def setUp(self):
     """Create the ScrollPy and SeqWriter objects used by the tests.

     Creates ``<data_dir>/out-seq`` if it is missing, loads the config
     file, seeds the ``ARGS`` config section with the values set below,
     builds and runs a ScrollPy object on the test input file, and
     stores a SeqWriter for it on ``self.writer``.
     """
     # Make dir. An already-existing directory is the normal case on a
     # re-run, so it is accepted quietly rather than reported as a
     # failure; a non-directory at that path still raises.
     self.tmpdir = os.path.join(data_dir, 'out-seq')
     os.makedirs(self.tmpdir, exist_ok=True)
     # Populate ARGS values of config file
     load_config_file()
     try:
         config.add_section('ARGS')
     except DuplicateSectionError:
         pass  # Section is already present; reuse it
     # Now provide sufficient arg defaults
     config['ARGS']['filter'] = 'False'
     config['ARGS']['filter_method'] = 'zscore'
     config['ARGS']['dist_matrix'] = 'LG'
     config['ARGS']['no_clobber'] = 'True'
     # Make ScrollPy object
     # CHANGE ME TO CHANGE TEST
     #######################################
     self.infile = 'Hsap_AP1G_FourSeqs.fa'  #
     #######################################
     # Everything before the first '.' in the file name
     self.infile_base = self.infile.split('.')[0]
     self.inpath = os.path.join(data_dir, self.infile)
     self.sp = ScrollPy(
         self.tmpdir,  # target dir
         'Mafft',  # align_method
         'RAxML',  # dist_method
         (self.inpath, ),  # one-element tuple of input paths
     )
     self.sp()  # Run internal methods
     # Make SeqWriter object
     self.writer = output.SeqWriter(
         self.sp,  # object
         self.tmpdir,  # file_path
     )
Пример #6
0
    def setUp(self):
        """Build a fresh ScrollCollection object for each test.

        Loads the config file, seeds the ``ARGS`` config section with the
        values set below, wraps the records read from the test input file
        in ScrollSeq objects (``self.seq_list``), makes sure
        ``<data_dir>/tmp`` exists, and stores the resulting
        ScrollCollection on ``self.collection``.
        """
        # Populate ARGS values of config file
        load_config_file()
        try:
            config.add_section('ARGS')
        except DuplicateSectionError:
            pass  # Section is already present; reuse it
        # Now provide sufficient arg defaults
        config['ARGS']['filter'] = 'False'
        config['ARGS']['filter_method'] = 'zscore'
        config['ARGS']['dist_matrix'] = 'LG'
        config['ARGS']['no_clobber'] = 'True'

        ids = (1, 2, 3, 4)
        infile = os.path.join(data_dir, 'Hsap_AP1G_FourSeqs.fa')
        records = sf._get_sequences(infile)
        # zip pairs each record with an id and stops at the shorter of the
        # two, so at most len(ids) ScrollSeq objects are created.
        self.seq_list = [
            ScrollSeq(
                id_num,  # ID
                infile,  # infile
                id_num,  # Group; not important here
                SeqRecord=seq_record)
            for id_num, seq_record in zip(ids, records)
        ]
        self.tmpdir = os.path.join(data_dir, 'tmp')
        # exist_ok tolerates a tmpdir still present from an earlier run, but
        # still raises if the path is occupied by a non-directory.
        os.makedirs(self.tmpdir, exist_ok=True)
        self.collection = ScrollCollection(
            self.tmpdir,  # outdir
            self.seq_list,  # sequence list
            'one',  # group
            'Mafft',  # align_method
            'RAxML',  # dist_method
        )
Пример #7
0
def main():
    ##############################################################################
    # BEGIN TIMING
    ##############################################################################

    main_start = datetime.datetime.now()

    ##############################################################################
    # COMMAND LINE ARGUMENT
    ##############################################################################

    parser = argparse.ArgumentParser(
        description=_formatted_desc,
        formatter_class=argparse.HelpFormatter,  # TO-DO
        #add_help = False, # is this what we want?
    )
    # Options for files
    file_options = parser.add_argument_group("File Options")
    file_options.add_argument(
        "-i",
        "--infiles",
        nargs='*',
        metavar="Infile Paths",
        help=("Path(s) to one or more input files. Can be blank. "
              "If using the '--tree-file' option, the headers in"
              "each file will be concatenated and assumed to represent"
              "all of the branch tips in the tree provided."))
    file_options.add_argument(
        "--infmt",
        nargs='?',
        metavar="Infile Type",
        default="fasta",
        choices=["fasta", "fasta-2line", "genbank"],  # Work on this
        help=("Format of ALL specified infiles (all must be the same), "
              "default to 'fasta'."))
    file_options.add_argument("-q",
                              "--seqtype",
                              nargs='?',
                              metavar="Sequence Type",
                              default="prot",
                              choices=["nucl", "prot"],
                              help=("Alphabet of ALL specified infiles."))
    file_options.add_argument(
        "-t",
        "--treefile",
        nargs='?',
        metavar="Treefile Path",
        help=("Path to a file containing a tree object (optional). "
              "If provided, analysis defaults to reading branch lengths"
              "to calculate patristic distances."))
    file_options.add_argument(
        "--treefmt",
        nargs='?',
        metavar="Tree Format",
        default="newick",
        choices=["nexus", "newick", "phylip", "nexml"],
        help=("Format of tree file, if supplied. Defaults to 'newick'."))
    file_options.add_argument(
        "-o",
        "--out",
        nargs='?',
        metavar="Target Output Directory",
        default=None,
        help=("Target directory for output files. If the target directory "
              "does not exist, it is created unless the '--no-create' flag "
              "is set. If target directory is not specified, or directory "
              "creation fails, defaults to the current directory."))
    file_options.add_argument(
        "--tmpout",
        nargs='?',
        metavar="Keep Temporary Output",
        default=None,
        help=(
            "Target directory for intermediate run files. If specified "
            "and does not exist, it is created unless the '--no-create' "
            "flag is set. If creation fails, tries to create /tmp/ in the "
            "current directory instead. If not specified, or if all creation "
            "attempts fail, a temporary directory is used and removed "
            "following execution."))
    file_options.add_argument(
        "--suffix",
        nargs='?',
        metavar="Output File Suffix",
        default='',
        help=("Optional argument to specify a common suffix to be added to "
              "all output files. This will be added to the default names "
              "in the form <name><sep><suffix><extension>, and the <sep> "
              "argument specified by '--filesep'."))
    file_options.add_argument(
        "--seqout",
        action="store_true",
        help=("Write output files containing optimal sequences. One file "
              "is written for each group, with up to '--numseqs' sequences "
              "in each group."))
    file_options.add_argument(
        "--seqfmt",
        nargs='?',
        metavar="Output Sequence Format",
        choices=["fasta", "fasta-2line", "genbank"],  # Work on this
        default="fasta",
        help=("Format to write sequences to. If not specified, or if an "
              "error is encountered during writing, defaults to 'fasta'."))
    file_options.add_argument(
        "--tblfmt",
        nargs='?',
        metavar="Output Table Format",
        choices=["csv", "space-delim", "tab-delim", "sep"],
        default="csv",
        help=("Format to write summary information to. If 'sep' is specfified "
              "writes using character '--tblsep' as delimiter. If no sep is "
              "specified, or this option is not set, defaults to csv (',')."))
    file_options.add_argument(
        "-m",
        "--mapping",
        nargs='?',
        metavar="Optional Group Mapping",
        help=(
            "Path to a file containing a mapping of sequence headers/ tip "
            "labels to a pre-specified group. Format <id><tab><group>. If "
            "not all headers/labels are present in the file, the remaining "
            "items are gathered into their own group. Note: header/labels "
            "present must match those of the tree/sequence file(s) EXACTLY."))
    file_options.add_argument(
        "--filesep",
        nargs='?',
        metavar="Filepath Separator",
        default='_',
        help=("Allows to set the delimiter in output filepaths."))
    file_options.add_argument(
        "--tblsep",
        nargs='?',
        metavar="Table Column Separator",
        default=',',
        help=("Allows to set the delimiter for output table columns."))
    file_options.add_argument(
        "-x",
        "--no-create",
        action="store_true",
        help=("Prevents creation of output directory(ies). If an output "
              "directory is specified that does not exist and this option "
              "is set, ScrollPy will quit with an error message."))
    file_options.add_argument(
        "-y",
        "--no-clobber",
        action="store_true",
        help=("Prevents overwrite of target files with the same path. If "
              "this flag is set and a log file is encountered, the log "
              "file will instead write to a new file with an additional "
              "suffix. If any other kind of file is encountered, ScrollPy "
              "assumes it is correct and tries to use it as normal."))
    # Options for Running
    run_options = parser.add_argument_group("Run Options")
    run_options.add_argument(
        "-a",
        "--align",
        nargs='?',
        choices=["Muscle", "Mafft", "Generic"],  # TO-DO
        default="Mafft",
        metavar="Alignment Method",
        help=("Method to use for aligning sequences. Any other option than "
              "'Generic' requires that the corresponding option in the config "
              "file is set properly."))
    run_options.add_argument(
        "--align-matrix",
        nargs='?',
        choices=["Blosum62", "Pam250"],  # TO-DO
        metavar="Alignment Matrix",
        help=("Specify a matrix to use for alignment, e.g. 'Blosum62'. If "
              "the called program does not accept the matrix, the default "
              "will be used instead. Default is to use the default specified "
              "by each program, or 'Blosum62' when using 'Generic'."))
    run_options.add_argument(
        "-d",
        "--distance",
        nargs='?',
        choices=["PhyML", "RAxML", "Generic"],  # TO-DO
        default="RAxML",
        metavar="Distance Method",
        help=(
            "Method to use for calculating distances. Any other options than "
            "'Generic' requires that the corresponding option in the config "
            "file is set properly."))
    run_options.add_argument(
        "--dist-matrix",
        nargs='?',
        choices=["WAG", "LG", "JC"],  # TO-DO
        default="LG",
        metavar="Distance Matrix",
        help=("Specify a substitution matrix for distance calculations. If "
              "the called program does not accept the matrix, the default "
              "will be used instead. Default is to use the default specified "
              "by each program, or 'LG'/'GTR' when using 'Generic'."))
    run_options.add_argument(
        "-f",
        "--filter",
        action="store_true",
        help=("Specify pre-filtering of sequences prior to running ScrollPy. "
              "Filtering is performed according to '--filter-method' and "
              "filtered sequences can be output into a separate file for "
              "user inspection using the '--filter-out' option. If both the "
              "'--treefile' and '--infiles' options are set AND all labels "
              "present in '--treefile' are present in '--infiles', ScrollPy "
              "will also perform filtering of tree tip labels."))
    run_options.add_argument(
        "--filter-method",  # TO-DO
        nargs='?',
        choices=["zscore", "mad"],
        default="zscore",
        metavar="Filtering Method",
        help=("HELP TEXT FOR FILTERING METHOD"))
    run_options.add_argument(
        "--filter-out",
        action="store_true",
        help=("Outputs filtered sequences in the same format as specified for "
              "'--seqout'. Filtered sequences will be organized by group."))
    run_options.add_argument(
        "-s",
        "--split-seqs",
        action="store_true",
        help=("If a single file is used as input, and '--mapping' is not "
              "specified, attempt to split input sequence file into an "
              "optimal number of sub-groups based on pariwise similarity "
              "and given that each group must have at least two sequences."))
    run_options.add_argument(
        "--split-method",  # TO-DO
        nargs='?',
        choices=["one", "two"],
        default="one",
        metavar="Splitting Method",
        help=("HELP TEXT FOR SPLITTING METHOD"))
    run_options.add_argument(
        "-n",
        "--number",
        nargs='?',
        type=int,
        default=3,
        metavar="Number of Sequences",
        help=("If '--seqout' is specified, at most '--number' of sequences "
              "from each group will be written to outfiles. If the value "
              "specified is less than or equal to zero, the default is used "
              "instead (Default is 3)."))
    run_options.add_argument(
        "--use-config",
        action="store_true",
        help=("This option allows the user finer control over external "
              "program runs by specifying the values of additional parameters "
              "to use in the config file. Values must be specified as they "
              "would appear on the command line under their own header "
              "within the config file. See docs for more information."))
    # Options for Logging
    log_options = parser.add_argument_group("Log Options")
    log_options.add_argument(
        "-l",
        "--logfile",
        nargs='?',
        metavar="Logfile Name",
        help=("This option allows the user to specify a name for the logged "
              "output. If a full path is specified, ScrollPy attempts to "
              "place the file as specified; if ScrollPy cannot, or if only "
              "a name is specified, '--out' is used instead."))
    log_options.add_argument("--no-log",
                             action="store_true",
                             help=("This option turns off logging."))
    log_options.add_argument(
        "--log-level",
        nargs='?',
        type=int,
        choices=[1, 2, 3],
        default=3,
        help=(
            "This option contols how much detail is contained in the logfile. "
            "Setting this option to '1' logs only the bare minimum, '2' logs "
            "more information, whereas '3' also logs output from external "
            "program calls (default)."))
    log_options.add_argument(
        "--no-summ",
        action="store_true",
        help=("This option turns off the automatic summary file generated on "
              "each run. Not recommended."))
    # Options for displaying information
    info_options = parser.add_argument_group("User Information Options")
    info_options.add_argument(
        "-v",
        "--verbosity",
        nargs='?',
        type=int,
        choices=[1, 2, 3],
        default=2,
        help=("This option controls how much detail is output to the terminal "
              "during output run. '1' results in a very quiet run, while '2' "
              "(default) displays more information about the run and '3' "
              "displays output from external program calls."))
    info_options.add_argument("--version",
                              action="store_true",
                              help="Display version information and quit.")
    info_options.add_argument(
        "--usage",
        action="store_true",
        help="Display some common usage examples and quit.")
    info_options.add_argument("--citation",
                              action="store_true",
                              help="Display citation and quit.")
    # Parse all arguments
    args = parser.parse_args()

    #############################################################################
    # CONFIGURE LOGGING
    #############################################################################

    # Add logging preferences to global config
    config.add_section("ARGS")
    config.set("ARGS", 'log_level', str(args.log_level))
    config.set("ARGS", 'verbosity', str(args.verbosity))

    # Set up loggers
    #name = __name__
    name = 'scrollpy'  # can't use __name__ since it becomes __main__
    #    print("Name of main module is: {}".format(name))
    out = args.out if args.out else current_dir
    logfile_path = scroll_log.get_logfile(
        #args.no_log,      # Whether to log to file
        True,  # JUST FOR TESTING
        args.logfile,  # Logfile name/path
        out,  # Output directory
        args.no_create,  # Directory creation
        args.no_clobber,  # Replace existing file
        args.filesep,  # Separator for files
    )
    # Get loggers and configure each; default level is 'INFO'
    # Configure console handler
    console_handler = logging.StreamHandler(stream=sys.stderr)
    console_handler.setFormatter(scroll_log.raw_format)
    console_handler.addFilter(scroll_log.ConsoleFilter(args.verbosity))
    # Create console logger and add handler to it
    console_logger = scroll_log.get_console_logger(name)
    console_logger.addHandler(console_handler)

    # Configure file handler
    file_handler = logging.FileHandler(filename=logfile_path)  # mode='a'
    file_handler.setFormatter(scroll_log.rich_format)
    file_handler.addFilter(
        scroll_log.FileFilter(
            args.log_level,
            args.no_log,  # If set, no output will be logged
        ))
    # Create file logger and add handler to it
    file_logger = scroll_log.get_file_logger(name)
    file_logger.addHandler(file_handler)

    # Configure output handler
    output_handler = logging.FileHandler(filename=logfile_path)  # Same as file
    output_handler.setFormatter(scroll_log.raw_format)
    output_handler.addFilter(
        scroll_log.OutputFilter(
            args.log_level,
            args.no_log,  # If set, no output will be logged
        ))
    # Create output logger and add handler to it
    output_logger = scroll_log.get_output_logger(name)
    output_logger.addHandler(output_handler)

    # Simple message regarding starting time to the user
    #    scroll_log.log_message(
    #            scroll_log.BraceMessage("Initialized at {} \n", main_start),  # msg
    #            2,  # verbosity level of message
    #            'INFO',  # level
    #            console_logger, file_logger  # loggers
    #            )
    #

    #############################################################################
    # SIMPLE USE CASES
    #############################################################################

    # Check to see if any of 'citation'/'usage'/'version' present
    if args.version:
        print(_version)  # Move to logging!
        sys.exit(0)
    if args.citation:
        print(_citation)  # Move to logging!
        sys.exit(0)
    if args.usage:
        print(_usage)
        sys.exit(0)

    ##############################################################################
    # PARAMETER VALIDATION
    ##############################################################################

    # Check the filepaths for appropriateness
    all_paths = []
    if args.infiles:  # Nonetype if not called at all
        if len(args.infiles) > 0:  # list; zero-length if none specified
            for path in args.infiles:
                # os.path ensures correct full path
                real_path = os.path.realpath(os.path.join(current_dir, path))
                all_paths.append(real_path)
    if args.treefile:  # Nonetype if not called at all
        real_path = os.path.realpath(os.path.join(current_dir, path))
        all_paths.append(real_path)  # Only one file
    # Quit if no paths specified
    if len(all_paths) == 0:  # No input files!
        print("No input files detected; please try again"
              )  # Print something useful!
        sys.exit(0)
    # Check for duplicates and quit if any exist
    duplicates = util.check_duplicate_paths(*all_paths)
    if len(duplicates) > 0:
        for path in duplicates:
            print(
                "Duplicate path {} detected in input".format(path))  # Logging!
        sys.exit(0)
    # Check to make sure all paths are good!
    non_existent = util.check_input_paths(*all_paths)
    if len(non_existent) > 0:
        for path in non_existent:
            print("Apparent non-existent file {}".format(path))  # Logging!
        sys.exit(0)

    # Check whether the output directory exists; if not, try to make it
    if not args.out:
        args.out = current_dir
    else:
        if not args.no_create:
            try:
                util.ensure_dir_exists(args.out)
            except OSError:
                # Make a note in the log file(s)!
                args.out = current_dir
        else:
            args.out = current_dir

    # Check whether temporary output is specified
    if args.tmpout:  # None otherwise
        if not args.no_create:
            try:
                util.ensure_dir_exists(args.tmpout)
            except OSError:
                # Logging!
                new_tmp = os.path.join(current_dir, '/tmp/')
                try:
                    util.ensure_dir_exists(new_tmp)
                except OSError:
                    args.tmpout = None  # Fall back to tmp dir

    # Need to check all other parameters here...

    ##############################################################################
    # POPULATE GLOBAL CONFIG
    ##############################################################################

    # ADD PARAMS TO CONFIGS IF NECESSARY!!!
    #config.add_section("ARGS")
    vargs = vars(args)  # make dict-like for iter
    for arg, val in vargs.items():
        if arg not in ('infiles', 'treefile'):
            sarg = str(arg)  # ConfigParser demands strings
            sval = str(val)  # ConfigParser demands strings
            config.set("ARGS", sarg, sval)  # Assign to config dictionary!

    # Load from config file
    # Call this later so that we can configure logging first!
    load_config_file()
    #    print("Config Arguments")
    #    for key in config["ARGS"]:
    #        print("{} : {}".format(key, config["ARGS"][key]))
    #    print("Config Alignment")
    #    for key in config["ALIGNMENT"]:
    #        print("{} : {}".format(key, config["ALIGNMENT"][key]))
    #    print("Config Distance")
    #    for key in config["DISTANCE"]:
    #        print("{} : {}".format(key, config["DISTANCE"][key]))

    ##############################################################################
    # ACTUAL PROGRAM EXECUTION
    ##############################################################################

    # Actual program execution
    if not args.treefile:  # Sequence-based analysis
        RunObj = ScrollPy(
            args.tmpout,  # Actual program run uses tmp dir!
            args.align,
            args.distance,
            args.infiles)
        # Run Scrollsaw itself
        RunObj()
        # Write to outfile(s); config handles gritty details
        # Write table file no matter what
        Writer = TableWriter(
            RunObj,  # object to use
            args.out,  # specified output location
        )
        #try:
        Writer.write()
        #except:  # Dangerous; Change!!!
        #    print("Unexpected error when writing table file")
        # Write sequences, if requested
        if args.seqout or args.filter_out:  # User requested sequences
            Writer = SeqWriter(
                RunObj,  # object to use
                args.out,  # specified output location
                args.seqout,  # Output sequences
                args.filter_out,  # Removed/filtered sequences
            )
            #try:
            Writer.write()