Пример #1
0
def main(argv):
    """Drive one MetaPathways run from the command line.

    Parses the options with the module-level ``parser``, resolves the
    configuration and parameter files, builds the input-file -> sample
    output-directory mapping, runs ``staticDiagnose``, prepares one
    ``SampleData`` per input and hands them to ``run_metapathways``.  When
    the ``metapaths_steps/BLAST_REFDB`` parameter equals ``'grid'`` the
    samples are additionally passed to ``blast_in_grid``.

    Args:
        argv: command-line arguments.  They are only echoed in the COMMAND
            log line; option parsing calls ``parser.parse_args()`` without
            arguments.

    The function does not return a value; it finishes through
    ``halt_process(4)`` and leaves early through ``sys.exit`` or
    ``exit_process`` on the error paths below.
    """
    global parser
    (opts, args) = parser.parse_args()
    # NOTE(review): usage is printed when valid_arguments() is truthy, so the
    # helper presumably reports *missing/invalid* arguments -- confirm.
    if valid_arguments(opts, args):
        print(usage)
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)
    signal.signal(signal.SIGTERM, sigint_handler)

    eprintf("COMMAND : %s\n", sys.argv[0] + ' ' + ' '.join(argv))

    # The input may be a single file or a directory of samples.
    input_fp = opts.input_fp
    output_dir = path.abspath(opts.output_dir)
    verbose = opts.verbose
    sample_subset = opts.sample_subset
    run_type = opts.run_type.strip()

    # Fall back to the configuration file that sits next to the script.
    if opts.config_file:
        config_file = opts.config_file
    else:
        config_file = cmd_folder + PATHDELIM + metapaths_config

    # Both NCBI files are needed together; otherwise neither is used.
    ncbi_sequin_params = None
    ncbi_sequin_sbt = None
    if opts.ncbi_header and opts.ncbi_sbt:
        if not path.exists(opts.ncbi_header):
            print("Could not open or missing NCBI header file " + opts.ncbi_header)
            print("Either disable option to CREATE_SEQUIN_FILE or provide a valid header file")
            sys.exit(0)

        if not path.exists(opts.ncbi_sbt):
            print("""You must must have a sbt file obtained from the NCBI \"Create Submission Template\" form \n 
                 http://www.ncbi.nlm.nih.gov/WebSub/template.cgi """ + opts.ncbi_sbt)
            sys.exit(0)

        ncbi_sequin_params = path.abspath(opts.ncbi_header)
        ncbi_sequin_sbt = path.abspath(opts.ncbi_sbt)

    # Reading the option cannot raise IOError, so the former try/except
    # around this assignment was dead code; the file itself is only touched
    # by parse_metapaths_parameters() below.
    parameter_f = opts.parameter_fp

    try:
        if run_type in ['overlay', 'safe'] and not path.exists(output_dir):
            makedirs(output_dir)
    except OSError:
        print("")
        print("ERROR: Cannot create output directory \"" + output_dir + "\"\n" +
              "       Perhaps directory \"" + output_dir + "\" already exists.\n" +
              "       Please choose a different directory, or \n" +
              "       run with the option \"-r  overwrite\" to force overwrite it.")
        sys.exit(1)

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    command_line_params = {'verbose': opts.verbose}

    params = parse_metapaths_parameters(parameter_f)
    # Named input_format so the builtin format() is not shadowed.
    input_format = params['INPUT']['format']

    globalerrorlogger = WorkflowLogger(
        generate_log_fp(output_dir, basefile_name='global_errors_warnings'),
        open_mode='w')

    # Map every input file to its sample output directory.
    # TODO: Check for illumina paired data... this complicates things a little.
    input_output_list = {}
    if path.isfile(input_fp):
        # TODO: Check for illumina pattern, if so check for pairs
        input_output_list = create_an_input_output_pair(
            input_fp, output_dir, input_format,
            globalerrorlogger=globalerrorlogger)
    elif path.exists(input_fp):
        input_output_list = create_input_output_pairs(
            input_fp, output_dir, input_format,
            globalerrorlogger=globalerrorlogger)
    else:
        eprintf("ERROR\tNo valid input sample file or directory containing samples exists .!")
        eprintf("ERROR\tAs provided as arguments in the -in option.!\n")
        exit_process("ERROR\tAs provided as arguments in the -in option.!\n")

    # An empty subset means that every sample is processed.
    if sample_subset:
        remove_unspecified_samples(input_output_list, sample_subset,
                                   input_format,
                                   globalerrorlogger=globalerrorlogger)

    sorted_input_output_list = sorted(input_output_list.keys())

    config_settings = read_pipeline_configuration(config_file, globalerrorlogger)

    # NOTE(review): the instance is never used here; the call is kept in
    # case constructing Parameters has side effects -- confirm.
    parameter = Parameters()
    if not staticDiagnose(config_settings, params, logger=globalerrorlogger):
        eprintf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        globalerrorlogger.printf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        exit_process("ERROR\tFailed to pass the test for required scripts and inputs before run\n")

    samplesData = {}
    block_mode = opts.block_mode
    runid = opts.runid

    try:
        if input_output_list:
            for input_file in sorted_input_output_list:
                sample_output_dir = input_output_list[input_file]
                algorithm = get_parameter(params, 'annotation', 'algorithm',
                                          default='LAST').upper()

                s = SampleData()
                s.setInputOutput(inputFile=input_file,
                                 sample_output_dir=sample_output_dir)
                s.setParameter('algorithm', algorithm)
                s.setParameter('ncbi_params_file', ncbi_sequin_params)
                s.setParameter('ncbi_sequin_sbt', ncbi_sequin_sbt)
                s.clearJobs()

                # 'overwrite' wipes only this sample's folder, not the
                # whole output directory.
                if run_type == 'overwrite' and path.exists(sample_output_dir):
                    shutil.rmtree(sample_output_dir)
                if not path.exists(sample_output_dir):
                    makedirs(sample_output_dir)

                s.prepareToRun()
                samplesData[input_file] = s

            # sample_output_dir is the directory of the last sample in the loop.
            run_metapathways(
                samplesData,
                sample_output_dir,
                output_dir,
                globallogger=globalerrorlogger,
                command_line_params=command_line_params,
                params=params,
                metapaths_config=metapaths_config,
                status_update_callback=status_update_callback,
                config_file=config_file,
                run_type=run_type,
                config_settings=config_settings,
                block_mode=block_mode,
                runid=runid
            )
        else:
            eprintf("ERROR\tNo input files in the specified folder %s to process!\n", sQuote(input_fp))
            globalerrorlogger.printf("ERROR\tNo input files in the specified folder %s to process!\n", sQuote(input_fp))

        blasting_system = get_parameter(params, 'metapaths_steps',
                                        'BLAST_REFDB', default='yes')
        if blasting_system == 'grid':
            input_files = sorted_input_output_list
            # Fixed: this used to read the undefined name `sampleData`, so
            # grid mode always failed with a NameError.  input_file is the
            # last sample handled by the loop above.
            # NOTE(review): confirm this argument list matches the
            # blast_in_grid() signature in use.
            blast_in_grid(
                samplesData[input_file],
                input_files,
                path.abspath(opts.output_dir),  # important to use opts.
                params=params,
                metapaths_config=metapaths_config,
                config_file=config_file,
                run_type=run_type,
                runid=runid
            )

    except:
        # Deliberately broad top-level boundary: a bare except also traps
        # SystemExit/KeyboardInterrupt raised inside the pipeline.
        trace = str(traceback.format_exc(10))
        globalerrorlogger.write("ERROR\t" + trace)
        exit_process("ERROR:" + trace)

    eprintf("            ***********                \n")
    eprintf("INFO : FINISHED PROCESSING THE SAMPLES \n")
    eprintf("             THE END                   \n")
    eprintf("            ***********                \n")
    halt_process(4)
Пример #2
0
def main(argv):
    """Drive one MetaPathways (version 2.5.2 banner) run from the command line.

    Parses the options with the module-level ``parser``, resolves the
    configuration and parameter files, builds and validates the input-file
    -> sample output-directory mapping, runs ``staticDiagnose``, prepares
    one ``SampleData`` per input and hands them to ``run_metapathways``.
    When the ``metapaths_steps/BLAST_REFDB`` parameter equals ``'grid'`` the
    samples are additionally passed to ``blast_in_grid``.

    Args:
        argv: command-line arguments.  They are only echoed in the COMMAND
            log line; option parsing calls ``parser.parse_args()`` without
            arguments.

    The function does not return a value; it finishes through
    ``halt_process(opts.delay)`` and leaves early through ``sys.exit``,
    ``exit_process`` or ``halt_process`` on the error paths below.
    """
    global parser
    (opts, args) = parser.parse_args()
    # NOTE(review): usage is printed when valid_arguments() is truthy, so the
    # helper presumably reports *missing/invalid* arguments -- confirm.
    if valid_arguments(opts, args):
        print(usage)
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)
    signal.signal(signal.SIGTERM, sigint_handler)

    eprintf("COMMAND : %s\n", sys.argv[0] + ' ' + ' '.join(argv))

    # The input may be a single file or a directory of samples.
    input_fp = opts.input_fp
    output_dir = path.abspath(opts.output_dir)
    verbose = opts.verbose
    sample_subset = removeSuffix(opts.sample_subset)
    run_type = opts.run_type.strip()

    # Fall back to the configuration file that sits next to the script.
    if opts.config_file:
        config_file = opts.config_file
    else:
        config_file = cmd_folder + PATHDELIM + metapaths_config

    # Both NCBI files are needed together; otherwise neither is used.
    ncbi_sequin_params = None
    ncbi_sequin_sbt = None
    if opts.ncbi_header and opts.ncbi_sbt:
        if not path.exists(opts.ncbi_header):
            print("Could not open or missing NCBI header file " + opts.ncbi_header)
            print("Either disable option to CREATE_SEQUIN_FILE or provide a valid header file")
            sys.exit(0)

        if not path.exists(opts.ncbi_sbt):
            print("""You must must have a sbt file obtained from the NCBI \"Create Submission Template\" form \n 
                 http://www.ncbi.nlm.nih.gov/WebSub/template.cgi """ + opts.ncbi_sbt)
            sys.exit(0)

        ncbi_sequin_params = path.abspath(opts.ncbi_header)
        ncbi_sequin_sbt = path.abspath(opts.ncbi_sbt)

    # Choosing the path cannot raise IOError, so the former try/except
    # around this selection was dead code; the file itself is only touched
    # by parse_metapaths_parameters() below.
    if opts.parameter_fp:
        parameter_fp = opts.parameter_fp
    else:
        parameter_fp = cmd_folder + PATHDELIM + metapaths_param

    try:
        if run_type in ['overlay', 'safe'] and not path.exists(output_dir):
            makedirs(output_dir)
    except OSError:
        print("")
        print("ERROR: Cannot create output directory \"" + output_dir + "\"\n" +
              "       Perhaps directory \"" + output_dir + "\" already exists.\n" +
              "       Please choose a different directory, or \n" +
              "       run with the option \"-r  overwrite\" to force overwrite it.")
        sys.exit(1)

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    command_line_params = {'verbose': opts.verbose}

    params = parse_metapaths_parameters(parameter_fp)

    globalerrorlogger = WorkflowLogger(
        generate_log_fp(output_dir, basefile_name='global_errors_warnings'),
        open_mode='w')

    # Map every input file to its sample output directory.
    input_output_list = {}
    if path.isfile(input_fp):
        input_output_list = create_an_input_output_pair(
            input_fp, output_dir, globalerrorlogger=globalerrorlogger)
    elif path.exists(input_fp):
        input_output_list = create_input_output_pairs(
            input_fp, output_dir, globalerrorlogger=globalerrorlogger)
    else:
        eprintf(
            "ERROR\tNo valid input sample file or directory containing samples exists .!"
        )
        eprintf("ERROR\tAs provided as arguments in the -in option.!\n")
        exit_process(
            "ERROR\tAs provided as arguments in the -in option.!\n")

    # Remove all samples that are not specified unless sample_subset is empty.
    remove_unspecified_samples(input_output_list,
                               sample_subset,
                               globalerrorlogger=globalerrorlogger)

    sorted_input_output_list = sorted(input_output_list.keys())

    # filetypes[input_file] is indexed below as [0] (FILE_TYPE) and
    # [1] (SEQ_TYPE).
    filetypes = check_file_types(sorted_input_output_list)

    # Stop on invalid samples.
    if not halt_on_invalid_input(input_output_list, filetypes, sample_subset):
        globalerrorlogger.printf(
            "ERROR\tInvalid inputs found. Check for file with bad format or characters!\n"
        )
        halt_process(opts.delay)

    # Make sure the sample files are found.
    report_missing_filenames(input_output_list,
                             sample_subset,
                             logger=globalerrorlogger)

    # Check the pipeline configuration.
    config_settings = read_pipeline_configuration(config_file,
                                                  globalerrorlogger)

    # NOTE(review): the instance is never used here; the call is kept in
    # case constructing Parameters has side effects -- confirm.
    parameter = Parameters()
    if not staticDiagnose(config_settings, params, logger=globalerrorlogger):
        eprintf(
            "ERROR\tFailed to pass the test for required scripts and inputs before run\n"
        )
        globalerrorlogger.printf(
            "ERROR\tFailed to pass the test for required scripts and inputs before run\n"
        )
        halt_process(opts.delay)

    samplesData = {}
    block_mode = opts.block_mode
    runid = opts.runid

    try:
        print("RUNNING MetaPathways version 2.5.2")
        if input_output_list:
            for input_file in sorted_input_output_list:
                sample_output_dir = input_output_list[input_file]
                algorithm = get_parameter(params,
                                          'annotation',
                                          'algorithm',
                                          default='LAST').upper()
                s = SampleData()
                s.setInputOutput(inputFile=input_file,
                                 sample_output_dir=sample_output_dir)
                s.setParameter('algorithm', algorithm)
                s.setParameter('ncbi_params_file', ncbi_sequin_params)
                s.setParameter('ncbi_sequin_sbt', ncbi_sequin_sbt)
                s.setParameter('FILE_TYPE', filetypes[input_file][0])
                # Pre-annotated inputs are flagged for the later steps.
                s.setParameter(
                    'ANNOTATED',
                    params["INPUT"]['format'] in ["gbk-annotated",
                                                  "gff-annotated"])
                s.setParameter('SEQ_TYPE', filetypes[input_file][1])
                s.clearJobs()

                # 'overwrite' wipes only this sample's folder, not the
                # whole output directory.
                if run_type == 'overwrite' and path.exists(sample_output_dir):
                    shutil.rmtree(sample_output_dir)
                if not path.exists(sample_output_dir):
                    makedirs(sample_output_dir)

                s.prepareToRun()
                samplesData[input_file] = s

            # sample_output_dir is the directory of the last sample in the loop.
            run_metapathways(samplesData,
                             sample_output_dir,
                             output_dir,
                             globallogger=globalerrorlogger,
                             command_line_params=command_line_params,
                             params=params,
                             metapaths_config=metapaths_config,
                             status_update_callback=status_update_callback,
                             config_file=config_file,
                             run_type=run_type,
                             config_settings=config_settings,
                             block_mode=block_mode,
                             runid=runid)
        else:
            eprintf(
                "ERROR\tNo valid input files/Or no files specified  to process in folder %s!\n",
                sQuote(input_fp))
            globalerrorlogger.printf(
                "ERROR\tNo valid input files to process in folder %s!\n",
                sQuote(input_fp))

        blasting_system = get_parameter(params,
                                        'metapaths_steps',
                                        'BLAST_REFDB',
                                        default='yes')
        if blasting_system == 'grid':
            input_files = sorted_input_output_list
            # Fixed: this used to read the undefined name `sampleData`, so
            # grid mode always failed with a NameError.  input_file is the
            # last sample handled by the loop above.
            # NOTE(review): confirm this argument list matches the
            # blast_in_grid() signature in use.
            blast_in_grid(
                samplesData[input_file],
                input_files,
                path.abspath(opts.output_dir),  # important to use opts.
                params=params,
                metapaths_config=metapaths_config,
                config_file=config_file,
                run_type=run_type,
                runid=runid)

    except:
        # Deliberately broad top-level boundary: a bare except also traps
        # SystemExit/KeyboardInterrupt raised inside the pipeline.
        exit_process(str(traceback.format_exc(10)), logger=globalerrorlogger)

    eprintf("            ***********                \n")
    eprintf("INFO : FINISHED PROCESSING THE SAMPLES \n")
    eprintf("             THE END                   \n")
    eprintf("            ***********                \n")
    halt_process(opts.delay)
Пример #3
0
def blast_in_grid(input_files, output_dir, config_params, metapaths_config, config_file, run_type):
    """Run the reference-database search for every sample through grid services.

    Derives a sample name and the expected ``<sample>.qced.faa`` ORF file
    for each input, collects the active ``grid_engine<N>`` sections of
    ``config_params``, registers samples, databases, the algorithm and one
    ``BlastService`` per grid with a ``BlastBroker``, makes sure the inputs
    are split, builds/loads the job lists and lets the broker do the work.

    Args:
        input_files: iterable of sample input file paths.
        output_dir: base output folder; per-sample paths and the message
            log are created below it.
        config_params: parsed parameter mapping; read through
            ``get_parameter`` and scanned for ``grid_engine<N>`` keys.
        metapaths_config: not used by this function.
        config_file: path handed to ``read_pipeline_configuration``.
        run_type: not used by this function.

    Returns:
        None.  Returns early when the grid working folders cannot be
        created; calls ``sys.exit(0)`` when ``isValidInput`` rejects the
        setup or a sample cannot be split.
    """
    algorithm = get_parameter(config_params, 'annotation', 'algorithm', default='BLAST').upper()
    messagelogger = WorkflowLogger(
        generate_log_fp(output_dir, basefile_name='metapathways_messages', suffix='txt'),
        open_mode='w')

    config_settings = read_pipeline_configuration(config_file)

    orf_prediction_dir = "orf_prediction"

    # Sample name: basename without a trailing ".<letters>" extension and
    # with every remaining dot turned into an underscore.
    samples_and_input = {}
    for input_file in input_files:
        sample_name = re.sub(r'[.][a-zA-Z]*$', '', input_file)
        sample_name = path.basename(sample_name)
        sample_name = re.sub('[.]', '_', sample_name)
        samples_and_input[sample_name] = (
            output_dir + PATHDELIM + sample_name + PATHDELIM +
            orf_prediction_dir + PATHDELIM + sample_name + ".qced.faa")

    # Reference databases for functional annotation.  The lookup defaults
    # to None, so guard against a missing entry instead of crashing on
    # None.split().
    dbstring = get_parameter(config_params, 'annotation', 'dbs', default=None)
    dbs = dbstring.split(",") if dbstring else []

    # Parse the grid settings from the param file.
    gridEnginePATTERN = re.compile(r'(grid_engine\d+)')
    trueOrYesPATTERN = re.compile(r'^[yYTt]')

    gridSettings = []
    for key in config_params:
        if gridEnginePATTERN.match(key) is None:
            continue
        settings = config_params[key]
        # Only grids whose 'active' value starts with y/Y/t/T are used.
        if 'active' in settings and trueOrYesPATTERN.match(settings['active']):
            gridSettings.append(settings)

    if not isValidInput(output_dir, samples_and_input, dbs, gridSettings,
                        config_settings=config_settings,
                        messagelogger=messagelogger):
        sys.exit(0)

    blastbroker = BlastBroker(messagelogger)     # setup the broker with a message logger
    blastbroker.setBaseOutputFolder(output_dir)  # set up the output folder
    blastbroker.addSamples(samples_and_input)    # add the samples and the input files

    # Add the databases and the algorithm for every sample.
    for sample in samples_and_input:
        for db in dbs:
            blastbroker.addDatabase(sample, db)
        blastbroker.addAlgorithm(sample, algorithm)

    # Set up one service per active grid and add it to the broker.
    for gridsetting in gridSettings:
        gridsetting['messagelogger'] = messagelogger
        gridsetting['MetaPathwaysDir'] = config_settings['METAPATHWAYS_PATH']
        gridsetting['base_output_folder'] = blastbroker.base_output_folder
        gridsetting['blast_db_folder'] = config_settings['REFDBS'] + PATHDELIM + 'functional'

        try:
            blastservice = BlastService(gridsetting)
        except Exception:
            print(traceback.format_exc(10))
            # Skip this grid: without `continue` the code below would hit an
            # unbound name on the first grid, or register the previous
            # grid's service a second time.
            continue

        blastbroker.addService(blastservice)

    # Create the work space folders.
    if blastbroker.are_working_folders_available():
        messagelogger.write("STATUS: Local working folders for Grid found!\n")
    elif blastbroker.create_working_folders():
        messagelogger.write("OK: Successfully created the grid related local working folders!\n")
    else:
        messagelogger.write("ERROR: Cannot create the grid working folders!\n")
        messagelogger.write("ERROR: Exiting blast in grid mode!\n")
        return

    # NOTE(review): both status lines are logged before the split check
    # further down actually runs; kept as-is to preserve the log output.
    messagelogger.write("STATUS: Checking if input files are already split!\n")
    messagelogger.write("STATUS: Competed checks for file splits!\n")

    batch_size = int(get_parameter(config_params, 'grid_submission', 'batch_size', default=1000))
    blastbroker.setBatchSize(batch_size)

    # Split the input of every sample that has no valid split yet.
    for s in blastbroker.getSamples():
        if blastbroker.doesValidSplitExist(s):
            messagelogger.write("OK: Found previously split files for sample \"%s\"!\n" % (s))
            continue

        messagelogger.write("STATUS: Did not find any previously split files for sample \"%s\"!\n" % (s))
        if not blastbroker.splitInput(s):
            print("ERROR: Cannot split the files for some or all of the samples!\n")
            messagelogger.write("ERROR: Cannot split the files for some or all of the samples!\n")
            sys.exit(0)
        messagelogger.write("SUCCESS: Successfully split the files for some or all of the samples!\n")

    # Load the list of splits.
    blastbroker.load_list_splits()
    messagelogger.write("SUCCESS: Successfully loaded the list of file splits!\n")

    # Create the database and split combinations as jobs for each sample.
    blastbroker.createJobs(redo=False)
    messagelogger.write("SUCCESS: Successfully created the (split, database) pairs!\n")

    # Make sure the latest job lists on file are loaded.
    blastbroker.load_job_lists()
    messagelogger.write("SUCCESS: Successfully recovered the old/existing job list!\n")

    # For each sample load the submitted and completed lists.
    blastbroker.load_job_status_lists()
    messagelogger.write("SUCCESS: Successfully loaded the status of the jobs!\n")

    blastbroker.compute_performance()

    # Best effort: a failure here is reported but does not stop the run.
    try:
        blastbroker.compute_server_loads()
    except Exception:
        print(traceback.format_exc(10))

    blastbroker.setupStatsVariables()

    messagelogger.write("STATUS: Getting ready to submit jobs to the servers!\n")
    blastbroker.Do_Work()

    blastbroker.Delete_Remote_Directories()
def main(argv):
    """Command-line entry point (banner: MetaPathways FogDog 3.0).

    Parses the options with the module-level ``parser``, creates the
    parameter and configuration files when they do not exist yet, builds
    and validates the input-file -> sample output-directory mapping, runs
    ``staticDiagnose`` and finally processes the prepared samples with
    ``run_metapathways``.

    Args:
        argv: command-line arguments.  They are only echoed in the COMMAND
            log line; option parsing calls ``parser.parse_args()`` without
            arguments.

    Returns:
        None.  Returns early when ``staticDiagnose`` fails; other error
        paths leave through ``sys.exit``, ``exit_process`` or
        ``halt_process``.
    """
    global parser

    options, positional = parser.parse_args()
    if valid_arguments(options, positional):
        print(usage)
        sys.exit(0)

    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, sigint_handler)

    invocation = sys.argv[0] + ' ' + ' '.join(argv)
    eprintf("%-10s:%s\n" % ('COMMAND', invocation))

    # Input is either a single file or a directory of samples.
    sample_source = options.input_fp
    out_root = path.abspath(options.output_dir)
    be_verbose = options.verbose
    print_only = options.print_only
    wanted_samples = removeSuffix(options.sample_subset)
    mode = options.run_type.strip()

    # Explicit option first, otherwise the file next to the script.
    pipeline_config = options.config_file or (cmd_folder + PATHDELIM + metapaths_config)

    try:
        param_path = options.parameter_fp or (cmd_folder + PATHDELIM + metapaths_param)
    except IOError:
        raise IOError("Can't open parameters file (%s). Does it exist? Do you have read access?" % options.parameter_fp)

    # Only the 'overlay' and 'safe' run types create the output root here.
    wants_root = mode in ['overlay', 'safe']
    try:
        if wants_root and not path.exists(out_root):
            makedirs(out_root)
    except OSError:
        complaint = (
            "ERROR: Cannot create output directory \"" + out_root + "\"\n" +
            "       Perhaps directory \"" + out_root + "\" already exists.\n" +
            "       Please choose a different directory, or \n" +
            "       run with the option \"-r  overwrite\" to force overwrite it."
        )
        print("")
        print(complaint)
        sys.exit(2)

    notify = print_to_stdout if be_verbose else no_status_updates

    cli_params = {'verbose': options.verbose}

    # A missing parameter file is generated before it is parsed.
    if not path.exists(param_path):
        eprintf("%-10s: No parameters file %s found!\n" % ('WARNING', param_path))
        eprintf("%-10s: Creating a parameters file %s found!\n" % ('INFO', param_path))
        create_metapaths_parameters(param_path, cmd_folder)

    pipeline_params = parse_metapaths_parameters(param_path)

    error_log_path = generate_log_fp(out_root, basefile_name='global_errors_warnings')
    error_log = WorkflowLogger(error_log_path, open_mode='w')

    # Build the input file -> sample output directory mapping.
    io_pairs = {}
    if path.isfile(sample_source):
        io_pairs = create_an_input_output_pair(sample_source, out_root, globalerrorlogger=error_log)
    elif path.exists(sample_source):
        io_pairs = create_input_output_pairs(sample_source, out_root, globalerrorlogger=error_log)
    else:
        eprintf("ERROR\tNo valid input sample file or directory containing samples exists .!")
        eprintf("ERROR\tAs provided as arguments in the -in option.!\n")
        exit_process("ERROR\tAs provided as arguments in the -in option.!\n")

    # Drop every sample that was not requested, unless the subset is empty.
    remove_unspecified_samples(io_pairs, wanted_samples, globalerrorlogger=error_log)

    ordered_inputs = sorted(io_pairs.keys())
    filetypes = check_file_types(ordered_inputs)

    # Stop on invalid samples.
    inputs_ok = halt_on_invalid_input(io_pairs, filetypes, wanted_samples)
    if not inputs_ok:
        error_log.printf("ERROR\tInvalid inputs found. Check for file with bad format or characters!\n")
        halt_process(options.delay)

    # Make sure the sample files are found.
    report_missing_filenames(io_pairs, wanted_samples, logger=error_log)

    # A missing configuration file is generated as well, provided the
    # environment check passes.
    print('config')
    if not path.exists(pipeline_config):
        eprintf("%-10s: No config file %s found!\n" % ('WARNING', pipeline_config))
        eprintf("%-10s: Creating a config file %s!\n" % ('INFO', pipeline_config))
        if not environment_variables_defined():
            sys.exit(0)
        create_metapaths_configuration(pipeline_config, cmd_folder)

    settings = read_pipeline_configuration(pipeline_config, error_log)

    parameter = Parameters()
    if not staticDiagnose(settings, pipeline_params, logger=error_log):
        diagnose_msg = "ERROR\tFailed to pass the test for required scripts and inputs before run\n"
        eprintf(diagnose_msg)
        error_log.printf(diagnose_msg)
        return

    samples = {}
    block_mode = options.block_mode
    runid = options.runid

    try:
        print("RUNNING MetaPathways version FogDog 3.0")
        if len(io_pairs) == 0:
            eprintf("ERROR\tNo valid input files/Or no files specified  to process in folder %s!\n", sQuote(sample_source))
            error_log.printf("ERROR\tNo valid input files to process in folder %s!\n", sQuote(sample_source))
        else:
            for sample_file in ordered_inputs:
                sample_dir = io_pairs[sample_file]
                algo = get_parameter(pipeline_params, 'annotation', 'algorithm', default='LAST').upper()

                sample = SampleData()
                sample.setInputOutput(inputFile=sample_file, sample_output_dir=sample_dir)
                sample.setParameter('algorithm', algo)
                sample.setParameter('FILE_TYPE', filetypes[sample_file][0])
                sample.setParameter('SEQ_TYPE', filetypes[sample_file][1])
                sample.clearJobs()

                # In 'overwrite' mode an existing sample folder is recreated
                # from scratch; a missing folder is always created.
                if path.exists(sample_dir):
                    if mode == 'overwrite':
                        shutil.rmtree(sample_dir)
                        makedirs(sample_dir)
                else:
                    makedirs(sample_dir)

                sample.prepareToRun()
                samples[sample_file] = sample

            # sample_dir still refers to the last sample of the loop.
            run_metapathways(
                samples,
                sample_dir,
                out_root,
                globallogger=error_log,
                command_line_params=cli_params,
                params=pipeline_params,
                metapaths_config=metapaths_config,
                status_update_callback=notify,
                config_file=pipeline_config,
                run_type=mode,
                config_settings=settings,
                block_mode=block_mode,
                runid=runid
            )
    except:
        exit_process(str(traceback.format_exc(10)), logger=error_log)

    for banner_line in (
            "            ***********                \n",
            "INFO : FINISHED PROCESSING THE SAMPLES \n",
            "             THE END                   \n",
            "            ***********                \n"):
        eprintf(banner_line)
def main(argv):
    """Drive a MetaPathways run for the inputs named on the command line.

    Steps, in order:
      1. parse the options with the module-level ``parser`` and install the
         SIGINT/SIGTERM handler;
      2. resolve the config and parameter file paths (falling back to the
         defaults under ``cmd_folder``) and create either file if missing;
      3. collect the input/output pairs from a single file or a directory,
         restrict them to the requested sample subset and validate them;
      4. build one ``SampleData`` per input and pass them all to
         ``run_metapathways``.

    Args:
        argv: command-line arguments. Only used to echo the invoked command;
            the options themselves come from ``parser.parse_args()``, which
            is called without arguments.

    Returns:
        None. Returns early (after logging) when ``staticDiagnose`` fails.

    Exits:
        ``sys.exit(0)`` after printing ``usage`` when ``valid_arguments``
        is truthy, or when a config file must be created but
        ``environment_variables_defined()`` is false; ``sys.exit(2)`` when
        the output directory cannot be created.
    """
    global parser

    (opts, args) = parser.parse_args()
    # NOTE(review): usage is printed when valid_arguments() is truthy, so the
    # helper presumably reports a *problem* despite its name -- confirm
    # against its definition.
    if valid_arguments(opts, args):
        print(usage)
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)
    signal.signal(signal.SIGTERM, sigint_handler)

    eprintf("%-10s:%s\n" %('COMMAND', sys.argv[0] + ' ' +  ' '.join(argv)) )

    # Input may be a single file or a directory of samples.
    input_fp = opts.input_fp
    output_dir = path.abspath(opts.output_dir)
    verbose = opts.verbose

    sample_subset = removeSuffix(opts.sample_subset)
    run_type = opts.run_type.strip()

    # Fall back to the default files shipped next to the script.
    if opts.config_file:
        config_file = opts.config_file
    else:
        config_file = cmd_folder + PATHDELIM + metapaths_config

    if opts.parameter_fp:
        parameter_fp = opts.parameter_fp
    else:
        parameter_fp = cmd_folder + PATHDELIM + metapaths_param

    # Only the 'overlay' and 'safe' run types create the top-level output
    # directory here; the per-sample directories are handled further down.
    try:
        if run_type in ['overlay', 'safe'] and not path.exists(output_dir):
            makedirs(output_dir)
    except OSError:
        print("")
        print("ERROR: Cannot create output directory \"" + output_dir + "\"\n" +
              "       Perhaps directory \"" + output_dir + "\" already exists.\n" +
              "       Please choose a different directory, or \n" +
              "       run with the option \"-r  overwrite\" to force overwrite it.")
        sys.exit(2)

    if verbose:
        status_update_callback = print_to_stdout
    else:
        status_update_callback = no_status_updates

    command_line_params = {'verbose': opts.verbose}

    if not path.exists(parameter_fp):
        eprintf("%-10s: No parameters file %s found!\n" %('WARNING', parameter_fp))
        eprintf("%-10s: Creating a parameters file %s!\n" %('INFO', parameter_fp))
        create_metapaths_parameters(parameter_fp, cmd_folder)

    params = parse_metapaths_parameters(parameter_fp)

    globalerrorlogger = WorkflowLogger(generate_log_fp(output_dir, basefile_name= 'global_errors_warnings'), open_mode='w')

    # Load the sample inputs: either a single input file or a directory
    # containing the samples.
    input_output_list = {}
    if path.isfile(input_fp):
        input_output_list = create_an_input_output_pair(input_fp, output_dir, globalerrorlogger=globalerrorlogger)
    elif path.exists(input_fp):
        input_output_list = create_input_output_pairs(input_fp, output_dir, globalerrorlogger=globalerrorlogger)
    else:
        # Neither a file nor an existing path: nothing to process.
        eprintf("ERROR\tNo valid input sample file or directory containing samples exists .!\n")
        eprintf("ERROR\tAs provided as arguments in the -in option.!\n")
        exit_process("ERROR\tAs provided as arguments in the -in option.!\n")

    # Drop every sample that was not requested; an empty subset means
    # "process all samples".
    remove_unspecified_samples(input_output_list, sample_subset, globalerrorlogger = globalerrorlogger)

    sorted_input_output_list = sorted(input_output_list.keys())
    filetypes = check_file_types(sorted_input_output_list)

    # Stop on invalid samples.
    if not halt_on_invalid_input(input_output_list, filetypes, sample_subset):
        globalerrorlogger.printf("ERROR\tInvalid inputs found. Check for file with bad format or characters!\n")
        halt_process(opts.delay)

    # Make sure the sample files are found.
    report_missing_filenames(input_output_list, sample_subset, logger=globalerrorlogger)

    # Check the pipeline configuration, creating the config file on first use.
    if not path.exists(config_file):
        eprintf("%-10s: No config file %s found!\n" %('WARNING', config_file))
        eprintf("%-10s: Creating a config file %s!\n" %('INFO', config_file))
        if not environment_variables_defined():
            sys.exit(0)
        create_metapaths_configuration(config_file, cmd_folder)

    config_settings = read_pipeline_configuration(config_file, globalerrorlogger)

    # NOTE(review): 'parameter' is never read in this function; the call is
    # kept in case constructing Parameters has side effects -- confirm and
    # remove if it does not.
    parameter = Parameters()
    if not staticDiagnose(config_settings, params, logger = globalerrorlogger):
        eprintf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        globalerrorlogger.printf("ERROR\tFailed to pass the test for required scripts and inputs before run\n")
        return

    samplesData = {}
    block_mode = opts.block_mode
    runid = opts.runid

    try:
        print("RUNNING MetaPathways version FogDog 3.0")
        if input_output_list:
            # The algorithm setting is the same for every sample, so it is
            # looked up once rather than per iteration.
            algorithm = get_parameter(params, 'annotation', 'algorithm', default='LAST').upper()

            for input_file in sorted_input_output_list:
                sample_output_dir = input_output_list[input_file]

                s = SampleData()
                s.setInputOutput(inputFile = input_file, sample_output_dir = sample_output_dir)
                s.setParameter('algorithm', algorithm)
                s.setParameter('FILE_TYPE', filetypes[input_file][0])
                s.setParameter('SEQ_TYPE', filetypes[input_file][1])
                s.clearJobs()

                # 'overwrite' starts each sample from an empty directory.
                if run_type == 'overwrite' and path.exists(sample_output_dir):
                    shutil.rmtree(sample_output_dir)
                if not path.exists(sample_output_dir):
                    makedirs(sample_output_dir)

                s.prepareToRun()
                samplesData[input_file] = s

            # sample_output_dir still holds the value from the last loop
            # iteration when it is passed on here.
            run_metapathways(
                samplesData,
                sample_output_dir,
                output_dir,
                globallogger = globalerrorlogger,
                command_line_params = command_line_params,
                params = params,
                metapaths_config = metapaths_config,
                status_update_callback = status_update_callback,
                config_file = config_file,
                run_type = run_type,
                config_settings = config_settings,
                block_mode = block_mode,
                runid = runid
            )
        else:
            eprintf("ERROR\tNo valid input files/Or no files specified  to process in folder %s!\n",sQuote(input_fp) )
            globalerrorlogger.printf("ERROR\tNo valid input files to process in folder %s!\n",sQuote(input_fp) )

    except:
        # Deliberately broad: anything raised while preparing or running the
        # samples is reported through exit_process with its traceback.
        exit_process(str(traceback.format_exc(10)), logger= globalerrorlogger )

    eprintf("            ***********                \n")
    eprintf("INFO : FINISHED PROCESSING THE SAMPLES \n")
    eprintf("             THE END                   \n")
    eprintf("            ***********                \n")