def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    unittest_glob = opts.unittest_glob
    temp_filepath = opts.temp_filepath
    script_usage_tests = opts.script_usage_tests
    suppress_unit_tests = opts.suppress_unit_tests
    suppress_script_usage_tests = opts.suppress_script_usage_tests
    suppress_javascript_unit_tests = opts.suppress_javascript_unit_tests

    # since the test data is in the tests folder just add scripts_test_data
    emperor_test_data_dir = join(abspath(dirname(__file__)),
                                 'scripts_test_data/')

    # offer the option for the user to pass the scripts dir from the command
    # line since there is no other way to get the scripts dir. If not
    # provided, the base structure of the repository will be assumed. Note
    # that for both cases we are using absolute paths to avoid unwanted
    # failures.
    if opts.emperor_scripts_dir is None:
        emperor_scripts_dir = abspath(join(get_emperor_project_dir(),
                                           'scripts/'))

        # let's try to guess cases for qiime-deploy type of installs
        if get_emperor_project_dir().endswith('/lib'):
            emperor_scripts_dir = abspath(join(
                get_emperor_project_dir()[:-3], 'scripts/'))
    else:
        emperor_scripts_dir = abspath(opts.emperor_scripts_dir)

    # sanity check
    if (suppress_unit_tests and suppress_script_usage_tests and
            suppress_javascript_unit_tests):
        option_parser.error("All tests have been suppressed. Nothing to run.")

    test_dir = abspath(dirname(__file__))

    unittest_good_pattern = re.compile(r'OK\s*$')
    application_not_found_pattern = re.compile('ApplicationNotFoundError')
    python_name = 'python'
    bad_tests = []
    missing_application_tests = []

    # Run through all of Emperor's unit tests and keep track of any files
    # that fail; note that these are the unit tests only
    if not suppress_unit_tests:
        unittest_names = []
        if not unittest_glob:
            for root, dirs, files in walk(test_dir):
                for name in files:
                    if name.startswith('test_') and name.endswith('.py'):
                        unittest_names.append(join(root, name))
        else:
            for fp in glob(unittest_glob):
                fn = split(fp)[1]
                if fn.startswith('test_') and fn.endswith('.py'):
                    unittest_names.append(abspath(fp))

        unittest_names.sort()

        for unittest_name in unittest_names:
            print "Testing %s:\n" % unittest_name
            command = '%s %s -v' % (python_name, unittest_name)
            stdout, stderr, return_value = qcli_system_call(command)
            print stderr
            if not unittest_good_pattern.search(stderr):
                if application_not_found_pattern.search(stderr):
                    missing_application_tests.append(unittest_name)
                else:
                    bad_tests.append(unittest_name)

    script_usage_failures = 0

    # choose to run some of the script usage tests or all the available ones
    if not suppress_script_usage_tests and exists(emperor_test_data_dir) and\
            exists(emperor_scripts_dir):
        if script_usage_tests is not None:
            script_tests = script_usage_tests.split(',')
        else:
            script_tests = None

        initial_working_directory = getcwd()

        # Run the script usage testing functionality; note that depending on
        # the module where this was imported, the names of the arguments
        # will change, hence the argument names are spelled out here
        script_usage_result_summary, script_usage_failures = \
            run_script_usage_tests(
                emperor_test_data_dir,  # test_data_dir
                emperor_scripts_dir,    # scripts_dir
                temp_filepath,          # working_dir
                True,                   # verbose
                script_tests,           # tests
                None,                   # failure_log_fp
                False)                  # force_overwrite

        # running the script usage tests changes the current working
        # directory, so restore it
        chdir(initial_working_directory)

    if not suppress_javascript_unit_tests:
        runner = join(test_dir, 'javascript_tests', 'runner.js')
        index = join(test_dir, 'javascript_tests', 'index.html')

        o, e, r = qcli_system_call('phantomjs %s %s' % (runner, index))
        if o:
            print o
        if e:
            print e

        # if all the tests passed
        javascript_tests_passed = (r == 0)
    else:
        javascript_tests_passed = True

    print "==============\nResult summary\n=============="

    if not suppress_unit_tests:
        print "\nUnit test result summary\n------------------------\n"
        if bad_tests:
            print ("\nFailed the following unit tests.\n%s"
                   % '\n'.join(bad_tests))

        if missing_application_tests:
            print ("\nFailed the following unit tests, in part or whole due "
                   "to missing external applications.\nDepending on the "
                   "Emperor features you plan to use, this may not be "
                   "critical.\n%s" % '\n'.join(missing_application_tests))

        if not (missing_application_tests or bad_tests):
            print "\nAll unit tests passed.\n"

    if not suppress_script_usage_tests:
        if exists(emperor_test_data_dir) and exists(emperor_scripts_dir):
            print ("\nScript usage test result summary"
                   "\n--------------------------------\n")
            print script_usage_result_summary
        else:
            print ("\nCould not run script usage tests.\nThe Emperor scripts "
                   "directory could not be automatically located, try "
                   "supplying it manually using the --emperor_scripts_dir "
                   "option.")

    if not suppress_javascript_unit_tests:
        print ('\nJavaScript unit tests result summary\n'
               '------------------------------------\n')
        if javascript_tests_passed:
            print 'All JavaScript unit tests passed.\n'
        else:
            print 'JavaScript unit tests failed, check the summary above.'

    # exit with a return code of 0 only if there were no failures of any type
    return_code = 1
    if (len(bad_tests) == 0 and len(missing_application_tests) == 0 and
            script_usage_failures == 0 and javascript_tests_passed):
        return_code = 0
    return return_code
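# A minimal sketch of how a runner like the one above is typically wired up
# as a script entry point, so that the 0/1 value returned by main() becomes
# the process exit code a CI system can check. The __main__ guard shown here
# is an assumption; the original module-level plumbing is not part of this
# excerpt.
from sys import exit

if __name__ == "__main__":
    exit(main())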
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments
    pct_variation_below_one = opts.pct_variation_below_one

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verify that at least three axes were requested
    if number_of_axes < 3:
        option_parser.error('You need to plot at least 3 axes.')

    # verify that the number of segments is within the supported range
    if number_of_segments < 4 or number_of_segments > 14:
        option_parser.error('number_of_segments should be between 4 and 14.')

    # header names that the script didn't find in the mapping file, according
    # to different criteria, are appended to the following variables
    offending_fields = []
    non_numeric_categories = []

    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if (custom_axes is not None and len(custom_axes.split(',')) > 1 and
            isdir(input_coords)):
        option_parser.error('Jackknifed plots are limited to one custom '
                            'axis, currently trying to use: %s. Make sure '
                            'you use only one.' % custom_axes)

    # make sure the flag is not misunderstood from the command line interface
    if not isdir(input_coords) and compare_plots:
        option_parser.error("Cannot use the '--compare_plots' flag unless "
                            "the input path is a directory.")

    # before creating any output, check correct parsing of the main input
    # files
    try:
        mapping_data, header, comments = parse_mapping_file(open(map_fp, 'U'))

        # use this set variable to make presence/absence checks faster
        lookup_header = set(header)
    except:
        option_parser.error(("The metadata mapping file '%s' does not seem "
                             "to be formatted correctly, verify the "
                             "formatting is QIIME compliant by using "
                             "check_id_map.py") % map_fp)

    # a dir means jackknifing or coordinate-comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers, coords_data, coords_eigenvalues, coords_pct = \
            [], [], [], []

        # iterate only over the non-hidden files that are not folders, and
        # ignore the procrustes results file that is generated by
        # transform_coordinate_matrices.py, suffixed procrustes_results.txt
        coord_fps = [join(input_coords, f)
                     for f in listdir(input_coords)
                     if not f.startswith('.') and
                     not isdir(join(abspath(input_coords), f)) and
                     not f.endswith('procrustes_results.txt')]

        # this could happen and we'd rather avoid this problem
        if len(coord_fps) == 0:
            option_parser.error('Could not use any of the files in the '
                                'input directory.')

        # the master pcoa must be the first in the list of coordinates;
        # however, if the visualization is not a jackknifed plot this gets
        # ignored
        if master_pcoa and not compare_plots:
            if master_pcoa in coord_fps:  # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps  # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                coord_fps = [master_pcoa] + \
                    sort_comparison_filenames(coord_fps)
        # QIIME generates folders of transformed coordinates for the
        # specific purpose of connecting all coordinates to a set of origin
        # coordinates. The name of this file is suffixed as
        # _transformed_reference.txt
        elif master_pcoa is None and len(
                [f for f in coord_fps
                 if f.endswith('_transformed_reference.txt')]):
            master_pcoa = [f for f in coord_fps
                           if f.endswith('_transformed_reference.txt')][0]
            serial_comparison = False

            # Note: the following steps guarantee consistency. Remove the
            # master from the list and re-add it as the first element; the
            # rest of the files must be sorted alphabetically, so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc.
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                _coords_headers, _coords_data, _coords_eigenvalues, \
                    _coords_pct = parse_coords(open(fp, 'U'))
            except (ValueError, QiimeParseError):
                offending_coords_fp.append(fp)

                # do not add any of the data and move along
                continue

            # pack all the data correspondingly only if it was correctly
            # parsed
            coords_headers.append(_coords_headers)
            coords_data.append(_coords_data)
            coords_eigenvalues.append(_coords_eigenvalues)
            coords_pct.append(_coords_pct)

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            option_parser.error(("The following file(s): '%s' could not be "
                                 "parsed properly. Make sure the input "
                                 "folder only contains coordinates files.")
                                % ', '.join(offending_coords_fp))

        # check all files contain the same sample identifiers by flattening
        # the list of available sample ids and returning the sample ids that
        # are in one of the sets of sample ids but not in the globally
        # shared ids
        non_shared_ids = set(sum([list(set(sum(coords_headers, [])) ^ set(e))
                                  for e in coords_headers], []))
        if non_shared_ids and len(coords_headers) > 1:
            option_parser.error(("The following sample identifier(s): '%s' "
                                 "are not shared between all the files. The "
                                 "files used to make a jackknifed PCoA plot "
                                 "or coordinate comparison plot (procrustes "
                                 "plot) must share all the same sample "
                                 "identifiers between each other.")
                                % ', '.join(list(non_shared_ids)))

        # flatten the list of lists into a 1-d list
        _coords_headers = list(set(sum(coords_headers, [])))

        # number of sample ids that are shared between the coords and
        # mapping files
        sids_intersection = list(set(zip(*mapping_data)[0]) &
                                 set(_coords_headers))

        # sample ids that are not mapped but are in the coords
        sids_difference = list(set(_coords_headers) -
                               set(zip(*mapping_data)[0]))

        # used to perform different validations in the script, very similar
        # to the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        try:
            coords_headers, coords_data, coords_eigenvalues, coords_pct = \
                parse_coords(open(input_coords, 'U'))
        # this exception was noticed when there were letters in the coords
        # file; other exceptions should be caught here and the code updated
        # accordingly
        except (ValueError, QiimeParseError):
            option_parser.error(("The PCoA file '%s' does not seem to be a "
                                 "coordinates formatted file, verify by "
                                 "manually inspecting the contents.")
                                % input_coords)

        # number of sample ids that are shared between the coords and
        # mapping files
        sids_intersection = list(set(zip(*mapping_data)[0]) &
                                 set(coords_headers))

        # sample ids that are not mapped but are in the coords
        sids_difference = list(set(coords_headers) -
                               set(zip(*mapping_data)[0]))
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # for summarized tables the "otu_ids" are really the "lineages"
            otu_sample_ids, lineages, otu_table, _ = parse_otu_table(
                open(taxa_fp, 'U'), count_map_f=float,
                remove_empty_rows=True)
        except ValueError, e:
            option_parser.error('There was a problem parsing the --taxa_fp: '
                                '%s' % e.message)

        # make sure there are matching sample ids with the otu table
        if not len(set(sids_intersection) & set(otu_sample_ids)):
            option_parser.error('The sample identifiers in the OTU table '
                                'must have at least one match with the data '
                                'in the mapping file and with the '
                                'coordinates file. Verify you are using '
                                'input files that belong to the same '
                                'dataset.')
        if len(lineages) <= 1:
            option_parser.error('Contingency tables with one or fewer rows '
                                'are not supported, please try passing a '
                                'contingency table with more than one row.')
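# A standalone illustration of the non_shared_ids computation above: the
# union of all per-file sample ids is symmetric-differenced against each
# file's ids, so any id missing from at least one file is reported. The
# helper name and the toy data below are made up for demonstration only.
def _non_shared_ids(coords_headers):
    all_ids = set(sum(coords_headers, []))
    return set(sum([list(all_ids ^ set(e)) for e in coords_headers], []))

assert _non_shared_ids([['a', 'b'], ['a', 'b']]) == set()
assert _non_shared_ids([['a', 'b'], ['a']]) == set(['b'])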
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    unittest_glob = opts.unittest_glob
    temp_filepath = opts.temp_filepath
    script_usage_tests = opts.script_usage_tests
    suppress_unit_tests = opts.suppress_unit_tests
    suppress_script_usage_tests = opts.suppress_script_usage_tests

    # since the test data is in the tests folder just add scripts_test_data
    emperor_test_data_dir = join(abspath(dirname(__file__)),
                                 'scripts_test_data/')

    # offer the option for the user to pass the scripts dir from the command
    # line since there is no other way to get the scripts dir. If not
    # provided, the base structure of the repository will be assumed. Note
    # that for both cases we are using absolute paths to avoid unwanted
    # failures.
    if opts.emperor_scripts_dir is None:
        emperor_scripts_dir = abspath(
            join(get_emperor_project_dir(), 'scripts/'))

        # let's try to guess cases for qiime-deploy type of installs
        if get_emperor_project_dir().endswith('/lib'):
            emperor_scripts_dir = abspath(
                join(get_emperor_project_dir()[:-3], 'scripts/'))
    else:
        emperor_scripts_dir = abspath(opts.emperor_scripts_dir)

    # sanity check
    if suppress_unit_tests and suppress_script_usage_tests:
        option_parser.error("All tests have been suppressed. Nothing to run.")

    test_dir = abspath(dirname(__file__))

    unittest_good_pattern = re.compile(r'OK\s*$')
    application_not_found_pattern = re.compile('ApplicationNotFoundError')
    python_name = 'python'
    bad_tests = []
    missing_application_tests = []

    # Run through all of Emperor's unit tests and keep track of any files
    # that fail; note that these are the unit tests only
    if not suppress_unit_tests:
        unittest_names = []
        if not unittest_glob:
            for root, dirs, files in walk(test_dir):
                for name in files:
                    if name.startswith('test_') and name.endswith('.py'):
                        unittest_names.append(join(root, name))
        else:
            for fp in glob(unittest_glob):
                fn = split(fp)[1]
                if fn.startswith('test_') and fn.endswith('.py'):
                    unittest_names.append(abspath(fp))

        unittest_names.sort()

        for unittest_name in unittest_names:
            print "Testing %s:\n" % unittest_name
            command = '%s %s -v' % (python_name, unittest_name)
            stdout, stderr, return_value = qcli_system_call(command)
            print stderr
            if not unittest_good_pattern.search(stderr):
                if application_not_found_pattern.search(stderr):
                    missing_application_tests.append(unittest_name)
                else:
                    bad_tests.append(unittest_name)

    script_usage_failures = 0

    # choose to run some of the script usage tests or all the available ones
    if not suppress_script_usage_tests and exists(emperor_test_data_dir) and\
            exists(emperor_scripts_dir):
        if script_usage_tests is not None:
            script_tests = script_usage_tests.split(',')
        else:
            script_tests = None

        # Run the script usage testing functionality; note that depending on
        # the module where this was imported, the names of the arguments
        # will change, hence the argument names are spelled out here
        script_usage_result_summary, script_usage_failures = \
            run_script_usage_tests(
                emperor_test_data_dir,  # test_data_dir
                emperor_scripts_dir,    # scripts_dir
                temp_filepath,          # working_dir
                True,                   # verbose
                script_tests,           # tests
                None,                   # failure_log_fp
                False)                  # force_overwrite

    print "==============\nResult summary\n=============="

    if not suppress_unit_tests:
        print "\nUnit test result summary\n------------------------\n"
        if bad_tests:
            print ("\nFailed the following unit tests.\n%s"
                   % '\n'.join(bad_tests))

        if missing_application_tests:
            print ("\nFailed the following unit tests, in part or whole due "
                   "to missing external applications.\nDepending on the "
                   "Emperor features you plan to use, this may not be "
                   "critical.\n%s" % '\n'.join(missing_application_tests))

        if not (missing_application_tests or bad_tests):
            print "\nAll unit tests passed.\n\n"

    if not suppress_script_usage_tests:
        if exists(emperor_test_data_dir) and exists(emperor_scripts_dir):
            print ("\nScript usage test result summary"
                   "\n--------------------------------\n")
            print script_usage_result_summary
        else:
            print ("\nCould not run script usage tests.\nThe Emperor scripts "
                   "directory could not be automatically located, try "
                   "supplying it manually using the --emperor_scripts_dir "
                   "option.")

    # exit with a return code of 0 only if there were no failures of any type
    return_code = 1
    if (len(bad_tests) == 0 and len(missing_application_tests) == 0 and
            script_usage_failures == 0):
        return_code = 0
    return return_code
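# Both runners above decide pass/fail by searching a test file's stderr for
# the trailing 'OK' that Python 2's unittest prints on success. A standalone
# check of that pattern; the sample strings below only mimic typical
# unittest output and are not taken from a real run.
import re

unittest_good_pattern = re.compile(r'OK\s*$')
assert unittest_good_pattern.search('Ran 12 tests in 0.034s\n\nOK\n')
assert not unittest_good_pattern.search(
    'Ran 12 tests in 0.034s\n\nFAILED (failures=1)\n')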
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    unittest_glob = opts.unittest_glob
    temp_filepath = opts.temp_filepath
    script_usage_tests = opts.script_usage_tests
    suppress_unit_tests = opts.suppress_unit_tests
    suppress_script_usage_tests = opts.suppress_script_usage_tests
    suppress_javascript_unit_tests = opts.suppress_javascript_unit_tests

    # since the test data is in the tests folder just add scripts_test_data
    ili_test_data_dir = join(abspath(dirname(__file__)),
                             'scripts_test_data/')

    # offer the option for the user to pass the scripts dir from the command
    # line since there is no other way to get the scripts dir. If not
    # provided, the base structure of the repository will be assumed. Note
    # that for both cases we are using absolute paths to avoid unwanted
    # failures.
    if opts.ili_scripts_dir is None:
        ili_scripts_dir = abspath(join(get_ili_project_dir(), 'scripts/'))
    else:
        ili_scripts_dir = abspath(opts.ili_scripts_dir)

    # sanity check
    if (suppress_unit_tests and suppress_script_usage_tests and
            suppress_javascript_unit_tests):
        option_parser.error("All tests have been suppressed. Nothing to run.")

    test_dir = abspath(dirname(__file__))

    unittest_good_pattern = re.compile(r'OK\s*$')
    application_not_found_pattern = re.compile('ApplicationNotFoundError')
    python_name = 'python'
    bad_tests = []
    missing_application_tests = []

    # Run through all of ili's unit tests and keep track of any files that
    # fail; note that these are the unit tests only
    if not suppress_unit_tests:
        unittest_names = []
        if not unittest_glob:
            for root, dirs, files in walk(test_dir):
                for name in files:
                    if name.startswith('test_') and name.endswith('.py'):
                        unittest_names.append(join(root, name))
        else:
            for fp in glob(unittest_glob):
                fn = split(fp)[1]
                if fn.startswith('test_') and fn.endswith('.py'):
                    unittest_names.append(abspath(fp))

        unittest_names.sort()

        for unittest_name in unittest_names:
            print "Testing %s:\n" % unittest_name
            command = '%s %s -v' % (python_name, unittest_name)
            stdout, stderr, return_value = qcli_system_call(command)
            print stderr
            if not unittest_good_pattern.search(stderr):
                if application_not_found_pattern.search(stderr):
                    missing_application_tests.append(unittest_name)
                else:
                    bad_tests.append(unittest_name)

    script_usage_failures = 0

    # choose to run some of the script usage tests or all the available ones
    if (not suppress_script_usage_tests and exists(ili_test_data_dir) and
            exists(ili_scripts_dir)):
        if script_usage_tests is not None:
            script_tests = script_usage_tests.split(',')
        else:
            script_tests = None

        initial_working_directory = getcwd()

        # Run the script usage testing functionality; note that depending on
        # the module where this was imported, the names of the arguments
        # will change, hence the argument names are spelled out here
        script_usage_result_summary, script_usage_failures = \
            run_script_usage_tests(ili_test_data_dir,  # test_data_dir
                                   ili_scripts_dir,    # scripts_dir
                                   temp_filepath,      # working_dir
                                   True,               # verbose
                                   script_tests,       # tests
                                   None,               # failure_log_fp
                                   False)              # force_overwrite

        # running the script usage tests changes the current working
        # directory, so restore it
        chdir(initial_working_directory)

    if not suppress_javascript_unit_tests:
        runner = join(test_dir, 'javascript_tests', 'runner.js')
        index = join(test_dir, 'javascript_tests', 'index.html')

        o, e, r = qcli_system_call('phantomjs %s %s' % (runner, index))
        if o:
            print o
        if e:
            print e

        # if all the tests passed
        javascript_tests_passed = (r == 0)
    else:
        javascript_tests_passed = True

    print "==============\nResult summary\n=============="

    if not suppress_unit_tests:
        print "\nUnit test result summary\n------------------------\n"
        if bad_tests:
            print ("\nFailed the following unit tests.\n%s"
                   % '\n'.join(bad_tests))

        if missing_application_tests:
            print ("\nFailed the following unit tests, in part or whole due "
                   "to missing external applications.\nDepending on the "
                   "ili features you plan to use, this may not be "
                   "critical.\n%s" % '\n'.join(missing_application_tests))

        if not (missing_application_tests or bad_tests):
            print "\nAll unit tests passed.\n"

    if not suppress_script_usage_tests:
        if exists(ili_test_data_dir) and exists(ili_scripts_dir):
            print ("\nScript usage test result summary"
                   "\n--------------------------------\n")
            print script_usage_result_summary
        else:
            print ("\nCould not run script usage tests.\nThe ili scripts "
                   "directory could not be automatically located, try "
                   "supplying it manually using the --ili_scripts_dir "
                   "option.")

    if not suppress_javascript_unit_tests:
        print ('\nJavaScript unit tests result summary\n'
               '------------------------------------\n')
        if javascript_tests_passed:
            print 'All JavaScript unit tests passed.\n'
        else:
            print 'JavaScript unit tests failed, check the summary above.'

    # exit with a return code of 0 only if there were no failures of any type
    return_code = 1
    if (len(bad_tests) == 0 and len(missing_application_tests) == 0 and
            script_usage_failures == 0 and javascript_tests_passed):
        return_code = 0
    return return_code
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments
    pct_variation_below_one = opts.pct_variation_below_one

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verify that at least three axes were requested
    if number_of_axes < 3:
        option_parser.error('You need to plot at least 3 axes.')

    # verify that the number of segments is within the supported range
    if not (4 <= number_of_segments <= 14):
        option_parser.error('number_of_segments should be between 4 and 14.')

    # header names that the script didn't find in the mapping file, according
    # to different criteria, are appended to the following variables
    offending_fields = []
    non_numeric_categories = []

    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if custom_axes is not None and isdir(input_coords):
        if custom_axes.count(',') > 0:
            option_parser.error('Jackknifed plots are limited to one custom '
                                'axis, currently trying to use: %s. Make '
                                'sure you use only one.' % custom_axes)

    # make sure the flag is not misunderstood from the command line interface
    if not isdir(input_coords) and compare_plots:
        option_parser.error("Cannot use the '--compare_plots' flag unless "
                            "the input path is a directory.")

    # before creating any output, check correct parsing of the main input
    # files
    try:
        mapping_data, header, comments = parse_mapping_file(open(map_fp, 'U'))
    except:
        option_parser.error(("The metadata mapping file '%s' does not seem "
                             "to be formatted correctly, verify the "
                             "formatting is QIIME compliant by using "
                             "validate_mapping_file.py") % map_fp)
    else:
        # use this set variable to make presence/absence checks faster
        lookup_header = set(header)
        mapping_ids = {row[0] for row in mapping_data}

    # a dir means jackknifing or coordinate-comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers = []
        coords_data = []
        coords_eigenvalues = []
        coords_pct = []

        coord_fps = guess_coordinates_files(input_coords)

        # QIIME generates folders of transformed coordinates for the
        # specific purpose of connecting all coordinates to a set of origin
        # coordinates. The name of this file is suffixed as
        # _transformed_reference.txt
        trans_suf = '_transformed_reference.txt'
        transformed = [f for f in coord_fps if f.endswith(trans_suf)]

        # this could happen and we'd rather avoid this problem
        if len(coord_fps) == 0:
            option_parser.error('Could not use any of the files in the '
                                'input directory.')

        # the master pcoa must be the first in the list of coordinates;
        # however, if the visualization is not a jackknifed plot this gets
        # ignored
        if master_pcoa and not compare_plots:
            if master_pcoa in coord_fps:  # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps  # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                sorted_filenames = sort_comparison_filenames(coord_fps)
                coord_fps = [master_pcoa] + sorted_filenames
        elif master_pcoa is None and len(transformed):
            master_pcoa = transformed[0]
            serial_comparison = False

            # Note: the following steps guarantee consistency. Remove the
            # master from the list and re-add it as the first element; the
            # rest of the files must be sorted alphabetically, so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc.
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                parsed = parse_coords(open(fp, 'U'))
            except (ValueError, QiimeParseError):
                offending_coords_fp.append(fp)

                # do not add any of the data and move along
                continue
            else:
                # pack all the data correspondingly only if it was correctly
                # parsed
                coords_headers.append(parsed[0])
                coords_data.append(parsed[1])
                coords_eigenvalues.append(parsed[2])
                coords_pct.append(parsed[3])

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            errout = ', '.join(offending_coords_fp)
            option_parser.error(("The following file(s): '%s' could not be "
                                 "parsed properly. Make sure the input "
                                 "folder only contains coordinates files.")
                                % errout)

        # check all files contain the same sample identifiers by flattening
        # the list of available sample ids and returning the sample ids that
        # are in one of the sets of sample ids but not in the globally
        # shared ids
        _coords_headers = set(flatten(coords_headers))
        _per_file_missing = [_coords_headers - set(e)
                             for e in coords_headers]
        non_shared_ids = set(flatten(_per_file_missing))
        if non_shared_ids:
            errout = ', '.join(non_shared_ids)
            option_parser.error(("The following sample identifier(s): '%s' "
                                 "are not shared between all the files. The "
                                 "files used to make a jackknifed PCoA plot "
                                 "or coordinate comparison plot (procrustes "
                                 "plot) must share all the same sample "
                                 "identifiers between each other.") % errout)

        # number of sample ids that are shared between the coords and
        # mapping files
        sids_intersection = mapping_ids.intersection(_coords_headers)

        # sample ids that are not mapped but are in the coords
        sids_difference = _coords_headers.difference(mapping_ids)

        # used to perform different validations in the script, very similar
        # to the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        try:
            parsed = parse_coords(open(input_coords, 'U'))
        # this exception was noticed when there were letters in the coords
        # file; other exceptions should be caught here and the code updated
        # accordingly
        except (ValueError, QiimeParseError):
            option_parser.error(("The PCoA file '%s' does not seem to be a "
                                 "coordinates formatted file, verify by "
                                 "manually inspecting the contents.")
                                % input_coords)
        else:
            coords_headers = parsed[0]
            coords_data = parsed[1]
            coords_eigenvalues = parsed[2]
            coords_pct = parsed[3]

        # number of sample ids that are shared between the coords and
        # mapping files
        sids_intersection = mapping_ids.intersection(coords_headers)

        # sample ids that are not mapped but are in the coords
        sids_difference = set(coords_headers).difference(mapping_ids)
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # This should really use BIOM's Table.from_tsv
            # for summarized tables the "otu_ids" are really the "lineages"
            parsed = parse_otu_table(open(taxa_fp, 'U'), count_map_f=float,
                                     remove_empty_rows=True)
        except ValueError, e:
            option_parser.error("There was a problem parsing the --taxa_fp: "
                                "%s" % e.message)
        else:
            otu_sample_ids = parsed[0]
            lineages = parsed[1]
            otu_table = parsed[2]

        # make sure there are matching sample ids with the otu table
        if not sids_intersection.issuperset(otu_sample_ids):
            option_parser.error("The sample identifiers in the OTU table "
                                "must have at least one match with the data "
                                "in the mapping file and with the "
                                "coordinates file. Verify you are using "
                                "input files that belong to the same "
                                "dataset.")
        if len(lineages) <= 1:
            option_parser.error("Contingency tables with one or fewer rows "
                                "are not supported, please try passing a "
                                "contingency table with more than one row.")
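# guess_coordinates_files replaces the inline directory listing used by the
# older variant of this script (shown further below). Based on that earlier
# comprehension, its behavior is approximately the following sketch; the
# leading underscore marks it as an illustration, not Emperor's actual
# implementation.
from os import listdir
from os.path import abspath, isdir, join

def _guess_coordinates_files(dir_path):
    # keep non-hidden regular files, and skip the procrustes results file
    # written by transform_coordinate_matrices.py
    return [join(dir_path, f) for f in listdir(dir_path)
            if not f.startswith('.') and
            not isdir(join(abspath(dir_path), f)) and
            not f.endswith('procrustes_results.txt')]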
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verify that at least three axes were requested
    if number_of_axes < 3:
        option_parser.error('You need to plot at least 3 axes.')

    # verify that the number of segments is within the supported range
    if number_of_segments < 4 or number_of_segments > 14:
        option_parser.error('number_of_segments should be between 4 and 14.')

    # header names that the script didn't find in the mapping file, according
    # to different criteria, are appended to the following variables
    offending_fields = []
    non_numeric_categories = []

    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if (custom_axes is not None and len(custom_axes.split(',')) > 1 and
            isdir(input_coords)):
        option_parser.error('Jackknifed plots are limited to one custom '
                            'axis, currently trying to use: %s. Make sure '
                            'you use only one.' % custom_axes)

    # make sure the flag is not misunderstood from the command line interface
    if not isdir(input_coords) and compare_plots:
        option_parser.error("Cannot use the '--compare_plots' flag unless "
                            "the input path is a directory.")

    # before creating any output, check correct parsing of the main input
    # files
    try:
        mapping_data, header, comments = parse_mapping_file(open(map_fp, 'U'))

        # use this set variable to make presence/absence checks faster
        lookup_header = set(header)
    except:
        option_parser.error(("The metadata mapping file '%s' does not seem "
                             "to be formatted correctly, verify the "
                             "formatting is QIIME compliant by using "
                             "check_id_map.py") % map_fp)

    # a dir means jackknifing or coordinate-comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers, coords_data, coords_eigenvalues, coords_pct = \
            [], [], [], []

        # iterate only over the non-hidden files that are not folders, and
        # ignore the procrustes results file that is generated by
        # transform_coordinate_matrices.py, suffixed procrustes_results.txt
        coord_fps = [join(input_coords, f)
                     for f in listdir(input_coords)
                     if not f.startswith('.') and
                     not isdir(join(abspath(input_coords), f)) and
                     not f.endswith('procrustes_results.txt')]

        # this could happen and we'd rather avoid this problem
        if len(coord_fps) == 0:
            option_parser.error('Could not use any of the files in the '
                                'input directory.')

        # the master pcoa must be the first in the list of coordinates;
        # however, if the visualization is not a jackknifed plot this gets
        # ignored
        if master_pcoa and not compare_plots:
            if master_pcoa in coord_fps:  # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps  # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                coord_fps = [master_pcoa] + \
                    sort_comparison_filenames(coord_fps)
        # QIIME generates folders of transformed coordinates for the
        # specific purpose of connecting all coordinates to a set of origin
        # coordinates. The name of this file is suffixed as
        # _transformed_reference.txt
        elif master_pcoa is None and len(
                [f for f in coord_fps
                 if f.endswith('_transformed_reference.txt')]):
            master_pcoa = [f for f in coord_fps
                           if f.endswith('_transformed_reference.txt')][0]
            serial_comparison = False

            # Note: the following steps guarantee consistency. Remove the
            # master from the list and re-add it as the first element; the
            # rest of the files must be sorted alphabetically, so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc.
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                _coords_headers, _coords_data, _coords_eigenvalues, \
                    _coords_pct = parse_coords(open(fp, 'U'))
            except (ValueError, QiimeParseError):
                offending_coords_fp.append(fp)

                # do not add any of the data and move along
                continue

            # pack all the data correspondingly only if it was correctly
            # parsed
            coords_headers.append(_coords_headers)
            coords_data.append(_coords_data)
            coords_eigenvalues.append(_coords_eigenvalues)
            coords_pct.append(_coords_pct)

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            option_parser.error(("The following file(s): '%s' could not be "
                                 "parsed properly. Make sure the input "
                                 "folder only contains coordinates files.")
                                % ', '.join(offending_coords_fp))

        # check all files contain the same sample identifiers by flattening
        # the list of available sample ids and returning the sample ids that
        # are in one of the sets of sample ids but not in the globally
        # shared ids
        non_shared_ids = set(sum([list(set(sum(coords_headers, [])) ^ set(e))
                                  for e in coords_headers], []))
        if non_shared_ids and len(coords_headers) > 1:
            option_parser.error(("The following sample identifier(s): '%s' "
                                 "are not shared between all the files. The "
                                 "files used to make a jackknifed PCoA plot "
                                 "or coordinate comparison plot (procrustes "
                                 "plot) must share all the same sample "
                                 "identifiers between each other.")
                                % ', '.join(list(non_shared_ids)))

        # flatten the list of lists into a 1-d list
        _coords_headers = list(set(sum(coords_headers, [])))

        # number of sample ids that are shared between the coords and
        # mapping files
        sids_intersection = list(set(zip(*mapping_data)[0]) &
                                 set(_coords_headers))

        # sample ids that are not mapped but are in the coords
        sids_difference = list(set(_coords_headers) -
                               set(zip(*mapping_data)[0]))

        # used to perform different validations in the script, very similar
        # to the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        try:
            coords_headers, coords_data, coords_eigenvalues, coords_pct = \
                parse_coords(open(input_coords, 'U'))
        # this exception was noticed when there were letters in the coords
        # file; other exceptions should be caught here and the code updated
        # accordingly
        except (ValueError, QiimeParseError):
            option_parser.error(("The PCoA file '%s' does not seem to be a "
                                 "coordinates formatted file, verify by "
                                 "manually inspecting the contents.")
                                % input_coords)

        # number of sample ids that are shared between the coords and
        # mapping files
        sids_intersection = list(set(zip(*mapping_data)[0]) &
                                 set(coords_headers))

        # sample ids that are not mapped but are in the coords
        sids_difference = list(set(coords_headers) -
                               set(zip(*mapping_data)[0]))
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # for summarized tables the "otu_ids" are really the "lineages"
            otu_sample_ids, lineages, otu_table, _ = parse_otu_table(
                open(taxa_fp, 'U'), count_map_f=float,
                remove_empty_rows=True)
        except ValueError, e:
            option_parser.error('There was a problem parsing the --taxa_fp: '
                                '%s' % e.message)

        # make sure there are matching sample ids with the otu table
        if not len(set(sids_intersection) & set(otu_sample_ids)):
            option_parser.error('The sample identifiers in the OTU table '
                                'must have at least one match with the data '
                                'in the mapping file and with the '
                                'coordinates file. Verify you are using '
                                'input files that belong to the same '
                                'dataset.')
        if len(lineages) <= 1:
            option_parser.error('Contingency tables with one or fewer rows '
                                'are not supported, please try passing a '
                                'contingency table with more than one row.')
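# The directory-handling branches above maintain a master-first invariant:
# the master file is pulled out of the listing and the remaining files are
# ordered by sort_comparison_filenames. A small illustration of that
# invariant, using sorted() as a stand-in for sort_comparison_filenames
# (whose exact ordering rules live in QIIME); the filenames follow the
# example given in the comments above.
def _master_first(master, coord_fps):
    remaining = [fp for fp in coord_fps if fp != master]
    return [master] + sorted(remaining)

assert _master_first('unifrac_transformed_reference.txt',
                     ['unifrac_transformed_q2.txt',
                      'unifrac_transformed_reference.txt',
                      'unifrac_transformed_q1.txt']) == \
    ['unifrac_transformed_reference.txt',
     'unifrac_transformed_q1.txt',
     'unifrac_transformed_q2.txt']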