def test_preprocess_otu_table_exceptions(self):
    """Check the exceptions are raised appropriately"""
    # Every argument tuple below is degenerate in a different way, but each
    # one leaves the contingency table with a single row, so the scores
    # cannot be computed and the same exception must be raised.
    degenerate_inputs = [
        # inputs contain a single row
        (self.otu_sample_ids, self.otu_table_broken, self.lineages_broken),
        # completely wrong lineages, but still a one-row contingency table
        (self.otu_sample_ids, self.otu_table_broken, [[]]),
        # empty contingency table altogether
        (self.otu_sample_ids, array([]), self.lineages_broken),
    ]

    for sample_ids, table, lineages in degenerate_inputs:
        with self.assertRaises(EmperorUnsupportedComputation):
            preprocess_otu_table(sample_ids, table, lineages, self.coords,
                                 self.coords_header, 4)
def test_preprocess_otu_table_exceptions(self):
    """Check the exceptions are raised appropriately"""
    # the inputs contain a single row, hence the taxa scores cannot be
    # computed (the raised exception aborts before anything is returned,
    # so there is no point in binding the result tuple)
    with self.assertRaises(EmperorUnsupportedComputation):
        preprocess_otu_table(self.otu_sample_ids, self.otu_table_broken,
                             self.lineages_broken, self.coords,
                             self.coords_header, 4)

    # some inputs are completely wrong but should still fail because the
    # contingency table has one row only, hence scores cannot be computed
    with self.assertRaises(EmperorUnsupportedComputation):
        preprocess_otu_table(self.otu_sample_ids, self.otu_table_broken,
                             [[]], self.coords, self.coords_header, 4)

    # an empty table must fail in exactly the same fashion
    with self.assertRaises(EmperorUnsupportedComputation):
        preprocess_otu_table(self.otu_sample_ids, array([]),
                             self.lineages_broken, self.coords,
                             self.coords_header, 4)
def test_preprocess_otu_table(self):
    """Check the coords and otu table are processed correctly"""
    # processing only the four most prevalent taxa
    o_otu_coords, o_otu_table, o_otu_lineages, o_prevalence, lines =\
        preprocess_otu_table(self.otu_sample_ids, self.otu_table,
                             self.lineages, self.coords,
                             self.coords_header, 4)

    assert_almost_equal(o_otu_coords, array([
        [-6.71083200e-02, 1.05892642e-02, -5.26801821e-03, -1.31730322e-02,
         3.15036935e-02, -1.99712144e-02, 8.14445313e-03, -1.76632227e-02,
         -4.84141987e-09],
        [9.65846961e-02, -1.64070839e-02, 1.68610695e-02, 1.03495979e-02,
         -2.26223522e-02, 2.83763737e-02, 1.76116225e-05, 2.89253284e-02,
         -4.84141987e-09],
        [-5.61881305e-04, 1.16341355e-02, -4.97196330e-02, 4.51141625e-02,
         -1.29353935e-01, -2.14114921e-02, -6.92988035e-02, -6.27730937e-02,
         -4.84141987e-09],
        [-5.70985165e-02, -1.09278921e-02, 8.49830390e-04, -1.91550282e-02,
         -4.22122952e-02, 7.75750297e-04, -1.18543093e-02, 3.31082777e-02,
         -4.84141987e-09]]))
    assert_almost_equal(o_otu_table, array([
        [0.78767123, 0.45637584, 0.22, 0.39597315, 0.41610738, 0.20945946,
         0.70068027, 0.89932886, 0.77333333],
        [0.14383562, 0.27516779, 0.65333333, 0.52348993, 0.38926174,
         0.69594595, 0.28571429, 0.0738255, 0.19333333],
        [0.03424658, 0.16107383, 0.02666667, 0.00671141, 0.14765101,
         0.01351351, 0., 0.01342282, 0.00666667],
        [0.02739726, 0.04697987, 0.02, 0.04697987, 0.01, 0.02027027,
         0.01360544, 0.01342282, 0.02666667]]))

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12
    self.assertEqual(o_otu_lineages, ['Root;k__Bacteria;p__Firmicutes',
                                      'Root;k__Bacteria;p__Bacteroidetes',
                                      'Root;k__Bacteria;p__Tenericutes',
                                      'Root;k__Bacteria;Other'])
    assert_almost_equal(o_prevalence, array([1., 0.66471926, 0.08193196,
                                             0.04374296]))
    self.assertEqual(lines, LINES)

    # tests for correct outputs of empty inputs
    o_otu_coords, o_otu_table, o_otu_lineages, o_prevalence, lines =\
        preprocess_otu_table([], [], [], self.coords, self.coords_header, 4)
    self.assertEqual(o_otu_coords, [])
    self.assertEqual(o_otu_table, [])
    self.assertEqual(o_otu_lineages, [])
    self.assertEqual(o_prevalence, [])
    self.assertEqual(lines, '')
'that are not numeric, hence not suitable for \'--custom_axes\' nor' ' for \'--add_vectors\'. Try the \'--missing_custom_axes_values\' ' 'option to fix these values.' % ', '.join(non_numeric_categories))) # process the coordinates file first, preventing the case where the custom # axes is not in the coloring categories i. e. in the --colory_by categories coords_headers, coords_data, coords_eigenvalues, coords_pct, coords_low,\ coords_high, clones = preprocess_coords_file(coords_headers,coords_data, coords_eigenvalues, coords_pct, header, mapping_data, custom_axes, jackknifing_method=jackknifing_method, is_comparison=compare_plots, pct_variation_below_one=pct_variation_below_one) # process the otu table after processing the coordinates to get custom axes # (when available) or any other change that occurred to the coordinates otu_coords, otu_table, otu_lineages, otu_prevalence, lines =\ preprocess_otu_table(otu_sample_ids, otu_table, lineages, coords_data, coords_headers, n_taxa_to_keep) # remove the columns in the mapping file that are not informative taking # into account the header names that were already authorized to be used # and take care of concatenating the fields for the && merged columns mapping_data, header = preprocess_mapping_file(mapping_data, header, color_by_column_names, unique=not add_unique_columns, clones=clones) # create the output directory before creating any other output if not isdir(opts.output_dir): makedirs(opts.output_dir) fp_out = open(join(output_dir, 'index.html'),'w') fp_out.write(emperor_autograph+'\n') fp_out.write(EMPEROR_HEADER_HTML_STRING)
option_parser.error(('The following field(s): \'%s\' contain values ' 'that are not numeric, hence not suitable for \'--custom_axes\' nor' ' for \'--add_vectors\'. Try the \'--missing_custom_axes_values\' ' 'option to fix these values.' % ', '.join(non_numeric_categories))) # process the coordinates file first, preventing the case where the custom # axes is not in the coloring categories i. e. in the --colory_by categories coords_headers, coords_data, coords_eigenvalues, coords_pct, coords_low,\ coords_high, clones = preprocess_coords_file(coords_headers,coords_data, coords_eigenvalues, coords_pct, header, mapping_data, custom_axes, jackknifing_method=jackknifing_method, is_comparison=compare_plots) # process the otu table after processing the coordinates to get custom axes # (when available) or any other change that occurred to the coordinates otu_coords, otu_table, otu_lineages, otu_prevalence, lines =\ preprocess_otu_table(otu_sample_ids, otu_table, lineages, coords_data, coords_headers, n_taxa_to_keep) # remove the columns in the mapping file that are not informative taking # into account the header names that were already authorized to be used # and take care of concatenating the fields for the && merged columns mapping_data, header = preprocess_mapping_file(mapping_data, header, color_by_column_names, unique=not add_unique_columns, clones=clones) # create the output directory before creating any other output create_dir(opts.output_dir,False) fp_out = open(join(output_dir, 'index.html'),'w') fp_out.write(emperor_autograph+'\n') fp_out.write(EMPEROR_HEADER_HTML_STRING) # write the html file
def test_preprocess_otu_table(self):
    """Check the coords and otu table are processed correctly"""
    # processing only the four most prevalent taxa
    o_otu_coords, o_otu_table, o_otu_lineages, o_prevalence, lines =\
        preprocess_otu_table(self.otu_sample_ids, self.otu_table,
                             self.lineages, self.coords,
                             self.coords_header, 4)

    assert_almost_equal(o_otu_coords, array([
        [-6.71083200e-02, 1.05892642e-02, -5.26801821e-03, -1.31730322e-02,
         3.15036935e-02, -1.99712144e-02, 8.14445313e-03, -1.76632227e-02,
         -4.84141987e-09],
        [9.65846961e-02, -1.64070839e-02, 1.68610695e-02, 1.03495979e-02,
         -2.26223522e-02, 2.83763737e-02, 1.76116225e-05, 2.89253284e-02,
         -4.84141987e-09],
        [-5.61881305e-04, 1.16341355e-02, -4.97196330e-02, 4.51141625e-02,
         -1.29353935e-01, -2.14114921e-02, -6.92988035e-02, -6.27730937e-02,
         -4.84141987e-09],
        [-5.70985165e-02, -1.09278921e-02, 8.49830390e-04, -1.91550282e-02,
         -4.22122952e-02, 7.75750297e-04, -1.18543093e-02, 3.31082777e-02,
         -4.84141987e-09]]))
    assert_almost_equal(o_otu_table, array([
        [0.78767123, 0.45637584, 0.22, 0.39597315, 0.41610738, 0.20945946,
         0.70068027, 0.89932886, 0.77333333],
        [0.14383562, 0.27516779, 0.65333333, 0.52348993, 0.38926174,
         0.69594595, 0.28571429, 0.0738255, 0.19333333],
        [0.03424658, 0.16107383, 0.02666667, 0.00671141, 0.14765101,
         0.01351351, 0., 0.01342282, 0.00666667],
        [0.02739726, 0.04697987, 0.02, 0.04697987, 0.01, 0.02027027,
         0.01360544, 0.01342282, 0.02666667]]))

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12
    self.assertEqual(o_otu_lineages, ['Root;k__Bacteria;p__Firmicutes',
                                      'Root;k__Bacteria;p__Bacteroidetes',
                                      'Root;k__Bacteria;p__Tenericutes',
                                      'Root;k__Bacteria;Other'])
    assert_almost_equal(o_prevalence, array([1., 0.66471926, 0.08193196,
                                             0.04374296]))
    self.assertEqual(lines, LINES)

    # tests for correct outputs of empty inputs
    o_otu_coords, o_otu_table, o_otu_lineages, o_prevalence, lines =\
        preprocess_otu_table([], [], [], self.coords, self.coords_header, 4)
    self.assertEqual(o_otu_coords, [])
    self.assertEqual(o_otu_table, [])
    self.assertEqual(o_otu_lineages, [])
    self.assertEqual(o_prevalence, [])
    self.assertEqual(lines, '')
def test_preprocess_otu_table(self):
    """Check the coords and otu table are processed correctly"""
    # processing only the four most prevalent taxa
    o_otu_coords, o_otu_table, o_otu_lineages, o_prevalence, lines =\
        preprocess_otu_table(self.otu_sample_ids, self.otu_table,
                             self.lineages, self.coords,
                             self.coords_header, 4)

    assert_almost_equal(o_otu_coords, array([
        [-6.71083200e-02, 1.05892642e-02, -5.26801821e-03, -1.31730322e-02,
         3.15036935e-02, -1.99712144e-02, 8.14445313e-03, -1.76632227e-02,
         -4.84141987e-09],
        [9.65846961e-02, -1.64070839e-02, 1.68610695e-02, 1.03495979e-02,
         -2.26223522e-02, 2.83763737e-02, 1.76116225e-05, 2.89253284e-02,
         -4.84141987e-09],
        [-5.61881305e-04, 1.16341355e-02, -4.97196330e-02, 4.51141625e-02,
         -1.29353935e-01, -2.14114921e-02, -6.92988035e-02, -6.27730937e-02,
         -4.84141987e-09],
        [-5.70985165e-02, -1.09278921e-02, 8.49830390e-04, -1.91550282e-02,
         -4.22122952e-02, 7.75750297e-04, -1.18543093e-02, 3.31082777e-02,
         -4.84141987e-09]]))
    assert_almost_equal(o_otu_table, array([
        [0.78767123, 0.45637584, 0.22, 0.39597315, 0.41610738, 0.20945946,
         0.70068027, 0.89932886, 0.77333333],
        [0.14383562, 0.27516779, 0.65333333, 0.52348993, 0.38926174,
         0.69594595, 0.28571429, 0.0738255, 0.19333333],
        [0.03424658, 0.16107383, 0.02666667, 0.00671141, 0.14765101,
         0.01351351, 0., 0.01342282, 0.00666667],
        [0.02739726, 0.04697987, 0.02, 0.04697987, 0.01, 0.02027027,
         0.01360544, 0.01342282, 0.02666667]]))

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12
    self.assertEqual(o_otu_lineages, ['Root;k__Bacteria;p__Firmicutes',
                                      'Root;k__Bacteria;p__Bacteroidetes',
                                      'Root;k__Bacteria;p__Tenericutes',
                                      'Root;k__Bacteria;Other'])
    assert_almost_equal(o_prevalence, array([1., 0.66471926, 0.08193196,
                                             0.04374296]))

    # Avoid string comparisons as reconciling floating point errors there
    # would be hard. The function producing this output is already tested
    # in tests/test_qiime_backports/test_biplots.py
    self.assertEqual(len(lines.split('\n')), 5)
    self.assertTrue(lines.startswith('#Taxon\tpc1\tpc2\tpc3\tpc4\tpc5\tpc6'
                                     '\tpc7\tpc8\tpc9\n'))
    # assertIn gives a much more useful failure message than
    # assertTrue(x in y)
    for lineage in o_otu_lineages:
        self.assertIn(lineage, lines)

    # tests for correct outputs of empty inputs
    o_otu_coords, o_otu_table, o_otu_lineages, o_prevalence, lines =\
        preprocess_otu_table([], [], [], self.coords, self.coords_header, 4)
    self.assertEqual(o_otu_coords, [])
    self.assertEqual(o_otu_table, [])
    self.assertEqual(o_otu_lineages, [])
    self.assertEqual(o_prevalence, [])
    self.assertEqual(lines, '')
def main():
    """Validate the input files and write the Emperor HTML visualization.

    Fixes applied in this revision (behavior-compatible):
    - ``except Exc(e):`` (invalid Python 3, raises NameError when the
      exception fires) replaced with ``except Exc as e:`` at all four sites.
    - Python-2-only ``e.message`` replaced with the exception itself.
    - ``opts.output_dir`` (``opts`` is undefined here) replaced with the
      local ``output_dir``.
    - the mapping-file parse call was sitting *outside* its try block
      (leaving an unreachable ``try: pass``); it is now inside it.
    - ``open(..., 'U')`` dropped: the 'U' mode was removed in Python 3.11
      and universal newlines are the default in text mode anyway.

    NOTE(review): ``args`` is read below but never defined -- the command
    line parsing is commented out. Confirm where the arguments are meant
    to come from before running this script. The early validation checks
    only ``print`` (they previously aborted via option_parser.error);
    confirm that is intended.
    """
    #option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = args.input_coords
    map_fp = args.map_fp
    output_dir = args.output_dir
    color_by_column_names = None  # opts.color_by
    add_unique_columns = False  # opts.add_unique_columns
    custom_axes = None  # opts.custom_axes
    ignore_missing_samples = False  # opts.ignore_missing_samples
    missing_custom_axes_values = None  # opts.missing_custom_axes_values
    jackknifing_method = 'IQR'  # opts.ellipsoid_method
    master_pcoa = None  # opts.master_pcoa
    taxa_fp = None  # opts.taxa_fp
    n_taxa_to_keep = False  # opts.n_taxa_to_keep
    biplot_fp = None  # opts.biplot_fp
    add_vectors = [None, None]  # opts.add_vectors
    verbose_output = False  # opts.verbose
    number_of_axes = 10  # opts.number_of_axes
    compare_plots = False  # opts.compare_plots
    number_of_segments = 8  # opts.number_of_segments
    pct_variation_below_one = True  # opts.pct_variation_below_one

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords,
                                                 'HTML')

    # verifying that the number of axes requested is greater than 3
    if number_of_axes < 3:
        print('You need to plot at least 3 axes.')

    # verifying that the number of segments is between the desired range
    if not (4 <= number_of_segments <= 14):
        print('number_of_segments should be between 4 and 14.')

    # append headernames that the script didn't find in the mapping file
    # according to different criteria to the following variables
    offending_fields = []
    non_numeric_categories = []

    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if custom_axes is not None and isdir(input_coords):
        if custom_axes.count(',') > 0:
            print('Jackknifed plots are limited to one custom '
                  'axis, currently trying to use: %s. Make '
                  'sure you use only one.' % custom_axes)

    # make sure the flag is not misunderstood from the command line
    # interface
    if not isdir(input_coords) and compare_plots:
        print("Cannot use the '--compare_plots' flag unless the "
              "input path is a directory.")

    # before creating any output, check correct parsing of the main input
    # files
    try:
        mapping_data, header, comments = parse_mapping_file(open(map_fp))
    except Exception:
        sys.exit(("The metadata mapping file '%s' does not seem "
                  "to be formatted correctly, verify the "
                  "formatting is QIIME compliant by using "
                  "validate_mapping_file.py") % map_fp)

    # use this set variable to make presence/absensce checks faster
    lookup_header = set(header)
    mapping_ids = {row[0] for row in mapping_data}

    # dir means jackknifing or coordinate comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers = []
        coords_data = []
        coords_eigenvalues = []
        coords_pct = []

        coord_fps = guess_coordinates_files(input_coords)

        # QIIME generates folders of transformed coordinates for the
        # specific purpose of connecting all coordinates to a set of origin
        # coordinates. The name of this file is suffixed as
        # _transformed_reference.txt
        trans_suf = '_transformed_reference.txt'
        transformed = [f for f in coord_fps if f.endswith(trans_suf)]

        # this could happen and we rather avoid this problem
        if len(coord_fps) == 0:
            print('Could not use any of the files in the input '
                  'directory.')

        # the master pcoa must be the first in the list of coordinates;
        # however if the visualization is not a jackknifed plot this gets
        # ignored
        if master_pcoa and not compare_plots:
            if master_pcoa in coord_fps:  # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps  # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                sorted_filenames = sort_comparison_filenames(coord_fps)
                coord_fps = [master_pcoa] + sorted_filenames
        elif master_pcoa is None and len(transformed):
            master_pcoa = transformed[0]
            serial_comparison = False

            # Note: the following steps are to guarantee consistency.
            # remove the master from the list and re-add it as a first
            # element; the rest of the files must be sorted alphabetically
            # so the result will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt']
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                parsed = parse_coords(open(fp))
            except (ValueError, QiimeParseError):
                offending_coords_fp.append(fp)

                # do not add any of the data and move along
                continue
            else:
                # pack all the data correspondingly only if it was
                # correctly parsed
                coords_headers.append(parsed[0])
                coords_data.append(parsed[1])
                coords_eigenvalues.append(parsed[2])
                coords_pct.append(parsed[3])

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            errout = ', '.join(offending_coords_fp)
            sys.exit(("The following file(s): '%s' could not be "
                      "parsed properly. Make sure the input folder "
                      "only contains coordinates files.") % errout)

        # check all files contain the same sample identifiers by flattening
        # the list of available sample ids and returning the sample ids
        # that are in one of the sets of sample ids but not in the
        # globablly shared ids
        _coords_headers = set(flatten(coords_headers))
        _per_file_missing = [_coords_headers - set(e)
                             for e in coords_headers]
        non_shared_ids = set(flatten(_per_file_missing))
        if non_shared_ids:
            errout = ', '.join(non_shared_ids)
            sys.exit(("The following sample identifier(s): '%s' "
                      "are not shared between all the files. The "
                      "files used to make a jackknifed PCoA plot "
                      "or coordinate comparison plot (procustes "
                      "plot) must share all the same sample "
                      "identifiers between each other.") % errout)

        # number of samples ids that are shared between coords and mapping
        # files
        sids_intersection = mapping_ids.intersection(_coords_headers)

        # sample ids that are not mapped but are in the coords
        sids_difference = _coords_headers.difference(mapping_ids)

        # used to perform different validations in the script, very similar
        # for the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])
    else:
        try:
            parsed = parse_coords(open(input_coords))
        # this exception was noticed when there were letters in the coords
        # file; other exeptions should be catched here; code will be
        # updated then
        except (ValueError, QiimeParseError):
            sys.exit(("The PCoA file '%s' does not seem to be a "
                      "coordinates formatted file, verify by "
                      "manually inspecting the contents.") % input_coords)
        else:
            coords_headers = parsed[0]
            coords_data = parsed[1]
            coords_eigenvalues = parsed[2]
            coords_pct = parsed[3]

        # number of samples ids that are shared between coords and mapping
        # files
        sids_intersection = mapping_ids.intersection(coords_headers)
        # sample ids that are not mapped but are in the coords
        sids_difference = set(coords_headers).difference(mapping_ids)
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # This should really use BIOM's Table.from_tsv
            # for summarized tables the "otu_ids" are really the "lineages"
            parsed = parse_otu_table(open(taxa_fp), count_map_f=float,
                                     remove_empty_rows=True)
        except ValueError as e:
            sys.exit("There was a problem parsing the --taxa_fp: %s" % e)
        else:
            otu_sample_ids = parsed[0]
            lineages = parsed[1]
            otu_table = parsed[2]

        # make sure there are matching sample ids with the otu table
        if not sids_intersection.issuperset(otu_sample_ids):
            sys.exit("The sample identifiers in the OTU table must "
                     "have at least one match with the data in the "
                     "mapping file and with the coordinates file. "
                     "Verify you are using input files that belong "
                     "to the same dataset.")
        if len(lineages) <= 1:
            sys.exit("Contingency tables with one or fewer rows "
                     "are not supported, please try passing a "
                     "contingency table with more than one row.")
    else:
        # empty lists indicate that there was no taxa file passed in
        otu_sample_ids, lineages, otu_table = [], [], []

    # sample ids must be shared between files
    if number_intersected_sids <= 0:
        sys.exit('None of your sample identifiers match between the'
                 ' mapping file and the coordinates file. Verify '
                 'you are using a coordinates file and a mapping '
                 'file that belong to the same dataset.')

    # the intersection of the sample ids in the coords and the sample ids
    # in the mapping file must at the very least include all ids in the
    # coords file. Otherwise it isn't valid; unless
    # --ignore_missing_samples is set True
    if number_intersected_sids != required_number_of_sids:
        if ignore_missing_samples:
            # keep only the samples that are mapped in the mapping file
            coords_headers, coords_data = keep_samples_from_pcoa_data(
                coords_headers, coords_data, sids_intersection)
        else:
            message = ("The metadata mapping file has fewer sample "
                       "identifiers than the coordinates file. Verify you "
                       "are using a mapping file that contains at least "
                       "all the samples contained in the coordinates "
                       "file(s). You can force the script to ignore these "
                       "samples by passing the "
                       "'--ignore_missing_samples' flag.")
            if verbose_output:
                missing_ids = ', '.join(sids_difference)
                message += (' Offending sample identifier(s): %s.'
                            % missing_ids)
            sys.exit(message)

    # ignore samples that exist in the coords but not in the mapping file,
    # note: we're using sids_intersection so if --ignore_missing_samples is
    # enabled we account for unmapped coords, else the program will exit
    # before this point
    header, mapping_data = filter_mapping_file(mapping_data, header,
                                               sids_intersection,
                                               include_repeat_cols=True)

    # catch the errors that could occur when filling the mapping file
    # values
    if missing_custom_axes_values:
        # the fact that this uses parse_metadata_state_descriptions makes
        # the following option '-x Category:7;PH:12' to work as well as the
        # script-interface-documented '-x Category:7 -x PH:12' option
        for val in missing_custom_axes_values:
            if ':' not in val:
                sys.exit("Not valid missing value for custom "
                         "axes: %s" % val)
        _mcav = ';'.join(missing_custom_axes_values)
        try:
            mapping_data = fill_mapping_field_from_mapping_file(
                mapping_data, header, _mcav)
        except AssertionError as e:
            print(e)
        except EmperorInputFilesError as e:
            print(e)

    # check that all the required columns exist in the metadata mapping
    # file
    if color_by_column_names:
        color_by_column_names = color_by_column_names.split(',')

        # check for all the mapping fields
        for col in color_by_column_names:
            # for concatenated columns check each individual field
            parts = col.split('&&')
            offending_fields.extend(p for p in parts
                                    if p not in lookup_header)
    else:
        # if the user didn't specify the header names display everything
        color_by_column_names = [None]

    # extract a list of the custom axes provided and each element is
    # numeric
    if custom_axes:
        custom_axes = custom_axes.strip().strip("'").strip('"').split(',')

        # the MetadataMap object makes some checks easier
        map_object = MetadataMap(mapping_file_to_dict(mapping_data, header),
                                 [])
        for axis in custom_axes:
            # append the field to the error queue that it belongs to
            if axis not in lookup_header:
                offending_fields.append(axis)
                break
            # make sure this value is in the mapping file
            elif axis not in color_by_column_names:
                color_by_column_names.append(axis)
        # perform only if the for loop does not call break
        else:
            # make sure all these axes are numeric
            for axis in custom_axes:
                if not map_object.isNumericCategory(axis):
                    non_numeric_categories.append(axis)

    # make multiple checks for the add_vectors option
    if add_vectors != [None, None]:
        add_vectors = add_vectors.split(',')

        # check there are at the most two categories specified for this
        # option
        if len(add_vectors) > 2:
            print("The '--add_vectors' option can accept up to "
                  "two different fields from the mapping file; "
                  "currently trying to use %d (%s)." %
                  (len(add_vectors), ', '.join(add_vectors)))

        # make sure the field(s) exist
        for col in add_vectors:
            # concatenated fields are allowed now so check for each field
            if '&&' in col:
                for _col in col.split('&&'):
                    if _col not in lookup_header:
                        offending_fields.append(col)
                        break
                # only execute this block of code if all checked fields
                # exist
                else:
                    # make sure that if it's going to be used for vector
                    # creation it gets used for coloring and map
                    # postprocessing
                    if col not in color_by_column_names:
                        color_by_column_names.append(col)
            # if it's a column without concatenations
            elif col not in lookup_header:
                offending_fields.append(col)
                break
            else:
                # check this vector value is in the color by category
                if col not in color_by_column_names:
                    color_by_column_names.append(col)
        # perform only if the for loop does not call break
        else:
            # check that the second category is all with numeric values
            if len(add_vectors) == 2:
                map_object = MetadataMap(
                    mapping_file_to_dict(mapping_data, header), [])

                # if it has non-numeric values add it to the list of
                # offenders
                if not map_object.isNumericCategory(add_vectors[1]):
                    msg = add_vectors[1] + '(used in --add_vectors)'
                    non_numeric_categories.append(msg)
            else:
                add_vectors.append(None)

    # terminate the program for the cases where a mapping field was not
    # found or when a mapping field didn't meet the criteria of being
    # numeric
    if offending_fields:
        sys.exit("Invalid field(s) '%s'; the valid field(s) are: "
                 "'%s'" % (', '.join(offending_fields), ', '.join(header)))
    if non_numeric_categories:
        sys.exit(("The following field(s): '%s' contain values "
                  "that are not numeric, hence not suitable for "
                  "'--custom_axes' nor for '--add_vectors'. Try "
                  "the '--missing_custom_axes_values' option to "
                  "fix these values." % ', '.join(non_numeric_categories)))

    # process the coordinates file first, preventing the case where the
    # custom axes is not in the coloring categories i. e. in the
    # --colory_by categories
    preprocessed_coords = preprocess_coords_file(coords_headers,
                                                 coords_data,
                                                 coords_eigenvalues,
                                                 coords_pct, header,
                                                 mapping_data, custom_axes,
                                                 jackknifing_method,
                                                 compare_plots,
                                                 pct_variation_below_one)
    coords_headers = preprocessed_coords[0]
    coords_data = preprocessed_coords[1]
    coords_eigenvalues = preprocessed_coords[2]
    coords_pct = preprocessed_coords[3]
    coords_low = preprocessed_coords[4]
    coords_high = preprocessed_coords[5]
    clones = preprocessed_coords[6]

    # process the otu table after processing the coordinates to get custom
    # axes (when available) or any other change that occurred to the
    # coordinates
    preprocessed_otu_table = preprocess_otu_table(otu_sample_ids, otu_table,
                                                  lineages, coords_data,
                                                  coords_headers,
                                                  n_taxa_to_keep)
    otu_coords = preprocessed_otu_table[0]
    otu_table = preprocessed_otu_table[1]
    otu_lineages = preprocessed_otu_table[2]
    otu_prevalence = preprocessed_otu_table[3]
    lines = preprocessed_otu_table[4]

    # remove the columns in the mapping file that are not informative
    # taking into account the header names that were already authorized to
    # be used and take care of concatenating the fields for the && merged
    # columns
    mapping_data, header = preprocess_mapping_file(mapping_data, header,
                                                   color_by_column_names,
                                                   not add_unique_columns,
                                                   clones=clones)

    # create the output directory before creating any other output;
    # the local output_dir replaces the undefined 'opts.output_dir'
    if not isdir(output_dir):
        makedirs(output_dir)

    fp_out = open(join(output_dir, 'index.html'), 'w')
    fp_out.write(emperor_autograph + '\n')
    fp_out.write(EMPEROR_HEADER_HTML_STRING)

    # write the html file
    fp_out.write(format_mapping_file_to_js(mapping_data, header, header))

    # certain percents being explained cannot be displayed in the GUI
    try:
        fp_out.write(format_pcoa_to_js(
            coords_headers, coords_data, coords_eigenvalues, coords_pct,
            custom_axes, coords_low, coords_high,
            number_of_axes=number_of_axes,
            number_of_segments=number_of_segments))
    except EmperorLogicError as e:
        sys.exit(str(e))

    fp_out.write(format_taxa_to_js(otu_coords, otu_lineages,
                                   otu_prevalence))
    fp_out.write(format_vectors_to_js(mapping_data, header, coords_data,
                                      coords_headers, add_vectors[0],
                                      add_vectors[1]))
    fp_out.write(format_comparison_bars_to_js(coords_data, coords_headers,
                                              clones, serial_comparison))

    has_taxa = taxa_fp is not None
    has_input_coords = isdir(input_coords) and not compare_plots
    has_add_vectors = add_vectors != [None, None]
    has_clones = clones > 0
    fp_out.write(format_emperor_html_footer_string(has_taxa,
                                                   has_input_coords,
                                                   has_add_vectors,
                                                   has_clones))
    fp_out.close()
    copy_support_files(output_dir)

    # write the biplot coords in the output file if a path is passed
    if biplot_fp and taxa_fp:
        if biplot_fp.endswith('/') or isdir(biplot_fp):
            print("Do not specify a path to a new (path ending "
                  "in a slash) or existing directory for "
                  "biplot_fp. The output file will be a "
                  "tab-delimited text file.")

        # make sure this file can be created
        try:
            fd = open(biplot_fp, 'w')
        except IOError:
            sys.exit("There was a problem creating the file with "
                     "the coordinates for the biplots (%s)." % biplot_fp)
        else:
            fd.writelines(lines)
            fd.close()
def test_preprocess_otu_table(self):
    """Check the coords and otu table are processed correctly"""
    # processing only the four most prevalent taxa
    o_otu_coords, o_otu_table, o_otu_lineages, o_prevalence, lines =\
        preprocess_otu_table(self.otu_sample_ids, self.otu_table,
                             self.lineages, self.coords,
                             self.coords_header, 4)

    assert_almost_equal(o_otu_coords, array([
        [-6.71083200e-02, 1.05892642e-02, -5.26801821e-03, -1.31730322e-02,
         3.15036935e-02, -1.99712144e-02, 8.14445313e-03, -1.76632227e-02,
         -4.84141987e-09],
        [9.65846961e-02, -1.64070839e-02, 1.68610695e-02, 1.03495979e-02,
         -2.26223522e-02, 2.83763737e-02, 1.76116225e-05, 2.89253284e-02,
         -4.84141987e-09],
        [-5.61881305e-04, 1.16341355e-02, -4.97196330e-02, 4.51141625e-02,
         -1.29353935e-01, -2.14114921e-02, -6.92988035e-02, -6.27730937e-02,
         -4.84141987e-09],
        [-5.70985165e-02, -1.09278921e-02, 8.49830390e-04, -1.91550282e-02,
         -4.22122952e-02, 7.75750297e-04, -1.18543093e-02, 3.31082777e-02,
         -4.84141987e-09]]))
    assert_almost_equal(o_otu_table, array([
        [0.78767123, 0.45637584, 0.22, 0.39597315, 0.41610738, 0.20945946,
         0.70068027, 0.89932886, 0.77333333],
        [0.14383562, 0.27516779, 0.65333333, 0.52348993, 0.38926174,
         0.69594595, 0.28571429, 0.0738255, 0.19333333],
        [0.03424658, 0.16107383, 0.02666667, 0.00671141, 0.14765101,
         0.01351351, 0., 0.01342282, 0.00666667],
        [0.02739726, 0.04697987, 0.02, 0.04697987, 0.01, 0.02027027,
         0.01360544, 0.01342282, 0.02666667]]))

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12
    self.assertEqual(o_otu_lineages, ['Root;k__Bacteria;p__Firmicutes',
                                      'Root;k__Bacteria;p__Bacteroidetes',
                                      'Root;k__Bacteria;p__Tenericutes',
                                      'Root;k__Bacteria;Other'])
    assert_almost_equal(o_prevalence, array([1., 0.66471926, 0.08193196,
                                             0.04374296]))

    # Avoid string comparisons as reconciling floating point errors there
    # would be hard. The function producing this output is already tested
    # in tests/test_qiime_backports/test_biplots.py
    self.assertEqual(len(lines.split('\n')), 5)
    self.assertTrue(lines.startswith('#Taxon\tpc1\tpc2\tpc3\tpc4\tpc5\tpc6'
                                     '\tpc7\tpc8\tpc9\n'))
    # assertIn gives a much more useful failure message than
    # assertTrue(x in y)
    for lineage in o_otu_lineages:
        self.assertIn(lineage, lines)

    # tests for correct outputs of empty inputs
    o_otu_coords, o_otu_table, o_otu_lineages, o_prevalence, lines =\
        preprocess_otu_table([], [], [], self.coords, self.coords_header, 4)
    self.assertEqual(o_otu_coords, [])
    self.assertEqual(o_otu_table, [])
    self.assertEqual(o_otu_lineages, [])
    self.assertEqual(o_prevalence, [])
    self.assertEqual(lines, '')