Example #1
    def test_parse_mapping_file(self):
        """parse_mapping_file functions as expected"""
        s1 = ['#sample\ta\tb', '#comment line to skip',\
              'x \t y \t z ', ' ', '#more skip', 'i\tj\tk']
        exp = ([['x','y','z'],['i','j','k']],\
               ['sample','a','b'],\
               ['comment line to skip','more skip'])
        obs = parse_mapping_file(s1)
        self.assertEqual(obs, exp)

        # We don't currently support this, but we should soon...
        # # check that first non-comment, non-blank line is used as
        # # header
        # s1 = ['sample\ta\tb', '#comment line to skip',\
        #       'x \t y \t z ', ' ', '#more skip', 'i\tj\tk']
        # exp = ([['x','y','z'],['i','j','k']],\
        #        ['sample','a','b'],\
        #        ['comment line to skip','more skip'])
        # obs = parse_mapping_file(s1)
        # self.assertEqual(obs, exp)

        # check that we strip double quotes by default
        s2 = ['#sample\ta\tb', '#comment line to skip',\
              '"x "\t" y "\t z ', ' ', '"#more skip"', 'i\t"j"\tk']
        obs = parse_mapping_file(s2)
        self.assertEqual(obs, exp)
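
For orientation, here is a minimal sketch of the parsing behavior the assertions above imply. The name parse_mapping_file_sketch is hypothetical and the real QIIME/emperor function takes extra options (e.g. toggling quote stripping), so treat this as a reading aid rather than the actual implementation.

def parse_mapping_file_sketch(lines):
    """Split tab-separated lines, stripping whitespace and double quotes;
    the first '#' line is the header, later '#' lines are comments."""
    header, data, comments = [], [], []
    for line in lines:
        line = line.strip().replace('"', '')
        if not line:
            continue  # skip blank lines such as ' '
        if line.startswith('#'):
            if not header:
                header = [f.strip() for f in line[1:].split('\t')]
            else:
                comments.append(line[1:])
        else:
            data.append([f.strip() for f in line.split('\t')])
    return data, header, comments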
def load_mf(fn):
    import pandas as pd
    from skbio.io.util import open_file
    from emperor.qiime_backports.parse import parse_mapping_file
    with open_file(fn) as f:
        mapping_data, header, _ = parse_mapping_file(f)
        _mapping_file = pd.DataFrame(mapping_data, columns=header)
        _mapping_file.set_index('SampleID', inplace=True)
    return _mapping_file
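
A quick usage sketch (the path and the 'Treatment' column are hypothetical; the mapping file is assumed to start with a '#SampleID' header line, so that 'SampleID' is a column after parsing):

mf = load_mf('mapping_file.txt')
print(mf['Treatment'])  # metadata values indexed by SampleID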
    def test_mapping_file_to_dict(self):
        """mapping_file_to_dict functions as expected"""
        s1 = ["#sample\ta\tb", "#comment line to skip", "x \t y \t z ", " ", "#more skip", "i\tj\tk"]
        exp = ([["x", "y", "z"], ["i", "j", "k"]], ["sample", "a", "b"], ["comment line to skip", "more skip"])
        mapres = parse_mapping_file(s1)  # map_data, header, comments
        mapdict = mapping_file_to_dict(*mapres[:2])
        expdict = {"x": {"a": "y", "b": "z"}, "i": {"a": "j", "b": "k"}}
        self.assertEqual(mapdict, expdict)
Example #5
    def test_mapping_file_to_dict(self):
        """mapping_file_to_dict functions as expected"""
        s1 = ['#sample\ta\tb', '#comment line to skip',
              'x \t y \t z ', ' ', '#more skip', 'i\tj\tk']
        exp = ([['x', 'y', 'z'], ['i', 'j', 'k']],
               ['sample', 'a', 'b'],
               ['comment line to skip', 'more skip'])
        mapres = parse_mapping_file(s1)  # map_data, header, comments
        mapdict = mapping_file_to_dict(*mapres[:2])
        expdict = {'x': {'a': 'y', 'b': 'z'}, 'i': {'a': 'j', 'b': 'k'}}
        self.assertEqual(mapdict, expdict)
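
A minimal sketch consistent with the expected dictionary above; mapping_file_to_dict_sketch is a hypothetical stand-in for the emperor/QIIME helper, which may differ in details.

def mapping_file_to_dict_sketch(map_data, header):
    """Index each row by its first column (the sample id) and map the
    remaining header fields to that row's values."""
    return dict((row[0], dict(zip(header[1:], row[1:]))) for row in map_data)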
def sample_ids_from_metadata_description(mapping_f, valid_states_str):
    """ Given a description of metadata, return the corresponding sample ids
    """
    map_data, map_header, map_comments = parse_mapping_file(mapping_f)
    valid_states = parse_metadata_state_descriptions(valid_states_str)
    sample_ids = get_sample_ids(map_data, map_header, valid_states)

    if len(sample_ids) < 1:
        raise ValueError("All samples have been filtered out for the criteria"
                         " described in the valid states")

    return sample_ids
Example #7
def sample_ids_from_metadata_description(mapping_f, valid_states_str):
    """ Given a description of metadata, return the corresponding sample ids
    """
    map_data, map_header, map_comments = parse_mapping_file(mapping_f)
    valid_states = parse_metadata_state_descriptions(valid_states_str)
    sample_ids = get_sample_ids(map_data, map_header, valid_states)

    if len(sample_ids) < 1:
        raise ValueError,"All samples have been filtered out for the criteria"+\
            " described in the valid states"

    return sample_ids
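
A usage sketch, assuming the QIIME-style 'Field:value' syntax that parse_metadata_state_descriptions accepts (e.g. 'Treatment:Control'); the sample data here is made up.

from StringIO import StringIO  # Python 2, matching the code above

mapping = StringIO('#SampleID\tTreatment\n'
                   'PC.354\tControl\n'
                   'PC.607\tFast\n')
# keep only samples whose Treatment state is Control
ids = sample_ids_from_metadata_description(mapping, 'Treatment:Control')
# expected to contain 'PC.354' only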
    def setUp(self):
        self.map_str1 = map_str1
        self.map_str2 = map_str2.split('\n')
        self.map_data, self.map_headers, self.map_comments = \
            parse_mapping_file(StringIO(self.map_str1))
        self.tutorial_mapping_f = StringIO(tutorial_mapping_f)

        # For sample_ids_from_category_state_coverage() tests.
        self.exp_empty = (set([]), 0, set([]))
        self.exp_all = (set(['PC.354', 'PC.355', 'PC.356', 'PC.481', 'PC.593',
                             'PC.607', 'PC.634', 'PC.635', 'PC.636']), 6,
                        set(['Control', 'Fast']))
    def test_parse_mapping_file(self):
        """parse_mapping_file functions as expected"""
        s1 = ["#sample\ta\tb", "#comment line to skip", "x \t y \t z ", " ", "#more skip", "i\tj\tk"]
        exp = ([["x", "y", "z"], ["i", "j", "k"]], ["sample", "a", "b"], ["comment line to skip", "more skip"])
        obs = parse_mapping_file(s1)
        self.assertEqual(obs, exp)

        # We don't currently support this, but we should soon...
        # # check that first non-comment, non-blank line is used as
        # # header
        # s1 = ['sample\ta\tb', '#comment line to skip',\
        #       'x \t y \t z ', ' ', '#more skip', 'i\tj\tk']
        # exp = ([['x','y','z'],['i','j','k']],\
        #        ['sample','a','b'],\
        #        ['comment line to skip','more skip'])
        # obs = parse_mapping_file(s1)
        # self.assertEqual(obs, exp)

        # check that we strip double quotes by default
        s2 = ["#sample\ta\tb", "#comment line to skip", '"x "\t" y "\t z ', " ", '"#more skip"', 'i\t"j"\tk']
        obs = parse_mapping_file(s2)
        self.assertEqual(obs, exp)
Example #10
def filter_mapping_file_from_mapping_f(mapping_f, sample_ids_to_keep, negate=False):
    """ Filter rows from a metadata mapping file """
    mapping_data, header, comments = parse_mapping_file(mapping_f)
    filtered_mapping_data = []
    # a dict built with fromkeys is used only for fast membership checks
    sample_ids_to_keep = {}.fromkeys(sample_ids_to_keep)

    for mapping_datum in mapping_data:
        hit = mapping_datum[0] in sample_ids_to_keep
        if hit and not negate:
            filtered_mapping_data.append(mapping_datum)
        elif not hit and negate:
            filtered_mapping_data.append(mapping_datum)
        else:
            pass
    return format_mapping_file(header, filtered_mapping_data)
Example #11
def filter_mapping_file_from_mapping_f(mapping_f,
                                       sample_ids_to_keep,
                                       negate=False):
    """ Filter rows from a metadata mapping file """
    mapping_data, header, comments = parse_mapping_file(mapping_f)
    filtered_mapping_data = []
    sample_ids_to_keep = {}.fromkeys(sample_ids_to_keep)

    for mapping_datum in mapping_data:
        hit = mapping_datum[0] in sample_ids_to_keep
        if hit and not negate:
            filtered_mapping_data.append(mapping_datum)
        elif not hit and negate:
            filtered_mapping_data.append(mapping_datum)
        else:
            pass
    return format_mapping_file(header, filtered_mapping_data)
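
A round-trip sketch for filter_mapping_file_from_mapping_f: parse_mapping_file accepts a list of lines (as in the tests above) and format_mapping_file is assumed to render the filtered rows back into mapping-file text. The sample data is made up.

mapping_lines = ['#SampleID\tTreatment',
                 'PC.354\tControl',
                 'PC.355\tControl',
                 'PC.607\tFast']

# keep two of the three samples; negate=True would drop them instead
filtered = filter_mapping_file_from_mapping_f(mapping_lines,
                                              ['PC.354', 'PC.607'])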
Example #12
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments
    pct_variation_below_one = opts.pct_variation_below_one

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verifying that the number of axes requested is greater than 3
    if number_of_axes < 3:
        option_parser.error(('You need to plot at least 3 axes.'))

    # verifying that the number of segments is between the desired range
    if number_of_segments < 4 or number_of_segments > 14:
        option_parser.error(('number_of_segments should be between 4 and 14.'))

    # append headernames that the script didn't find in the mapping file
    # according to different criteria to the following variables
    offending_fields = []
    non_numeric_categories = []

    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if custom_axes is not None and len(custom_axes.split(',')) > 1 and \
            isdir(input_coords):
        option_parser.error(('Jackknifed plots are limited to one custom axis, '
            'currently trying to use: %s. Make sure you use only one.' %
            custom_axes))

    # make sure the flag is not misunderstood from the command line interface
    if not isdir(input_coords) and compare_plots:
        option_parser.error('Cannot use the \'--compare_plots\' flag unless the'
            ' input path is a directory.')

    # before creating any output, check correct parsing of the main input files
    try:
        mapping_data, header, comments = parse_mapping_file(open(map_fp,'U'))

        # use this set variable to make presence/absence checks faster
        lookup_header = set(header)
    except:
        option_parser.error(('The metadata mapping file \'%s\' does not seem '
            'to be formatted correctly, verify the formatting is QIIME '
            'compliant by using check_id_map.py') % map_fp)

    # dir means jackknifing or coordinate comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers, coords_data, coords_eigenvalues, coords_pct = \
            [], [], [], []

        # iterate only over the non-hidden files (not folders) and ignore the
        # procrustes results file that is generated by
        # transform_coordinate_matrices.py (suffixed procrustes_results.txt)
        coord_fps = [join(input_coords, f) for f in listdir(input_coords) if
            not f.startswith('.') and not isdir(join(abspath(input_coords),f))
            and not f.endswith('procrustes_results.txt')]

        # this could happen and we rather avoid this problem
        if len(coord_fps) == 0:
            option_parser.error('Could not use any of the files in the input '
                'directory.')

        # the master pcoa must be the first in the list of coordinates; however
        # if the visualization is not a jackknifed plot this gets ignored
        if master_pcoa and not compare_plots:
            if master_pcoa in coord_fps: # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        # QIIME generates folders of transformed coordinates for the specific
        # purpose of connecting all coordinates to a set of origin coordinates.
        # The name of this file is suffixed as _transformed_reference.txt
        elif master_pcoa is None and len([f for f in coord_fps if f.endswith(
            '_transformed_reference.txt')]):
            master_pcoa = [f for f in coord_fps if f.endswith(
                '_transformed_reference.txt')][0]
            serial_comparison = False

            # Note: the following steps are to guarantee consistency.
            # remove the master from the list and re-add it as a first element
            # the rest of the files must be sorted alphabetically so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                _coords_headers, _coords_data, _coords_eigenvalues, _coords_pct = \
                    parse_coords(open(fp, 'U'))
            except (ValueError, QiimeParseError):
                offending_coords_fp.append(fp)

                # do not add any of the data and move along
                continue

            # pack all the data correspondingly only if it was correctly parsed
            coords_headers.append(_coords_headers)
            coords_data.append(_coords_data)
            coords_eigenvalues.append(_coords_eigenvalues)
            coords_pct.append(_coords_pct)

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            option_parser.error(('The following file(s): \'%s\' could not be '
                'parsed properly. Make sure the input folder only contains '
                'coordinates files.') % ', '.join(offending_coords_fp))

        # check all files contain the same sample identifiers by flattening the
        # list of available sample ids and returning the sample ids that are
        # in one of the sets of sample ids but not in the globally shared ids
        non_shared_ids = set(sum([list(set(sum(coords_headers, []))^set(e))
            for e in coords_headers],[]))
        if non_shared_ids and len(coords_headers) > 1:
            option_parser.error(('The following sample identifier(s): \'%s\' '
                'are not shared between all the files. The files used to '
                'make a jackknifed PCoA plot or coordinate comparison plot ('
                'procrustes plot) must share all the same sample identifiers '
                'between each other.') % ', '.join(list(non_shared_ids)))

        # flatten the list of lists into a 1-d list
        _coords_headers = list(set(sum(coords_headers, [])))

        # number of sample ids that are shared between coords and mapping files
        sids_intersection = list(
            set(zip(*mapping_data)[0]) & set(_coords_headers))

        # sample ids that are not mapped but are in the coords
        sids_difference = list(
            set(_coords_headers) - set(zip(*mapping_data)[0]))

        # used to perform different validations in the script, very similar for
        # the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        try:
            coords_headers, coords_data, coords_eigenvalues, coords_pct =\
                parse_coords(open(input_coords,'U'))
        # this exception was noticed when there were letters in the coords file
        # other exceptions should be caught here; code will be updated then
        except (ValueError, QiimeParseError):
            option_parser.error(('The PCoA file \'%s\' does not seem to be a '
                'coordinates formatted file, verify by manually inspecting '
                'the contents.') % input_coords)

        # number of sample ids that are shared between coords and mapping files
        sids_intersection = list(
            set(zip(*mapping_data)[0]) & set(coords_headers))
        # sample ids that are not mapped but are in the coords
        sids_difference = list(
            set(coords_headers) - set(zip(*mapping_data)[0]))
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # for summarized tables the "otu_ids" are really the "lineages"
            otu_sample_ids, lineages, otu_table, _ = parse_otu_table(open(
                taxa_fp, 'U'), count_map_f=float, remove_empty_rows=True)
        except ValueError as e:
            option_parser.error('There was a problem parsing the --taxa_fp: %s'
                % e.message)

        # make sure there are matching sample ids with the otu table
        if not set(sids_intersection) & set(otu_sample_ids):
            option_parser.error('The sample identifiers in the OTU table must '
                'have at least one match with the data in the mapping file and '
                'with the coordinates file. Verify you are using input files '
                'that belong to the same dataset.')
        if len(lineages) <= 1:
            option_parser.error('Contingency tables with one or fewer rows are '
                'not supported, please try passing a contingency table with '
                'more than one row.')
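
sort_comparison_filenames is used above but never defined in these snippets. A rough sketch of the ordering the inline comments describe (files such as ..._q1.txt, ..., ..._q10.txt sorted by their trailing number rather than lexicographically); Emperor's actual helper may behave differently.

import re

def sort_comparison_filenames_sketch(coord_fps):
    def trailing_number(fp):
        match = re.search(r'(\d+)\.[^.]+$', fp)
        # files without a trailing number sort first, then numeric order
        return (int(match.group(1)) if match else -1, fp)
    return sorted(coord_fps, key=trailing_number)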
Example #13
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments
    pct_variation_below_one = opts.pct_variation_below_one

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verifying that the number of axes requested is greater than 3
    if number_of_axes < 3:
        option_parser.error(('You need to plot at least 3 axes.'))

    # verifying that the number of segments is between the desired range
    if not (4 <= number_of_segments <= 14):
        option_parser.error(('number_of_segments should be between 4 and 14.'))

    # append headernames that the script didn't find in the mapping file
    # according to different criteria to the following variables
    offending_fields = []
    non_numeric_categories = []

    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if custom_axes is not None and isdir(input_coords):
        if custom_axes.count(',') > 0:
            option_parser.error(('Jackknifed plots are limited to one custom '
                                 'axis, currently trying to use: %s. Make '
                                 'sure you use only one.' % custom_axes))

    # make sure the flag is not misunderstood from the command line interface
    if not isdir(input_coords) and compare_plots:
        option_parser.error("Cannot use the '--compare_plots' flag unless the "
                            "input path is a directory.")

    # before creating any output, check correct parsing of the main input files
    try:
        mapping_data, header, comments = parse_mapping_file(open(map_fp, 'U'))

    except:
        option_parser.error(("The metadata mapping file '%s' does not seem "
                             "to be formatted correctly, verify the "
                             "formatting is QIIME compliant by using "
                             "validate_mapping_file.py") % map_fp)
    else:
        # use this set variable to make presence/absence checks faster
        lookup_header = set(header)
        mapping_ids = {row[0] for row in mapping_data}

    # dir means jackknifing or coordinate comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers = []
        coords_data = []
        coords_eigenvalues = []
        coords_pct = []

        coord_fps = guess_coordinates_files(input_coords)

        # QIIME generates folders of transformed coordinates for the specific
        # purpose of connecting all coordinates to a set of origin coordinates.
        # The name of this file is suffixed as _transformed_reference.txt
        trans_suf = '_transformed_reference.txt'
        transformed = [f for f in coord_fps if f.endswith(trans_suf)]

        # this could happen and we rather avoid this problem
        if len(coord_fps) == 0:
            option_parser.error('Could not use any of the files in the input '
                                'directory.')

        # the master pcoa must be the first in the list of coordinates; however
        # if the visualization is not a jackknifed plot this gets ignored
        if master_pcoa and not compare_plots:
            if master_pcoa in coord_fps:  # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps  # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                sorted_filenames = sort_comparison_filenames(coord_fps)
                coord_fps = [master_pcoa] + sorted_filenames

        elif master_pcoa is None and len(transformed):
            master_pcoa = transformed[0]
            serial_comparison = False

            # Note: the following steps are to guarantee consistency.
            # remove the master from the list and re-add it as a first element
            # the rest of the files must be sorted alphabetically so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                parsed = parse_coords(open(fp, 'U'))
            except (ValueError, QiimeParseError):
                offending_coords_fp.append(fp)

                # do not add any of the data and move along
                continue
            else:
                # pack all the data correspondingly only if it was correctly
                # parsed
                coords_headers.append(parsed[0])
                coords_data.append(parsed[1])
                coords_eigenvalues.append(parsed[2])
                coords_pct.append(parsed[3])

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            errout = ', '.join(offending_coords_fp)
            option_parser.error(("The following file(s): '%s' could not be "
                                 "parsed properly. Make sure the input folder "
                                 "only contains coordinates files.") % errout)

        # check all files contain the same sample identifiers by flattening the
        # list of available sample ids and returning the sample ids that are
        # in one of the sets of sample ids but not in the globally shared ids
        _coords_headers = set(flatten(coords_headers))
        _per_file_missing = [_coords_headers - set(e) for e in coords_headers]
        non_shared_ids = set(flatten(_per_file_missing))
        if non_shared_ids:
            errout = ', '.join(non_shared_ids)
            option_parser.error(("The following sample identifier(s): '%s' "
                                 "are not shared between all the files. The "
                                 "files used to make a jackknifed PCoA plot "
                                 "or coordinate comparison plot (procustes "
                                 "plot) must share all the same sample "
                                 "identifiers between each other.") % errout)

        # number of sample ids that are shared between coords and mapping
        # files
        sids_intersection = mapping_ids.intersection(_coords_headers)

        # sample ids that are not mapped but are in the coords
        sids_difference = _coords_headers.difference(mapping_ids)

        # used to perform different validations in the script, very similar for
        # the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        try:
            parsed = parse_coords(open(input_coords, 'U'))
        # this exception was noticed when there were letters in the coords file
        # other exceptions should be caught here; code will be updated then
        except (ValueError, QiimeParseError):
            option_parser.error(
                ("The PCoA file '%s' does not seem to be a "
                 "coordinates formatted file, verify by "
                 "manually inspecting the contents.") % input_coords)
        else:
            coords_headers = parsed[0]
            coords_data = parsed[1]
            coords_eigenvalues = parsed[2]
            coords_pct = parsed[3]

        # number of sample ids that are shared between coords and mapping
        # files
        sids_intersection = mapping_ids.intersection(coords_headers)

        # sample ids that are not mapped but are in the coords
        sids_difference = set(coords_headers).difference(mapping_ids)

        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # This should really use BIOM's Table.from_tsv
            # for summarized tables the "otu_ids" are really the "lineages"
            parsed = parse_otu_table(open(taxa_fp, 'U'),
                                     count_map_f=float,
                                     remove_empty_rows=True)
        except ValueError as e:
            option_parser.error(("There was a problem parsing the --taxa_fp: "
                                 "%s" % e.message))
        else:
            otu_sample_ids = parsed[0]
            lineages = parsed[1]
            otu_table = parsed[2]

        # make sure there are matching sample ids with the otu table
        if not sids_intersection.issuperset(otu_sample_ids):
            option_parser.error("The sample identifiers in the OTU table must "
                                "have at least one match with the data in the "
                                "mapping file and with the coordinates file. "
                                "Verify you are using input files that belong "
                                "to the same dataset.")
        if len(lineages) <= 1:
            option_parser.error("Contingency tables with one or fewer rows "
                                "are not supported, please try passing a "
                                "contingency table with more than one row.")
Example #14
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments
    pct_variation_below_one = opts.pct_variation_below_one

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verifying that the number of axes requested is greater than 3
    if number_of_axes < 3:
        option_parser.error(('You need to plot at least 3 axes.'))

    # verifying that the number of segments is between the desired range
    if not (4 <= number_of_segments <= 14):
        option_parser.error(('number_of_segments should be between 4 and 14.'))

    # append headernames that the script didn't find in the mapping file
    # according to different criteria to the following variables
    offending_fields = []
    non_numeric_categories = []

    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if custom_axes is not None and isdir(input_coords):
        if custom_axes.count(',') > 0:
            option_parser.error(('Jackknifed plots are limited to one custom '
                                 'axis, currently trying to use: %s. Make '
                                 'sure you use only one.' % custom_axes))

    # make sure the flag is not misunderstood from the command line interface
    if not isdir(input_coords) and compare_plots:
        option_parser.error("Cannot use the '--compare_plots' flag unless the "
                            "input path is a directory.")

    # before creating any output, check correct parsing of the main input files
    try:
        mapping_data, header, comments = parse_mapping_file(open(map_fp, 'U'))

    except:
        option_parser.error(("The metadata mapping file '%s' does not seem "
                             "to be formatted correctly, verify the "
                             "formatting is QIIME compliant by using "
                             "validate_mapping_file.py") % map_fp)
    else:
        # use this set variable to make presence/absence checks faster
        lookup_header = set(header)
        mapping_ids = {row[0] for row in mapping_data}

    # dir means jackknifing or coordinate comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers = []
        coords_data = []
        coords_eigenvalues = []
        coords_pct = []

        coord_fps = guess_coordinates_files(input_coords)

        # QIIME generates folders of transformed coordinates for the specific
        # purpose of connecting all coordinates to a set of origin coordinates.
        # The name of this file is suffixed as _transformed_reference.txt
        trans_suf = '_transformed_reference.txt'
        transformed = [f for f in coord_fps if f.endswith(trans_suf)]

        # this could happen and we rather avoid this problem
        if len(coord_fps) == 0:
            option_parser.error('Could not use any of the files in the input '
                                'directory.')

        # the master pcoa must be the first in the list of coordinates; however
        # if the visualization is not a jackknifed plot this gets ignored
        if master_pcoa and not compare_plots:
            if master_pcoa in coord_fps:  # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps  # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                sorted_filenames = sort_comparison_filenames(coord_fps)
                coord_fps = [master_pcoa] + sorted_filenames

        elif master_pcoa is None and len(transformed):
            master_pcoa = transformed[0]
            serial_comparison = False

            # Note: the following steps are to guarantee consistency.
            # remove the master from the list and re-add it as a first element
            # the rest of the files must be sorted alphabetically so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                parsed = parse_coords(open(fp, 'U'))
            except (ValueError, QiimeParseError):
                offending_coords_fp.append(fp)

                # do not add any of the data and move along
                continue
            else:
                # pack all the data correspondingly only if it was correctly
                # parsed
                coords_headers.append(parsed[0])
                coords_data.append(parsed[1])
                coords_eigenvalues.append(parsed[2])
                coords_pct.append(parsed[3])

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            errout = ', '.join(offending_coords_fp)
            option_parser.error(("The following file(s): '%s' could not be "
                                 "parsed properly. Make sure the input folder "
                                 "only contains coordinates files.") % errout)

        # check all files contain the same sample identifiers by flattening the
        # list of available sample ids and returning the sample ids that are
        # in one of the sets of sample ids but not in the globally shared ids
        _coords_headers = set(flatten(coords_headers))
        _per_file_missing = [_coords_headers - set(e) for e in coords_headers]
        non_shared_ids = set(flatten(_per_file_missing))
        if non_shared_ids:
            errout = ', '.join(non_shared_ids)
            option_parser.error(("The following sample identifier(s): '%s' "
                                 "are not shared between all the files. The "
                                 "files used to make a jackknifed PCoA plot "
                                 "or coordinate comparison plot (procustes "
                                 "plot) must share all the same sample "
                                 "identifiers between each other.") % errout)

        # number of sample ids that are shared between coords and mapping
        # files
        sids_intersection = mapping_ids.intersection(_coords_headers)

        # sample ids that are not mapped but are in the coords
        sids_difference = _coords_headers.difference(mapping_ids)

        # used to perform different validations in the script, very similar for
        # the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        try:
            parsed = parse_coords(open(input_coords, 'U'))
        # this exception was noticed when there were letters in the coords file
        # other exceptions should be caught here; code will be updated then
        except (ValueError, QiimeParseError):
            option_parser.error(("The PCoA file '%s' does not seem to be a "
                                 "coordinates formatted file, verify by "
                                 "manually inspecting the contents.") %
                                input_coords)
        else:
            coords_headers = parsed[0]
            coords_data = parsed[1]
            coords_eigenvalues = parsed[2]
            coords_pct = parsed[3]

        # number of sample ids that are shared between coords and mapping
        # files
        sids_intersection = mapping_ids.intersection(coords_headers)

        # sample ids that are not mapped but are in the coords
        sids_difference = set(coords_headers).difference(mapping_ids)

        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # This should really use BIOM's Table.from_tsv
            # for summarized tables the "otu_ids" are really the "lineages"
            parsed = parse_otu_table(open(taxa_fp, 'U'), count_map_f=float,
                                     remove_empty_rows=True)
        except ValueError as e:
            option_parser.error(("There was a problem parsing the --taxa_fp: "
                                 "%s" % e.message))
        else:
            otu_sample_ids = parsed[0]
            lineages = parsed[1]
            otu_table = parsed[2]

        # make sure there are matching sample ids with the otu table
        if not sids_intersection.issuperset(otu_sample_ids):
            option_parser.error("The sample identifiers in the OTU table must "
                                "have at least one match with the data in the "
                                "mapping file and with the coordinates file. "
                                "Verify you are using input files that belong "
                                "to the same dataset.")
        if len(lineages) <= 1:
            option_parser.error("Contingency tables with one or fewer rows "
                                "are not supported, please try passing a "
                                "contingency table with more than one row.")
Example #15
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = opts.input_coords
    map_fp = opts.map_fp
    output_dir = opts.output_dir
    color_by_column_names = opts.color_by
    add_unique_columns = opts.add_unique_columns
    custom_axes = opts.custom_axes
    ignore_missing_samples = opts.ignore_missing_samples
    missing_custom_axes_values = opts.missing_custom_axes_values
    jackknifing_method = opts.ellipsoid_method
    master_pcoa = opts.master_pcoa
    taxa_fp = opts.taxa_fp
    n_taxa_to_keep = opts.n_taxa_to_keep
    biplot_fp = opts.biplot_fp
    add_vectors = opts.add_vectors
    verbose_output = opts.verbose
    number_of_axes = opts.number_of_axes
    compare_plots = opts.compare_plots
    number_of_segments = opts.number_of_segments

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verifying that the number of axes requested is greater than 3
    if number_of_axes < 3:
        option_parser.error(('You need to plot at least 3 axes.'))

    # verifying that the number of segments is between the desired range
    if number_of_segments < 4 or number_of_segments > 14:
        option_parser.error(('number_of_segments should be between 4 and 14.'))

    # append headernames that the script didn't find in the mapping file
    # according to different criteria to the following variables
    offending_fields = []
    non_numeric_categories = []

    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if custom_axes is not None and len(custom_axes.split(',')) > 1 and \
            isdir(input_coords):
        option_parser.error(
            ('Jackknifed plots are limited to one custom axis, '
             'currently trying to use: %s. Make sure you use only one.' %
             custom_axes))

    # make sure the flag is not misunderstood from the command line interface
    if not isdir(input_coords) and compare_plots:
        option_parser.error(
            'Cannot use the \'--compare_plots\' flag unless the'
            ' input path is a directory.')

    # before creating any output, check correct parsing of the main input files
    try:
        mapping_data, header, comments = parse_mapping_file(open(map_fp, 'U'))

        # use this set variable to make presence/absence checks faster
        lookup_header = set(header)
    except:
        option_parser.error(
            ('The metadata mapping file \'%s\' does not seem '
             'to be formatted correctly, verify the formatting is QIIME '
             'compliant by using check_id_map.py') % map_fp)

    # dir means jackknifing or coordinate comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers, coords_data, coords_eigenvalues, coords_pct = \
            [], [], [], []

        # iterate only over the non-hidden files (not folders) and ignore the
        # procrustes results file that is generated by
        # transform_coordinate_matrices.py (suffixed procrustes_results.txt)
        coord_fps = [
            join(input_coords, f) for f in listdir(input_coords) if
            not f.startswith('.') and not isdir(join(abspath(input_coords), f))
            and not f.endswith('procrustes_results.txt')
        ]

        # this could happen and we rather avoid this problem
        if len(coord_fps) == 0:
            option_parser.error('Could not use any of the files in the input '
                                'directory.')

        # the master pcoa must be the first in the list of coordinates; however
        # if the visualization is not a jackknifed plot this gets ignored
        if master_pcoa and not compare_plots:
            if master_pcoa in coord_fps:  # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps  # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        # QIIME generates folders of transformed coordinates for the specific
        # purpose of connecting all coordinates to a set of origin coordinates.
        # The name of this file is suffixed as _transformed_reference.txt
        elif master_pcoa is None and len(
                [f for f in coord_fps
                 if f.endswith('_transformed_reference.txt')]):
            master_pcoa = [
                f for f in coord_fps
                if f.endswith('_transformed_reference.txt')][0]
            serial_comparison = False

            # Note: the following steps are to guarantee consistency.
            # remove the master from the list and re-add it as a first element
            # the rest of the files must be sorted alphabetically so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                _coords_headers, _coords_data, _coords_eigenvalues, _coords_pct = \
                    parse_coords(open(fp, 'U'))
            except (ValueError, QiimeParseError):
                offending_coords_fp.append(fp)

                # do not add any of the data and move along
                continue

            # pack all the data correspondingly only if it was correctly parsed
            coords_headers.append(_coords_headers)
            coords_data.append(_coords_data)
            coords_eigenvalues.append(_coords_eigenvalues)
            coords_pct.append(_coords_pct)

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            option_parser.error(
                ('The following file(s): \'%s\' could not be '
                 'parsed properly. Make sure the input folder only contains '
                 'coordinates files.') % ', '.join(offending_coords_fp))

        # check all files contain the same sample identifiers by flattening the
        # list of available sample ids and returning the sample ids that are
        # in one of the sets of sample ids but not in the globally shared ids
        non_shared_ids = set(
            sum([
                list(set(sum(coords_headers, [])) ^ set(e))
                for e in coords_headers
            ], []))
        if non_shared_ids and len(coords_headers) > 1:
            option_parser.error(
                ('The following sample identifier(s): \'%s\' '
                 'are not shared between all the files. The files used to '
                 'make a jackknifed PCoA plot or coordinate comparison plot ('
                 'procrustes plot) must share all the same sample identifiers '
                 'between each other.') % ', '.join(list(non_shared_ids)))

        # flatten the list of lists into a 1-d list
        _coords_headers = list(set(sum(coords_headers, [])))

        # number of sample ids that are shared between coords and mapping files
        sids_intersection = list(
            set(zip(*mapping_data)[0]) & set(_coords_headers))

        # sample ids that are not mapped but are in the coords
        sids_difference = list(
            set(_coords_headers) - set(zip(*mapping_data)[0]))

        # used to perform different validations in the script, very similar for
        # the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        try:
            coords_headers, coords_data, coords_eigenvalues, coords_pct =\
                parse_coords(open(input_coords,'U'))
        # this exception was noticed when there were letters in the coords file
        # other exceptions should be caught here; code will be updated then
        except (ValueError, QiimeParseError):
            option_parser.error(
                ('The PCoA file \'%s\' does not seem to be a '
                 'coordinates formatted file, verify by manually inspecting '
                 'the contents.') % input_coords)

        # number of sample ids that are shared between coords and mapping files
        sids_intersection = list(
            set(zip(*mapping_data)[0]) & set(coords_headers))
        # sample ids that are not mapped but are in the coords
        sids_difference = list(
            set(coords_headers) - set(zip(*mapping_data)[0]))
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # for summarized tables the "otu_ids" are really the "lineages"
            otu_sample_ids, lineages, otu_table, _ = parse_otu_table(
                open(taxa_fp, 'U'), count_map_f=float, remove_empty_rows=True)
        except ValueError as e:
            option_parser.error(
                'There was a problem parsing the --taxa_fp: %s' % e.message)

        # make sure there are matching sample ids with the otu table
        if not set(sids_intersection) & set(otu_sample_ids):
            option_parser.error(
                'The sample identifiers in the OTU table must '
                'have at least one match with the data in the mapping file and '
                'with the coordinates file. Verify you are using input files '
                'that belong to the same dataset.')
        if len(lineages) <= 1:
            option_parser.error(
                'Contingency tables with one or fewer rows are '
                'not supported, please try passing a contingency table with '
                'more than one row.')
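
The non_shared_ids expression in Examples #12 and #15 is dense; here is a small worked example of what it computes, namely the ids missing from at least one file.

coords_headers = [['s1', 's2', 's3'], ['s1', 's2'], ['s1', 's2', 's3']]

all_ids = set(sum(coords_headers, []))  # union of every file's ids
non_shared_ids = set(sum([list(all_ids ^ set(e))
                          for e in coords_headers], []))
# non_shared_ids == set(['s3']) because the second file lacks 's3'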
Example #16
def main():
    #option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_coords = args.input_coords
    map_fp = args.map_fp
    output_dir = args.output_dir
    
    color_by_column_names = None  #opts.color_by
    add_unique_columns = False      #opts.add_unique_columns
    custom_axes = None  #opts.custom_axes
    ignore_missing_samples = False      #opts.ignore_missing_samples
    missing_custom_axes_values = None   #opts.missing_custom_axes_values
    jackknifing_method = 'IQR'      #opts.ellipsoid_method
    master_pcoa = None      #opts.master_pcoa
    taxa_fp = None      #opts.taxa_fp
    n_taxa_to_keep = False      #opts.n_taxa_to_keep
    biplot_fp = None        #opts.biplot_fp
    add_vectors = [None, None]      #opts.add_vectors
    verbose_output = False      #opts.verbose
    number_of_axes = 10     #opts.number_of_axes
    compare_plots = False       #opts.compare_plots
    number_of_segments = 8      #opts.number_of_segments
    pct_variation_below_one = True  #opts.pct_variation_below_one

    # add some metadata to the output
    emperor_autograph = format_emperor_autograph(map_fp, input_coords, 'HTML')

    # verifying that the number of axes requested is greater than 3
    if number_of_axes < 3:
        print(('You need to plot at least 3 axes.'))

    # verifying that the number of segments is between the desired range
    if not (4 <= number_of_segments <= 14):
        print(('number_of_segments should be between 4 and 14.'))

    # append headernames that the script didn't find in the mapping file
    # according to different criteria to the following variables
    offending_fields = []
    non_numeric_categories = []

    serial_comparison = True

    # can't do averaged pcoa plots _and_ custom axes in the same plot
    if custom_axes is not None and isdir(input_coords):
        if custom_axes.count(',') > 0:
            print(('Jackknifed plots are limited to one custom '
                                 'axis, currently trying to use: %s. Make '
                                 'sure you use only one.' % custom_axes))

    # make sure the flag is not misunderstood from the command line interface
    if not isdir(input_coords) and compare_plots:
        print("Cannot use the '--compare_plots' flag unless the "
                            "input path is a directory.")

    # before creating any output, check correct parsing of the main input files
    try:
        mapping_data, header, comments = parse_mapping_file(open(map_fp, 'U'))
    except:
        sys.exit(("The metadata mapping file '%s' does not seem "
                  "to be formatted correctly, verify the "
                  "formatting is QIIME compliant by using "
                  "validate_mapping_file.py") % map_fp)
    else:
        # use this set variable to make presence/absence checks faster
        lookup_header = set(header)
        mapping_ids = {row[0] for row in mapping_data}

    # dir means jackknifing or coordinate comparison type of processing
    if isdir(input_coords):
        offending_coords_fp = []
        coords_headers = []
        coords_data = []
        coords_eigenvalues = []
        coords_pct = []

        coord_fps = guess_coordinates_files(input_coords)

        # QIIME generates folders of transformed coordinates for the specific
        # purpose of connecting all coordinates to a set of origin coordinates.
        # The name of this file is suffixed as _transformed_reference.txt
        trans_suf = '_transformed_reference.txt'
        transformed = [f for f in coord_fps if f.endswith(trans_suf)]

        # this could happen and we rather avoid this problem
        if len(coord_fps) == 0:
            print('Could not use any of the files in the input '
                  'directory.')

        # the master pcoa must be the first in the list of coordinates; however
        # if the visualization is not a jackknifed plot this gets ignored
        if master_pcoa and not compare_plots:
            if master_pcoa in coord_fps:  # remove it if duplicated
                coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + coord_fps  # prepend it to the list
        # passing a master file means that the comparison is not serial
        elif master_pcoa and compare_plots:
            serial_comparison = False

            # guarantee that the master is the first and is not repeated
            if master_pcoa in coord_fps:
                coord_fps.remove(master_pcoa)
                sorted_filenames = sort_comparison_filenames(coord_fps)
                coord_fps = [master_pcoa] + sorted_filenames

        elif master_pcoa is None and len(transformed):
            master_pcoa = transformed[0]
            serial_comparison = False

            # Note: the following steps are to guarantee consistency.
            # remove the master from the list and re-add it as a first element
            # the rest of the files must be sorted alphabetically so the result
            # will be: ['unifrac_transformed_reference.txt',
            # 'unifrac_transformed_q1.txt', 'unifrac_transformed_q2.txt'] etc
            coord_fps.remove(master_pcoa)
            coord_fps = [master_pcoa] + sort_comparison_filenames(coord_fps)

        for fp in coord_fps:
            try:
                parsed = parse_coords(open(fp, 'U'))
            except (ValueError, QiimeParseError):
                offending_coords_fp.append(fp)

                # do not add any of the data and move along
                continue
            else:
                # pack all the data correspondingly only if it was correctly
                # parsed
                coords_headers.append(parsed[0])
                coords_data.append(parsed[1])
                coords_eigenvalues.append(parsed[2])
                coords_pct.append(parsed[3])

        # in case there were files that couldn't be parsed
        if offending_coords_fp:
            errout = ', '.join(offending_coords_fp)
            sys.exit(("The following file(s): '%s' could not be "
                                 "parsed properly. Make sure the input folder "
                                 "only contains coordinates files.") % errout)

        # check all files contain the same sample identifiers by flattening the
        # list of available sample ids and returning the sample ids that are
        # in one of the sets of sample ids but not in the globally shared ids
        _coords_headers = set(flatten(coords_headers))
        _per_file_missing = [_coords_headers - set(e) for e in coords_headers]
        non_shared_ids = set(flatten(_per_file_missing))
        if non_shared_ids:
            errout = ', '.join(non_shared_ids)
            sys.exit(("The following sample identifier(s): '%s' "
                                 "are not shared between all the files. The "
                                 "files used to make a jackknifed PCoA plot "
                                 "or coordinate comparison plot (procustes "
                                 "plot) must share all the same sample "
                                 "identifiers between each other.") % errout)

        # number of sample ids that are shared between coords and mapping
        # files
        sids_intersection = mapping_ids.intersection(_coords_headers)

        # sample ids that are not mapped but are in the coords
        sids_difference = _coords_headers.difference(mapping_ids)

        # used to perform different validations in the script, very similar for
        # the case where the input is not a directory
        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers[0])

    else:
        try:
            parsed = parse_coords(open(input_coords, 'U'))
        # this exception was noticed when there were letters in the coords file
        # other exceptions should be caught here; code will be updated then
        except (ValueError, QiimeParseError):
            sys.exit(("The PCoA file '%s' does not seem to be a "
                                 "coordinates formatted file, verify by "
                                 "manually inspecting the contents.") %
                                input_coords)
        else:
            coords_headers = parsed[0]
            coords_data = parsed[1]
            coords_eigenvalues = parsed[2]
            coords_pct = parsed[3]

        # number of sample ids that are shared between coords and mapping
        # files
        sids_intersection = mapping_ids.intersection(coords_headers)

        # sample ids that are not mapped but are in the coords
        sids_difference = set(coords_headers).difference(mapping_ids)

        number_intersected_sids = len(sids_intersection)
        required_number_of_sids = len(coords_headers)

    if taxa_fp:
        try:
            # This should really use BIOM's Table.from_tsv
            # for summarized tables the "otu_ids" are really the "lineages"
            parsed = parse_otu_table(open(taxa_fp, 'U'), count_map_f=float,
                                     remove_empty_rows=True)
        except ValueError as e:
            sys.exit("There was a problem parsing the --taxa_fp: "
                     "%s" % e)
        else:
            otu_sample_ids = parsed[0]
            lineages = parsed[1]
            otu_table = parsed[2]

        # make sure there are matching sample ids with the otu table
        if not sids_intersection.issuperset(otu_sample_ids):
            sys.exit("The sample identifiers in the OTU table must "
                                "have at least one match with the data in the "
                                "mapping file and with the coordinates file. "
                                "Verify you are using input files that belong "
                                "to the same dataset.")
        if len(lineages) <= 1:
            sys.exit("Contingency tables with one or fewer rows "
                                "are not supported, please try passing a "
                                "contingency table with more than one row.")
    else:
        # empty lists indicate that there was no taxa file passed in
        otu_sample_ids, lineages, otu_table = [], [], []

    # sample ids must be shared between files
    if number_intersected_sids <= 0:
        sys.exit("None of your sample identifiers match between the mapping "
                 "file and the coordinates file. Verify you are using a "
                 "coordinates file and a mapping file that belong to the "
                 "same dataset.")

    # the intersection of the sample ids in the coords and the sample ids in
    # the mapping file must, at the very least, include all the ids in the
    # coords file; otherwise the input isn't valid, unless
    # --ignore_missing_samples is set
    if number_intersected_sids != required_number_of_sids:
        if ignore_missing_samples:
            # keep only the samples that are mapped in the mapping file
            coords_headers, coords_data = keep_samples_from_pcoa_data(
                coords_headers, coords_data, sids_intersection)
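            # e.g. if the coords file has samples s1-s3 but the mapping file
            # only lists s1 and s2, the coordinates are restricted to s1 and
            # s2 (the presumed behavior of keep_samples_from_pcoa_data)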
        else:
            message = ("The metadata mapping file has fewer sample "
                       "identifiers than the coordinates file. Verify you are "
                       "using a mapping file that contains at least all the "
                       "samples contained in the coordinates file(s). You can "
                       "force the script to ignore these samples by passing "
                       "the '--ignore_missing_samples' flag.")

            if verbose_output:
                missing_ids = ', '.join(sids_difference)
                message += ' Offending sample identifier(s): %s.' % missing_ids

            sys.exit(message)

    # ignore samples that exist in the coords but not in the mapping file;
    # note: because we use sids_intersection, unmapped coords are accounted
    # for when --ignore_missing_samples is enabled; otherwise the program
    # will have exited before reaching this point
    header, mapping_data = filter_mapping_file(mapping_data, header,
                                               sids_intersection,
                                               include_repeat_cols=True)

    # catch the errors that could occur when filling the mapping file values
    if missing_custom_axes_values:
        # because this uses parse_metadata_state_descriptions, the
        # '-x Category:7;PH:12' form works as well as the script-interface-
        # documented '-x Category:7 -x PH:12' form
        for val in missing_custom_axes_values:
            if ':' not in val:
                sys.exit("Not valid missing value for custom "
                                    "axes: %s" % val)
        _mcav = ';'.join(missing_custom_axes_values)
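        # e.g. ['Category:7', 'PH:12'] is joined into 'Category:7;PH:12',
        # which presumably instructs the filler to use '7' for missing
        # 'Category' values and '12' for missing 'PH' values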
        try:
            mapping_data = fill_mapping_field_from_mapping_file(mapping_data,
                                                                header, _mcav)
        except AssertionError as e:
            sys.exit(str(e))
        except EmperorInputFilesError as e:
            sys.exit(str(e))

    # check that all the required columns exist in the metadata mapping file
    if color_by_column_names:
        color_by_column_names = color_by_column_names.split(',')

        # check for all the mapping fields
        for col in color_by_column_names:
            # for concatenated columns check each individual field
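            # e.g. 'Treatment&&DOB' is validated as 'Treatment' and 'DOB'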
            parts = col.split('&&')
            offending_fields.extend(p for p in parts if p not in lookup_header)
    else:
        # if the user didn't specify the header names display everything
        color_by_column_names = [None]

    # extract a list of the custom axes provided and check that each element
    # is numeric
    if custom_axes:
        custom_axes = custom_axes.strip().strip("'").strip('"').split(',')
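        # e.g. "'pH,DOB'" (quotes included by the shell) becomes
        # ['pH', 'DOB']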

        # the MetadataMap object makes some checks easier
        map_object = MetadataMap(mapping_file_to_dict(mapping_data, header),
                                 [])
        for axis in custom_axes:
            # append the field to the error queue that it belongs to
            if axis not in lookup_header:
                offending_fields.append(axis)
                break
            # make sure this value is in the mapping file
            elif axis not in color_by_column_names:
                color_by_column_names.append(axis)
        # this 'else' belongs to the 'for' loop above: it runs only when the
        # loop completes without hitting 'break'
        else:
            # make sure all these axes are numeric
            for axis in custom_axes:
                if not map_object.isNumericCategory(axis):
                    non_numeric_categories.append(axis)

    # make multiple checks for the add_vectors option
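    # add_vectors arrives as a comma-separated string of one or two mapping
    # file fields; after this block it is normalized to a two-element list,
    # padded with None when only one field was given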
    if add_vectors != [None, None]:
        add_vectors = add_vectors.split(',')
        # check there are at most two categories specified for this option
        if len(add_vectors) > 2:
            print("The '--add_vectors' option can accept up to "
                                "two different fields from the mapping file; "
                                "currently trying to use %d (%s)." %
                                (len(add_vectors), ', '.join(add_vectors)))
        # make sure the field(s) exist
        for col in add_vectors:
            # concatenated fields are allowed now so check for each field
            if '&&' in col:
                for _col in col.split('&&'):
                    if _col not in lookup_header:
                        offending_fields.append(col)
                        break
                # for/else: only executed when all the checked fields exist
                # (i.e. the inner loop did not break)
                else:
                    # make sure that if it's going to be used for vector
                    # creation it gets used for coloring and map postprocessing
                    if col not in color_by_column_names:
                        color_by_column_names.append(col)
            # if it's a column without concatenations
            elif col not in lookup_header:
                offending_fields.append(col)
                break
            else:
                # check this vector value is in the color by category
                if col not in color_by_column_names:
                    color_by_column_names.append(col)
        # this 'else' belongs to the 'for' loop above: it runs only when the
        # loop completes without hitting 'break'
        else:
            # check that the second category is all with numeric values
            if len(add_vectors) == 2:
                map_object = MetadataMap(mapping_file_to_dict(mapping_data,
                                                              header),
                                         [])
                # if it has non-numeric values add it to the list of offenders
                if not map_object.isNumericCategory(add_vectors[1]):
                    msg = add_vectors[1] + '(used in --add_vectors)'
                    non_numeric_categories.append(msg)
            else:
                add_vectors.append(None)

    # terminate the program for the cases where a mapping field was not found
    # or when a mapping field didn't meet the criteria of being numeric
    if offending_fields:
        sys.exit("Invalid field(s) '%s'; the valid field(s) are: "
                            "'%s'" % (', '.join(offending_fields),
                                      ', '.join(header)))
    if non_numeric_categories:
        sys.exit(("The following field(s): '%s' contain values "
                             "that are not numeric, hence not suitable for "
                             "'--custom_axes' nor for '--add_vectors'. Try "
                             "the '--missing_custom_axes_values' option to "
                             "fix these values." %
                             ', '.join(non_numeric_categories)))

    # process the coordinates file first, preventing the case where the
    # custom axes are not in the coloring categories, i.e. the --color_by
    # categories
    preprocessed_coords = preprocess_coords_file(coords_headers, coords_data,
                                                 coords_eigenvalues,
                                                 coords_pct, header,
                                                 mapping_data, custom_axes,
                                                 jackknifing_method,
                                                 compare_plots,
                                                 pct_variation_below_one)
    coords_headers = preprocessed_coords[0]
    coords_data = preprocessed_coords[1]
    coords_eigenvalues = preprocessed_coords[2]
    coords_pct = preprocessed_coords[3]
    coords_low = preprocessed_coords[4]
    coords_high = preprocessed_coords[5]
    clones = preprocessed_coords[6]

    # process the otu table after the coordinates so it picks up the custom
    # axes (when available) and any other change made to the coordinates
    preprocessed_otu_table = preprocess_otu_table(otu_sample_ids, otu_table,
                                                  lineages, coords_data,
                                                  coords_headers,
                                                  n_taxa_to_keep)
    otu_coords = preprocessed_otu_table[0]
    otu_table = preprocessed_otu_table[1]
    otu_lineages = preprocessed_otu_table[2]
    otu_prevalence = preprocessed_otu_table[3]
    lines = preprocessed_otu_table[4]

    # remove the mapping file columns that are not informative, taking into
    # account the header names that were already authorized to be used, and
    # concatenate the fields for the '&&'-merged columns
    mapping_data, header = preprocess_mapping_file(mapping_data, header,
                                                   color_by_column_names,
                                                   not add_unique_columns,
                                                   clones=clones)

    # create the output directory before creating any other output
    if not isdir(output_dir):
        makedirs(output_dir)

    fp_out = open(join(output_dir, 'index.html'), 'w')
    fp_out.write(emperor_autograph+'\n')
    fp_out.write(EMPEROR_HEADER_HTML_STRING)

    # write the html file
    fp_out.write(format_mapping_file_to_js(mapping_data, header, header))

    # certain percentages explained cannot be displayed in the GUI
    try:
        fp_out.write(format_pcoa_to_js(coords_headers, coords_data,
                                       coords_eigenvalues, coords_pct,
                                       custom_axes, coords_low,
                                       coords_high,
                                       number_of_axes=number_of_axes,
                                       number_of_segments=number_of_segments))
    except EmperorLogicError as e:
        sys.exit(str(e))

    fp_out.write(format_taxa_to_js(otu_coords, otu_lineages, otu_prevalence))
    fp_out.write(format_vectors_to_js(mapping_data, header, coords_data,
                                      coords_headers, add_vectors[0],
                                      add_vectors[1]))
    fp_out.write(format_comparison_bars_to_js(coords_data, coords_headers,
                                              clones, serial_comparison))
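    # booleans that toggle optional sections of the generated GUI (an
    # assumption based on format_emperor_html_footer_string's signature)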
    has_taxa = taxa_fp is not None
    has_input_coords = isdir(input_coords) and not compare_plots
    has_add_vectors = add_vectors != [None, None]
    has_clones = clones > 0
    fp_out.write(format_emperor_html_footer_string(has_taxa, has_input_coords,
                                                   has_add_vectors,
                                                   has_clones))
    fp_out.close()
    copy_support_files(output_dir)

    # write the biplot coords in the output file if a path is passed
    if biplot_fp and taxa_fp:
        if biplot_fp.endswith('/') or isdir(biplot_fp):
            print("Do not specify a path to a new (path ending "
                                "in a slash) or existing directory for "
                                "biplot_fp. The output file will be a "
                                "tab-delimited text file.")

        # make sure this file can be created
        try:
            fd = open(biplot_fp, 'w')
        except IOError:
            sys.exit("There was a problem creating the file with "
                                "the coordinates for the biplots (%s)."
                                % biplot_fp)
        else:
            fd.writelines(lines)
            fd.close()