def test_correlations_selection(self):
        """Run CorrelationsSelection.analyse on the simple test case and compare the
        result with the stored ``correlations_selection.json`` expectation."""
        case_root = AnalysisTestUtils.SIMPLE_TEST_CASE_ROOT

        request = AnalysisTestUtils.create_default_user_request()
        request.set_custom_attr("expvar", "MetadataSignificant")

        # Load the fixture inputs for the simple test case
        table = AnalysisTestUtils.get_test_input_as_table(case_root)
        headers, sample_labels = AnalysisTestUtils.get_test_input_as_metadata(
            case_root)
        metadata_rows = AnalysisTestUtils.get_test_input_as_table(
            case_root, SAMPLE_METADATA_FILENAME)
        taxonomic_map = AnalysisTestUtils.get_test_taxonomy(case_root)

        # Build the metadata object directly from the fixture rows
        metadata = Metadata("test", "test", False)
        metadata.set_table(metadata_rows)

        actual = CorrelationsSelection().analyse(
            request, table, headers, sample_labels, metadata, taxonomic_map)
        print(json.dumps(actual))

        expected = AnalysisTestUtils.get_expected_output(
            AnalysisTestUtils.SIMPLE_TEST_CASE_OUTPUT_ROOT,
            "correlations_selection.json")
        outputs_match = AnalysisTestUtils.compare_two_objects(expected, actual)
        if not outputs_match:
            # Dump both sides so a failing run shows what differed
            for item in ("Expected: ", expected, "Actual: ", actual):
                print(item)
        self.assertTrue(outputs_match)
Пример #2
0
 def get_metadata_obj(test_dir,
                      csv_name=SAMPLE_METADATA_FILENAME,
                      sep="\t"):
     """Read a delimited metadata file and wrap its rows in a Metadata object.

     :param test_dir: directory that contains the file
     :param csv_name: file name inside ``test_dir``
     :param sep: field delimiter handed to ``csv.reader``
     :return: a ``Metadata("", "", load_samples=False)`` whose table was set to the parsed rows
     """
     csv_path = os.path.join(test_dir, csv_name)
     print("Opening file with name " + csv_path)
     with open(csv_path, 'r') as handle:
         # Materialise every parsed row while the file is still open
         rows = list(csv.reader(handle, delimiter=sep, quotechar='|'))
     metadata = Metadata("", "", load_samples=False)
     metadata.set_table(rows)
     return metadata
Пример #3
0
    def test_simple_differential_selection_with_ancom(self):
        """Run DifferentialSelection.analyse_with_ancom for Control vs Disease on the
        simple test case and compare the result with the stored JSON expectation."""
        case_root = AnalysisTestUtils.SIMPLE_TEST_CASE_ROOT

        request = AnalysisTestUtils.create_default_user_request()
        request.catvar = "Category"
        for attr_name, attr_value in (("pvalthreshold", "0.01"),
                                      ("pwVar1", "Control"),
                                      ("pwVar2", "Disease")):
            request.set_custom_attr(attr_name, attr_value)

        # Load the fixture inputs for the simple test case
        table = AnalysisTestUtils.get_test_input_as_table(case_root)
        headers, sample_labels = AnalysisTestUtils.get_test_input_as_metadata(
            case_root)
        metadata_rows = AnalysisTestUtils.get_test_input_as_table(
            case_root, SAMPLE_METADATA_FILENAME)
        disease_values = AnalysisTestUtils.get_disease_metadata_values(
            case_root)
        taxonomic_map = AnalysisTestUtils.get_test_taxonomy(case_root)
        metadata_sample_ids = AnalysisTestUtils.get_sample_ids_from_metadata(
            case_root)

        # Pair every sample ID with the metadata value at the same position
        sample_id_to_metadata = {}
        for position, sample_id in enumerate(metadata_sample_ids):
            sample_id_to_metadata[sample_id] = disease_values[position]

        # Built as in the sibling tests; it is not passed to analyse_with_ancom below
        metadata = Metadata("test", "test", False)
        metadata.set_table(metadata_rows)

        abundances = DifferentialSelection().analyse_with_ancom(
            request, table, headers, sample_labels, sample_id_to_metadata,
            taxonomic_map)
        print(json.dumps(abundances))

        expected = AnalysisTestUtils.get_expected_output(
            AnalysisTestUtils.SIMPLE_TEST_CASE_OUTPUT_ROOT,
            "differential_selection_with_ancom_control_disease.json")
        outputs_match = AnalysisTestUtils.compare_two_objects(
            expected, abundances)
        if not outputs_match:
            # Dump both sides so a failing run shows what differed
            for item in ("Expected: ", expected, "Actual: ", abundances):
                print(item)
        self.assertTrue(outputs_match)
Пример #4
0
    def load_otu_table(self, user_id, pid, use_raw, use_np):
        """
        Populates this object with the metadata, taxonomy, table and labels of a project.

        :param user_id: forwarded to Metadata, Taxonomy and the DataIO readers
        :param pid: forwarded to Metadata, Taxonomy and the DataIO readers
        :param use_raw: accepted but never read here; the raw gene table files are always used
        :param use_np: when truthy the table is read with ``DataIO.tsv_to_np_table``,
            otherwise with ``DataIO.tsv_to_table``
        """
        self.user_id, self.pid = user_id, pid

        logger.info("Before load")
        self.sample_metadata = Metadata(user_id, pid)
        logger.info("Finished metadata loading")
        self.otu_metadata = Taxonomy(user_id, pid)
        logger.info("Finished taxonomy loading")

        logger.info("Using raw data")
        read_table = DataIO.tsv_to_np_table if use_np else DataIO.tsv_to_table
        self.table = read_table(self.user_id, self.pid, RAW_GENE_TABLE_FILENAME)

        # The labels file carries the column headers in row 0 and the sample labels in row 1
        label_rows = DataIO.tsv_to_table(self.user_id, self.pid, RAW_GENE_TABLE_LABELS_FILENAME)
        self.headers = label_rows[0]
        self.sample_labels = label_rows[1]
    def test_simple_correlations(self):
        """Run Correlations.analyse on the simple test case and compare the result
        with the stored ``correlation_sign_nonsign_sign_nonsign.json`` expectation."""
        case_root = AnalysisTestUtils.SIMPLE_TEST_CASE_ROOT

        request = AnalysisTestUtils.create_default_user_request()
        for attr_name, attr_value in (
                ("corrvar1", "MetadataSignificant"),
                ("corrvar2", "MetadataNonSignificant"),
                ("corrvar1SpecificTaxonomies", "[]"),
                ("corrvar2SpecificTaxonomies", "[]"),
                ("colorvar", "MetadataSignificant"),
                ("sizevar", "MetadataNonSignificant"),
                ("samplestoshow", "both")):
            request.set_custom_attr(attr_name, attr_value)

        # Load the fixture inputs for the simple test case
        table = AnalysisTestUtils.get_test_input_as_table(case_root)
        headers, sample_labels = AnalysisTestUtils.get_test_input_as_metadata(
            case_root)
        metadata_rows = AnalysisTestUtils.get_test_input_as_table(
            case_root, SAMPLE_METADATA_FILENAME)

        # Build the metadata object directly from the fixture rows
        metadata = Metadata("test", "test", False)
        metadata.set_table(metadata_rows)

        actual = Correlations().analyse(
            request, table, headers, sample_labels, metadata)
        expected = AnalysisTestUtils.get_expected_output(
            AnalysisTestUtils.SIMPLE_TEST_CASE_OUTPUT_ROOT,
            "correlation_sign_nonsign_sign_nonsign.json")
        outputs_match = AnalysisTestUtils.compare_two_objects(expected, actual)
        print(json.dumps(actual))
        if not outputs_match:
            # Dump both sides so a failing run shows what differed
            for item in ("Expected: ", expected, "Actual: ", actual):
                print(item)
        self.assertTrue(outputs_match)
Пример #6
0
class OTUTable(object):

    def __init__(self, user_id, pid, use_raw=False, use_np=True):
        """
        Creates the table holder and immediately loads the project's data.

        :param user_id: forwarded to ``load_otu_table``
        :param pid: forwarded to ``load_otu_table``
        :param use_raw: forwarded to ``load_otu_table``
        :param use_np: forwarded to ``load_otu_table``
        """
        # Placeholder values; load_otu_table overwrites all of them except phylogenetic_tree
        self.user_id = self.pid = ""
        self.sample_metadata = self.otu_metadata = self.phylogenetic_tree = ""
        self.table, self.headers, self.sample_labels = [], [], []

        self.load_otu_table(user_id, pid, use_raw, use_np)
        # presumably tsv_to_table is wrapped by a functools cache; this logs its statistics
        logger.info(DataIO.tsv_to_table.cache_info())

    def load_otu_table(self, user_id, pid, use_raw, use_np):
        """
        Populates this object with the metadata, taxonomy, table and labels of a project.

        :param user_id: forwarded to Metadata, Taxonomy and the DataIO readers
        :param pid: forwarded to Metadata, Taxonomy and the DataIO readers
        :param use_raw: accepted but never read here; the raw gene table files are always used
        :param use_np: when truthy the table is read with ``DataIO.tsv_to_np_table``,
            otherwise with ``DataIO.tsv_to_table``
        """
        self.user_id, self.pid = user_id, pid

        logger.info("Before load")
        self.sample_metadata = Metadata(user_id, pid)
        logger.info("Finished metadata loading")
        self.otu_metadata = Taxonomy(user_id, pid)
        logger.info("Finished taxonomy loading")

        logger.info("Using raw data")
        read_table = DataIO.tsv_to_np_table if use_np else DataIO.tsv_to_table
        self.table = read_table(self.user_id, self.pid, RAW_GENE_TABLE_FILENAME)

        # The labels file carries the column headers in row 0 and the sample labels in row 1
        label_rows = DataIO.tsv_to_table(self.user_id, self.pid, RAW_GENE_TABLE_LABELS_FILENAME)
        self.headers = label_rows[0]
        self.sample_labels = label_rows[1]

    def load_phylogenetic_tree_if_exists(self):
        """
        Resets ``self.phylogenetic_tree`` to an empty string.

        NOTE(review): despite the name, nothing is read or loaded here.
        """
        self.phylogenetic_tree = ""

    def get_table(self):
        """Returns the unfiltered table stored in ``self.table``."""
        return self.table

    def get_headers(self):
        """Returns the column headers stored in ``self.headers``."""
        return self.headers

    def get_sample_labels(self):
        """Returns the sample labels stored in ``self.sample_labels``."""
        return self.sample_labels

    def get_table_after_filtering(self, user_request):
        t, h, s = self.filter_otu_table_by_metadata(self.table, self.headers, self.sample_labels, user_request)
        t, h, s = self.filter_otu_table_by_taxonomic_items(t, h, s, self.otu_metadata.get_taxonomy_map(), user_request)
        return t, h, s

    def get_table_after_filtering_and_aggregation(self, user_request):
        logger.info("Starting filtering and aggregation")
        t, h, s = self.filter_otu_table_by_metadata(self.table, self.headers, self.sample_labels, user_request)
        logger.info("Finished filtering by metadata")
        t, h, s = self.filter_otu_table_by_taxonomic_items(t, h, s, self.otu_metadata.get_taxonomy_map(), user_request)
        logger.info("Finished filtering by taxonomic items")
        # TODO: No aggregation needed in the gene world
        # t, h, s = self.aggregate_otu_table_at_taxonomic_level(t, h, s, user_request)
        logger.info("Finished aggregation")
        return t, h, s

    def get_table_after_filtering_and_aggregation_and_low_count_aggregation(self, user_request):
        logger.info("Starting filtering and aggregation")
        t, h, s = self.filter_otu_table_by_metadata(self.table, self.headers, self.sample_labels, user_request)
        logger.info("Finished filtering by metadata")
        t, h, s = self.filter_otu_table_by_taxonomic_items(t, h, s, self.otu_metadata.get_taxonomy_map(), user_request)
        logger.info("Finished filtering by taxonomic items")
        # TODO: No aggregation needed in the gene world
        # t, h, s = self.aggregate_otu_table_at_taxonomic_level(t, h, s, user_request)
        logger.info("Finished filtering by low counts")
        # t, h, s = self.aggregate_low_count_np(t, h, s, user_request)
        logger.info("Finished aggregation")
        return t, h, s

    def get_otu_metadata(self):
        """Returns the object stored in ``self.otu_metadata`` (a Taxonomy once load_otu_table has run)."""
        return self.otu_metadata

    def get_sample_metadata(self):
        """Returns the object stored in ``self.sample_metadata`` (a Metadata once load_otu_table has run)."""
        return self.sample_metadata

    def get_phylogenetic_tree(self):
        """Returns ``self.phylogenetic_tree``; within this class it is only ever set to an empty string."""
        return self.phylogenetic_tree

    def filter_otu_table_by_metadata(self, base, headers, sample_labels, user_request):
        """
        Filters an OTU table by a particular metadata category by identifying the samples that fall under the
        metadata category
        :param base:
        :param metadata:
        :param catvar:
        :param values:
        :return:
        """
        catvar = user_request.sample_filter
        role = user_request.sample_filter_role
        values = user_request.sample_filter_vals
        if catvar == "none" or catvar == "" or (len(values) == 1 and values[0] == "mian-select-all"):
            # Filtering is not enabled or everything is selected
            logger.info("Sample filtering not enabled or all samples are selected")
            return base, headers, sample_labels

        metadata_map = self.sample_metadata.get_sample_id_to_metadata_map(catvar)

        samples = {}

        row = 0
        while row < len(base):
            sample_id = sample_labels[row]
            if sample_id in metadata_map:
                if role == "Include":
                    if metadata_map[sample_id] in values:
                        samples[sample_id] = 1
                else:
                    if metadata_map[sample_id] not in values:
                        samples[sample_id] = 1

            row += 1

        if samples is None or samples == "":
            samples = []

        new_otu_table = []
        new_sample_labels = []

        num_filtered_samples = 0
        i = 0
        while i < len(base):
            sample_id = sample_labels[i]
            if sample_id in samples:
                new_otu_table.append(base[i])
                new_sample_labels.append(sample_id)
            else:
                num_filtered_samples += 1
            i += 1

        logger.info("Filtered out " + str(num_filtered_samples) + "/" + str(len(base)) + " samples")
        return new_otu_table, headers, new_sample_labels

    def filter_otu_table_by_taxonomic_items(self, base, headers, sample_labels, taxonomic_map, user_request):
        """
        Returns an OTU table that has been filtered by specific taxonomic items of interest
        (eg. if the user selected that they only wanted to see Staphylococcus genus, an OTU table
        will be returned that only contains Staphylococcus OTUs)
        :param base:
        :param taxonomic_map:
        :param items_of_interest:
        :param level:
        :return:
        """
        level = user_request.taxonomy_filter
        role = user_request.taxonomy_filter_role
        items_of_interest = user_request.taxonomy_filter_vals

        if int(level) == -2 or (len(items_of_interest) == 1 and items_of_interest[0] == "mian-select-all"):
            # -2 indicates that we should not filter by taxonomic items or everything is selected
            logger.info("OTU filtering not enabled or all OTUs are selected")
            return base, headers, sample_labels

        otus = {}
        for otu, classification in taxonomic_map.items():
            if 0 <= int(level) < len(classification):
                if role == "Include":
                    if classification[int(level)] in items_of_interest:
                        otus[otu] = 1
                else:
                    if classification[int(level)] not in items_of_interest:
                        otus[otu] = 1
            elif int(level) == -1:
                if role == "Include":
                    if otu in items_of_interest:
                        otus[otu] = 1
                else:
                    if otu not in items_of_interest:
                        otus[otu] = 1
            else:
                otus[otu] = 1

        new_otu_table = []
        new_headers = []

        num_filtered_otus = 0
        i = 0
        while i < len(base):
            new_row = []
            j = 0
            while j < len(base[i]):
                if headers[j] in otus:
                    new_row.append(base[i][j])
                    if i == 0:
                        new_headers.append(headers[j])
                else:
                    if i == 0:
                        num_filtered_otus += 1
                j += 1
            new_otu_table.append(new_row)
            i += 1
        logger.info("Table cols = " + str(len(new_otu_table[0])) + " header cols = " + str(len(new_headers)))
        logger.info("Filtered out " + str(num_filtered_otus) + "/" + str(len(base[0])) + " OTUs/taxas")
        return new_otu_table, new_headers, sample_labels

    def aggregate_otu_table_at_taxonomic_level(self, base, headers, sample_labels, user_request):
        """
        Returns an OTU table that has been transformed according to the functional annotations
        :param base:
        :param level:
        :return:
        """
        level = user_request.level
        if int(level) < 0:
            # We want to aggregate at the OTU level, which is essentially not aggregating at all
            return base, headers, sample_labels

        taxonomy_map = self.otu_metadata.get_taxonomy_map()
        taxonomies = []
        taxonomy_to_cols = {}
        j = 0
        while j < len(headers):
            otu = headers[j]

            if otu not in taxonomy_map:
                # TODO: This actually indicates bad input data
                j += 1
                continue

            taxonomy = "; ".join(taxonomy_map[otu][:int(level) + 1])
            if taxonomy in taxonomy_to_cols:
                taxonomy_to_cols[taxonomy].append(j)
            else:
                taxonomy_to_cols[taxonomy] = [j]
                taxonomies.append(taxonomy)
            j += 1

        with Pool() as pool:
            func = partial(process_row_aggregate_otu_table_at_taxonomic_level, taxonomies, taxonomy_to_cols)
            aggregated_base = pool.map(func, base)
            aggregated_headers = taxonomies
            logger.info("Agg Table cols = " + str(len(aggregated_base[0])) + " header cols = " + str(len(aggregated_headers)))
            return aggregated_base, aggregated_headers, sample_labels

    def aggregate_otu_table_at_taxonomic_level_np(self, base, headers, sample_labels, user_request):
        """
        Returns an OTU table that has been aggregated at a specific taxonomic level (eg. this could return a
        table that is grouped at the Family taxonomic level). Approx 5x slower than non-np version
        :param base:
        :param level:
        :return:
        """
        level = user_request.level
        if int(level) < 0:
            # We want to aggregate at the OTU level, which is essentially not aggregating at all
            return base, headers, sample_labels

        taxonomy_map = self.otu_metadata.get_taxonomy_map()
        taxonomies = []
        taxonomy_to_cols = {}
        i = 0
        while i < len(headers):
            otu = headers[i]
            if otu not in taxonomy_map:
                # TODO: This actually indicates bad input data
                i += 1
                continue

            taxonomy = "; ".join(taxonomy_map[otu][:int(level) + 1])
            if taxonomy in taxonomy_to_cols:
                taxonomy_to_cols[taxonomy].append(i)
            else:
                taxonomy_to_cols[taxonomy] = [i]
                taxonomies.append(taxonomy)
            i += 1

        rows = len(base)
        cols = len(taxonomies)

        aggregated_base = np.zeros((rows, cols))
        i = 0
        for taxonomy in taxonomies:
            cols_to_aggregate = taxonomy_to_cols[taxonomy]
            aggregated_base[:, i] += np.sum(base[:, cols_to_aggregate], axis=1)
            i += 1

        aggregated_headers = taxonomies

        return aggregated_base, aggregated_headers, sample_labels

    def filter_out_low_count_np(self, base, headers, sample_labels, user_request):

        logger.info("Starting filtering by low count")
        count_threshold = user_request.taxonomy_filter_count
        min_prevalence = user_request.taxonomy_filter_prevalence

        base = np.array(base)
        headers = np.array(headers)
        num_samples = base.shape[0]
        min_prevalence_percentage = min_prevalence / float(100)
        otus_over_threshold = (base > count_threshold).astype(int)
        otus_to_keep = np.divide(np.sum(otus_over_threshold, axis=0), num_samples) >= min_prevalence_percentage

        logger.info(
            "Done filtering by low count. Kept " + str(sum(otus_to_keep)) + " cols out of " + str(len(headers)))
        return base[:, otus_to_keep], headers[otus_to_keep], sample_labels

    def aggregate_low_count_np(self, base, headers, sample_labels, user_request):
        count_threshold = user_request.taxonomy_filter_count
        min_prevalence = user_request.taxonomy_filter_prevalence

        base = np.array(base)
        headers = np.array(headers)
        num_samples = base.shape[0]
        min_prevalence_percentage = min_prevalence / float(100)
        otus_over_threshold = (base > count_threshold).astype(int)
        otus_to_keep = np.divide(np.sum(otus_over_threshold, axis=0), num_samples) >= min_prevalence_percentage
        otus_to_aggregate = np.divide(np.sum(otus_over_threshold, axis=0), num_samples) < min_prevalence_percentage
        aggregated_col = np.sum(base[:, otus_to_aggregate], axis=1)
        aggregated_base = np.c_[base[:, otus_to_keep], aggregated_col]
        aggregated_headers = headers[otus_to_keep].append("Other")

        logger.info(
            "Aggregate low count cols = " + str(len(otus_to_aggregate)) + " header cols = " + str(len(aggregated_headers)))
        return aggregated_base, aggregated_headers, sample_labels


    @staticmethod
    def get_otu_table_headers_at_taxonomic_level(user_id, pid, level, use_raw=False):
        """
        Lists the distinct, non-empty taxonomy names found at ``level`` among the table's columns.

        :param user_id: forwarded to DataIO and Taxonomy
        :param pid: forwarded to DataIO and Taxonomy
        :param level: index into each OTU's classification; -1 returns the raw column headers
        :param use_raw: accepted but never read here; the raw labels file is always used
        :return: the headers themselves for level -1, otherwise the taxonomy names in first-seen order
        """
        logger.info("Using raw data")
        headers = DataIO.tsv_to_table(user_id, pid, RAW_GENE_TABLE_LABELS_FILENAME)[0]

        level_index = int(level)
        if level_index == -1:
            # OTUs requested
            return headers

        taxonomy_map = Taxonomy(user_id, pid).get_taxonomy_map()
        seen = set()
        taxonomies = []
        for otu in headers:
            if otu not in taxonomy_map:
                continue
            # Only the name at the requested level is used; switch to
            # "; ".join(taxonomy_map[otu][:level_index + 1]) for the fully quantified taxonomy string
            name = taxonomy_map[otu][level_index]
            if name != "" and name not in seen:
                seen.add(name)
                taxonomies.append(name)
        return taxonomies