Пример #1
0
    def _read_file(self, TITLE, PRINT, REG, WEIGHTS, ANNA_PREPROCESS):
        """Load the allergy OTU table, project it with PCA and build tag maps.

        The method reads hard-coded CSV files, preprocesses the OTU table,
        applies PCA and then derives several per-sample tag dictionaries
        (health, treatment stage, treatment success, allergy type) which are
        stored on ``self``.

        Args:
            TITLE: Not used by this method.
            PRINT: Not used by this method.
            REG: When truthy, ``self.reg(features, cols)`` is called with the
                raw feature table and its numeric column names.
            WEIGHTS: When truthy, normalised class-weight maps are computed
                and stored on ``self``.
            ANNA_PREPROCESS: Selects which preprocessing branch is run (see
                the two ``preprocess_data`` calls below).

        Returns:
            A tuple ``(ids_list_w_con, ids_list_wo_con, feature_list)``:
            every sample id in the PCA output, the same ids with the first 62
            entries dropped (a pandas ``Index``), and the PCA feature rows in
            the order of ``ids_list_w_con``.

        Raises:
            ZeroDivisionError: If ``WEIGHTS`` is truthy and one of the counted
                classes has no samples.
        """
        bactria_as_feature_file = 'feature-table_Allergy_cleaned_taxa_290119_updated_in_140219.csv'

        features = pd.read_csv(bactria_as_feature_file, header=1)
        cols = list(features.columns)
        # remove non-numeric values
        cols.remove('Feature ID')
        cols.remove('Taxonomy')

        if REG:
            self.reg(features, cols)

        # get single\multiple information
        multiple_samples_info_path = 'mf_merge_ok84_ok93_ok66_69_TreeNuts_controls_271118_040219 post     MG17 07.05.19.csv'
        multiple_samples_info_df = pd.read_csv(multiple_samples_info_path)
        single_or_multiple_map = dict(
            zip(multiple_samples_info_df['SampleCode'],
                multiple_samples_info_df['Michael_4_Single_Multiple']))
        ids_list_wo_multiple = [
            key for key, val in single_or_multiple_map.items()
            if val == 'Single'
        ]

        samples_data_file = 'mf_merge_ok84_ok93_ok66_69_merged_by_RestoredSampleCode_as_ID_290119.csv'

        OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, bactria_as_feature_file),
                             os.path.join(SCRIPT_DIR, samples_data_file),
                             from_QIIME=True,
                             id_col='Feature ID',
                             taxonomy_col='Taxonomy')

        if ANNA_PREPROCESS:
            preproccessed_data = preprocess_data(OtuMf.otu_file,
                                                 visualize_data=False,
                                                 taxonomy_col='Taxonomy',
                                                 taxnomy_level=6)
            mapping_file = OtuMf.mapping_file['AllergyType']
            mapping_disease = {
                'Milk': 0,
                'Tree_nut': 1,  # 'Cashew' + 'Hazelnut' + 'Walnut'
                'Peanut': 2,
                'Sesame': 3
            }
            mapping_file = mapping_file.map(mapping_disease)
            preproccessed_data, mapping_file = distance_learning(
                perform_distance=True,
                level=3,
                preproccessed_data=preproccessed_data,
                mapping_file=mapping_file)
        else:
            preproccessed_data = preprocess_data(OtuMf.otu_file,
                                                 visualize_data=False,
                                                 taxnomy_level=6,
                                                 taxonomy_col='Taxonomy',
                                                 preform_taxnomy_group=True)
        self._preproccessed_data = preproccessed_data

        # NOTE(review): n_components is not defined in this method; it is
        # presumably a module-level name - confirm.
        otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(
            preproccessed_data, n_components=n_components, visualize=False)
        self._pca_obj = pca_obj

        index_to_id_map = {}
        id_to_features_map = {}
        for i, row in enumerate(otu_after_pca_wo_taxonomy.values):
            sample_id = otu_after_pca_wo_taxonomy.index[i]
            id_to_features_map[sample_id] = row
            index_to_id_map[i] = sample_id

        self._index_to_id_map = index_to_id_map
        self._id_to_features_map = id_to_features_map

        success_tag_column = 'SuccessDescription'
        stages_column = 'TreatmentTimePoint'
        allergan_column = 'AllergyType'
        ids_list_w_con = otu_after_pca_wo_taxonomy.index.tolist()
        # The first 62 rows are treated as the control ('Con') samples.
        ids_list_wo_con = otu_after_pca_wo_taxonomy.index.drop(
            otu_after_pca_wo_taxonomy.index[0:62])

        self._ids_list_w_con = ids_list_w_con
        self._ids_list_wo_con = ids_list_wo_con

        # ##### separate samples by allergic and healthy==>'Con'
        id_to_binary_health_tag_map = {
            sample: 1 if sample.startswith('Con') else 0
            for sample in ids_list_w_con
        }
        self._id_to_binary_health_tag_map = id_to_binary_health_tag_map

        # ##### separate samples by stage, success of treatment and allergen type
        id_to_success_tag_map = {}
        id_to_stage_map = {}
        id_to_binary_success_tag_map = {}
        id_to_allergy_type_tag_map = {}
        id_to_allergy_number_type_tag_map = {}
        id_to_milk_allergy_tag_map = {}

        tag_to_allergy_type_map = {
            0: 'Milk',
            1: 'Tree_nut',  # 'Cashew' + 'Hazelnut' + 'Walnut'
            2: 'Peanut',
            3: 'Sesame'
        }  # removed 'Egg' samples
        allergy_type_to_tag = {
            name: tag for tag, name in tag_to_allergy_type_map.items()
        }

        allergy_type_to_instances_map = {
            'Milk': 0,
            'Tree_nut': 0,
            'Peanut': 0,
            'Sesame': 0
        }  # 'Non': 9 samples, 'Egg': 35 samples

        # Raw 'AllergyType' spellings found in the mapping file, folded into
        # the four canonical allergy types counted above.
        raw_label_to_allergy_type = {
            'Milk': 'Milk',
            'Milk_suspected': 'Milk',
            'milk': 'Milk',
            'Cashew': 'Tree_nut',
            'Cashew ': 'Tree_nut',
            'Hazelnut': 'Tree_nut',
            'Walnut': 'Tree_nut',
            'Nuts': 'Tree_nut',
            'Peanut': 'Peanut',
            'Sesame': 'Sesame',
        }

        non_allergy_type_ids = []
        egg_allergy_type_ids = []
        for sample in ids_list_wo_con:
            # stage
            id_to_stage_map[sample] = OtuMf.mapping_file.loc[sample,
                                                             stages_column]

            # success: k-class tag kept as-is, plus a binary version where
            # success (A1) -> 1 and everything else -> 0
            t = OtuMf.mapping_file.loc[sample, success_tag_column]
            id_to_success_tag_map[sample] = t
            id_to_binary_success_tag_map[sample] = 1 if t == 'A1' else 0

            # allergy type
            a = OtuMf.mapping_file.loc[sample, allergan_column]
            id_to_allergy_type_tag_map[sample] = a

            allergy_type = raw_label_to_allergy_type.get(a)
            if allergy_type is not None:
                id_to_allergy_number_type_tag_map[
                    sample] = allergy_type_to_tag[allergy_type]
                id_to_milk_allergy_tag_map[
                    sample] = 1 if allergy_type == 'Milk' else 0
                allergy_type_to_instances_map[allergy_type] += 1
            elif a == 'Egg':
                # Egg samples are collected but receive no numeric tag.
                egg_allergy_type_ids.append(sample)
            elif a == 'Non':
                # 'Non' samples are collected but receive no numeric tag.
                non_allergy_type_ids.append(sample)
            else:
                print("error in allergy type " + str(sample))

        # Built once after the loop (it used to be rebuilt on every
        # iteration), so it is also defined when there are no samples.
        stage_0_ids = [
            key for key, stage in id_to_stage_map.items()
            if stage == '0_before'
        ]
        self._stage_0_ids = stage_0_ids

        excluded_ids = set(non_allergy_type_ids) | set(egg_allergy_type_ids)
        self._id_wo_non_and_egg_allergy_type_list = [
            x for x in self._ids_list_wo_con if x not in excluded_ids
        ]
        self._tag_to_allergy_type_map = tag_to_allergy_type_map
        self._allergy_type_to_instances_map = allergy_type_to_instances_map
        self._id_to_success_tag_map = id_to_success_tag_map
        self._id_to_stage_map = id_to_stage_map
        self._id_to_binary_success_tag_map = id_to_binary_success_tag_map
        self._id_to_allergy_type_tag_map = id_to_allergy_type_tag_map
        self._id_to_allergy_number_type_tag_map = id_to_allergy_number_type_tag_map
        self._id_to_milk_allergy_tag_map = id_to_milk_allergy_tag_map

        # 'Single' samples that also received a numeric allergy tag.
        self._ids_list_wo_multiple = [
            sample_code for sample_code in ids_list_wo_multiple
            if sample_code in id_to_allergy_number_type_tag_map
        ]

        # -------------------------------------------- weights !--------------------------------------------
        if WEIGHTS:

            def normalized(weight_map):
                # Scale the weights so that the largest one equals 1.
                max_weight = max(weight_map.values())
                return {
                    key: weight / max_weight
                    for key, weight in weight_map.items()
                }

            # weights for types of allergy
            total_sum = sum(allergy_type_to_instances_map.values())
            allergy_type_to_weight_map = normalized({
                allergy_type: total_sum / count
                for allergy_type, count in
                allergy_type_to_instances_map.items()
            })

            # weights for milk vs. other types of allergy
            milk_count = allergy_type_to_instances_map.get("Milk")
            milk_vs_other_allergy_weight_map = normalized({
                'Other': total_sum / (total_sum - milk_count),
                'Milk': total_sum / milk_count
            })

            # weights for healthy and allergic
            n_all = len(ids_list_w_con)
            n_allergic = len(ids_list_wo_con)
            healthy_vs_allergic_weight_map = normalized({
                'Allergic': n_all / n_allergic,
                'Healthy': n_all / (n_all - n_allergic)
            })

            # weights for responding and not (success)
            success_tags = list(id_to_binary_success_tag_map.values())
            responding_vs_not_weight_map = normalized({
                'No': n_allergic / success_tags.count(0),
                'Yes': n_allergic / success_tags.count(1)
            })

            # weights for responding and not (prognostic): stage-0 samples only
            stage_0_tags = [
                id_to_binary_success_tag_map.get(i) for i in stage_0_ids
            ]
            prognostic_responding_vs_not_weight_map = normalized({
                'No': len(stage_0_ids) / stage_0_tags.count(0),
                'Yes': len(stage_0_ids) / stage_0_tags.count(1)
            })

            self._allergy_type_to_weight_map = allergy_type_to_weight_map
            self._milk_vs_other_allergy_weight_map = milk_vs_other_allergy_weight_map
            self._healthy_vs_allergic_weight_map = healthy_vs_allergic_weight_map
            self._responding_vs_not_weight_map = responding_vs_not_weight_map
            self._prognostic_responding_vs_not_weight_map = prognostic_responding_vs_not_weight_map

        # return the list of features and the list of ids in the same order
        feature_list = [
            id_to_features_map[sample_id] for sample_id in ids_list_w_con
        ]
        return ids_list_w_con, ids_list_wo_con, feature_list
    def _read_file(self, title, bactria_as_feature_file, samples_data_file,
                   allow_printing, perform_anna_preprocess):
        """Load OTU data, apply PCA and build the tag map for ``self._task``.

        Sets ``_preproccessed_data``, ``_pca_obj``, ``_index_to_id_map``,
        ``_id_to_features_map``, ``_id_to_tag_map``, ``_weight_map`` and
        ``_feature_list`` on ``self``; ``_ids_list`` is set when
        ``self._task`` is one of the two handled tasks.

        Args:
            title: Not used by this method.
            bactria_as_feature_file: CSV file with the OTU feature table; it
                must contain 'Feature ID' and 'Taxonomy' columns.
            samples_data_file: CSV file with the per-sample mapping data.
            allow_printing: Not used by this method.
            perform_anna_preprocess: Selects which preprocessing branch is
                run (see the two ``preprocess_data`` calls below).

        Returns:
            None.
        """
        features = pd.read_csv(bactria_as_feature_file, header=1)
        cols = list(features.columns)
        # The column list is not used further in this method; the two
        # remove() calls raise ValueError when either column is missing.
        cols.remove('Feature ID')
        cols.remove('Taxonomy')

        OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, bactria_as_feature_file),
                             os.path.join(SCRIPT_DIR, samples_data_file),
                             from_QIIME=True,
                             id_col='Feature ID',
                             taxonomy_col='Taxonomy')

        if perform_anna_preprocess:
            preproccessed_data = preprocess_data(OtuMf.otu_file,
                                                 visualize_data=False,
                                                 taxonomy_col='Taxonomy',
                                                 taxnomy_level=6)
            mapping_file = OtuMf.mapping_file['XXXXX']
            mapping_disease = {
                'a': 0,
                'b': 1,  # 'Cashew' + 'Hazelnut' + 'Walnut'
                'c': 2,
                'd': 3
            }
            mapping_file = mapping_file.map(mapping_disease)
            preproccessed_data, mapping_file = distance_learning(
                perform_distance=True,
                level=self._taxnomy_level,
                preproccessed_data=preproccessed_data,
                mapping_file=mapping_file)
        else:
            preproccessed_data = preprocess_data(
                OtuMf.otu_file,
                visualize_data=False,
                taxnomy_level=self._taxnomy_level,
                taxonomy_col='Taxonomy',
                preform_taxnomy_group=True)
        self._preproccessed_data = preproccessed_data

        # NOTE(review): n_components is not defined in this method; it is
        # presumably a module-level name - confirm.
        otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(
            preproccessed_data, n_components=n_components, visualize=False)
        self._pca_obj = pca_obj

        index_to_id_map = {}
        id_to_features_map = {}
        for i, row in enumerate(otu_after_pca_wo_taxonomy.values):
            row_id = otu_after_pca_wo_taxonomy.index[i]
            id_to_features_map[row_id] = row
            index_to_id_map[i] = row_id

        self._index_to_id_map = index_to_id_map
        self._id_to_features_map = id_to_features_map

        # Mapping-file id -> 'SampleCode' for every id that does not start
        # with "Con".
        sample_id_to_sample_code_map = {
            sample_id: sample_code
            for sample_id, sample_code in zip(OtuMf.mapping_file.index,
                                              OtuMf.mapping_file["SampleCode"])
            if not sample_id.startswith("Con")
        }

        # ------------------------------------ each TASK creates different tag map --------------------------------
        before_ids = []
        id_to_tag_map = {}

        for sample_id in OtuMf.mapping_file.index:
            treatment_point = OtuMf.mapping_file.loc[sample_id,
                                                     "TreatmentPoint"]
            if treatment_point == "before":
                sample_code = sample_id_to_sample_code_map[sample_id]
                if sample_code in id_to_features_map:
                    before_ids.append(sample_id)
                else:
                    # Report the code of the sample being examined (the old
                    # message printed a stale variable from an earlier loop).
                    print(str(sample_code) + " not in id_to_features_map")
            elif treatment_point == "Control":
                before_ids.append(sample_id)

        # Translate ids to sample codes where a code exists; other ids are
        # kept unchanged.
        code_list = [
            sample_id_to_sample_code_map.get(sample_id, sample_id)
            for sample_id in before_ids
        ]

        # HEALTH_BEFORE_TREATMENT_TASK
        if self._task == "health_before_treatment_task":
            for sample_id, sample_code in zip(before_ids, code_list):
                treatment_point = OtuMf.mapping_file.loc[sample_id,
                                                         "TreatmentPoint"]
                if treatment_point == "before":
                    if sample_code in id_to_features_map:
                        id_to_tag_map[sample_code] = 1
                elif treatment_point == "Control":
                    id_to_tag_map[sample_id] = 0

            self._ids_list = list(id_to_tag_map.keys())

        # ALLERGY_TYPE_BEFORE_TREATMENT_TASK
        elif self._task == "allergy_type_before_treatment_task":
            # Tags: 0 'Milk', 1 'Tree_nut' ('Cashew' + 'Hazelnut' + 'Walnut'),
            # 2 'Peanut', 3 'Sesame'. Other allergy types get no tag.
            raw_label_to_tag = {
                'Milk': 0,
                'Milk_suspected': 0,
                'milk': 0,
                'Cashew': 1,
                'Cashew ': 1,
                'Hazelnut': 1,
                'Walnut': 1,
                'Nuts': 1,
                'Peanut': 2,
                'Sesame': 3,
            }
            for sample_id, sample_code in zip(before_ids, code_list):
                a = OtuMf.mapping_file.loc[sample_id, 'AllergyType']
                tag = raw_label_to_tag.get(a)
                if tag is not None:
                    id_to_tag_map[sample_code] = tag
            self._ids_list = [
                sample_code for sample_code in code_list
                if not sample_code.startswith("Con")
            ]

        self._id_to_tag_map = id_to_tag_map

        # -------------------------------------------- weights !--------------------------------------------
        # Weight of a class = 1 - (its share of the tagged samples). The map
        # is keyed by the class label itself, so labels do not have to be
        # exactly 0..k-1; for such labels the result is unchanged.
        labels, counts = np.unique(np.array(list(id_to_tag_map.values())),
                                   return_counts=True)
        total = counts.sum()
        self._weight_map = {
            int(label): 1 - (count / total)
            for label, count in zip(labels, counts)
        }

        # Features in the same order as self._ids_list. Entries that are
        # mapping-file ids are translated to their sample code first; all
        # other entries are looked up directly (this lookup used to be
        # evaluated and then discarded, dropping those samples).
        feature_list = []
        for entry in self._ids_list:
            if entry in sample_id_to_sample_code_map:
                feature_list.append(
                    id_to_features_map[sample_id_to_sample_code_map[entry]])
            else:
                feature_list.append(id_to_features_map[entry])
        self._feature_list = feature_list
Пример #3
0
    def _read_file(self, title, bactria_as_feature_file, samples_data_file, allow_printing, perform_anna_preprocess, visualize_pre, re_arange):
        """Load the OTU and mapping files, clean them and dump the mapping file.

        As written, the method builds an ``OtuMfHandler`` from the two files
        (resolved against ``SCRIPT_DIR``), passes it through
        ``self.remove_duplicate`` and ``self.rearange_data``, writes the
        resulting mapping file to ``GDM_tag_rmv_dup.csv`` and returns ``None``.

        NOTE(review): the bare ``return`` below makes the rest of the body
        (preprocessing, PCA, tag map, weights, feature list) unreachable.
        Confirm whether the early return is a leftover from a one-off export
        or the intended behaviour before removing either part.
        """
        
        # Body site used to filter samples in the (currently unreachable)
        # tag-map section below.
        sample = "SALIVA"


        OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, bactria_as_feature_file),
                             os.path.join(SCRIPT_DIR, samples_data_file),
                             from_QIIME=False, id_col='ID',
                             taxonomy_col='taxonomy')
        #rare_bacteria = self.find_rare_bacteria(OtuMf)
        #OtuMf = self.drop_rare_bacteria(rare_bacteria, OtuMf)
        OtuMf = self.remove_duplicate(OtuMf)
        OtuMf = self.rearange_data(OtuMf, re_arange)
        #OtuMf.otu_file.T.to_csv("GDM_OTU_rmv_dup_arrange.csv")
        OtuMf.mapping_file.to_csv("GDM_tag_rmv_dup.csv")
        #returnmapping_file
        # NOTE(review): everything after this return is dead code.
        return
        
        if perform_anna_preprocess:
            preproccessed_data = preprocess_data(OtuMf.otu_file.T, visualize_data=False, taxonomy_col='taxonomy',
                                                 taxnomy_level=8)
            mapping_file = OtuMf.mapping_file['Control_GDM']
            mapping_disease = {'Control': 0,
                               'GDM': 1}
            mapping_file = mapping_file.map(mapping_disease)
            preproccessed_data, mapping_file = distance_learning(perform_distance=True, level=4,
                                                                 preproccessed_data=preproccessed_data,
                                                                 mapping_file=mapping_file)
            self._preproccessed_data = preproccessed_data
            self._preproccessed_data.to_csv('anna_pca_old_loader.csv')
        else:
            # NOTE(review): rearange_data was already applied above, so this
            # would apply it a second time when re_arange != 0.
            if re_arange != 0:
                OtuMf = self.rearange_data(OtuMf, re_arange)
            preproccessed_data = preprocess_data(OtuMf.otu_file.T, visualize_data=visualize_pre, taxnomy_level=self._taxnomy_level,
                                                 taxonomy_col='taxonomy', preform_taxnomy_group=True, std_to_delete = 0.25)
        self.OtuMf = OtuMf
        self._preproccessed_data = preproccessed_data
        
        # NOTE(review): n_components is not defined in this method; it is
        # presumably a module-level name - confirm.
        otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(preproccessed_data, n_components=n_components,
                                                          visualize=False)
        self._pca_obj = pca_obj
        
        # This line discards the PCA output computed above and uses the
        # preprocessed data instead; remove it if the PCA output is needed.
        otu_after_pca_wo_taxonomy = self._preproccessed_data
        
        
        
        # Row position -> sample id, and sample id -> feature row.
        index_to_id_map = {}
        id_to_features_map = {}
        for i, row in enumerate(otu_after_pca_wo_taxonomy.values):
            id_to_features_map[otu_after_pca_wo_taxonomy.index[i]] = row
            index_to_id_map[i] = otu_after_pca_wo_taxonomy.index[i]
        self._index_to_id_map = index_to_id_map
        self._id_to_features_map = id_to_features_map

        ids_whole_list = otu_after_pca_wo_taxonomy.index.tolist()

        # ------------------------------------ each TASK creates different tag map --------------------------------
        id_to_tag_map = {}

        tag_map = {'Control': 0, 'GDM': 1}
        # Keep first-trimester samples only; unless sample == "both", also
        # restrict to the chosen body site. Note the two branches compare the
        # trimester differently (string '1' vs. int 1).
        if sample == "both":
            T1_ids = [id for id in ids_whole_list if OtuMf.mapping_file["trimester"][id] == '1']
        else:
            T1_ids = [id for id in ids_whole_list if int(OtuMf.mapping_file["trimester"][id]) == 1 and OtuMf.mapping_file["body_site"][id] == sample]
        # NOTE(review): these two counters are never updated or read.
        counter_GDM = 0
        counter_Control = 0
        for id in T1_ids:
            id_to_tag_map[id] = tag_map[OtuMf.mapping_file["Control_GDM"][id]]
        self._ids_list = T1_ids
        self._id_to_tag_map = id_to_tag_map

        # -------------------------------------------- weights !--------------------------------------------
        # calculate weights: each class gets 1 - (its share of the samples)
        y = list(self._id_to_tag_map.values())
        classes_sum = [np.sum(np.array(y) == unique_class) for unique_class in
                       np.unique(np.array(y))]
        classes_ratio = [1 - (a / sum(classes_sum)) for a in classes_sum]
        # NOTE(review): 'weights' is not used after this line; it indexes
        # classes_ratio by label value.
        weights = [classes_ratio[a] for a in np.array(y)]
        self._weight_map = {i: classes_ratio[i] for i in range(len(classes_ratio))}

        # store the list of features in the same order as self._ids_list
        self._feature_list = [self._id_to_features_map[id] for id in self._ids_list]
Пример #4
0
    def _read_file(self, title, bactria_as_feature_file, samples_data_file,
                   allow_printing, perform_anna_preprocess):
        """Load OTU data, apply PCA and build the tag map for ``self._task``.

        Sets ``_preproccessed_data``, ``_pca_obj``, ``_index_to_id_map``,
        ``_id_to_features_map``, ``_ids_list``, ``_id_to_tag_map``,
        ``_weight_map`` and ``_feature_list`` on ``self``.

        Args:
            title: Not used by this method.
            bactria_as_feature_file: CSV file with the OTU feature table; it
                must contain 'Feature ID' and 'Taxonomy' columns.
            samples_data_file: CSV file with the per-sample mapping data.
            allow_printing: Not used by this method.
            perform_anna_preprocess: Selects which preprocessing branch is
                run (see the two ``preprocess_data`` calls below).

        Returns:
            None.

        Raises:
            ValueError: If ``self._task`` is not "health task",
                "prognostic task" or "diagnostics task".
        """
        features = pd.read_csv(bactria_as_feature_file, header=1)
        cols = list(features.columns)
        # The column list is not used further in this method; the two
        # remove() calls raise ValueError when either column is missing.
        cols.remove('Feature ID')
        cols.remove('Taxonomy')

        OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, bactria_as_feature_file),
                             os.path.join(SCRIPT_DIR, samples_data_file),
                             from_QIIME=True,
                             id_col='Feature ID',
                             taxonomy_col='Taxonomy')

        if perform_anna_preprocess:
            preproccessed_data = preprocess_data(OtuMf.otu_file,
                                                 visualize_data=False,
                                                 taxonomy_col='Taxonomy',
                                                 taxnomy_level=6)
            mapping_file = OtuMf.mapping_file['XXXXX']
            mapping_disease = {
                'a': 0,
                'b': 1,  # 'Cashew' + 'Hazelnut' + 'Walnut'
                'c': 2,
                'd': 3
            }
            mapping_file = mapping_file.map(mapping_disease)
            preproccessed_data, mapping_file = distance_learning(
                perform_distance=True,
                level=self._taxnomy_level,
                preproccessed_data=preproccessed_data,
                mapping_file=mapping_file)
        else:
            preproccessed_data = preprocess_data(
                OtuMf.otu_file,
                visualize_data=False,
                taxnomy_level=self._taxnomy_level,
                taxonomy_col='Taxonomy',
                preform_taxnomy_group=True)
        self._preproccessed_data = preproccessed_data

        # NOTE(review): n_components is not defined in this method; it is
        # presumably a module-level name - confirm.
        otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(
            preproccessed_data, n_components=n_components, visualize=False)
        self._pca_obj = pca_obj

        index_to_id_map = {}
        id_to_features_map = {}
        for i, row in enumerate(otu_after_pca_wo_taxonomy.values):
            row_id = otu_after_pca_wo_taxonomy.index[i]
            id_to_features_map[row_id] = row
            index_to_id_map[i] = row_id

        self._index_to_id_map = index_to_id_map
        self._id_to_features_map = id_to_features_map
        ids_list = otu_after_pca_wo_taxonomy.index.tolist()
        # The first 62 rows are dropped to obtain the non-control samples.
        ids_list_wo_con = otu_after_pca_wo_taxonomy.index.drop(
            otu_after_pca_wo_taxonomy.index[0:62])

        def binary_success_tags(samples):
            # 1 when 'SuccessDescription' is 'A1', 0 for any other value.
            return {
                sample:
                1 if OtuMf.mapping_file.loc[sample,
                                            'SuccessDescription'] == 'A1' else 0
                for sample in samples
            }

        if self._task == "health task":
            self._ids_list = ids_list
            id_to_tag_map = {
                sample: 1 if sample.startswith('Con') else 0
                for sample in ids_list
            }
        elif self._task == "prognostic task":
            # Only samples taken before treatment are tagged.
            before_treatment_ids = [
                sample for sample in ids_list_wo_con
                if OtuMf.mapping_file.loc[sample, 'TreatmentPoint'] == "before"
            ]
            self._ids_list = list(before_treatment_ids)
            id_to_tag_map = binary_success_tags(before_treatment_ids)
        elif self._task == "diagnostics task":
            self._ids_list = list(ids_list_wo_con)
            id_to_tag_map = binary_success_tags(ids_list_wo_con)
        else:
            # Previously an unknown task surfaced as an UnboundLocalError in
            # the weights section below.
            raise ValueError("unknown task: {!r}".format(self._task))
        self._id_to_tag_map = id_to_tag_map

        # -------------------------------------------- weights !--------------------------------------------
        # Weight of a class = 1 - (its share of the tagged samples). The map
        # is keyed by the class label itself, so it also works when only one
        # of the two classes is present; for labels 0..k-1 the result is
        # unchanged.
        labels, counts = np.unique(np.array(list(id_to_tag_map.values())),
                                   return_counts=True)
        total = counts.sum()
        self._weight_map = {
            int(label): 1 - (count / total)
            for label, count in zip(labels, counts)
        }

        # Features in the same order as self._ids_list, so that features and
        # ids line up for every task (the list used to be built from all PCA
        # rows regardless of the task).
        self._feature_list = [
            id_to_features_map[sample_id] for sample_id in self._ids_list
        ]