Example #1
    def generate(self, input=None):
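        # PCA of the input DataSet: fits PCA on the transposed data matrix,
        # keeps the top 50 variables by absolute weight for labelling, and
        # returns scores, weights and one DataSet per principal component.
        # (PCA is assumed to be sklearn.decomposition.PCA, np is NumPy and
        # DataSet is the host application's data container.)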
        data = input.data

        pca = PCA(n_components=self.config.get("number_of_components"))
        pca.fit(data.T)  # Transpose it, as vars need to be along the top

        weights = pca.transform(data.T)  # Get weights?

        # Label up the top 50 (the values are retained; just for clarity)
        wmx = np.amax(np.absolute(weights), axis=1)

        dso_z = list(zip(input.scales[1], input.entities[1], input.labels[1]))
        dso_z = sorted(zip(dso_z, wmx), key=lambda x: x[1])[-50:]  # Top 50

        dso_z = [x for x, wmx in dso_z]

        # Build scores into a dso no_of_samples x no_of_principal_components
        scored = DataSet(size=(len(pca.components_[0]), len(pca.components_)))
        scored.labels[0] = input.labels[0]
        scored.classes[0] = input.classes[0]

        for n, s in enumerate(pca.components_):
            scored.data[:, n] = s
            scored.labels[1][n] = "Principal Component %d (%0.2f%%)" % (n + 1, pca.explained_variance_ratio_[0] * 100.0)

        weightsd = DataSet(size=weights.T.shape)
        weightsd.data = weights.T

        weightsd.scales[1] = input.scales[1]

        dso_pc = {}
        for n in range(0, weights.shape[1]):
            pcd = DataSet(size=(1, input.shape[1]))
            pcd.entities[1] = input.entities[1]
            pcd.labels[1] = input.labels[1]
            pcd.scales[1] = input.scales[1]
            pcd.data = weights[:, n : n + 1].T
            dso_pc["pc%s" % (n + 1)] = pcd
            weightsd.labels[0][n] = "PC %s" % (n + 1)
            weightsd.classes[0][n] = "PC %s" % (n + 1)

        return dict(
            list({"dso": input, "pca": pca, "scores": scored, "weights": weightsd, "wmx": wmx, "dso_z": dso_z}.items())
            + list(dso_pc.items())
        )
Example #2
    def generate(self, input=None):
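        # Builds per-pathway compound and reaction sets and passes them to
        # self.build_matrix() for all pathways and for the currently active
        # ones; the two active-pathway matrices are returned as square
        # DataSets. (db.dbm and build_matrix are provided elsewhere by the
        # host application.)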

        pathways = [k for k, v in db.dbm.get_pathways()]
        pathway_compounds = dict()

        for k, p in db.dbm.get_pathways():
            pathway_compounds[p.id] = set([m for m in p.compounds])

        data_m, labels_m = self.build_matrix(pathways, pathway_compounds)

        pathway_reactions = dict()

        for k, p in list(db.dbm.pathways.items()):
            pathway_reactions[p.id] = set([m for m in p.reactions])

        data_r, labels_r = self.build_matrix(pathways, pathway_reactions)

        pathway_active_reactions = dict()
        pathway_active_compounds = dict()
        active_pathways = input.entities[1]
        active_pathways_id = []

        for p in active_pathways:
            pathway_active_reactions[p.id] = set([r for r in p.reactions])
            pathway_active_compounds[p.id] = set([r for r in p.compounds])
            active_pathways_id.append(p.id)

        data_ar, labels_ar = self.build_matrix(active_pathways_id,
                                               pathway_active_reactions)
        data_am, labels_am = self.build_matrix(active_pathways_id,
                                               pathway_active_compounds)

        dim = len(data_ar)

        dso_r = DataSet(size=(dim, dim))
        dso_r.data = data_ar
        dso_r.labels[1] = labels_ar

        dso_m = DataSet(size=(dim, dim))
        dso_m.data = data_am
        dso_m.labels[1] = labels_am

        return {'dso_r': dso_r, 'dso_m': dso_m}
Example #3
    def normalise(self, dsi):
        # Generate bin values for range start_scale to end_scale
        # Calculate the number of bins at binsize across range
        dso = DataSet(size=dsi.shape)
        dso.import_data(dsi)

        dso.data = self.algorithms[self.config.get('algorithm')](dso.data)
        # -- optionally use the line widths and take max within each of these for each spectra (peak shiftiness)
        # Filter the original data with those locations and output\

        return dso
Example #4
    def normalise(self, dsi):
        # Generate bin values for range start_scale to end_scale
        # Calculate the number of bins at binsize across range
        dso = DataSet(size=dsi.shape)
        dso.import_data(dsi)

        dso.data = self.algorithms[self.config.get('algorithm')](dso.data)
        # -- optionally use the line widths and take max within each of these for each spectra (peak shiftiness)
        # Filter the original data with those locations and output\

        return dso
Example #5
    def generate(self, input=None):

        pathways = list(self.m.db.pathways.keys())
        pathway_compounds = dict()

        for k, p in list(self.m.db.pathways.items()):
            pathway_compounds[p.id] = set([m for m in p.compounds])

        data_m, labels_m = self.build_matrix(pathways, pathway_compounds)

        pathway_reactions = dict()

        for k, p in list(self.m.db.pathways.items()):
            pathway_reactions[p.id] = set([m for m in p.reactions])

        data_r, labels_r = self.build_matrix(pathways, pathway_reactions)

        pathway_active_reactions = dict()
        pathway_active_compounds = dict()
        active_pathways = input.entities[1]  # [self.parent.db.pathways[p] for p in self.parent.config.value('/Pathways/Show').split(',')]
        active_pathways_id = []

        for p in active_pathways:
            pathway_active_reactions[p.id] = set([r for r in p.reactions])
            pathway_active_compounds[p.id] = set([r for r in p.compounds])
            active_pathways_id.append(p.id)

        data_ar, labels_ar = self.build_matrix(active_pathways_id, pathway_active_reactions)
        data_am, labels_am = self.build_matrix(active_pathways_id, pathway_active_compounds)

        dim = len(data_ar)

        dso_r = DataSet(size=(dim, dim))
        dso_r.data = data_ar
        dso_r.labels[1] = labels_ar

        dso_m = DataSet(size=(dim, dim))
        dso_m.data = data_am
        dso_m.labels[1] = labels_am

        return {'dso_r': dso_r, 'dso_m': dso_m}
Example #6
    def generate(self, input=None):

        pathways = list(self.m.db.pathways.keys())
        pathway_compounds = dict()

        for k, p in list(self.m.db.pathways.items()):
            pathway_compounds[p.id] = set([m for m in p.compounds])

        data_m, labels_m = self.build_matrix(pathways, pathway_compounds)

        pathway_reactions = dict()

        for k, p in list(self.m.db.pathways.items()):
            pathway_reactions[p.id] = set([m for m in p.reactions])

        data_r, labels_r = self.build_matrix(pathways, pathway_reactions)

        pathway_active_reactions = dict()
        pathway_active_compounds = dict()
        active_pathways = input.entities[1]  # [self.parent.db.pathways[p] for p in self.parent.config.value('/Pathways/Show').split(',')]
        active_pathways_id = []

        for p in active_pathways:
            pathway_active_reactions[p.id] = set([r for r in p.reactions])
            pathway_active_compounds[p.id] = set([r for r in p.compounds])
            active_pathways_id.append(p.id)

        data_ar, labels_ar = self.build_matrix(active_pathways_id, pathway_active_reactions)
        data_am, labels_am = self.build_matrix(active_pathways_id, pathway_active_compounds)

        dim = len(data_ar)

        dso_r = DataSet(size=(dim, dim))
        dso_r.data = data_ar
        dso_r.labels[1] = labels_ar

        dso_m = DataSet(size=(dim, dim))
        dso_m.data = data_am
        dso_m.labels[1] = labels_am

        return {'dso_r': dso_r, 'dso_m': dso_m}
Example #7
    def generate(self, input=None):

        pathways = [k for k, v in db.dbm.get_pathways()]
        pathway_compounds = dict()

        for k, p in db.dbm.get_pathways():
            pathway_compounds[p.id] = set([m for m in p.compounds])

        data_m, labels_m = self.build_matrix(pathways, pathway_compounds)

        pathway_reactions = dict()

        for k, p in list(db.dbm.pathways.items()):
            pathway_reactions[p.id] = set([m for m in p.reactions])

        data_r, labels_r = self.build_matrix(pathways, pathway_reactions)

        pathway_active_reactions = dict()
        pathway_active_compounds = dict()
        active_pathways = input.entities[1]
        active_pathways_id = []

        for p in active_pathways:
            pathway_active_reactions[p.id] = set([r for r in p.reactions])
            pathway_active_compounds[p.id] = set([r for r in p.compounds])
            active_pathways_id.append(p.id)

        data_ar, labels_ar = self.build_matrix(active_pathways_id, pathway_active_reactions)
        data_am, labels_am = self.build_matrix(active_pathways_id, pathway_active_compounds)

        dim = len(data_ar)

        dso_r = DataSet(size=(dim, dim))
        dso_r.data = data_ar
        dso_r.labels[1] = labels_ar

        dso_m = DataSet(size=(dim, dim))
        dso_m.data = data_am
        dso_m.labels[1] = labels_am

        return {'dso_r': dso_r, 'dso_m': dso_m}
Example #8
    def generate(self, input=None):
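        # PCA on the input data as-is (samples in rows): returns a scores
        # DataSet, a weights DataSet built from pca.components_ and one
        # DataSet per principal component. (PCA is assumed to be
        # sklearn.decomposition.PCA; DataSet is the host data container.)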
        data = input.data

        pca = PCA(n_components=self.config.get('number_of_components'))
        pca.fit(data)
        scores = pca.transform(data)

        # Build scores into a dso no_of_samples x no_of_principal_components
        scored = DataSet(size=(scores.shape))
        scored.labels[0] = input.labels[0]
        scored.classes[0] = input.classes[0]
        scored.data = scores

        for n in range(0, scored.shape[1]):
            scored.labels[1][n] = 'Principal Component %d (%0.2f%%)' % (n + 1, pca.explained_variance_ratio_[n] * 100.)

        weightsd = DataSet(size=pca.components_.shape)
        weightsd.data = pca.components_

        weightsd.scales[1] = input.scales[1]

        dso_pc = {}
        for n in range(0, pca.components_.shape[0]):
            pcd = DataSet(size=(1, input.shape[1]))
            pcd.entities[1] = input.entities[1]
            pcd.labels[1] = input.labels[1]
            pcd.scales[1] = input.scales[1]
            pcd.data = weightsd.data[n:n + 1, :]
            dso_pc['pc%s' % (n + 1)] = pcd
            weightsd.labels[0][n] = "PC %s" % (n + 1)
            #weightsd.classes[0][n] = "PC %s" % (n+1)

        return dict(list({
            'dso': input,
            'pca': pca,
            'scores': scored,
            'weights': weightsd,
        }.items()) + list(dso_pc.items()))
Example #9
    def generate(self, input=None):
        data = input.data

        pca = PCA(n_components=self.config.get('number_of_components'))
        pca.fit(data.T)  # Transpose it, as vars need to be along the top

        weights = pca.transform(data.T)  # Get weights?

        # Label up the top 50 (the values are retained; just for clarity)
        wmx = np.amax(np.absolute(weights), axis=1)

        dso_z = list(zip(input.scales[1], input.entities[1], input.labels[1]))
        dso_z = sorted(zip(dso_z, wmx), key=lambda x: x[1])[-50:]  # Top 50

        dso_z = [x for x, wmx in dso_z]

        # Build scores into a dso no_of_samples x no_of_principal_components
        scored = DataSet(size=(len(pca.components_[0]), len(pca.components_)))
        scored.labels[0] = input.labels[0]
        scored.classes[0] = input.classes[0]

        for n, s in enumerate(pca.components_):
            scored.data[:, n] = s
            scored.labels[1][n] = 'Principal Component %d (%0.2f%%)' % (
                n + 1, pca.explained_variance_ratio_[n] * 100.)

        dso_pc = {}
        for n in range(0, weights.shape[1]):
            pcd = DataSet(size=(1, input.shape[1]))
            pcd.entities[1] = input.entities[1]
            pcd.labels[1] = input.labels[1]
            pcd.scales[1] = input.scales[1]
            pcd.data = weights[:, n:n + 1].T
            dso_pc['pc%s' % (n + 1)] = pcd

        return dict(
            list({
                'dso': input,
                'pca': pca,
                'scores': scored,
                #'weights': weights,
                'wmx': wmx,
                'dso_z': dso_z,
            }.items()) + list(dso_pc.items()))
Example #10
    def load_datafile(self, file):
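        # Imports a Chenomx profiled-concentrations export (tab-separated):
        # compound labels come from the header rows, sample labels from the
        # first column, and values that fail float() conversion become 0.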

        reader = csv.reader(open(file, 'rU'), delimiter='\t', dialect='excel')
        hrow = next(reader)  # Get top row

        slabels = []
        labels = []  # Defaults, in case the header row isn't recognised below
        entities = []
        data = []

        if hrow[0] == 'Profiled Data Type':  # Is a Chenomx output file; use the other columns to map data scale/etc. once implemented
            next(reader)  # Skip date row
            hrow = next(reader)
            labels = hrow[2:]  # We strip off the pH here; might be nice to keep it
            entities = [self.m.db.synrev[l] if l in self.m.db.synrev else None for l in labels]  # Map to entities if they exist

            next(reader)  # Skip compound ID
            next(reader)  # Skip InChI
            next(reader)  # Skip SMILES

            for hrow in reader:  # Now read the data rows
                slabels.append(hrow[0])
                td = []
                for x in hrow[2:]:
                    try:
                        td.append(float(x))
                    except ValueError:
                        td.append(0)
                data.append(td)

        data = np.array(data)
        dso = DataSet(size=data.shape)
        print(data.shape)
        dso.labels[1] = labels
        dso.entities[1] = entities
        dso.labels[0] = slabels
        dso.data = data

        return {'output': dso}
Example #11
    def load_bml_datafile(self, data_path, target, name):
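        # Imports a BML/FIMA tab-separated export: scans for the 'metabolite'
        # header row, reads one row per metabolite and returns a
        # samples x metabolites DataSet. (utils.nonull is a host helper.)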

        dso = DataSet()

        # Read in data for the graphing metabolite, with associated value (generate mean)
        reader = csv.reader(utils.nonull(open(data_path, 'rb')),
                            delimiter='\t',
                            dialect='excel')

        for row in reader:
            if row and row[0] == 'metabolite':  # Look for the top row
                break
        else:
            return

        samples = row[1:-2]  # Sample identities
        samples = [sample[8:-1] for sample in samples]

        xdim = 0
        ydim = len(samples)

        raw_data = []
        metabolites = []

        for row in reader:
            xdim += 1
            metabolites.append(row[0])

            raw_data.append([float(i) for i in row[1:-2]])

        dso = DataSet(size=(ydim, xdim))
        dso.labels[1] = metabolites

        dso.data = np.array(raw_data).T

        dso.name = name
        dso.description = 'Imported from FIMA (%s)' % name

        return dso
Example #12
    def load_bml_datafile(self, data_path, target, name):

        dso = DataSet()

        # Read in data for the graphing metabolite, with associated value (generate mean)
        reader = csv.reader(utils.nonull(open(data_path, 'rb')), delimiter='\t', dialect='excel')

        for row in reader:
            if row and row[0] == 'metabolite':  # Look for the top row
                break
        else:
            return

        samples = row[1:-2]  # Sample identities
        samples = [sample[8:-1] for sample in samples]

        xdim = 0
        ydim = len(samples)

        raw_data = []
        metabolites = []

        for row in reader:
            xdim += 1
            metabolites.append(row[0])

            raw_data.append([float(i) for i in row[1:-2]])

        dso = DataSet(size=(ydim, xdim))
        dso.labels[1] = metabolites

        dso.data = np.array(raw_data).T

        dso.name = name
        dso.description = 'Imported from FIMA (%s)' % name

        return dso
Example #13
    def load_datafile(self, file):

        reader = csv.reader(open(file, 'rU'), delimiter='\t', dialect='excel')
        hrow = next(reader)  # Get top row

        slabels = []
        labels = []  # Defaults, in case the header row isn't recognised below
        entities = []
        data = []

        if hrow[0] == 'Profiled Data Type':  # Is a Chenomx output file; use the other columns to map data scale/etc. once implemented
            next(reader)  # Skip date row
            hrow = next(reader)
            labels = hrow[2:]  # We strip off the pH here; might be nice to keep it
            entities = [self.m.db.synrev[l] if l in self.m.db.synrev else None for l in labels]  # Map to entities if they exist

            next(reader)  # Skip compound ID
            next(reader)  # Skip InChI
            next(reader)  # Skip SMILES

            for hrow in reader:  # Now read the data rows
                slabels.append(hrow[0])
                td = []
                for x in hrow[2:]:
                    try:
                        td.append(float(x))
                    except ValueError:
                        td.append(0)
                data.append(td)

        data = np.array(data)
        dso = DataSet(size=data.shape)
        print(data.shape)
        dso.labels[1] = labels
        dso.entities[1] = entities
        dso.labels[0] = slabels
        dso.data = data

        return {'output': dso}
Example #14
    def generate(self, input_1=None, input_2=None, input_3=None, input_4=None):
        #dsi = input
        # Iterate all the compounds in the current analysis
        # Assign score to each of the compound's pathways
        # Sum up, crop and return a list of pathway_ids to display
        # Pass this in as the list to view
        # + requested pathways, - excluded pathways
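        # Scores each pathway by summing the per-compound scores of its
        # entities (scoring mode set by /Data/MiningType), optionally sharing
        # scores across pathways and scaling by pathway size, then returns the
        # top /Data/MiningDepth pathways as a 1 x N DataSet.
        # (Assumes defaultdict from collections and np as NumPy; entities
        # expose .pathways, pathways expose .reactions and .name.)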

        db = self.m.db

        mining_depth = self.config.get('/Data/MiningDepth')
        mining_type = self.config.get('/Data/MiningType')

        pathway_scores = defaultdict(int)

        for dsi in input_1, input_2, input_3, input_4:
            if dsi is None:
                continue

            print("Mining using '%s'" % mining_type)

            for n, entity in enumerate(dsi.entities[1]):
                if entity is None:
                    continue  # Skip

                score = dsi.data[0, n]
                #score = self.analysis[ m_id ]['score']

                # 1' neighbours; 2' neighbours etc. add score
                # Get a list of methods in connected reactions, add their score % to this compound
                # if m_id in db.compounds.keys():
                #    n_compounds = [r.compounds for r in db.compounds[ m_id ].reactions ]
                #     print n_compounds
                #     n_compounds = [m for ml in n_compounds for m in ml if n_m.id in self.analysis and m.id != m_id ]
                #     for n_m in n_compounds:
                #         score += self.analysis[ n_m.id ]['score'] * 0.5

                # Get the entity's pathways
                pathways = entity.pathways
                if not pathways:
                    continue

                if self.config.get('/Data/MiningShared'):
                    # Share the change score between the associated pathways
                    # this prevents compounds having undue influence
                    score = score / len(pathways)

                for p in pathways:
                    mining_val = {
                        'c': abs(score),
                        'u': max(0, score),
                        'd': abs(min(0, score)),
                        'm': 1.0,
                        't': score,
                        }
                    pathway_scores[p] += mining_val[mining_type]


            # If we're using tendency scaling; abs the scores here
            if mining_type == 't':
                for p, v in list(pathway_scores.items()):
                    pathway_scores[p] = abs(v)

            # If scaling relative to pathway size, normalise each score by the
            # number of reactions in the pathway
            if self.config.get('/Data/MiningRelative'):
                print("Scaling pathway scores to pathway sizes...")
                for p, v in list(pathway_scores.items()):
                    pathway_scores[p] = float(v) / len(p.reactions)

        if not pathway_scores:
            # No data
            raise BaseException

        # Now take the accumulated scores; and create the output
        pathway_scorest = list(pathway_scores.items())  # Switch it to a list so we can sort
        pathway_scorest = [(p, v) for p, v in pathway_scorest if v > 0]  # Remove any scores of 0
        pathway_scorest.sort(key=lambda tup: tup[1], reverse=True)  # Sort by scores (either system)

        # Get top N defined by mining_depth parameter
        keep_pathways = pathway_scorest[0:mining_depth]
        remaining_pathways = pathway_scorest[mining_depth:mining_depth + 100]

        print("Mining recommended %d out of %d" % (len(keep_pathways), len(pathway_scores)))

        for n, p in enumerate(keep_pathways):
            print("- %d. %s [%.2f]" % (n + 1, p[0].name, p[1]))

        #self.analysis['mining_ranked_remaining_pathways'] = []

        #if remaining_pathways:
        #    print "Note: Next pathways by current scoring method are..."
        #    for n2,p in enumerate(remaining_pathways):
        #        print "- %d. %s [%.2f]" % (n+n2+1, db.pathways[ p[0] ].name, p[1])
        #        self.analysis['mining_ranked_remaining_pathways'].append( p[0] )

        #self.analysis_suggested_pathways = [db.pathways[p[0]] for p in pathway_scorest]
        dso = DataSet(size=(1, len(keep_pathways)))
        dso.entities[1] = [k for k, v in keep_pathways]
        dso.labels[1] = [k.name for k, v in keep_pathways]
        dso.data = np.array([v for k, v in keep_pathways], ndmin=2)

        dso.labels[0][0] = "Pathway mining scores"

        return {'output': dso}
Example #15
    def generate(self, input=None):   
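        # PLS-DA: builds a 0/1 class vector (control vs. everything else),
        # fits a PLS regression and returns per-LV scores, per-LV weight
        # DataSets and the raw x/y weights. (PLSRegression is assumed to be
        # sklearn.cross_decomposition.PLSRegression; DataSet is the host
        # application's data container.)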
        dso = input
        
        _experiment_test = self.config.get('experiment_test')
        _experiment_control = self.config.get('experiment_control')
                
        data = dso.data
        
        plsr = PLSRegression(n_components=self.config.get('number_of_components'),
                             scale=self.config.get('autoscale'))  #, algorithm=self.config.get('algorithm'))
        Y = np.array([0 if c == _experiment_control else 1 for c in dso.classes[0]])

        plsr.fit(data, Y)

        # Build scores into a dso no_of_samples x no_of_principal_components
        scored = DataSet(size=(len(plsr.x_scores_), len(plsr.x_scores_[0])))
        scored.labels[0] = input.labels[0]
        scored.classes[0] = input.classes[0]

        for n, s in enumerate(plsr.x_scores_.T):
            scored.data[:, n] = s
            scored.labels[1][n] = 'Latent Variable %d' % (n + 1)  #, plsr.y_weights_[0][n])
                
        # PLS-DA regions; mean +- 95% confidence in each axis for each cluster
        cw_x = defaultdict(list)
        cw_y = defaultdict(list)
        figure_regions = []

        for c in list(cw_x.keys()):
            # Calculate mean point
            cx = np.mean(cw_x[c])
            cy = np.mean(cw_y[c])

            # Calculate 95% CI
            rx = np.std(cw_x[c]) * 2  # 2sd = 95% #1.95 * ( / srn) # 1.95 * SEM => 95% confidence
            ry = np.std(cw_y[c]) * 2  #1.95 * ( / srn)

            figure_regions.append((c, cx, cy, rx, ry))

        # Label up the top 50 (the values are retained; just for clarity)
        wmx = np.amax(np.absolute(plsr.x_weights_), axis=1)
        dso_z = list(zip(dso.scales[1], dso.entities[1], dso.labels[1]))
        dso_z = sorted(zip(dso_z, wmx), key=lambda x: x[1])[-50:]  # Top 50
        dso_z = [x for x, wmx in dso_z]

        weightsd = DataSet(size=plsr.x_weights_.T.shape)
        weightsd.data = plsr.x_weights_.T
        weightsd.scales[1] = input.scales[1]

        dso_lv = {}
        for n in range(0, plsr.x_weights_.shape[1]):
            lvd = DataSet(size=(1, input.shape[1]))
            lvd.entities[1] = input.entities[1]
            lvd.labels[1] = input.labels[1]
            lvd.scales[1] = input.scales[1]
            lvd.data = plsr.x_weights_[:, n:n + 1].T
            dso_lv['lv%s' % (n + 1)] = lvd
            weightsd.labels[0][n] = "Weights on LV %s" % (n + 1)
            weightsd.classes[0][n] = "LV %s" % (n + 1)
                    
        return dict(list({
            'dso': dso,
            'scores': scored,
            'weights': weightsd,
            #'figure_data': figure_data,
            #'figure_regions': figure_regions,
            'y_weights': plsr.y_weights_,
            'x_weights': plsr.x_weights_,
        }.items()) + list(dso_lv.items()))
Example #16
    def load_csv_C(self, filename):  # Load from csv with experiments in COLUMNS, metabolites in ROWS
        # Read in data for the graphing metabolite, with associated value (generate mean)
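        # Expected layout: row 1 = sample identifiers, row 2 = class labels
        # ('.' marks excluded columns), column 1 = metabolite name or scale
        # value; remaining cells are quantities (non-numeric values become 0).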
        f = open(filename, 'rU')
        fsize = os.path.getsize(filename)
        reader = csv.reader(f, delimiter=str(','), dialect='excel')

        hrow = next(reader)  # Discard top row (sample no's)
        samples = hrow[1:]

        hrow = next(reader)  # Get 2nd row
        classesa = hrow[1:]
        classes = [c for c in classesa if c != '.']

        metabolites = []

        data = []

        added_rows = 0
        for n, row in enumerate(reader):
            metabolite = row[0]
            metabolites.append(row[0])
            quants = []
            for cn, c in enumerate(row[1:]):
                if classesa[cn] != '.':
                    try:
                        data.append(float(c))
                    except ValueError:
                        data.append(0)

            if n % 100 == 0:
                self.progress.emit(float(f.tell()) / fsize)

        data = np.asarray(data)
        data = np.reshape(data, (n + 1, len(classes))).T

        xdim = len(metabolites)  # One column per metabolite row read
        ydim = len(classes)

        # Build dataset object
        dso = DataSet(
            size=(xdim, ydim))  # self.add_data('imported_data', DataSetself) )
        dso.empty(size=(ydim, xdim))
        dso.labels[1] = metabolites

        scales = []
        mlabels = []
        for m in metabolites:
            try:
                scales.append(float(m))
                mlabels.append(None)
            except ValueError:
                scales.append(None)
                mlabels.append(m)

        dso.scales[0] = [None] * len(samples)
        dso.labels[0] = samples
        dso.classes[0] = classes
        dso.entities[0] = [None] * len(samples)

        dso.scales[1] = scales
        dso.labels[1] = mlabels
        dso.classes[1] = [None] * len(scales)
        dso.entities[1] = [None] * len(scales)

        dso.data = data

        return dso
Example #17
    def load_csv_R(self, filename):  # Load from csv with experiments in ROWS, metabolites in COLUMNS
        # Read in data for the graphing metabolite, with associated value (generate mean)
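        # Expected layout: column 1 = sample name, column 2 = class label
        # ('.' marks excluded rows); the header row holds metabolite names or
        # scale values and the remaining cells are quantities.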
        f = open(filename, 'rU')
        fsize = os.path.getsize(filename)
        reader = csv.reader(f, delimiter=str(','), dialect='excel')
        print('R')
        hrow = next(reader)  # Get top row
        metabolites = hrow[2:]
        ydim = 0
        xdim = len(metabolites)

        samples = []
        classes = []
        raw_data = []

        # Build quants table for metabolite classes
        #for metabolite in self.metabolites:
        #    quantities[ metabolite ] = defaultdict(list)

        for n, row in enumerate(reader):
            if row[1] != '.':  # Skip excluded classes # row[1] = Class
                ydim += 1  # Count only the rows we keep
                samples.append(row[0])
                classes.append(row[1])
                data_row = []
                for c in row[2:]:  # in self.metabolites:
                    try:
                        c = float(c)
                    except ValueError:
                        c = 0
                    data_row.append(c)

                raw_data.append(data_row)
                #metabolite_column = hrow.index( metabolite )
                #if row[ metabolite_column ]:
                #    data_row.append(
                #    quantities[metabolite][ row[1] ].append( float(row[ metabolite_column ]) )
                #self.statistics['ymin'] = min( self.statistics['ymin'], float(row[ metabolite_column ]) )
                #self.statistics['ymax'] = max( self.statistics['ymax'], float(row[ metabolite_column ]) )
                #else:
                #    quantities[metabolite][ row[1] ].append( 0 )
            else:
                pass

            if n % 100 == 0:
                self.progress.emit(float(f.tell()) / fsize)

                #self.statistics['excluded'] += 1

        # Build dataset object
        dso = DataSet(
            size=(xdim, ydim))  # self.add_data('imported_data', DataSetself) )
        dso.empty(size=(ydim, xdim))
        #dso.labels[1] = metabolites

        scales = []
        mlabels = []
        for m in metabolites:
            try:
                scales.append(float(m))
                mlabels.append(None)
            except ValueError:
                scales.append(None)
                mlabels.append(m)

        dso.scales[0] = [None] * len(samples)
        dso.labels[0] = samples
        dso.classes[0] = classes
        dso.entities[0] = [None] * len(samples)

        dso.scales[1] = scales
        dso.labels[1] = mlabels
        dso.entities[1] = [None] * len(scales)
        dso.classes[1] = [None] * len(scales)

        dso.data = np.array(raw_data)

        return dso
Example #18
    def generate(self, input=None):
        dso = input

        _experiment_test = self.config.get('experiment_test')
        _experiment_control = self.config.get('experiment_control')

        data = dso.data

        plsr = PLSRegression(
            n_components=self.config.get('number_of_components'),
            scale=self.config.get(
                'autoscale'))  #, algorithm=self.config.get('algorithm'))
        Y = np.array(
            [0 if c == _experiment_control else 1 for c in dso.classes[0]])
        #Y = Y.reshape( (len(dso.classes[0]),1) )

        plsr.fit(data, Y)

        #figure_data = zip( dso.classes[0], plsr.x_scores_[:,0], plsr.x_scores_[:,1])

        # Build scores into a dso no_of_samples x no_of_principal_components
        scored = DataSet(size=(len(plsr.x_scores_), len(plsr.x_scores_[0])))
        scored.labels[0] = input.labels[0]
        scored.classes[0] = input.classes[0]

        print(plsr.x_scores_.shape)
        print(scored.data.shape)

        for n, s in enumerate(plsr.x_scores_.T):
            scored.data[:, n] = s
            scored.labels[1][n] = 'Latent Variable %d (%0.2f%%)' % (
                n + 1, plsr.y_weights_[0][0] * 100)

        # PLS-DA regions; mean +- 95% confidence in each axis for each cluster
        cw_x = defaultdict(list)
        cw_y = defaultdict(list)
        figure_regions = []
        #for c,x,y in figure_data:
        #    cw_x[c].append( x )
        #    cw_y[c].append( y )

        for c in list(cw_x.keys()):
            # Calculate mean point
            cx = np.mean(cw_x[c])
            cy = np.mean(cw_y[c])

            # Calculate 95% CI
            rx = np.std(
                cw_x[c]
            ) * 2  # 2sd = 95% #1.95 * ( / srn) # 1.95 * SEM => 95% confidence
            ry = np.std(cw_y[c]) * 2  #1.95 * ( / srn)

            # Calculate 95% CI
            #srn = np.sqrt( len( cw_x[c] ) ) # Sample numbers sqrt
            #rx = 1.95*(np.std( cw_x[c] )/srn ) # 2sd = 95% #1.95 * ( / srn) # 1.95 * SEM => 95% confidence
            #ry = 1.95*(np.std( cw_y[c] )/srn ) #1.95 * ( / srn)

            figure_regions.append((c, cx, cy, rx, ry))

        # Label up the top 50 (the values are retained; just for clarity)
        wmx = np.amax(np.absolute(plsr.x_weights_), axis=1)
        dso_z = list(zip(dso.scales[1], dso.entities[1], dso.labels[1]))
        dso_z = sorted(zip(dso_z, wmx), key=lambda x: x[1])[-50:]  # Top 50
        dso_z = [x for x, wmx in dso_z]

        dso_lv = {}
        for n in range(0, plsr.x_weights_.shape[1]):
            lvd = DataSet(size=(1, input.shape[1]))
            lvd.entities[1] = input.entities[1]
            lvd.labels[1] = input.labels[1]
            lvd.scales[1] = input.scales[1]
            lvd.data = plsr.x_weights_[:, n:n + 1].T
            dso_lv['lv%s' % (n + 1)] = lvd

        return dict(
            list({
                'dso': dso,
                'scores': scored,
                #'figure_data': figure_data,
                #'figure_regions': figure_regions,
                'y_weights': plsr.y_weights_,
                'x_weights': plsr.x_weights_,
            }.items()) + list(dso_lv.items()))
Example #19
    def generate(self, input_1=None, input_2=None, input_3=None, input_4=None):
        #dsi = input
        # Iterate all the compounds in the current analysis
        # Assign score to each of the compound's pathways
        # Sum up, crop and return a list of pathway_ids to display
        # Pass this in as the list to view
        # + requested pathways, - excluded pathways

        db = self.m.db

        mining_depth = self.config.get('/Data/MiningDepth')
        mining_type = self.config.get('/Data/MiningType')

        pathway_scores = defaultdict(int)

        for dsi in input_1, input_2, input_3, input_4:
            if dsi is None:
                continue

            print("Mining using '%s'" % mining_type)

            for n, entity in enumerate(dsi.entities[1]):
                if entity is None:
                    continue  # Skip

                score = dsi.data[0, n]
                #score = self.analysis[ m_id ]['score']

                # 1' neighbours; 2' neighbours etc. add score
                # Get a list of methods in connected reactions, add their score % to this compound
                # if m_id in db.compounds.keys():
                #    n_compounds = [r.compounds for r in db.compounds[ m_id ].reactions ]
                #     print n_compounds
                #     n_compounds = [m for ml in n_compounds for m in ml if n_m.id in self.analysis and m.id != m_id ]
                #     for n_m in n_compounds:
                #         score += self.analysis[ n_m.id ]['score'] * 0.5

                # Get the entity's pathways
                pathways = entity.pathways
                if not pathways:
                    continue

                if self.config.get('/Data/MiningShared'):
                    # Share the change score between the associated pathways
                    # this prevents compounds having undue influence
                    score = score / len(pathways)

                for p in pathways:
                    mining_val = {
                        'c': abs(score),
                        'u': max(0, score),
                        'd': abs(min(0, score)),
                        'm': 1.0,
                        't': score,
                    }
                    pathway_scores[p] += mining_val[mining_type]

            # If we're using tendency scaling; abs the scores here
            if mining_type == 't':
                for p, v in list(pathway_scores.items()):
                    pathway_scores[p] = abs(v)

            # If scaling relative to pathway size, normalise each score by the
            # number of reactions in the pathway
            if self.config.get('/Data/MiningRelative'):
                print("Scaling pathway scores to pathway sizes...")
                for p, v in list(pathway_scores.items()):
                    pathway_scores[p] = float(v) / len(p.reactions)

        if not pathway_scores:
            # No data
            raise BaseException

        # Now take the accumulated scores; and create the output
        pathway_scorest = list(
            pathway_scores.items())  # Switch it to a list so we can sort
        pathway_scorest = [(p, v) for p, v in pathway_scorest
                           if v > 0]  # Remove any scores of 0
        pathway_scorest.sort(key=lambda tup: tup[1],
                             reverse=True)  # Sort by scores (either system)

        # Get top N defined by mining_depth parameter
        keep_pathways = pathway_scorest[0:mining_depth]
        remaining_pathways = pathway_scorest[mining_depth:mining_depth + 100]

        print("Mining recommended %d out of %d" %
              (len(keep_pathways), len(pathway_scores)))

        for n, p in enumerate(keep_pathways):
            print("- %d. %s [%.2f]" % (n + 1, p[0].name, p[1]))

        #self.analysis['mining_ranked_remaining_pathways'] = []

        #if remaining_pathways:
        #    print "Note: Next pathways by current scoring method are..."
        #    for n2,p in enumerate(remaining_pathways):
        #        print "- %d. %s [%.2f]" % (n+n2+1, db.pathways[ p[0] ].name, p[1])
        #        self.analysis['mining_ranked_remaining_pathways'].append( p[0] )

        #self.analysis_suggested_pathways = [db.pathways[p[0]] for p in pathway_scorest]
        dso = DataSet(size=(1, len(keep_pathways)))
        dso.entities[1] = [k for k, v in keep_pathways]
        dso.labels[1] = [k.name for k, v in keep_pathways]
        dso.data = np.array([v for k, v in keep_pathways], ndmin=2)

        dso.labels[0][0] = "Pathway mining scores"

        return {'output': dso}
Example #20
    def load_csv_C(self, filename):  # Load from csv with experiments in COLUMNS, metabolites in ROWS
        # Read in data for the graphing metabolite, with associated value (generate mean)
        f = open(filename, 'rU')
        fsize = os.path.getsize(filename)
        reader = csv.reader(f, delimiter=str(','), dialect='excel')

        hrow = next(reader)  # Discard top row (sample no's)
        samples = hrow[1:]

        hrow = next(reader)  # Get 2nd row
        classesa = hrow[1:]
        classes = [c for c in classesa if c != '.']

        metabolites = []

        data = []

        added_rows = 0
        for n, row in enumerate(reader):
            metabolite = row[0]
            metabolites.append(row[0])
            quants = []
            for cn, c in enumerate(row[1:]):
                if classesa[cn] != '.':
                    try:
                        data.append(float(c))
                    except ValueError:
                        data.append(0)

            if n % 100 == 0:
                self.progress.emit(float(f.tell()) / fsize)

        data = np.asarray(data)
        data = np.reshape(data, (n + 1, len(classes))).T

        xdim = len(metabolites)  # One column per metabolite row read
        ydim = len(classes)

        # Build dataset object
        dso = DataSet(size=(xdim, ydim))  # self.add_data('imported_data', DataSetself) )
        dso.empty(size=(ydim, xdim))
        dso.labels[1] = metabolites

        scales = []
        mlabels = []
        for m in metabolites:
            try:
                scales.append(float(m))
                mlabels.append(None)
            except ValueError:
                scales.append(None)
                mlabels.append(m)

        dso.scales[0] = [None] * len(samples)
        dso.labels[0] = samples
        dso.classes[0] = classes
        dso.entities[0] = [None] * len(samples)

        dso.scales[1] = scales
        dso.labels[1] = mlabels
        dso.classes[1] = [None] * len(scales)
        dso.entities[1] = [None] * len(scales)

        dso.data = data

        return dso
Example #21
    def load_csv_R(self, filename):  # Load from csv with experiments in ROWS, metabolites in COLUMNS
        # Read in data for the graphing metabolite, with associated value (generate mean)
        f = open(filename, 'rU')
        fsize = os.path.getsize(filename)
        reader = csv.reader(f, delimiter=str(','), dialect='excel')
        print('R')
        hrow = next(reader)  # Get top row
        metabolites = hrow[2:]
        ydim = 0
        xdim = len(metabolites)

        samples = []
        classes = []
        raw_data = []

        # Build quants table for metabolite classes
        #for metabolite in self.metabolites:
        #    quantities[ metabolite ] = defaultdict(list)

        for n, row in enumerate(reader):
            if row[1] != '.':  # Skip excluded classes # row[1] = Class
                ydim += 1  # Count only the rows we keep
                samples.append(row[0])
                classes.append(row[1])
                data_row = []
                for c in row[2:]:  # in self.metabolites:
                    try:
                        c = float(c)
                    except ValueError:
                        c = 0
                    data_row.append(c)

                raw_data.append(data_row)
                    #metabolite_column = hrow.index( metabolite )
                    #if row[ metabolite_column ]:
                    #    data_row.append(
                    #    quantities[metabolite][ row[1] ].append( float(row[ metabolite_column ]) )
                        #self.statistics['ymin'] = min( self.statistics['ymin'], float(row[ metabolite_column ]) )
                        #self.statistics['ymax'] = max( self.statistics['ymax'], float(row[ metabolite_column ]) )
                    #else:
                    #    quantities[metabolite][ row[1] ].append( 0 )
            else:
                pass

            if n % 100 == 0:
                self.progress.emit(float(f.tell()) / fsize)

                #self.statistics['excluded'] += 1

        # Build dataset object
        dso = DataSet(size=(xdim, ydim))  # self.add_data('imported_data', DataSetself) )
        dso.empty(size=(ydim, xdim))
        #dso.labels[1] = metabolites

        scales = []
        mlabels = []
        for m in metabolites:
            try:
                scales.append(float(m))
                mlabels.append(None)
            except ValueError:
                scales.append(None)
                mlabels.append(m)

        dso.scales[0] = [None] * len(samples)
        dso.labels[0] = samples
        dso.classes[0] = classes
        dso.entities[0] = [None] * len(samples)

        dso.scales[1] = scales
        dso.labels[1] = mlabels
        dso.entities[1] = [None] * len(scales)
        dso.classes[1] = [None] * len(scales)

        dso.data = np.array(raw_data)

        return dso