Example #1
    def generate(self, input=None):
        variables = self.config.get('variables')

        correlations = {}
        for n, v in enumerate(variables):
            a, b = v
            x = input.data[:, a]
            y = input.data[:, b]

            fit = np.polyfit(x, y, 1)
            dso = DataSet(size=(len(x), 2))
            dso.data[:, 0] = x
            dso.data[:, 1] = y
            
            dso.labels[1][0] = make_label_for_entry([input.scales[1][a], input.labels[1][a], input.entities[1][a]])
            dso.labels[1][1] = make_label_for_entry([input.scales[1][b], input.labels[1][b], input.entities[1][b]])
            
            slope, intercept, r_value, p_value, std_err = sp.stats.linregress(x, y)
                        
            correlations[str(n+1)] = {
                'dso': dso,
                'fit': fit,
                'label': 'r²=%0.2f, p=%0.2f' % (r_value**2, p_value)
            }
        return {'correlations':correlations}
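The DataSet and make_label_for_entry helpers above are application-specific; the statistics are plain NumPy/SciPy. A minimal standalone sketch of those calls, assuming import numpy as np and import scipy as sp:

    import numpy as np
    import scipy as sp
    import scipy.stats

    x = np.array([1.0, 2.0, 3.0, 4.0])
    y = np.array([1.1, 1.9, 3.2, 3.9])
    fit = np.polyfit(x, y, 1)  # (slope, intercept) of the least-squares line
    slope, intercept, r_value, p_value, std_err = sp.stats.linregress(x, y)
    print('r²=%0.2f, p=%0.2f' % (r_value ** 2, p_value))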
Example #2
    def normalise(self, dsi):
        # Copy the input dataset and apply the configured normalisation algorithm to its data
        dso = DataSet(size=dsi.shape)
        dso.import_data(dsi)

        dso.data = self.algorithms[self.config.get('algorithm')](dso.data)

        return dso
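Here self.algorithms maps the configured name to a callable applied to the whole data matrix. A plausible sketch of such a table, with hypothetical entry names (the module's real algorithm names are not shown in this excerpt), assuming import numpy as np:

    import numpy as np

    algorithms = {
        # total-sum scaling: divide each spectrum (row) by its total intensity
        'sum': lambda data: data / np.sum(data, axis=1, keepdims=True),
        # scale each spectrum by its maximum intensity
        'max': lambda data: data / np.max(data, axis=1, keepdims=True),
    }

    normalised = algorithms['sum'](np.random.rand(5, 100))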
Example #3
    def generate(self, input=None):  #, config, algorithms):
        # Generate bin values for range start_scale to end_scale
        # Calculate the number of bins at binsize across range
        dso = DataSet(size=input.shape)
        dso.import_data(input)

        #ng.analysis.peakpick.pick(data, thres, msep=None, direction='both', algorithm='thres', est_params=True, lineshapes=None)

        threshold = self.config.get('peak_threshold')
        algorithm = self.algorithms[self.config.get('algorithm')]
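        # nmrglue expects one msep value per data axis, hence the single-element tuple below for 1D spectra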
        msep = (self.config.get('peak_separation'), )

        # Take input dataset and flatten in first dimension (average spectra)
        data_avg = np.mean(input.data, axis=0)

        # pick peaks and return locations;
        #nmrglue.analysis.peakpick.pick(data, pthres, nthres=None, msep=None, algorithm='connected', est_params=True, lineshapes=None, edge=None, diag=False, c_struc=None, c_ndil=0, cluster=True, table=True, axis_names=['A', 'Z', 'Y', 'X'])[source]¶
        locations, scales, amps = ng.analysis.peakpick.pick(
            data_avg,
            threshold,
            msep=msep,
            algorithm=algorithm,
            est_params=True,
            cluster=False,
            table=False)

        #n_cluster = max( cluster_ids )
        n_locations = len(locations)

        new_shape = list(input.shape)
        new_shape[1] = n_locations  # shrink the variable axis to the number of picked peaks

        # Look up the scale value at each picked peak location
        scales = [dso.scales[1][l[0]] for l in locations]

        # Adjust the scales (so aren't lost in crop)
        dso.labels[1] = [str(l) for l in scales]
        dso.scales[1] = scales

        dso.crop(new_shape)

        # Copy the data column at each picked peak location into the new dataset
        for n, l in enumerate(locations):
            dso.data[:, n] = input.data[:, l[0]]

        # FIXME:
        # Extract the location numbers (positions in original spectra)
        # Get max value in each row for those regions
        # Append that to n position in new dataset

        # -- optionally use the line widths and take max within each of these for each spectra (peak shiftiness)
        # Filter the original data with those locations and output

        return {'output': dso}
Example #4
    def load_metabolights(self, filename, id_col=0, name_col=4, data_col=18): # Load from csv with experiments in COLUMNS, metabolites in ROWS
        print("Loading Metabolights...")
        
        #sample	1	2	3	4
        #class	ADG10003u_007	ADG10003u_008	ADG10003u_009	ADG10003u_010   ADG19007u_192
        #2-oxoisovalerate	0.3841	0.44603	0.45971	0.40812
        reader = csv.reader( open( filename, 'rU'), delimiter=',', dialect='excel')
    
        # Sample identities from top row ( sample labels )
        hrow = next(reader)
        sample_ids = hrow[1:]    

        # Sample classes from second row; crop off after u_
        classes = [c for c in hrow if 'u_' in c]

        data_starts_at = hrow.index(classes[0])
        metabolite_names_at = hrow.index('metabolite_identification')

        classes = [ c.split('u_')[0] for c in classes]
        

        metabolites = []
        metabolite_data = []

        # Read in metabolite data n.b. can have >1 entry / metabolite so need to allow for this
        for row in reader:
            if row[0] != '': # Skip empty rows
                metabolites.append( row[metabolite_names_at] )
                metabolite_data.append( row[data_starts_at:] )
            
        ydim = len( classes )
        xdim = len( metabolites )
        
        dso = DataSet( size=(ydim, xdim) )

        dso.labels[0] = sample_ids
        dso.classes[0] = classes 

        dso.labels[1] = metabolites

        for n,md in enumerate(metabolite_data):
            print(md)
            dso.data[:,n] = np.array(md)
            
        return dso
Example #5
    def generate(self, input=None):
        data = input.data
        
        pca = PCA(n_components=self.config.get('number_of_components'))
        pca.fit(data.T) # Transpose it, as vars need to along the top
        
        weights = pca.transform(data.T) # Get weights?
        
        # Label up the top 50 (the values are retained; just for clarity)
        wmx = np.amax( np.absolute( weights), axis=1 )

        dso_z = list(zip( input.scales[1], input.entities[1], input.labels[1] ))
        dso_z = sorted( zip( dso_z, wmx ), key=lambda x: x[1])[-50:] # Top 50
        
        dso_z = [x for x, wmx in dso_z ]  
        
        # Build scores into a dso no_of_samples x no_of_principal_components
        scored = DataSet(size=(len(pca.components_[0]),len(pca.components_)))  
        scored.labels[0] = input.labels[0]
        scored.classes[0] = input.classes[0]
        
        for n,s in enumerate(pca.components_):
            scored.data[:,n] = s
            scored.labels[1][n] = 'Principal Component %d (%0.2f%%)' % (n+1, pca.explained_variance_ratio_[n] * 100.)
        
        dso_pc = {}
        for n in range(0, weights.shape[1] ):
            pcd =  DataSet( size=(1, input.shape[1] ) )
            pcd.entities[1] = input.entities[1]
            pcd.labels[1] = input.labels[1]
            pcd.scales[1] = input.scales[1]
            pcd.data = weights[:,n:n+1].T
            dso_pc['pc%s' % (n+1)] = pcd
        
        return dict( list({
            'dso': input,
            'pca': pca,
            'scores': scored,
            #'weights': weights,
            'wmx': wmx,
            'dso_z': dso_z,        
        }.items()) + list(dso_pc.items()) )
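A standalone sketch of the scikit-learn calls used above, assuming from sklearn.decomposition import PCA; because the fit is on the transposed matrix, the variables act as observations and the shapes follow from that:

    import numpy as np
    from sklearn.decomposition import PCA

    data = np.random.rand(10, 40)         # 10 samples x 40 variables
    pca = PCA(n_components=2)
    pca.fit(data.T)                       # variables treated as observations
    weights = pca.transform(data.T)       # (40, 2): per-variable coordinates
    print(pca.components_.shape)          # (2, 10)
    print(pca.explained_variance_ratio_)  # variance fraction per component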
Example #6
    def load_soft_series_family(self, filename):  # Load from soft data file for genes
        # SOFT files are a /sort of/ bastardized csv with data in tab-separated columns
        # So, we use the csv reader to get that, accounting for most stuff being single field with
        # slightly strange identifiers

        reader = csv.reader(open(filename, 'rU'), delimiter='\t', dialect='excel')
        soft_data = self.preprocess_soft(reader)

        database = {}
        platform = {}
        samples = {}
        sample_data = {}

        for section, rows in list(soft_data.items()):

            if section.startswith('^DATABASE'):
                database = self.get_soft_metadata(rows)

            elif section.startswith('^PLATFORM'):
                platform = self.get_soft_metadata(rows)
                platform_data = self.get_soft_data(rows, '!platform_table_begin', '!platform_table_end')

            elif section.startswith('^SAMPLE'):
                key, sample_id = section.split(' = ')
                samples[sample_id] = self.get_soft_metadata(rows)
                sample_data[sample_id] = self.get_soft_data(rows, '!sample_table_begin', '!sample_table_end')
        # We now have the entire dataseries loaded; but in a bit of a messed up format
        # Build a dataset object to fit and map the data in

        xdim = len(platform_data)  # The platform table carries the gene list
        ydim = len(sample_data)

        # Build dataset object
        dso = DataSet(size=(xdim, ydim))  # self.add_data('imported_data', DataSetself) )
        dso.empty(size=(ydim, xdim))

        sample_ids = sorted(samples.keys())  # Get the samples sorted so we keep everything lined up
        gene_ids = sorted(platform_data.keys())  # Get the keys sorted so we keep everything lined up

        dso.labels[0] = sample_ids
        dso.labels[1] = [platform_data[gene_id]['UNIGENE'] for gene_id in gene_ids]
        dso.entities[1] = [self.m.db.get_via_unification('UNIGENE', gene_id) for gene_id in dso.labels[1]]

        for xn, gene_id in enumerate(gene_ids):
            for yn, sample_id in enumerate(sample_ids):

                dso.data[yn, xn] = sample_data[sample_id][gene_id]['VALUE']

        return dso
Example #7
    def generate(self, input=None): #, config, algorithms):
        # Generate bin values for range start_scale to end_scale
        # Calculate the number of bins at binsize across range
        dso = DataSet( size=input.shape )
        dso.import_data(input)
        
        #ng.analysis.peakpick.pick(data, thres, msep=None, direction='both', algorithm='thres', est_params=True, lineshapes=None)
        
        threshold =  self.config.get('peak_threshold')
        algorithm = self.algorithms[ self.config.get('algorithm')]
        msep = ( self.config.get('peak_separation'),)
        
        # Take input dataset and flatten in first dimension (average spectra)
        data_avg = np.mean( input.data, axis=0)

        # pick peaks and return locations; 
        #nmrglue.analysis.peakpick.pick(data, pthres, nthres=None, msep=None, algorithm='connected', est_params=True, lineshapes=None, edge=None, diag=False, c_struc=None, c_ndil=0, cluster=True, table=True, axis_names=['A', 'Z', 'Y', 'X'])[source]¶
        locations, scales, amps = ng.analysis.peakpick.pick(data_avg, threshold, msep=msep, algorithm=algorithm, est_params = True, cluster=False, table=False)

        #n_cluster = max( cluster_ids )
        n_locations = len( locations )
        
        new_shape = list( input.shape )
        new_shape[1] = n_locations  # shrink the variable axis to the number of picked peaks
        
        # Look up the scale value at each picked peak location
        scales = [dso.scales[1][l[0]] for l in locations]

        # Adjust the scales (so aren't lost in crop)
        dso.labels[1] = [ str(l) for l in scales]
        dso.scales[1] = scales
        
        dso.crop( new_shape )

        # Copy the data column at each picked peak location into the new dataset
        for n, l in enumerate(locations):
            dso.data[:, n] = input.data[:, l[0]]
            
        # FIXME:
        # Extract the location numbers (positions in original spectra)
        # Get max value in each row for those regions
        # Append that to n position in new dataset
        
        # -- optionally use the line widths and take max within each of these for each spectra (peak shiftiness)
        # Filter the original data with those locations and output

        return {'output':dso}
Example #8
    def load_datafile(self, file):

        reader = csv.reader(open(file, 'rU'), delimiter='\t', dialect='excel')
        hrow = next(reader)  # Get top row

        slabels = []
        labels = []
        entities = []
        data = []

        if hrow[0] == 'Profiled Data Type':  # Is a Chenomx output file; use the other columns to map data scale/etc. once implemented
            next(reader)  # Skip date row
            hrow = next(reader)
            labels = hrow[2:]  # We strip off the pH here; might be nice to keep it
            entities = [self.m.db.synrev[l] if l in self.m.db.synrev else None for l in labels]  # Map to entities if they exist

            next(reader)  # Skip compound ID
            next(reader)  # Skip InChI
            next(reader)  # Skip SMILES

            for hrow in reader:  # Now read the data rows
                slabels.append(hrow[0])
                td = []
                for x in hrow[2:]:
                    try:
                        td.append(float(x))
                    except ValueError:  # blank or non-numeric cell
                        td.append(0)
                data.append(td)

        data = np.array(data)
        dso = DataSet(size=data.shape)
        print(data.shape)
        dso.labels[1] = labels
        dso.entities[1] = entities
        dso.labels[0] = slabels
        dso.data = data

        return {'output': dso}
Example #9
    def generate(self, input=None):
        dsi = input
        ###### BINNING USING CONFIG
        # Generate bin values for range start_scale to end_scale
        # Calculate the number of bins at binsize across range
        dso = DataSet()
        dso.import_data(dsi)

        r = dsi.scales_r[1]
        self._bin_size, self._bin_offset = self.config.get('bin_size'), self.config.get('bin_offset')

        bins = np.arange(r[0] + self._bin_offset, r[1] + self._bin_offset, self._bin_size)
        number_of_bins = len(bins) - 1

        # Can't increase the size of data; if bins >= current size return the original
        if number_of_bins >= len(dso.scales[1]):
            return {'output': dso, 'input': input}

        # Resize (lossy) to the new shape
        old_shape, new_shape = list(dsi.data.shape), list(dso.data.shape)
        new_shape[1] = number_of_bins
        dso.crop(new_shape)  # Lossy crop, but we'll be within the boundary below


        for n, d in enumerate(dsi.data):
            binned_data = np.histogram(dsi.scales[1], bins=bins, weights=d)
            binned_num = np.histogram(dsi.scales[1], bins=bins)  # Number of data points that ended up contributing to each bin
            dso.data[n, :] = binned_data[0] / binned_num[0]  # Mean

        dso.scales[1] = [float(x) for x in binned_data[1][:-1]]
        dso.labels[1] = [str(x) for x in binned_data[1][:-1]]

        # Remove any NaNs that have crept in (due to the histogram)
        dso.remove_invalid_data()

        return {'output': dso, 'input': input}  # Pass back input for difference plot
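The per-bin mean comes from calling np.histogram twice: once weighted by the spectrum to get per-bin sums, and once unweighted to get per-bin counts. A toy standalone version, assuming import numpy as np:

    import numpy as np

    scales = np.linspace(0.0, 10.0, 101)  # ppm axis
    spectrum = np.sin(scales) + 1.0       # toy intensities
    bins = np.arange(0.0, 10.5, 0.5)      # 0.5-ppm bin edges

    sums, edges = np.histogram(scales, bins=bins, weights=spectrum)
    counts, _ = np.histogram(scales, bins=bins)
    means = sums / counts                 # mean intensity per bin

Bins that catch no data points divide by zero and produce NaN, which is why remove_invalid_data() runs before the dataset is returned.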
Example #10
    def generate(self, input=None):

        pathways = [k for k, v in db.dbm.get_pathways()]
        pathway_compounds = dict()

        for k, p in db.dbm.get_pathways():
            pathway_compounds[p.id] = set([m for m in p.compounds])

        data_m, labels_m = self.build_matrix(pathways, pathway_compounds)

        pathway_reactions = dict()

        for k, p in list(db.dbm.pathways.items()):
            pathway_reactions[p.id] = set([m for m in p.reactions])

        data_r, labels_r = self.build_matrix(pathways, pathway_reactions)

        pathway_active_reactions = dict()
        pathway_active_compounds = dict()
        active_pathways = input.entities[1]
        active_pathways_id = []

        for p in active_pathways:
            pathway_active_reactions[p.id] = set([r for r in p.reactions])
            pathway_active_compounds[p.id] = set([r for r in p.compounds])
            active_pathways_id.append(p.id)

        data_ar, labels_ar = self.build_matrix(active_pathways_id,
                                               pathway_active_reactions)
        data_am, labels_am = self.build_matrix(active_pathways_id,
                                               pathway_active_compounds)

        dim = len(data_ar)

        dso_r = DataSet(size=(dim, dim))
        dso_r.data = data_ar
        dso_r.labels[1] = labels_ar

        dso_m = DataSet(size=(dim, dim))
        dso_m.data = data_am
        dso_m.labels[1] = labels_am

        return {'dso_r': dso_r, 'dso_m': dso_m}
Example #11
    def generate(self, input=None):

        pathways = list(self.m.db.pathways.keys())
        pathway_compounds = dict()

        for k, p in list(self.m.db.pathways.items()):
            pathway_compounds[p.id] = set([m for m in p.compounds])

        data_m, labels_m = self.build_matrix(pathways, pathway_compounds)

        pathway_reactions = dict()

        for k, p in list(self.m.db.pathways.items()):
            pathway_reactions[p.id] = set([m for m in p.reactions])

        data_r, labels_r = self.build_matrix(pathways, pathway_reactions)

        pathway_active_reactions = dict()
        pathway_active_compounds = dict()
        active_pathways = input.entities[1]  # [self.parent.db.pathways[p] for p in self.parent.config.value('/Pathways/Show').split(',')]
        active_pathways_id = []

        for p in active_pathways:
            pathway_active_reactions[p.id] = set([r for r in p.reactions])
            pathway_active_compounds[p.id] = set([r for r in p.compounds])
            active_pathways_id.append(p.id)

        data_ar, labels_ar = self.build_matrix(active_pathways_id, pathway_active_reactions)
        data_am, labels_am = self.build_matrix(active_pathways_id, pathway_active_compounds)

        dim = len(data_ar)

        dso_r = DataSet(size=(dim, dim))
        dso_r.data = data_ar
        dso_r.labels[1] = labels_ar

        dso_m = DataSet(size=(dim, dim))
        dso_m.data = data_am
        dso_m.labels[1] = labels_am

        return {'dso_r': dso_r, 'dso_m': dso_m}
Example #12
    def load_bml_datafile(self, data_path, target, name):

        # Read in data for the graphing metabolite, with associated value (generate mean)
        reader = csv.reader(utils.nonull(open(data_path, 'rb')),
                            delimiter='\t',
                            dialect='excel')

        for row in reader:
            if row and row[0] == 'metabolite':  # Look for the top row
                break
        else:
            return

        samples = row[1:-2]  # Sample identities
        samples = [sample[8:-1] for sample in samples]

        xdim = 0
        ydim = len(samples)

        raw_data = []
        metabolites = []

        for row in reader:
            xdim += 1
            metabolites.append(row[0])

            raw_data.append([float(i) for i in row[1:-2]])

        dso = DataSet(size=(ydim, xdim))
        dso.labels[1] = metabolites

        dso.data = np.array(raw_data).T

        dso.name = name
        dso.description = 'Imported from FIMA (%s)' % name

        return dso
Example #13
    def process_data_to_dso(self, nmr_data, nmr_ppms, sample_labels,
                            experiment_name):

        print("Processing spectra to dso...")
        sample_n = len(sample_labels)
        ppm_n = len(nmr_ppms)

        dso = DataSet(size=(sample_n, ppm_n))

        for n, nd in enumerate(nmr_data):
            print("Spectra %s" % sample_labels[n])
            dso.data[n, :] = nd
            dso.labels[0][n] = sample_labels[n]

        dso.labels[1] = [str(ppm) for ppm in nmr_ppms]
        dso.scales[1] = [float(ppm) for ppm in nmr_ppms]
        dso.name = experiment_name

        return dso
Example #14
    def load_metabolights(
        self,
        filename,
        id_col=0,
        name_col=4,
        data_col=18
    ):  # Load from csv with experiments in COLUMNS, metabolites in ROWS
        print("Loading Metabolights...")

        #sample	1	2	3	4
        #class	ADG10003u_007	ADG10003u_008	ADG10003u_009	ADG10003u_010   ADG19007u_192
        #2-oxoisovalerate	0.3841	0.44603	0.45971	0.40812
        reader = csv.reader(open(filename, 'rU'),
                            delimiter=',',
                            dialect='excel')

        # Sample identities from top row ( sample labels )
        hrow = next(reader)
        sample_ids = hrow[1:]

        # Sample classes from second row; crop off after u_
        hrow = next(reader)
        classes = hrow[1:]
        classes = [c.split('u_')[0] for c in classes]

        metabolites = []
        metabolite_data = []
        # Read in metabolite data n.b. can have >1 entry / metabolite so need to allow for this
        for row in reader:
            if row[0] != '':  # Skip empty rows
                metabolites.append(row[0])
                metabolite_data.append(row[1:])

        ydim = len(classes)
        xdim = len(metabolites)

        dso = DataSet(size=(ydim, xdim))

        dso.labels[0] = sample_ids
        dso.classes[0] = classes

        dso.labels[1] = metabolites

        for n, md in enumerate(metabolite_data):
            dso.data[:, n] = np.array(md)

        return dso
Example #15
    def process_data_to_dso(self, nmr_data, nmr_ppms, sample_labels, experiment_name):

        print("Processing spectra to dso...")
        sample_n = len(sample_labels)
        ppm_n = len(nmr_ppms)

        dso = DataSet(size=(sample_n, ppm_n))

        for n, nd in enumerate(nmr_data):
            print("Spectra %s" % sample_labels[n])
            dso.data[n, :] = nd
            dso.labels[0][n] = sample_labels[n]

        dso.labels[1] = [str(ppm) for ppm in nmr_ppms]
        dso.scales[1] = [float(ppm) for ppm in nmr_ppms]
        dso.name = experiment_name

        return dso
Example #16
    def load_datafile(self, file):

        reader = csv.reader(open(file, 'rU'), delimiter='\t', dialect='excel')
        hrow = next(reader)  # Get top row

        slabels = []
        labels = []
        entities = []
        data = []

        if hrow[0] == 'Profiled Data Type':  # Is a Chenomx output file; use the other columns to map data scale/etc. once implemented
            next(reader)  # Skip date row
            hrow = next(reader)
            labels = hrow[2:]  # We strip off the pH here; might be nice to keep it
            entities = [
                self.m.db.synrev[l] if l in self.m.db.synrev else None
                for l in labels
            ]  # Map to entities if they exist

            next(reader)  # Skip compound ID
            next(reader)  # Skip InChI
            next(reader)  # Skip SMILES

            for hrow in reader:  # Now read the data rows
                slabels.append(hrow[0])
                td = []
                for x in hrow[2:]:
                    try:
                        td.append(float(x))
                    except ValueError:  # blank or non-numeric cell
                        td.append(0)
                data.append(td)

        data = np.array(data)
        dso = DataSet(size=data.shape)
        print(data.shape)
        dso.labels[1] = labels
        dso.entities[1] = entities
        dso.labels[0] = slabels
        dso.data = data

        return {'output': dso}
Example #17
    def generate(self, input=None):

        pathways = [k for k, v in db.dbm.get_pathways()]
        pathway_compounds = dict()

        for k, p in db.dbm.get_pathways():
            pathway_compounds[p.id] = set([m for m in p.compounds])

        data_m, labels_m = self.build_matrix(pathways, pathway_compounds)

        pathway_reactions = dict()

        for k, p in list(db.dbm.pathways.items()):
            pathway_reactions[p.id] = set([m for m in p.reactions])

        data_r, labels_r = self.build_matrix(pathways, pathway_reactions)

        pathway_active_reactions = dict()
        pathway_active_compounds = dict()
        active_pathways = input.entities[1]
        active_pathways_id = []

        for p in active_pathways:
            pathway_active_reactions[p.id] = set([r for r in p.reactions])
            pathway_active_compounds[p.id] = set([r for r in p.compounds])
            active_pathways_id.append(p.id)

        data_ar, labels_ar = self.build_matrix(active_pathways_id, pathway_active_reactions)
        data_am, labels_am = self.build_matrix(active_pathways_id, pathway_active_compounds)

        dim = len(data_ar)

        dso_r = DataSet(size=(dim, dim))
        dso_r.data = data_ar
        dso_r.labels[1] = labels_ar

        dso_m = DataSet(size=(dim, dim))
        dso_m.data = data_am
        dso_m.labels[1] = labels_am

        return {'dso_r': dso_r, 'dso_m': dso_m}
Example #18
    def load_bml_datafile(self, data_path, target, name):

        # Read in data for the graphing metabolite, with associated value (generate mean)
        reader = csv.reader(utils.nonull(open(data_path, 'rb')), delimiter='\t', dialect='excel')

        for row in reader:
            if row and row[0] == 'metabolite':  # Look for the top row
                break
        else:
            return

        samples = row[1:-2]  # Sample identities
        samples = [sample[8:-1] for sample in samples]

        xdim = 0
        ydim = len(samples)

        raw_data = []
        metabolites = []

        for row in reader:
            xdim += 1
            metabolites.append(row[0])

            raw_data.append([float(i) for i in row[1:-2]])

        dso = DataSet(size=(ydim, xdim))
        dso.labels[1] = metabolites

        dso.data = np.array(raw_data).T

        dso.name = name
        dso.description = 'Imported from FIMA (%s)' % name

        return dso
Example #19
    def load_datafile(self, filename):
        # Determine if we've got a csv or peakml file (extension)
        #self.data.o['output'].empty()
        dso = DataSet()

        # Read data in from peakml format file
        xml = et.parse(filename)

        # Get sample ids, names and class groupings
        sets = xml.iterfind('header/sets/set')
        midclass = {}
        classes = set()
        measurements = []
        masses = {}

        for aset in sets:
            id = aset.find('id').text
            mids = aset.find('measurementids').text
            for mid in self.decode(mids):
                midclass[mid] = id
                measurements.append(mid)

            classes.add(id)

        # We have all the sample data now, parse the intensity and identity info
        peaksets = xml.iterfind('peaks/peak')
        quantities = defaultdict(dict)
        all_identities = []

        for peakset in peaksets:

            # Find metabolite identities
            annotations = peakset.iterfind('annotations/annotation')
            identities = False
            for annotation in annotations:
                if annotation.find('label').text == 'identification':
                    identities = annotation.find('value').text.split(', ')
                    all_identities.extend(identities)
                    break

            if identities:
                # PeakML supports multiple alternative metabolite identities; we don't yet, so duplicate the entry under each
                # We have identities, now get intensities for the different samples
                chromatograms = peakset.iterfind(
                    'peaks/peak')  # Next level down

                for chromatogram in chromatograms:
                    mid = chromatogram.find('measurementid').text
                    intensity = float(chromatogram.find('intensity').text)
                    mass = float(chromatogram.find('mass').text)

                    # Record the intensity under each identity (buffered until we have the entire list)
                    for identity in identities:
                        quantities[mid][identity] = intensity

                    # Record the mass for each identity (buffered until we have the entire list)
                    for identity in identities:
                        masses[identity] = mass

        # Sort the identities/masses into consecutive order

        # Quantities table built; class table built; now rearrange into dso
        dso.empty((len(measurements), len(all_identities)))
        dso.labels[0] = measurements
        dso.classes[0] = [midclass[mid] for mid in measurements]

        dso.labels[1] = all_identities
        db_hmdbids = self.m.db.unification['HMDB']
        dso.entities[1] = [
            db_hmdbids[hmdbid] if hmdbid in db_hmdbids else None
            for hmdbid in all_identities
        ]
        dso.scales[1] = [float(masses[i]) for i in all_identities]

        for mid, identities in list(quantities.items()):
            for identity, intensity in list(identities.items()):
                r = measurements.index(mid)
                c = all_identities.index(identity)

                dso.data[r, c] = intensity

        dso.name = os.path.basename(filename)
        dso.description = 'Imported PeakML file'
        self.set_name(dso.name)

        return {'output': dso}
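The XML walking above is plain ElementTree path iteration; self.decode, which unpacks the measurementids field, is application code not shown here. A minimal self-contained version of the pattern with a toy document:

    import xml.etree.ElementTree as et
    from io import StringIO

    doc = '''<peakml><header><sets>
      <set><id>A</id><measurementids>m1</measurementids></set>
    </sets></header></peakml>'''

    xml = et.parse(StringIO(doc))
    for aset in xml.iterfind('header/sets/set'):
        print(aset.find('id').text)  # -> A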
Example #20
    def load_datafile(self, filename):
        # Determine if we've got a csv or peakml file (extension)
        #self.data.o['output'].empty()
        dso = DataSet()

        # Read data in from peakml format file
        xml = et.parse(filename)

        # Get sample ids, names and class groupings
        sets = xml.iterfind('header/sets/set')
        midclass = {}
        classes = set()
        measurements = []
        masses = {}

        for aset in sets:
            id = aset.find('id').text
            mids = aset.find('measurementids').text
            for mid in self.decode(mids):
                midclass[mid] = id
                measurements.append(mid)

            classes.add(id)

        # We have all the sample data now, parse the intensity and identity info
        peaksets = xml.iterfind('peaks/peak')
        quantities = defaultdict(dict)
        all_identities = []

        for peakset in peaksets:

            # Find metabolite identities
            annotations = peakset.iterfind('annotations/annotation')
            identities = False
            for annotation in annotations:
                if annotation.find('label').text == 'identification':
                    identities = annotation.find('value').text.split(', ')
                    all_identities.extend(identities)
                    break

            if identities:
                # PeakML supports multiple alternative metabolite identities; we don't yet, so duplicate the entry under each
                # We have identities, now get intensities for the different samples
                chromatograms = peakset.iterfind('peaks/peak')  # Next level down

                for chromatogram in chromatograms:
                    mid = chromatogram.find('measurementid').text
                    intensity = float(chromatogram.find('intensity').text)
                    mass = float(chromatogram.find('mass').text)

                    # Record the intensity under each identity (buffered until we have the entire list)
                    for identity in identities:
                        quantities[mid][identity] = intensity

                    # Record the mass for each identity (buffered until we have the entire list)
                    for identity in identities:
                        masses[identity] = mass

        # Sort the identities/masses into consecutive order


        # Quantities table built; class table built; now rearrange into dso
        dso.empty((len(measurements), len(all_identities)))
        dso.labels[0] = measurements
        dso.classes[0] = [midclass[mid] for mid in measurements]

        dso.labels[1] = all_identities
        db_hmdbids = self.m.db.unification['HMDB']
        dso.entities[1] = [db_hmdbids[hmdbid] if hmdbid in db_hmdbids else None for hmdbid in all_identities]
        dso.scales[1] = [float(masses[i]) for i in all_identities]

        for mid, identities in list(quantities.items()):
            for identity, intensity in list(identities.items()):
                r = measurements.index(mid)
                c = all_identities.index(identity)

                dso.data[r, c] = intensity

        dso.name = os.path.basename(filename)
        dso.description = 'Imported PeakML file'
        self.change_name.emit(dso.name)

        return {'output': dso}
Example #21
    def load_csv_R(self, filename):  # Load from csv with experiments in ROWS, metabolites in COLUMNS
        # Read in data for the graphing metabolite, with associated value (generate mean)
        f = open(filename, 'rU')
        fsize = os.path.getsize(filename)
        reader = csv.reader(f, delimiter=str(','), dialect='excel')
        print('R')
        hrow = next(reader)  # Get top row
        metabolites = hrow[2:]
        ydim = 0
        xdim = len(metabolites)

        samples = []
        classes = []
        raw_data = []

        # Build quants table for metabolite classes
        #for metabolite in self.metabolites:
        #    quantities[ metabolite ] = defaultdict(list)

        for n, row in enumerate(reader):
            if row[1] != '.':  # Skip excluded classes # row[1] = Class
                ydim += 1  # Count only the rows we keep
                samples.append(row[0])
                classes.append(row[1])
                data_row = []
                for c in row[2:]:  # in self.metabolites:
                    try:
                        c = float(c)
                    except ValueError:  # blank or non-numeric cell
                        c = 0
                    data_row.append(c)

                raw_data.append(data_row)
                    #metabolite_column = hrow.index( metabolite )
                    #if row[ metabolite_column ]:
                    #    data_row.append(
                    #    quantities[metabolite][ row[1] ].append( float(row[ metabolite_column ]) )
                        #self.statistics['ymin'] = min( self.statistics['ymin'], float(row[ metabolite_column ]) )
                        #self.statistics['ymax'] = max( self.statistics['ymax'], float(row[ metabolite_column ]) )
                    #else:
                    #    quantities[metabolite][ row[1] ].append( 0 )
            else:
                pass

            if n % 100 == 0:
                self.progress.emit(float(f.tell()) / fsize)

                #self.statistics['excluded'] += 1

        # Build dataset object
        dso = DataSet(size=(xdim, ydim))  # self.add_data('imported_data', DataSetself) )
        dso.empty(size=(ydim, xdim))
        #dso.labels[1] = metabolites

        scales = []
        mlabels = []
        for m in metabolites:
            try:
                scales.append(float(m))
                mlabels.append(None)
            except:
                scales.append(None)
                mlabels.append(m)

        dso.scales[0] = [None] * len(samples)
        dso.labels[0] = samples
        dso.classes[0] = classes
        dso.entities[0] = [None] * len(samples)

        dso.scales[1] = scales
        dso.labels[1] = mlabels
        dso.entities[1] = [None] * len(scales)
        dso.classes[1] = [None] * len(scales)

        dso.data = np.array(raw_data)

        return dso
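The progress figure is simply the file handle's byte offset divided by the total file size. A stripped-down version of the pattern, with a hypothetical callback standing in for the Qt progress signal (csv.reader buffers its reads, so f.tell() advances in jumps, but it still serves as an approximation):

    import csv
    import os

    def load_with_progress(filename, progress=print):
        f = open(filename)
        fsize = os.path.getsize(filename)
        for n, row in enumerate(csv.reader(f)):
            if n % 100 == 0:
                progress(f.tell() / fsize)  # approximate fraction consumed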
Example #22
    def generate(self, input=None):
        data = input.data

        pca = PCA(n_components=self.config.get('number_of_components'))
        pca.fit(data.T)  # Transpose it, as vars need to along the top

        weights = pca.transform(data.T)  # Get weights?

        # Label up the top 50 (the values are retained; just for clarity)
        wmx = np.amax(np.absolute(weights), axis=1)

        dso_z = list(zip(input.scales[1], input.entities[1], input.labels[1]))
        dso_z = sorted(zip(dso_z, wmx), key=lambda x: x[1])[-50:]  # Top 50

        dso_z = [x for x, wmx in dso_z]

        # Build scores into a dso no_of_samples x no_of_principal_components
        scored = DataSet(size=(len(pca.components_[0]), len(pca.components_)))
        scored.labels[0] = input.labels[0]
        scored.classes[0] = input.classes[0]

        for n, s in enumerate(pca.components_):
            scored.data[:, n] = s
            scored.labels[1][n] = 'Principal Component %d (%0.2f%%)' % (
                n + 1, pca.explained_variance_ratio_[n] * 100.)

        dso_pc = {}
        for n in range(0, weights.shape[1]):
            pcd = DataSet(size=(1, input.shape[1]))
            pcd.entities[1] = input.entities[1]
            pcd.labels[1] = input.labels[1]
            pcd.scales[1] = input.scales[1]
            pcd.data = weights[:, n:n + 1].T
            dso_pc['pc%s' % (n + 1)] = pcd

        return dict(
            list({
                'dso': input,
                'pca': pca,
                'scores': scored,
                #'weights': weights,
                'wmx': wmx,
                'dso_z': dso_z,
            }.items()) + list(dso_pc.items()))
Example #23
    def load_soft_dataset(self, filename):  # Load from soft data file for genes
        # SOFT files are a /sort of/ bastardized csv with data in tab-separated columns
        # So, we use the csv reader to get that, accounting for most stuff being single field with
        # slightly strange identifiers
        f = open(filename, 'rU')
        fsize = os.path.getsize(filename)
        reader = csv.reader(f, delimiter='\t', dialect='excel')

        soft_data = self.preprocess_soft(reader, f=f, fsize=fsize)
        # soft_data now contains lists of sections with ^ markers

        database = {}
        dataset = {}
        dataset_data = {}
        subsets = {}

        for section, rows in list(soft_data.items()):

            if section.startswith('^DATABASE'):
                database = self.get_soft_metadata(rows)

            elif section.startswith('^DATASET'):
                dataset.update(self.get_soft_metadata(rows))  # update because seems can be >1 entry to dataset
                data = self.get_soft_data(rows, '!dataset_table_begin', '!dataset_table_end')
                dataset_data = data

            elif section.startswith('^SUBSET'):
                key, subset_id = section.split(' = ')
                subsets[subset_id] = self.get_soft_metadata(rows)
                subsets[subset_id]['subset_sample_id'] = subsets[subset_id]['subset_sample_id'].split(',')  # Turn to list of ids

        # We now have the entire dataset loaded; but in a bit of a messed up format
        # Build a dataset object to fit and map the data in
        sample_ids = []
        for k, subset in list(subsets.items()):
            sample_ids.extend(subset['subset_sample_id'])
        sample_ids = sorted(sample_ids)  # Get the samples sorted so we keep everything lined up

        class_lookup = {}
        for class_id, s in list(subsets.items()):
            for s_id in s['subset_sample_id']:
                class_lookup[s_id] = "%s (%s)" % (s['subset_description'] if 'subset_description' in s else '', class_id)

        xdim = len(dataset_data)  # The dataset table carries the gene list
        ydim = len(sample_ids)

        # Build dataset object
        dso = DataSet(size=(xdim, ydim))  # self.add_data('imported_data', DataSetself) )
        dso.empty(size=(ydim, xdim))

        gene_ids = sorted(dataset_data.keys())  # Get the keys sorted so we keep everything lined up

        dso.labels[0] = sample_ids
        dso.classes[0] = [class_lookup[s_id] for s_id in sample_ids]
        dso.labels[1] = [dataset_data[gene_id]['IDENTIFIER'] for gene_id in gene_ids]
        dso.entities[1] = [self.m.db.get_via_synonym(gene_id) for gene_id in dso.labels[1]]

        for xn, gene_id in enumerate(gene_ids):
            for yn, sample_id in enumerate(sample_ids):

                dso.data[yn, xn] = dataset_data[gene_id][sample_id]

        return dso
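Sorting both key lists is what keeps samples and genes aligned while the matrix is filled. Stripped down, with hypothetical data:

    sample_data = {'GSM2': {'g1': 0.5}, 'GSM1': {'g1': 0.3}}
    sample_ids = sorted(sample_data.keys())  # ['GSM1', 'GSM2'], a stable ordering
    for yn, sample_id in enumerate(sample_ids):
        print(yn, sample_id, sample_data[sample_id]['g1'])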
Example #24
    def load_csv_C(self, filename):  # Load from csv with experiments in COLUMNS, metabolites in ROWS
        # Read in data for the graphing metabolite, with associated value (generate mean)
        f = open(filename, 'rU')
        fsize = os.path.getsize(filename)
        reader = csv.reader(f, delimiter=str(','), dialect='excel')

        hrow = next(reader)  # Discard top row (sample no's)
        samples = hrow[1:]

        hrow = next(reader)  # Get 2nd row
        classesa = hrow[1:]
        classes = [c for c in classesa if c != '.']

        metabolites = []

        data = []

        for n, row in enumerate(reader):
            metabolites.append(row[0])
            for cn, c in enumerate(row[1:]):
                if classesa[cn] != '.':
                    try:
                        data.append(float(c))
                    except ValueError:  # blank or non-numeric cell
                        data.append(0)

            if n % 100 == 0:
                self.progress.emit(float(f.tell()) / fsize)

        data = np.asarray(data)
        data = np.reshape(data, (n + 1, len(classes))).T

        xdim = len(metabolites)
        ydim = len(classes)

        # Build dataset object
        dso = DataSet(size=(xdim, ydim))  # self.add_data('imported_data', DataSetself) )
        dso.empty(size=(ydim, xdim))
        dso.labels[1] = metabolites

        scales = []
        mlabels = []
        for m in metabolites:
            try:
                scales.append(float(m))
                mlabels.append(None)
            except:
                scales.append(None)
                mlabels.append(m)

        dso.scales[0] = [None] * len(samples)
        dso.labels[0] = samples
        dso.classes[0] = classes
        dso.entities[0] = [None] * len(samples)

        dso.scales[1] = scales
        dso.labels[1] = mlabels
        dso.classes[1] = [None] * len(scales)
        dso.entities[1] = [None] * len(scales)

        dso.data = data

        return dso
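The flat list plus reshape works because values are appended row by row (row-major order) and then transposed to samples x metabolites. A toy version, assuming import numpy as np:

    import numpy as np

    flat = [1, 2, 3, 4, 5, 6]  # two metabolite rows x three kept samples, row by row
    data = np.reshape(np.asarray(flat), (2, 3)).T
    print(data.shape)  # (3, 2): samples x metabolites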
Example #25
    def generate(self, input=None):
        data = input.data

        pca = PCA(n_components=self.config.get('number_of_components'))
        pca.fit(data)
        scores = pca.transform(data)

        # Build scores into a dso no_of_samples x no_of_principal_components
        scored = DataSet(size=(scores.shape))
        scored.labels[0] = input.labels[0]
        scored.classes[0] = input.classes[0]
        scored.data = scores

        for n in range(0, scored.shape[1]):
            scored.labels[1][n] = 'Principal Component %d (%0.2f%%)' % (n + 1, pca.explained_variance_ratio_[n] * 100.)

        weightsd = DataSet(size=pca.components_.shape)
        weightsd.data = pca.components_

        weightsd.scales[1] = input.scales[1]

        dso_pc = {}
        for n in range(0, pca.components_.shape[0]):
            pcd = DataSet(size=(1, input.shape[1]))
            pcd.entities[1] = input.entities[1]
            pcd.labels[1] = input.labels[1]
            pcd.scales[1] = input.scales[1]
            pcd.data = weightsd.data[n:n + 1, :]
            dso_pc['pc%s' % (n + 1)] = pcd
            weightsd.labels[0][n] = "PC %s" % (n + 1)
            #weightsd.classes[0][n] = "PC %s" % (n+1)

        return dict(list({
            'dso': input,
            'pca': pca,
            'scores': scored,
            'weights': weightsd,
        }.items()) + list(dso_pc.items()))
Example #26
    def generate(self, input_1=None, input_2=None, input_3=None, input_4=None):
        #dsi = input
        # Iterate all the compounds in the current analysis
        # Assign score to each of the compound's pathways
        # Sum up, crop and return a list of pathway_ids to display
        # Pass this in as the list to view
        # + requested pathways, - excluded pathways

        db = self.m.db

        mining_depth = self.config.get('/Data/MiningDepth')
        mining_type = self.config.get('/Data/MiningType')

        pathway_scores = defaultdict(int)

        for dsi in input_1, input_2, input_3, input_4:
            if dsi is None:
                continue

            print("Mining using '%s'" % mining_type)

            for n, entity in enumerate(dsi.entities[1]):
                if entity is None:
                    continue  # Skip

                score = dsi.data[0, n]
                #score = self.analysis[ m_id ]['score']

                # 1' neighbours; 2' neighbours etc. add score
                # Get a list of methods in connected reactions, add their score % to this compound
                # if m_id in db.compounds.keys():
                #    n_compounds = [r.compounds for r in db.compounds[ m_id ].reactions ]
                #     print n_compounds
                #     n_compounds = [m for ml in n_compounds for m in ml if n_m.id in self.analysis and m.id != m_id ]
                #     for n_m in n_compounds:
                #         score += self.analysis[ n_m.id ]['score'] * 0.5

                # Get the entity's pathways
                pathways = entity.pathways
                if not pathways:
                    continue

                if self.config.get('/Data/MiningShared'):
                    # Share the change score between the associated pathways
                    # this prevents compounds having undue influence
                    score = score / len(pathways)

                for p in pathways:
                    mining_val = {
                        'c': abs(score),
                        'u': max(0, score),
                        'd': abs(min(0, score)),
                        'm': 1.0,
                        't': score,
                    }
                    pathway_scores[p] += mining_val[mining_type]


        # If we're using tendency scaling, abs the accumulated scores (applied once, after all inputs)
        if mining_type == 't':
            for p, v in list(pathway_scores.items()):
                pathway_scores[p] = abs(v)

        # If we're scaling relative to pathway size, divide each score by the pathway's reaction count
        if self.config.get('/Data/MiningRelative'):
            print("Scaling pathway scores to pathway sizes...")
            for p, v in list(pathway_scores.items()):
                pathway_scores[p] = float(v) / len(p.reactions)

        if not pathway_scores:
            # No data to mine
            raise Exception("No pathway scores were generated")

        # Now take the accumulated scores; and create the output
        pathway_scorest = list(pathway_scores.items())  # Switch it to a dict so we can sort
        pathway_scorest = [(p, v) for p, v in pathway_scorest if v > 0]  # Remove any scores of 0
        pathway_scorest.sort(key=lambda tup: tup[1], reverse=True)  # Sort by scores (either system)

        # Get top N defined by mining_depth parameter
        keep_pathways = pathway_scorest[0:mining_depth]
        remaining_pathways = pathway_scorest[mining_depth:mining_depth + 100]

        print("Mining recommended %d out of %d" % (len(keep_pathways), len(pathway_scores)))

        for n, p in enumerate(keep_pathways):
            print("- %d. %s [%.2f]" % (n + 1, p[0].name, p[1]))

        #self.analysis['mining_ranked_remaining_pathways'] = []

        #if remaining_pathways:
        #    print "Note: Next pathways by current scoring method are..."
        #    for n2,p in enumerate(remaining_pathways):
        #        print "- %d. %s [%.2f]" % (n+n2+1, db.pathways[ p[0] ].name, p[1])
        #        self.analysis['mining_ranked_remaining_pathways'].append( p[0] )

        #self.analysis_suggested_pathways = [db.pathways[p[0]] for p in pathway_scorest]
        dso = DataSet(size=(1, len(keep_pathways)))
        dso.entities[1] = [k for k, v in keep_pathways]
        dso.labels[1] = [k.name for k, v in keep_pathways]
        dso.data = np.array([v for k, v in keep_pathways], ndmin=2)

        dso.labels[0][0] = "Pathway mining scores"

        return {'output': dso}
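At its core the miner is a defaultdict(int) accumulator over (pathway, score) observations, ranked at the end. A stripped-down sketch with hypothetical pathway names, using the 'c' (change magnitude) scheme:

    from collections import defaultdict

    pathway_scores = defaultdict(int)
    observations = [('glycolysis', 0.25), ('tca_cycle', -0.2), ('glycolysis', 0.25)]
    for pathway, score in observations:
        pathway_scores[pathway] += abs(score)

    ranked = sorted(pathway_scores.items(), key=lambda tup: tup[1], reverse=True)
    print(ranked[0])  # ('glycolysis', 0.5)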
Example #27
    def generate(self, input=None):   
        dso = input
        
        _experiment_test = self.config.get('experiment_test')
        _experiment_control = self.config.get('experiment_control')
                
        data = dso.data
        
        plsr = PLSRegression(n_components=self.config.get('number_of_components'), scale=self.config.get('autoscale')) #, algorithm=self.config.get('algorithm'))
        Y = np.array([0 if c == _experiment_control else 1 for c in dso.classes[0] ])

        plsr.fit(data, Y)  # Fit the PLS model; Y encodes control (0) vs test (1)
        
        # Build scores into a dso no_of_samples x no_of_principal_components
        scored = DataSet(size=(len(plsr.x_scores_),len(plsr.x_scores_[0])))  
        scored.labels[0] = input.labels[0]
        scored.classes[0] = input.classes[0]

        for n,s in enumerate(plsr.x_scores_.T):
            scored.data[:,n] = s
            scored.labels[1][n] = 'Latent Variable %d' % (n+1) #, plsr.y_weights_[0][n])
                
        # PLS-DA regions; mean +- 95% confidence in each axis for each cluster
        cw_x = defaultdict(list)
        cw_y = defaultdict(list)
        figure_regions = []  # cw_x/cw_y are never populated here, so the region loop below is a no-op
        for c in list(cw_x.keys()):
            # Calculate mean point
            cx = np.mean( cw_x[c] )
            cy = np.mean( cw_y[c] )
            
            # Calculate 95% CI
            rx = np.std( cw_x[c] ) *2 # 2sd = 95% #1.95 * ( / srn) # 1.95 * SEM => 95% confidence
            ry = np.std( cw_y[c] ) *2 #1.95 * ( / srn)

            figure_regions.append( 
                (c, cx, cy, rx, ry)
            )

        
            
        # Label up the top 50 (the values are retained; just for clarity)
        wmx = np.amax( np.absolute( plsr.x_weights_), axis=1 )
        dso_z = list(zip( dso.scales[1], dso.entities[1], dso.labels[1] ))
        dso_z = sorted( zip( dso_z, wmx ), key=lambda x: x[1])[-50:] # Top 50
        dso_z = [x for x, wmx in dso_z ]    

        weightsd = DataSet(size=plsr.x_weights_.T.shape)
        weightsd.data = plsr.x_weights_.T
        weightsd.scales[1] = input.scales[1]

        dso_lv = {}
        for n in range(0, plsr.x_weights_.shape[1] ):
            lvd =  DataSet( size=(1, input.shape[1] ) )
            lvd.entities[1] = input.entities[1]
            lvd.labels[1] = input.labels[1]
            lvd.scales[1] = input.scales[1]
            lvd.data = plsr.x_weights_[:,n:n+1].T
            dso_lv['lv%s' % (n+1)] = lvd
            weightsd.labels[0][n] = "Weights on LV %s" % (n+1)
            weightsd.classes[0][n] = "LV %s" % (n+1)
                    
        return dict(list({
            'dso': dso,
            'scores':scored,
            'weights':weightsd,
            #'figure_data': figure_data,
            #'figure_regions': figure_regions,
            'y_weights': plsr.y_weights_,
            'x_weights': plsr.x_weights_,
        }.items()) + list(dso_lv.items()) )
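A standalone sketch of the scikit-learn calls used above, assuming from sklearn.cross_decomposition import PLSRegression (attribute names such as x_scores_ vary slightly across scikit-learn versions):

    import numpy as np
    from sklearn.cross_decomposition import PLSRegression

    X = np.random.rand(12, 30)       # 12 samples x 30 variables
    Y = np.array([0] * 6 + [1] * 6)  # control vs test encoding
    plsr = PLSRegression(n_components=2, scale=True)
    plsr.fit(X, Y)
    print(plsr.x_scores_.shape)      # (12, 2)
    print(plsr.x_weights_.shape)     # (30, 2)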
Example #28
    def load_csv_C(
        self, filename
    ):  # Load from csv with experiments in COLUMNS, metabolites in ROWS
        # Read in data for the graphing metabolite, with associated value (generate mean)
        f = open(filename, 'rU')
        fsize = os.path.getsize(filename)
        reader = csv.reader(f, delimiter=str(','), dialect='excel')

        hrow = next(reader)  # Discard top row (sample no's)
        samples = hrow[1:]

        hrow = next(reader)  # Get 2nd row
        classesa = hrow[1:]
        classes = [c for c in classesa if c != '.']

        metabolites = []

        data = []

        for n, row in enumerate(reader):
            metabolites.append(row[0])
            for cn, c in enumerate(row[1:]):
                if classesa[cn] != '.':
                    try:
                        data.append(float(c))
                    except ValueError:  # blank or non-numeric cell
                        data.append(0)

            if n % 100 == 0:
                self.progress.emit(float(f.tell()) / fsize)

        data = np.asarray(data)
        data = np.reshape(data, (n + 1, len(classes))).T

        xdim = len(metabolites)
        ydim = len(classes)

        # Build dataset object
        dso = DataSet(
            size=(xdim, ydim))  # self.add_data('imported_data', DataSetself) )
        dso.empty(size=(ydim, xdim))
        dso.labels[1] = metabolites

        scales = []
        mlabels = []
        for m in metabolites:
            try:
                scales.append(float(m))
                mlabels.append(None)
            except:
                scales.append(None)
                mlabels.append(m)

        dso.scales[0] = [None] * len(samples)
        dso.labels[0] = samples
        dso.classes[0] = classes
        dso.entities[0] = [None] * len(samples)

        dso.scales[1] = scales
        dso.labels[1] = mlabels
        dso.classes[1] = [None] * len(scales)
        dso.entities[1] = [None] * len(scales)

        dso.data = data

        return dso
Example #29
    def load_csv_R(
        self, filename
    ):  # Load from csv with experiments in ROWS, metabolites in COLUMNS
        # Read in data for the graphing metabolite, with associated value (generate mean)
        f = open(filename, 'rU')
        fsize = os.path.getsize(filename)
        reader = csv.reader(f, delimiter=str(','), dialect='excel')
        print('R')
        hrow = next(reader)  # Get top row
        metabolites = hrow[2:]
        ydim = 0
        xdim = len(metabolites)

        samples = []
        classes = []
        raw_data = []

        # Build quants table for metabolite classes
        #for metabolite in self.metabolites:
        #    quantities[ metabolite ] = defaultdict(list)

        for n, row in enumerate(reader):
            if row[1] != '.':  # Skip excluded classes # row[1] = Class
                ydim += 1  # Count only the rows we keep
                samples.append(row[0])
                classes.append(row[1])
                data_row = []
                for c in row[2:]:  # in self.metabolites:
                    try:
                        c = float(c)
                    except ValueError:  # blank or non-numeric cell
                        c = 0
                    data_row.append(c)

                raw_data.append(data_row)
                #metabolite_column = hrow.index( metabolite )
                #if row[ metabolite_column ]:
                #    data_row.append(
                #    quantities[metabolite][ row[1] ].append( float(row[ metabolite_column ]) )
                #self.statistics['ymin'] = min( self.statistics['ymin'], float(row[ metabolite_column ]) )
                #self.statistics['ymax'] = max( self.statistics['ymax'], float(row[ metabolite_column ]) )
                #else:
                #    quantities[metabolite][ row[1] ].append( 0 )
            else:
                pass

            if n % 100 == 0:
                self.progress.emit(float(f.tell()) / fsize)

                #self.statistics['excluded'] += 1

        # Build dataset object
        dso = DataSet(
            size=(xdim, ydim))  # self.add_data('imported_data', DataSetself) )
        dso.empty(size=(ydim, xdim))
        #dso.labels[1] = metabolites

        scales = []
        mlabels = []
        for m in metabolites:
            # Numeric column headers become scale values, text headers become labels
            try:
                scales.append(float(m))
                mlabels.append(None)
            except ValueError:
                scales.append(None)
                mlabels.append(m)

        dso.scales[0] = [None] * len(samples)
        dso.labels[0] = samples
        dso.classes[0] = classes
        dso.entities[0] = [None] * len(samples)

        dso.scales[1] = scales
        dso.labels[1] = mlabels
        dso.entities[1] = [None] * len(scales)
        dso.classes[1] = [None] * len(scales)

        dso.data = np.array(raw_data)

        return dso
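For reference, here is a minimal standalone rendering of the row-wise parse above. It keeps the same column layout (sample ID, class, then metabolite values, with '.' marking an excluded row) but omits the DataSet packaging and progress reporting; the function and helper names are invented.

import csv

import numpy as np


def _as_float(c):
    # Non-numeric cells are coerced to 0, matching the loader above
    try:
        return float(c)
    except ValueError:
        return 0.0


def load_rows(filename):
    with open(filename, 'r', newline='') as f:
        reader = csv.reader(f)
        metabolites = next(reader)[2:]  # Skip sample-ID and class columns
        samples, classes, raw = [], [], []
        for row in reader:
            if row[1] == '.':
                continue  # Excluded class
            samples.append(row[0])
            classes.append(row[1])
            raw.append([_as_float(c) for c in row[2:]])
    return metabolites, samples, classes, np.array(raw)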
Example #32
0
    def generate(self, input=None):
        dso = input

        _experiment_test = self.config.get('experiment_test')
        _experiment_control = self.config.get('experiment_control')

        data = dso.data

        plsr = PLSRegression(
            n_components=self.config.get('number_of_components'),
            scale=self.config.get('autoscale'))  #, algorithm=self.config.get('algorithm'))
        Y = np.array(
            [0 if c == _experiment_control else 1 for c in dso.classes[0]])
        #Y = Y.reshape( (len(dso.classes[0]),1) )

        plsr.fit(data, Y)  # Fit with samples in rows, variables in columns

        #figure_data = zip( dso.classes[0], plsr.x_scores_[:,0], plsr.x_scores_[:,1])

        # Build scores into a dso no_of_samples x no_of_principal_components
        scored = DataSet(size=(len(plsr.x_scores_), len(plsr.x_scores_[0])))
        scored.labels[0] = input.labels[0]
        scored.classes[0] = input.classes[0]

        for n, s in enumerate(plsr.x_scores_.T):
            scored.data[:, n] = s
            scored.labels[1][n] = 'Latent Variable %d (%0.2f%%)' % (
                n + 1, plsr.y_weights_[0][n] * 100)  # Per-component weight, not the fixed [0][0]

        # PLS-DA regions: mean +/- 2 SD in each axis for each class
        cw_x = defaultdict(list)
        cw_y = defaultdict(list)
        figure_regions = []  # Defined up front so the loop below cannot raise NameError
        #for c,x,y in figure_data:
        #    cw_x[c].append( x )
        #    cw_y[c].append( y )

        for c in list(cw_x.keys()):
            # Calculate mean point
            cx = np.mean(cw_x[c])
            cy = np.mean(cw_y[c])

            # Half-widths: 2 SD spans ~95% of points for a normal distribution
            rx = np.std(cw_x[c]) * 2
            ry = np.std(cw_y[c]) * 2

            # Calculate 95% CI
            #srn = np.sqrt( len( cw_x[c] ) ) # Sample numbers sqrt
            #rx = 1.95*(np.std( cw_x[c] )/srn ) # 2sd = 95% #1.95 * ( / srn) # 1.95 * SEM => 95% confidence
            #ry = 1.95*(np.std( cw_y[c] )/srn ) #1.95 * ( / srn)

            figure_regions.append((c, cx, cy, rx, ry))

        # Label up the top 50 (the values are retained; just for clarity)
        wmx = np.amax(np.absolute(plsr.x_weights_), axis=1)
        dso_z = list(zip(dso.scales[1], dso.entities[1], dso.labels[1]))
        dso_z = sorted(zip(dso_z, wmx), key=lambda x: x[1])[-50:]  # Top 50
        dso_z = [x for x, _w in dso_z]  # Drop the weights, keep the (scale, entity, label) tuples

        dso_lv = {}
        for n in range(0, plsr.x_weights_.shape[1]):
            lvd = DataSet(size=(1, input.shape[1]))
            lvd.entities[1] = input.entities[1]
            lvd.labels[1] = input.labels[1]
            lvd.scales[1] = input.scales[1]
            lvd.data = plsr.x_weights_[:, n:n + 1].T
            dso_lv['lv%s' % (n + 1)] = lvd

        output = {
            'dso': dso,
            'scores': scored,
            #'figure_data': figure_data,
            #'figure_regions': figure_regions,
            'y_weights': plsr.y_weights_,
            'x_weights': plsr.x_weights_,
        }
        output.update(dso_lv)  # Add one 'lvN' weights DataSet per component
        return output
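As a self-contained illustration of the scores-plus-regions idea in this example, the sketch below fits a two-class PLSRegression on synthetic data and derives a mean +/- 2 SD box per class from the first two score columns. All data and variable names are invented; only the PLSRegression usage mirrors the code above.

from collections import defaultdict

import numpy as np
from sklearn.cross_decomposition import PLSRegression

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, (20, 10)),   # 'control' samples
               rng.normal(1, 1, (20, 10))])  # 'test' samples
classes = ['control'] * 20 + ['test'] * 20
Y = np.array([0 if c == 'control' else 1 for c in classes])

plsr = PLSRegression(n_components=2, scale=True)
plsr.fit(X, Y)

# Collect per-class score coordinates from the first two latent variables
cw_x, cw_y = defaultdict(list), defaultdict(list)
for c, (x, y) in zip(classes, plsr.x_scores_[:, :2]):
    cw_x[c].append(x)
    cw_y[c].append(y)

# One (class, centre_x, centre_y, half_width, half_height) region per class
regions = [(c, np.mean(cw_x[c]), np.mean(cw_y[c]),
            2 * np.std(cw_x[c]), 2 * np.std(cw_y[c])) for c in cw_x]
print(regions)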
Example #33
0
    def generate(self, input_1=None, input_2=None, input_3=None, input_4=None):
        #dsi = input
        # Iterate all the compounds in the current analysis
        # Assign score to each of the compound's pathways
        # Sum up, crop and return a list of pathway_ids to display
        # Pass this in as the list to view
        # + requested pathways, - excluded pathways

        db = self.m.db

        mining_depth = self.config.get('/Data/MiningDepth')
        mining_type = self.config.get('/Data/MiningType')

        pathway_scores = defaultdict(int)

        for dsi in input_1, input_2, input_3, input_4:
            if dsi is None:
                continue

            print("Mining using '%s'" % mining_type)

            for n, entity in enumerate(dsi.entities[1]):
                if entity is None:
                    continue  # Skip unannotated variables

                score = dsi.data[0, n]
                #score = self.analysis[ m_id ]['score']

                # 1' neighbours; 2' neighbours etc. add score
                # Get a list of methods in connected reactions, add their score % to this compound
                # if m_id in db.compounds.keys():
                #    n_compounds = [r.compounds for r in db.compounds[ m_id ].reactions ]
                #     print n_compounds
                #     n_compounds = [m for ml in n_compounds for m in ml if n_m.id in self.analysis and m.id != m_id ]
                #     for n_m in n_compounds:
                #         score += self.analysis[ n_m.id ]['score'] * 0.5

                # Get the entity's pathways
                pathways = entity.pathways
                if not pathways:
                    continue

                if self.config.get('/Data/MiningShared'):
                    # Share the change score between the associated pathways
                    # this prevents compounds having undue influence
                    score = score / len(pathways)

                for p in pathways:
                    mining_val = {
                        'c': abs(score),          # change magnitude, direction-agnostic
                        'u': max(0, score),       # up-regulation only
                        'd': abs(min(0, score)),  # down-regulation only
                        'm': 1.0,                 # simple membership count
                        't': score,               # signed tendency (abs'd below)
                    }
                    pathway_scores[p] += mining_val[mining_type]

            # If we're using tendency scaling, take the abs of the accumulated scores
            if mining_type == 't':
                for p, v in list(pathway_scores.items()):
                    pathway_scores[p] = abs(v)

            # If we're scaling relative to pathway size, divide each score by the pathway's reaction count
            if self.config.get('/Data/MiningRelative'):
                print("Scaling pathway scores to pathway sizes...")
                for p, v in list(pathway_scores.items()):
                    pathway_scores[p] = float(v) / len(p.reactions)

        if not pathway_scores:
            # No data to mine
            raise ValueError("No pathway scores were generated from the input data")

        # Now take the accumulated scores; and create the output
        pathway_scorest = list(
            pathway_scores.items())  # Convert to a list of tuples so we can sort
        pathway_scorest = [(p, v) for p, v in pathway_scorest
                           if v > 0]  # Remove any scores of 0
        pathway_scorest.sort(key=lambda tup: tup[1],
                             reverse=True)  # Sort by scores (either system)

        # Get top N defined by mining_depth parameter
        keep_pathways = pathway_scorest[0:mining_depth]
        remaining_pathways = pathway_scorest[mining_depth:mining_depth + 100]

        print("Mining recommended %d out of %d" %
              (len(keep_pathways), len(pathway_scores)))

        for n, p in enumerate(keep_pathways):
            print("- %d. %s [%.2f]" % (n + 1, p[0].name, p[1]))

        #self.analysis['mining_ranked_remaining_pathways'] = []

        #if remaining_pathways:
        #    print "Note: Next pathways by current scoring method are..."
        #    for n2,p in enumerate(remaining_pathways):
        #        print "- %d. %s [%.2f]" % (n+n2+1, db.pathways[ p[0] ].name, p[1])
        #        self.analysis['mining_ranked_remaining_pathways'].append( p[0] )

        #self.analysis_suggested_pathways = [db.pathways[p[0]] for p in pathway_scorest]
        dso = DataSet(size=(1, len(keep_pathways)))
        dso.entities[1] = [k for k, v in keep_pathways]
        dso.labels[1] = [k.name for k, v in keep_pathways]
        dso.data = np.array([v for k, v in keep_pathways], ndmin=2)

        dso.labels[0][0] = "Pathway mining scores"

        return {'output': dso}
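To make the five scoring modes concrete, here is a toy standalone run of the accumulation logic above on invented (pathway, score) pairs; the pathway names and scores are illustrative only.

from collections import defaultdict

# Each pair mimics one annotated variable's contribution to a pathway
hits = [('Glycolysis', 0.8), ('Glycolysis', -0.3), ('TCA cycle', -0.5)]

for mode in ('c', 'u', 'd', 'm', 't'):
    totals = defaultdict(float)
    for pathway, score in hits:
        totals[pathway] += {
            'c': abs(score),          # change magnitude
            'u': max(0, score),       # up-regulation only
            'd': abs(min(0, score)),  # down-regulation only
            'm': 1.0,                 # membership count
            't': score,               # signed tendency
        }[mode]
    if mode == 't':
        totals = {p: abs(v) for p, v in totals.items()}  # Compare by magnitude
    print(mode, dict(totals))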