def load_bml_datafile(self, data_path, target, name):
    """Load a tab-delimited metabolite table into a DataSet.

    The file is scanned for a header row whose first cell is
    ``'metabolite'``. Every non-blank row after it is read as one
    metabolite: its name in the first cell, followed by numeric values.
    The first column and the last two columns of each row are not treated
    as sample values.

    Args:
        data_path: Path of the tab-delimited file to read.
        target: Not used by this method; kept so existing callers still work.
        name: Name given to the resulting DataSet and quoted in its
            description.

    Returns:
        A DataSet sized (number of samples, number of metabolites) with the
        metabolite names in ``labels[1]`` and the values in ``data``, or
        None when no header row is found.

    Raises:
        ValueError: If a value cell cannot be converted to float.
    """
    # NOTE(review): binary mode is kept from the original. Python 3's csv
    # module expects text rows -- confirm what utils.nonull yields.
    with open(data_path, 'rb') as handle:
        reader = csv.reader(utils.nonull(handle), delimiter='\t', dialect='excel')

        # Skip everything up to and including the header row.
        for header in reader:
            if header and header[0] == 'metabolite':
                break
        else:
            return None

        # Sample identities: each header cell minus its first 8 characters
        # and its last character.
        # NOTE(review): only the number of samples is used below; the names
        # themselves are never stored on the DataSet -- confirm intent.
        samples = [cell[8:-1] for cell in header[1:-2]]

        metabolites = []
        raw_data = []
        for row in reader:
            if not row:
                # Blank lines come back as empty rows; the header scan above
                # already tolerates them, so do the same here.
                continue
            metabolites.append(row[0])
            raw_data.append([float(value) for value in row[1:-2]])

    dso = DataSet(size=(len(samples), len(metabolites)))
    dso.labels[1] = metabolites
    # Rows were read one metabolite at a time; transpose so that metabolites
    # run along axis 1, matching labels[1].
    dso.data = np.array(raw_data).T
    dso.name = name
    dso.description = 'Imported from FIMA (%s)' % name
    return dso
def load_datafile(self, filename):
    """Read a PeakML file and return its identified peaks as a DataSet.

    Measurement ids and their set ids are read from ``header/sets/set``.
    Each top-level ``peaks/peak`` element carrying an ``identification``
    annotation contributes one column per identity listed in that
    annotation (split on ``', '``). The intensities of its nested
    ``peaks/peak`` elements fill the rows of the matching measurements.

    Args:
        filename: Path of the PeakML (XML) file to parse.

    Returns:
        dict: ``{'output': dso}`` where ``dso`` has one row per measurement
        and one column per identity. ``labels[0]`` holds the measurement
        ids and ``classes[0]`` the set id of each measurement.
        ``labels[1]`` holds the identities, ``entities[1]`` their entries
        in ``self.m.db.unification['HMDB']`` (None when absent) and
        ``scales[1]`` their masses.

    Raises:
        ValueError: If a nested peak refers to a measurement id that is not
            declared in the header sets.
    """
    xml = et.parse(filename)

    # Sample ids and class groupings.
    midclass = {}
    measurements = []
    for aset in xml.iterfind('header/sets/set'):
        set_id = aset.find('id').text
        mids = aset.find('measurementids').text
        for mid in self.decode(mids):
            midclass[mid] = set_id
            measurements.append(mid)

    # Intensity and identity information, buffered until every peak is seen.
    quantities = defaultdict(dict)
    masses = {}
    all_identities = []
    for peakset in xml.iterfind('peaks/peak'):
        identities = None
        for annotation in peakset.iterfind('annotations/annotation'):
            if annotation.find('label').text == 'identification':
                identities = annotation.find('value').text.split(', ')
                break
        if not identities:
            continue

        # Next level down: one nested peak per measurement.
        chromatograms = list(peakset.iterfind('peaks/peak'))
        if not chromatograms:
            # A peak without nested peaks has neither intensities nor a
            # mass. Previously this raised NameError or silently reused the
            # mass of the preceding peak, so skip it instead.
            continue

        # PeakML supports multiple alternative metabolite identities; we
        # do not, so the same values are duplicated under each identity.
        all_identities.extend(identities)
        for chromatogram in chromatograms:
            mid = chromatogram.find('measurementid').text
            intensity = float(chromatogram.find('intensity').text)
            mass = float(chromatogram.find('mass').text)
            for identity in identities:
                quantities[mid][identity] = intensity
        # The mass kept for an identity is that of the last nested peak.
        for identity in identities:
            masses[identity] = mass

    # First-occurrence positions, replacing list.index() in the fill loop.
    # NOTE(review): an identity named by several peaks gets one column per
    # peak, but intensities only ever land in its first column (unchanged
    # behaviour) -- confirm whether columns should be de-duplicated.
    row_of = {}
    for position, mid in enumerate(measurements):
        row_of.setdefault(mid, position)
    col_of = {}
    for position, identity in enumerate(all_identities):
        col_of.setdefault(identity, position)

    # Quantities table built; class table built; now rearrange into dso.
    dso = DataSet()
    dso.empty((len(measurements), len(all_identities)))
    dso.labels[0] = measurements
    dso.classes[0] = [midclass[mid] for mid in measurements]

    dso.labels[1] = all_identities
    db_hmdbids = self.m.db.unification['HMDB']
    dso.entities[1] = [
        db_hmdbids[hmdbid] if hmdbid in db_hmdbids else None
        for hmdbid in all_identities
    ]
    dso.scales[1] = [float(masses[identity]) for identity in all_identities]

    for mid, by_identity in quantities.items():
        if mid not in row_of:
            # Same exception type list.index() used to raise here.
            raise ValueError(
                'measurement id %r is not declared in the header sets' % (mid,))
        r = row_of[mid]
        for identity, intensity in by_identity.items():
            dso.data[r, col_of[identity]] = intensity

    dso.name = os.path.basename(filename)
    dso.description = 'Imported PeakML file'
    self.set_name(dso.name)

    return {'output': dso}
def load_datafile(self, filename):
    """Parse a PeakML file into a DataSet and announce its name.

    Measurement ids and their set ids come from ``header/sets/set``. Each
    top-level ``peaks/peak`` element with an ``identification`` annotation
    yields one column per identity in that annotation (split on ``', '``).
    Its nested ``peaks/peak`` elements supply the per-measurement
    intensities and the mass.

    Args:
        filename: Path of the PeakML (XML) file to parse.

    Returns:
        dict: ``{'output': dso}`` where ``dso`` has one row per measurement
        and one column per identity. ``labels[0]`` / ``classes[0]`` hold
        the measurement ids and their set ids. ``labels[1]`` holds the
        identities, ``entities[1]`` their entries in
        ``self.m.db.unification['HMDB']`` (None when absent) and
        ``scales[1]`` their masses.

    Raises:
        ValueError: If a nested peak refers to a measurement id that is not
            declared in the header sets.
    """
    xml = et.parse(filename)

    # Sample ids and class groupings.
    midclass = {}
    measurements = []
    for aset in xml.iterfind('header/sets/set'):
        set_id = aset.find('id').text
        mids = aset.find('measurementids').text
        for mid in self.decode(mids):
            midclass[mid] = set_id
            measurements.append(mid)

    # Intensity and identity information, buffered until every peak is seen.
    quantities = defaultdict(dict)
    masses = {}
    all_identities = []
    for peakset in xml.iterfind('peaks/peak'):
        identities = None
        for annotation in peakset.iterfind('annotations/annotation'):
            if annotation.find('label').text == 'identification':
                identities = annotation.find('value').text.split(', ')
                break
        if not identities:
            continue

        # Next level down: one nested peak per measurement.
        chromatograms = list(peakset.iterfind('peaks/peak'))
        if not chromatograms:
            # With no nested peaks there is no intensity and no mass to
            # record. The old code hit NameError here or carried over the
            # previous peak's mass, so the peak is ignored instead.
            continue

        # PeakML supports multiple alternative metabolite identities; we
        # do not, so the same values are duplicated under each identity.
        all_identities.extend(identities)
        for chromatogram in chromatograms:
            mid = chromatogram.find('measurementid').text
            intensity = float(chromatogram.find('intensity').text)
            mass = float(chromatogram.find('mass').text)
            for identity in identities:
                quantities[mid][identity] = intensity
        # The mass kept for an identity is that of the last nested peak.
        for identity in identities:
            masses[identity] = mass

    # First-occurrence positions, replacing list.index() in the fill loop.
    # NOTE(review): an identity named by several peaks gets one column per
    # peak, but intensities only ever land in its first column (unchanged
    # behaviour) -- confirm whether columns should be de-duplicated.
    row_of = {}
    for position, mid in enumerate(measurements):
        row_of.setdefault(mid, position)
    col_of = {}
    for position, identity in enumerate(all_identities):
        col_of.setdefault(identity, position)

    # Quantities table built; class table built; now rearrange into dso.
    dso = DataSet()
    dso.empty((len(measurements), len(all_identities)))
    dso.labels[0] = measurements
    dso.classes[0] = [midclass[mid] for mid in measurements]

    dso.labels[1] = all_identities
    db_hmdbids = self.m.db.unification['HMDB']
    dso.entities[1] = [
        db_hmdbids[hmdbid] if hmdbid in db_hmdbids else None
        for hmdbid in all_identities
    ]
    dso.scales[1] = [float(masses[identity]) for identity in all_identities]

    for mid, by_identity in quantities.items():
        if mid not in row_of:
            # Same exception type list.index() used to raise here.
            raise ValueError(
                'measurement id %r is not declared in the header sets' % (mid,))
        r = row_of[mid]
        for identity, intensity in by_identity.items():
            dso.data[r, col_of[identity]] = intensity

    dso.name = os.path.basename(filename)
    dso.description = 'Imported PeakML file'
    self.change_name.emit(dso.name)

    return {'output': dso}