def _init(self):
    """Fetch and load the D9C1 gold standards and submission templates.

    Does nothing when ``self._standalone`` is exactly ``True``. Otherwise
    the gold standard archive is obtained through ``self._download_data``,
    its ``gct`` member is parsed into ``self.goldstandard`` (indexed by the
    ``Name`` column, ``Description`` column dropped, column labels
    stripped), the SC1/SC2/SC3 templates are fetched, and the SC2 gold
    standard is read into ``self.gs_priority``.
    """
    if self._standalone is True:
        return

    # Gold standard archive: fetched through the download helper.
    gs_archive = 'D9C1_goldstandard.gct.zip'
    self._download_data(gs_archive, 'syn4595275')

    # Read the gct member straight from the archive, without extracting it.
    archive = ZIP()
    archive.loadZIPFile(self.get_pathname(gs_archive))
    raw_gct = archive.read('D9C1_goldstandard.gct')

    # The first two lines of the member are skipped; fields are separated
    # by a single space or tab.
    self.goldstandard = pd.read_csv(
        BytesIO(raw_gct), sep='[ \t]', skiprows=2, engine='python')
    self.goldstandard.drop(['Description'], axis=1, inplace=True)
    self.goldstandard.set_index('Name', inplace=True)
    self.goldstandard.columns = [
        label.strip() for label in self.goldstandard.columns]

    # Templates for SC1, SC2 and SC3; only the SC1 archive is extracted.
    self._download_data('D9C1_template_sc1.gct.zip', 'syn4595283')
    self.unzip('D9C1_template_sc1.gct.zip')
    for template, synapse_id in (
            ('D9C1_template_sc2.zip', 'syn4595587'),
            ('D9C1_template_sc3.zip', 'syn4595588')):
        self._download_data(template, synapse_id)

    # Gold standard for SC2: tab-separated, no header row.
    sc2_path = self.getpath_gs('D9C1_goldstandard_sc2.txt')
    self.gs_priority = pd.read_csv(sc2_path, sep='\t', header=None)
def download_all_data(self):
    """Download all large data sets from Synapse

    Fetches the gold standard archive, parses its ``Answers.txt`` member
    into ``self.gs`` (tab-separated), then fetches the four remaining
    files. A five-step progress bar is advanced after each download.
    """
    # Local import keeps this method self-contained; io.BytesIO exists on
    # both Python 2.6+ and Python 3, unlike the Python 2-only
    # ``StringIO.StringIO``.
    from io import BytesIO

    pb = progress_bar(5)

    # load the large gold standard file from D5C2 synapse main page
    filename = self._download_data('DREAM5_GoldStandard_probes.zip',
                                   'syn2898469')
    pb.animate(1)

    z = ZIP()
    z.loadZIPFile(filename)
    data = z.read('Answers.txt')
    # The archive member is raw bytes, so wrap it in a bytes buffer.
    self.gs = pd.read_csv(BytesIO(data), sep='\t')

    # download 4 other filenames from dreamtools synapse project
    self._download_data('all_8mers.txt', 'syn4483185')
    pb.animate(2)
    self._download_data('8mers_gs.txt', 'syn4483187')
    pb.animate(3)
    self._download_data('probe35_gs.txt', 'syn4483184')
    pb.animate(4)
    self._download_data('probes35.txt', 'syn4483183')
    pb.animate(5)
def download_all_data(self):
    """Retrieve every large data set from Synapse.

    The gold standard archive is fetched first and its ``Answers.txt``
    member is parsed (tab-separated) into ``self.gs``. Four further files
    are then fetched; the progress bar moves one step per download.
    """
    progress = progress_bar(5)

    # Large gold standard archive from the D5C2 synapse main page.
    archive_path = self._download_data('DREAM5_GoldStandard_probes.zip',
                                       'syn2898469')
    progress.animate(1)

    archive = ZIP()
    archive.loadZIPFile(archive_path)
    answers = archive.read('Answers.txt')
    self.gs = pd.read_csv(BytesIO(answers), sep='\t')

    # Remaining files live in the dreamtools synapse project.
    remaining = (
        ('all_8mers.txt', 'syn4483185'),
        ('8mers_gs.txt', 'syn4483187'),
        ('probe35_gs.txt', 'syn4483184'),
        ('probes35.txt', 'syn4483183'),
    )
    for step, (name, synapse_id) in enumerate(remaining, start=2):
        self._download_data(name, synapse_id)
        progress.animate(step)
def _score_sc2_sc3(self, filename):
    """Score an SC2/SC3 submission provided as a zip archive.

    The archive must contain exactly two members: one whose name ends in
    ``gct`` (the prediction, parsed with ``self._read_gct``) and one whose
    name ends in ``txt`` (the feature file, parsed with
    ``self._read_feature`` and indexed by its first column). The feature
    table is stored on the instance but does not enter the score.

    The score is the mean, over the rows selected from
    ``self.goldstandard`` by ``self.gs_priority[0]``, of the Spearman
    correlation between prediction row ``i`` and gold standard row ``i``.

    Side effects: sets ``self.prediction``, ``self.feature``, ``self.df1``
    and ``self.scores``.

    :param filename: path of the zip archive to score.
    :return: ``{'score': final_score}``.
    :raises AssertionError: if the archive does not hold 2 members or the
        prediction shape is not ``(2647, 44)``.
    :raises ValueError: if a member has an unexpected extension, or if the
        prediction or the feature member is missing.
    """
    bad_archive = ("there should be only 2 files. \n" +
                   "One ending with gct extension (prediction)\n" +
                   "One ending with txt extension (feature)")

    z = ZIP()
    z.loadZIPFile(filename)

    # there should be 2 files, one ending in gct one in txt
    assert len(z.filenames) == 2, "There should be 2 files in the zip archive"

    prediction = None
    feature = None
    # A distinct loop name keeps the ``filename`` argument intact.
    for member in z.filenames:
        if member.endswith('gct'):
            prediction = self._read_gct(z.read(member))
        elif member.endswith('txt'):
            feature = self._read_feature(z.read(member))
            # first column should be the names
            feature.set_index(0, inplace=True)
        else:
            raise ValueError(bad_archive)

    # Two members of the same kind would otherwise leave one name unbound
    # and fail later with an unrelated UnboundLocalError.
    if prediction is None or feature is None:
        raise ValueError(bad_archive)

    assert prediction.shape == (2647,44)
    self.prediction = prediction
    self.feature = feature

    # in SC2, only a subset of predictive features (2647 out of 17.000) are used
    # NOTE(review): ``.ix`` was removed from pandas; ``.loc`` is used here on
    # the assumption that gs_priority[0] holds labels of the gold standard
    # index, and ``.iloc`` below on the assumption that rows are paired by
    # position -- confirm against _read_gct and the SC2 gold standard file.
    df1 = self.goldstandard.loc[self.gs_priority[0]]
    self.df1 = df1
    df2 = self.prediction

    scores = [df2.iloc[i].corr(df1.iloc[i], method='spearman')
              for i in range(len(df1))]
    self.scores = scores

    final_score = sum(scores)/float(len(scores))
    return {'score': final_score}
def unzip(self, filename):
    """Simple method to extract all files contained in an archive

    :param filename: archive name, resolved with ``self.get_pathname``.

    Every member is extracted into ``self.directory``.
    """
    from dreamtools.core.ziptools import ZIP

    z = ZIP()
    # Two plain statements: the original joined these calls with a stray
    # comma, which only built and discarded a tuple.
    z.loadZIPFile(self.get_pathname(filename))
    z.extractall(self.directory)
def unzip(self, filename):
    """Simple method to extract all files contained in an archive

    :param filename: archive name, resolved with ``self.get_pathname``.

    Every member is extracted into ``self.directory``.
    """
    from dreamtools.core.ziptools import ZIP

    z = ZIP()
    # Two plain statements: the original joined these calls with a stray
    # comma, which only built and discarded a tuple.
    z.loadZIPFile(self.get_pathname(filename))
    z.extractall(self.directory)
def test_zip():
    """Smoke test: a failing ``loadZIPFile('p')`` call must not propagate.

    Any ordinary exception raised while loading is ignored on purpose.
    """
    z = ZIP()
    try:
        # presumably 'p' does not name a valid archive -- verify intent
        z.loadZIPFile('p')
    except Exception:
        # Deliberately best-effort, but narrower than a bare ``except`` so
        # KeyboardInterrupt and SystemExit still propagate.
        pass