def process_upload(self, exp, *args, **kwargs):
    """Load assay and phenotype CSV matrices and publish an ExpressionSet.

    @param exp: Experiment

    Side effects: stores this block in *exp* and fires the "success" action.
    """
    self.clean_errors()
    assay_df = pd.DataFrame.from_csv(self.es_matrix.get_file())

    es = ExpressionSet(base_dir=exp.get_data_folder(),
                       base_filename="%s_annotation" % self.uuid)

    pheno_df = pd.DataFrame.from_csv(self.pheno_matrix.get_file())
    # BUG FIX: DataFrame.set_index() returns a new frame; the original call
    # discarded the result, so the first column never actually became the index.
    pheno_df = pheno_df.set_index(pheno_df.columns[0])

    user_class_title = es.pheno_metadata["user_class_title"]
    if user_class_title not in pheno_df.columns:
        # Add an empty user-class column so downstream class assignment works.
        pheno_df[user_class_title] = ""

    es.store_assay_data_frame(assay_df)
    es.store_pheno_data_frame(pheno_df)
    if self.working_unit:
        es.working_unit = self.working_unit

    self.set_out_var("expression_set", es)
    exp.store_block(self)
    self.do_action("success", exp)
def process_upload(self, exp, *args, **kwargs):
    """Load assay/phenotype matrices, fix orientation, publish an ExpressionSet.

    @param exp: Experiment

    Side effects: stores this block in *exp* and fires the "success" action.
    """
    self.clean_errors()
    assay_df = pd.DataFrame.from_csv(self.es_matrix.get_file())

    es = ExpressionSet(base_dir=exp.get_data_folder(),
                       base_filename="%s_annotation" % self.uuid)

    pheno_df = pd.DataFrame.from_csv(self.pheno_matrix.get_file())
    # BUG FIX: DataFrame.set_index() returns a new frame; the original call
    # discarded the result, so the first column never actually became the index.
    pheno_df = pheno_df.set_index(pheno_df.columns[0])

    user_class_title = es.pheno_metadata["user_class_title"]
    if user_class_title not in pheno_df.columns:
        # Add an empty user-class column so downstream class assignment works.
        pheno_df[user_class_title] = ""

    # If the matrix orientation is "GxS", transpose it to the expected layout.
    if self.es_matrix_ori == "GxS":
        assay_df = assay_df.T

    es.store_assay_data_frame(assay_df)
    es.store_pheno_data_frame(pheno_df)
    if self.working_unit:
        es.working_unit = self.working_unit

    self.set_out_var("expression_set", es)
    exp.store_block(self)
    self.do_action("success", exp)
def process_upload(self, exp, *args, **kwargs):
    """Pair uploaded expression matrices with phenotype matrices by sorted
    name order, build one ExpressionSet per pair and emit the sequence.

    @param exp: Experiment
    """
    # TODO: move to celery
    self.clean_errors()
    sep = getattr(self, "csv_sep", " ")
    seq = []
    try:
        if len(self.pheno_matrices) != len(self.es_matrices):
            raise RuntimeError(
                "Different number of phenotypes and expression sets")

        es_names = sorted(self.es_matrices)
        self.labels = es_names
        self.pheno_by_es_names = dict(
            zip(es_names, sorted(self.pheno_matrices)))

        for es_name, pheno_name in self.pheno_by_es_names.iteritems():
            es_df = self.es_matrices[es_name].get_as_data_frame(sep)
            pheno_df = self.pheno_matrices[pheno_name].get_as_data_frame(sep)

            # The two matrices must describe the same samples
            # (comparison is order-insensitive).
            if sorted(es_df.columns.tolist()) != sorted(pheno_df.index.tolist()):
                raise RuntimeError("Couldn't match `%s` and `%s` due to "
                                   "different sample name sets" %
                                   (es_name, pheno_name))

            es = ExpressionSet(base_dir=exp.get_data_folder(),
                               base_filename="%s_%s" % (self.uuid, es_name))
            es.store_assay_data_frame(es_df)
            es.store_pheno_data_frame(pheno_df)
            # First phenotype column is taken as the user class by default.
            es.pheno_metadata["user_class_title"] = pheno_df.columns[0]
            seq.append({"es": es, "__label__": es_name})

        self.seq = seq
        exp.store_block(self)
        self.do_action("processing_done", exp, seq)
    except Exception as e:
        log.exception(e)
        self.errors.append(e)
        self.do_action("error_on_processing", exp, e)
def process_upload(self, exp, *args, **kwargs):
    """Build one ExpressionSet per uploaded expression/phenotype matrix pair.

    Pairs are formed positionally after sorting both name lists.

    @param exp: Experiment
    """
    # TODO: move to celery
    self.clean_errors()
    result_seq = []
    separator = getattr(self, "csv_sep", " ")
    try:
        if len(self.pheno_matrices) != len(self.es_matrices):
            raise RuntimeError("Different number of phenotypes and expression sets")

        sorted_es_names = sorted(self.es_matrices)
        sorted_pheno_names = sorted(self.pheno_matrices)
        self.labels = sorted_es_names
        self.pheno_by_es_names = {
            pair[0]: pair[1]
            for pair in zip(sorted_es_names, sorted_pheno_names)
        }

        for es_name, pheno_name in self.pheno_by_es_names.iteritems():
            frame_es = self.es_matrices[es_name].get_as_data_frame(separator)
            frame_pheno = self.pheno_matrices[pheno_name].get_as_data_frame(separator)

            # Reject pairs whose sample name sets differ.
            es_samples = sorted(frame_es.columns.tolist())
            pheno_samples = sorted(frame_pheno.index.tolist())
            if es_samples != pheno_samples:
                raise RuntimeError("Couldn't match `%s` and `%s` due to "
                                   "different sample name sets" % (es_name, pheno_name))

            new_es = ExpressionSet(
                base_dir=exp.get_data_folder(),
                base_filename="%s_%s" % (self.uuid, es_name)
            )
            new_es.store_assay_data_frame(frame_es)
            new_es.store_pheno_data_frame(frame_pheno)
            # Default user class is the first phenotype column.
            new_es.pheno_metadata["user_class_title"] = frame_pheno.columns[0]
            result_seq.append({"es": new_es, "__label__": es_name})

        self.seq = result_seq
        exp.store_block(self)
        self.do_action("processing_done", exp, result_seq)
    except Exception as e:
        log.exception(e)
        self.errors.append(e)
        self.do_action("error_on_processing", exp, e)
def process_upload(self, exp, *args, **kwargs):
    """Upload mRNA / miRNA / methylation matrices plus an optional phenotype
    matrix, exposing each assay as its own ExpressionSet output variable.

    @param exp: Experiment
    """
    # TODO: move to celery
    self.clean_errors()
    sep = getattr(self, "csv_sep", " ")
    try:
        if not self.pheno_matrix:
            # Missing phenotype is a warning, not an error; downstream
            # stores will receive pheno_df=None.
            self.warnings.append(Exception("Phenotype is undefined"))
            pheno_df = None
        else:
            pheno_df = self.pheno_matrix.get_as_data_frame(sep)
            # BUG FIX: set_index() returns a new frame -- the original code
            # discarded the result, so the first column never became the index.
            pheno_df = pheno_df.set_index(pheno_df.columns[0])
            # TODO: solve somehow better: Here we add empty column with user class assignment
            pheno_df[ExpressionSet(None, None).pheno_metadata["user_class_title"]] = ""

        if self.m_rna_matrix is not None:
            m_rna_assay_df = self.m_rna_matrix.get_as_data_frame(sep)
            m_rna_es = ExpressionSet(base_dir=exp.get_data_folder(),
                                     base_filename="%s_m_rna_es" % self.uuid)
            m_rna_es.store_assay_data_frame(m_rna_assay_df)
            m_rna_es.store_pheno_data_frame(pheno_df)
            m_rna_es.working_unit = self.m_rna_unit
            self.set_out_var("m_rna_es", m_rna_es)
            # TODO: fetch GPL annotation if GPL id was provided

        if self.mi_rna_matrix is not None:
            mi_rna_assay_df = self.mi_rna_matrix.get_as_data_frame(sep)
            mi_rna_es = ExpressionSet(base_dir=exp.get_data_folder(),
                                      base_filename="%s_mi_rna_es" % self.uuid)
            mi_rna_es.store_assay_data_frame(mi_rna_assay_df)
            mi_rna_es.store_pheno_data_frame(pheno_df)
            self.set_out_var("mi_rna_es", mi_rna_es)

        if self.methyl_matrix is not None:
            methyl_assay_df = self.methyl_matrix.get_as_data_frame(sep)
            methyl_es = ExpressionSet(base_dir=exp.get_data_folder(),
                                      base_filename="%s_methyl_es" % self.uuid)
            methyl_es.store_assay_data_frame(methyl_assay_df)
            methyl_es.store_pheno_data_frame(pheno_df)
            self.set_out_var("methyl_es", methyl_es)

        self.do_action("success", exp)
    except Exception as e:
        ex_type, ex, tb = sys.exc_info()
        traceback.print_tb(tb)
        self.do_action("error", exp, e)
def preprocess_soft(exp, block, source_file):
    """Parse a gzipped GEO series (GSE) SOFT file into an ExpressionSet plus a
    probe-to-gene PlatformAnnotation.

    @param exp: Experiment providing the data folder
    @param block: workflow block whose uuid names the output files
    @param source_file: wrapper exposing ``.filepath`` of the gzipped SOFT file
    @return: ([ExpressionSet, PlatformAnnotation], {})
    """
    # TODO: now we assume that we get GSE file
    try:
        soft = list(parse_geo(gzip.open(source_file.filepath)))
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; only genuine read/parse failures belong here.
        raise RuntimeError("Bad source file, can't read")

    # Layout assumption: soft[2] is the platform table and soft[3:] are the
    # per-sample tables -- TODO confirm this holds for every GSE file.
    assert soft[2].entity_type == "PLATFORM"

    pl = soft[2].table_rows
    id_idx = pl[0].index('ID')
    entrez_idx = pl[0].index('ENTREZ_GENE_ID')

    # TODO bug here
    # Build probe -> Entrez gene id(s) mapping; multi-gene probes are
    # separated by " /// " in the platform table.
    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_to_genes_GS.description[row[id_idx]] = ""
        probe_to_genes_GS.genes[row[id_idx]] = row[entrez_idx].split(" /// ")

    platform_annotation = PlatformAnnotation(
        "TODO:GET NAME FROM SOFT",
        base_dir=exp.get_data_folder(),
        base_filename="%s_annotation" % block.uuid
    )
    platform_annotation.gene_sets.metadata["gene_units"] = GeneUnits.ENTREZ_ID
    platform_annotation.gene_sets.metadata["set_units"] = GeneUnits.PROBE_ID
    platform_annotation.gene_sets.store_gs(probe_to_genes_GS)

    # Assemble the assay matrix: one column per sample (GSM accession),
    # indexed by probe ID_REF.
    id_ref_idx = soft[3].table_rows[0].index("ID_REF")
    value_idx = soft[3].table_rows[0].index("VALUE")
    assay_df = DataFrame(dict([
        (
            soft[i].entity_attributes['Sample_geo_accession'],
            Series(dict([(row[id_ref_idx], row[value_idx])
                         for row in soft[i].table_rows[1:]]))
        )
        for i in range(3, len(soft))
    ]))

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    expression_set.store_assay_data_frame(assay_df)

    raw_factors = [soft[i].entity_attributes for i in range(3, len(soft))]
    pheno_index = []

    # Here we trying to guess sub columns: a non-string attribute whose every
    # element looks like "name:value" with consistent names across all samples
    # is expanded dict-style; otherwise it is expanded positionally as a list.
    one_factor_row = raw_factors[0]
    pheno_complex_columns_def = {}
    for col_name, col in one_factor_row.iteritems():
        if type(col) in [str, unicode]:
            continue
        else:
            if all([":" in sub_col for sub_col in col]):
                mb_sub_col_names_sets = [
                    tuple(map(lambda x: x.split(":")[0], row[col_name]))
                    for row in raw_factors
                ]
                if len(set(mb_sub_col_names_sets)) == 1:
                    pheno_complex_columns_def[col_name] = "dict"
                else:
                    pheno_complex_columns_def[col_name] = "list"
            else:
                pheno_complex_columns_def[col_name] = "list"

    factors = []
    for idx, factor in enumerate(raw_factors):
        # Index phenotype rows by GSM accession, falling back to position.
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)

        fixed_factor = {}
        for col_name, col in factor.iteritems():
            # Special treat for sub columns
            if col_name in pheno_complex_columns_def:
                if pheno_complex_columns_def[col_name] == "list":
                    for sub_idx, sub_col in enumerate(col):
                        fixed_factor["%s_%s" % (col_name, sub_idx + 1)] = sub_col
                elif pheno_complex_columns_def[col_name] == "dict":
                    for sub_col in col:
                        sub_name, sub_value = sub_col.split(":", 1)
                        fixed_factor["%s_%s" % (col_name, sub_name)] = sub_value
            else:
                fixed_factor[col_name] = col
        factors.append(fixed_factor)

    # TODO: add ordering to phenotype features
    pheno_df = DataFrame([Series(factor) for factor in factors],
                         index=pheno_index)
    if expression_set.pheno_metadata["user_class_title"] not in pheno_df.columns:
        pheno_df[expression_set.pheno_metadata["user_class_title"]] = ""
    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set, platform_annotation], {}
def preprocess_soft(exp, block, source_file):
    """Parse a gzipped GEO series (GSE) SOFT file, map probes to RefSeq (or
    miRNA) identifiers, and return the resulting ExpressionSet.

    @param exp: Experiment providing the data folder
    @param block: workflow block whose uuid names the output files
    @param source_file: wrapper exposing ``.filepath`` of the gzipped SOFT file
    @return: ([ExpressionSet], {})
    """
    # TODO: now we assume that we get GSE file
    try:
        soft = list(parse_geo(gzip.open(source_file.filepath)))
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; only genuine read/parse failures belong here.
        raise RuntimeError("Bad source file, can't read")

    # Layout assumption: soft[2] is the platform table and soft[3:] are the
    # per-sample tables -- TODO confirm this holds for every GSE file.
    assert soft[2].entity_type == "PLATFORM"

    pl = soft[2].table_rows
    id_idx = pl[0].index('ID')

    # Prefer a RefSeq column; fall back to a miRNA column.
    refseq_candidates = [i for i, item in enumerate(pl[0])
                         if re.search('.*refseq.*', item, re.IGNORECASE)]
    if refseq_candidates:
        refseq_idx = refseq_candidates[0]
    else:
        mirna_candidates = [i for i, item in enumerate(pl[0])
                            if re.search('.*mirna.*', item, re.IGNORECASE)]
        if not mirna_candidates:
            # BUG FIX: previously this raised an opaque IndexError when the
            # platform had neither column; fail with a clear message instead.
            raise RuntimeError(
                "Platform table has neither a RefSeq nor a miRNA column")
        refseq_idx = mirna_candidates[0]

    # Map each probe to the FIRST identifier only (" /// " separates several).
    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_to_genes_GS.description[row[id_idx]] = ""
        probe_to_genes_GS.genes[row[id_idx]] = [row[refseq_idx].split(" /// ")[0]]

    # Optional remote-debugger hook; active only under CELERY_DEBUG.
    if settings.CELERY_DEBUG:
        import sys
        sys.path.append('/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg')
        import pydevd
        pydevd.settrace('localhost', port=6901, stdoutToServer=True, stderrToServer=True)

    # Assemble the assay matrix: one column per sample (GSM accession),
    # probe ids re-keyed to refseq/miRNA ids via map_probes_to_refseqs.
    id_ref_idx = soft[3].table_rows[0].index("ID_REF")
    value_idx = soft[3].table_rows[0].index("VALUE")
    assay_df = DataFrame(dict([
        (
            soft[i].entity_attributes['Sample_geo_accession'],
            Series(dict(map_probes_to_refseqs(
                probe_to_genes_GS.genes,
                [(row[id_ref_idx], row[value_idx])
                 for row in soft[i].table_rows[1:]])))
        )
        for i in range(3, len(soft))
    ]))

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    # Stored transposed: samples as rows, features as columns.
    expression_set.store_assay_data_frame(assay_df.T)

    raw_factors = [soft[i].entity_attributes for i in range(3, len(soft))]
    pheno_index = []

    # Here we trying to guess sub columns: a non-string attribute whose every
    # element looks like "name:value" with consistent names across all samples
    # is expanded dict-style; otherwise it is expanded positionally as a list.
    one_factor_row = raw_factors[0]
    pheno_complex_columns_def = {}
    for col_name, col in one_factor_row.iteritems():
        if type(col) in [str, unicode]:
            continue
        else:
            if all([":" in sub_col for sub_col in col]):
                mb_sub_col_names_sets = [
                    tuple(map(lambda x: x.split(":")[0], row[col_name]))
                    for row in raw_factors
                ]
                if len(set(mb_sub_col_names_sets)) == 1:
                    pheno_complex_columns_def[col_name] = "dict"
                else:
                    pheno_complex_columns_def[col_name] = "list"
            else:
                pheno_complex_columns_def[col_name] = "list"

    factors = []
    for idx, factor in enumerate(raw_factors):
        # Index phenotype rows by GSM accession, falling back to position.
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)

        fixed_factor = {}
        for col_name, col in factor.iteritems():
            # Special treat for sub columns
            if col_name in pheno_complex_columns_def:
                if pheno_complex_columns_def[col_name] == "list":
                    for sub_idx, sub_col in enumerate(col):
                        fixed_factor["%s_%s" % (col_name, sub_idx + 1)] = sub_col
                elif pheno_complex_columns_def[col_name] == "dict":
                    for sub_col in col:
                        sub_name, sub_value = sub_col.split(":", 1)
                        fixed_factor["%s_%s" % (col_name, sub_name)] = sub_value
            else:
                fixed_factor[col_name] = col
        factors.append(fixed_factor)

    # TODO: add ordering to phenotype features
    pheno_df = DataFrame([Series(factor) for factor in factors],
                         index=pheno_index)
    if expression_set.pheno_metadata["user_class_title"] not in pheno_df.columns:
        pheno_df[expression_set.pheno_metadata["user_class_title"]] = ""
    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set], {}
def preprocess_soft(exp, block, source_file):
    """Parse a gzipped GEO series (GSE) SOFT file, re-key probes to RefSeq
    (or miRNA) identifiers, and return the resulting ExpressionSet.

    @param exp: Experiment providing the data folder
    @param block: workflow block whose uuid names the output files
    @param source_file: wrapper exposing ``.filepath`` of the gzipped SOFT file
    @return: ([ExpressionSet], {})
    """
    # TODO: now we assume that we get GSE file
    try:
        soft = list(parse_geo(gzip.open(source_file.filepath)))
    except Exception:
        # BUG FIX: narrowed from a bare `except:` that also caught
        # SystemExit/KeyboardInterrupt.
        raise RuntimeError("Bad source file, can't read")

    # Layout assumption: soft[2] is the platform table, soft[3:] are the
    # per-sample tables -- TODO confirm for all GSE files.
    assert soft[2].entity_type == "PLATFORM"

    pl = soft[2].table_rows
    header = pl[0]
    id_idx = header.index('ID')

    # Pick the identifier column: prefer RefSeq, fall back to miRNA.
    refseq_matches = [
        i for i, item in enumerate(header)
        if re.search('.*refseq.*', item, re.IGNORECASE)
    ]
    if refseq_matches:
        refseq_idx = refseq_matches[0]
    else:
        mirna_matches = [
            i for i, item in enumerate(header)
            if re.search('.*mirna.*', item, re.IGNORECASE)
        ]
        if not mirna_matches:
            # BUG FIX: previously raised an opaque IndexError when the
            # platform had neither column.
            raise RuntimeError(
                "Platform table has neither a RefSeq nor a miRNA column")
        refseq_idx = mirna_matches[0]

    # Probe -> [first identifier]; " /// " separates multiple candidates.
    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_to_genes_GS.description[row[id_idx]] = ""
        probe_to_genes_GS.genes[row[id_idx]] = [
            row[refseq_idx].split(" /// ")[0]
        ]

    # Optional remote-debugger hook; active only under CELERY_DEBUG.
    if settings.CELERY_DEBUG:
        import sys
        sys.path.append(
            '/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg'
        )
        import pydevd
        pydevd.settrace('localhost', port=6901, stdoutToServer=True,
                        stderrToServer=True)

    # One column per sample (GSM accession); probe ids are re-keyed to
    # refseq/miRNA ids via map_probes_to_refseqs.
    id_ref_idx = soft[3].table_rows[0].index("ID_REF")
    value_idx = soft[3].table_rows[0].index("VALUE")
    assay_df = DataFrame(
        dict([(soft[i].entity_attributes['Sample_geo_accession'],
               Series(dict(map_probes_to_refseqs(
                   probe_to_genes_GS.genes,
                   [(row[id_ref_idx], row[value_idx])
                    for row in soft[i].table_rows[1:]]))))
              for i in range(3, len(soft))]))

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    # Stored transposed: samples as rows, features as columns.
    expression_set.store_assay_data_frame(assay_df.T)

    raw_factors = [soft[i].entity_attributes for i in range(3, len(soft))]
    pheno_index = []

    # Here we trying to guess sub columns: a non-string attribute whose every
    # element looks like "name:value" with consistent names across samples is
    # expanded dict-style; otherwise positionally as a list.
    one_factor_row = raw_factors[0]
    pheno_complex_columns_def = {}
    for col_name, col in one_factor_row.iteritems():
        if type(col) in [str, unicode]:
            continue
        else:
            if all([":" in sub_col for sub_col in col]):
                mb_sub_col_names_sets = [
                    tuple(map(lambda x: x.split(":")[0], row[col_name]))
                    for row in raw_factors
                ]
                if len(set(mb_sub_col_names_sets)) == 1:
                    pheno_complex_columns_def[col_name] = "dict"
                else:
                    pheno_complex_columns_def[col_name] = "list"
            else:
                pheno_complex_columns_def[col_name] = "list"

    factors = []
    for idx, factor in enumerate(raw_factors):
        # Index phenotype rows by GSM accession, falling back to position.
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)

        fixed_factor = {}
        for col_name, col in factor.iteritems():
            # Special treat for sub columns
            if col_name in pheno_complex_columns_def:
                if pheno_complex_columns_def[col_name] == "list":
                    for sub_idx, sub_col in enumerate(col):
                        fixed_factor["%s_%s" % (col_name, sub_idx + 1)] = sub_col
                elif pheno_complex_columns_def[col_name] == "dict":
                    for sub_col in col:
                        sub_name, sub_value = sub_col.split(":", 1)
                        fixed_factor["%s_%s" % (col_name, sub_name)] = sub_value
            else:
                fixed_factor[col_name] = col
        factors.append(fixed_factor)

    # TODO: add ordering to phenotype features
    pheno_df = DataFrame([Series(factor) for factor in factors],
                         index=pheno_index)
    if expression_set.pheno_metadata["user_class_title"] not in pheno_df.columns:
        pheno_df[expression_set.pheno_metadata["user_class_title"]] = ""
    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set], {}
def process_upload(self, exp, *args, **kwargs):
    """Upload mRNA / miRNA / methylation matrices plus an optional phenotype
    matrix, exposing each assay as its own ExpressionSet output variable.

    @param exp: Experiment
    """
    # TODO: move to celery
    self.clean_errors()
    sep = getattr(self, "csv_sep", " ")
    try:
        if not self.pheno_matrix:
            # Missing phenotype is a warning, not an error; downstream
            # stores will receive pheno_df=None.
            self.warnings.append(Exception("Phenotype is undefined"))
            pheno_df = None
        else:
            pheno_df = self.pheno_matrix.get_as_data_frame(sep)
            # BUG FIX: set_index() returns a new frame -- the original code
            # discarded the result, so the first column never became the index.
            pheno_df = pheno_df.set_index(pheno_df.columns[0])
            # TODO: solve somehow better: Here we add empty column with user class assignment
            pheno_df[ExpressionSet(
                None, None).pheno_metadata["user_class_title"]] = ""

        if self.m_rna_matrix is not None:
            m_rna_assay_df = self.m_rna_matrix.get_as_data_frame(sep)
            m_rna_es = ExpressionSet(base_dir=exp.get_data_folder(),
                                     base_filename="%s_m_rna_es" % self.uuid)
            m_rna_es.store_assay_data_frame(m_rna_assay_df)
            m_rna_es.store_pheno_data_frame(pheno_df)
            m_rna_es.working_unit = self.m_rna_unit
            self.set_out_var("m_rna_es", m_rna_es)
            # TODO: fetch GPL annotation if GPL id was provided

        if self.mi_rna_matrix is not None:
            mi_rna_assay_df = self.mi_rna_matrix.get_as_data_frame(sep)
            mi_rna_es = ExpressionSet(base_dir=exp.get_data_folder(),
                                      base_filename="%s_mi_rna_es" % self.uuid)
            mi_rna_es.store_assay_data_frame(mi_rna_assay_df)
            mi_rna_es.store_pheno_data_frame(pheno_df)
            self.set_out_var("mi_rna_es", mi_rna_es)

        if self.methyl_matrix is not None:
            methyl_assay_df = self.methyl_matrix.get_as_data_frame(sep)
            methyl_es = ExpressionSet(base_dir=exp.get_data_folder(),
                                      base_filename="%s_methyl_es" % self.uuid)
            methyl_es.store_assay_data_frame(methyl_assay_df)
            methyl_es.store_pheno_data_frame(pheno_df)
            self.set_out_var("methyl_es", methyl_es)

        self.do_action("success", exp)
    except Exception as e:
        ex_type, ex, tb = sys.exc_info()
        traceback.print_tb(tb)
        self.do_action("error", exp, e)