def preprocess_soft(exp, block, source_file):
    """Build an expression set (assay + phenotype tables) from a GEO SOFT file.

    The gzipped file at ``source_file.filepath`` is parsed with ``parse_geo``.
    The entity at index 2 must be the platform table; every entity from
    index 3 onwards is treated as a sample.

    Args:
        exp: object providing ``get_data_folder()``; the result is stored
            under that folder.
        block: object providing ``uuid``, used to name the stored data.
        source_file: object providing ``filepath``, the gzipped SOFT file.

    Returns:
        A tuple ``([expression_set], {})``.

    Raises:
        RuntimeError: if the file cannot be opened or parsed.
        AssertionError: if the entity at index 2 is not a platform.
        IndexError: if the platform table has neither a RefSeq nor a miRNA
            column.
    """
    # TODO: now we assume that we get GSE file
    try:
        # The context manager closes the gzip handle even if parsing fails.
        with gzip.open(source_file.filepath) as soft_file:
            soft = list(parse_geo(soft_file))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not reported as a bad file.
        raise RuntimeError("Bad source file, can't read")

    assert soft[2].entity_type == "PLATFORM"

    pl = soft[2].table_rows
    platform_header = pl[0]
    id_idx = platform_header.index('ID')

    # Prefer a RefSeq annotation column; fall back to a miRNA one.
    annotation_columns = [
        i for i, item in enumerate(platform_header)
        if re.search('.*refseq.*', item, re.IGNORECASE)
    ]
    if not annotation_columns:
        annotation_columns = [
            i for i, item in enumerate(platform_header)
            if re.search('.*mirna.*', item, re.IGNORECASE)
        ]
    if not annotation_columns:
        # Same exception type the bare ``[...][0]`` used to raise, but with
        # a message that says what is actually missing.
        raise IndexError(
            "Platform table has neither a RefSeq nor a miRNA column")
    refseq_idx = annotation_columns[0]

    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_to_genes_GS.description[row[id_idx]] = ""
        # Only the first " /// "-separated identifier is kept per probe.
        probe_to_genes_GS.genes[row[id_idx]] = [
            row[refseq_idx].split(" /// ")[0]
        ]

    # NOTE: storing a PlatformAnnotation for the probe -> gene mapping is
    # disabled in this version; only the expression set is returned.

    if settings.CELERY_DEBUG:
        # NOTE(review): developer-specific remote-debugger hook with a
        # hard-coded egg path and port -- confirm it is still wanted.
        import sys
        debug_egg = '/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg'
        if debug_egg not in sys.path:
            # Avoid growing sys.path by one entry per call.
            sys.path.append(debug_egg)
        import pydevd
        pydevd.settrace('localhost', port=6901, stdoutToServer=True, stderrToServer=True)

    samples = soft[3:]

    # Column positions are read from the first sample and reused for all.
    sample_header = soft[3].table_rows[0]
    id_ref_idx = sample_header.index("ID_REF")
    value_idx = sample_header.index("VALUE")

    # One Series per sample, keyed by its GEO accession.
    # map_probes_to_refseqs presumably re-keys the (probe, value) pairs via
    # the probe -> gene mapping -- verify against its definition.
    assay_df = DataFrame(dict(
        (
            sample.entity_attributes['Sample_geo_accession'],
            Series(dict(
                map_probes_to_refseqs(
                    probe_to_genes_GS.genes,
                    [(row[id_ref_idx], row[value_idx])
                     for row in sample.table_rows[1:]]
                )
            ))
        )
        for sample in samples
    ))

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    # Transposed before storing, so samples become rows.
    expression_set.store_assay_data_frame(assay_df.T)

    raw_factors = [sample.entity_attributes for sample in samples]

    # Here we try to guess sub columns, looking at the first sample only.
    pheno_complex_columns_def = {}
    for col_name, col in raw_factors[0].iteritems():
        if isinstance(col, (str, unicode)):
            continue
        if all(":" in sub_col for sub_col in col):
            # "dict" only when every sample has the same "name:" prefixes
            # in the same order; otherwise fall back to positional naming.
            sub_col_name_sets = set(
                tuple(item.split(":")[0] for item in row[col_name])
                for row in raw_factors
            )
            column_kind = "dict" if len(sub_col_name_sets) == 1 else "list"
        else:
            column_kind = "list"
        pheno_complex_columns_def[col_name] = column_kind

    pheno_index = []
    factors = []
    for idx, factor in enumerate(raw_factors):
        # The accession becomes the row label (the position is used when it
        # is absent); table markers are dropped. This mutates the parsed
        # attributes in place.
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)

        fixed_factor = {}
        for col_name, col in factor.iteritems():
            # Special treatment for sub columns
            column_kind = pheno_complex_columns_def.get(col_name)
            if column_kind == "list":
                for sub_idx, sub_col in enumerate(col):
                    fixed_factor["%s_%s" % (col_name, sub_idx + 1)] = sub_col
            elif column_kind == "dict":
                for sub_col in col:
                    sub_name, sub_value = sub_col.split(":", 1)
                    fixed_factor["%s_%s" % (col_name, sub_name)] = sub_value
            else:
                fixed_factor[col_name] = col
        factors.append(fixed_factor)

    # TODO: add ordering to phenotype features
    pheno_df = DataFrame([Series(factor) for factor in factors], index=pheno_index)

    # Make sure the user-class column exists, empty by default.
    user_class_title = expression_set.pheno_metadata["user_class_title"]
    if user_class_title not in pheno_df.columns:
        pheno_df[user_class_title] = ""

    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set], {}
def preprocess_soft(exp, block, source_file):
    """Build an expression set and a platform annotation from a GEO SOFT file.

    The gzipped file at ``source_file.filepath`` is parsed with ``parse_geo``.
    The entity at index 2 must be the platform table; every entity from
    index 3 onwards is treated as a sample.

    Args:
        exp: object providing ``get_data_folder()``; results are stored
            under that folder.
        block: object providing ``uuid``, used to name the stored data.
        source_file: object providing ``filepath``, the gzipped SOFT file.

    Returns:
        A tuple ``([expression_set, platform_annotation], {})``.

    Raises:
        RuntimeError: if the file cannot be opened or parsed.
        AssertionError: if the entity at index 2 is not a platform.
    """
    # TODO: now we assume that we get GSE file
    try:
        # The context manager closes the gzip handle even if parsing fails.
        with gzip.open(source_file.filepath) as soft_file:
            soft = list(parse_geo(soft_file))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not reported as a bad file.
        raise RuntimeError("Bad source file, can't read")

    assert soft[2].entity_type == "PLATFORM"

    pl = soft[2].table_rows
    platform_header = pl[0]
    id_idx = platform_header.index('ID')
    # TODO bug here: presumably this fails (``index`` raising ValueError on
    # a list header) for platforms without an ENTREZ_GENE_ID column --
    # confirm and decide on a fallback.
    entrez_idx = platform_header.index('ENTREZ_GENE_ID')

    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_to_genes_GS.description[row[id_idx]] = ""
        # A probe may list several identifiers separated by " /// ".
        probe_to_genes_GS.genes[row[id_idx]] = row[entrez_idx].split(" /// ")

    platform_annotation = PlatformAnnotation(
        "TODO:GET NAME FROM SOFT",
        base_dir=exp.get_data_folder(),
        base_filename="%s_annotation" % block.uuid
    )

    gene_sets = platform_annotation.gene_sets
    gene_sets.metadata["gene_units"] = GeneUnits.ENTREZ_ID
    gene_sets.metadata["set_units"] = GeneUnits.PROBE_ID
    gene_sets.store_gs(probe_to_genes_GS)

    samples = soft[3:]

    # Column positions are read from the first sample and reused for all.
    sample_header = soft[3].table_rows[0]
    id_ref_idx = sample_header.index("ID_REF")
    value_idx = sample_header.index("VALUE")

    # One Series per sample, keyed by its GEO accession; each Series maps
    # the ID_REF cell of a row to its VALUE cell.
    assay_df = DataFrame(dict(
        (
            sample.entity_attributes['Sample_geo_accession'],
            Series(dict(
                (row[id_ref_idx], row[value_idx])
                for row in sample.table_rows[1:]
            ))
        )
        for sample in samples
    ))

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    expression_set.store_assay_data_frame(assay_df)

    raw_factors = [sample.entity_attributes for sample in samples]

    # Here we try to guess sub columns, looking at the first sample only.
    pheno_complex_columns_def = {}
    for col_name, col in raw_factors[0].iteritems():
        if isinstance(col, (str, unicode)):
            continue
        if all(":" in sub_col for sub_col in col):
            # "dict" only when every sample has the same "name:" prefixes
            # in the same order; otherwise fall back to positional naming.
            sub_col_name_sets = set(
                tuple(item.split(":")[0] for item in row[col_name])
                for row in raw_factors
            )
            column_kind = "dict" if len(sub_col_name_sets) == 1 else "list"
        else:
            column_kind = "list"
        pheno_complex_columns_def[col_name] = column_kind

    pheno_index = []
    factors = []
    for idx, factor in enumerate(raw_factors):
        # The accession becomes the row label (the position is used when it
        # is absent); table markers are dropped. This mutates the parsed
        # attributes in place.
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)

        fixed_factor = {}
        for col_name, col in factor.iteritems():
            # Special treatment for sub columns
            column_kind = pheno_complex_columns_def.get(col_name)
            if column_kind == "list":
                for sub_idx, sub_col in enumerate(col):
                    fixed_factor["%s_%s" % (col_name, sub_idx + 1)] = sub_col
            elif column_kind == "dict":
                for sub_col in col:
                    sub_name, sub_value = sub_col.split(":", 1)
                    fixed_factor["%s_%s" % (col_name, sub_name)] = sub_value
            else:
                fixed_factor[col_name] = col
        factors.append(fixed_factor)

    # TODO: add ordering to phenotype features
    pheno_df = DataFrame([Series(factor) for factor in factors], index=pheno_index)

    # Make sure the user-class column exists, empty by default.
    user_class_title = expression_set.pheno_metadata["user_class_title"]
    if user_class_title not in pheno_df.columns:
        pheno_df[user_class_title] = ""

    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set, platform_annotation], {}
def preprocess_soft(exp, block, source_file):
    """Build an expression set (assay + phenotype tables) from a GEO SOFT file.

    The gzipped file at ``source_file.filepath`` is parsed with ``parse_geo``.
    The entity at index 2 must be the platform table; every entity from
    index 3 onwards is treated as a sample.

    Args:
        exp: object providing ``get_data_folder()``; the result is stored
            under that folder.
        block: object providing ``uuid``, used to name the stored data.
        source_file: object providing ``filepath``, the gzipped SOFT file.

    Returns:
        A tuple ``([expression_set], {})``.

    Raises:
        RuntimeError: if the file cannot be opened or parsed.
        AssertionError: if the entity at index 2 is not a platform.
        IndexError: if the platform table has neither a RefSeq nor a miRNA
            column.
    """
    # TODO: now we assume that we get GSE file
    try:
        # The context manager closes the gzip handle even if parsing fails.
        with gzip.open(source_file.filepath) as soft_file:
            soft = list(parse_geo(soft_file))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not reported as a bad file.
        raise RuntimeError("Bad source file, can't read")

    assert soft[2].entity_type == "PLATFORM"

    pl = soft[2].table_rows
    id_idx = pl[0].index('ID')
    refseq_idx = _soft_annotation_column_index(pl[0])

    probe_to_genes_GS = GS()
    for row in pl[1:]:
        probe_id = row[id_idx]
        probe_to_genes_GS.description[probe_id] = ""
        # Only the first " /// "-separated identifier is kept per probe.
        probe_to_genes_GS.genes[probe_id] = [
            row[refseq_idx].split(" /// ")[0]
        ]

    # NOTE: storing a PlatformAnnotation for the probe -> gene mapping is
    # disabled in this version; only the expression set is returned.

    if settings.CELERY_DEBUG:
        # NOTE(review): developer-specific remote-debugger hook with a
        # hard-coded egg path and port -- confirm it is still wanted.
        import sys
        debug_egg = '/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg'
        if debug_egg not in sys.path:
            # Avoid growing sys.path by one entry per call.
            sys.path.append(debug_egg)
        import pydevd
        pydevd.settrace('localhost',
                        port=6901,
                        stdoutToServer=True,
                        stderrToServer=True)

    samples = soft[3:]

    # Column positions are read from the first sample and reused for all.
    sample_header = soft[3].table_rows[0]
    id_ref_idx = sample_header.index("ID_REF")
    value_idx = sample_header.index("VALUE")

    # One Series per sample, keyed by its GEO accession.
    # map_probes_to_refseqs presumably re-keys the (probe, value) pairs via
    # the probe -> gene mapping -- verify against its definition.
    per_sample_series = {}
    for sample in samples:
        probe_values = [(row[id_ref_idx], row[value_idx])
                        for row in sample.table_rows[1:]]
        accession = sample.entity_attributes['Sample_geo_accession']
        per_sample_series[accession] = Series(dict(
            map_probes_to_refseqs(probe_to_genes_GS.genes, probe_values)
        ))
    assay_df = DataFrame(per_sample_series)

    expression_set = ExpressionSet(exp.get_data_folder(), "%s_es" % block.uuid)
    # Transposed before storing, so samples become rows.
    expression_set.store_assay_data_frame(assay_df.T)

    raw_factors = [sample.entity_attributes for sample in samples]

    # Must run before the pops below: it inspects the untouched attributes.
    pheno_complex_columns_def = _soft_guess_complex_columns(raw_factors)

    pheno_index = []
    factors = []
    for idx, factor in enumerate(raw_factors):
        # The accession becomes the row label (the position is used when it
        # is absent); table markers are dropped. This mutates the parsed
        # attributes in place.
        pheno_index.append(factor.pop('Sample_geo_accession', idx))
        factor.pop('sample_table_begin', None)
        factor.pop('sample_table_end', None)
        factors.append(_soft_flatten_factor(factor, pheno_complex_columns_def))

    # TODO: add ordering to phenotype features
    pheno_df = DataFrame([Series(factor) for factor in factors],
                         index=pheno_index)

    # Make sure the user-class column exists, empty by default.
    user_class_title = expression_set.pheno_metadata["user_class_title"]
    if user_class_title not in pheno_df.columns:
        pheno_df[user_class_title] = ""

    pheno_df.index.name = 'Sample_geo_accession'
    expression_set.store_pheno_data_frame(pheno_df)

    return [expression_set], {}


def _soft_annotation_column_index(header):
    """Return the index of the first RefSeq column, else the first miRNA one."""
    for pattern in ('.*refseq.*', '.*mirna.*'):
        matches = [
            i for i, item in enumerate(header)
            if re.search(pattern, item, re.IGNORECASE)
        ]
        if matches:
            return matches[0]
    # Same exception type an empty ``[...][0]`` lookup would raise, but with
    # a message that says what is actually missing.
    raise IndexError("Platform table has neither a RefSeq nor a miRNA column")


def _soft_guess_complex_columns(raw_factors):
    """Classify non-string attributes of the first sample as "dict" or "list"."""
    complex_columns = {}
    for col_name, col in raw_factors[0].iteritems():
        if isinstance(col, (str, unicode)):
            continue
        if all(":" in sub_col for sub_col in col):
            # "dict" only when every sample has the same "name:" prefixes
            # in the same order; otherwise fall back to positional naming.
            sub_col_name_sets = set(
                tuple(item.split(":")[0] for item in row[col_name])
                for row in raw_factors
            )
            column_kind = "dict" if len(sub_col_name_sets) == 1 else "list"
        else:
            column_kind = "list"
        complex_columns[col_name] = column_kind
    return complex_columns


def _soft_flatten_factor(factor, complex_columns):
    """Expand multi-valued attributes of one sample into flat columns."""
    fixed_factor = {}
    for col_name, col in factor.iteritems():
        column_kind = complex_columns.get(col_name)
        if column_kind == "list":
            # Positional naming: <column>_1, <column>_2, ...
            for sub_idx, sub_col in enumerate(col):
                fixed_factor["%s_%s" % (col_name, sub_idx + 1)] = sub_col
        elif column_kind == "dict":
            # "name:value" items become <column>_<name> = value.
            for sub_col in col:
                sub_name, sub_value = sub_col.split(":", 1)
                fixed_factor["%s_%s" % (col_name, sub_name)] = sub_value
        else:
            fixed_factor[col_name] = col
    return fixed_factor