def get_test_data():
    """Load the test expression table and the test cell annotation table.

    Both files are resolved through ``fg_io.get_input_path`` (keys
    'test_genes_data_input' and 'test_cells_meta_input') and parsed as
    tab-separated text with pandas.

    Returns:
        tuple: ``(data, pheno)`` - the expression DataFrame followed by the
        cell annotation DataFrame.
    """
    # @todo: how to test?
    logger.info('Loading genes and cell annotation matrice')

    # Resolve both paths first, then parse them in the same order.
    # Former hard-coded local files: './bladder-expr.txt' and './bladder-pheno.txt'
    expression_file = fg_io.get_input_path('test_genes_data_input')
    annotation_file = fg_io.get_input_path('test_cells_meta_input')

    return (
        pd.read_csv(expression_file, sep='\t'),
        pd.read_csv(annotation_file, sep='\t'),
    )
def get_data():
    """Load the expression data as a wide matrix plus the cell annotation table.

    The expression input (key 'expression_input') is read as a tab-separated
    table, indexed by ``Columns.GENES`` and ``Columns.CELLS`` and unstacked so
    that the innermost index level (the cells) moves into the columns. Entries
    that are missing after unstacking are replaced by 0.

    The cell annotation input (key 'cells_meta_input') is read as a plain
    tab-separated table.

    Returns:
        tuple: ``(data, pheno)`` - the unstacked expression DataFrame followed
        by the cell annotation DataFrame.
    """
    logger.info('Loading genes and cell annotation matrice')

    # @todo: tidy up
    # genes_path = fg_io.get_input_path('genes_data_input')
    expre_path = fg_io.get_input_path('expression_input')
    cells_meta = fg_io.get_input_path('cells_meta_input')

    # combat requires full matrix input - unstack input file
    # combat expects matrix of shape [genes x cells], so index columns accordingly
    # @todo: check if this truly makes sense
    # @todo: the Columns.enum-Trick sucks, this should be some global definition
    # @todo: will blow up for large data files
    long_table = pd.read_csv(expre_path, sep='\t')

    # Preview of the raw input goes through the logger instead of stdout, so it
    # obeys the configured log level and does not pollute the app's output.
    logger.debug('First rows of the expression input:\n%s', long_table.head(10))

    data = (
        long_table
        .set_index([Columns.GENES, Columns.CELLS])
        .unstack()
        .fillna(0)
    )
    # @todo this sucks as well - won't hurt to select this column, but
    # @todo I'd rather have a global data scheme
    # .loc[:, Columns.EXPR]

    pheno = pd.read_csv(cells_meta, sep='\t')
    return data, pheno
import fastgenomics.io as fg_io

# Input: path of the file registered under the key 'exprs_in'.
exprs_in = fg_io.get_input_path('exprs_in')

# Outputs: path for the 'expression.csv' output and path of the summary file.
exprs_transformed = fg_io.get_output_path('expression.csv')
summary = fg_io.get_summary_path()
def main():
    """
    Main routine of hello genomics, a minimal 'calculation app'.

    Reads the genes matrix row by row, counts the genes and the occurrences of
    each gene type, writes both counts as JSON to the 'data_quality_output' file
    and renders a Markdown summary from the Jinja2 template 'summary.md.j2'.

    Raises:
        ValueError: if the header of the genes matrix has no 'type' column.
    """
    # LOAD PARAMETERS
    #
    # Your parameters are located in /fastgenomics/config/parameters.json (not implemented yet!)
    # Our fastgenomics.io module loads these parameters or uses the pre-defined defaults
    # defined in your manifest.json of your application.
    #
    logger.info("Loading parameters")
    parameters = fg_io.get_parameters()

    # BEST PRACTICE:
    # Set or save random seed to achieve reproducibility
    # random.seed(4711)
    parameters['random_seed'] = 4711

    # GET GENES MATRIX FROM DATASET
    #
    # Data from the origin dataset is located in /fastgenomics/data/dataset
    # Data from other calculations is located in /fastgenomics/data/uuid_of_calculation/output
    # You can easily access data with our fastgenomics.io module as follows:
    #
    logger.info("Loading genes matrix")

    # HINT: the key 'genes_data_input' has to be defined in your manifest.json
    #       the actual path and filename will be provided by the FASTGenomics runtime and be available via
    #       /fastgenomics/config/input_file_mapping.json
    genes_path = fg_io.get_input_path('genes_data_input')

    # newline='' is what the csv module asks for: it then handles line endings
    # itself, so newlines embedded in quoted fields are parsed correctly.
    with genes_path.open('r', newline='') as f_in:
        # LOAD GENES MATRIX:
        # The csv-reader-instance is an iterator for our input-file.
        # We save memory (in case of large input files) by iterating over rows instead of loading the content entirely.
        reader = csv.reader(f_in, delimiter=fg_io.get_parameter('delimiter'))

        # GET HEADER:
        # Get first row of the file, get rid of the '*type'-annotation, and transform column-names to lowercase
        header = [col.split('*')[0].lower() for col in next(reader)]

        # PERFORM SOME CALCULATION
        #
        # We here do some sample calculations and count genes and gene types
        #
        # 1. Create target dict with default entry int(0)
        gene_types = defaultdict(int)

        # 2. Get index of gene-type column
        gene_type_col = header.index("type")

        # 3. Count: Increase gene-type by one for each hit by iterating over the rows of our genes matrix
        num_genes = 0
        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line (e.g. a
                # trailing empty line); such a line is not a gene, so skip it
                # instead of failing on the column lookup.
                continue
            gene_types[row[gene_type_col]] += 1
            num_genes += 1

    logger.info(f"Found {num_genes} genes and {len(gene_types)} gene types.")

    # WRITE OUTPUT
    #
    # You can write output files to /fastgenomics/output
    # Consider using the fastgenomics.io module to ensure correct paths and interfaces
    #
    logger.info("Storing results")

    # Hint: the key 'data_quality_output' has to be defined in your manifest.json
    output_path = fg_io.get_output_path('data_quality_output')
    results = {'num_genes': num_genes, 'gene_types': gene_types}

    with output_path.open('w') as f_out:
        json.dump(results, f_out)

    # WRITE SUMMARY
    #
    # Reproducibility is a core goal of FASTGenomics, but it is difficult to achieve this without
    # your help. Docker helps to freeze the exact code your app is using, but code without
    # documentation is difficult to use, so an app is expected to have a documentation and provide
    # a summary of its results (as GitHub Flavored Markdown).
    # You need to store it as /fastgenomics/summary/summary.md - otherwise it would be ignored.
    #
    # Please provide:
    #   - an abstract about your application (without headings)
    #   - results section (h3)
    #   - methods section (h3)
    #   - parameters section (h3): List of *all* parameters used.
    #     DO NOT hard-code settings in your app but use parameters.
    #
    # In this example we use Jinja2 as template engine, use the template ./templates/summary.md.j2,
    # and pass over our results and parameters:
    #
    logger.debug("Loading Jinja2 summary template")
    with open(TEMPLATE_PATH / 'summary.md.j2') as temp:
        template_str = temp.read()

    logger.debug("Rendering template")
    template = jinja2.Template(template_str)
    summary = template.render(results=results, parameters=parameters, the_answer_to_everything=42)

    logger.info("Writing summary")
    summary_path = fg_io.get_summary_path()
    with summary_path.open('w') as f_sum:
        f_sum.write(summary)

    logger.info("Done.")
def test_can_read_input_file(local):
    """The path returned for the input key 'some_input' points to an existing file.

    ``local`` is a fixture defined outside this view; presumably it sets up a
    local FASTGenomics directory layout - confirm against the fixture definition.
    """
    input_path = fg_io.get_input_path("some_input")

    assert input_path.exists()
def test_cannot_read_undefined_input(local):
    """Requesting the path of an unknown input key raises ``ValueError``.

    ``local`` is a fixture defined outside this view; presumably it sets up a
    local FASTGenomics directory layout - confirm against the fixture definition.
    """
    unknown_key = "i_don't_exist"

    with pytest.raises(ValueError):
        fg_io.get_input_path(unknown_key)