def pe_functional_profiling_pipeline(params):
    """Run Fama functional profiling for FASTQ reads and package the results.

    Creates a uniquely named working directory under ``params['work_dir']``,
    runs the paired-end or single-end pipeline, exports the filtered reads,
    builds the HTML report, saves the trait matrix and functional profile, and
    bundles every report file that exists on disk into ``Fama_result.zip``.

    Args:
        params (dict): pipeline parameters. Keys read directly here are
            'work_dir', 'input_reads', 'reference', 'name2ref' and
            'is_paired_end' ("1" for paired-end, "0" for single-end). The
            whole dict is also passed to write_trait_matrix and
            write_functional_profile.

    Returns:
        dict: with keys 'krona_charts' (zip path -> (archive name, label)),
        'fwd_reads', 'rev_reads' (paired-end only), 'html_report',
        'trait_matrix_ref', 'functional_profile_ref' and 'report_files'
        (a one-element list describing Fama_result.zip).

    Raises:
        ValueError: if params['is_paired_end'] is neither "1" nor "0".
    """
    work_dir = os.path.join(params['work_dir'], str(uuid.uuid4()))
    os.mkdir(work_dir)
    config_file = write_config_file(work_dir)
    project_file = write_project_file(params['input_reads'],
                                      params['reference'], work_dir,
                                      params['is_paired_end'])
    project = Project(config_file=config_file, project_file=project_file)

    is_paired_end = params['is_paired_end'] == "1"
    if is_paired_end:
        project = fastq_pe_pipeline(project)
    elif params['is_paired_end'] == "0":
        project = fastq_se_pipeline(project)
    else:
        raise ValueError('Wrong values of is_paired_end parameter',
                         params['is_paired_end'])

    out_dir = os.path.join(work_dir, 'out')
    os.mkdir(out_dir)

    # export_reads
    output = {'krona_charts': {}}
    out_fwd_fastq = os.path.join(work_dir, 'out_fwd.fastq')
    # An empty reverse path is passed for single-end input.
    out_rev_fastq = os.path.join(work_dir, 'out_rev.fastq') if is_paired_end else ''

    # write filtered fastq
    write_filtered_fastq(out_fwd_fastq, out_rev_fastq, project)
    output['fwd_reads'] = out_fwd_fastq
    if is_paired_end:
        output['rev_reads'] = out_rev_fastq

    # Generate output
    out_report = os.path.join(out_dir, 'fama_report.html')
    generate_html_report(out_report, project, params['name2ref'])
    with zipfile.ZipFile(out_report + '.zip', 'w', zipfile.ZIP_DEFLATED,
                         allowZip64=True) as zip_file:
        zip_file.write(out_report, 'fama_report.html')
    output['html_report'] = out_report + '.zip'

    # TODO: Krona charts generate_functions_chart(parser_fwd)
    report_files = {}

    # The report files are named after the metric. If any sample has a zero
    # rpkg_scaling_factor, the raw-count metric is used instead of the
    # normalized one.
    use_raw_counts = any(
        project.samples[sample_id].rpkg_scaling_factor == 0.0
        for sample_id in project.list_samples())
    if is_paired_end:
        metric = 'fragmentcount' if use_raw_counts else 'efpkg'
    else:
        metric = 'readcount' if use_raw_counts else 'erpkg'

    # Create TraitMatrix object
    output['trait_matrix_ref'] = write_trait_matrix(project, params)
    output['functional_profile_ref'] = write_functional_profile(
        project, params, output['trait_matrix_ref'])

    report_files[out_report] = 'fama_report.html'

    # Project-level workbooks: (file name suffix, name inside the result zip).
    project_reports = (
        ('_functions.xlsx', 'Functional_profiles_combined.xlsx'),
        ('_functions_taxonomy.xlsx',
         'Function_taxonomy_profiles_combined.xlsx'),
    )
    for suffix, archive_name in project_reports:
        project_xlsx_report = sanitize_file_name(
            os.path.join(
                project.options.work_dir,
                project.options.project_name + '_' + metric + suffix))
        if os.path.exists(project_xlsx_report):
            report_files[project_xlsx_report] = archive_name
        else:
            print('Project XLSX file not found:', project_xlsx_report)

    for sample_id in project.list_samples():
        sample_xlsx_report = sanitize_file_name(
            os.path.join(project.options.work_dir,
                         sample_id + '_' + metric + '_functions_taxonomy.xlsx'))
        if os.path.exists(sample_xlsx_report):
            report_files[sample_xlsx_report] = sanitize_file_name(
                sample_id + ' function taxonomy profile long.xlsx')
        else:
            print('Sample XLSX file not found:', sample_xlsx_report)

        krona_file = sanitize_file_name(
            os.path.join(
                project.options.work_dir, sample_id + '_' + metric +
                '_functional_taxonomy_profile.xml.html'))
        if not os.path.exists(krona_file):
            print('Krona diagram file not found:', krona_file)
            continue

        # Copy the chart into out_dir and zip it on its own, in addition to
        # adding it to the combined result archive below.
        chart_name = sanitize_file_name(
            sample_id + '_function_taxonomy_profile_chart.html')
        krona_output = sanitize_file_name(
            os.path.join(out_dir,
                         sample_id + '_function_taxonomy_profile_chart.html'))
        shutil.copy2(krona_file, krona_output)
        with zipfile.ZipFile(krona_output + '.zip', 'w', zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            zip_file.write(krona_output, chart_name)
        report_files[krona_output] = chart_name
        output['krona_charts'][krona_output + '.zip'] = (
            chart_name, sample_id + ' function taxonomy chart')

    result_file = os.path.join(project.options.work_dir, 'Fama_result.zip')
    with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED,
                         allowZip64=True) as zip_file:
        for filename, archive_name in report_files.items():
            zip_file.write(filename, archive_name)

    output['report_files'] = [{
        'path': result_file,
        'name': os.path.basename(result_file),
        'label': os.path.basename(result_file),
        'description': 'Files generated by Fama App'
    }]
    return output
def protein_functional_profiling_pipeline(params):
    """Run Fama functional profiling for protein FASTA input and package the results.

    Args:
        params (dict): pipeline parameters. Keys read here are
            'input_proteins', 'work_dir', 'reference', 'ws_name',
            'ws_client', 'annotation_prefix' and 'name2ref' (indexed by
            sample identifier).

    Returns:
        dict: with keys 'krona_charts' (zip path -> (archive name, label)),
        'html_report', 'report_files' (a one-element list describing
        Fama_result.zip), 'project', 'feature_set_data' and
        'objects_created'.
    """
    run_dir = os.path.join(params['work_dir'], str(uuid.uuid4()))
    os.mkdir(run_dir)
    config_file = write_config_file(run_dir)
    project_file = write_project_file(params['input_proteins'],
                                      params['reference'], run_dir)

    # Run Fama
    project = protein_pipeline(
        Project(config_file=config_file, project_file=project_file))

    out_dir = os.path.join(run_dir, 'out')
    os.mkdir(out_dir)

    output = {'krona_charts': {}}

    # Generate output
    report_files = {}
    metric = 'proteincount'
    results_dir = project.options.work_dir

    # Project-level workbooks: (file name suffix, name inside the result zip).
    for suffix, archive_name in (
            ('_functions.xlsx', 'Functional_profiles_combined.xlsx'),
            ('_functions_taxonomy.xlsx',
             'Function_taxonomy_profiles_combined.xlsx')):
        workbook_path = sanitize_file_name(
            os.path.join(results_dir,
                         project.options.project_name + '_' + metric + suffix))
        if os.path.exists(workbook_path):
            report_files[workbook_path] = archive_name
        else:
            print('Project XLSX file not found:', workbook_path)

    proteins_list_path = sanitize_file_name(
        os.path.join(results_dir, 'all_proteins.list.txt'))
    if os.path.exists(proteins_list_path):
        report_files[proteins_list_path] = 'proteins_list.txt'
    else:
        print('Proteins list not found:', proteins_list_path)

    featureset_elements = {}
    featureset_element_ordering = []
    objects_created = []
    genome_names = {}

    # Get Domain Model Set reference
    dms_ref = get_dms(params['reference'],
                      project.config.get_functions_file(project.collection),
                      params['ws_name'], params['ws_client'])

    for sample_id in project.list_samples():
        genome_ref = params['name2ref'][sample_id]
        annotation_obj_ref, feature_ids, genome_name = save_domain_annotations(
            project, dms_ref, params['ws_name'], params['ws_client'],
            params['annotation_prefix'], sample_id, genome_ref)
        genome_names[sample_id] = genome_name
        objects_created.append({
            'ref': annotation_obj_ref,
            'description': 'Functional annotations for genome ' +
                           project.samples[sample_id].sample_name
        })

        # Each feature maps to the list of genome references it was found in.
        for feature_id in feature_ids:
            featureset_elements.setdefault(feature_id, []).append(genome_ref)
            featureset_element_ordering.append(feature_id)

        sample_workbook = sanitize_file_name(
            os.path.join(results_dir,
                         sample_id + '_' + metric + '_functions_taxonomy.xlsx'))
        if os.path.exists(sample_workbook):
            report_files[sample_workbook] = sanitize_file_name(
                genome_name + '_function_taxonomy_profile_long.xlsx')
        else:
            print('Sample XLSX file not found:', sample_workbook)

        krona_file = sanitize_file_name(
            os.path.join(
                results_dir, sample_id + '_' + metric +
                '_functional_taxonomy_profile.xml.html'))
        if not os.path.exists(krona_file):
            print('Krona diagram file not found:', krona_file)
            continue

        # The chart is copied into out_dir, zipped on its own, and also
        # queued for the combined result archive.
        chart_name = sanitize_file_name(
            genome_name + '_function_taxonomy_profile_chart.html')
        krona_output = sanitize_file_name(
            os.path.join(out_dir,
                         genome_name + '_function_taxonomy_profile_chart.html'))
        chart_zip = krona_output + '.zip'
        shutil.copy2(krona_file, krona_output)
        with zipfile.ZipFile(chart_zip, 'w', zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            zip_file.write(krona_output, chart_name)
        report_files[krona_output] = chart_name
        output['krona_charts'][chart_zip] = (
            chart_name, sample_id + ' function taxonomy chart')

    feature_set_data = {
        'description': 'FeatureSet generated by Fama protein profiling',
        'element_ordering': featureset_element_ordering,
        'elements': featureset_elements
    }

    out_report = os.path.join(out_dir, 'fama_report.html')
    generate_protein_html_report(out_report, project, params['name2ref'])
    with zipfile.ZipFile(out_report + '.zip', 'w', zipfile.ZIP_DEFLATED,
                         allowZip64=True) as zip_file:
        zip_file.write(out_report, 'fama_report.html')
    output['html_report'] = out_report + '.zip'
    report_files[out_report] = 'fama_report.html'

    result_file = os.path.join(results_dir, 'Fama_result.zip')
    with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED,
                         allowZip64=True) as zip_file:
        for source_path, archive_name in report_files.items():
            zip_file.write(source_path, archive_name)

    result_name = os.path.basename(result_file)
    output['report_files'] = [{
        'path': result_file,
        'name': result_name,
        'label': result_name,
        'description': 'Files generated by Fama App'
    }]
    output['project'] = project
    output['feature_set_data'] = feature_set_data
    output['objects_created'] = objects_created
    return output
def _write_taxonomy_profile_sheet(writer, project, sample_scores, metric, rank,
                                  sheet_name):
    """Build a taxonomy profile from sample_scores and write it as one worksheet."""
    tax_profile = TaxonomyProfile()
    tax_profile.make_function_taxonomy_profile(project.taxonomy_data,
                                               sample_scores)
    taxonomy_df = tax_profile.convert_profile_into_score_df(metric=metric)
    if rank is not None:
        # Keep only the rows of the requested taxonomic rank.
        taxonomy_df = taxonomy_df[taxonomy_df[('', 'Rank')] == rank]
    taxonomy_df.to_excel(writer, sheet_name=sheet_name, merge_cells=False)
    format_taxonomy_worksheet(writer, sheet_name)


def make_sample_tax_func_xlsx(project, scores, metric, function_id=None,
                              rank=None):
    """Generates XLSX file for taxa scores for one or all functions in all samples.

    One worksheet is written per function. When function_id is None an
    additional 'Average' worksheet is written.

    Args:
        project (:obj:'Project'): Project object that stores all annotated reads
        scores (dict[str, dict[str, dict[str, dict[str, float]]]]): outer key
            is taxonomy identifier, second-level key is function identifier,
            third-level key is sample identifier, inner key is metric,
            value is float
        metric (str): acceptable values are 'readcount', 'erpk', 'rpkm',
            'fragmentcount', 'fpk', 'efpk', 'fpkm', 'erpkm', 'efpkm',
            'fpkg', 'rpkg', 'erpkg', 'efpkg', 'proteincount'
        function_id (str, optional): function identifier. If function_id is
            None, all functions will be included into workbook.
        rank (str, optional): taxonomic rank. If rank parameter is not None,
            the resulting XLSX file will contain only entries for this rank.
    """
    name_prefix = (project.options.project_name
                   if function_id is None else function_id)
    if rank is None:
        name_suffix = '_samples_taxonomy.xlsx'
    else:
        name_suffix = '_samples_' + rank + '_taxonomy.xlsx'
    xlsxfile = sanitize_file_name(
        os.path.join(project.options.work_dir,
                     name_prefix + '_' + metric + name_suffix))

    print('Writing', xlsxfile)
    writer = pd.ExcelWriter(xlsxfile, engine='xlsxwriter')
    functions = sorted(project.ref_data.functions_dict.keys())
    samples = project.list_samples()

    for function in functions:
        if function_id is not None and function != function_id:
            continue
        # Subsetting scores
        sample_scores = autovivify(3, float)
        for taxonomy_id in scores:
            if function not in scores[taxonomy_id]:
                continue
            function_scores = scores[taxonomy_id][function]
            for sample in samples:
                if sample in function_scores:
                    for key, val in function_scores[sample].items():
                        sample_scores[taxonomy_id][sample][key] = val
                else:
                    # Samples without a score still get a zero entry.
                    sample_scores[taxonomy_id][sample][metric] = 0.0
        _write_taxonomy_profile_sheet(writer, project, sample_scores, metric,
                                      rank, function)

    # Make 'Average' sheet
    if function_id is None:
        sample_scores = autovivify(3, float)
        for taxonomy_id in scores:
            for function in functions:
                if function not in scores[taxonomy_id]:
                    continue
                function_scores = scores[taxonomy_id][function]
                for sample in samples:
                    if sample in function_scores:
                        for key, val in function_scores[sample].items():
                            sample_scores[taxonomy_id][sample][key] += val
                    else:
                        sample_scores[taxonomy_id][sample][metric] += 0.0
        # NOTE: only the requested metric is divided by the number of
        # functions; every other key stays a plain sum across functions.
        function_count = len(project.ref_data.functions_dict)
        for taxonomy_id in sample_scores:
            for sample in sample_scores[taxonomy_id]:
                sample_scores[taxonomy_id][sample][metric] /= function_count
        _write_taxonomy_profile_sheet(writer, project, sample_scores, metric,
                                      rank, 'Average')

    # ExcelWriter.save() was removed in pandas 2.0; close() writes the file
    # on both old and new pandas versions.
    writer.close()
def _assembly_write_cells(worksheet, row, first_col, cells):
    """Write (value, cell_format) pairs left to right; return the next free column.

    A ``None`` format means the cell is written without an explicit format.
    """
    col = first_col
    for value, cell_format in cells:
        if cell_format is None:
            worksheet.write(row, col, value)
        else:
            worksheet.write(row, col, value, cell_format)
        col += 1
    return col


def _assembly_sample_header_cells(samples_list, bold):
    """Return the two header rows for the three-column per-sample groups."""
    top_row = [(sample, bold) for sample in samples_list for _ in range(3)]
    second_row = [(title, bold)
                  for _ in samples_list
                  for title in ('Read count', 'RPKM', 'Coverage')]
    return top_row, second_row


def _assembly_write_reads_sheet(workbook, assembler, functions, samples_list,
                                function_read_counts, bold, cell_numformat0):
    """Write the 'Functions read count' worksheet."""
    worksheet = workbook.add_worksheet('Functions read count')
    titles = (['Function'] + list(samples_list) +
              ['All samples', 'Assembled reads', 'Unassembled reads',
               'Definition'])
    definition_col = _assembly_write_cells(
        worksheet, 0, 0, [(title, bold) for title in titles]) - 1

    contigs = assembler.assembly.contigs
    for row, function in enumerate(functions, start=1):
        counts = function_read_counts[function]
        # Stored counts are doubled before being reported, presumably because
        # one entry stands for a read pair -- TODO confirm.
        all_reads = sum(counts.values()) * 2
        assembled_reads = 0
        if function in contigs:
            assembled_reads = sum(
                len(contig.reads) for contig in contigs[function].values())
        cells = [(function, bold)]
        cells += [(counts[sample] * 2 if sample in counts else 0,
                   cell_numformat0) for sample in samples_list]
        cells += [
            (all_reads, cell_numformat0),
            (assembled_reads, cell_numformat0),
            (all_reads - assembled_reads, cell_numformat0),
            (assembler.project.ref_data.lookup_function_name(function), None),
        ]
        _assembly_write_cells(worksheet, row, 0, cells)

    # adjust column width
    worksheet.set_column(0, 0, 10)
    worksheet.set_column(definition_col, definition_col, 50)


def _assembly_write_contigs_sheet(workbook, assembler, functions, samples_list,
                                  total_read_count, bold, cell_numformat1,
                                  cell_numformat5):
    """Write the 'Contigs' worksheet: one row per contig."""
    worksheet = workbook.add_worksheet('Contigs')
    fixed_titles = ['Contig', 'Function', 'Length', 'Read count', 'RPKM',
                    'Coverage', 'Number of genes']
    top_row, second_row = _assembly_sample_header_cells(samples_list, bold)
    header = ([(title, bold) for title in fixed_titles] + top_row +
              [('Definition', bold)])
    definition_col = _assembly_write_cells(worksheet, 0, 0, header) - 1
    _assembly_write_cells(worksheet, 1, len(fixed_titles), second_row)

    project = assembler.project
    contigs = assembler.assembly.contigs
    row = 1
    for function in functions:
        if function not in contigs:
            continue
        for contig_id in sorted(contigs[function].keys()):
            contig = contigs[function][contig_id]
            row += 1
            cells = [
                (contig_id, bold),
                (function, None),
                (len(contig.sequence), None),
                (contig.get_read_count(), None),
                (contig.get_rpkm(total_read_count), cell_numformat5),
                (contig.get_coverage(), cell_numformat1),
                (len(contig.genes), None),
            ]
            for sample in samples_list:
                cells += [
                    (contig.get_read_count(sample), None),
                    (contig.get_rpkm(
                        project.options.get_fastq1_readcount(sample), sample),
                     cell_numformat5),
                    (contig.get_coverage(sample), cell_numformat1),
                ]
            cells.append(
                (project.ref_data.lookup_function_name(function), None))
            _assembly_write_cells(worksheet, row, 0, cells)

    # adjust column width
    worksheet.set_column(0, 1, 10)
    worksheet.set_column(definition_col, definition_col, 50)


def _assembly_gene_annotation_cells(gene, project, cell_numformat1):
    """Return the ten Fama annotation cells of a gene ('N/A' unless status is good)."""
    if gene.status != STATUS_GOOD:
        return [('N/A', None)] * 10

    hits = gene.hit_list.hits
    # Sorted so that the joined function list is deterministic.
    gene_functions = sorted({func for hit in hits for func in hit.functions})
    identities = [hit.identity for hit in hits]
    ref_lengths = [hit.s_len for hit in hits]
    hit_ids = [cleanup_protein_id(hit.subject_id) for hit in hits]
    tax_ids = [project.ref_data.lookup_protein_tax(hit_id)
               for hit_id in hit_ids]
    organisms = [project.taxonomy_data.get_name(tax_id) for tax_id in tax_ids]
    lineages = [project.taxonomy_data.get_taxonomy_lineage(tax_id)
                for tax_id in tax_ids]
    lca_taxonomy_id = gene.taxonomy
    return [
        # Fama predicted functions
        (','.join(gene_functions), None),
        # Fama identity (mean over hits)
        (sum(identities) / len(identities), cell_numformat1),
        # CDS completeness: protein length as a percentage of the mean
        # reference length
        (len(gene.protein_sequence) * 100 * len(ref_lengths) /
         sum(ref_lengths), cell_numformat1),
        # Fama best hits
        (','.join(hit_ids), None),
        # Fama best hit taxonomy IDs, organisms and lineages
        (','.join(tax_ids), None),
        (','.join(organisms), None),
        ('|'.join(lineages), None),
        # Fama LCA taxonomy ID, organism and lineage
        (lca_taxonomy_id, None),
        (project.taxonomy_data.get_name(lca_taxonomy_id), None),
        (project.taxonomy_data.get_taxonomy_lineage(lca_taxonomy_id), None),
    ]


def _assembly_write_genes_sheet(workbook, assembler, functions, samples_list,
                                total_read_count, bold, cell_numformat1,
                                cell_numformat5):
    """Write the 'Genes' worksheet: one row per gene of every contig."""
    worksheet = workbook.add_worksheet('Genes')
    fixed_titles = [
        'Gene', 'Reads function', 'Contig', 'Gene start', 'Gene end',
        'Gene length', 'Gene strand', 'Read count', 'RPKM', 'Coverage',
        'Fama gene status', 'Fama function', 'Fama identity',
        'CDS completeness', 'Fama best hit', 'Fama best hit taxonomy ID',
        'Fama best hit organism', 'Fama best hit taxonomy',
        'Fama LCA taxonomy ID', 'Fama LCA organism', 'Fama LCA taxonomy',
    ]
    top_row, second_row = _assembly_sample_header_cells(samples_list, bold)
    header = ([(title, bold) for title in fixed_titles] + top_row +
              [('Definition', bold)])
    definition_col = _assembly_write_cells(worksheet, 0, 0, header) - 1
    _assembly_write_cells(worksheet, 1, len(fixed_titles), second_row)

    project = assembler.project
    contigs = assembler.assembly.contigs
    row = 1
    for function in functions:
        if function not in contigs:
            continue
        for contig_id in sorted(contigs[function].keys()):
            contig = contigs[function][contig_id]
            for gene_id in sorted(contig.genes.keys()):
                gene = contig.genes[gene_id]
                row += 1
                gene_length = int(gene.end) - int(gene.start) + 1
                cells = [
                    (gene_id, None),
                    # Gene function from read mapping
                    (function, None),
                    (contig_id, None),
                    (int(gene.start), None),
                    (int(gene.end), None),
                    (gene_length, None),
                    (gene.strand, None),
                    # Read count of the contig, scaled by gene length
                    (contig.get_read_count() * gene_length /
                     len(contig.sequence), cell_numformat1),
                    (contig.get_rpkm(total_read_count), cell_numformat5),
                    (contig.get_coverage(), cell_numformat1),
                    (gene.status, None),
                ]
                # Always exactly ten cells, so the per-sample columns stay
                # aligned with the header whatever the gene status is.
                cells += _assembly_gene_annotation_cells(
                    gene, project, cell_numformat1)
                for sample in samples_list:
                    cells += [
                        (contig.get_read_count(sample) *
                         len(gene.protein_sequence) * 3 /
                         len(contig.sequence), cell_numformat1),
                        (contig.get_rpkm(
                            project.options.get_fastq1_readcount(sample),
                            sample), cell_numformat5),
                        (contig.get_coverage(sample), cell_numformat1),
                    ]
                cells.append(
                    (project.ref_data.lookup_function_name(function), None))
                _assembly_write_cells(worksheet, row, 0, cells)

    # adjust column width
    worksheet.set_column(0, 0, 20)
    worksheet.set_column(1, 1, 10)
    worksheet.set_column(7, 9, 15)
    worksheet.set_column(definition_col, definition_col, 50)


def make_assembly_xlsx(assembler):
    """Generates XLSX file for assembly.

    The workbook is written to ``<assembly_dir>/out/<project_name>_assembly.xlsx``
    (after file name sanitizing) and contains three worksheets:
    'Functions read count', 'Contigs' and 'Genes'.

    Compared with the earlier implementation, per-sample values and the
    definition of genes whose status is not STATUS_GOOD are written under
    their own headers (they used to be shifted one column to the right), and
    the wide 'Definition' column is sized correctly even when a sheet has no
    data rows.

    Args:
        assembler (:obj:'GeneAssembler'): gene assembler object
    """
    project = assembler.project
    xlsxfile = sanitize_file_name(
        os.path.join(project.options.assembly_dir, 'out',
                     project.options.project_name + '_assembly.xlsx'))
    xlsxfile = xlsxfile.replace(' ', '_')
    xlsxfile = xlsxfile.replace("'", "")
    xlsxfile = xlsxfile.replace('"', '')

    workbook = xlsxwriter.Workbook(xlsxfile)
    bold = workbook.add_format({'bold': True})
    cell_numformat0 = workbook.add_format()
    cell_numformat0.set_num_format('0')
    cell_numformat1 = workbook.add_format()
    cell_numformat1.set_num_format('0.0')
    cell_numformat5 = workbook.add_format()
    cell_numformat5.set_num_format('0.00000')

    samples_list = sorted(project.list_samples())

    # count reads per function, per sample:
    # function_read_counts[function][sample]
    functions_list = set()
    function_read_counts = autovivify(2, float)
    reads = assembler.assembly.reads
    for function in reads:
        functions_list.add(function)
        for read in reads[function]:
            # The value stored for a read is used as the sample key.
            function_read_counts[function][reads[function][read]] += 1
    functions = sorted(functions_list)

    # calculate total read count
    total_read_count = 0
    for sample in samples_list:
        total_read_count += project.options.get_fastq1_readcount(sample)
        total_read_count += project.options.get_fastq2_readcount(sample)

    # generate output
    _assembly_write_reads_sheet(workbook, assembler, functions, samples_list,
                                function_read_counts, bold, cell_numformat0)
    _assembly_write_contigs_sheet(workbook, assembler, functions, samples_list,
                                  total_read_count, bold, cell_numformat1,
                                  cell_numformat5)
    _assembly_write_genes_sheet(workbook, assembler, functions, samples_list,
                                total_read_count, bold, cell_numformat1,
                                cell_numformat5)
    workbook.close()
def make_function_sample_xlsx(project, scores, metric, sample_id=None):
    """Generates XLSX file for function scores for one or more samples.

    The workbook has two worksheets: 'Functions <metric>' with one row per
    function, and 'Categories <metric>' with the per-category sums of the
    same scores.

    Args:
        project (:obj:'Project'): Project object that stores all annotated reads
        scores (dict[str, dict[str, dict[str, float]]]): outer key is function
            identifier, middle-level key is sample identifier,
            inner key is metric, value is float
        metric (str): acceptable values are 'readcount', 'erpk', 'rpkm',
            'fragmentcount', 'fpk', 'efpk', 'fpkm', 'erpkm', 'efpkm',
            'fpkg', 'rpkg', 'erpkg', 'efpkg', 'proteincount'
        sample_id (str, optional): sample identifier. If None, all samples
            of the project are included and the file is named after the
            project.
    """
    file_stem = project.options.project_name if sample_id is None else sample_id
    xlsxfile = sanitize_file_name(
        os.path.join(project.options.work_dir,
                     file_stem + '_' + metric + '_functions.xlsx'))

    print('Writing', xlsxfile)
    workbook = xlsxwriter.Workbook(xlsxfile)
    bold = workbook.add_format({'bold': True})

    functions_dict = project.ref_data.functions_dict
    functions_list = sorted(functions_dict.keys())
    categories_list = sorted(
        {functions_dict[name]['group'] for name in functions_dict.keys()})

    # Samples that get a column: all of them, or only the requested one.
    shown_samples = [
        sample for sample in project.list_samples()
        if sample_id is None or sample == sample_id
    ]
    definition_col = len(shown_samples) + 1

    # Per-category totals, accumulated while the function rows are written.
    scores_cat = autovivify(2, float)

    # generate tables for functions
    scores_worksheet = workbook.add_worksheet('Functions ' + metric)
    scores_worksheet.write(0, 0, 'Function', bold)
    for col, sample in enumerate(shown_samples, start=1):
        scores_worksheet.write(0, col, sample, bold)
    scores_worksheet.write(0, definition_col, 'Definition', bold)

    for row, function in enumerate(functions_list, start=1):
        category = project.ref_data.lookup_function_group(function)
        scores_worksheet.write(row, 0, function, bold)
        for col, sample in enumerate(shown_samples, start=1):
            if function not in scores or sample not in scores[function]:
                scores_worksheet.write(row, col, 0.0)
                continue
            value = scores[function][sample][metric]
            scores_worksheet.write(row, col, value)
            scores_cat[category][sample] += value
        scores_worksheet.write(
            row, definition_col,
            project.ref_data.lookup_function_name(function))

    # adjust column width
    scores_worksheet.set_column(0, 0, 10)
    scores_worksheet.set_column(definition_col, definition_col, 50)

    # Write worksheet for categories
    scores_cat_worksheet = workbook.add_worksheet('Categories ' + metric)
    scores_cat_worksheet.write(0, 0, 'Categories', bold)
    for col, sample in enumerate(shown_samples, start=1):
        scores_cat_worksheet.write(0, col, sample, bold)

    for row, category in enumerate(categories_list, start=1):
        scores_cat_worksheet.write(row, 0, category, bold)
        for col, sample in enumerate(shown_samples, start=1):
            if category not in scores_cat or sample not in scores_cat[category]:
                scores_cat_worksheet.write(row, col, 0.0)
            else:
                scores_cat_worksheet.write(row, col,
                                           scores_cat[category][sample])

    # adjust column width
    scores_cat_worksheet.set_column(0, 0, 50)
    workbook.close()