def compose_function_groups(project, sample_id, tab_index, metric=None):
    """Makes an HTML table of aggregated scores per functional category
    for one sample, wrapped in a tab 'div' element.

    Args:
        project (:obj:Project): project object storing annotated reads
        sample_id (str): sample identifier
        tab_index (str): index of the HTML tab the table belongs to
        metric (str, optional): scoring metric. If None, the best metric
            available for the sample is chosen automatically.

    Returns:
        str: HTML code of the table
    """
    result = ['<div id="tab' + tab_index + '" class="tabcontent">']
    if metric is None:
        sample_obj = project.samples[sample_id]
        # Choose the most informative metric available: normalized by genome
        # equivalents (rpkg scaling) if possible, else per-million-reads
        # (rpkm scaling), else raw counts.
        if sample_obj.is_paired_end:
            if sample_obj.rpkg_scaling_factor is not None and sample_obj.rpkg_scaling_factor != 0.0:
                metric = 'efpkg'
            elif sample_obj.rpkm_scaling_factor is not None and sample_obj.rpkm_scaling_factor != 0.0:
                metric = 'efpkm'
            else:
                metric = 'fragmentcount'
        else:
            if sample_obj.rpkg_scaling_factor is not None and sample_obj.rpkg_scaling_factor != 0.0:
                metric = 'erpkg'
            elif sample_obj.rpkm_scaling_factor is not None and sample_obj.rpkm_scaling_factor != 0.0:
                metric = 'erpkm'
            else:
                metric = 'readcount'
    scores = get_function_scores(project, sample_id=sample_id, metric=metric)
    categories = set()
    for function in scores:
        categories.add(project.ref_data.lookup_function_group(function))
    result.append('<table><thead><tr>')
    result.append('<th>Function category</th><th>' + metric + '</th>')
    if metric not in ('readcount', 'proteincount'):
        result.append('<th>Raw sequence count</th>')
    result.append('<th>Amino acid identity %, average</th>')
    # BUGFIX: was '</thead></tr>', which closes the table head before the
    # row it contains, producing malformed HTML.
    result.append('</tr></thead>')
    for category in sorted(list(categories)):
        # Aggregate scores of all functions belonging to this category
        category_data = autovivify(2, float)
        for function in scores:
            for sample in scores[function]:
                if project.ref_data.lookup_function_group(function) != category:
                    continue
                category_data[sample][metric] += scores[function][sample][metric]
                category_data[sample]['count'] += scores[function][sample]['count']
                category_data[sample]['identity'] += scores[function][sample]['identity']
                category_data[sample]['hit_count'] += scores[function][sample]['hit_count']
        result.append('<tr><td>' + category + '</td>')
        if metric in ('readcount', 'fragmentcount', 'proteincount'):
            result.append('<td>' + str(int(category_data[sample_id][metric])) + '</td>')
        else:
            result.append('<td>' + '{0:.5f}'.format(category_data[sample_id][metric])
                          + '</td>')
        if metric not in ('readcount', 'proteincount'):
            result.append('<td>' + '{0:.0f}'.format(category_data[sample_id]['count'])
                          + '</td>')
        # Guard against ZeroDivisionError for a category that collected no hits
        if category_data[sample_id]['hit_count'] > 0.0:
            result.append('<td>' + '{0:.2f}'.format(
                category_data[sample_id]['identity']
                / category_data[sample_id]['hit_count']) + '</td></tr>')
        else:
            result.append('<td></td></tr>')
    result.append('</table>')
    result.append('</div>')
    return '\n'.join(result)
def compose_taxonomy_profile(project, sample_id, tab_index, metric=None):
    """Makes an HTML taxonomy profile for one sample, wrapped in a tab
    'div' element.

    Args:
        project (:obj:Project): project object storing annotated reads
        sample_id (str): sample identifier
        tab_index (str): index of the HTML tab the profile belongs to
        metric (str, optional): scoring metric. If None, the best metric
            available for the sample is chosen automatically.

    Returns:
        str: HTML code of the taxonomy profile
    """
    if metric is None:
        sample_obj = project.samples[sample_id]
        # Availability of normalization factors decides the metric.
        rpkg_available = (sample_obj.rpkg_scaling_factor != 0.0
                          and sample_obj.rpkg_scaling_factor is not None)
        rpkm_available = (sample_obj.rpkm_scaling_factor != 0.0
                          and sample_obj.rpkm_scaling_factor is not None)
        if sample_obj.is_paired_end:
            metric = ('efpkg' if rpkg_available
                      else 'efpkm' if rpkm_available else 'fragmentcount')
        else:
            metric = ('erpkg' if rpkg_available
                      else 'erpkm' if rpkm_available else 'readcount')
    scores = get_function_taxonomy_scores(project, sample_id=sample_id, metric=metric)
    # Keep only this sample's data points, re-keyed as [taxonomy][function]
    sample_scores = autovivify(3, float)
    for taxonomy_id, function_data in scores.items():
        for function_id, per_sample in function_data.items():
            if sample_id in per_sample:
                for key, val in per_sample[sample_id].items():
                    sample_scores[taxonomy_id][function_id][key] = val
    tax_profile = TaxonomyProfile()
    tax_profile.make_function_taxonomy_profile(project.taxonomy_data, sample_scores)
    taxonomy_df = tax_profile.convert_profile_into_score_df(metric=metric)
    # Zeros render as blanks in the HTML table
    taxonomy_df.replace(0.0, np.nan, inplace=True)
    parts = ['<div id="tab' + tab_index + '" class="tabcontent">\n']
    if metric in ('readcount', 'fragmentcount', 'proteincount'):
        parts.append(taxonomy_df.to_html(na_rep="", float_format='%.0f'))
    else:
        parts.append(taxonomy_df.to_html(na_rep=""))
    parts.append('\n</div>\n')
    return ''.join(parts)
def get_lca_dataseries_tax_xml(tax_profile, dataseries, taxid, offset, metric='efpkg'):
    """Returns XML node for a phylogenetic tree node and all its children.

    Creates additional child node for a fictional "Unclassified..." taxon
    if not all reads of the current node were mapped to children nodes.

    Args:
        tax_profile (:obj:TaxonomyProfile): taxonomy profile
        dataseries (list of str): either sample identifiers or function
            identifiers, depending on profile type (functional or taxonomic)
        taxid (str): taxonomy identifier of a node of interest
        offset (int): number of starting tabs
        metric (str): scoring metric (default value 'efpkg')

    Returns:
        ret_val (str): XML node
        attribute_values (defaultdict[str,dict[str,float]]): outer key is one
            of dataseries members, inner key is in [metric, 'count',
            'identity', 'hit_count'], value is float.
    """
    attribute_values = autovivify(2, float)
    if taxid not in tax_profile.tree.data:
        raise KeyError(taxid, 'not found in the tree!!!')
    # Hoist the repeated tree lookup; the node object is read many times below
    node = tax_profile.tree.data[taxid]
    ret_val = '\t' * offset + '<node name="' + node.name + '">\n'
    offset += 1
    if node.attributes:
        # A separate raw read count element is emitted only when the metric
        # itself is not a raw count
        if metric != 'readcount' and metric != 'proteincount':
            ret_val += '\t' * offset + '<readcount>'
            for datapoint in dataseries:
                if (datapoint in node.attributes) and (
                        'count' in node.attributes[datapoint]):
                    ret_val += '<val>' + format(
                        node.attributes[datapoint]['count'], "0.0f") + '</val>'
                else:
                    ret_val += '<val>0</val>'
            ret_val += '</readcount>\n'
        ret_val += '\t' * offset + '<' + metric + '>'
        for datapoint in dataseries:
            if datapoint in node.attributes and (
                    metric in node.attributes[datapoint]):
                ret_val += '<val>' + format(
                    node.attributes[datapoint][metric], "0.6f") + '</val>'
            else:
                ret_val += '<val>0.0</val>'
        ret_val += '</' + metric + '>\n' + '\t' * offset + '<identity>'
        for datapoint in dataseries:
            if datapoint in node.attributes and (
                    'identity' in node.attributes[datapoint]):
                ret_val += '<val>' + format(
                    (node.attributes[datapoint]['identity']
                     / node.attributes[datapoint]['hit_count']),
                    "0.1f") + '</val>'
            else:
                ret_val += '<val>0.0</val>'
        ret_val += '</identity>\n'
    else:
        # Node carries no data: emit zeroes for every data series
        if metric != 'readcount' and metric != 'proteincount':
            ret_val += '\t' * offset + '<readcount>'
            ret_val += '<val>0</val>' * len(dataseries)
            ret_val += '</readcount>\n'
        ret_val += '\t' * offset + '<' + metric + '>'
        ret_val += '<val>0.0</val>' * len(dataseries)
        # BUGFIX: closing tag was emitted as '<' + metric + '>' (missing
        # slash), producing malformed XML for attribute-less nodes
        ret_val += '</' + metric + '>\n' + '\t' * offset + '<identity>'
        ret_val += '<val>0.0</val>' * len(dataseries)
        ret_val += '</identity>\n'
    if node.children:
        for child_taxid in node.children:
            child_node, child_values = get_lca_dataseries_tax_xml(
                tax_profile, dataseries, child_taxid, offset, metric=metric)
            ret_val += child_node
            # Accumulate children totals to detect unassigned remainder
            for datapoint in child_values.keys():
                for key, val in child_values[datapoint].items():
                    attribute_values[datapoint][key] += val
        # Add a child node for unidentified child taxon, if needed
        unidentified_flag = False
        for datapoint in dataseries:
            if datapoint in node.attributes:
                if attribute_values[datapoint]['count'] < node.attributes[
                        datapoint]['count']:
                    unidentified_flag = True
                    break
        if unidentified_flag:
            # offset == 2 means the parent is the root node
            if offset == 2:
                ret_val += '\t' * offset + '<node name="Unclassified">\n'
            else:
                ret_val += '\t' * offset + '<node name="Unclassified '\
                    + node.name + '">\n'
            offset += 1
            if metric != 'readcount' and metric != 'proteincount':
                ret_val += '\t' * offset + '<readcount>'
                for datapoint in dataseries:
                    if datapoint in node.attributes and (
                            attribute_values[datapoint]['count']
                            < node.attributes[datapoint]['count']):
                        ret_val += '<val>' + format(
                            (node.attributes[datapoint]['count']
                             - attribute_values[datapoint]['count']),
                            "0.0f") + '</val>'
                    else:
                        ret_val += '<val>0</val>'
                ret_val += '</readcount>\n'
            ret_val += '\t' * offset + '<' + metric + '>'
            for datapoint in dataseries:
                if datapoint in node.attributes and (
                        attribute_values[datapoint]['count']
                        < node.attributes[datapoint]['count']):
                    ret_val += '<val>' + format(
                        (node.attributes[datapoint][metric]
                         - attribute_values[datapoint][metric]),
                        "0.6f") + '</val>'
                else:
                    ret_val += '<val>0.0</val>'
            ret_val += '</' + metric + '>\n'
            ret_val += '\t' * offset + '<identity>'
            for datapoint in dataseries:
                if datapoint in node.attributes and (
                        'hit_count' in node.attributes[datapoint]) and (
                            attribute_values[datapoint]['hit_count']
                            < node.attributes[datapoint]['hit_count']):
                    ret_val += '<val>' + format(
                        ((node.attributes[datapoint]['identity']
                          - attribute_values[datapoint]['identity'])
                         / (node.attributes[datapoint]['hit_count']
                            - attribute_values[datapoint]['hit_count'])),
                        "0.1f") + '</val>'
                else:
                    ret_val += '<val>0.0</val>'
            ret_val += '</identity>\n'
            offset -= 1
            ret_val += '\t' * offset + '</node>\n'
    offset -= 1
    ret_val += '\t' * offset + '</node>\n'
    # Report this node's own attribute values (not children sums) upward
    attribute_values = autovivify(1)
    for datapoint in dataseries:
        if datapoint in node.attributes:
            if metric in node.attributes[datapoint]:
                attribute_values[datapoint][metric] = \
                    node.attributes[datapoint][metric]
            if 'count' in node.attributes[datapoint]:
                attribute_values[datapoint]['count'] = \
                    node.attributes[datapoint]['count']
            if 'identity' in node.attributes[datapoint]:
                attribute_values[datapoint]['identity'] = \
                    node.attributes[datapoint]['identity']
            if 'hit_count' in node.attributes[datapoint]:
                attribute_values[datapoint]['hit_count'] = \
                    node.attributes[datapoint]['hit_count']
    return ret_val, attribute_values
def convert_node_into_values_dict(self, taxid, function_list, line_number, metric='efpkg'):
    """Returns node of functional-taxonomic profile for conversion into DataFrame.

    Recursively called for all children of the node.

    Args:
        taxid (str): taxonomy identifier of node
        function_list (list of str): function identifiers to be included
            to the table
        line_number (int): sequential number of node printed
        metric (str): score metric (default value 'efpkg') to be reported

    Returns:
        ret_val (dict[str,tuple(str,str)]): key is line number, value is
            a tuple with function identifier or empty string as first
            element and field name as second element. Field names are
            'Rank', 'Taxon name', metric.
        attribute_values (defaultdict[str,dict[str,float]]): outer key is
            function identifier, inner key is metric, value is float.
    """
    # NOTE: the original docstring listed the return values in the wrong
    # order; the code returns (ret_val, attribute_values).
    # Collect all attributes for reporting to the upper level
    attribute_values = defaultdict(dict)
    for function in function_list:
        if function in self.tree.data[taxid].attributes:
            attribute_values[function][metric] = 0.0
            if metric in self.tree.data[taxid].attributes[function]:
                attribute_values[function][metric] = \
                    self.tree.data[taxid].attributes[function][metric]
    ret_val = defaultdict(dict)
    children_values = autovivify(2, float)
    if taxid in self.tree.data:
        ret_val[line_number][('', 'Rank')] = self.tree.data[taxid].rank
        ret_val[line_number][('', 'Taxon name')] = self.tree.data[taxid].name
        for function in function_list:
            ret_val[line_number][(function, metric)] = 0.0
            if function in self.tree.data[taxid].attributes:
                # BUGFIX: the metric key may be absent even when the function
                # key exists (the collection loop above checks for that);
                # use .get to avoid a latent KeyError.
                ret_val[line_number][(function, metric)] = \
                    self.tree.data[taxid].attributes[function].get(metric, 0.0)
        line_number += 1
        if self.tree.data[taxid].children:
            for child_id in sorted(self.tree.data[taxid].children):
                child_lines, child_values = \
                    self.convert_node_into_values_dict(child_id, function_list,
                                                       line_number, metric)
                for child_line_number, child_line in child_lines.items():
                    ret_val[child_line_number] = child_line
                line_number += len(child_lines)
                for datapoint in child_values.keys():
                    for key, val in child_values[datapoint].items():
                        children_values[datapoint][key] += val
            # Add a child node for unidentified child taxon, if needed
            unidentified_flag = False
            for function in function_list:
                # BUGFIX: .get guards the same latent KeyError as above
                if function in self.tree.data[taxid].attributes and (
                        children_values[function][metric] < self.tree.data[
                            taxid].attributes[function].get(metric, 0.0)):
                    unidentified_flag = True
                    break
            if unidentified_flag and self.tree.data[taxid].rank in LOWER_RANKS:
                ret_val[line_number][('', 'Rank')] = \
                    LOWER_RANKS[self.tree.data[taxid].rank]
                ret_val[line_number][('', 'Taxon name')] = 'Unclassified ' \
                    + self.tree.data[taxid].name
                if taxid == '1':
                    # Root node: fictional child is just 'Unclassified'
                    ret_val[line_number][('', 'Taxon name')] = 'Unclassified'
                for function in function_list:
                    ret_val[line_number][(function, metric)] = 0.0
                    if function in self.tree.data[taxid].attributes and (
                            children_values[function][metric] < self.tree.data[
                                taxid].attributes[function].get(metric, 0.0)):
                        # Score of the fictional node is the remainder not
                        # accounted for by the real children
                        ret_val[line_number][(function, metric)] = \
                            self.tree.data[taxid].attributes[function][metric] \
                            - children_values[function][metric]
                line_number += 1
    else:
        print('Node not found:', taxid)
    return ret_val, attribute_values
def convert_node_into_dict(self, taxid, function_list, line_number, metric='rpkm'):
    """Returns node of functional-taxonomic profile for conversion into DataFrame.

    Recursively called for all children of the node.

    Args:
        taxid (str): taxonomy identifier of node
        function_list (list of str): function identifiers to be included
            to the table
        line_number (int): sequential number of node printed
        metric (str): score metric (default value 'rpkm') to be reported

    Returns:
        ret_val (dict[str,dict[tuple(str,str),float]]): outer key is line
            number, inner key is a tuple with function identifier or empty
            string as first element and field name as second element, value
            is a float. Field names are 'Rank', 'Taxon name', '1.Score',
            '2.Identity', '3.Raw count'. For each function, only the latter
            three fields are reported.
        attribute_values (defaultdict[str,dict[str,float]]): outer key is
            function identifier, inner key may be metric, 'count',
            'identity', 'hit_count', value is float.
    """
    # Collect values of all required attributes for reporting to the upper level
    attribute_values = defaultdict(dict)
    ret_val = defaultdict(dict)
    if taxid not in self.tree.data:
        # Unknown taxid: nothing to report
        return ret_val, attribute_values
    for function in function_list:
        # Initialize all attributes to zero, then overwrite with values
        # actually present for this node
        for attribute_name in ['count', 'identity', 'hit_count', metric]:
            attribute_values[function][attribute_name] = 0.0
        if function in self.tree.data[taxid].attributes:
            if metric in self.tree.data[taxid].attributes[function]:
                attribute_values[function][metric] = \
                    self.tree.data[taxid].attributes[function][metric]
            if 'count' in self.tree.data[taxid].attributes[function]:
                attribute_values[function]['count'] = \
                    self.tree.data[taxid].attributes[function]['count']
            if 'identity' in self.tree.data[taxid].attributes[function]:
                # 'identity' is a sum over hits; 'hit_count' accompanies it
                # so that an average can be computed
                attribute_values[function]['identity'] = \
                    self.tree.data[taxid].attributes[function]['identity']
                attribute_values[function]['hit_count'] = \
                    self.tree.data[taxid].attributes[function]['hit_count']
    children_values = autovivify(2, float)
    # Line for this node itself
    ret_val[line_number][('', 'Rank')] = self.tree.data[taxid].rank
    ret_val[line_number][('', 'Taxon name')] = self.tree.data[taxid].name
    for function in function_list:
        for field_name in ['1.Score', '2.Identity', '3.Raw count']:
            ret_val[line_number][(function, field_name)] = 0.0
        if function in self.tree.data[taxid].attributes:
            ret_val[line_number][(function, '1.Score')] = \
                self.tree.data[taxid].attributes[function][metric]
            ret_val[line_number][(function, '3.Raw count')] = \
                self.tree.data[taxid].attributes[function]['count']
            if 'identity' in self.tree.data[taxid].attributes[function]:
                # Report average identity over all hits
                ret_val[line_number][(function, '2.Identity')] = \
                    self.tree.data[taxid].attributes[function]['identity'] \
                    / self.tree.data[taxid].attributes[function]['hit_count']
    line_number += 1
    # If node has children, call convert_node_into_dict recursively
    if self.tree.data[taxid].children:
        for child_id in sorted(self.tree.data[taxid].children):
            child_lines, child_attribute_values = \
                self.convert_node_into_dict(child_id, function_list,
                                            line_number, metric)
            for child_line_number, child_line in child_lines.items():
                ret_val[child_line_number] = child_line
            line_number += len(child_lines)
            # Accumulate children attribute totals to detect reads that were
            # not assigned to any child
            for child_function, child_attrib in child_attribute_values.items():
                for key, val in child_attrib.items():
                    children_values[child_function][key] += val
        # If read count for at least one function is greater than sum of read
        # counts from all children, some reads map to unidentified
        # taxon. Add a child node for fictional unidentified taxon.
        unidentified_flag = False
        for function in function_list:
            if function in self.tree.data[taxid].attributes and (
                    children_values[function]['count']
                    < self.tree.data[taxid].attributes[function]['count']):
                unidentified_flag = True
                break
        # For root node, fictional child name is 'Unclassified'
        # For other nodes, fictional child name is 'Unclassified <node taxon>'
        # For example, 'Unclassified Proteobacteria'
        if unidentified_flag and self.tree.data[taxid].rank in LOWER_RANKS:
            ret_val[line_number][('', 'Rank')] = \
                LOWER_RANKS[self.tree.data[taxid].rank]
            ret_val[line_number][('', 'Taxon name')] = 'Unclassified ' \
                + self.tree.data[taxid].name
            if taxid == '1':
                ret_val[line_number][('', 'Taxon name')] = 'Unclassified'
            # Calculate scores for fictional node
            for function in function_list:
                for field_name in ['1.Score', '2.Identity', '3.Raw count']:
                    ret_val[line_number][(function, field_name)] = 0.0
                if function in self.tree.data[taxid].attributes and (
                        children_values[function]['count']
                        < self.tree.data[taxid].attributes[function]['count']):
                    # The fictional node carries the remainder not accounted
                    # for by the real children
                    ret_val[line_number][(function, '1.Score')] = \
                        self.tree.data[taxid].attributes[function][metric] \
                        - children_values[function][metric]
                    ret_val[line_number][(function, '3.Raw count')] = \
                        self.tree.data[taxid].attributes[function]['count'] \
                        - children_values[function]['count']
                    if 'identity' in self.tree.data[taxid].attributes[
                            function] and (
                                self.tree.data[taxid].attributes[function]
                                ['hit_count']
                                > children_values[function]['hit_count']):
                        ret_val[line_number][(function, '2.Identity')] = (
                            self.tree.data[taxid].attributes[function]
                            ['identity']
                            - children_values[function]['identity']) / (
                                self.tree.data[taxid].attributes[function]
                                ['hit_count']
                                - children_values[function]['hit_count'])
            line_number += 1
    return ret_val, attribute_values
def make_assembly_xlsx(assembler):
    """Generates XLSX file for assembly.

    Writes three worksheets: read counts per function, contig data, and
    per-gene data with annotations.

    Args:
        assembler (:obj:'GeneAssembler'): gene assembler object
    """
    xlsxfile = sanitize_file_name(
        os.path.join(assembler.project.options.assembly_dir, 'out',
                     assembler.project.options.project_name + '_assembly.xlsx'))
    # Remove characters that are unsafe in file names
    xlsxfile = xlsxfile.replace(' ', '_')
    xlsxfile = xlsxfile.replace("'", "")
    xlsxfile = xlsxfile.replace('"', '')
    workbook = xlsxwriter.Workbook(xlsxfile)
    bold = workbook.add_format({'bold': True})
    cell_numformat0 = workbook.add_format()
    cell_numformat0.set_num_format('0')
    cell_numformat1 = workbook.add_format()
    cell_numformat1.set_num_format('0.0')
    cell_numformat5 = workbook.add_format()
    cell_numformat5.set_num_format('0.00000')
    functions_list = set()
    samples_list = sorted(assembler.project.list_samples())
    # function_read_counts[function][sample]
    function_read_counts = autovivify(2, float)
    # NOTE: a dead 'gene_rpkm = autovivify(3, float)' was removed here; it
    # was never read before being shadowed by a scalar in the genes loop.
    # count reads per function, per sample
    for function in assembler.assembly.reads:
        functions_list.add(function)
        for read in assembler.assembly.reads[function]:
            # reads[function][read] holds the sample identifier of the read
            function_read_counts[function][
                assembler.assembly.reads[function][read]] += 1
    # calculate total read count
    total_read_count = 0
    for sample in samples_list:
        total_read_count += assembler.project.options.get_fastq1_readcount(sample)
        total_read_count += assembler.project.options.get_fastq2_readcount(sample)
    # generate output
    # make worksheet for read counts per function
    reads_worksheet = workbook.add_worksheet('Functions read count')
    row = 0
    col = 0
    reads_worksheet.write(row, col, 'Function', bold)
    for sample in samples_list:
        col += 1
        reads_worksheet.write(row, col, sample, bold)
    col += 1
    reads_worksheet.write(row, col, 'All samples', bold)
    col += 1
    reads_worksheet.write(row, col, 'Assembled reads', bold)
    col += 1
    reads_worksheet.write(row, col, 'Unassembled reads', bold)
    col += 1
    reads_worksheet.write(row, col, 'Definition', bold)
    for function in sorted(functions_list):
        row += 1
        col = 0
        reads_worksheet.write(row, col, function, bold)
        for sample in samples_list:
            col += 1
            if sample in function_read_counts[function]:
                # Counts are per fragment; multiply by 2 to report reads
                reads_worksheet.write(
                    row, col, function_read_counts[function][sample] * 2,
                    cell_numformat0)
            else:
                reads_worksheet.write(row, col, 0, cell_numformat0)
        col += 1
        all_reads = sum(function_read_counts[function].values()) * 2
        reads_worksheet.write(row, col, all_reads, cell_numformat0)
        col += 1
        assembled_reads = 0
        if function in assembler.assembly.contigs:
            assembled_reads = sum(
                [len(c.reads)
                 for c in assembler.assembly.contigs[function].values()])
        reads_worksheet.write(row, col, assembled_reads, cell_numformat0)
        col += 1
        reads_worksheet.write(row, col, all_reads - assembled_reads,
                              cell_numformat0)
        col += 1
        reads_worksheet.write(
            row, col, assembler.project.ref_data.lookup_function_name(function))
    # adjust column width
    reads_worksheet.set_column(0, 0, 10)
    reads_worksheet.set_column(col, col, 50)
    # make worksheet with contig data
    contigs_worksheet = workbook.add_worksheet('Contigs')
    row = 0
    col = 0
    contigs_worksheet.write(row, col, 'Contig', bold)
    col += 1
    contigs_worksheet.write(row, col, 'Function', bold)
    col += 1
    contigs_worksheet.write(row, col, 'Length', bold)
    col += 1
    contigs_worksheet.write(row, col, 'Read count', bold)
    col += 1
    contigs_worksheet.write(row, col, 'RPKM', bold)
    col += 1
    contigs_worksheet.write(row, col, 'Coverage', bold)
    col += 1
    contigs_worksheet.write(row, col, 'Number of genes', bold)
    # Three columns (read count, RPKM, coverage) per sample
    for sample in samples_list:
        col += 1
        contigs_worksheet.write(row, col, sample, bold)
        col += 1
        contigs_worksheet.write(row, col, sample, bold)
        col += 1
        contigs_worksheet.write(row, col, sample, bold)
    col += 1
    contigs_worksheet.write(row, col, 'Definition', bold)
    # Second header row labels the per-sample sub-columns
    row += 1
    col = 6
    for sample in samples_list:
        col += 1
        contigs_worksheet.write(row, col, 'Read count', bold)
        col += 1
        contigs_worksheet.write(row, col, 'RPKM', bold)
        col += 1
        contigs_worksheet.write(row, col, 'Coverage', bold)
    for function in sorted(functions_list):
        if function in assembler.assembly.contigs:
            for contig in sorted(assembler.assembly.contigs[function].keys()):
                row += 1
                col = 0
                contigs_worksheet.write(row, col, contig, bold)
                col += 1
                contigs_worksheet.write(row, col, function)
                col += 1
                contigs_worksheet.write(
                    row, col,
                    len(assembler.assembly.contigs[function][contig].sequence))
                col += 1
                contigs_worksheet.write(
                    row, col,
                    assembler.assembly.contigs[function][contig].get_read_count())
                col += 1
                contigs_worksheet.write(
                    row, col,
                    assembler.assembly.contigs[function][contig].get_rpkm(
                        total_read_count), cell_numformat5)
                col += 1
                contigs_worksheet.write(
                    row, col,
                    assembler.assembly.contigs[function][contig].get_coverage(),
                    cell_numformat1)
                col += 1
                contigs_worksheet.write(
                    row, col,
                    len(assembler.assembly.contigs[function][contig].genes))
                col += 1
                for sample in samples_list:
                    contigs_worksheet.write(
                        row, col,
                        assembler.assembly.contigs[function]
                        [contig].get_read_count(sample))
                    col += 1
                    contigs_worksheet.write(
                        row, col,
                        assembler.assembly.contigs[function][contig].get_rpkm(
                            assembler.project.options.get_fastq1_readcount(
                                sample), sample), cell_numformat5)
                    col += 1
                    contigs_worksheet.write(
                        row, col,
                        assembler.assembly.contigs[function]
                        [contig].get_coverage(sample), cell_numformat1)
                    col += 1
                contigs_worksheet.write(
                    row, col,
                    assembler.project.ref_data.lookup_function_name(function))
    # adjust column width
    contigs_worksheet.set_column(0, 1, 10)
    contigs_worksheet.set_column(col, col, 50)
    # make worksheet for genes
    genes_worksheet = workbook.add_worksheet('Genes')
    row = 0
    col = 0
    genes_worksheet.write(row, col, 'Gene', bold)
    col += 1
    genes_worksheet.write(row, col, 'Reads function', bold)
    col += 1
    genes_worksheet.write(row, col, 'Contig', bold)
    col += 1
    genes_worksheet.write(row, col, 'Gene start', bold)
    col += 1
    genes_worksheet.write(row, col, 'Gene end', bold)
    col += 1
    genes_worksheet.write(row, col, 'Gene length', bold)
    col += 1
    genes_worksheet.write(row, col, 'Gene strand', bold)
    col += 1
    genes_worksheet.write(row, col, 'Read count', bold)
    col += 1
    genes_worksheet.write(row, col, 'RPKM', bold)
    col += 1
    genes_worksheet.write(row, col, 'Coverage', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama gene status', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama function', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama identity', bold)
    col += 1
    genes_worksheet.write(row, col, 'CDS completeness', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama best hit', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama best hit taxonomy ID', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama best hit organism', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama best hit taxonomy', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama LCA taxonomy ID', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama LCA organism', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama LCA taxonomy', bold)
    for sample in samples_list:
        col += 1
        genes_worksheet.write(row, col, sample, bold)
        col += 1
        genes_worksheet.write(row, col, sample, bold)
        col += 1
        genes_worksheet.write(row, col, sample, bold)
    col += 1
    genes_worksheet.write(row, col, 'Definition', bold)
    # Second header row for the per-sample sub-columns (first 21 columns,
    # indices 0-20, are gene annotation fields)
    row += 1
    col = 20
    for sample in samples_list:
        col += 1
        genes_worksheet.write(row, col, 'Read count', bold)
        col += 1
        genes_worksheet.write(row, col, 'RPKM', bold)
        col += 1
        genes_worksheet.write(row, col, 'Coverage', bold)
    for function in sorted(functions_list):
        if function not in assembler.assembly.contigs:
            continue
        for contig in sorted(assembler.assembly.contigs[function].keys()):
            for gene_id in sorted(
                    assembler.assembly.contigs[function][contig].genes.keys()):
                gene = assembler.assembly.contigs[function][contig].genes[
                    gene_id]
                row += 1
                col = 0
                # Write Gene ID
                genes_worksheet.write(row, col, gene_id)
                col += 1
                # Write Gene function from read mapping
                genes_worksheet.write(row, col, function)
                col += 1
                # Write Contig ID
                genes_worksheet.write(row, col, contig)
                col += 1
                # Write gene start
                genes_worksheet.write(row, col, int(gene.start))
                col += 1
                # Write gene end
                genes_worksheet.write(row, col, int(gene.end))
                col += 1
                # Write gene length
                gene_length = int(gene.end) - int(gene.start) + 1
                genes_worksheet.write(row, col, gene_length)
                col += 1
                # Write gene strand
                genes_worksheet.write(row, col, gene.strand)
                col += 1
                # Write read count (calculated from read count of contig,
                # adjusted by gene length)
                gene_read_count = assembler.assembly.contigs[function][
                    contig].get_read_count() * gene_length \
                    / len(assembler.assembly.contigs[function][contig].sequence)
                genes_worksheet.write(row, col, gene_read_count,
                                      cell_numformat1)
                col += 1
                # Write RPKM
                gene_rpkm = assembler.assembly.contigs[function][
                    contig].get_rpkm(total_read_count)
                genes_worksheet.write(row, col, gene_rpkm, cell_numformat5)
                col += 1
                # Write coverage
                genes_worksheet.write(
                    row, col,
                    assembler.assembly.contigs[function][contig].get_coverage(),
                    cell_numformat1)
                col += 1
                # Write FAMA gene status
                genes_worksheet.write(row, col, gene.status)
                col += 1
                if gene.status == STATUS_GOOD:
                    # Write FAMA predicted functions
                    gene_functions = set(
                        [y for x in gene.hit_list.hits for y in x.functions])
                    genes_worksheet.write(row, col, ','.join(gene_functions))
                    col += 1
                    # Write FAMA identity
                    gene_identity = [x.identity for x in gene.hit_list.hits]
                    genes_worksheet.write(
                        row, col,
                        sum(gene_identity) / len(gene_identity),
                        cell_numformat1)
                    col += 1
                    # Write CDS completeness
                    ref_lengths = [x.s_len for x in gene.hit_list.hits]
                    genes_worksheet.write(
                        row, col,
                        len(gene.protein_sequence) * 100 * len(ref_lengths)
                        / sum(ref_lengths), cell_numformat1)
                    col += 1
                    # Write FAMA best hits
                    fama_hits = [
                        cleanup_protein_id(x.subject_id)
                        for x in gene.hit_list.hits
                    ]
                    genes_worksheet.write(row, col, ','.join(fama_hits))
                    col += 1
                    # Write FAMA taxonomy ID
                    gene_taxonomy = [
                        assembler.project.ref_data.lookup_protein_tax(
                            cleanup_protein_id(x.subject_id))
                        for x in gene.hit_list.hits
                    ]
                    genes_worksheet.write(row, col, ','.join(gene_taxonomy))
                    col += 1
                    # Write Fama best hit organism
                    gene_organism = [
                        assembler.project.taxonomy_data.get_name(x)
                        for x in gene_taxonomy
                    ]
                    genes_worksheet.write(row, col, ','.join(gene_organism))
                    col += 1
                    # Write Fama best hit taxonomy
                    best_hit_taxonomy = [
                        assembler.project.taxonomy_data.get_taxonomy_lineage(x)
                        for x in gene_taxonomy
                    ]
                    genes_worksheet.write(row, col,
                                          '|'.join(best_hit_taxonomy))
                    col += 1
                    # Write Fama LCA taxonomy ID
                    lca_taxonomy_id = gene.taxonomy
                    genes_worksheet.write(row, col, lca_taxonomy_id)
                    col += 1
                    # Write Fama LCA organism
                    lca_organism = assembler.project.taxonomy_data.get_name(
                        lca_taxonomy_id)
                    genes_worksheet.write(row, col, lca_organism)
                    col += 1
                    # Write Fama LCA taxonomy
                    lca_taxonomy = \
                        assembler.project.taxonomy_data.get_taxonomy_lineage(
                            lca_taxonomy_id)
                    genes_worksheet.write(row, col, lca_taxonomy)
                else:
                    # Fill the ten annotation columns with 'N/A'.
                    # BUGFIX: the original incremented col after the last
                    # 'N/A' as well, leaving col one past the good-status
                    # branch and shifting per-sample values of 'N/A' rows
                    # one column to the right.
                    for i in range(0, 10):
                        genes_worksheet.write(row, col + i, 'N/A')
                    col += 9
                for sample in samples_list:
                    col += 1
                    gene_read_count = assembler.assembly.contigs[function][
                        contig].get_read_count(sample) * len(
                            gene.protein_sequence) * 3 / len(
                                assembler.assembly.contigs[function]
                                [contig].sequence)
                    genes_worksheet.write(row, col, gene_read_count,
                                          cell_numformat1)
                    col += 1
                    gene_rpkm = assembler.assembly.contigs[function][
                        contig].get_rpkm(
                            assembler.project.options.get_fastq1_readcount(
                                sample), sample)
                    genes_worksheet.write(row, col, gene_rpkm,
                                          cell_numformat5)
                    col += 1
                    genes_worksheet.write(
                        row, col,
                        assembler.assembly.contigs[function]
                        [contig].get_coverage(sample), cell_numformat1)
                col += 1
                genes_worksheet.write(
                    row, col,
                    assembler.project.ref_data.lookup_function_name(function))
    # adjust column width
    genes_worksheet.set_column(0, 0, 20)
    genes_worksheet.set_column(1, 1, 10)
    genes_worksheet.set_column(7, 9, 15)
    genes_worksheet.set_column(col, col, 50)
    workbook.close()
def make_sample_tax_func_xlsx(project, scores, metric, function_id=None, rank=None):
    """Generates XLSX file for taxa scores for one or all functions in all
    samples.

    One worksheet is written per function (or only for function_id, if given),
    plus an 'Average' worksheet over all functions when function_id is None.

    Args:
        project (:obj:'Project'): Project object that stores all annotated reads
        scores (dict[str, dict[str, dict[str, dict[str, float]]]]): outer key
            is taxonomy identifier, second-level key is function identifier,
            third-level key is sample identifier, inner key is metric, value
            is float.
            NOTE(review): an earlier docstring described a three-level
            function->sample->metric mapping, but the code below indexes
            scores[taxonomy_id][function][sample][key].
        metric (str, optional): acceptable values are 'readcount', 'erpk',
            'rpkm', 'fragmentcount', 'fpk', 'efpk', 'fpkm', 'erpkm', 'efpkm',
            'fpkg', 'rpkg', 'erpkg', 'efpkg', 'proteincount'
        function_id (str, optional): function identifier. If function_id is
            None, all functions will be included into workbook.
        rank (str, optional): taxonomic rank. if rank parameter is not None,
            the resulting XLSX file will contain only entries for this rank.
    """
    # Choose output file name: project-wide vs. single-function report,
    # with the rank embedded in the name when filtering by rank.
    if function_id is None:
        if rank is None:
            xlsxfile = sanitize_file_name(
                os.path.join(
                    project.options.work_dir,
                    project.options.project_name + '_' + metric
                    + '_samples_taxonomy.xlsx'))
        else:
            xlsxfile = sanitize_file_name(
                os.path.join(
                    project.options.work_dir,
                    project.options.project_name + '_' + metric + '_samples_'
                    + rank + '_taxonomy.xlsx'))
    else:
        if rank is None:
            xlsxfile = sanitize_file_name(
                os.path.join(
                    project.options.work_dir,
                    function_id + '_' + metric + '_samples_taxonomy.xlsx'))
        else:
            xlsxfile = sanitize_file_name(
                os.path.join(
                    project.options.work_dir,
                    function_id + '_' + metric + '_samples_' + rank
                    + '_taxonomy.xlsx'))
    print('Writing', xlsxfile)
    writer = pd.ExcelWriter(xlsxfile, engine='xlsxwriter')
    # One worksheet per function (or only the requested one).
    for function in sorted(project.ref_data.functions_dict.keys()):
        if function_id is not None and function != function_id:
            continue
        # Subsetting scores: collapse the per-function slice of `scores`
        # into taxonomy_id -> sample -> metric-key for the profile builder.
        sample_scores = autovivify(3, float)
        for taxonomy_id in scores.keys():
            if function in scores[taxonomy_id].keys():
                for sample in project.list_samples():
                    if sample in scores[taxonomy_id][function]:
                        for key, val in scores[taxonomy_id][function][
                                sample].items():
                            sample_scores[taxonomy_id][sample][key] = val
                    else:
                        # Explicit zero so the sample column appears in output.
                        sample_scores[taxonomy_id][sample][metric] = 0.0
        tax_profile = TaxonomyProfile()
        tax_profile.make_function_taxonomy_profile(project.taxonomy_data,
                                                   sample_scores)
        taxonomy_df = tax_profile.convert_profile_into_score_df(metric=metric)
        if rank is None:
            taxonomy_df.to_excel(writer, sheet_name=function, merge_cells=False)
        else:
            # DataFrame columns are a MultiIndex; ('', 'Rank') is the rank column.
            filtered_df = taxonomy_df[taxonomy_df[('', 'Rank')] == rank]
            filtered_df.to_excel(writer, sheet_name=function, merge_cells=False)
        format_taxonomy_worksheet(writer, function)
    # Make 'Average' sheet
    if function_id is None:
        sample_scores = autovivify(3, float)
        # Sum every metric key across all functions per taxon/sample...
        for taxonomy_id in scores:
            for function in sorted(project.ref_data.functions_dict.keys()):
                if function in scores[taxonomy_id]:
                    for sample in project.list_samples():
                        if sample in scores[taxonomy_id][function]:
                            for key, val in scores[taxonomy_id][function][
                                    sample].items():
                                sample_scores[taxonomy_id][sample][key] += val
                        else:
                            # No-op addition; relies on autovivify to create the entry.
                            sample_scores[taxonomy_id][sample][metric] += 0.0
        # ...then divide by the number of functions to get the mean.
        # NOTE(review): only `metric` is normalized here; the other summed
        # keys (e.g. counts/identity) remain raw sums — confirm intended.
        for taxonomy_id in sample_scores:
            for sample in sample_scores[taxonomy_id]:
                sample_scores[taxonomy_id][sample][metric] = \
                    sample_scores[taxonomy_id][sample][metric] \
                    / len(project.ref_data.functions_dict.keys())
        tax_profile = TaxonomyProfile()
        tax_profile.make_function_taxonomy_profile(project.taxonomy_data,
                                                   sample_scores)
        taxonomy_df = tax_profile.convert_profile_into_score_df(metric=metric)
        if rank is None:
            taxonomy_df.to_excel(writer, sheet_name='Average',
                                 merge_cells=False)
        else:
            filtered_df = taxonomy_df[taxonomy_df[('', 'Rank')] == rank]
            filtered_df.to_excel(writer, sheet_name='Average',
                                 merge_cells=False)
        format_taxonomy_worksheet(writer, 'Average')
    # NOTE(review): ExcelWriter.save() was removed in pandas 2.0 (use close());
    # confirm the pinned pandas version before upgrading.
    writer.save()
def make_function_sample_xlsx(project, scores, metric, sample_id=None):
    """Write an XLSX workbook of function scores for one or all samples.

    Two worksheets are produced: 'Functions <metric>' with one row per
    function (score per sample plus the function definition), and
    'Categories <metric>' with per-category totals of the same scores.

    Args:
        project (:obj:'Project'): Project object that stores all annotated
            reads
        scores (dict[str, dict[str, dict[str, float]]]): outer key is
            function identifier, middle-level key is sample identifier,
            inner key is metric, value is float
        metric (str, optional): acceptable values are 'readcount', 'erpk',
            'rpkm', 'fragmentcount', 'fpk', 'efpk', 'fpkm', 'erpkm',
            'efpkm', 'fpkg', 'rpkg', 'erpkg', 'efpkg', 'proteincount'
        sample_id (str, optional): sample identifier; if None, all samples
            are included
    """
    # Output file: project-wide name, or prefixed with the single sample id.
    if sample_id is None:
        out_name = project.options.project_name + '_' + metric + '_functions.xlsx'
    else:
        out_name = sample_id + '_' + metric + '_functions.xlsx'
    xlsxfile = sanitize_file_name(
        os.path.join(project.options.work_dir, out_name))
    print('Writing', xlsxfile)

    workbook = xlsxwriter.Workbook(xlsxfile)
    bold = workbook.add_format({'bold': True})

    functions_list = sorted(project.ref_data.functions_dict.keys())
    categories_list = sorted({
        project.ref_data.functions_dict[x]['group']
        for x in project.ref_data.functions_dict.keys()
    })
    # Samples to report: every sample, or only the requested one.
    selected_samples = [
        sample for sample in project.list_samples()
        if sample_id is None or sample == sample_id
    ]
    # Per-category totals accumulated while writing the function sheet.
    scores_cat = autovivify(2, float)

    # Worksheet 1: one row per function, one column per selected sample.
    func_sheet = workbook.add_worksheet('Functions ' + metric)
    func_sheet.write(0, 0, 'Function', bold)
    for offset, sample in enumerate(selected_samples, 1):
        func_sheet.write(0, offset, sample, bold)
    # Definition goes in the column right after the last sample column.
    def_col = len(selected_samples) + 1
    func_sheet.write(0, def_col, 'Definition', bold)

    for row_idx, function in enumerate(functions_list, 1):
        category = project.ref_data.lookup_function_group(function)
        func_sheet.write(row_idx, 0, function, bold)
        for offset, sample in enumerate(selected_samples, 1):
            if function in scores and sample in scores[function]:
                value = scores[function][sample][metric]
                scores_cat[category][sample] += value
            else:
                value = 0.0
            func_sheet.write(row_idx, offset, value)
        func_sheet.write(row_idx, def_col,
                         project.ref_data.lookup_function_name(function))
    # adjust column width
    func_sheet.set_column(0, 0, 10)
    func_sheet.set_column(def_col, def_col, 50)

    # Worksheet 2: per-category totals, same sample columns.
    cat_sheet = workbook.add_worksheet('Categories ' + metric)
    cat_sheet.write(0, 0, 'Categories', bold)
    for offset, sample in enumerate(selected_samples, 1):
        cat_sheet.write(0, offset, sample, bold)
    for row_idx, category in enumerate(categories_list, 1):
        cat_sheet.write(row_idx, 0, category, bold)
        for offset, sample in enumerate(selected_samples, 1):
            if category in scores_cat and sample in scores_cat[category]:
                cat_sheet.write(row_idx, offset, scores_cat[category][sample])
            else:
                cat_sheet.write(row_idx, offset, 0.0)
    # adjust column width
    cat_sheet.set_column(0, 0, 50)
    workbook.close()