Example #1
def compose_function_groups(project, sample_id, tab_index, metric=None):
    """ Makes table of functional groups """
    result = ['<div id="tab' + tab_index + '" class="tabcontent">']
    if metric is None:
        if project.samples[sample_id].is_paired_end:
            if (project.samples[sample_id].rpkg_scaling_factor is not None and
                    project.samples[sample_id].rpkg_scaling_factor != 0.0):
                metric = 'efpkg'
            elif (project.samples[sample_id].rpkm_scaling_factor is not None and
                    project.samples[sample_id].rpkm_scaling_factor != 0.0):
                metric = 'efpkm'
            else:
                metric = 'fragmentcount'
        else:
            if (project.samples[sample_id].rpkg_scaling_factor is not None and
                    project.samples[sample_id].rpkg_scaling_factor != 0.0):
                metric = 'erpkg'
            elif (project.samples[sample_id].rpkm_scaling_factor is not None and
                    project.samples[sample_id].rpkm_scaling_factor != 0.0):
                metric = 'erpkm'
            else:
                metric = 'readcount'
    scores = get_function_scores(project, sample_id=sample_id, metric=metric)
    categories = set()
    for function in scores:
        categories.add(project.ref_data.lookup_function_group(function))

    result.append('<table><thead><tr>')
    result.append('<th>Function category</th><th>' + metric + '</th>')
    if metric not in ('readcount', 'proteincount'):
        result.append('<th>Raw sequence count</th>')
    result.append('<th>Amino acid identity %, average</th>')
    result.append('</tr></thead>')

    for category in sorted(list(categories)):
        category_data = autovivify(2, float)
        for function in scores:
            for sample in scores[function]:
                if project.ref_data.lookup_function_group(function) != category:
                    continue
                category_data[sample][metric] += scores[function][sample][metric]
                category_data[sample]['count'] += scores[function][sample]['count']
                category_data[sample]['identity'] += scores[function][sample]['identity']
                category_data[sample]['hit_count'] += scores[function][sample]['hit_count']

        result.append('<tr><td>' + category + '</td>')
        if metric in ('readcount', 'fragmentcount', 'proteincount'):
            result.append('<td>' + str(int(category_data[sample_id][metric])) + '</td>')
        else:
            result.append('<td>' + '{0:.5f}'.format(category_data[sample_id][metric]) + '</td>')
        if metric not in ('readcount', 'proteincount'):
            result.append('<td>' + '{0:.0f}'.format(category_data[sample_id]['count']) + '</td>')
        result.append('<td>' + '{0:.2f}'.format(
            category_data[sample_id]['identity'] / category_data[sample_id]['hit_count']
            ) + '</td></tr>')

    result.append('</table>')
    result.append('</div>')
    return '\n'.join(result)
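The functions in these examples build nested score dictionaries with an autovivify helper that is imported from elsewhere in the codebase. A minimal sketch of such a helper, assuming it returns nested defaultdicts of a fixed depth with a given leaf factory:

from collections import defaultdict

def autovivify(levels=1, final=dict):
    """Return nested defaultdicts of the requested depth with the given leaf factory."""
    return (defaultdict(final) if levels < 2
            else defaultdict(lambda: autovivify(levels - 1, final)))

# Example: two levels with float leaves, as used for category_data above
category_data = autovivify(2, float)
category_data['sample_1']['erpkg'] += 1.5  # intermediate dicts appear on first access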
Example #2
def compose_taxonomy_profile(project, sample_id, tab_index, metric=None):
    """Makes taxonomy profile """
    if metric is None:
        if project.samples[sample_id].is_paired_end:
            if (project.samples[sample_id].rpkg_scaling_factor is not None and
                    project.samples[sample_id].rpkg_scaling_factor != 0.0):
                metric = 'efpkg'
            elif (project.samples[sample_id].rpkm_scaling_factor is not None and
                    project.samples[sample_id].rpkm_scaling_factor != 0.0):
                metric = 'efpkm'
            else:
                metric = 'fragmentcount'
        else:
            if (project.samples[sample_id].rpkg_scaling_factor is not None and
                    project.samples[sample_id].rpkg_scaling_factor != 0.0):
                metric = 'erpkg'
            elif (project.samples[sample_id].rpkm_scaling_factor is not None and
                    project.samples[sample_id].rpkm_scaling_factor != 0.0):
                metric = 'erpkm'
            else:
                metric = 'readcount'
    scores = get_function_taxonomy_scores(project, sample_id=sample_id, metric=metric)

    sample_scores = autovivify(3, float)
    for taxonomy_id in scores.keys():
        for function_id in scores[taxonomy_id].keys():
            if sample_id in scores[taxonomy_id][function_id]:
                for key, val in scores[taxonomy_id][function_id][sample_id].items():
                    sample_scores[taxonomy_id][function_id][key] = val

    tax_profile = TaxonomyProfile()
    tax_profile.make_function_taxonomy_profile(project.taxonomy_data, sample_scores)
    taxonomy_df = tax_profile.convert_profile_into_score_df(metric=metric)
    taxonomy_df.replace(0.0, np.nan, inplace=True)
    result = '<div id="tab' + tab_index + '" class="tabcontent">\n'
    if metric in ('readcount', 'fragmentcount', 'proteincount'):
        result += taxonomy_df.to_html(na_rep="", float_format='%.0f')
    else:
        result += taxonomy_df.to_html(na_rep="")
    result += '\n</div>\n'
    return result
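A minimal usage sketch for the two report builders above, assuming a loaded project object and an existing sample identifier (both names below are placeholders):

sample_id = 'sample_1'  # placeholder sample identifier
html_parts = [compose_function_groups(project, sample_id, tab_index='1'),
              compose_taxonomy_profile(project, sample_id, tab_index='2')]
with open('report_tabs.html', 'w') as outfile:
    outfile.write('\n'.join(html_parts))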
Example #3
def get_lca_dataseries_tax_xml(tax_profile,
                               dataseries,
                               taxid,
                               offset,
                               metric='efpkg'):
    """Returns XML node for a phylogenetic tree node and all its children.
    Creates an additional child node for a fictional "Unclassified..." taxon
    if not all reads of the current node were mapped to its child nodes.

    Args:
        tax_profile (:obj:TaxonomyProfile): taxonomy profile
        dataseries (list of str): either sample identifiers or function identifiers,
            depending on profile type (functional or taxonomic)
        taxid (str): taxonomy identifier of a node of interest
        offset (int): number of starting tabs
        metric (str): scoring metric (default value 'efpkg')

    Returns:
        ret_val (str): XML node
        attribute_values (defaultdict[str,dict[str,float]]): outer key is
            one of dataseries members, inner key is one of metric, 'count',
            'identity', 'hit_count'; value is float.
    """
    attribute_values = autovivify(2, float)

    if taxid not in tax_profile.tree.data:
        raise KeyError(taxid + ' not found in the taxonomy tree')
    ret_val = '\t' * offset + '<node name="' + tax_profile.tree.data[
        taxid].name + '">\n'
    offset += 1
    if tax_profile.tree.data[taxid].attributes:
        if metric != 'readcount' and metric != 'proteincount':
            ret_val += '\t' * offset + '<readcount>'
            for datapoint in dataseries:
                if (datapoint in tax_profile.tree.data[taxid].attributes) and (
                        'count'
                        in tax_profile.tree.data[taxid].attributes[datapoint]):
                    ret_val += '<val>' + format(
                        tax_profile.tree.data[taxid].attributes[datapoint]
                        ['count'], "0.0f") + '</val>'
                else:
                    ret_val += '<val>0</val>'
            ret_val += '</readcount>\n'
        ret_val += '\t' * offset + '<' + metric + '>'
        for datapoint in dataseries:
            if datapoint in tax_profile.tree.data[taxid].attributes and (
                    metric
                    in tax_profile.tree.data[taxid].attributes[datapoint]):
                ret_val += '<val>' + format(
                    tax_profile.tree.data[taxid].attributes[datapoint][metric],
                    "0.6f") + '</val>'
            else:
                ret_val += '<val>0.0</val>'
        ret_val += '</' + metric + '>\n' + '\t' * offset + '<identity>'
        for datapoint in dataseries:
            if datapoint in tax_profile.tree.data[taxid].attributes and (
                    'identity'
                    in tax_profile.tree.data[taxid].attributes[datapoint]):
                ret_val += '<val>' + format(
                    (tax_profile.tree.data[taxid].attributes[datapoint]
                     ['identity'] / tax_profile.tree.data[taxid].
                     attributes[datapoint]['hit_count']), "0.1f") + '</val>'
            else:
                ret_val += '<val>0.0</val>'
        ret_val += '</identity>\n'
    else:
        if metric != 'readcount' and metric != 'proteincount':
            ret_val += '\t' * offset + '<readcount>'
            ret_val += '<val>0</val>' * len(dataseries)
            ret_val += '</readcount>\n'
        ret_val += '\t' * offset + '<' + metric + '>'
        ret_val += '<val>0.0</val>' * len(dataseries)
        ret_val += '</' + metric + '>\n' + '\t' * offset + '<identity>'
        ret_val += '<val>0.0</val>' * len(dataseries)
        ret_val += '</identity>\n'

    if tax_profile.tree.data[taxid].children:
        for child_taxid in tax_profile.tree.data[taxid].children:
            child_node, child_values = get_lca_dataseries_tax_xml(
                tax_profile, dataseries, child_taxid, offset, metric=metric)
            ret_val += child_node
            for datapoint in child_values.keys():
                for key, val in child_values[datapoint].items():
                    attribute_values[datapoint][key] += val
        # Add a child node for unidentified child taxon, if needed
        unidentified_flag = False
        for datapoint in dataseries:
            if datapoint in tax_profile.tree.data[taxid].attributes:
                if (attribute_values[datapoint]['count'] < tax_profile.tree.
                        data[taxid].attributes[datapoint]['count']):
                    unidentified_flag = True
                    break

        if unidentified_flag:
            if offset == 2:
                ret_val += '\t' * offset + '<node name="Unclassified">\n'
            else:
                ret_val += '\t'*offset + '<node name="Unclassified '\
                           + tax_profile.tree.data[taxid].name + '">\n'
            offset += 1
            if metric != 'readcount' and metric != 'proteincount':
                ret_val += '\t' * offset + '<readcount>'
                for datapoint in dataseries:
                    if datapoint in tax_profile.tree.data[
                            taxid].attributes and (
                                attribute_values[datapoint]['count'] <
                                tax_profile.tree.data[taxid].
                                attributes[datapoint]['count']):
                        ret_val += '<val>' + format(
                            (tax_profile.tree.data[taxid].attributes[datapoint]
                             ['count'] - attribute_values[datapoint]['count']),
                            "0.0f") + '</val>'
                    else:
                        ret_val += '<val>0</val>'
                ret_val += '</readcount>\n'
            ret_val += '\t' * offset + '<' + metric + '>'
            for datapoint in dataseries:
                if datapoint in tax_profile.tree.data[taxid].attributes and (
                        attribute_values[datapoint]['count'] < tax_profile.
                        tree.data[taxid].attributes[datapoint]['count']):
                    ret_val += '<val>' + format(
                        (tax_profile.tree.data[taxid].attributes[datapoint]
                         [metric] - attribute_values[datapoint][metric]),
                        "0.6f") + '</val>'
                else:
                    ret_val += '<val>0.0</val>'
            ret_val += '</' + metric + '>\n'
            ret_val += '\t' * offset + '<identity>'
            for datapoint in dataseries:
                if datapoint in tax_profile.tree.data[taxid].attributes and (
                        'hit_count'
                        in tax_profile.tree.data[taxid].attributes[datapoint]
                ) and (attribute_values[datapoint]['hit_count'] < tax_profile.
                       tree.data[taxid].attributes[datapoint]['hit_count']):
                    ret_val += '<val>' + format(
                        ((tax_profile.tree.data[taxid].attributes[datapoint]
                          ['identity'] -
                          attribute_values[datapoint]['identity']) /
                         (tax_profile.tree.data[taxid].attributes[datapoint]
                          ['hit_count'] -
                          attribute_values[datapoint]['hit_count'])),
                        "0.1f") + '</val>'
                else:
                    ret_val += '<val>0.0</val>'
            ret_val += '</identity>\n'
            offset -= 1
            ret_val += '\t' * offset + '</node>\n'
    offset -= 1
    ret_val += '\t' * offset + '</node>\n'
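    # Rebuild attribute_values from this node's own stored attributes; the child
    # sums accumulated above were only needed for the 'Unclassified' child node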
    attribute_values = autovivify(1)
    for datapoint in dataseries:
        if datapoint in tax_profile.tree.data[taxid].attributes:
            if metric in tax_profile.tree.data[taxid].attributes[datapoint]:
                attribute_values[datapoint][metric] = tax_profile.tree.data[taxid]\
                    .attributes[datapoint][metric]
            if 'count' in tax_profile.tree.data[taxid].attributes[datapoint]:
                attribute_values[datapoint]['count'] = tax_profile.tree.data[taxid]\
                    .attributes[datapoint]['count']
            if 'identity' in tax_profile.tree.data[taxid].attributes[
                    datapoint]:
                attribute_values[datapoint]['identity'] = tax_profile.tree.data[taxid]\
                    .attributes[datapoint]['identity']
            if 'hit_count' in tax_profile.tree.data[taxid].attributes[
                    datapoint]:
                attribute_values[datapoint]['hit_count'] = tax_profile.tree.data[taxid]\
                    .attributes[datapoint]['hit_count']
    return ret_val, attribute_values
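A minimal sketch of how this recursive builder might be invoked, assuming tax_profile is a populated TaxonomyProfile whose root has taxonomy identifier '1'; the root id and the surrounding XML skeleton are assumptions, not taken from the code above:

dataseries = ['sample_1', 'sample_2']  # placeholder sample identifiers
subtree_xml, _ = get_lca_dataseries_tax_xml(tax_profile, dataseries, '1',
                                            offset=1, metric='efpkg')
chart_xml = '<chart>\n' + subtree_xml + '</chart>\n'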
Example #4
    def convert_node_into_values_dict(self,
                                      taxid,
                                      function_list,
                                      line_number,
                                      metric='efpkg'):
        """Returns node of functional-taxonomic profile for conversion into DataFrame.
        Recursively called for all children of the node.

        Args:
            taxid (str): taxonomy identifier of node
            function_list (list of str): function identifiers to be included
                to the table
            line_number (int): sequential number of node printed
            metric (str): score metric (default value 'efpkg') to be reported

        Returns:
            ret_val (defaultdict[int,dict[tuple(str,str),obj]]): outer key is line
                number, inner key is a tuple with a function identifier or empty
                string as first element and a field name as second element. Field
                names are 'Rank', 'Taxon name', metric.
            attribute_values (defaultdict[str,dict[str,float]]): outer key is
                function identifier, inner key is metric, value is float.
        """
        # Collect all attributes for reporting to the upper level
        attribute_values = defaultdict(dict)
        for function in function_list:
            if function in self.tree.data[taxid].attributes:
                attribute_values[function][metric] = 0.0
                if metric in self.tree.data[taxid].attributes[function]:
                    attribute_values[function][metric] = \
                        self.tree.data[taxid].attributes[function][metric]

        ret_val = defaultdict(dict)
        children_values = autovivify(2, float)
        if taxid in self.tree.data:
            ret_val[line_number][('', 'Rank')] = self.tree.data[taxid].rank
            ret_val[line_number][('',
                                  'Taxon name')] = self.tree.data[taxid].name
            for function in function_list:
                ret_val[line_number][(function, metric)] = 0.0
                if function in self.tree.data[taxid].attributes and \
                        metric in self.tree.data[taxid].attributes[function]:
                    ret_val[line_number][(function, metric)] = \
                        self.tree.data[taxid].attributes[function][metric]
            line_number += 1
            if self.tree.data[taxid].children:
                for child_id in sorted(self.tree.data[taxid].children):
                    child_lines, child_values = \
                        self.convert_node_into_values_dict(child_id,
                                                           function_list,
                                                           line_number,
                                                           metric)
                    for child_line_number, child_line in child_lines.items():
                        ret_val[child_line_number] = child_line
                    line_number += len(child_lines)
                    for datapoint in child_values.keys():
                        for key, val in child_values[datapoint].items():
                            children_values[datapoint][key] += val

                # Add a child node for unidentified child taxon, if needed
                unidentified_flag = False
                for function in function_list:
                    if function in self.tree.data[taxid].attributes and (
                            children_values[function][metric] <
                            self.tree.data[taxid].attributes[function][metric]
                    ):
                        unidentified_flag = True
                        break

                if unidentified_flag and self.tree.data[
                        taxid].rank in LOWER_RANKS:
                    ret_val[line_number][(
                        '', 'Rank')] = LOWER_RANKS[self.tree.data[taxid].rank]
                    ret_val[line_number][('', 'Taxon name')] = 'Unclassified ' \
                        + self.tree.data[taxid].name
                    if taxid == '1':
                        ret_val[line_number][('',
                                              'Taxon name')] = 'Unclassified'
                    for function in function_list:
                        ret_val[line_number][(function, metric)] = 0.0
                        if function in self.tree.data[taxid].attributes and (
                                children_values[function][metric] < self.tree.
                                data[taxid].attributes[function][metric]):
                            ret_val[line_number][(function, metric)] = \
                                self.tree.data[taxid].attributes[function][metric] \
                                - children_values[function][metric]
                    line_number += 1
        else:
            print('Node not found:', taxid)

        return ret_val, attribute_values
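The per-line dictionaries returned here are intended for conversion into a DataFrame. A minimal sketch of that conversion, assuming tax_profile is the TaxonomyProfile instance owning this method, function_list holds the profiled function identifiers, and the tree root has identifier '1' (all of which are assumptions):

import pandas as pd

lines, _ = tax_profile.convert_node_into_values_dict('1', function_list,
                                                     line_number=0, metric='efpkg')
profile_df = pd.DataFrame.from_dict(lines, orient='index').sort_index()
# column keys are (function, field) tuples; promote them to a MultiIndex header
profile_df.columns = pd.MultiIndex.from_tuples(profile_df.columns)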
Example #5
    def convert_node_into_dict(self,
                               taxid,
                               function_list,
                               line_number,
                               metric='rpkm'):
        """Returns node of functional-taxonomic profile for conversion into DataFrame.
        Recursively called for all children of the node.

        Args:
            taxid (str): taxonomy identifier of node
            function_list (list of str): function identifiers to be included
                to the table
            line_number (int): sequential number of node printed
            metric (str): score metric (default value 'rpkm') to be reported

        Returns:
            ret_val (dict[str,dict[tuple(str,str),float]]): outer key is line number,
                inner key is a tuple with function identifier or empty string as
                first element and field name as second element, value is a float.
                Field names are 'Rank', 'Taxon name', '1.Score', '2.Identity',
                '3.Raw count'. For each function, only the latter three fields are reported.
            attribute_values (defaultdict[str,dict[str,float]]): outer key is
                function identifier, inner key is one of metric, 'count',
                'identity', 'hit_count'; value is float.
        """
        # Collect values of all required attributes for reporting to the upper level
        attribute_values = defaultdict(dict)
        ret_val = defaultdict(dict)
        if taxid not in self.tree.data:
            return ret_val, attribute_values
        for function in function_list:
            for attribute_name in ['count', 'identity', 'hit_count', metric]:
                attribute_values[function][attribute_name] = 0.0
            if function in self.tree.data[taxid].attributes:
                if metric in self.tree.data[taxid].attributes[function]:
                    attribute_values[function][metric] = \
                        self.tree.data[taxid].attributes[function][metric]
                if 'count' in self.tree.data[taxid].attributes[function]:
                    attribute_values[function]['count'] = \
                        self.tree.data[taxid].attributes[function]['count']
                if 'identity' in self.tree.data[taxid].attributes[function]:
                    attribute_values[function]['identity'] = \
                        self.tree.data[taxid].attributes[function]['identity']
                    attribute_values[function]['hit_count'] = \
                        self.tree.data[taxid].attributes[function]['hit_count']

        children_values = autovivify(2, float)

        ret_val[line_number][('', 'Rank')] = self.tree.data[taxid].rank
        ret_val[line_number][('', 'Taxon name')] = self.tree.data[taxid].name
        for function in function_list:
            for field_name in ['1.Score', '2.Identity', '3.Raw count']:
                ret_val[line_number][(function, field_name)] = 0.0
            if function in self.tree.data[taxid].attributes:
                ret_val[line_number][(function, '1.Score')] = \
                    self.tree.data[taxid].attributes[function][metric]
                ret_val[line_number][(function, '3.Raw count')] = \
                    self.tree.data[taxid].attributes[function]['count']
                if 'identity' in self.tree.data[taxid].attributes[function]:
                    ret_val[line_number][(function, '2.Identity')] = \
                        self.tree.data[taxid].attributes[function]['identity'] \
                        / self.tree.data[taxid].attributes[function]['hit_count']
        line_number += 1
        # If node has children, call convert_node_into_dict recursively
        if self.tree.data[taxid].children:
            for child_id in sorted(self.tree.data[taxid].children):
                child_lines, child_attribute_values = \
                    self.convert_node_into_dict(child_id,
                                                function_list,
                                                line_number,
                                                metric)
                for child_line_number, child_line in child_lines.items():
                    ret_val[child_line_number] = child_line
                line_number += len(child_lines)
                for child_function, child_attrib in child_attribute_values.items(
                ):
                    for key, val in child_attrib.items():
                        children_values[child_function][key] += val

            # If read count for at least one function is greater than sum of read
            # counts from all children, some reads map to unidentified
            # taxon. Add a child node for fictional unidentified taxon.
            unidentified_flag = False
            for function in function_list:
                if function in self.tree.data[taxid].attributes and (
                        children_values[function]['count'] <
                        self.tree.data[taxid].attributes[function]['count']):
                    unidentified_flag = True
                    break
            # For root node, fictional child name is 'Unclassified'
            # For other node, fictional child name is ' Unclassified <node taxon>'
            # For example, 'Unclassified Proteobacteria'
            if unidentified_flag and self.tree.data[taxid].rank in LOWER_RANKS:
                ret_val[line_number][(
                    '', 'Rank')] = LOWER_RANKS[self.tree.data[taxid].rank]
                ret_val[line_number][('', 'Taxon name')] = 'Unclassified ' \
                    + self.tree.data[taxid].name
                if taxid == '1':
                    ret_val[line_number][('', 'Taxon name')] = 'Unclassified'
                # Calculate scores for fictional node
                for function in function_list:
                    for field_name in ['1.Score', '2.Identity', '3.Raw count']:
                        ret_val[line_number][(function, field_name)] = 0.0
                    if function in self.tree.data[taxid].attributes and (
                            children_values[function]['count'] <
                            self.tree.data[taxid].attributes[function]['count']
                    ):
                        ret_val[line_number][(function, '1.Score')] = \
                            self.tree.data[taxid].attributes[function][metric] \
                            - children_values[function][metric]
                        ret_val[line_number][(function, '3.Raw count')] = \
                            self.tree.data[taxid].attributes[function]['count'] \
                            - children_values[function]['count']

                        if 'identity' in self.tree.data[taxid].attributes[
                                function] and (
                                    self.tree.data[taxid].attributes[function]
                                    ['hit_count'] >
                                    children_values[function]['hit_count']):
                            ret_val[line_number][(function, '2.Identity')] = (
                                self.tree.data[taxid].attributes[function]
                                ['identity'] -
                                children_values[function]['identity']) / (
                                    self.tree.data[taxid].attributes[function]
                                    ['hit_count'] -
                                    children_values[function]['hit_count'])
                line_number += 1
        return ret_val, attribute_values
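Both methods consult a LOWER_RANKS mapping, defined elsewhere, to pick a rank for the fictional 'Unclassified' child. A plausible sketch of that constant, assuming it simply maps each rank to the next lower one:

# Assumed shape of LOWER_RANKS; the actual constant lives elsewhere in the codebase
LOWER_RANKS = {
    'superkingdom': 'phylum',
    'phylum': 'class',
    'class': 'order',
    'order': 'family',
    'family': 'genus',
    'genus': 'species',
}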
Example #6
def make_assembly_xlsx(assembler):
    """Generates XLSX file for assembly.

    Args:
        assembler (:obj:'GeneAssembler'): gene assembler object
    """
    xlsxfile = sanitize_file_name(
        os.path.join(assembler.project.options.assembly_dir, 'out',
                     assembler.project.options.project_name +
                     '_assembly.xlsx'))
    xlsxfile = xlsxfile.replace(' ', '_')
    xlsxfile = xlsxfile.replace("'", "")
    xlsxfile = xlsxfile.replace('"', '')
    workbook = xlsxwriter.Workbook(xlsxfile)
    bold = workbook.add_format({'bold': True})
    cell_numformat0 = workbook.add_format()
    cell_numformat0.set_num_format('0')
    cell_numformat1 = workbook.add_format()
    cell_numformat1.set_num_format('0.0')
    cell_numformat5 = workbook.add_format()
    cell_numformat5.set_num_format('0.00000')

    functions_list = set()
    samples_list = sorted(assembler.project.list_samples())
    function_read_counts = autovivify(
        2, float)  # function_read_counts[function][sample]

    # count reads per function, per sample
    for function in assembler.assembly.reads:
        functions_list.add(function)
        for read in assembler.assembly.reads[function]:
            function_read_counts[function][assembler.assembly.reads[function]
                                           [read]] += 1

    # calculate total read count over all samples (needed for RPKM values below)
    total_read_count = 0

    for sample in samples_list:
        total_read_count += assembler.project.options.get_fastq1_readcount(
            sample)
        total_read_count += assembler.project.options.get_fastq2_readcount(
            sample)
    # generate output

    # make worksheet for read counts per function
    reads_worksheet = workbook.add_worksheet('Functions read count')

    row = 0
    col = 0
    reads_worksheet.write(row, col, 'Function', bold)

    for sample in samples_list:
        col += 1
        reads_worksheet.write(row, col, sample, bold)
    col += 1
    reads_worksheet.write(row, col, 'All samples', bold)
    col += 1
    reads_worksheet.write(row, col, 'Assembled reads', bold)
    col += 1
    reads_worksheet.write(row, col, 'Unassembled reads', bold)
    col += 1
    reads_worksheet.write(row, col, 'Definition', bold)

    for function in sorted(functions_list):
        row += 1
        col = 0
        reads_worksheet.write(row, col, function, bold)
        for sample in samples_list:
            col += 1
            if sample in function_read_counts[function]:
                reads_worksheet.write(
                    row, col, function_read_counts[function][sample] * 2,
                    cell_numformat0)
            else:
                reads_worksheet.write(row, col, 0, cell_numformat0)
        col += 1
        all_reads = sum(function_read_counts[function].values()) * 2
        reads_worksheet.write(row, col, all_reads, cell_numformat0)
        col += 1
        assembled_reads = 0
        if function in assembler.assembly.contigs:
            assembled_reads = sum([
                len(c.reads)
                for c in assembler.assembly.contigs[function].values()
            ])
        reads_worksheet.write(row, col, assembled_reads, cell_numformat0)
        col += 1
        reads_worksheet.write(row, col, all_reads - assembled_reads,
                              cell_numformat0)
        col += 1
        reads_worksheet.write(
            row, col,
            assembler.project.ref_data.lookup_function_name(function))

    # adjust column width
    reads_worksheet.set_column(0, 0, 10)
    reads_worksheet.set_column(col, col, 50)

    # make worksheet with contig data
    contigs_worksheet = workbook.add_worksheet('Contigs')

    row = 0
    col = 0
    contigs_worksheet.write(row, col, 'Contig', bold)
    col += 1
    contigs_worksheet.write(row, col, 'Function', bold)
    col += 1
    contigs_worksheet.write(row, col, 'Length', bold)
    col += 1
    contigs_worksheet.write(row, col, 'Read count', bold)
    col += 1
    contigs_worksheet.write(row, col, 'RPKM', bold)
    col += 1
    contigs_worksheet.write(row, col, 'Coverage', bold)
    col += 1
    contigs_worksheet.write(row, col, 'Number of genes', bold)

    for sample in samples_list:
        col += 1
        contigs_worksheet.write(row, col, sample, bold)
        col += 1
        contigs_worksheet.write(row, col, sample, bold)
        col += 1
        contigs_worksheet.write(row, col, sample, bold)

    col += 1
    contigs_worksheet.write(row, col, 'Definition', bold)

    row += 1
    col = 6
    for sample in samples_list:
        col += 1
        contigs_worksheet.write(row, col, 'Read count', bold)
        col += 1
        contigs_worksheet.write(row, col, 'RPKM', bold)
        col += 1
        contigs_worksheet.write(row, col, 'Coverage', bold)

    for function in sorted(functions_list):
        if function in assembler.assembly.contigs:
            for contig in sorted(assembler.assembly.contigs[function].keys()):
                row += 1
                col = 0
                contigs_worksheet.write(row, col, contig, bold)
                col += 1
                contigs_worksheet.write(row, col, function)
                col += 1
                contigs_worksheet.write(
                    row, col,
                    len(assembler.assembly.contigs[function][contig].sequence))
                col += 1
                contigs_worksheet.write(
                    row, col, assembler.assembly.contigs[function]
                    [contig].get_read_count())
                col += 1
                contigs_worksheet.write(
                    row, col, assembler.assembly.contigs[function]
                    [contig].get_rpkm(total_read_count), cell_numformat5)
                col += 1
                contigs_worksheet.write(
                    row, col, assembler.assembly.contigs[function]
                    [contig].get_coverage(), cell_numformat1)
                col += 1
                contigs_worksheet.write(
                    row, col,
                    len(assembler.assembly.contigs[function][contig].genes))
                col += 1

                for sample in samples_list:
                    contigs_worksheet.write(
                        row, col, assembler.assembly.contigs[function]
                        [contig].get_read_count(sample))
                    col += 1
                    contigs_worksheet.write(
                        row, col,
                        assembler.assembly.contigs[function][contig].get_rpkm(
                            assembler.project.options.get_fastq1_readcount(
                                sample), sample), cell_numformat5)
                    col += 1
                    contigs_worksheet.write(
                        row, col, assembler.assembly.contigs[function]
                        [contig].get_coverage(sample), cell_numformat1)
                    col += 1
                contigs_worksheet.write(
                    row, col,
                    assembler.project.ref_data.lookup_function_name(function))

    # adjust column width
    contigs_worksheet.set_column(0, 1, 10)
    contigs_worksheet.set_column(col, col, 50)

    # make worksheet for genes
    genes_worksheet = workbook.add_worksheet('Genes')

    row = 0
    col = 0
    genes_worksheet.write(row, col, 'Gene', bold)
    col += 1
    genes_worksheet.write(row, col, 'Reads function', bold)
    col += 1
    genes_worksheet.write(row, col, 'Contig', bold)
    col += 1
    genes_worksheet.write(row, col, 'Gene start', bold)
    col += 1
    genes_worksheet.write(row, col, 'Gene end', bold)
    col += 1
    genes_worksheet.write(row, col, 'Gene length', bold)
    col += 1
    genes_worksheet.write(row, col, 'Gene strand', bold)
    col += 1
    genes_worksheet.write(row, col, 'Read count', bold)
    col += 1
    genes_worksheet.write(row, col, 'RPKM', bold)
    col += 1
    genes_worksheet.write(row, col, 'Coverage', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama gene status', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama function', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama identity', bold)
    col += 1
    genes_worksheet.write(row, col, 'CDS completeness', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama best hit', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama best hit taxonomy ID', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama best hit organism', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama best hit taxonomy', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama LCA taxonomy ID', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama LCA organism', bold)
    col += 1
    genes_worksheet.write(row, col, 'Fama LCA taxonomy', bold)

    for sample in samples_list:
        col += 1
        genes_worksheet.write(row, col, sample, bold)
        col += 1
        genes_worksheet.write(row, col, sample, bold)
        col += 1
        genes_worksheet.write(row, col, sample, bold)

    col += 1
    genes_worksheet.write(row, col, 'Definition', bold)

    row += 1
    col = 20
    for sample in samples_list:
        col += 1
        genes_worksheet.write(row, col, 'Read count', bold)
        col += 1
        genes_worksheet.write(row, col, 'RPKM', bold)
        col += 1
        genes_worksheet.write(row, col, 'Coverage', bold)

    for function in sorted(functions_list):
        if function not in assembler.assembly.contigs:
            continue
        for contig in sorted(assembler.assembly.contigs[function].keys()):
            for gene_id in sorted(
                    assembler.assembly.contigs[function][contig].genes.keys()):
                gene = assembler.assembly.contigs[function][contig].genes[
                    gene_id]
                row += 1
                col = 0
                # Write Gene ID
                genes_worksheet.write(row, col, gene_id)
                col += 1
                # Write Gene function from read mapping
                genes_worksheet.write(row, col, function)
                col += 1
                # Write Contig ID
                genes_worksheet.write(row, col, contig)
                col += 1
                # Write gene start
                genes_worksheet.write(row, col, int(gene.start))
                col += 1
                # Write gene end
                genes_worksheet.write(row, col, int(gene.end))
                col += 1
                # Write gene length
                gene_length = int(gene.end) - int(gene.start) + 1
                genes_worksheet.write(row, col, gene_length)
                col += 1
                # Write gene strand
                genes_worksheet.write(row, col, gene.strand)
                col += 1
                # Write read count (calculated from read count of contig,
                # adjusted by gene length)
                gene_read_count = assembler.assembly.contigs[function][contig].get_read_count()\
                    * gene_length \
                    / len(assembler.assembly.contigs[function][contig].sequence)
                genes_worksheet.write(row, col, gene_read_count,
                                      cell_numformat1)
                col += 1
                # Write RPKM
                gene_rpkm = assembler.assembly.contigs[function][
                    contig].get_rpkm(total_read_count)
                genes_worksheet.write(row, col, gene_rpkm, cell_numformat5)
                col += 1
                # Write coverage
                genes_worksheet.write(
                    row, col, assembler.assembly.contigs[function]
                    [contig].get_coverage(), cell_numformat1)
                col += 1
                # Write FAMA gene status
                genes_worksheet.write(row, col, gene.status)
                col += 1
                if gene.status == STATUS_GOOD:
                    # Write FAMA predicted functions
                    gene_functions = set(
                        [y for x in gene.hit_list.hits for y in x.functions])
                    genes_worksheet.write(row, col, ','.join(gene_functions))
                    col += 1
                    # Write FAMA identity
                    gene_identity = [x.identity for x in gene.hit_list.hits]
                    genes_worksheet.write(
                        row, col,
                        sum(gene_identity) / len(gene_identity),
                        cell_numformat1)
                    col += 1
                    # Write CDS completeness
                    ref_lengths = [x.s_len for x in gene.hit_list.hits]
                    genes_worksheet.write(
                        row, col,
                        len(gene.protein_sequence) * 100 * len(ref_lengths) /
                        sum(ref_lengths), cell_numformat1)
                    col += 1
                    # Write FAMA best hits
                    fama_hits = [
                        cleanup_protein_id(x.subject_id)
                        for x in gene.hit_list.hits
                    ]
                    genes_worksheet.write(row, col, ','.join(fama_hits))
                    col += 1
                    # Write FAMA taxonomy ID
                    gene_taxonomy = [
                        assembler.project.ref_data.lookup_protein_tax(
                            cleanup_protein_id(x.subject_id))
                        for x in gene.hit_list.hits
                    ]
                    genes_worksheet.write(row, col, ','.join(gene_taxonomy))
                    col += 1

                    # Write Fama best hit organism
                    gene_organism = [
                        assembler.project.taxonomy_data.get_name(x)
                        for x in gene_taxonomy
                    ]
                    genes_worksheet.write(row, col, ','.join(gene_organism))
                    col += 1
                    # Write Fama best hit taxonomy
                    best_hit_taxonomy = [
                        assembler.project.taxonomy_data.get_taxonomy_lineage(x)
                        for x in gene_taxonomy
                    ]
                    genes_worksheet.write(row, col,
                                          '|'.join(best_hit_taxonomy))
                    col += 1

                    # Write Fama LCA taxonomy ID
                    lca_taxonomy_id = gene.taxonomy
                    genes_worksheet.write(row, col, lca_taxonomy_id)
                    col += 1
                    # Write Fama LCA organism
                    lca_organism = assembler.project.taxonomy_data.get_name(
                        lca_taxonomy_id)
                    genes_worksheet.write(row, col, lca_organism)
                    col += 1
                    # Write Fama LCA taxonomy
                    lca_taxonomy = assembler.project.taxonomy_data.get_taxonomy_lineage(
                        lca_taxonomy_id)
                    genes_worksheet.write(row, col, lca_taxonomy)

                else:
                    # Fill the remaining annotation columns with 'N/A', leaving
                    # col on the last of them to match the branch above
                    for _ in range(9):
                        genes_worksheet.write(row, col, 'N/A')
                        col += 1
                    genes_worksheet.write(row, col, 'N/A')

                for sample in samples_list:
                    col += 1
                    gene_read_count = assembler.assembly.contigs[function][
                        contig].get_read_count(sample) * len(
                            gene.protein_sequence) * 3 / len(
                                assembler.assembly.contigs[function]
                                [contig].sequence)

                    genes_worksheet.write(row, col, gene_read_count,
                                          cell_numformat1)
                    col += 1
                    gene_rpkm = assembler.assembly.contigs[function][
                        contig].get_rpkm(
                            assembler.project.options.get_fastq1_readcount(
                                sample), sample)
                    genes_worksheet.write(row, col, gene_rpkm, cell_numformat5)
                    col += 1
                    genes_worksheet.write(
                        row, col, assembler.assembly.contigs[function]
                        [contig].get_coverage(sample), cell_numformat1)
                col += 1
                genes_worksheet.write(
                    row, col,
                    assembler.project.ref_data.lookup_function_name(function))

    # adjust column width
    genes_worksheet.set_column(0, 0, 20)
    genes_worksheet.set_column(1, 1, 10)
    genes_worksheet.set_column(7, 9, 15)
    genes_worksheet.set_column(col, col, 50)
    workbook.close()
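The per-gene read count in the 'Genes' worksheet is prorated from the contig's read count by gene length. A small worked illustration of that proportion (a sketch, not code from the pipeline):

def prorate_gene_read_count(contig_read_count, gene_length, contig_length):
    """Distribute a contig's read count over a gene in proportion to gene length."""
    return contig_read_count * gene_length / contig_length

# a 900 bp gene on a 3000 bp contig with 120 mapped reads is assigned 36.0 reads
assert prorate_gene_read_count(120, 900, 3000) == 36.0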
Example #7
def make_sample_tax_func_xlsx(project,
                              scores,
                              metric,
                              function_id=None,
                              rank=None):
    """Generates XLSX file for taxa scores for one or all functions in all samples.

    Args:
        project (:obj:'Project'): Project object that stores all annotated reads
        scores (dict[str, dict[str, dict[str, dict[str, float]]]]): outer key is
            taxonomy identifier, next-level key is function identifier, next-level
            key is sample identifier, inner key is metric, value is float
        metric (str): acceptable values are 'readcount', 'erpk', 'rpkm',
            'fragmentcount', 'fpk', 'efpk', 'fpkm', 'erpkm', 'efpkm',
            'fpkg', 'rpkg', 'erpkg', 'efpkg', 'proteincount'
        function_id (str, optional): function identifier. If function_id is None,
            all functions will be included in the workbook.
        rank (str, optional): taxonomic rank. If rank is not None, the resulting
            XLSX file will contain only entries for this rank.
    """
    if function_id is None:
        if rank is None:
            xlsxfile = sanitize_file_name(
                os.path.join(
                    project.options.work_dir, project.options.project_name +
                    '_' + metric + '_samples_taxonomy.xlsx'))
        else:
            xlsxfile = sanitize_file_name(
                os.path.join(
                    project.options.work_dir, project.options.project_name +
                    '_' + metric + '_samples_' + rank + '_taxonomy.xlsx'))

    else:
        if rank is None:
            xlsxfile = sanitize_file_name(
                os.path.join(
                    project.options.work_dir,
                    function_id + '_' + metric + '_samples_taxonomy.xlsx'))
        else:
            xlsxfile = sanitize_file_name(
                os.path.join(
                    project.options.work_dir, function_id + '_' + metric +
                    '_samples_' + rank + '_taxonomy.xlsx'))

    print('Writing', xlsxfile)
    writer = pd.ExcelWriter(xlsxfile, engine='xlsxwriter')

    for function in sorted(project.ref_data.functions_dict.keys()):
        if function_id is not None and function != function_id:
            continue

        # Subsetting scores
        sample_scores = autovivify(3, float)
        for taxonomy_id in scores.keys():
            if function in scores[taxonomy_id].keys():
                for sample in project.list_samples():
                    if sample in scores[taxonomy_id][function]:
                        for key, val in scores[taxonomy_id][function][
                                sample].items():
                            sample_scores[taxonomy_id][sample][key] = val
                    else:
                        sample_scores[taxonomy_id][sample][metric] = 0.0

        tax_profile = TaxonomyProfile()
        tax_profile.make_function_taxonomy_profile(project.taxonomy_data,
                                                   sample_scores)

        taxonomy_df = tax_profile.convert_profile_into_score_df(metric=metric)

        if rank is None:
            taxonomy_df.to_excel(writer,
                                 sheet_name=function,
                                 merge_cells=False)
        else:
            filtered_df = taxonomy_df[taxonomy_df[('', 'Rank')] == rank]
            filtered_df.to_excel(writer,
                                 sheet_name=function,
                                 merge_cells=False)
        format_taxonomy_worksheet(writer, function)

    # Make 'Average' sheet
    if function_id is None:
        sample_scores = autovivify(3, float)
        for taxonomy_id in scores:
            for function in sorted(project.ref_data.functions_dict.keys()):
                if function in scores[taxonomy_id]:
                    for sample in project.list_samples():
                        if sample in scores[taxonomy_id][function]:
                            for key, val in scores[taxonomy_id][function][
                                    sample].items():
                                sample_scores[taxonomy_id][sample][key] += val
                        else:
                            sample_scores[taxonomy_id][sample][metric] += 0.0
        for taxonomy_id in sample_scores:
            for sample in sample_scores[taxonomy_id]:
                sample_scores[taxonomy_id][sample][metric] = \
                    sample_scores[taxonomy_id][sample][metric] \
                    / len(project.ref_data.functions_dict.keys())

        tax_profile = TaxonomyProfile()
        tax_profile.make_function_taxonomy_profile(project.taxonomy_data,
                                                   sample_scores)

        taxonomy_df = tax_profile.convert_profile_into_score_df(metric=metric)

        if rank is None:
            taxonomy_df.to_excel(writer,
                                 sheet_name='Average',
                                 merge_cells=False)
        else:
            filtered_df = taxonomy_df[taxonomy_df[('', 'Rank')] == rank]
            filtered_df.to_excel(writer,
                                 sheet_name='Average',
                                 merge_cells=False)

        format_taxonomy_worksheet(writer, 'Average')

    writer.save()
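A minimal usage sketch, assuming the scores dictionary has the taxonomy-function-sample layout consumed above; calling get_function_taxonomy_scores without a sample_id argument is an assumption, mirroring example #2:

scores = get_function_taxonomy_scores(project, metric='efpkg')
make_sample_tax_func_xlsx(project, scores, metric='efpkg', rank='genus')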
Example #8
def make_function_sample_xlsx(project, scores, metric, sample_id=None):
    """Generates XLSX file for function scores for one or more samples.

    Args:
        project (:obj:'Project'): Project object that stores all annotated reads
        scores (dict[str, dict[str, dict[str, float]]]): outer key is function
            identifier, middle-level key is sample identifier, inner key is
            metric, value is float
        metric (str): acceptable values are 'readcount', 'erpk', 'rpkm',
            'fragmentcount', 'fpk', 'efpk', 'fpkm', 'erpkm', 'efpkm',
            'fpkg', 'rpkg', 'erpkg', 'efpkg', 'proteincount'
        sample_id (str, optional): sample identifier
    """
    if sample_id is None:
        xlsxfile = sanitize_file_name(
            os.path.join(
                project.options.work_dir, project.options.project_name + '_' +
                metric + '_functions.xlsx'))
    else:
        xlsxfile = sanitize_file_name(
            os.path.join(project.options.work_dir,
                         sample_id + '_' + metric + '_functions.xlsx'))

    print('Writing', xlsxfile)
    workbook = xlsxwriter.Workbook(xlsxfile)
    bold = workbook.add_format({'bold': True})

    functions_list = sorted(project.ref_data.functions_dict.keys())
    categories_list = sorted({
        project.ref_data.functions_dict[x]['group']
        for x in project.ref_data.functions_dict
    })

    scores_cat = autovivify(2, float)

    # generate tables for functions
    scores_worksheet = workbook.add_worksheet('Functions ' + metric)

    row = 0
    col = 0
    scores_worksheet.write(row, col, 'Function', bold)
    for sample in project.list_samples():
        if sample_id is not None and sample != sample_id:
            continue
        col += 1
        scores_worksheet.write(row, col, sample, bold)

    col += 1
    scores_worksheet.write(row, col, 'Definition', bold)

    for function in functions_list:
        category = project.ref_data.lookup_function_group(function)
        row += 1
        col = 0
        scores_worksheet.write(row, col, function, bold)
        for sample in project.list_samples():
            if sample_id is not None and sample != sample_id:
                continue
            col += 1
            if function in scores and sample in scores[function]:
                scores_worksheet.write(row, col,
                                       scores[function][sample][metric])
                scores_cat[category][sample] += scores[function][sample][
                    metric]
            else:
                scores_worksheet.write(row, col, 0.0)

        col += 1
        scores_worksheet.write(row, col,
                               project.ref_data.lookup_function_name(function))

    # adjust column width
    scores_worksheet.set_column(0, 0, 10)
    scores_worksheet.set_column(col, col, 50)

    # Write worksheet for categories
    scores_cat_worksheet = workbook.add_worksheet('Categories ' + metric)
    row = 0
    col = 0
    scores_cat_worksheet.write(row, col, 'Categories', bold)

    for sample in project.list_samples():
        if sample_id is not None and sample != sample_id:
            continue
        col += 1
        scores_cat_worksheet.write(row, col, sample, bold)

    for category in categories_list:
        row += 1
        col = 0
        scores_cat_worksheet.write(row, col, category, bold)
        for sample in project.list_samples():
            if sample_id is not None and sample != sample_id:
                continue
            col += 1
            if category in scores_cat and sample in scores_cat[category]:
                scores_cat_worksheet.write(row, col,
                                           scores_cat[category][sample])
            else:
                scores_cat_worksheet.write(row, col, 0.0)
    # adjust column width
    scores_cat_worksheet.set_column(0, 0, 50)

    workbook.close()
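A minimal usage sketch, assuming function scores in the layout described in the docstring; get_function_scores is the scoring routine used in example #1 and 'sample_1' is a placeholder identifier:

scores = get_function_scores(project, sample_id='sample_1', metric='erpkg')
make_function_sample_xlsx(project, scores, metric='erpkg', sample_id='sample_1')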