示例#1
0
def add_contig_structure_data(data_str, structure, ref_contigs, chr_full_names, contig_names_by_refs,
                              used_chromosomes, links_to_chromosomes, chr_names_by_id):
    """Append JavaScript object literals describing a contig structure to ``data_str``.

    Every ``Alignment`` in ``structure`` is serialized as an aligned block; every
    string is passed to ``parse_misassembly_info`` and serialized as a
    misassembly marker (``contig_type: "M"``). Elements of any other type are
    ignored.

    Args:
        data_str: list of output fragments; extended in place.
        structure: iterable mixing ``Alignment`` objects and misassembly strings.
        ref_contigs: reference names; an alignment whose ``ref_name`` is NOT in
            this collection is recorded in ``used_chromosomes``.
        chr_full_names: forwarded to ``get_html_name``.
        contig_names_by_refs: optional mapping indexed by ``ref_name``; when
            truthy, a ``links_to_chromosomes[...]`` JS assignment is emitted
            the first time such a reference is seen. A missing key raises
            ``KeyError``.
        used_chromosomes: list of reference names already recorded; extended
            in place.
        links_to_chromosomes: list of JS assignment statements; extended in
            place.
        chr_names_by_id: mapping from ``ref_name`` to the name written into the
            ``chr`` field.

    Returns:
        The same ``data_str`` list that was passed in.
    """
    for el in structure:
        if isinstance(el, Alignment):
            ref_name = el.ref_name
            # Only references outside ref_contigs need a cross-page link, and
            # each of them is registered once.
            if ref_name not in ref_contigs and ref_name not in used_chromosomes:
                used_chromosomes.append(ref_name)
                if contig_names_by_refs:
                    other_ref_name = contig_names_by_refs[ref_name]
                    links_to_chromosomes.append('links_to_chromosomes["' + ref_name + '"] = "' +
                                                get_html_name(other_ref_name, chr_full_names) + '";')
            # el.idy is concatenated as-is, so it has to be a string already.
            data_str.append('{corr_start: ' + str(el.start) + ',corr_end: ' + str(el.end) +
                            ',start:' + str(el.unshifted_start) + ',end:' + str(el.unshifted_end) +
                            ',start_in_contig:' + str(el.start_in_contig) +
                            ',end_in_contig:' + str(el.end_in_contig) +
                            ',IDY:' + el.idy + ',chr: "' + chr_names_by_id[ref_name] + '"},')
        elif isinstance(el, str):
            ms_description, ms_type = parse_misassembly_info(el)
            data_str.append('{contig_type: "M", mstype: "' + ms_type + '", msg: "' + ms_description + '"},')
    return data_str
示例#2
0
def _overlap_record(alignment, chr_names_by_id):
    """Return the JS object literal describing ``alignment`` as an overlap partner."""
    return ('{contig:"' + alignment.name + '",corr_start: ' + str(alignment.start) +
            ',corr_end: ' + str(alignment.end) + ',start:' + str(alignment.unshifted_start) +
            ',end:' + str(alignment.unshifted_end) + ',start_in_contig:' + str(alignment.start_in_contig) +
            ',end_in_contig:' + str(alignment.end_in_contig) + ',chr: "' + chr_names_by_id[alignment.ref_name] + '"}')


def _collect_overlaps(alignments, chr_names_by_id):
    """Map each alignment to the serialized alignments of other contigs overlapping it by more than 100.

    ``alignments`` must already be sorted by ``start``.
    """
    overlapped_contigs = defaultdict(list)
    prev_end = 0
    prev_alignments = []
    for alignment in alignments:
        if prev_end > alignment.start:
            # Still inside the current cluster of overlapping blocks.
            for prev_align in prev_alignments:
                if alignment.name != prev_align.name and prev_align.end - alignment.start > 100:
                    overlapped_contigs[prev_align].append(_overlap_record(alignment, chr_names_by_id))
                    overlapped_contigs[alignment].append(_overlap_record(prev_align, chr_names_by_id))
            prev_alignments.append(alignment)
        else:
            # No overlap with anything seen so far: start a new cluster.
            prev_alignments = [alignment]
        prev_end = max(prev_end, alignment.end)
    return overlapped_contigs


def _gene_records(alignment, contig_genes):
    """Serialize the genes having at least one end strictly inside the aligned part of the contig."""
    start_in_contig = min(alignment.start_in_contig, alignment.end_in_contig)
    end_in_contig = max(alignment.start_in_contig, alignment.end_in_contig)
    records = []
    for gene in contig_genes:
        if start_in_contig < gene.start < end_in_contig or start_in_contig < gene.end < end_in_contig:
            # Clamp the gene to the boundaries of the alignment.
            corr_start = max(alignment.start, alignment.start + (gene.start - start_in_contig))
            corr_end = min(alignment.end, alignment.end + (gene.end - end_in_contig))
            records.append('{start:' + str(gene.start) + ',end:' + str(gene.end) + ',corr_start:' +
                           str(corr_start) + ',corr_end:' + str(corr_end) + '}')
    return records


def prepare_alignment_data_for_one_ref(chr, chr_full_names, chr_names_by_id, ref_contigs, data_str, chr_to_aligned_blocks,
                                       structures_by_labels, contigs_by_assemblies, ambiguity_alignments_by_labels=None,
                                       contig_names_by_refs=None, output_dir_path=None,
                                       cov_data_str=None, physical_cov_data_str=None, gc_data_str=None):
    """Build the JavaScript data for the alignment viewer page of one reference.

    Args:
        chr: reference name; used as the key of ``contig_data`` and to derive
            the HTML file name.
        chr_full_names: forwarded to ``get_html_name``; when it has exactly one
            element the emitted ``oneHtml`` flag is ``true``.
        chr_names_by_id: mapping from ``ref_name`` to the name written into
            ``chr`` fields.
        ref_contigs: reference contig names to look up in each assembly's
            aligned blocks, in output order.
        data_str: list of JS fragments; extended in place (the returned value
            is the newline-joined string, not this list).
        chr_to_aligned_blocks: mapping assembly -> mapping reference contig ->
            iterable of alignments.
        structures_by_labels: mapping alignment label -> mapping contig name ->
            contig structure.
        contigs_by_assemblies: mapping assembly -> iterable of contig objects
            (``name``, ``genes`` and ``contig_type`` are read).
        ambiguity_alignments_by_labels: optional mapping label -> contig name
            -> structure; used only when ``qconfig.ambiguity_usage == 'all'``.
        contig_names_by_refs: optional mapping; when truthy, the
            ``links_to_chromosomes`` JS object is emitted.
        output_dir_path: directory of the HTML file. Despite the ``None``
            default it is passed straight to ``join`` (presumably
            ``os.path.join`` - confirm), so callers have to supply it.
        cov_data_str, physical_cov_data_str, gc_data_str: optional lists of
            prepared JS fragments appended after the alignment data.

    Returns:
        A 7-tuple ``(alignment_viewer_fpath, data_str, contigs_structure_str,
        additional_assemblies_data, ms_selectors, num_misassemblies,
        aligned_assemblies)`` where ``data_str`` is the joined JS source,
        ``ms_selectors`` is a list of ``(type, display name, count string)``
        and ``aligned_assemblies`` is a set of alignment labels.

    Side effects:
        Sets ``alignment.misassemblies`` on every misassembled alignment.
    """
    html_name = get_html_name(chr, chr_full_names)
    alignment_viewer_fpath = join(output_dir_path, html_name + '.html')

    additional_assemblies_lines = []
    data_str.append('var links_to_chromosomes;')
    links_to_chromosomes = []
    used_chromosomes = []
    if contig_names_by_refs:
        data_str.append('links_to_chromosomes = {};')
    num_misassemblies = 0
    aligned_assemblies = set()

    is_one_html = len(chr_full_names) == 1
    data_str.append('var oneHtml = ' + str(is_one_html).lower() + ';')
    # adding assembly data
    data_str.append('var contig_data = {};')
    data_str.append('contig_data["' + chr + '"] = {};')
    assemblies_len = defaultdict(int)
    assemblies_contigs = defaultdict(set)
    ms_types = dict()
    for assembly in chr_to_aligned_blocks:
        # The trailing space is stripped when the array is closed below, so an
        # assembly without alignments still yields a valid "[];".
        data_str.append('contig_data["' + chr + '"]["' + assembly + '"] = [ ')
        assembly_ms_types = ms_types[assembly] = defaultdict(int)
        contigs = {contig.name: contig for contig in contigs_by_assemblies[assembly]}
        blocks_by_ref = chr_to_aligned_blocks[assembly]
        for ref_contig in ref_contigs:
            if ref_contig not in blocks_by_ref:
                continue
            alignments = sorted(blocks_by_ref[ref_contig], key=lambda x: x.start)
            overlapped_contigs = _collect_overlaps(alignments, chr_names_by_id)

            for alignment in alignments:
                assemblies_len[assembly] += abs(alignment.end_in_contig - alignment.start_in_contig) + 1
                assemblies_contigs[assembly].add(alignment.name)
                contig_structure = structures_by_labels[alignment.label]
                contig_more_unaligned = False
                if alignment.misassembled:
                    num_misassemblies += 1
                    misassemblies, misassembled_ends = get_misassembly_for_alignment(contig_structure[alignment.name], alignment)
                    for misassembly in misassemblies:
                        if misassembly:
                            assembly_ms_types[misassembly] += 1
                    alignment.misassemblies = ';'.join(misassemblies)
                    if 'unknown' in alignment.misassemblies:
                        alignment.misassemblies = 'unknown'
                        misassembled_ends = ''
                        contig_more_unaligned = True
                else:
                    if contigs[alignment.name].contig_type == 'correct_unaligned':
                        contig_more_unaligned = True
                    misassembled_ends = ''

                genes = _gene_records(alignment, contigs[alignment.name].genes)

                record = ('{name:"' + alignment.name + '",corr_start:' + str(alignment.start) + ',corr_end:' +
                          str(alignment.end) + ',start:' + str(alignment.unshifted_start) + ',end:' +
                          str(alignment.unshifted_end) + ',misassemblies:"' + alignment.misassemblies +
                          '",mis_ends:"' + misassembled_ends + '"')
                if alignment.similar:
                    record += ',similar:"True"'
                if alignment.ambiguous:
                    record += ',ambiguous:"True"'
                if alignment.is_best_set:
                    record += ',is_best:"True"'
                if contig_more_unaligned:
                    record += ',more_unaligned:"True"'
                data_str.append(record)

                aligned_assemblies.add(alignment.label)
                # NOTE: the fragments below are deliberately separate list
                # items; data_str is joined with '\n' at the end, so merging
                # them would change the emitted text.
                if overlapped_contigs[alignment]:
                    data_str.append(',overlaps:[ ')
                    data_str.append(','.join(overlapped_contigs[alignment]))
                    data_str.append(']')
                if qconfig.gene_finding:
                    data_str.append(',genes:[' + ','.join(genes) + ']')
                if ambiguity_alignments_by_labels and qconfig.ambiguity_usage == 'all':
                    data_str.append(',ambiguous_alignments:[ ')
                    data_str = add_contig_structure_data(data_str, ambiguity_alignments_by_labels[alignment.label][alignment.name],
                                                         ref_contigs, chr_full_names, contig_names_by_refs,
                                                         used_chromosomes, links_to_chromosomes, chr_names_by_id)
                    # Drop the trailing ',' (or the space after '[') and close the array.
                    data_str[-1] = data_str[-1][:-1] + '],'
                data_str[-1] = data_str[-1] + '},'

        # Replace the trailing ',' (or the space of an empty '[ ') with the array terminator.
        data_str[-1] = data_str[-1][:-1] + '];'
        assembly_len = assemblies_len[assembly]
        assembly_contigs = len(assemblies_contigs[assembly])
        # All types except interspecies translocations are halved with the other counts.
        local_misassemblies = assembly_ms_types['local'] // 2
        interspecies = assembly_ms_types['interspecies translocation']
        ext_misassemblies = (sum(assembly_ms_types.values()) - interspecies) // 2 - \
                            local_misassemblies + interspecies
        additional_assemblies_lines.append('assemblies_len["' + assembly + '"] = ' + str(assembly_len) + ';\n')
        additional_assemblies_lines.append('assemblies_contigs["' + assembly + '"] = ' + str(assembly_contigs) + ';\n')
        additional_assemblies_lines.append('assemblies_misassemblies["' + assembly + '"] = "' + str(ext_misassemblies) +
                                           '+' + str(local_misassemblies) + '";\n')
    additional_assemblies_data = ''.join(additional_assemblies_lines)

    if cov_data_str:
        # adding coverage data
        data_str.extend(cov_data_str)
    if physical_cov_data_str:
        data_str.extend(physical_cov_data_str)
    if gc_data_str:
        gc_window_size = qconfig.GC_window_size_large if qconfig.large_genome else qconfig.GC_window_size
        data_str.append('var gc_window_size = ' + str(gc_window_size) + ';')
        data_str.extend(gc_data_str)

    misassemblies_types = ['relocation', 'translocation', 'inversion', 'interspecies translocation', 'local']
    if not qconfig.is_combined_ref:
        misassemblies_types.remove('interspecies translocation')

    ms_counts_by_type = OrderedDict()
    for ms_type in misassemblies_types:
        factor = 1 if ms_type == 'interspecies translocation' else 2
        ms_counts_by_type[ms_type] = sum(ms_types[assembly][ms_type] // factor for assembly in chr_to_aligned_blocks)
    ms_selectors = []
    for ms_type, ms_count in ms_counts_by_type.items():
        # Pluralize the display name, except for 'local'.
        ms_name = ms_type
        if ms_count != 1 and ms_type != 'local':
            ms_name += 's'
        ms_selectors.append((ms_type, ms_name, str(ms_count)))

    contigs_structure_str = get_contigs_structure(assemblies_contigs, chr_to_aligned_blocks, contigs_by_assemblies, ref_contigs, chr_full_names,
                                                   contig_names_by_refs, structures_by_labels, used_chromosomes, links_to_chromosomes, chr_names_by_id)

    if contig_names_by_refs:
        data_str.append(''.join(links_to_chromosomes))
    data_str = '\n'.join(data_str)
    return alignment_viewer_fpath, data_str, contigs_structure_str, additional_assemblies_data, ms_selectors, num_misassemblies, aligned_assemblies
示例#3
0
def get_contigs_data(contigs_by_assemblies, nx_marks, assemblies_n50, structures_by_labels, contig_names_by_refs, ref_names,
                     chr_full_names):
    """Build the JavaScript data for the contig size viewer.

    Args:
        contigs_by_assemblies: mapping assembly -> iterable of contig objects
            (``size`` and ``genes`` are read here).
        nx_marks: collection of N-marks still to be placed; copied for every
            assembly and forwarded to ``add_contig``.
        assemblies_n50: forwarded to ``add_contig``.
        structures_by_labels: mapping forwarded to ``add_contig``; its
            ``values()`` are also passed as ``has_aligned_contigs``.
        contig_names_by_refs: optional mapping reference name -> name used to
            build the link target.
        ref_names: reference names for which ``links_to_chromosomes`` entries
            are emitted (only when ``chr_full_names`` does not have exactly
            one element).
        chr_full_names: forwarded to ``get_html_name``.

    Returns:
        ``(contig_viewer_data, too_many_contigs)``: the JS source as one
        string, and whether any assembly had more contigs than
        ``qconfig.max_contigs_num_for_size_viewer``.
    """
    additional_data = ['var links_to_chromosomes;']
    one_html = len(chr_full_names) == 1
    if not one_html:
        additional_data.append('links_to_chromosomes = {};')
        for ref_name in ref_names:
            chr_name = ref_name
            if contig_names_by_refs and ref_name in contig_names_by_refs:
                chr_name = contig_names_by_refs[ref_name]
            chr_name = get_html_name(chr_name, chr_full_names)
            additional_data.append('links_to_chromosomes["' + ref_name + '"] = "' + chr_name + '";')

    contigs_sizes_str = ['var contig_data = {};', 'var chromosome;']
    contigs_sizes_lines = []
    total_len = 0
    min_contig_size = qconfig.min_contig
    too_many_contigs = False
    has_aligned_contigs = structures_by_labels.values()
    for assembly in contigs_by_assemblies:
        # The trailing space is stripped when the array is closed, so an
        # assembly without contigs still yields a valid "[];".
        contigs_sizes_str.append('contig_data["' + assembly + '"] = [ ')
        cum_length = 0
        contigs = sorted(contigs_by_assemblies[assembly], key=lambda x: x.size, reverse=True)
        max_shown = qconfig.max_contigs_num_for_size_viewer
        last_contig_num = min(len(contigs), max_shown)
        has_hidden_contigs = len(contigs) > max_shown
        not_used_nx = list(nx_marks)
        if has_hidden_contigs:
            too_many_contigs = True

        for i, contig in enumerate(contigs):
            if i >= last_contig_num:
                break
            cum_length, contigs_sizes_lines, align, not_used_nx = add_contig(
                cum_length, contig, not_used_nx, assemblies_n50, assembly, contigs, contigs_sizes_lines, i,
                structures_by_labels, has_aligned_contigs=has_aligned_contigs)
            contigs_sizes_str.append(align)

        if has_hidden_contigs:
            # Size of the smallest displayed contig. It is computed only here,
            # where it is used: indexing unconditionally raised IndexError for
            # an assembly with no contigs.
            contig_threshold = contigs[last_contig_num - 1].size
            hidden_contigs = contigs[last_contig_num:]
            assembly_len = cum_length
            remained_len = sum(contig.size for contig in hidden_contigs)
            cum_length += remained_len
            remained_genes = ['{start:' + str(gene.start) + ',end:' + str(gene.end) + '}'
                              for contig in hidden_contigs for gene in contig.genes]
            remained_contigs_name = str(len(contigs) - last_contig_num) + ' hidden contigs shorter than ' + \
                                    str(contig_threshold) + ' bp (total length: ' + \
                                    format_long_numbers(remained_len) + ' bp)'
            if qconfig.gene_finding:
                entry_tail = ',genes:[' + ','.join(remained_genes) + ']},'
            else:
                entry_tail = '},'
            contigs_sizes_str.append('{name:"' + remained_contigs_name + '", size:' + str(remained_len) +
                                     ', contig_type:"short_contigs"' + entry_tail)
            # Walk the hidden contigs only to place the N-marks that are still
            # pending; the returned entry is not added to the output.
            # (last_contig_num < len(contigs) holds exactly when contigs are hidden.)
            if not_used_nx and last_contig_num < len(contigs):
                for i, contig in enumerate(hidden_contigs):
                    if not not_used_nx:
                        break
                    assembly_len, contigs_sizes_lines, _, not_used_nx = add_contig(
                        assembly_len, contig, not_used_nx, assemblies_n50, assembly, contigs, contigs_sizes_lines,
                        last_contig_num + i, structures_by_labels, only_nx=True)
        total_len = max(total_len, cum_length)
        # Replace the trailing ',' (or the space of an empty '[ ') with the array terminator.
        contigs_sizes_str[-1] = contigs_sizes_str[-1][:-1] + '];\n\n'

    sizes_js = '\n'.join(contigs_sizes_str)
    sizes_js += 'var contigLines = [' + ','.join(contigs_sizes_lines) + '];\n\n'
    sizes_js += 'var contigs_total_len = ' + str(total_len) + ';\n'
    sizes_js += 'var minContigSize = ' + str(min_contig_size) + ';'
    contig_viewer_data = sizes_js + '\n'.join(additional_data)
    return contig_viewer_data, too_many_contigs