Exemplo n.º 1
0
def parse_gff(file, feature):
    """Collect genes of one feature type from GFF-formatted lines.

    Every line of ``file`` is matched against the module-level
    ``gff_pattern``. Lines that do not match, or whose ``feature`` group
    differs from ``feature`` (case-sensitive comparison), are ignored.

    Args:
        file: iterable of text lines (for example an open file object).
        feature: feature name to keep.

    Returns:
        A list of ``Gene`` objects in input order. ``gene.number`` is the
        zero-based position of the gene in the returned list; ``gene.id`` and
        ``gene.name`` are filled from the ``ID``/``Name`` attributes when
        present (attribute keys are compared case-insensitively).
    """
    genes = []

    for line in file:
        m = gff_pattern.match(line)
        if not m or m.group('feature') != feature:
            continue

        gene = Gene(seqname=qutils.correct_name(m.group('seqname')),
                    start=int(m.group('start')),
                    end=int(m.group('end')))

        for attr in m.group('attributes').split(';'):
            if '=' not in attr:
                continue
            # Split on the first '=' only: a value may itself contain '=',
            # and an unbounded split would then fail to unpack into two names.
            key, val = attr.split('=', 1)
            key = key.lower()
            if key == 'id':
                gene.id = val
            if key == 'name':
                gene.name = val

        gene.number = len(genes)
        genes.append(gene)

    return genes
Exemplo n.º 2
0
def parse_ncbi(ncbi_file):
    """Parse gene records from an NCBI-style text listing.

    A record starts at a line matched by the module-level
    ``ncbi_start_pattern`` (which supplies the ``number`` and ``name``
    groups) and extends up to the next such line or the end of the file.
    Inside a record, lines starting with ``Chromosome:``, ``Annotation:``
    and ``ID:`` are interpreted; every other line is ignored.

    Args:
        ncbi_file: object with a ``readline()`` method that returns ``''`` at
            end of input (for example an open text file).

    Returns:
        A list of ``Gene`` objects for the records whose ``start`` and ``end``
        are both set after parsing. Records that fail this check are dropped.
    """
    annotation_pattern = re.compile(r'Annotation: (?P<seqname>.+) \((?P<start>\d+)\.\.(?P<end>\d+)(, complement)?\)', re.I)
    chromosome_pattern = re.compile(r'Chromosome: (?P<chromosome>\S+);', re.I)
    id_pattern = re.compile(r'ID: (?P<id>\d+)', re.I)

    genes = []

    line = ncbi_file.readline()
    while line != '':
        stripped = line.rstrip()

        # Blank lines and '##' comments are never record starts.
        m = None
        if stripped != '' and not line.startswith('##'):
            m = ncbi_start_pattern.match(stripped)

        if not m:
            # Advance to the next line. Re-matching the same line (as the
            # previous implementation did) can never succeed and hangs on
            # unrecognized input or on trailing blank lines before EOF.
            line = ncbi_file.readline()
            continue

        gene = Gene(number=int(m.group('number')),
                    name=qutils.correct_name(m.group('name')))

        # Gather the body of this record: everything up to the next start line.
        the_rest_lines = []
        line = ncbi_file.readline()
        while line != '' and not ncbi_start_pattern.match(line.rstrip()):
            the_rest_lines.append(line.rstrip())
            line = ncbi_file.readline()

        for info_line in the_rest_lines:
            if info_line.startswith('Chromosome:'):
                m = chromosome_pattern.match(info_line)
                if m:
                    gene.chromosome = m.group('chromosome')

            if info_line.startswith('Annotation:'):
                m = annotation_pattern.match(info_line)
                if m:
                    gene.seqname = m.group('seqname')
                    gene.start = int(m.group('start'))
                    gene.end = int(m.group('end'))

                    to_trim = 'Chromosome' + ' ' + str(gene.chromosome)
                    if gene.chromosome and gene.seqname.startswith(to_trim):
                        # str.lstrip returns a new string, so the result has
                        # to be assigned back for the trimming to take effect.
                        gene.seqname = gene.seqname[len(to_trim):].lstrip(' ,')

                else:
                    logger.warning('Wrong NCBI annotation for gene ' + str(gene.number) + '. ' + gene.name + '. Skipping this gene.')

            if info_line.startswith('ID:'):
                m = id_pattern.match(info_line)
                if m:
                    gene.id = m.group('id')
                else:
                    logger.warning('Can\'t parse gene\'s ID in NCBI format. Gene is ' + str(gene.number) + '. ' + gene.name + '. Skipping it.')

        # Records without coordinates are dropped rather than raising.
        if gene.start is not None and gene.end is not None:
            genes.append(gene)

    return genes
Exemplo n.º 3
0
    def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        """Append one reference sequence to its corrected FASTA and to the combined one.

        With more than one reference the sequence goes to the most recently
        registered corrected file; otherwise a new unique corrected path is
        created and registered in ``corrected_ref_fpaths``.

        Unless ``qconfig.no_check`` is set, an upper-cased copy of ``seq`` with
        the codes M, K, R, Y, W, S, V, B, H, D turned into ``N`` is validated;
        if anything other than A, C, G, T, N remains, a warning is logged and
        ``(None, None)`` is returned without writing. The sequence that is
        written is always the unmodified ``seq``.

        Returns:
            ``(corr_seq_name, corr_seq_fpath)`` on success, ``(None, None)``
            when validation fails.
        """
        seq_fname = ref_name + ref_fasta_ext

        if total_references <= 1:
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        else:
            corr_seq_fpath = corrected_ref_fpaths[-1]

        corr_seq_name = '_'.join((qutils.name_from_fpath(corr_seq_fpath),
                                  qutils.correct_name(seq_name[:20])))

        if not qconfig.no_check:
            # Validation works on a normalized copy only.
            normalized = seq.upper()
            for ambiguity_code in ('M', 'K', 'R', 'Y', 'W', 'S', 'V', 'B', 'H', 'D'):
                normalized = normalized.replace(ambiguity_code, 'N')
            if re.search(r'[^ACGTN]', normalized) is not None:
                logger.warning('Skipping ' + ref_fpath + ' because it contains non-ACGTN characters.',
                        indent='    ')
                return None, None

        for destination in (corr_seq_fpath, combined_ref_fpath):
            fastaparser.write_fasta(destination, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath
Exemplo n.º 4
0
def parse_gff(file, feature):
    """Extract genes of the given feature type from GFF lines.

    Lines are matched with the module-level ``gff_pattern``; a line is kept
    when the lower-cased ``feature`` group equals ``feature``. ``ID`` and
    ``Name`` attributes (keys compared case-insensitively) populate
    ``gene.id`` and ``gene.name``; everything after the first ``=`` of an
    attribute is taken as its value.

    Args:
        file: iterable of text lines.
        feature: feature name to select (expected in lower case, since the
            parsed feature is lower-cased before comparison).

    Returns:
        List of ``Gene`` objects in input order, numbered from 0.
    """
    collected = []

    for record in file:
        match = gff_pattern.match(record)
        if not match:
            continue
        if match.group('feature').lower() != feature:
            continue

        entry = Gene(seqname=qutils.correct_name(match.group('seqname')),
                     start=int(match.group('start')),
                     end=int(match.group('end')))

        for pair in match.group('attributes').split(';'):
            if '=' not in pair:
                continue
            name, _, value = pair.partition('=')
            lowered = name.lower()
            if lowered == 'id':
                entry.id = value
            if lowered == 'name':
                entry.name = value

        entry.number = len(collected)
        collected.append(entry)

    return collected
Exemplo n.º 5
0
def parse_ncbi(ncbi_file):
    """Parse gene records from an NCBI-style text listing.

    A record starts at a line matched by the module-level
    ``ncbi_start_pattern`` (which supplies the ``number`` and ``name``
    groups) and extends up to the next such line or the end of the file.
    Inside a record, lines starting with ``Chromosome:``, ``Annotation:``
    and ``ID:`` are interpreted; every other line is ignored.

    Args:
        ncbi_file: object with a ``readline()`` method that returns ``''`` at
            end of input (for example an open text file).

    Returns:
        A list of ``Gene`` objects for the records whose ``start`` and ``end``
        are both set after parsing. Records that fail this check are dropped.
    """
    annotation_pattern = re.compile(r'Annotation: (?P<seqname>.+) \((?P<start>\d+)\.\.(?P<end>\d+)(, complement)?\)', re.I)
    chromosome_pattern = re.compile(r'Chromosome: (?P<chromosome>\S+);', re.I)
    id_pattern = re.compile(r'ID: (?P<id>\d+)', re.I)

    genes = []

    line = ncbi_file.readline()
    while line != '':
        stripped = line.rstrip()

        # Blank lines and '##' comments are never record starts.
        m = None
        if stripped != '' and not line.startswith('##'):
            m = ncbi_start_pattern.match(stripped)

        if not m:
            # Advance to the next line. Re-matching the same line (as the
            # previous implementation did) can never succeed and hangs on
            # unrecognized input or on trailing blank lines before EOF.
            line = ncbi_file.readline()
            continue

        gene = Gene(number=int(m.group('number')),
                    name=qutils.correct_name(m.group('name')))

        # Gather the body of this record: everything up to the next start line.
        the_rest_lines = []
        line = ncbi_file.readline()
        while line != '' and not ncbi_start_pattern.match(line.rstrip()):
            the_rest_lines.append(line.rstrip())
            line = ncbi_file.readline()

        for info_line in the_rest_lines:
            if info_line.startswith('Chromosome:'):
                m = chromosome_pattern.match(info_line)
                if m:
                    gene.chromosome = m.group('chromosome')

            if info_line.startswith('Annotation:'):
                m = annotation_pattern.match(info_line)
                if m:
                    gene.seqname = m.group('seqname')
                    gene.start = int(m.group('start'))
                    gene.end = int(m.group('end'))

                    to_trim = 'Chromosome' + ' ' + str(gene.chromosome)
                    if gene.chromosome and gene.seqname.startswith(to_trim):
                        # str.lstrip returns a new string, so the result has
                        # to be assigned back for the trimming to take effect.
                        gene.seqname = gene.seqname[len(to_trim):].lstrip(' ,')

                else:
                    logger.warning('Wrong NCBI annotation for gene ' + str(gene.number) + '. ' + gene.name + '. Skipping this gene.')

            if info_line.startswith('ID:'):
                m = id_pattern.match(info_line)
                if m:
                    gene.id = m.group('id')
                else:
                    logger.warning('Can\'t parse gene\'s ID in NCBI format. Gene is ' + str(gene.number) + '. ' + gene.name + '. Skipping it.')

        # Records without coordinates are dropped rather than raising.
        if gene.start is not None and gene.end is not None:
            genes.append(gene)

    return genes
Exemplo n.º 6
0
    def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references):
        """Write one reference sequence to its own corrected FASTA and to the combined one.

        The file name is ``ref_name`` plus, when there is more than one
        reference, ``_`` and the corrected first 20 characters of
        ``seq_name``, followed by ``ref_fasta_ext``. The resulting unique path
        is registered in ``corrected_ref_fpaths``.

        Returns:
            The sequence name derived from the corrected file path.
        """
        name_suffix = ''
        if total_references > 1:
            name_suffix = '_' + qutils.correct_name(seq_name[:20])
        seq_fname = ref_name + name_suffix + ref_fasta_ext

        corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)
        corrected_ref_fpaths.append(corr_seq_fpath)

        # Same record, appended to the per-sequence file and to the combined file.
        for destination in (corr_seq_fpath, combined_ref_fpath):
            fastaparser.write_fasta(destination, [(corr_seq_name, seq)], 'a')

        return corr_seq_name
Exemplo n.º 7
0
def correct_fasta(original_fpath, corrected_fpath, min_contig,
                  is_reference=False):
    """Write a normalized copy of a FASTA file and split oversized references.

    Sequences shorter than ``min_contig`` are dropped unless ``is_reference``
    is true. Kept sequences are upper-cased, the codes M, K, R, Y, W, S, V,
    B, H and D are replaced with ``N``, and record names are passed through
    ``qutils.correct_name``.

    For a reference whose total length exceeds
    ``qconfig.MAX_REFERENCE_LENGTH``, every chromosome that fits the limit is
    additionally written to ``splitted_ref/chr_<i>`` (plus the original
    extension) next to ``corrected_fpath``, and its path is appended to
    ``qconfig.splitted_ref``.

    Args:
        original_fpath: path of the FASTA file to read.
        corrected_fpath: path of the normalized FASTA file to write.
        min_contig: minimal sequence length to keep (ignored for references).
        is_reference: whether the file is a reference genome.

    Returns:
        False if a kept sequence contains a character other than A, C, G, T
        or N after normalization (nothing is written in that case), or if a
        split was required and ``qconfig.splitted_ref`` is empty afterwards;
        True otherwise.
    """
    # correcting alternatives (gage can't work with alternatives)
    dic = {'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N', 'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N'}
    # Both patterns are loop-invariant, so they are compiled once up front.
    alternatives_pattern = re.compile("(%s)" % "|".join(map(re.escape, dic.keys())))
    invalid_pattern = re.compile(r'[^ACGTN]')

    modified_fasta_entries = []
    for first_line, seq in fastaparser.read_fasta(original_fpath):
        if (len(seq) >= min_contig) or is_reference:
            corr_name = qutils.correct_name(first_line)

            # seq to uppercase, because later we look only at uppercase letters
            corr_seq = alternatives_pattern.sub(lambda m: dic[m.group()], seq.upper())

            # make sure that only A, C, G, T or N are in the sequence
            if invalid_pattern.search(corr_seq):
                logger.warning('Skipping ' + original_fpath + ' because it contains non-ACGTN characters.',
                        indent='    ')
                return False

            modified_fasta_entries.append((corr_name, corr_seq))

    fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)

    if is_reference:
        ref_len = sum(len(chr_seq) for (chr_name, chr_seq) in modified_fasta_entries)
        if ref_len > qconfig.MAX_REFERENCE_LENGTH:
            _, fasta_ext = os.path.splitext(corrected_fpath)
            splitted_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'splitted_ref')
            # os.makedirs raises when the directory already exists (for
            # example when it is left over from an earlier call), so create
            # it only when it is missing.
            if not os.path.isdir(splitted_ref_dirpath):
                os.makedirs(splitted_ref_dirpath)

            for i, (chr_name, chr_seq) in enumerate(modified_fasta_entries):
                if len(chr_seq) > qconfig.MAX_REFERENCE_LENGTH:
                    logger.warning("Skipping chromosome " + chr_name + " because it length is greater than " +
                            str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
                    continue

                # File index follows the chromosome's position in the input,
                # so skipped chromosomes leave gaps in the numbering.
                splitted_ref_fpath = os.path.join(splitted_ref_dirpath, "chr_" + str(i + 1)) + fasta_ext
                qconfig.splitted_ref.append(splitted_ref_fpath)
                fastaparser.write_fasta(splitted_ref_fpath, [(chr_name, chr_seq)])

            # NOTE(review): this inspects the shared qconfig list, not only
            # the entries added by this call - confirm that is intended.
            if len(qconfig.splitted_ref) == 0:
                logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
                return False
    return True
Exemplo n.º 8
0
def correct_fasta(original_fpath, corrected_fpath, min_contig,
                  is_reference=False):
    """Write a normalized copy of a FASTA file and split oversized references.

    Sequences shorter than ``min_contig`` are dropped unless ``is_reference``
    is true. Kept sequences are upper-cased, the codes M, K, R, Y, W, S, V,
    B, H and D are replaced with ``N``, and record names are passed through
    ``qutils.correct_name``.

    For a reference whose total length exceeds
    ``qconfig.MAX_REFERENCE_LENGTH``, every chromosome that fits the limit is
    additionally written to ``splitted_ref/chr_<i>`` (plus the original
    extension) next to ``corrected_fpath``, and its path is appended to
    ``qconfig.splitted_ref``.

    Args:
        original_fpath: path of the FASTA file to read.
        corrected_fpath: path of the normalized FASTA file to write.
        min_contig: minimal sequence length to keep (ignored for references).
        is_reference: whether the file is a reference genome.

    Returns:
        False if a kept sequence contains a character other than A, C, G, T
        or N after normalization (nothing is written in that case), or if a
        split was required and ``qconfig.splitted_ref`` is empty afterwards;
        True otherwise.
    """
    # correcting alternatives (gage can't work with alternatives)
    dic = {'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N', 'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N'}
    # Both patterns are loop-invariant, so they are compiled once up front.
    alternatives_pattern = re.compile("(%s)" % "|".join(map(re.escape, dic.keys())))
    invalid_pattern = re.compile(r'[^ACGTN]')

    modified_fasta_entries = []
    for first_line, seq in fastaparser.read_fasta(original_fpath):
        if (len(seq) >= min_contig) or is_reference:
            corr_name = qutils.correct_name(first_line)

            # seq to uppercase, because later we look only at uppercase letters
            corr_seq = alternatives_pattern.sub(lambda m: dic[m.group()], seq.upper())

            # make sure that only A, C, G, T or N are in the sequence
            if invalid_pattern.search(corr_seq):
                logger.warning('Skipping ' + original_fpath + ' because it contains non-ACGTN characters.',
                        indent='    ')
                return False

            modified_fasta_entries.append((corr_name, corr_seq))

    fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)

    if is_reference:
        ref_len = sum(len(chr_seq) for (chr_name, chr_seq) in modified_fasta_entries)
        if ref_len > qconfig.MAX_REFERENCE_LENGTH:
            _, fasta_ext = os.path.splitext(corrected_fpath)
            splitted_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'splitted_ref')
            # os.makedirs raises when the directory already exists (for
            # example when it is left over from an earlier call), so create
            # it only when it is missing.
            if not os.path.isdir(splitted_ref_dirpath):
                os.makedirs(splitted_ref_dirpath)

            for i, (chr_name, chr_seq) in enumerate(modified_fasta_entries):
                if len(chr_seq) > qconfig.MAX_REFERENCE_LENGTH:
                    logger.warning("Skipping chromosome " + chr_name + " because it length is greater than " +
                            str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
                    continue

                # File index follows the chromosome's position in the input,
                # so skipped chromosomes leave gaps in the numbering.
                splitted_ref_fpath = os.path.join(splitted_ref_dirpath, "chr_" + str(i + 1)) + fasta_ext
                qconfig.splitted_ref.append(splitted_ref_fpath)
                fastaparser.write_fasta(splitted_ref_fpath, [(chr_name, chr_seq)])

            # NOTE(review): this inspects the shared qconfig list, not only
            # the entries added by this call - confirm that is intended.
            if len(qconfig.splitted_ref) == 0:
                logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
                return False
    return True
Exemplo n.º 9
0
    def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references):
        """Store a reference sequence in a new corrected FASTA and in the combined FASTA.

        The target file name starts with ``ref_name``; when several
        references are processed, ``_`` and the corrected first 20 characters
        of ``seq_name`` are added before ``ref_fasta_ext``. The unique path
        obtained for it is appended to ``corrected_ref_fpaths``.

        Returns:
            The name derived from the corrected file path, which is also the
            record name used in both output files.
        """
        if total_references > 1:
            base_name = ref_name + '_' + qutils.correct_name(seq_name[:20])
        else:
            base_name = ref_name
        seq_fname = base_name + ref_fasta_ext

        requested_fpath = os.path.join(corrected_dirpath, seq_fname)
        corr_seq_fpath = qutils.unique_corrected_fpath(requested_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)

        corrected_ref_fpaths.append(corr_seq_fpath)

        # Append mode: both files may already hold earlier sequences.
        for out_fpath in (corr_seq_fpath, combined_ref_fpath):
            fastaparser.write_fasta(out_fpath, [(corr_seq_name, seq)], 'a')

        return corr_seq_name
Exemplo n.º 10
0
def parse_txt(file):
    """Parse genes from a plain-text listing.

    Each line is tried against the module-level ``txt_pattern_gi`` first and
    ``txt_pattern`` second; lines matching neither are skipped. Coordinates
    are normalized so that ``start <= end``, and the ``number`` group is used
    both as the integer ``gene.number`` and as the string ``gene.id``.

    Args:
        file: iterable of text lines.

    Returns:
        List of ``Gene`` objects in input order.
    """
    parsed = []

    for row in file:
        hit = txt_pattern_gi.match(row) or txt_pattern.match(row)
        if not hit:
            continue

        entry = Gene(number=int(hit.group('number')),
                     seqname=qutils.correct_name(hit.group('seqname')))
        first = int(hit.group('start'))
        second = int(hit.group('end'))
        # Coordinates may be given in either order.
        entry.start, entry.end = (first, second) if first <= second else (second, first)
        entry.id = hit.group('number')
        parsed.append(entry)

    return parsed
Exemplo n.º 11
0
    def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references,
                    ref_fpath):
        """Append a reference sequence to its corrected FASTA and to the combined FASTA.

        When several references are processed, the last path in
        ``corrected_ref_fpaths`` is reused; otherwise a unique corrected path
        is created from ``ref_name`` and ``ref_fasta_ext`` and registered.

        Unless ``qconfig.no_check`` is set, an upper-cased copy of ``seq`` in
        which M, K, R, Y, W, S, V, B, H and D are replaced by ``N`` is checked
        for characters outside A, C, G, T, N. On failure a warning is logged
        and ``(None, None)`` is returned before anything is written. The
        written sequence is the original ``seq``, not the checked copy.

        Returns:
            ``(corr_seq_name, corr_seq_fpath)``, or ``(None, None)`` if the
            check fails.
        """
        seq_fname = ref_name + ref_fasta_ext

        if total_references <= 1:
            corr_seq_fpath = qutils.unique_corrected_fpath(
                os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        else:
            corr_seq_fpath = corrected_ref_fpaths[-1]

        name_prefix = qutils.name_from_fpath(corr_seq_fpath)
        corr_seq_name = name_prefix + '_' + qutils.correct_name(seq_name[:20])

        if not qconfig.no_check:
            # Only used for validation; the output keeps the original letters.
            candidate = seq.upper()
            for code in 'MKRYWSVBHD':
                candidate = candidate.replace(code, 'N')
            has_invalid = re.search(r'[^ACGTN]', candidate)
            if has_invalid:
                logger.warning('Skipping ' + ref_fpath +
                               ' because it contains non-ACGTN characters.',
                               indent='    ')
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
        fastaparser.write_fasta(combined_ref_fpath, [(corr_seq_name, seq)],
                                'a')

        label = qutils.name_from_fpath(corr_seq_fpath)
        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = label
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath
Exemplo n.º 12
0
def js_data_gen(assemblies, contigs_fpaths, chr_names, chromosomes_length, output_dir_path, cov_fpath, ref_fpath, genome_size):
    """Generate the data and HTML files of the contig alignment browser.

    For every displayed chromosome group this writes ``data_<name>.js`` and
    ``_<name>.html`` (``<name>`` is the group name cut to 30 characters) into
    ``output_dir_path/alignment_plots_dirname``, writes
    ``alignment_summary.html`` into ``output_dir_path`` and copies the static
    CSS/JS assets next to the per-chromosome pages.

    Args:
        assemblies: object whose ``assemblies`` attribute is an iterable of
            items exposing ``alignments``.
        contigs_fpaths: paths of the assembly files; one placeholder
            ``'FICTIVE'`` alignment per path is added to every chromosome.
        chr_names: names of the reference chromosomes.
        chromosomes_length: mapping from chromosome name to its length.
        output_dir_path: directory that receives the generated files.
        cov_fpath: optional path to a whitespace-separated coverage file; the
            first three columns are read as name, position and depth.
        ref_fpath: not used by this function.
        genome_size: compared with ``MAX_SIZE_FOR_COMB_PLOT`` to decide
            whether all chromosomes are shown on a single plot.
    """
    # One placeholder alignment per assembly, so every chromosome lists
    # every assembly label even when nothing aligned to it.
    chr_to_aligned_blocks = dict()
    for chr in chr_names:
        chr_init = []
        for fpath in contigs_fpaths:
            f = Alignment('FICTIVE', 0, 0, 0, 0, False, 0, 0, None)
            f.label = qutils.label_from_fpath(fpath)
            f.unshifted_start = 0
            f.unshifted_end = 0
            chr_init.append(f)
        chr_to_aligned_blocks.setdefault(chr, chr_init)
    for assembly in assemblies.assemblies:
        for align in assembly.alignments:
            chr_to_aligned_blocks[align.ref_name].append(align)

    summary_fname = 'alignment_summary.html'
    summary_path = os.path.join(output_dir_path, summary_fname)
    output_all_files_dir_path = os.path.join(output_dir_path, alignment_plots_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)
    import contigs_analyzer
    # Decide how chromosomes are grouped into pages: by reference label,
    # all on one combined plot, or one page per chromosome.
    if contigs_analyzer.ref_labels_by_chromosomes:
        contig_names_by_refs = contigs_analyzer.ref_labels_by_chromosomes
        chr_full_names = list(set([contig_names_by_refs[contig] for contig in chr_names]))
    elif genome_size < MAX_SIZE_FOR_COMB_PLOT and len(chr_names) >= MIN_CONTIGS_FOR_COMB_PLOT:
        chr_full_names = [NAME_FOR_ONE_PLOT]
    else:
        chr_full_names = chr_names

    if cov_fpath:
        cov_data = dict()
        not_covered = dict()
        cur_len = dict()
        with open(cov_fpath, 'r') as coverage:
            name = chr_names[0]
            contig_to_chr = {}
            for chr in chr_full_names:
                cov_data.setdefault(chr, [])
                not_covered.setdefault(chr, [])
                cur_len.setdefault(chr, 0)
                if contigs_analyzer.ref_labels_by_chromosomes:
                    contigs = [contig for contig in chr_names if contig_names_by_refs[contig] == chr]
                elif chr == NAME_FOR_ONE_PLOT:
                    contigs = chr_names
                else:
                    contigs = [chr]
                for contig in contigs:
                    contig_to_chr[contig] = chr
            for index, line in enumerate(coverage):
                c = list(line.split())
                name = contig_to_chr[qutils.correct_name(c[0])]
                cur_len[name] += int(c[2])
                # Every 100th input line emits the accumulated depth / 100.
                if index % 100 == 0 and index > 0:
                    cov_data[name].append(cur_len[name]/100)
                    cur_len[name] = 0
                if c[2] == '0':
                    not_covered[name].append(c[1])
    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    aligned_bases_by_chr = {}
    num_misassemblies = {}
    aligned_assemblies = {}

    for i, chr in enumerate(chr_full_names):
        short_chr = chr[:30]
        num_misassemblies[chr] = 0
        aligned_bases_by_chr[chr] = []
        aligned_assemblies[chr] = []
        with open(os.path.join(output_all_files_dir_path, 'data_%s.js' % short_chr), 'w') as result:
            result.write('"use strict";\n')
            if contigs_analyzer.ref_labels_by_chromosomes:
                contigs = [contig for contig in chr_names if contig_names_by_refs[contig] == chr]
                result.write('var links_to_chromosomes = {};\n')
                links_to_chromosomes = []
                used_chromosomes = []
            elif chr == NAME_FOR_ONE_PLOT:
                contigs = chr_names
            else:
                contigs = [chr]
            chr_size = sum([chromosomes_length[contig] for contig in contigs])
            chr_sizes[chr] = chr_size
            num_contigs[chr] = len(contigs)
            for contig in contigs:
                aligned_bases_by_chr[chr].extend(aligned_bases[contig])
            data_str = 'var chromosomes_len = {};\n'
            for contig in contigs:
                l = chromosomes_length[contig]
                data_str += 'chromosomes_len["{contig}"] = {l};\n'.format(**locals())
            result.write(data_str)

            # adding assembly data
            data_str = 'var contig_data = {};\n'
            data_str += 'contig_data["{chr}"] = [ '.format(**locals())
            # prev_len is the summed length of the contigs already placed, used
            # to shift per-contig coordinates into the page-wide coordinate.
            prev_len = 0
            chr_lengths = [0] + [chromosomes_length[contig] for contig in contigs]
            for num_contig, contig in enumerate(contigs):
                if num_contig > 0:
                    prev_len += chr_lengths[num_contig]
                if len(chr_to_aligned_blocks[contig]) > 0:
                    for alignment in chr_to_aligned_blocks[contig]:
                        if alignment.misassembled:
                            num_misassemblies[chr] += 1
                        corr_start = prev_len + alignment.unshifted_start
                        corr_end = prev_len + alignment.unshifted_end
                        data_str += '{{name: "{alignment.name}", corr_start: {corr_start}, corr_end: {corr_end},' \
                                    'start: {alignment.unshifted_start}, end: {alignment.unshifted_end}, assembly: "{alignment.label}", similar: "{alignment.similar}", misassembled: "{alignment.misassembled}" '.format(**locals())
                        if alignment.name != 'FICTIVE':
                            if len(aligned_assemblies[chr]) < len(contigs_fpaths) and alignment.label not in aligned_assemblies[chr]:
                                aligned_assemblies[chr].append(alignment.label)
                            data_str += ', structure: ['
                            for el in alignment.misassembled_structure:
                                if type(el) == list:
                                    if el[5] in contigs:
                                        num_chr = contigs.index(el[5])
                                        corr_len = sum(chr_lengths[:num_chr+1])
                                    else:
                                        corr_len = -int(el[1])
                                        if contigs_analyzer.ref_labels_by_chromosomes and el[5] not in used_chromosomes:
                                            used_chromosomes.append(el[5])
                                            new_chr = contig_names_by_refs[el[5]]
                                            links_to_chromosomes.append('links_to_chromosomes["{el[5]}"] = "{new_chr}";\n'.format(**locals()))
                                    corr_start = corr_len + int(el[0])
                                    corr_end = corr_len + int(el[1])
                                    data_str += '{{type: "A", corr_start: {corr_start}, corr_end: {corr_end}, start: {el[0]}, end: {el[1]}, start_in_contig: {el[2]}, end_in_contig: {el[3]}, IDY: {el[4]}, chr: "{el[5]}"}},'.format(**locals())
                                elif type(el) == str:
                                    data_str += '{{type: "M", mstype: "{el}"}},'.format(**locals())
                            # Close the structure array, dropping a trailing comma if any.
                            if data_str[-1] == '[':
                                data_str = data_str + ']},'
                            else:
                                data_str = data_str[: -1] + ']},'
                        else: data_str += '},'
            data_str = data_str[:-1] + '];\n\n'
            result.write(data_str)
            if contigs_analyzer.ref_labels_by_chromosomes:
                result.write(''.join(links_to_chromosomes))
            if cov_fpath:
                # adding coverage data
                data_str = 'var coverage_data = {};\n'
                if cov_data[chr]:
                    data_str += 'coverage_data["{chr}"] = [ '.format(**locals())
                    for e in cov_data[chr]:
                        data_str += '{e},'.format(**locals())
                        # Flush in chunks, but keep the tail in memory so the
                        # trailing comma can be stripped below.
                        if len(data_str) > 10000 and e != cov_data[chr][-1]:
                            result.write(data_str)
                            data_str = ''
                    data_str = data_str[:-1] + '];\n'
                    result.write(data_str)
                    data_str = ''

                data_str = 'var not_covered = {};\n'
                data_str += 'not_covered["{chr}"] = [ '.format(**locals())
                if len(not_covered[chr]) > 0:
                    for e in not_covered[chr]:
                        data_str += '{e},'.format(**locals())
                        # Compare against the last element of the list being
                        # written (not cov_data, which holds other values), so
                        # the final entry is never flushed with its comma.
                        if len(data_str) > 10000 and e != not_covered[chr][-1]:
                            result.write(data_str)
                            data_str = ''
                    data_str = data_str[:-1]
                data_str += '];\n'
                result.write(data_str)
                data_str = ''

            # Instantiate the per-chromosome page from its HTML template.
            with open(html_saver.get_real_path('_chr_templ.html'), 'r') as template:
                with open(os.path.join(output_all_files_dir_path, '_{short_chr}.html'.format(**locals())), 'w') as result:
                    for line in template:
                        if line.find('<script type="text/javascript" src=""></script>') != -1:
                            result.write('<script type="text/javascript" src="data_{short_chr}.js"></script>\n'.format(**locals()))
                        else:
                            result.write(line)
                            if line.find('<body>') != -1:
                                chr_size = chr_sizes[chr]
                                chr_name = chr.replace('_', ' ')
                                if len(chr_name) > 50:
                                    chr_name = chr_name[:50] + '...'
                                title = 'CONTIG ALIGNMENT BROWSER: %s (' % chr_name + ('%s fragments, ' % num_contigs[chr] if num_contigs[chr] > 1 else '') + '%s bp)' % format_long_numbers(chr_size)
                                result.write('<div class = "block title"><a href="../{summary_fname}"><button class="back_button">&crarr;</button></a>{title}</div>\n'.format(**locals()))
                            if line.find('<script type="text/javascript">') != -1:
                                chromosome = '","'.join(contigs)
                                result.write('var CHROMOSOME = "{chr}";\n'.format(**locals()))
                                result.write('var chrContigs = ["{chromosome}"];\n'.format(**locals()))

    # Summary page: one table row per chromosome group, sorted by name.
    with open(html_saver.get_real_path('alignment_summary_templ.html'), 'r') as template:
        with open(summary_path, 'w') as result:
            num_aligned_assemblies = [len(aligned_assemblies[chr]) for chr in chr_full_names]
            # The per-row assembly count is shown only when it differs between rows.
            is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
            for line in template:
                result.write(line)
                if line.find('<!--- assemblies: ---->') != -1:
                    if not is_unaligned_asm_exists:
                        result.write('<div class="subtitle"># assemblies: %s</div>' % len(contigs_fpaths))
                if line.find('<!--- th_assemblies: ---->') != -1:
                    if is_unaligned_asm_exists:
                        result.write('<th># assemblies</th>')
                if line.find('<!--- references: ---->') != -1:
                    for chr in sorted(chr_full_names):
                        result.write('<tr>')
                        short_chr = chr[:30]
                        chr_link = os.path.join(alignment_plots_dirname, '_{short_chr}.html'.format(**locals()))
                        chr_name = chr.replace('_', ' ')
                        aligned_lengths = [aligned_len for aligned_len in aligned_bases_by_chr[chr] if aligned_len is not None]
                        chr_genome = sum(aligned_lengths) * 100.0 / (chr_sizes[chr] * len(contigs_fpaths))
                        chr_size = chr_sizes[chr]
                        result.write('<td><a href="%s">%s</a></td>' % (chr_link, chr_name))
                        result.write('<td>%s</td>' % num_contigs[chr])
                        result.write('<td>%s</td>' % format_long_numbers(chr_size))
                        if is_unaligned_asm_exists:
                            result.write('<td>%s</td>' % len(aligned_assemblies[chr]))
                        result.write('<td>%.3f</td>' % chr_genome)
                        result.write('<td>%s</td>' % num_misassemblies[chr])
                        result.write('</tr>')

    copyfile(html_saver.get_real_path(os.path.join('static', 'contig_alignment_plot.css')),
             os.path.join(output_all_files_dir_path, 'contig_alignment_plot.css'))
    copyfile(html_saver.get_real_path(os.path.join('static', 'd3.js')),
             os.path.join(output_all_files_dir_path, 'd3.js'))
    copyfile(html_saver.get_real_path(os.path.join('static', 'scripts', 'contig_alignment_plot_script.js')),
             os.path.join(output_all_files_dir_path, 'contig_alignment_plot_script.js'))
Exemplo n.º 13
0
def get_corr_name(name):
    """Return ``name`` after processing by ``qutils.correct_name``."""
    corrected = qutils.correct_name(name)
    return corrected
Exemplo n.º 14
0
def parse_nucmer_contig_report(report_fpath, sorted_ref_names, cumulative_ref_lengths):
    """Read aligned blocks from a contig report file.

    The report is consumed in three consecutive passes over the same file
    handle: the preamble up to ``Analyzing contigs...``, the per-contig
    section up to ``Analyzing coverage...`` (which yields the misassembly
    structure of every contig), and the coverage section with ``Reference``
    and ``Align`` lines from which ``Alignment`` objects are built.

    Args:
        report_fpath: path of the report file.
        sorted_ref_names: reference names, in the order matching
            ``cumulative_ref_lengths``.
        cumulative_ref_lengths: per-reference offset added to the alignment
            coordinates of that reference.

    Returns:
        A list of ``Alignment`` objects, or ``None`` (after logging a
        warning) when the report mentions a reference that is not in
        ``sorted_ref_names``.
    """
    aligned_blocks = []

    with open(report_fpath) as report_file:
        # Pass 1: skip everything before the contig analysis section.
        for line in report_file:
            if line.startswith('Analyzing contigs...'):
                break

        # Pass 2: collect the alignment/misassembly structure per contig.
        # A set gives O(1) membership tests in pass 3.
        misassembled_contigs_ids = set()
        cur_contig_id = ''
        last_contig_id = ''
        misassembled_id_to_structure = dict()

        for line in report_file:
            if line.startswith('CONTIG:'):
                cur_contig_id = line.split('CONTIG:')[1].strip()
                last_contig_id = cur_contig_id.split(' ')[0]

            if last_contig_id not in misassembled_id_to_structure:
                misassembled_id_to_structure[last_contig_id] = [False]

            if ('Alignment' in line or 'most ' in line) and 'Excluding' not in line:
                l = line.split(':')[1].split(' ')
                misassembled_id_to_structure[last_contig_id].append([l[1], l[2], l[4], l[5], l[10], qutils.correct_name(l[12])])

            if 'misassembly' in line and 'Fake' not in line:
                # The misassembly type is the text inside the first parentheses.
                misassembled_id_to_structure[last_contig_id].append(line.split('(')[1].split(')')[0])

            if 'Extensive misassembly' in line and cur_contig_id != '':
                misassembled_contigs_ids.add(cur_contig_id.split()[0])
                # Reset so the same contig is recorded once per CONTIG header.
                cur_contig_id = ''

            if line.startswith('Analyzing coverage...'):
                break

        # Pass 3: build one Alignment per 'Align' line of the coverage section.
        cur_shift = 0

        for line in report_file:
            split_line = line.strip().split(' ')
            if split_line and split_line[0] == 'Reference':
                ref_name = split_line[1][:-1]  # drop the trailing character after the name
                if ref_name in sorted_ref_names:
                    cur_shift = cumulative_ref_lengths[sorted_ref_names.index(ref_name)]
                else:
                    logger.warning('reference name ' + ref_name + ' not found in file with reference!\nCannot draw contig alignment plot!')
                    return None
            elif split_line and split_line[0] == 'Align' and 'Excluding' not in split_line and 'Fake' not in split_line:
                unshifted_start = int(split_line[2])
                unshifted_end = int(split_line[3])
                start = unshifted_start + cur_shift
                end = unshifted_end + cur_shift
                contig_id = split_line[4]
                start_in_contig = int(split_line[5])
                end_in_contig = int(split_line[6])

                # Opposite directions on reference and contig mean reverse complement.
                is_rc = ((start - end) * (start_in_contig - end_in_contig)) < 0
                position_in_contig = min(start_in_contig, end_in_contig)
                position_in_ref = max(unshifted_start, unshifted_end)
                block = Alignment(
                    contig_id, start, end, unshifted_start, unshifted_end, is_rc,
                    position_in_contig, position_in_ref, ref_name)

                block.misassembled_structure = misassembled_id_to_structure[contig_id]
                if contig_id in misassembled_contigs_ids:
                    block.misassembled = True
                aligned_blocks.append(block)

    return aligned_blocks
Exemplo n.º 15
0
def get_corr_name(name):
    """Return *name* after passing it through ``qutils.correct_name``."""
    corrected = qutils.correct_name(name)
    return corrected
Exemplo n.º 16
0
def correct_fasta(original_fpath, corrected_fpath, min_contig,
                  is_reference=False):
    """Write a checked copy of a FASTA file and split oversized references.

    Entries shorter than ``min_contig`` are dropped unless ``is_reference``
    is set.  Unless ``qconfig.no_check`` is on, every kept sequence is
    upper-cased, the letters M, K, R, Y, W, S, V, B, H and D are replaced
    with N, and the whole file is rejected as soon as a sequence still
    contains anything other than A, C, G, T or N.  The surviving entries are
    written to ``corrected_fpath``.

    For a reference whose total length exceeds
    ``qconfig.MAX_REFERENCE_FILE_LENGTH`` the entries are additionally
    distributed over ``part_<n>`` files inside a freshly created
    ``split_ref`` directory next to ``corrected_fpath``; the paths of those
    parts are stored in ``qconfig.splitted_ref``.

    Returns:
        ``False`` if the file was rejected (unexpected characters, or every
        chromosome is longer than ``qconfig.MAX_REFERENCE_LENGTH``),
        ``True`` otherwise.
    """
    # "Alternatives" are all collapsed to N (gage can't work with them).
    to_n = {'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N',
            'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N'}
    alternatives_pat = "(%s)" % "|".join(re.escape(code) for code in to_n)

    kept_entries = []
    for header, sequence in fastaparser.read_fasta(original_fpath):
        if not (len(sequence) >= min_contig or is_reference):
            continue
        clean_name = qutils.correct_name(header)

        if qconfig.no_check:
            clean_seq = sequence
        else:
            # Upper-case first: only upper-case letters are examined below.
            clean_seq = re.sub(alternatives_pat,
                               lambda hit: to_n[hit.group()],
                               sequence.upper())
            # Anything besides A, C, G, T or N disqualifies the whole file.
            if re.search(r'[^ACGTN]', clean_seq):
                logger.warning(
                    'Skipping ' + original_fpath +
                    ' because it contains non-ACGTN characters.',
                    indent='    ')
                return False
        kept_entries.append((clean_name, clean_seq))

    fastaparser.write_fasta(corrected_fpath, kept_entries)

    if not is_reference:
        return True
    ref_len = sum(len(chr_seq) for _, chr_seq in kept_entries)
    if not ref_len > qconfig.MAX_REFERENCE_FILE_LENGTH:
        return True

    # Reset first: important for MetaQUAST which runs QUAST multiple times.
    qconfig.splitted_ref = []
    fasta_ext = os.path.splitext(corrected_fpath)[1]
    split_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath),
                                     'split_ref')
    if os.path.exists(split_ref_dirpath):
        shutil.rmtree(split_ref_dirpath, ignore_errors=True)
    os.makedirs(split_ref_dirpath)
    max_len = min(ref_len / qconfig.max_threads, qconfig.MAX_REFERENCE_LENGTH)

    def part_fpath(part_num):
        # Path of the part file with the given 1-based number.
        return os.path.join(split_ref_dirpath,
                            "part_%d" % part_num) + fasta_ext

    part_len = 0
    part_num = 1
    part_path = part_fpath(part_num)
    for chr_name, chr_seq in kept_entries:
        chr_len = len(chr_seq)
        if chr_len > qconfig.MAX_REFERENCE_LENGTH:
            logger.warning("Skipping chromosome " + chr_name +
                           " because its length is greater than " +
                           str(qconfig.MAX_REFERENCE_LENGTH) +
                           " (Nucmer's constraint).")
            continue

        part_len += chr_len
        # Open a new part once the current one would overflow, unless this
        # chromosome is the only thing in the part so far.
        if part_len > max_len and part_len != chr_len:
            qconfig.splitted_ref.append(part_path)
            part_len = chr_len
            part_num += 1
            part_path = part_fpath(part_num)
        fastaparser.write_fasta(part_path, [(chr_name, chr_seq)], mode='a')

    if part_len > 0:
        qconfig.splitted_ref.append(part_path)
    if not qconfig.splitted_ref:
        logger.warning(
            "Skipping reference because all of its chromosomes exceeded Nucmer's constraint."
        )
        return False
    return True
Exemplo n.º 17
0
def parse_nucmer_contig_report(report_fpath, sorted_ref_names,
                               cumulative_ref_lengths):
    """Parse a contig report file into a list of ``Alignment`` blocks.

    The file is consumed in three consecutive stages using one file handle:

    1. lines are skipped up to the one starting with 'Analyzing contigs...';
    2. up to the line starting with 'Analyzing coverage...', the structure of
       every contig (its alignment records and misassembly descriptions) is
       collected, and contigs reported with an 'Extensive misassembly' are
       remembered;
    3. the remaining 'Reference ...' and 'Align ...' lines are converted into
       ``Alignment`` objects.

    Args:
        report_fpath: path of the report to read.
        sorted_ref_names: sequence of reference names; the position of a name
            selects the matching entry of ``cumulative_ref_lengths``.
        cumulative_ref_lengths: offsets added to alignment coordinates of the
            corresponding reference.

    Returns:
        The list of ``Alignment`` blocks, or ``None`` (after logging a
        warning) when the report mentions a reference that is not in
        ``sorted_ref_names``.
    """
    aligned_blocks = []

    with open(report_fpath) as report_file:
        # A set keeps the per-alignment membership test below O(1).
        misassembled_contigs_ids = set()

        # Stage 1: fast-forward to the contig analysis section.
        for line in report_file:
            if line.startswith('Analyzing contigs...'):
                break

        cur_contig_id = ''
        last_contig_id = ''

        # Stage 2: contig id -> [False, <alignment record or misassembly
        # description>, ...]; every list starts with the False placeholder.
        misassembled_id_to_structure = dict()
        for line in report_file:
            if line.startswith('CONTIG:'):
                cur_contig_id = line.split('CONTIG:')[1].strip()
                last_contig_id = cur_contig_id.split(' ')[0]

            if last_contig_id not in misassembled_id_to_structure:
                misassembled_id_to_structure[last_contig_id] = [False]

            if ('Alignment' in line or 'most ' in line) \
                    and 'Excluding' not in line:
                fields = line.split(':')[1].split(' ')
                misassembled_id_to_structure[last_contig_id].append([
                    fields[1], fields[2], fields[4], fields[5], fields[10],
                    qutils.correct_name(fields[12])
                ])

            if 'misassembly' in line and 'Fake' not in line:
                # Keep the text between the first pair of parentheses.
                misassembled_id_to_structure[last_contig_id].append(
                    line.split('(')[1].split(')')[0])

            if 'Extensive misassembly' in line and cur_contig_id != '':
                misassembled_contigs_ids.add(cur_contig_id.split()[0])
                # Cleared so the same contig is recorded only once.
                cur_contig_id = ''

            if line.startswith('Analyzing coverage...'):
                break

        # Stage 3: per-reference alignment lines.
        cur_shift = 0

        for line in report_file:
            split_line = line.strip().split(' ')
            if split_line and split_line[0] == 'Reference':
                ref_name = split_line[1][:-1]  # drop the trailing character
                if ref_name in sorted_ref_names:
                    cur_shift = cumulative_ref_lengths[sorted_ref_names.index(
                        ref_name)]
                else:
                    logger.warning(
                        'reference name ' + ref_name +
                        ' not found in file with reference!\nCannot draw contig alignment plot!'
                    )
                    return None
            elif split_line and split_line[0] == 'Align' \
                    and 'Excluding' not in split_line \
                    and 'Fake' not in split_line:
                unshifted_start = int(split_line[2])
                unshifted_end = int(split_line[3])
                start = unshifted_start + cur_shift
                end = unshifted_end + cur_shift
                contig_id = split_line[4]
                start_in_contig = int(split_line[5])
                end_in_contig = int(split_line[6])

                # Opposite directions on the reference and on the contig mean
                # the block is reverse-complemented.
                is_rc = ((start - end) * (start_in_contig - end_in_contig)) < 0
                position_in_contig = min(start_in_contig, end_in_contig)
                position_in_ref = max(unshifted_start, unshifted_end)
                block = Alignment(contig_id, start, end, unshifted_start,
                                  unshifted_end, is_rc, position_in_contig,
                                  position_in_ref, ref_name)

                block.misassembled_structure = misassembled_id_to_structure[
                    contig_id]
                if contig_id in misassembled_contigs_ids:
                    block.misassembled = True
                aligned_blocks.append(block)

    return aligned_blocks
Exemplo n.º 18
0
Arquivo: quast.py Projeto: ctb/quast
def correct_fasta(original_fpath, corrected_fpath, min_contig,
                  is_reference=False):
    """Produce a validated copy of ``original_fpath`` at ``corrected_fpath``.

    Sequences shorter than ``min_contig`` are skipped unless
    ``is_reference`` is true.  When ``qconfig.no_check`` is off, sequences
    are upper-cased, the letters M, K, R, Y, W, S, V, B, H and D become N,
    and the function gives up (returning ``False``) on the first sequence
    that still holds a character outside A, C, G, T, N.

    A reference longer than ``qconfig.MAX_REFERENCE_FILE_LENGTH`` is also
    split into ``part_<n>`` files under a re-created ``split_ref`` directory
    beside ``corrected_fpath``; their paths end up in
    ``qconfig.splitted_ref``.  Chromosomes longer than
    ``qconfig.MAX_REFERENCE_LENGTH`` are left out of the parts, and
    ``False`` is returned if that leaves no part at all.

    Returns ``True`` in every other case.
    """
    # correcting alternatives (gage can't work with alternatives)
    substitutions = {
        'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N',
        'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N',
    }
    substitution_pat = "(%s)" % "|".join(map(re.escape, substitutions))
    unexpected_letter = re.compile(r'[^ACGTN]')

    fasta_entries = []
    for raw_name, raw_seq in fastaparser.read_fasta(original_fpath):
        long_enough = len(raw_seq) >= min_contig
        if not long_enough and not is_reference:
            continue

        fixed_name = qutils.correct_name(raw_name)
        fixed_seq = raw_seq
        if not qconfig.no_check:
            # Work on upper case: later steps look at upper-case letters only.
            fixed_seq = re.sub(substitution_pat,
                               lambda found: substitutions[found.group()],
                               raw_seq.upper())
            # make sure that only A, C, G, T or N are in the sequence
            if unexpected_letter.search(fixed_seq):
                logger.warning('Skipping ' + original_fpath + ' because it contains non-ACGTN characters.',
                               indent='    ')
                return False
        fasta_entries.append((fixed_name, fixed_seq))

    fastaparser.write_fasta(corrected_fpath, fasta_entries)

    needs_split = False
    if is_reference:
        ref_len = sum(len(entry[1]) for entry in fasta_entries)
        needs_split = ref_len > qconfig.MAX_REFERENCE_FILE_LENGTH
    if not needs_split:
        return True

    qconfig.splitted_ref = []  # important for MetaQUAST which runs QUAST multiple times
    _, fasta_ext = os.path.splitext(corrected_fpath)
    split_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'split_ref')
    if os.path.exists(split_ref_dirpath):
        shutil.rmtree(split_ref_dirpath, ignore_errors=True)
    os.makedirs(split_ref_dirpath)
    max_len = min(ref_len / qconfig.max_threads, qconfig.MAX_REFERENCE_LENGTH)

    part_index = 1
    part_total = 0
    part_fpath = os.path.join(split_ref_dirpath, "part_%d" % part_index) + fasta_ext
    for chr_name, chr_seq in fasta_entries:
        chr_len = len(chr_seq)
        if chr_len > qconfig.MAX_REFERENCE_LENGTH:
            logger.warning("Skipping chromosome " + chr_name + " because its length is greater than " +
                           str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
            continue

        part_total += chr_len
        # Roll over to the next part when this one overflows and already
        # contained something before the current chromosome.
        if part_total > max_len and part_total != chr_len:
            qconfig.splitted_ref.append(part_fpath)
            part_total = chr_len
            part_index += 1
            part_fpath = os.path.join(split_ref_dirpath, "part_%d" % part_index) + fasta_ext
        fastaparser.write_fasta(part_fpath, [(chr_name, chr_seq)], mode='a')

    if part_total > 0:
        qconfig.splitted_ref.append(part_fpath)
    if len(qconfig.splitted_ref) == 0:
        logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
        return False
    return True
Exemplo n.º 19
0
def js_data_gen(assemblies, contigs_fpaths, chr_names, chromosomes_length,
                output_dir_path, cov_fpath, ref_fpath, genome_size):
    """Generate the data files and HTML pages of the contig alignment browser.

    For every plotted chromosome group a ``data_<name>.js`` file and a
    ``_<name>.html`` page are written into the ``alignment_plots_dirname``
    sub-directory of ``output_dir_path``.  An ``alignment_summary.html``
    index is written into ``output_dir_path`` itself, and the static CSS/JS
    assets are copied next to the per-chromosome pages.

    Args:
        assemblies: object whose ``assemblies`` attribute is iterable; each
            item exposes ``alignments`` (blocks carrying a ``ref_name``).
        contigs_fpaths: paths of the assembly files; one 'FICTIVE'
            placeholder block per path is created for every chromosome.
        chr_names: names of the reference sequences.
        chromosomes_length: mapping from a reference sequence name to its
            length.
        output_dir_path: directory that receives the generated files.
        cov_fpath: path of a whitespace-separated coverage file, or a falsy
            value to skip the coverage output.
        ref_fpath: appears unused by this function -- TODO confirm.
        genome_size: compared with ``MAX_SIZE_FOR_COMB_PLOT`` when deciding
            whether all sequences share one combined plot.
    """
    # NOTE: many strings below are filled in with ``.format(**locals())``,
    # so local variable names are part of the output logic; do not rename
    # them without updating the templates.
    chr_to_aligned_blocks = dict()
    for chr in chr_names:
        # Every chromosome starts with one placeholder block per assembly.
        chr_init = []
        for fpath in contigs_fpaths:
            f = Alignment('FICTIVE', 0, 0, 0, 0, False, 0, 0, None)
            f.label = qutils.label_from_fpath(fpath)
            f.unshifted_start = 0
            f.unshifted_end = 0
            chr_init.append(f)
        chr_to_aligned_blocks.setdefault(chr, chr_init)
    for assembly in assemblies.assemblies:
        for align in assembly.alignments:
            chr_to_aligned_blocks[align.ref_name].append(align)

    summary_fname = 'alignment_summary.html'
    summary_path = os.path.join(output_dir_path, summary_fname)
    output_all_files_dir_path = os.path.join(output_dir_path,
                                             alignment_plots_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)
    import contigs_analyzer
    # Decide how reference sequences are grouped into plots: by reference
    # label, all together in one combined plot, or one plot per sequence.
    if contigs_analyzer.ref_labels_by_chromosomes:
        contig_names_by_refs = contigs_analyzer.ref_labels_by_chromosomes
        chr_full_names = list(
            set([contig_names_by_refs[contig] for contig in chr_names]))
    elif genome_size < MAX_SIZE_FOR_COMB_PLOT and len(
            chr_names) >= MIN_CONTIGS_FOR_COMB_PLOT:
        chr_full_names = [NAME_FOR_ONE_PLOT]
    else:
        chr_full_names = chr_names

    if cov_fpath:
        cov_data = dict()
        not_covered = dict()
        cur_len = dict()
        with open(cov_fpath, 'r') as coverage:
            name = chr_names[0]
            # Maps every reference sequence name to the plot it belongs to.
            contig_to_chr = {}
            for chr in chr_full_names:
                cov_data.setdefault(chr, [])
                not_covered.setdefault(chr, [])
                cur_len.setdefault(chr, 0)
                if contigs_analyzer.ref_labels_by_chromosomes:
                    contigs = [
                        contig for contig in chr_names
                        if contig_names_by_refs[contig] == chr
                    ]
                elif chr == NAME_FOR_ONE_PLOT:
                    contigs = chr_names
                else:
                    contigs = [chr]
                for contig in contigs:
                    contig_to_chr[contig] = chr
            for index, line in enumerate(coverage):
                c = list(line.split())
                name = contig_to_chr[qutils.correct_name(c[0])]
                cur_len[name] += int(c[2])
                # Every 100th line the accumulated third column is stored,
                # divided by 100, and the accumulator is reset.
                if index % 100 == 0 and index > 0:
                    cov_data[name].append(cur_len[name] / 100)
                    cur_len[name] = 0
                # Lines whose third column is '0' contribute their second
                # column to the not-covered list.
                if c[2] == '0':
                    not_covered[name].append(c[1])
    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    aligned_bases_by_chr = {}
    num_misassemblies = {}
    aligned_assemblies = {}

    for i, chr in enumerate(chr_full_names):
        short_chr = chr[:30]  # file names use at most 30 characters
        num_misassemblies[chr] = 0
        aligned_bases_by_chr[chr] = []
        aligned_assemblies[chr] = []
        with open(
                os.path.join(output_all_files_dir_path,
                             'data_%s.js' % short_chr), 'w') as result:
            result.write('"use strict";\n')
            if contigs_analyzer.ref_labels_by_chromosomes:
                contigs = [
                    contig for contig in chr_names
                    if contig_names_by_refs[contig] == chr
                ]
                result.write('var links_to_chromosomes = {};\n')
                links_to_chromosomes = []
                used_chromosomes = []
            elif chr == NAME_FOR_ONE_PLOT:
                contigs = chr_names
            else:
                contigs = [chr]
            chr_size = sum([chromosomes_length[contig] for contig in contigs])
            chr_sizes[chr] = chr_size
            num_contigs[chr] = len(contigs)
            for contig in contigs:
                aligned_bases_by_chr[chr].extend(aligned_bases[contig])
            data_str = 'var chromosomes_len = {};\n'
            for contig in contigs:
                l = chromosomes_length[contig]
                data_str += 'chromosomes_len["{contig}"] = {l};\n'.format(
                    **locals())
            result.write(data_str)

            # adding assembly data
            data_str = 'var contig_data = {};\n'
            data_str += 'contig_data["{chr}"] = [ '.format(**locals())
            # prev_len is the summed length of the sequences placed before
            # the current one in this plot.
            prev_len = 0
            chr_lengths = [0] + [
                chromosomes_length[contig] for contig in contigs
            ]
            for num_contig, contig in enumerate(contigs):
                if num_contig > 0:
                    prev_len += chr_lengths[num_contig]
                if len(chr_to_aligned_blocks[contig]) > 0:
                    for alignment in chr_to_aligned_blocks[contig]:
                        if alignment.misassembled:
                            num_misassemblies[chr] += 1
                        corr_start = prev_len + alignment.unshifted_start
                        corr_end = prev_len + alignment.unshifted_end
                        data_str += '{{name: "{alignment.name}", corr_start: {corr_start}, corr_end: {corr_end},' \
                                    'start: {alignment.unshifted_start}, end: {alignment.unshifted_end}, assembly: "{alignment.label}", similar: "{alignment.similar}", misassembled: "{alignment.misassembled}" '.format(**locals())
                        if alignment.name != 'FICTIVE':
                            if len(aligned_assemblies[chr]) < len(
                                    contigs_fpaths
                            ) and alignment.label not in aligned_assemblies[
                                    chr]:
                                aligned_assemblies[chr].append(alignment.label)
                            data_str += ', structure: ['
                            for el in alignment.misassembled_structure:
                                # List elements are alignment records, string
                                # elements are misassembly descriptions; any
                                # other element is ignored.
                                if isinstance(el, list):
                                    if el[5] in contigs:
                                        num_chr = contigs.index(el[5])
                                        corr_len = sum(chr_lengths[:num_chr +
                                                                   1])
                                    else:
                                        # The record refers to a sequence
                                        # outside of this plot.
                                        corr_len = -int(el[1])
                                        if contigs_analyzer.ref_labels_by_chromosomes and el[
                                                5] not in used_chromosomes:
                                            used_chromosomes.append(el[5])
                                            new_chr = contig_names_by_refs[
                                                el[5]]
                                            links_to_chromosomes.append(
                                                'links_to_chromosomes["{el[5]}"] = "{new_chr}";\n'
                                                .format(**locals()))
                                    corr_start = corr_len + int(el[0])
                                    corr_end = corr_len + int(el[1])
                                    data_str += '{{type: "A", corr_start: {corr_start}, corr_end: {corr_end}, start: {el[0]}, end: {el[1]}, start_in_contig: {el[2]}, end_in_contig: {el[3]}, IDY: {el[4]}, chr: "{el[5]}"}},'.format(
                                        **locals())
                                elif isinstance(el, str):
                                    data_str += '{{type: "M", mstype: "{el}"}},'.format(
                                        **locals())
                            # Close the structure list, dropping the trailing
                            # comma if at least one element was written.
                            if data_str[-1] == '[':
                                data_str = data_str + ']},'
                            else:
                                data_str = data_str[:-1] + ']},'
                        else:
                            data_str += '},'
            # Replace the last character (trailing comma) with the closer.
            data_str = data_str[:-1] + '];\n\n'
            result.write(data_str)
            if contigs_analyzer.ref_labels_by_chromosomes:
                result.write(''.join(links_to_chromosomes))
            if cov_fpath:
                # adding coverage data
                data_str = 'var coverage_data = {};\n'
                if cov_data[chr]:
                    data_str += 'coverage_data["{chr}"] = [ '.format(
                        **locals())
                    for e in cov_data[chr]:
                        data_str += '{e},'.format(**locals())
                        # Flush long chunks, but keep the final chunk so its
                        # trailing comma can be stripped after the loop.
                        if len(data_str) > 10000 and e != cov_data[chr][-1]:
                            result.write(data_str)
                            data_str = ''
                    data_str = data_str[:-1] + '];\n'
                    result.write(data_str)
                    data_str = ''

                data_str = 'var not_covered = {};\n'
                data_str += 'not_covered["{chr}"] = [ '.format(**locals())
                if len(not_covered[chr]) > 0:
                    for e in not_covered[chr]:
                        data_str += '{e},'.format(**locals())
                        # Same chunked flush as above.  The guard has to look
                        # at this list's own last element: comparing against
                        # cov_data tested the wrong list and could raise
                        # IndexError when cov_data[chr] was empty.
                        if len(data_str) > 10000 and e != not_covered[chr][-1]:
                            result.write(data_str)
                            data_str = ''
                    data_str = data_str[:-1]
                data_str += '];\n'
                result.write(data_str)
                data_str = ''

            # From here on ``result`` is rebound to the HTML page; the data
            # file is still closed by the enclosing ``with`` statement.
            with open(html_saver.get_real_path('_chr_templ.html'),
                      'r') as template:
                with open(
                        os.path.join(output_all_files_dir_path,
                                     '_{short_chr}.html'.format(**locals())),
                        'w') as result:
                    for line in template:
                        # The empty script tag of the template is replaced by
                        # a reference to this chromosome's data file.
                        if line.find(
                                '<script type="text/javascript" src=""></script>'
                        ) != -1:
                            result.write(
                                '<script type="text/javascript" src="data_{short_chr}.js"></script>\n'
                                .format(**locals()))
                        else:
                            result.write(line)
                            if line.find('<body>') != -1:
                                chr_size = chr_sizes[chr]
                                chr_name = chr.replace('_', ' ')
                                if len(chr_name) > 50:
                                    chr_name = chr_name[:50] + '...'
                                title = 'CONTIG ALIGNMENT BROWSER: %s (' % chr_name + (
                                    '%s fragments, ' % num_contigs[chr]
                                    if num_contigs[chr] > 1 else ''
                                ) + '%s bp)' % format_long_numbers(chr_size)
                                result.write(
                                    '<div class = "block title"><a href="../{summary_fname}"><button class="back_button">&crarr;</button></a>{title}</div>\n'
                                    .format(**locals()))
                            if line.find(
                                    '<script type="text/javascript">') != -1:
                                chromosome = '","'.join(contigs)
                                result.write(
                                    'var CHROMOSOME = "{chr}";\n'.format(
                                        **locals()))
                                result.write(
                                    'var chrContigs = ["{chromosome}"];\n'.
                                    format(**locals()))

    with open(html_saver.get_real_path('alignment_summary_templ.html'),
              'r') as template:
        with open(summary_path, 'w') as result:
            num_aligned_assemblies = [
                len(aligned_assemblies[chr]) for chr in chr_full_names
            ]
            # True when the plots differ in how many assemblies aligned to
            # them; the per-row assembly count column is shown only then.
            is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
            for line in template:
                result.write(line)
                if line.find('<!--- assemblies: ---->') != -1:
                    if not is_unaligned_asm_exists:
                        result.write(
                            '<div class="subtitle"># assemblies: %s</div>' %
                            len(contigs_fpaths))
                if line.find('<!--- th_assemblies: ---->') != -1:
                    if is_unaligned_asm_exists:
                        result.write('<th># assemblies</th>')
                if line.find('<!--- references: ---->') != -1:
                    for chr in sorted(chr_full_names):
                        result.write('<tr>')
                        short_chr = chr[:30]
                        chr_link = os.path.join(
                            alignment_plots_dirname,
                            '_{short_chr}.html'.format(**locals()))
                        chr_name = chr.replace('_', ' ')
                        aligned_lengths = [
                            aligned_len
                            for aligned_len in aligned_bases_by_chr[chr]
                            if aligned_len is not None
                        ]
                        # Aligned bases as a percentage of the plot size
                        # multiplied by the number of assemblies.
                        chr_genome = sum(aligned_lengths) * 100.0 / (
                            chr_sizes[chr] * len(contigs_fpaths))
                        chr_size = chr_sizes[chr]
                        result.write('<td><a href="%s">%s</a></td>' %
                                     (chr_link, chr_name))
                        result.write('<td>%s</td>' % num_contigs[chr])
                        result.write('<td>%s</td>' %
                                     format_long_numbers(chr_size))
                        if is_unaligned_asm_exists:
                            result.write('<td>%s</td>' %
                                         len(aligned_assemblies[chr]))
                        result.write('<td>%.3f</td>' % chr_genome)
                        result.write('<td>%s</td>' % num_misassemblies[chr])
                        result.write('</tr>')

    # Static assets needed by the generated pages.
    copyfile(
        html_saver.get_real_path(
            os.path.join('static', 'contig_alignment_plot.css')),
        os.path.join(output_all_files_dir_path, 'contig_alignment_plot.css'))
    copyfile(html_saver.get_real_path(os.path.join('static', 'd3.js')),
             os.path.join(output_all_files_dir_path, 'd3.js'))
    copyfile(
        html_saver.get_real_path(
            os.path.join('static', 'scripts',
                         'contig_alignment_plot_script.js')),
        os.path.join(output_all_files_dir_path,
                     'contig_alignment_plot_script.js'))