예제 #1
0
def write_assembly_statistics(assembly, outdir):
    """
    Write assembly statistics

    Reads a fasta file, measures the length of every record and writes the
    number of records and the total number of bases to
    ``pilon_assembly_statistics.tab`` inside `outdir`.

    Parameters
    ----------
    assembly : str
        Path to assembly fasta file
    outdir : str
        Path to the output directory

    Returns
    -------
    None
        The statistics are only written to disk.
    """
    assembly_lengths = []

    # The context manager guarantees the fasta handle is closed, even when
    # parsing fails part-way through.
    with open(assembly, mode='rt') as reader:  # TODO: newline=None in Python3
        # groupby alternates between runs of header lines (starting with '>')
        # and runs of sequence lines; only the grouped lines are kept.
        fasta_iter = (
            g for k, g in itertools_groupby(reader,
                                            lambda x: x.startswith('>')))
        for header in fasta_iter:
            # Consume the header line; its text is not needed for the totals.
            next(header)
            # The group right after a header run holds the sequence lines.
            seq = ''.join(s.rstrip('\r\n') for s in next(fasta_iter))
            assembly_lengths.append(len(seq))

    with open(os.path.join(outdir, 'pilon_assembly_statistics.tab'),
              'wt') as writer:
        writer.write('#' + '\t'.join(['contigs', 'bp']) + '\n')
        writer.write(
            '\t'.join(map(str, [len(assembly_lengths),
                                sum(assembly_lengths)])) + '\n')
예제 #2
0
 def kv_items(self):
     """Yield ``(key, value)`` pairs built from ``self.sequence``.

     Without a key function, every item is passed through ``self.val`` and
     paired with its position in the sequence. With a key function,
     consecutive items sharing a key are grouped; the group's items are
     mapped through ``self.val`` and the result is handed to
     ``self.val_postproc``.
     """
     if self.key is None:
         # No grouping requested: the index acts as the key.
         for position, item in enumerate(self.sequence):
             yield position, self.val(item)
         return

     grouped = itertools_groupby(self.sequence, key=self.key)
     for group_key, members in grouped:
         converted = map(self.val, members)
         yield group_key, self.val_postproc(converted)
예제 #3
0
파일: pilon.py 프로젝트: B-UMMI/INNUca
def write_assembly_statistics(assembly, outdir):
    """
    Write assembly statistics

    Reads a fasta file, measures the length of every record and writes the
    number of records and the total number of bases to
    ``pilon_assembly_statistics.tab`` inside `outdir`.

    Parameters
    ----------
    assembly : str
        Path to assembly fasta file
    outdir : str
        Path to the output directory

    Returns
    -------
    None
        The statistics are only written to disk.
    """
    assembly_lengths = []

    # The context manager guarantees the fasta handle is closed, even when
    # parsing fails part-way through.
    with open(assembly, mode='rt') as reader:  # TODO: newline=None in Python3
        # groupby alternates between runs of header lines (starting with '>')
        # and runs of sequence lines; only the grouped lines are kept.
        fasta_iter = (g for k, g in itertools_groupby(reader, lambda x: x.startswith('>')))
        for header in fasta_iter:
            # The next() builtin works on both Python 2 and Python 3, unlike
            # the Python 2-only ``.next()`` method. The header text itself is
            # not needed for the totals, so the line is only consumed.
            next(header)
            # The group right after a header run holds the sequence lines.
            seq = ''.join(s.rstrip('\r\n') for s in next(fasta_iter))
            assembly_lengths.append(len(seq))

    with open(os.path.join(outdir, 'pilon_assembly_statistics.tab'), 'wt') as writer:
        writer.write('#' + '\t'.join(['contigs', 'bp']) + '\n')
        writer.write('\t'.join(map(str, [len(assembly_lengths), sum(assembly_lengths)])) + '\n')
예제 #4
0
def get_sequence_information(fasta_file):
    """
    Parse a fasta file into numbered sequence records

    Every header is passed through ``clean_header``, which is expected to
    return the original header together with a new one; records are stored
    under the new header.

    Parameters
    ----------
    fasta_file : str
        Path to the fasta file to read

    Returns
    -------
    sequence_dict : dict
        Maps the 1-based position of each record to a dict with the keys
        'header' (new header), 'sequence' and 'length'
    headers : dict
        Maps each new header to the original header (as str)
    headers_changed : bool
        True when at least one new header differs from its original

    Raises
    ------
    SystemExit
        When two records end up with the same new header
    """
    headers = {}
    sequence_dict = {}
    headers_changed = False

    # The context manager closes the file on every exit path, including the
    # sys.exit() below and any parsing error.
    with open(fasta_file, mode='rt', newline=None) as reader:
        # groupby alternates between runs of header lines (starting with '>')
        # and runs of sequence lines; only the grouped lines are kept.
        fasta_iter = (
            g for k, g in itertools_groupby(reader,
                                            lambda x: x.startswith('>')))
        for sequence_counter, header in enumerate(fasta_iter, start=1):
            original_header, new_header = clean_header(
                next(header)[1:].rstrip('\r\n'))
            if new_header in headers:
                sys.exit('Found duplicated sequence'
                         ' headers: {original_header}'.format(
                             original_header=original_header))
            # The group right after a header run holds the sequence lines.
            seq = ''.join(s.rstrip('\r\n') for s in next(fasta_iter))
            sequence_dict[sequence_counter] = {
                'header': new_header,
                'sequence': seq,
                'length': len(seq)
            }
            headers[new_header] = str(original_header)
            if new_header != original_header:
                headers_changed = True

    return sequence_dict, headers, headers_changed
예제 #5
0
파일: mlst.py 프로젝트: B-UMMI/INNUca
def clean_novel_alleles(novel_alleles, scheme_mlst, profile):
    """
    Clean the fasta file with the novel alleles produced by mlst

    Keeps only the sequences whose header starts with `scheme_mlst` and whose
    gene has an allele flagged with '~' in `profile`. The input file is
    always removed and is rewritten only when at least one sequence is kept.
    A missing or unreadable file is reported on stdout instead of raising.

    Parameters
    ----------
    novel_alleles : str
        Path for fasta file containing the novel alleles
    scheme_mlst : str
        MLST schema found by mlst
    profile : list
        List of strings with the profile found
    Returns
    -------
    None
        The fasta file is modified in place.
    """
    unknown_genes = []
    for gene_allele in profile:
        gene = gene_allele.split('(')[0]
        try:
            allele = gene_allele.split('(')[1].rstrip(')')
        except IndexError as e:
            # Profile entry without an "(allele)" part.
            print('WARNING: {}'.format(e))
        else:
            if allele.startswith('~'):
                unknown_genes.append(gene)

    try:
        novel_alleles_keep = {}
        if unknown_genes:
            # The context manager closes the reader even if parsing fails.
            with open(novel_alleles, mode='rt') as reader:  # TODO: newline=None in Python3
                # groupby alternates between runs of header lines (starting
                # with '>') and runs of sequence lines.
                fasta_iter = (g for k, g in itertools_groupby(reader, lambda x: x.startswith('>')))
                for header_lines in fasta_iter:
                    # The next() builtin works on both Python 2 and Python 3,
                    # unlike the Python 2-only ``.next()`` method.
                    header = next(header_lines)[1:].rstrip('\r\n')
                    seq = ''.join(s.rstrip('\r\n') for s in next(fasta_iter))
                    if header.startswith(scheme_mlst):
                        # The gene name sits between the first '.' and the '~'.
                        gene = header.split('.')[1].split('~')[0]
                        if gene in unknown_genes:
                            novel_alleles_keep[header] = seq

        os.remove(novel_alleles)

        if novel_alleles_keep:
            with open(novel_alleles, 'wt') as writer:
                for header, seq in novel_alleles_keep.items():
                    writer.write('>{}\n'.format(header))
                    writer.write('\n'.join(utils.chunkstring(seq, 80)) + '\n')
    except OSError as e:  # TODO: FileNotFoundError in Python3
        print('An unknown ST was found but no novel alleles fasta file was produced by mlst software:\n'
              '{}'.format(e))
예제 #6
0
def clean_novel_alleles(novel_alleles, scheme_mlst, profile):
    """
    Clean the fasta file with the novel alleles produced by mlst

    Keeps only the sequences whose header starts with `scheme_mlst` and whose
    gene has an allele flagged with '~' in `profile`. The input file is
    always removed and is rewritten only when at least one sequence is kept.

    Parameters
    ----------
    novel_alleles : str
        Path for fasta file containing the novel alleles
    scheme_mlst : str
        MLST schema found by mlst
    profile : list
        List of strings with the profile found
    Returns
    -------
    None
        The fasta file is modified in place.
    """
    unknown_genes = []
    for gene_allele in profile:
        gene = gene_allele.split('(')[0]
        try:
            allele = gene_allele.split('(')[1].rstrip(')')
        except IndexError as e:
            # Profile entry without an "(allele)" part.
            print('WARNING: {}'.format(e))
        else:
            if allele.startswith('~'):
                unknown_genes.append(gene)

    novel_alleles_keep = {}
    if unknown_genes:
        # The context manager closes the reader even if parsing fails.
        with open(novel_alleles,
                  mode='rt') as reader:  # TODO: newline=None in Python3
            # groupby alternates between runs of header lines (starting with
            # '>') and runs of sequence lines.
            fasta_iter = (
                g
                for k, g in itertools_groupby(reader,
                                              lambda x: x.startswith('>')))
            for header_lines in fasta_iter:
                # The next() builtin works on both Python 2 and Python 3,
                # unlike the Python 2-only ``.next()`` method.
                header = next(header_lines)[1:].rstrip('\r\n')
                seq = ''.join(s.rstrip('\r\n') for s in next(fasta_iter))
                if header.startswith(scheme_mlst):
                    # The gene name sits between the first '.' and the '~'.
                    gene = header.split('.')[1].split('~')[0]
                    if gene in unknown_genes:
                        novel_alleles_keep[header] = seq

    os.remove(novel_alleles)

    if novel_alleles_keep:
        with open(novel_alleles, 'wt') as writer:
            for header, seq in novel_alleles_keep.items():
                writer.write('>{}\n'.format(header))
                writer.write('\n'.join(utils.chunkstring(seq, 80)) + '\n')
예제 #7
0
def groupby(inset, keyfunc):
    """Group the items of an unsorted iterable by ``keyfunc``.

    The items are sorted by ``keyfunc`` first so that equal keys become
    adjacent, then handed to ``itertools.groupby``; the result is an iterator
    of ``(key, group_iterator)`` pairs.
    """
    ordered = sorted(inset, key=keyfunc)
    return itertools_groupby(ordered, keyfunc)
예제 #8
0
        css=osp.relpath(collection_css, path)
    )
    with open(path, 'wt', encoding='utf-8') as fp:
        fp.write(render(collection_template, view))

# write out an all collections file
with open(osp.join(collection_path, 'ALL'), 'wt', encoding='utf-8') as fp:
    fp.write(' '.join(all_collections))

# write the index.htmls
# Read the template inside a context manager so its handle is closed right
# away instead of being left to the garbage collector.
with open("src/book-index.mako") as fp:
    idxtemplate = fp.read()
idxpaths = sorted({b["path"] for b in ndx})
start = osp.join(CONTENT, "index.html")
back = start
# Position in idxpaths of the page that follows the one being written.
i = 1
# NOTE(review): groupby only merges adjacent entries and idxpaths[i] is used
# as the "next" page, so this presumably relies on ndx already being ordered
# by "path" - confirm against where ndx is built.
for path, group in itertools_groupby(ndx, lambda v: v["path"]):
    view = dict(
        name="index",
        books=group,
        back=osp.relpath(back, osp.dirname(path)),
        # The last page links forward to the start page.
        next=osp.relpath(idxpaths[i] if i < len(idxpaths)
                         else start, osp.dirname(path)),
        css=osp.relpath(osp.join(OUT, "index.css"), path),
    )
    with open(path, "wt", encoding="utf-8") as fp:
        fp.write(render(idxtemplate, view))
    back = path
    i += 1

# write the word indexes
WOUT = osp.join(CONTENT, "index")