Пример #1
0
def mongodb_command(verbose, taxonomy, no_cache, indent, progress, gff_file,
                    output_file):
    """Write each annotation of a GFF file as an ASCII-encoded MongoDB record.

    Every annotation parsed from `gff_file` is converted with its
    `to_mongodb` method, encoded as ASCII and written to `output_file`,
    followed by an encoded newline.

    Args:
        verbose: if true the log level is DEBUG, otherwise INFO.
        taxonomy: value passed to `taxon.Taxonomy`; when None no lineage
            function is handed to `to_mongodb`.
        no_cache: when true the lineage function is wrapped with
            `simple_cache.memoize`.
        indent: forwarded to `to_mongodb` as `indent`.
        progress: if true the annotation iterator is wrapped with `tqdm`.
        gff_file: input passed to `gff.parse_gff`.
        output_file: handle receiving the encoded records.
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    LOG.info('Writing to file (%s)',
             getattr(output_file, 'name', repr(output_file)))

    lineage_func = None
    if taxonomy is not None:
        taxonomy = taxon.Taxonomy(taxonomy)
        lineage_func = functools.partial(taxon.get_lineage, taxonomy)
        # NOTE(review): the memoize wrapper is applied when `no_cache` is
        # truthy, which reads inverted against the flag name - confirm how
        # the command line option maps onto this parameter.
        if no_cache:
            LOG.info('Using cached calls to lineage')
            lineage_func = simple_cache.memoize(lineage_func)

    annotations = gff.parse_gff(gff_file)
    if progress:
        annotations = tqdm(annotations)

    # Encoded once, the value never changes between records
    newline = '\n'.encode('ascii')

    for annotation in annotations:
        record = annotation.to_mongodb(lineage_func=lineage_func,
                                       indent=indent)
        output_file.write(record.encode('ascii'))
        output_file.write(newline)
Пример #2
0
def add_fields(verbose, attributes, overwrite, only_edited, uids, input_file,
               output_file):
    """Set fixed attribute values on the annotations of a GFF file.

    Args:
        verbose: if true the log level is DEBUG, otherwise INFO.
        attributes: sequence of (attribute, value) pairs to set; the script
            exits with code 1 if an attribute name is repeated.
        overwrite: if true, attributes already present are replaced;
            otherwise they are left untouched.
        only_edited: if true, only annotations that had at least one
            attribute set are written.
        uids: optional iterable of lines, each one a `uid`; when given, only
            annotations whose `uid` is listed are edited.
        input_file: input passed to `gff.parse_gff`.
        output_file: handle passed to `Annotation.to_file`.
    """
    logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    if overwrite:
        LOG.info("Attributes/Values will be overwritten")

    names = [pair[0] for pair in attributes]
    LOG.info("Fields to add/change: %s", ', '.join(names))

    if len(set(names)) != len(attributes):
        utils.exit_script("Found duplicates attributes to edit", 1)

    if uids is not None:
        uids = {line.strip() for line in uids}
        LOG.info("Number of `uid` passed: %d", len(uids))

    for annotation in gff.parse_gff(input_file):
        edited = False

        # Annotations outside the requested `uid` set are never edited
        if (uids is None) or (annotation.uid in uids):
            for attribute, value in attributes:
                if overwrite or not annotation.has_attr(attribute):
                    annotation.set_attr(attribute, value)
                    edited = True

        if edited or not only_edited:
            annotation.to_file(output_file)
Пример #3
0
def coverage_command(verbose, reference, json_out, strand_specific, rename,
                     progress, gff_file, output_file):
    """Write the coverage reported for each sequence of a sorted GFF file.

    The values come from `gff.annotation_coverage_sorted`. The output is
    either one tab separated line per yielded item (sequence id, strand or
    "NA", coverage) or, if `json_out` is true, a single JSON dictionary
    mapping the sequence id to its coverage. All output is ASCII-encoded.

    Args:
        verbose: if true the log level is DEBUG, otherwise INFO.
        reference: FASTA input, read with `fasta.load_fasta_rename` if
            `rename` is true, with `fasta.load_fasta` otherwise.
        json_out: selects the JSON output instead of the tab separated one.
        strand_specific: forwarded as `strand` to
            `gff.annotation_coverage_sorted`.
        rename: selects the FASTA loader, see `reference`.
        progress: if true the iterator is wrapped with `tqdm`.
        gff_file: input passed to `gff.parse_gff`.
        output_file: handle receiving the encoded output.
    """
    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    loader = fasta.load_fasta_rename if rename else fasta.load_fasta
    sequences = dict(loader(reference))

    coverage_iter = gff.annotation_coverage_sorted(
        gff.parse_gff(gff_file), sequences, strand=strand_specific)
    if progress:
        coverage_iter = tqdm(coverage_iter)

    # NOTE(review): keyed by sequence id only, so if the same id is yielded
    # more than once (e.g. once per strand) the last value wins - confirm
    contig_coverage = {}

    for seq_id, strand, coverage in coverage_iter:
        if json_out:
            contig_coverage[seq_id] = coverage
            continue
        strand_label = "NA" if strand is None else strand
        line = "{}\t{}\t{}\n".format(seq_id, strand_label, coverage)
        output_file.write(line.encode('ascii'))

    if json_out:
        json_text = json.dumps(contig_coverage, indent=4)
        output_file.write(json_text.encode('ascii'))
Пример #4
0
def sequence_command(verbose, reverse, no_wrap, split, reference, progress,
                     gff_file, fasta_file):
    """Write the nucleotide sequences of the annotations to a FASTA file.

    Args:
        verbose: if true the log level is DEBUG, otherwise INFO.
        reverse: forwarded as `reverse` to `gff.extract_nuc_seqs`.
        no_wrap: if true sequences are written with `wrap=None`, otherwise
            with `wrap=60`.
        split: if true only the part of a reference header before the first
            space is used as sequence id.
        reference: FASTA input with the reference sequences; the script
            exits with code 1 if it is None.
        progress: if true the sequence iterator is wrapped with `tqdm`.
        gff_file: input passed to `gff.parse_gff`.
        fasta_file: handle passed to `fasta.write_fasta_sequence`.
    """
    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    if reference is None:
        utils.exit_script('A fasta reference file is required', 1)

    wrap = None if no_wrap else 60

    seqs = {
        (header.split(' ')[0] if split else header): sequence
        for header, sequence in fasta.load_fasta(reference)
    }

    annotations = gff.parse_gff(gff_file, gff_type=gff.from_gff)
    extracted = gff.extract_nuc_seqs(annotations, seqs, reverse=reverse)

    if progress:
        extracted = tqdm(extracted)

    for name, seq in extracted:
        fasta.write_fasta_sequence(fasta_file, name, seq, wrap=wrap)
Пример #5
0
def gtf_command(verbose, gene_id, gff_file, gtf_file):
    """Convert the annotations of a GFF file to ASCII-encoded GTF lines.

    Args:
        verbose: if true the log level is DEBUG, otherwise INFO.
        gene_id: forwarded as `gene_id_attr` to `Annotation.to_gtf`.
        gff_file: input passed to `gff.parse_gff`.
        gtf_file: handle receiving the encoded GTF output.
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    target_name = getattr(gtf_file, 'name', repr(gtf_file))
    LOG.info('Writing to file (%s)', target_name)

    for annotation in gff.parse_gff(gff_file):
        gtf_line = annotation.to_gtf(gene_id_attr=gene_id)
        gtf_file.write(gtf_line.encode('ascii'))
Пример #6
0
def test_Annotation_to_file(gff_file, tmpdir):
    """An annotation written with `to_file` is read back as an equal one."""
    original = gff.from_gff(gff_file[0])

    out_path = (tmpdir / 'test-write.gff').strpath

    handle = open_file(out_path, 'wb')
    original.to_file(handle)
    # Closed before parsing, so the written line is on disk when read back
    handle.close()

    reloaded = next(gff.parse_gff(out_path))

    assert original == reloaded
Пример #7
0
def view_fields(verbose, num_ann, gff_file, txt_file):
    """Write the sorted names of the fields found in a GFF file.

    The names are the keys of `Annotation.to_dict`, collected over the
    parsed annotations and written one per line.

    Args:
        verbose: if true the log level is DEBUG, otherwise INFO.
        num_ann: number of annotations to inspect; a value that is not
            positive means the whole file is read.
        gff_file: input passed to `gff.parse_gff`.
        txt_file: handle receiving the list of names.
    """
    logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    fields = set()

    # Counting from 1 makes `seen` the number of annotations read so far
    for seen, annotation in enumerate(gff.parse_gff(gff_file), 1):
        fields.update(annotation.to_dict())
        if (num_ann > 0) and (seen == num_ann):
            break

    txt_file.write('\n'.join(sorted(fields)) + '\n')
Пример #8
0
def test_split_gff_file2(tmpdir, shared_datadir):
    """Splitting a GFF file in 2 keeps the total number of annotations."""
    source = str(shared_datadir / 'test.gff')
    name_template = (tmpdir / 'test{}.gff').strpath

    gff.split_gff_file(source, name_template, 2)

    parts = [
        str(part)
        for part in pathlib.Path(tmpdir.strpath).glob('*.gff')
    ]

    expected = sum(1 for _ in gff.parse_gff(source))
    found = sum(1 for _ in gff.parse_gff_files(parts))

    assert expected == found
Пример #9
0
def remove_fields(verbose, attributes, uids, input_file, output_file):
    """Delete attributes from the annotations of a GFF file.

    Every annotation is written to the output, edited or not.

    Args:
        verbose: if true the log level is DEBUG, otherwise INFO.
        attributes: iterable of attribute names; surrounding whitespace is
            stripped and duplicates are collapsed.
        uids: optional iterable of lines, each one a `uid`; when given, only
            annotations whose `uid` is listed are edited.
        input_file: input passed to `gff.parse_gff`.
        output_file: handle passed to `Annotation.to_file`.
    """
    logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    attributes = {name.strip() for name in attributes}

    LOG.info("Fields to remove: %s", ', '.join(attributes))

    if uids is not None:
        uids = {line.strip() for line in uids}
        LOG.info("Number of `uid` passed: %d", len(uids))

    for annotation in gff.parse_gff(input_file):
        # Without a `uid` list every annotation is edited
        if (uids is None) or (annotation.uid in uids):
            for attribute in attributes:
                annotation.del_attr(attribute)

        annotation.to_file(output_file)
Пример #10
0
def add_fields_from_table(verbose, key, attribute, only_edited, skip_rows,
                          separator, comment, table_file, key_index,
                          attr_index, input_file, output_file):
    """Set an attribute on annotations using values looked up in a table.

    The table is loaded with `text_to_dict` into a dictionary. For each
    annotation the value of its `key` attribute is looked up in that
    dictionary and the result is stored in `attribute`.

    An annotation is left unchanged when it lacks the `key` attribute or when
    its key value is not in the table; such annotations are written only if
    `only_edited` is false.

    Args:
        verbose: if true the log level is DEBUG, otherwise INFO.
        key: name of the annotation attribute used as lookup key.
        attribute: name of the annotation attribute to set.
        only_edited: if true, only changed annotations are written.
        skip_rows: forwarded as `skip_lines` to `text_to_dict`.
        separator: forwarded as `sep` to `text_to_dict`.
        comment: forwarded as `skip_comment` to `text_to_dict`.
        table_file: table input, opened with `open_file`.
        key_index: forwarded as `key_index` to `text_to_dict`.
        attr_index: forwarded as `value_index` to `text_to_dict`.
        input_file: input passed to `gff.parse_gff`.
        output_file: handle passed to `Annotation.to_file`.
    """
    logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    LOG.info("Key used is '%s' and attribute '%s'", key, attribute)
    LOG.info("N. rows skipped '%d' Key index is '%d' and attribute index '%d'",
             skip_rows, key_index, attr_index)

    if getattr(table_file, 'name', None) is not None:
        LOG.info("Reading values from (%s)", table_file.name)

    fields = dict(
        text_to_dict(open_file(table_file),
                     skip_lines=skip_rows,
                     sep=separator,
                     key_index=key_index,
                     value_index=attr_index,
                     encoding='ascii',
                     skip_empty=True,
                     skip_comment=comment))

    changed = 0

    for annotation in gff.parse_gff(input_file):
        try:
            new_value = fields[annotation.get_attr(key)]
        except (gff.AttributeNotFound, KeyError):
            # Nothing to set for this annotation. It must be handled here:
            # going on with the lookup would either fail on an undefined key
            # or reuse the key value of the previous annotation.
            if not only_edited:
                annotation.to_file(output_file)
            continue

        annotation.set_attr(attribute, new_value)
        changed += 1
        annotation.to_file(output_file)

    LOG.info('Changed %d annotations', changed)
Пример #11
0
def print_fields(verbose, header, keep_empty, attributes, separator,
                 input_file, output_file):
    """Write the selected attributes of each annotation as a table.

    One line is written per annotation, with the attribute values joined by
    `separator`. An annotation missing any of the attributes is skipped,
    unless `keep_empty` is true, in which case an empty string is used for
    the missing values.

    Args:
        verbose: if true the log level is DEBUG, otherwise INFO.
        header: if true a first line with the attribute names is written.
        keep_empty: see above.
        attributes: iterable of attribute names; surrounding whitespace is
            stripped.
        separator: string used to join names and values.
        input_file: input passed to `gff.parse_gff`.
        output_file: handle receiving the table.
    """
    logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    attributes = [name.strip() for name in attributes]
    LOG.info("Fields to print: %s", ', '.join(attributes))

    if header:
        output_file.write('{}\n'.format(separator.join(attributes)))

    for annotation in gff.parse_gff(input_file):
        values = []
        for attribute in attributes:
            try:
                value = annotation.get_attr(attribute)
            except gff.AttributeNotFound:
                if not keep_empty:
                    continue
                value = ''
            values.append(value)

        # Fewer values than attributes means at least one was missing
        # NOTE(review): the join assumes `get_attr` returns strings - confirm
        if len(values) == len(attributes):
            output_file.write('{}\n'.format(separator.join(values)))
Пример #12
0
def filter_taxa_command(verbose, table, taxonomy, include_taxon_id,
                        include_taxon_name, exclude_taxon_id,
                        exclude_taxon_name, progress, input_file, output_file):
    """Filter a GFF file, or an accession/taxon table, by taxon.

    Two optional filters are built from the validated taxon ids and names,
    both based on `filter_taxon_by_id_list` and `taxon.is_ancestor`. An
    entry is written only if every filter that was built returns a true
    value for its taxon id.

    Args:
        verbose: if true the log level is DEBUG, otherwise INFO.
        table: if true `input_file` is read with
            `blast.parse_accession_taxa_table` and ASCII-encoded tab
            separated lines are written; otherwise it is read as GFF and
            the annotations are written with `to_file`.
        taxonomy: value passed to `taxon.Taxonomy`.
        include_taxon_id: taxon ids checked with `validate_taxon_ids`.
        include_taxon_name: taxon names checked with `validate_taxon_names`.
        exclude_taxon_id: taxon ids checked with `validate_taxon_ids`.
        exclude_taxon_name: taxon names checked with `validate_taxon_names`.
        progress: if true the iterator is wrapped with `tqdm`.
        input_file: the GFF file or table to filter.
        output_file: handle receiving the entries that pass the filters.
    """
    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    LOG.info('Writing to file (%s)',
             getattr(output_file, 'name', repr(output_file)))

    taxonomy = taxon.Taxonomy(taxonomy)

    exclude_ids = (validate_taxon_ids(exclude_taxon_id, taxonomy)
                   | validate_taxon_names(exclude_taxon_name, taxonomy))

    include_ids = (validate_taxon_ids(include_taxon_id, taxonomy)
                   | validate_taxon_names(include_taxon_name, taxonomy))

    exclude_func = None
    if exclude_ids:
        LOG.info("Excluding Taxa: %s", exclude_ids)
        exclude_func = memoize(
            functools.partial(
                filter_taxon_by_id_list,
                filter_list=exclude_ids,
                exclude=True,
                func=functools.partial(taxon.is_ancestor, taxonomy)))

    include_func = None
    if include_ids:
        LOG.info("Only include Taxa: %s", include_ids)
        include_func = memoize(
            functools.partial(
                filter_taxon_by_id_list,
                filter_list=include_ids,
                exclude=False,
                func=functools.partial(taxon.is_ancestor, taxonomy)))

    def is_kept(taxon_id):
        # The include filter is consulted first, then the exclude one
        if (include_func is not None) and (not include_func(taxon_id)):
            return False
        if (exclude_func is not None) and (not exclude_func(taxon_id)):
            return False
        return True

    if table:
        entries = blast.parse_accession_taxa_table(input_file,
                                                   key=0,
                                                   value=1,
                                                   num_lines=None)
        if progress:
            entries = tqdm(entries)

        for acc_id, taxon_id in entries:
            if is_kept(taxon_id):
                line = "{}\t{}\n".format(acc_id, taxon_id)
                output_file.write(line.encode('ascii'))
        return

    annotations = gff.parse_gff(input_file)
    if progress:
        annotations = tqdm(annotations)

    for annotation in annotations:
        # Annotations without a taxon_id are always dropped
        if annotation.taxon_id is None:
            continue
        if is_kept(annotation.taxon_id):
            annotation.to_file(output_file)
Пример #13
0
def lca_contig_command(verbose, taxonomy, no_lca, only_ranked, bitscore,
                       rename, sorted, feat_type, reference, simple_table,
                       krona_total, out_format, progress, gff_file,
                       output_file):
    """Find the last common ancestor (LCA) of the annotations of each contig.

    Annotations are grouped by `seq_id` and, for each group,
    `taxon.last_common_ancestor_multiple` is called on their taxon ids. The
    result is written in the format selected by `out_format`.

    Args:
        verbose: if true the log level is DEBUG, otherwise INFO.
        taxonomy: value passed to `taxon.Taxonomy`.
        no_lca: optional handle passed to `write_no_lca` for the contigs
            for which no LCA is found.
        only_ranked: forwarded to `get_taxon_info`, `write_krona` and
            `write_json`.
        bitscore: minimum bitscore of the annotations used; annotations
            whose bitscore is None are always used.
        rename: if true the reference is read with
            `fasta.load_fasta_rename`, otherwise with `fasta.load_fasta`.
        sorted: if true the GFF is assumed sorted and grouped with
            `gff.group_annotations_sorted`, otherwise with
            `gff.group_annotations`. The name shadows the builtin, it is
            kept because it is part of the interface.
        feat_type: forwarded to `write_lca_gff`.
        reference: optional FASTA input with the contigs; required by the
            *gff* output format (the script exits with code 3 without it).
        simple_table: for the *tab* format, selects `write_lca_tab_simple`
            instead of `write_lca_tab`.
        krona_total: for the *krona* format without a reference, 'Unknown'
            lines are added for the indices from the number of processed
            contigs up to this value.
        out_format: one of 'gff', 'krona', 'json' or 'tab'.
        progress: if true the groups iterator is wrapped with `tqdm`.
        gff_file: input passed to `gff.parse_gff`.
        output_file: handle receiving the output.
    """
    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)
    LOG.info('Writing to file (%s)',
             getattr(output_file, 'name', repr(output_file)))

    if (out_format == 'gff') and (reference is None):
        utils.exit_script(
            'The output format *gff* requires a FASTA file, passed with -r', 3)

    if no_lca is not None:
        LOG.info("Writing contigs/taxon_ids of contigs with no LCA to (%s)",
                 getattr(no_lca, 'name', repr(no_lca)))

    taxonomy = taxon.Taxonomy(taxonomy)

    if reference is not None:
        load_sequences = fasta.load_fasta_rename if rename else \
            fasta.load_fasta
        seqs = dict(load_sequences(reference))
    else:
        seqs = None

    # basic filter for the presence of a taxon_id and bitscore
    annotations_iter = (
        annotation for annotation in gff.parse_gff(gff_file)
        # only use annotations whose bitscore pass the filter and have a
        # taxon_id. The None test must come first: ordering None against a
        # number raises TypeError on Python 3
        if ((annotation.bitscore is None) or (annotation.bitscore >= bitscore))
        and (annotation.taxon_id is not None) and
        # redundant probably, but used in cases when a taxon_id was deleted
        # from the taxonomy
        (annotation.taxon_id in taxonomy))

    if sorted:
        LOG.info("Input GFF is assumed sorted")
        annotations = gff.group_annotations_sorted(
            annotations_iter, lambda annotation: annotation.seq_id)
    else:
        # groups the annotations by sequence, in case they're not sorted
        annotations = viewvalues(
            gff.group_annotations(annotations_iter,
                                  lambda annotation: annotation.seq_id))

    count = 0
    # filled by write_json, dumped at the end for the json format
    lca_dict = {}
    if progress:
        annotations = tqdm(annotations)
    assigned_contigs = set()
    for seq_ann in annotations:
        count += 1
        seq_id = seq_ann[0].seq_id
        assigned_contigs.add(seq_id)
        if seqs is None:
            base_pairs = None
        else:
            base_pairs = len(seqs[seq_id])
        try:
            taxon_id = taxon.last_common_ancestor_multiple(
                taxonomy, (annotation.taxon_id for annotation in seq_ann))
        except taxon.NoLcaFound as error:
            LOG.warning("No LCA found for %s (%s)", seq_id, error)
            if no_lca is not None:
                write_no_lca(no_lca,
                             seq_id,
                             (annotation.taxon_id for annotation in seq_ann),
                             extra=set(
                                 taxonomy.get_ranked_taxon(
                                     annotation.taxon_id, 'phylum').s_name
                                 for annotation in seq_ann))
            if out_format == 'krona':
                write_krona(output_file,
                            taxonomy,
                            None,
                            False,
                            base_pairs=base_pairs)
            continue

        taxon_name, lineage = get_taxon_info(taxonomy, taxon_id, only_ranked)
        if out_format == 'gff':
            write_lca_gff(output_file, seq_id, seqs[seq_id], taxon_id,
                          taxon_name, lineage, feat_type)
        elif out_format == 'krona':
            write_krona(output_file,
                        taxonomy,
                        taxon_id,
                        only_ranked,
                        base_pairs=base_pairs)
        elif out_format == 'json':
            write_json(lca_dict, seq_id, taxonomy, taxon_id, only_ranked)
        elif out_format == 'tab':
            if simple_table:
                write_lca_tab_simple(output_file, seq_id, taxon_id)
            else:
                write_lca_tab(output_file, seq_id, taxon_id, taxon_name,
                              taxonomy[taxon_id].rank, lineage)

    if out_format == 'krona':
        if (krona_total is not None) and (seqs is None):
            # one line for each contig not seen, up to the requested total
            for _ in range(count, krona_total):
                output_file.write('Unknown\n'.encode('ascii'))
        elif seqs is not None:
            # contigs of the reference that had no usable annotation
            for seq_id in set(seqs) - assigned_contigs:
                output_file.write('{}\tUnknown\n'.format(len(
                    seqs[seq_id])).encode('ascii'))

    if out_format == 'json':
        output_file.write(json.dumps(lca_dict, indent=4).encode('ascii'))
Пример #14
0
def dbm_command(verbose, output_dir, gff_file):
    """Store the annotations of a GFF file with `dbm.create_gff_dbm`.

    Args:
        verbose: if true the log level is DEBUG, otherwise INFO.
        output_dir: second argument of `dbm.create_gff_dbm`.
        gff_file: input passed to `gff.parse_gff`.
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    annotations = gff.parse_gff(gff_file)
    database = dbm.create_gff_dbm(annotations, output_dir)
    # The returned object is closed as soon as it has been created
    database.close()