def mongodb_command(verbose, taxonomy, no_cache, indent, progress, gff_file,
                    output_file):
    """Write every annotation of a GFF file as a MongoDB document.

    Each annotation parsed from *gff_file* is converted with its
    ``to_mongodb`` method, encoded as ASCII and written to *output_file*,
    followed by an ASCII-encoded newline.

    Arguments:
        verbose: if true the log level is DEBUG, otherwise INFO
        taxonomy: passed to ``taxon.Taxonomy`` when not None; the resulting
            object is bound to ``taxon.get_lineage`` to build the lineage
            function handed to ``to_mongodb``
        no_cache: when true (and *taxonomy* is given) the lineage function
            is wrapped with ``simple_cache.memoize``
        indent: forwarded to ``to_mongodb`` as ``indent``
        progress: if true the annotation iterator is wrapped with ``tqdm``
        gff_file: input passed to ``gff.parse_gff``
        output_file: handle written to with bytes
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    LOG.info(
        'Writing to file (%s)',
        getattr(output_file, 'name', repr(output_file))
    )

    lineage_func = None
    if taxonomy is not None:
        taxonomy = taxon.Taxonomy(taxonomy)
        lineage_func = functools.partial(taxon.get_lineage, taxonomy)
        # NOTE(review): a flag called `no_cache` switches caching *on* here;
        # presumably the CLI option stores False when passed - confirm
        # against the option definition before changing.
        if no_cache:
            LOG.info('Using cached calls to lineage')
            lineage_func = simple_cache.memoize(lineage_func)

    annotations = gff.parse_gff(gff_file)
    if progress:
        annotations = tqdm(annotations)

    line_end = '\n'.encode('ascii')
    for annotation in annotations:
        document = annotation.to_mongodb(
            lineage_func=lineage_func,
            indent=indent
        )
        output_file.write(document.encode('ascii'))
        output_file.write(line_end)
def add_fields(verbose, attributes, overwrite, only_edited, uids, input_file,
               output_file):
    """Set attribute/value pairs on the annotations of a GFF file.

    Arguments:
        verbose: if true the log level is DEBUG, otherwise INFO
        attributes: sequence of ``(attribute, value)`` pairs; the script
            exits with code 1 if an attribute name appears more than once
        overwrite: if false, attributes an annotation already has are left
            untouched
        only_edited: if true, only annotations that had at least one
            attribute set are written
        uids: optional iterable of lines, each stripped and used as a `uid`;
            when given, annotations with other uids are not edited
        input_file: input passed to ``gff.parse_gff``
        output_file: handle passed to ``Annotation.to_file``
    """
    logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    if overwrite:
        LOG.info("Attributes/Values will be overwritten")

    names = [pair[0] for pair in attributes]
    LOG.info("Fields to add/change: %s", ', '.join(names))

    # a set collapses repeated names, so a size mismatch means duplicates
    if len(set(names)) != len(attributes):
        utils.exit_script("Found duplicates attributes to edit", 1)

    if uids is not None:
        uids = set(line.strip() for line in uids)
        LOG.info("Number of `uid` passed: %d", len(uids))

    for annotation in gff.parse_gff(input_file):
        selected = (uids is None) or (annotation.uid in uids)
        if not selected:
            # not a target: pass it through unless only edits are wanted
            if not only_edited:
                annotation.to_file(output_file)
            continue

        edited = False
        for attribute, value in attributes:
            if overwrite or not annotation.has_attr(attribute):
                annotation.set_attr(attribute, value)
                edited = True

        if edited or (not only_edited):
            annotation.to_file(output_file)
def coverage_command(verbose, reference, json_out, strand_specific, rename,
                     progress, gff_file, output_file):
    """Write the coverage yielded by ``gff.annotation_coverage_sorted``.

    Arguments:
        verbose: if true the log level is DEBUG, otherwise INFO
        reference: FASTA input, loaded with ``fasta.load_fasta_rename`` when
            *rename* is true and ``fasta.load_fasta`` otherwise
        json_out: if true, results are collected in a dictionary keyed by
            sequence id and dumped once as JSON (indent of 4); otherwise one
            tab separated line (sequence id, strand, coverage) is written
            per result, with "NA" in place of a None strand
        strand_specific: forwarded as ``strand`` to
            ``gff.annotation_coverage_sorted``
        rename: selects the FASTA loader (see *reference*)
        progress: if true the result iterator is wrapped with ``tqdm``
        gff_file: input passed to ``gff.parse_gff``
        output_file: handle written to with ASCII-encoded bytes
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    if rename:
        records = fasta.load_fasta_rename(reference)
    else:
        records = fasta.load_fasta(reference)
    sequences = dict(records)

    results = gff.annotation_coverage_sorted(
        gff.parse_gff(gff_file),
        sequences,
        strand=strand_specific
    )
    if progress:
        results = tqdm(results)

    contig_coverage = {}
    for seq_id, strand, coverage in results:
        if json_out:
            contig_coverage[seq_id] = coverage
            continue
        strand_label = "NA" if strand is None else strand
        line = "{}\t{}\t{}\n".format(seq_id, strand_label, coverage)
        output_file.write(line.encode('ascii'))

    if json_out:
        payload = json.dumps(contig_coverage, indent=4)
        output_file.write(payload.encode('ascii'))
def sequence_command(verbose, reverse, no_wrap, split, reference, progress,
                     gff_file, fasta_file):
    """Write the sequences yielded by ``gff.extract_nuc_seqs`` as FASTA.

    Arguments:
        verbose: if true the log level is DEBUG, otherwise INFO
        reverse: forwarded as ``reverse`` to ``gff.extract_nuc_seqs``
        no_wrap: if true sequences are written with ``wrap=None``, otherwise
            with ``wrap=60``
        split: if true, reference headers are cut at the first space before
            being used as keys
        reference: FASTA input loaded with ``fasta.load_fasta``; the script
            exits with code 1 when it is None
        progress: if true the sequence iterator is wrapped with ``tqdm``
        gff_file: input passed to ``gff.parse_gff``
        fasta_file: handle passed to ``fasta.write_fasta_sequence``
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    if reference is None:
        utils.exit_script('A fasta reference file is required', 1)

    wrap = None if no_wrap else 60

    seqs = {}
    for header, sequence in fasta.load_fasta(reference):
        key = header.split(' ')[0] if split else header
        seqs[key] = sequence

    annotations = gff.parse_gff(gff_file, gff_type=gff.from_gff)
    extracted = gff.extract_nuc_seqs(annotations, seqs, reverse=reverse)
    if progress:
        extracted = tqdm(extracted)

    for name, seq in extracted:
        fasta.write_fasta_sequence(fasta_file, name, seq, wrap=wrap)
def gtf_command(verbose, gene_id, gff_file, gtf_file):
    """Write each annotation of a GFF file in GTF form.

    Arguments:
        verbose: if true the log level is DEBUG, otherwise INFO
        gene_id: forwarded as ``gene_id_attr`` to ``Annotation.to_gtf``
        gff_file: input passed to ``gff.parse_gff``
        gtf_file: handle written to with ASCII-encoded bytes
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    target_name = getattr(gtf_file, 'name', repr(gtf_file))
    LOG.info('Writing to file (%s)', target_name)

    for annotation in gff.parse_gff(gff_file):
        record = annotation.to_gtf(gene_id_attr=gene_id)
        gtf_file.write(record.encode('ascii'))
def test_Annotation_to_file(gff_file, tmpdir):
    """An annotation written with ``to_file`` compares equal when re-read."""
    written = gff.from_gff(gff_file[0])

    out_path = (tmpdir / 'test-write.gff').strpath
    handle = open_file(out_path, 'wb')
    written.to_file(handle)
    # close before reading so the content is flushed to disk
    handle.close()

    reloaded = next(gff.parse_gff(out_path))
    assert written == reloaded
def view_fields(verbose, num_ann, gff_file, txt_file):
    """Write the sorted field names found in the annotations of a GFF file.

    Field names are the keys of ``Annotation.to_dict``; they are written one
    per line, with a trailing newline.

    Arguments:
        verbose: if true the log level is DEBUG, otherwise INFO
        num_ann: when greater than 0, reading stops after this many
            annotations; otherwise every annotation is read
        gff_file: input passed to ``gff.parse_gff``
        txt_file: handle the names are written to
    """
    logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    fields = set()
    for seen, annotation in enumerate(gff.parse_gff(gff_file), start=1):
        fields.update(annotation.to_dict())
        if num_ann > 0 and seen == num_ann:
            break

    txt_file.write('\n'.join(sorted(fields)) + '\n')
def test_split_gff_file2(tmpdir, shared_datadir):
    """Splitting a GFF file in 2 keeps the total number of annotations."""
    source = str(shared_datadir / 'test.gff')
    name_pattern = (tmpdir / 'test{}.gff').strpath

    gff.split_gff_file(source, name_pattern, 2)

    parts = [
        str(path)
        for path in pathlib.Path(tmpdir.strpath).glob('*.gff')
    ]

    expected = sum(1 for _ in gff.parse_gff(source))
    found = sum(1 for _ in gff.parse_gff_files(parts))
    assert expected == found
def remove_fields(verbose, attributes, uids, input_file, output_file):
    """Delete attributes from the annotations of a GFF file.

    Every annotation is written to *output_file*; attributes are deleted
    only from annotations whose `uid` was requested (or from all of them
    when *uids* is None).

    Arguments:
        verbose: if true the log level is DEBUG, otherwise INFO
        attributes: iterable of attribute names, stripped and de-duplicated
        uids: optional iterable of lines, each stripped and used as a `uid`
        input_file: input passed to ``gff.parse_gff``
        output_file: handle passed to ``Annotation.to_file``
    """
    logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    attributes = set(name.strip() for name in attributes)
    LOG.info("Fields to remove: %s", ', '.join(attributes))

    if uids is not None:
        uids = set(line.strip() for line in uids)
        LOG.info("Number of `uid` passed: %d", len(uids))

    for annotation in gff.parse_gff(input_file):
        if (uids is None) or (annotation.uid in uids):
            for attribute in attributes:
                annotation.del_attr(attribute)
        annotation.to_file(output_file)
def add_fields_from_table(verbose, key, attribute, only_edited, skip_rows,
                          separator, comment, table_file, key_index,
                          attr_index, input_file, output_file):
    """Set an attribute on annotations using values looked up in a table.

    The table is read with ``text_to_dict`` into a dictionary. For each
    annotation the value of its *key* attribute is looked up in that
    dictionary and, when found, stored in *attribute*.

    Annotations that lack the *key* attribute, or whose key value is not in
    the table, are left unchanged: they are written as they are unless
    *only_edited* is true, in which case they are skipped.

    Arguments:
        verbose: if true the log level is DEBUG, otherwise INFO
        key: name of the annotation attribute used for the lookup
        attribute: name of the attribute to set
        only_edited: if true, only changed annotations are written
        skip_rows: forwarded as ``skip_lines`` to ``text_to_dict``
        separator: forwarded as ``sep`` to ``text_to_dict``
        comment: forwarded as ``skip_comment`` to ``text_to_dict``
        table_file: table input, opened with ``open_file``
        key_index: forwarded as ``key_index`` to ``text_to_dict``
        attr_index: forwarded as ``value_index`` to ``text_to_dict``
        input_file: input passed to ``gff.parse_gff``
        output_file: handle passed to ``Annotation.to_file``
    """
    logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    LOG.info("Key used is '%s' and attribute '%s'", key, attribute)
    LOG.info("N. rows skipped '%d' Key index is '%d' and attribute index '%d'",
             skip_rows, key_index, attr_index)

    if getattr(table_file, 'name', None) is not None:
        LOG.info("Reading values from (%s)", table_file.name)

    fields = dict(
        text_to_dict(
            open_file(table_file),
            skip_lines=skip_rows,
            sep=separator,
            key_index=key_index,
            value_index=attr_index,
            encoding='ascii',
            skip_empty=True,
            skip_comment=comment
        )
    )

    changed = 0

    for annotation in gff.parse_gff(input_file):
        try:
            key_ann_value = annotation.get_attr(key)
        except gff.AttributeNotFound:
            # Without the key there is nothing to look up. Falling through
            # would reuse the key of the previous annotation (or fail on an
            # unbound name for the first one), so stop here.
            if not only_edited:
                annotation.to_file(output_file)
            continue

        try:
            new_value = fields[key_ann_value]
        except KeyError:
            # key value not present in the table: annotation is unchanged
            if not only_edited:
                annotation.to_file(output_file)
            continue

        annotation.set_attr(attribute, new_value)
        changed += 1
        annotation.to_file(output_file)

    LOG.info('Changed %d annotations', changed)
def print_fields(verbose, header, keep_empty, attributes, separator,
                 input_file, output_file):
    """Write selected attribute values of each annotation as a table row.

    Arguments:
        verbose: if true the log level is DEBUG, otherwise INFO
        header: if true, a first row with the attribute names is written
        keep_empty: if true a missing attribute is written as an empty
            string; if false an annotation missing any attribute produces
            no row
        attributes: iterable of attribute names, stripped before use
        separator: string placed between the values of a row
        input_file: input passed to ``gff.parse_gff``
        output_file: handle the rows are written to
    """
    logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    attributes = [name.strip() for name in attributes]
    LOG.info("Fields to print: %s", ', '.join(attributes))

    if header:
        output_file.write('{}\n'.format(separator.join(attributes)))

    for annotation in gff.parse_gff(input_file):
        row = []
        for attribute in attributes:
            try:
                value = annotation.get_attr(attribute)
            except gff.AttributeNotFound:
                if not keep_empty:
                    # leaves the row short, so it is not written below
                    continue
                value = ''
            row.append(value)

        # a complete row has one value per requested attribute
        if len(row) == len(attributes):
            output_file.write('{}\n'.format(separator.join(row)))
def filter_taxa_command(verbose, table, taxonomy, include_taxon_id,
                        include_taxon_name, exclude_taxon_id,
                        exclude_taxon_name, progress, input_file,
                        output_file):
    """Filter a GFF file or an accession/taxon table by taxon.

    Include and exclude lists are built from the validated ids and names.
    A record is written only if it passes the include filter (when one is
    defined) and the exclude filter (when one is defined).

    Arguments:
        verbose: if true the log level is DEBUG, otherwise INFO
        table: if true *input_file* is read with
            ``blast.parse_accession_taxa_table`` and tab separated
            ``accession, taxon_id`` lines are written; otherwise it is read
            as GFF and annotations without a ``taxon_id`` are dropped
        taxonomy: passed to ``taxon.Taxonomy``
        include_taxon_id, include_taxon_name: taxa to include
        exclude_taxon_id, exclude_taxon_name: taxa to exclude
        progress: if true the iterator is wrapped with ``tqdm``
        input_file: input to read
        output_file: handle to write to
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    LOG.info(
        'Writing to file (%s)',
        getattr(output_file, 'name', repr(output_file))
    )

    taxonomy = taxon.Taxonomy(taxonomy)

    exclude_ids = (
        validate_taxon_ids(exclude_taxon_id, taxonomy) |
        validate_taxon_names(exclude_taxon_name, taxonomy)
    )
    include_ids = (
        validate_taxon_ids(include_taxon_id, taxonomy) |
        validate_taxon_names(include_taxon_name, taxonomy)
    )

    exclude_func = None
    if exclude_ids:
        LOG.info("Excluding Taxa: %s", exclude_ids)
        exclude_func = memoize(
            functools.partial(
                filter_taxon_by_id_list,
                filter_list=exclude_ids,
                exclude=True,
                func=functools.partial(taxon.is_ancestor, taxonomy)
            )
        )

    include_func = None
    if include_ids:
        LOG.info("Only include Taxa: %s", include_ids)
        include_func = memoize(
            functools.partial(
                filter_taxon_by_id_list,
                filter_list=include_ids,
                exclude=False,
                func=functools.partial(taxon.is_ancestor, taxonomy)
            )
        )

    def passes(taxon_id):
        # include is evaluated first, exclude only if include let it through
        if (include_func is not None) and (not include_func(taxon_id)):
            return False
        if (exclude_func is not None) and (not exclude_func(taxon_id)):
            return False
        return True

    if table:
        records = blast.parse_accession_taxa_table(
            input_file, key=0, value=1, num_lines=None
        )
        if progress:
            records = tqdm(records)
        for acc_id, taxon_id in records:
            if passes(taxon_id):
                line = "{}\t{}\n".format(acc_id, taxon_id)
                output_file.write(line.encode('ascii'))
    else:
        records = gff.parse_gff(input_file)
        if progress:
            records = tqdm(records)
        for annotation in records:
            if annotation.taxon_id is None:
                continue
            if passes(annotation.taxon_id):
                annotation.to_file(output_file)
def lca_contig_command(verbose, taxonomy, no_lca, only_ranked, bitscore,
                       rename, sorted, feat_type, reference, simple_table,
                       krona_total, out_format, progress, gff_file,
                       output_file):
    """Find the last common ancestor (LCA) of the annotations of each contig.

    Annotations are grouped by ``seq_id`` and, for each group, the LCA of
    their ``taxon_id`` is computed with
    ``taxon.last_common_ancestor_multiple``. The result is written in the
    format selected by *out_format* ('gff', 'krona', 'json' or 'tab').

    Arguments:
        verbose: if true the log level is DEBUG, otherwise INFO
        taxonomy: passed to ``taxon.Taxonomy``
        no_lca: optional handle; contigs for which no LCA is found are
            reported there with ``write_no_lca``
        only_ranked: forwarded to the taxon information/output helpers
        bitscore: minimum bitscore; annotations with a lower bitscore are
            ignored, annotations whose bitscore is None are kept
        rename: if true the reference is loaded with
            ``fasta.load_fasta_rename`` instead of ``fasta.load_fasta``
        sorted: if true the input is assumed sorted and grouped with
            ``gff.group_annotations_sorted`` (the name shadows the builtin
            inside this function; kept because it is part of the interface)
        feat_type: forwarded to ``write_lca_gff``
        reference: optional FASTA input; required for the 'gff' format
            (the script exits with code 3 otherwise)
        simple_table: with the 'tab' format, selects
            ``write_lca_tab_simple`` over ``write_lca_tab``
        krona_total: with the 'krona' format and no reference, 'Unknown'
            lines are written for each index from the number of processed
            contigs up to this value
        out_format: output format
        progress: if true the groups iterator is wrapped with ``tqdm``
        gff_file: input passed to ``gff.parse_gff``
        output_file: handle written to
    """
    mgkit.logger.config_log(level=logging.DEBUG if verbose else logging.INFO)

    LOG.info(
        'Writing to file (%s)',
        getattr(output_file, 'name', repr(output_file))
    )

    if (out_format == 'gff') and (reference is None):
        utils.exit_script(
            'The output format *gff* requires a FASTA file, passed with -r',
            3
        )

    if no_lca is not None:
        LOG.info(
            "Writing contigs/taxon_ids of contigs with no LCA to (%s)",
            getattr(no_lca, 'name', repr(no_lca))
        )

    taxonomy = taxon.Taxonomy(taxonomy)

    if reference is not None:
        seqs = dict(
            fasta.load_fasta_rename(reference) if rename
            else fasta.load_fasta(reference)
        )
    else:
        seqs = None

    # basic filter for the presence of a taxon_id and bitscore
    annotations_iter = (
        annotation
        for annotation in gff.parse_gff(gff_file)
        # only use annotations whose bitscore pass the filter and have a
        # taxon_id. The None test must come first: on Python 3 ordering
        # None against a number raises TypeError.
        if ((annotation.bitscore is None) or
            (annotation.bitscore >= bitscore)) and
        (annotation.taxon_id is not None) and
        # redundant probably, but used in cases when a taxon_id was deleted
        # from the taxonomy
        (annotation.taxon_id in taxonomy)
    )

    if sorted:
        LOG.info("Input GFF is assumed sorted")
        annotations = gff.group_annotations_sorted(
            annotations_iter,
            lambda annotation: annotation.seq_id
        )
    else:
        # groups the annotations by sequence, in case they're not sorted
        annotations = viewvalues(
            gff.group_annotations(
                annotations_iter,
                lambda annotation: annotation.seq_id
            )
        )

    count = 0
    lca_dict = {}

    if progress:
        annotations = tqdm(annotations)

    # contigs seen in the GFF, used to report the remaining ones in krona
    assigned_contigs = set()

    for seq_ann in annotations:
        count += 1
        seq_id = seq_ann[0].seq_id
        assigned_contigs.add(seq_id)

        if seqs is None:
            base_pairs = None
        else:
            base_pairs = len(seqs[seq_id])

        try:
            taxon_id = taxon.last_common_ancestor_multiple(
                taxonomy,
                (annotation.taxon_id for annotation in seq_ann)
            )
        except taxon.NoLcaFound as error:
            LOG.warning("No LCA found for %s (%s)", seq_id, error)
            if no_lca is not None:
                write_no_lca(
                    no_lca,
                    seq_id,
                    (annotation.taxon_id for annotation in seq_ann),
                    extra=set(
                        taxonomy.get_ranked_taxon(
                            annotation.taxon_id, 'phylum'
                        ).s_name
                        for annotation in seq_ann
                    )
                )
            if out_format == 'krona':
                write_krona(output_file, taxonomy, None, False,
                            base_pairs=base_pairs)
            continue

        taxon_name, lineage = get_taxon_info(taxonomy, taxon_id, only_ranked)

        if out_format == 'gff':
            write_lca_gff(output_file, seq_id, seqs[seq_id], taxon_id,
                          taxon_name, lineage, feat_type)
        elif out_format == 'krona':
            write_krona(output_file, taxonomy, taxon_id, only_ranked,
                        base_pairs=base_pairs)
        elif out_format == 'json':
            write_json(lca_dict, seq_id, taxonomy, taxon_id, only_ranked)
        elif out_format == 'tab':
            if simple_table:
                write_lca_tab_simple(output_file, seq_id, taxon_id)
            else:
                write_lca_tab(output_file, seq_id, taxon_id, taxon_name,
                              taxonomy[taxon_id].rank, lineage)

    if out_format == 'krona':
        if (krona_total is not None) and (seqs is None):
            # pad up to the requested total with unassigned entries
            for _ in range(count, krona_total):
                output_file.write('Unknown\n'.encode('ascii'))
        elif seqs is not None:
            # reference contigs with no annotation in the GFF
            for seq_id in set(seqs) - assigned_contigs:
                output_file.write(
                    '{}\tUnknown\n'.format(len(seqs[seq_id])).encode('ascii')
                )

    if out_format == 'json':
        output_file.write(json.dumps(lca_dict, indent=4).encode('ascii'))
def dbm_command(verbose, output_dir, gff_file):
    """Create a dbm database from a GFF file and close it.

    Arguments:
        verbose: if true the log level is DEBUG, otherwise INFO
        output_dir: forwarded as second argument to ``dbm.create_gff_dbm``
        gff_file: input passed to ``gff.parse_gff``
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    mgkit.logger.config_log(level=log_level)

    annotations = gff.parse_gff(gff_file)
    database = dbm.create_gff_dbm(annotations, output_dir)
    database.close()