def extract_GO_terms_from_emapper_annotation_file(emapper_annotation_file, output_file):
    """
    Extract per-protein GO terms from an emapper annotation file.

    The annotation file is loaded into a SynDict configured with tab as the
    column separator, "#" as the comment prefix, column 0 as the key and
    column 5 as the value; values are split on commas.
    The resulting dictionary gets a two-column header (protein id, GO terms)
    and is written to ``output_file`` with the header and with split values.

    :param emapper_annotation_file: path to the emapper annotation file
    :param output_file: path of the file to write the extracted table to
    :return: the SynDict holding the extracted GO terms
    """
    parser_options = {
        "key_index": 0,
        "value_index": 5,
        "split_values": True,
        "values_separator": ",",
        "comments_prefix": "#",
        "separator": "\t",
    }
    go_terms = SynDict(filename=emapper_annotation_file, **parser_options)
    go_terms.header = "#protein_id\tGO_terms"
    go_terms.write(output_file, header=True, splited_values=True)

    return go_terms
def extract_predicted_gene_names_from_emapper_annotation_file(emapper_annotation_file,
                                                              output_file):
    """
    Extract per-protein predicted gene names from an emapper annotation file.

    The annotation file is loaded into a SynDict configured with tab as the
    column separator, "#" as the comment prefix, column 0 as the key and
    column 4 as the value; values are split on commas.
    The resulting dictionary gets a two-column header (protein id, predicted
    gene name) and is written to ``output_file`` with the header and with
    split values.

    :param emapper_annotation_file: path to the emapper annotation file
    :param output_file: path of the file to write the extracted table to
    :return: the SynDict holding the extracted predicted gene names
    """
    column_layout = dict(key_index=0,
                         value_index=4,
                         split_values=True,
                         values_separator=",")
    file_format = dict(comments_prefix="#",
                       separator="\t")

    gene_names = SynDict(filename=emapper_annotation_file,
                         key_index=column_layout["key_index"],
                         value_index=column_layout["value_index"],
                         split_values=column_layout["split_values"],
                         values_separator=column_layout["values_separator"],
                         comments_prefix=file_format["comments_prefix"],
                         separator=file_format["separator"])
    gene_names.header = "#protein_id\tpredicted_gene_name"
    gene_names.write(output_file, header=True, splited_values=True)

    return gene_names
def count_miRNA_reads(self, alignment_file, gff_file, output_prefix, annotation_file_type="GTF",
                      min_read_fraction_overlap=1.0, feature_type_to_use=None,
                      attribute_type_to_use=None, sample_name=None, stranded=1):
    """
    Count reads per feature with and without multimapped reads and write
    an adjusted per-feature count.

    ``self.count`` is run twice on the same alignment and annotation: once
    with default multimapping handling and once with
    ``count_multimapped_reads=True``. Both count tables are then parsed
    (key in column 0, integer count in column 6) and, for every feature of
    the table without multimapped reads, the adjusted count is computed as

        ceil(unique + (with_multi - unique) / n)

    where ``n`` is the number of ";"-separated entries in column 1 of the
    table with multimapped reads for that feature.
    NOTE(review): the column layout looks like a featureCounts-style table -
    confirm against the implementation of ``self.count``.

    Files produced (all derived from ``output_prefix``):
        <prefix>.no_multimapped_reads.count    - written by self.count
        <prefix>.with_multimapped_reads.count  - written by self.count
        <prefix>.all_adjusted_reads.count      - adjusted counts, with header

    :param alignment_file: alignment file, passed to self.count
    :param gff_file: annotation file, passed to self.count
    :param output_prefix: prefix of all output files
    :param annotation_file_type: forwarded to self.count
    :param min_read_fraction_overlap: forwarded to self.count
    :param feature_type_to_use: forwarded to self.count
    :param attribute_type_to_use: forwarded to self.count
    :param sample_name: label for the count column of the adjusted table;
        if falsy, the seventh whitespace-separated token of the header of
        the table with multimapped reads is used instead
    :param stranded: forwarded to self.count
    :return: None
    """
    unique_counts_file = "%s.no_multimapped_reads.count" % output_prefix
    multi_counts_file = "%s.with_multimapped_reads.count" % output_prefix
    adjusted_counts_file = "%s.all_adjusted_reads.count" % output_prefix

    # Options shared by both counting runs
    common_count_options = {
        "annotation_file_type": annotation_file_type,
        "min_read_fraction_overlap": min_read_fraction_overlap,
        "feature_type_to_use": feature_type_to_use,
        "attribute_type_to_use": attribute_type_to_use,
        "stranded": stranded,
    }

    self.count(alignment_file, gff_file, unique_counts_file, **common_count_options)
    self.count(alignment_file, gff_file, multi_counts_file,
               count_multimapped_reads=True, **common_count_options)

    unique_counts = SynDict(filename=unique_counts_file, comments_prefix="#",
                            key_index=0, value_index=6, expression=int, header=True)
    multi_counts = SynDict(filename=multi_counts_file, comments_prefix="#",
                           key_index=0, value_index=6, expression=int, header=True)
    # Same table parsed again: column 1 is reduced to the number of its ";"-separated entries
    entries_per_feature = SynDict(filename=multi_counts_file, comments_prefix="#",
                                  header=True, key_index=0, value_index=1,
                                  expression=lambda s: len(s.split(";")))

    if sample_name:
        column_label = sample_name
    else:
        column_label = entries_per_feature.header.split()[6]

    adjusted_counts = SynDict()
    adjusted_counts.header = ".\t%s" % column_label

    for feature_id in unique_counts:
        unique = float(unique_counts[feature_id])
        with_multi = float(multi_counts[feature_id])
        entry_number = float(entries_per_feature[feature_id])

        # Reads gained by allowing multimapping are shared among the entries, then rounded up
        adjusted_counts[feature_id] = int(ceil(unique + (with_multi - unique) / entry_number))

    adjusted_counts.write(adjusted_counts_file, header=True)