def main(old_gff, new_gff, output_gff, re_construct_features, tmp_identifier):
    """Re-construct features of an updated GFF3 file and write the result.

    Both GFF3 files are parsed with ``Gff3``; the parsed objects are handed
    to ``polypeptide_re_construct`` and then ``re_construct`` (in that
    order), and the updated object is finally written with ``write_gff3``.

    Args:
        old_gff: Path of the original GFF3 file.
        new_gff: Path of the updated GFF3 file.
        output_gff: Path the re-constructed GFF3 file is written to.
        re_construct_features: Optional path of a report file. When falsy,
            no report file is opened and ``report=None`` is passed on.
        tmp_identifier: Forwarded unchanged to both re-construct helpers.
    """
    logger.info('Reading original GFF3 file: (%s)...\n', old_gff)
    old_gff3 = Gff3(gff_file=old_gff)
    logger.info('Reading updated GFF3 file: (%s)...\n', new_gff)
    new_gff3 = Gff3(gff_file=new_gff)

    out_f = open(re_construct_features, 'w') if re_construct_features else None
    try:
        polypeptide_re_construct(old_gff3=old_gff3,
                                 new_gff3=new_gff3,
                                 tmp_identifier=tmp_identifier,
                                 report=out_f)
        re_construct(old_gff3=old_gff3,
                     new_gff3=new_gff3,
                     tmp_identifier=tmp_identifier,
                     report=out_f)
        logger.info('Generating the re-constructed gff3 file: (%s)...\n',
                    output_gff)
        write_gff3(new_gff3, output_gff)
    finally:
        # Close the report even when a helper raises, so the handle is not
        # leaked and whatever was written so far is flushed to disk.
        if out_f is not None:
            out_f.close()
def open_gff_file(gff):
    """Return the plus-strand CDS records of a GFF3 file.

    The file is parsed with ``Gff3``. The first four parsed lines are
    skipped (presumably header/directive lines -- TODO confirm), and only
    records that carry ``seqid``, ``type`` and ``strand`` keys are examined.

    Args:
        gff: GFF3 input accepted by ``Gff3``.

    Returns:
        A list of the parsed line records whose type is ``'CDS'`` and whose
        strand is ``'+'``, in file order. The count is also printed.
    """
    gff_hd = Gff3(gff)
    cds_recs = [
        line for line in gff_hd.lines[4:]
        if 'seqid' in line and 'type' in line and 'strand' in line
        and line['type'] == 'CDS' and line['strand'] == '+'
    ]
    # Single-argument call form: prints the same text under Python 2 and 3
    # (the original print statement was a SyntaxError on Python 3).
    print("number of cds: {}".format(len(cds_recs)))
    return cds_recs
def run_modifier(args):
    """Apply a ``Modifier`` to a GFF file and write out the result.

    Args:
        args: Namespace providing ``annotation`` (passed to ``Modifier``),
            ``gff_path`` (parsed with ``Gff3``) and ``output_path``. When
            ``output_path`` is ``None`` the result goes to standard output.
    """
    annotation_modifier = Modifier(args.annotation)

    print("Reading gff file")
    parsed: Gff3 = Gff3(gff_file=args.gff_path)

    # The modifier edits the parsed GFF object in place.
    annotation_modifier.modify_gff(parsed)

    print("Writing modified gff file")
    destination = args.output_path
    if destination is not None:
        with open(destination, "w") as handle:
            parsed.write(handle)
        return
    parsed.write(sys.stdout)
def parse_genes(gff3_file, go_terms_file):
    """Build a mapping of gene ID to an enriched gene description.

    Every ``gene`` feature of the GFF3 file becomes a dict holding its
    position, chromosome (lower-cased seqid), type, strand, name, aliases,
    isoforms (from ``_parse_isoforms``), GO terms (from the map returned by
    ``_get_go_map``) and a ``suggest`` list made of the gene ID followed by
    the distinct alias symbols.

    The ``Alias`` attribute is read as a flat sequence of alternating
    symbol / full-name entries; a trailing symbol without a partner gets a
    ``full_name`` of ``None``.
    """
    go_map = _get_go_map(go_terms_file)
    gff = Gff3(gff3_file)

    gene_lines = []
    for record in gff.lines:
        if record['line_type'] == 'feature' and record['type'] == 'gene':
            gene_lines.append(record)

    genes_map = {}
    for gene in gene_lines:
        attributes = gene['attributes']
        gene_id = attributes['ID']
        symbol = attributes.get('symbol', None)
        full_name = attributes.get('full_name', None)

        # Pair up (symbol, full_name) entries from the flat Alias sequence.
        raw_aliases = attributes.get('Alias', [])
        alias_count = len(raw_aliases)
        aliases = []
        for pos in range(0, alias_count, 2):
            partner = pos + 1
            aliases.append({
                'symbol': raw_aliases[pos],
                'full_name': raw_aliases[partner] if partner < alias_count else None,
            })

        # The gene's own symbol/full name, when both exist, leads the list.
        if symbol and full_name:
            aliases.insert(0, {'symbol': symbol, 'full_name': full_name})

        gene_dict = {
            'positions': {'gte': gene['start'], 'lte': gene['end']},
            'chr': gene['seqid'].lower(),
            'type': gene['type'],
            'strand': gene['strand'],
            'name': gene_id,
            'aliases': aliases,
            'isoforms': [],
            'GO': go_map.get(gene_id, []),
        }
        gene_dict['isoforms'] = _parse_isoforms(gff, gene)
        genes_map[gene_id] = gene_dict

        distinct_symbols = set([entry['symbol'] for entry in gene_dict['aliases']])
        gene_dict['suggest'] = [gene_id] + list(distinct_symbols)

    return genes_map
def main():
    """Print one TSV line per mRNA feature of the given GFF file.

    Columns: seqid, start, end, strand and ID. The strand of each record is
    rewritten in place to ``'1'`` for ``'+'`` and ``'-1'`` for anything else
    before it is printed.
    """
    arguments = myTools.checkArgs([("gffFile", file)], [], __doc__)
    annotation = Gff3(arguments["gffFile"])

    transcripts = [
        record for record in annotation.lines
        if record['line_type'] == 'feature' and record['type'] == 'mRNA'
    ]

    for record in transcripts:
        record['strand'] = '1' if record['strand'] == "+" else '-1'
        columns = [
            record['seqid'],
            record['start'],
            record['end'],
            record['strand'],
            record['attributes']['ID'],
        ]
        print(myFile.myTSV.printLine(columns), file=sys.stdout)
def main(in_gff, merge_report, out_merge_report, out_gff, uuid_on, prefix,
         digitlen, report, alias):
    """Assign new IDs to the features of a GFF3 file and write it out.

    NOTE(review): the block structure below was reconstructed from a
    whitespace-collapsed source; verify the nesting against the upstream
    file before relying on it.

    Args:
        in_gff: Path of the input GFF3 file (parsed with ``Gff3``).
        merge_report: Optional path of a report produced by gff3_merge.
            When given, ``out_merge_report`` is mandatory; otherwise an
            error is logged and the process exits with status 1.
        out_merge_report: Path the updated merge report is written to.
        out_gff: Path the renamed GFF3 file is written to.
        uuid_on: When truthy, every feature gets a ``uuid.uuid1()`` ID.
            Otherwise IDs are derived from ``prefix`` via ``idgenerator``.
        prefix: ID prefix handed to ``idgenerator`` for root features.
        digitlen: Digit length handed to ``idgenerator`` for root features.
        report: Optional path of an ``Old_ID``/``NewID`` table to write.
        alias: When truthy, the old ID is kept in the ``Alias`` attribute
            (applied to every feature that has an ID in uuid mode, and to
            roots and their direct children in prefix mode).
    """
    logger.info('Reading input gff3 file: (%s)', in_gff)
    gff3 = Gff3(gff_file=in_gff, logger=None)
    if merge_report:
        if not out_merge_report:
            logger.error(
                '-m is given. Please specify the filename of the updated merge report with -om'
            )
            sys.exit(1)
        else:
            logger.info(
                'Reading the update report file generated by gff3_merge program: (%s)',
                merge_report)
            header_lines, log_lines, merge_report_dict = read_merge_report(
                gff3, merge_report)
    # Open the old-ID/new-ID comparison table up front; it is filled in and
    # closed at the very end of this function.
    if report:
        out_report = open(report, 'w')
    # Mapping of old ID -> list of new IDs. Features without an ID are
    # collected under the 'missing' key:
    # ID_dict = {old_ID: [newID, ...], 'missing': [newID1, newID2]}
    ID_dict = {'missing': []}
    # Old IDs in the order they were first mapped (drives the report order).
    ID_order = []
    # Features without a Parent attribute; only filled in prefix mode.
    roots = list()
    logger.info('Generate new ID for features in (%s)', in_gff)
    for line in gff3.lines:
        try:
            if line['line_type'] == 'feature':
                if uuid_on:
                    newID = str(uuid.uuid1())
                    if 'ID' in line['attributes']:
                        if line['attributes']['ID'] in ID_dict:
                            # Old ID seen before: record one more new ID.
                            ID_dict[line['attributes']['ID']].append(newID)
                            if alias:
                                line['attributes']['Alias'] = line['attributes']['ID']
                            line['attributes']['ID'] = newID
                        else:
                            ID_dict[line['attributes']['ID']] = [newID]
                            ID_order.append(line['attributes']['ID'])
                            if alias:
                                line['attributes']['Alias'] = line['attributes']['ID']
                            line['attributes']['ID'] = newID
                    else:
                        ID_dict['missing'].append(newID)
                        line['attributes']['ID'] = newID
                    if 'Parent' in line['attributes']:
                        for index, parent in enumerate(
                                line['attributes']['Parent']):
                            if parent in ID_dict:
                                # Point at the first new ID of the parent.
                                line['attributes']['Parent'][index] = ID_dict[parent][0]
                            else:
                                # Parent not mapped yet: create its new ID
                                # now and register it for later reuse.
                                newID = str(uuid.uuid1())
                                ID_dict[parent] = [newID]
                                ID_order.append(parent)
                                line['attributes']['Parent'][index] = newID
                else:
                    if 'Parent' not in line['attributes']:
                        roots.append(line)
        except KeyError:
            # line_index is zero-based; report a one-based line number.
            logger.warning('[Missing Attributes] Line (%s)',
                           str(line['line_index'] + 1))
    # Prefix mode: walk each root and name its subtree hierarchically.
    # idgenerator returns a dict with the new 'ID' and the counter 'maxnum'.
    IDnumber = 0
    for root in roots:
        newID = idgenerator(prefix, IDnumber, digitlen)
        IDnumber = newID['maxnum']
        ID_dict[root['attributes']['ID']] = [newID['ID']]
        ID_order.append(root['attributes']['ID'])
        if alias:
            root['attributes']['Alias'] = root['attributes']['ID']
        root['attributes']['ID'] = newID['ID']
        children = root['children']
        # One suffix per direct child; consumed in order with pop(0).
        alphabets = alphabets_suffix(len(children))
        for child in children:
            for index, parent in enumerate(child['attributes']['Parent']):
                if parent in ID_dict:
                    child['attributes']['Parent'][index] = newID['ID']
            # Direct children are named '<root new ID>-R<suffix>'.
            newcID = '%s-R%s' % (newID['ID'], alphabets.pop(0))
            ID_dict[child['attributes']['ID']] = [newcID]
            ID_order.append(child['attributes']['ID'])
            if alias:
                child['attributes']['Alias'] = child['attributes']['ID']
            child['attributes']['ID'] = newcID
            collected_list = descendants_list(line_data=child, level=0)
            levellist = level_list(collected_list)
            # Per-child counters, one per feature type.
            IDnumber_dict = dict()
            for item_list in levellist:
                # Sort in reverse when the first feature of a multi-item
                # level lies on the minus strand.
                reverse = False
                if len(item_list) > 1:
                    if item_list[0]['strand'] == '-':
                        reverse = True
                descendant_sort = TypeSort(item_list, dict(), reverse)
                for descend in descendant_sort:
                    # flag records whether this descendant was renamed.
                    flag = False
                    if descend['type'] not in IDnumber_dict:
                        IDnumber_dict[descend['type']] = 0
                    for index, parent in enumerate(
                            descend['attributes']['Parent']):
                        if parent in ID_dict:
                            if flag == True:
                                break
                            if descend['attributes']['ID'] not in ID_dict:
                                # First sighting of this old ID: name it
                                # '<parent new ID>-<type>' plus a counter
                                # produced by idgenerator (digit length 3).
                                deprefix = '%s-%s' % (ID_dict[parent][0],
                                                      descend['type'])
                                newdID = idgenerator(
                                    deprefix, IDnumber_dict[descend['type']], 3)
                                IDnumber_dict[descend['type']] = newdID['maxnum']
                                ID_dict[descend['attributes']['ID']] = [
                                    newdID['ID']
                                ]
                                ID_order.append(descend['attributes']['ID'])
                                descend['attributes']['ID'] = newdID['ID']
                                flag = True
                            if flag == False:
                                # Old ID already mapped: record one more
                                # new ID for it.
                                deprefix = '%s-%s' % (ID_dict[parent][0],
                                                      descend['type'])
                                newdID = idgenerator(
                                    deprefix, IDnumber_dict[descend['type']], 3)
                                IDnumber_dict[descend['type']] = newdID['maxnum']
                                ID_dict[descend['attributes']['ID']].append(
                                    newdID['ID'])
                                descend['attributes']['ID'] = newdID['ID']
                                flag = True
                            descend['attributes']['Parent'][index] = ID_dict[parent][0]
    if merge_report and out_merge_report:
        logger.info(
            'Update report file generated by gff3_merge program with new IDs.')
        with open(out_merge_report, 'w') as out_f:
            for header_line in header_lines:
                out_f.write(header_line + '\n')
            for key in merge_report_dict:
                if key not in ID_order:
                    logger.error(
                        'The report file has to correspond to the gff3 file specified with -g'
                    )
                    sys.exit(1)
                else:
                    for line_num in merge_report_dict[key]:
                        # Column index 4 of the log line holds the
                        # Tmp_OGSv0_ID; replace it with the first new ID.
                        log_lines[line_num][4] = ID_dict[key][0]
            for log_line in log_lines:
                out_f.write('\t'.join(log_line) + '\n')
    logger.info('Write out gff3 file: (%s)', out_gff)
    write_gff3(gff3, out_gff)
    if report:
        # List the IDs generated for ID-less features last.
        ID_order.append('missing')
        logger.info(
            'Generate a report of comparison between old and new IDs: (%s)',
            report)
        out_line = 'Old_ID\tNewID'
        out_report.write(out_line + '\n')
        for key in ID_order:
            for value in ID_dict[key]:
                out_line = '%s\t%s' % (key, value)
                out_report.write(out_line + '\n')
        out_report.close()
#python extract_intron_stat_from_gff.py links18.scaffolds.fa.rnd3_rerun.GAAS.gff test.csv from gff3 import Gff3 import sys #usage: python [input gff file] [output csv file] gff = Gff3(sys.argv[1]) output = sys.argv[2] intron_num = {} exon_length = {} mrna_len = {} intron_len = {} single_intron_len = [] CDS_length = {} for l in gff.lines: if l['type'] == 'mRNA': #if l['type']=='mRNA' and float(l['attributes']['_AED']) <0.5: rna_nam = l['attributes']['Name'] mrna_len[rna_nam] = l['end'] - l['start'] exons = [] CDS_len = 0 for rec in l['children']: exons.append((rec['start'], rec['end'])) if rec['type'] == 'CDS': CDS_len = CDS_len + rec['end'] - rec['start'] + 1 CDS_length[rna_nam] = CDS_len exons_merged = [] for begin, end in sorted(exons): if exons_merged and exons_merged[-1][1] >= begin - 1: exons_merged[-1] = (exons_merged[-1][0], end)
# fix_source.py
# =================
# Loads a GFF3 file with the gff3-py Gff3 class and, for a fixed list of
# feature IDs, switches the source column from ManualCuration to I5K on each
# listed feature and on all of its descendants.
# Requires: https://github.com/hotdogee/gff3-py
from gff3 import Gff3

gff = Gff3('agla_v1_2-NALmod3.gff3')
id_list = [
    'AGLA014663',
    'AGLA003801',
    'AGLA017751',
    'AGLA003809',
    'AGLA000919',
    'AGLA000103',
]
source_map = {'ManualCuration': 'I5K'}


def _remap_source(record):
    """Rewrite record['source'] in place when source_map has an entry for it."""
    current = record['source']
    if current in source_map:
        record['source'] = source_map[current]


for feature_id in id_list:
    # One ID may map to several feature lines.
    for feature in gff.features[feature_id]:
        for descendant in gff.descendants(feature):
            _remap_source(descendant)
        _remap_source(feature)

gff.write('agla_v1_2-NALmod4.gff3')
def clustersToGFF(clusterspath, gffpath, goldpath, annotpath, source_type):
    """Write per-cluster and per-gene GFF3 files for a set of gene clusters.

    NOTE(review): the block structure below was reconstructed from a
    whitespace-collapsed source. In particular, whether
    ``elif (gene in goldContent)`` pairs with ``if (gene in
    annotationContent)`` (as written here) or with the inner colour chain
    could not be determined -- verify against the upstream file.

    Args:
        clusterspath: Path of the cluster file. Its name without the last
            extension is the stem of both output files. If the path
            contains ``'score'`` the clusters are folded with arguments
            ``("", 0.5)``, otherwise with ``("gold", "")``.
        gffpath: Path of the GFF3 annotation, parsed with ``Gff3``.
        goldpath: Optional path of a gold-standard file. Colouring is only
            applied when this yields non-empty content.
        annotpath: Optional path of an annotation file used for colouring.
        source_type: Value written in the second column of every line.

    Returns:
        The parsed ``Gff3`` object.

    Raises:
        TypeError: when the first or last gene of a cluster has no mRNA
            entry, because ``None`` is then subscripted.
    """
    gffcontent = Gff3(gffpath)
    clustercontent, goldContent, annotationContent = "", "", ""
    clustercontent = Utils.readFileLines(clusterspath)
    clusters = Utils.foldClusterData(
        clustercontent, "", 0.5) if 'score' in clusterspath else Utils.foldClusterData(
            clustercontent, "gold", "")
    # Gold lines are joined with tabs and annotation lines with newlines;
    # both strings are only used for substring membership tests below.
    goldContent = '\t'.join(Utils.readFileLines(goldpath)) if goldpath else ""
    annotationList = Utils.readFileLines(annotpath) if annotpath else ""
    annotationContent = ('\n').join(annotationList) if annotpath else ""
    # Sort the clusters by key.
    clusters = OrderedDict(sorted(clusters.items(), key=lambda x: x[0]))
    gffclusterfile = clusterspath.rsplit('.', 1)[0] + '.percluster.gff3'
    gffgenefile = clusterspath.rsplit('.', 1)[0] + '.pergene.gff3'
    outputcluster, outputgene = "##gff-version 3\n", "##gff-version 3\n"
    # Keep only "mRNA" features, as a dict {gene name: gff line}.
    # NOTE(review): replace('.1', '') removes every occurrence of '.1' in
    # the name, not only a trailing isoform suffix -- confirm names cannot
    # contain '.1' elsewhere.
    mRNAdict = {
        line['attributes']['Name'].replace('.1', ''): line
        for line in gffcontent.lines if line['type'] == 'mRNA'
    }
    for key, value in clusters.items():
        for gene in value:
            gene = gene.replace('.1', '')
            thisgene = mRNAdict.get(gene)
            if (thisgene is not None):
                chr = thisgene['seqid']
                position = str(thisgene['start']) + '\t' + str(thisgene['end'])
                score = '?'
                strand = thisgene['strand']
                phase = thisgene['phase']
                info = 'Name=' + gene + ';Note=' + key + '\n'
                if (goldContent):
                    if (gene in annotationContent):
                        # Take the second tab-separated field of the first
                        # annotation line that mentions the gene.
                        annot = [
                            item for item in annotationList if gene in item
                        ]
                        annot = annot[0].split('\t')[1] if annot else ''
                        # A colour attribute is spliced in before the
                        # trailing newline of the attribute column.
                        if ('backbone' in annot):
                            info = info.replace("\n", ";color=#EE0000\n")  # red
                        elif ('tailor' in annot):
                            info = info.replace("\n", ";color=#EE9300\n")  # orange
                        elif ('transcript') in annot:
                            info = info.replace(
                                "\n", ";color=#048014\n")  # forest green
                        elif ('transport' in annot):
                            info = info.replace(
                                "\n", ";color=#1888f0\n")  # light blue
                    elif (gene in goldContent):
                        info = info.replace(
                            "\n", ";color=#9931f2\n")  # bright purple
                # NOTE(review): eight tab-separated fields are written
                # (seqid, source_type, start, end, score, strand, phase,
                # attributes); standard GFF3 has nine columns (separate
                # source and type) -- confirm consumers accept this.
                outputgene += chr + '\t' + source_type + '\t' + position + '\t' + score + '\t' + strand + '\t' + phase + '\t' + info
            else:
                print('gene not found:', gene)
        # The cluster line spans from the start of the first listed gene to
        # the end of the last listed gene, and takes seqid, strand and
        # phase from the first gene.
        startID = value[0].replace('.1', '')
        endID = value[-1].replace('.1', '')
        startGene = mRNAdict.get(startID)
        endGene = mRNAdict.get(endID)
        chr = startGene['seqid']
        position = str(startGene['start']) + '\t' + str(endGene['end'])
        strand = startGene['strand']
        phase = startGene['phase']
        score = '?'
        info = 'Name=' + key + ';Note=' + ('|').join(value) + '\n'
        outputcluster += chr + '\t' + source_type + '\t' + position + '\t' + score + '\t' + strand + '\t' + phase + '\t' + info
    Utils.writeFile(gffclusterfile, outputcluster)
    Utils.writeFile(gffgenefile, outputgene)
    return gffcontent
args = parser.parse_args(['-g', 'annotations.gff']) else: args = parser.parse_args() if args.gff_file: logger_stderr.info('Checking GFF3 file (%s)...', args.gff_file) elif not sys.stdin.isatty(): # if STDIN connected to pipe or file args.gff_file = sys.stdin logger_stderr.info('Reading from STDIN...') else: # no input parser.print_help() sys.exit(1) logger_stderr.info('Checking syntax and formatting...') gff3 = Gff3(gff_file=args.gff_file, fasta_external=args.fasta_file, logger=logger_null) logger_stderr.info('Checking reference seqid, bounds and N count...') gff3.check_reference(allowed_num_of_n=args.allowed_num_of_n, feature_types=args.check_n_feature_types) logger_stderr.info('Checking parent boundaries...') gff3.check_parent_boundary() gff3.check_phase() if args.report_file: logger_stderr.info('Writing validation report (%s)...', args.report_file) report_fh = open(args.report_file, 'wb') else: report_fh = sys.stdout
# Command line: two positional paths plus optional result files; each
# optional file enables one additional QA check.
parser = argparse.ArgumentParser()
parser.add_argument(
    'genes_gff',
    help="Input gff3 annotation file including only genes features",
)
parser.add_argument('--busco_result',
                    default=None,
                    help="BUSCO full report output")
parser.add_argument('--ips_result', help="InterProScan tsv output")
parser.add_argument('--blast_result',
                    help="Blast search result of proteins vs. DB")
parser.add_argument('--repeats_gff',
                    help="gff3 file with repeats features only")
parser.add_argument('out_report', help="Output QA report")
args = parser.parse_args()

gff_obj = Gff3(args.genes_gff)

# Each entry is (column name, callable, positional arguments). The first
# four checks always run on the genes annotation.
qa_methods = [
    ("Chromosome", mrna_chrom, (gff_obj, )),
    ("AED", mrna_aed, (gff_obj, )),
    ("Exons", mrna_exons_count, (gff_obj, )),
    ("UTR", mrna_utr, (gff_obj, )),
]

# The repeats check needs a second parsed GFF, so it is handled separately.
if args.repeats_gff:
    rep_gff_obj = Gff3(args.repeats_gff)
    qa_methods.append(('Repeats', repeats_overlap, (gff_obj, rep_gff_obj)))

# The remaining checks each take the path of their result file.
optional_checks = (
    (args.busco_result, 'BUSCO', prot_busco),
    (args.blast_result, 'BLAST', prot_similarity),
    (args.ips_result, 'IPS', prot_domains),
)
for result_path, label, check in optional_checks:
    if result_path:
        qa_methods.append((label, check, (result_path, )))

# Run every check and keep its result as a named Series.
qa_data = [
    pd.Series(check(*check_args), name=label)
    for label, check, check_args in qa_methods
]
import sys # try to import from project first from os.path import dirname sys.path.insert(1, dirname(dirname(__file__))) from gff3 import Gff3 # initialize a Gff3 object gff = Gff3() # parse GFF3 file and do syntax checking, this populates gff.lines and gff.features # if an embedded ##FASTA directive is found, parse the sequences into gff.fasta_embedded gff.parse('annotations.gff') # parse the external FASTA file into gff.fasta_external #gff.parse_fasta_external('annotations.fa') # Check seqid, bounds and the number of Ns in each feature using one or more reference sources gff.check_reference(allowed_num_of_n=0, feature_types=['CDS']) # Checks whether child features are within the coordinate boundaries of parent features gff.check_parent_boundary() # Calculates the correct phase and checks if it matches the given phase for CDS features gff.check_phase()
filemode='w', level=logging.INFO) logging.info('start program') #gff_data = pd.read_csv('clec_OGS_v1_2_with_pep_CDS.gff3', sep="\t", header = None,comment='#') ''' remove NOTES attribute name attributes: gene, mRNA, pseudogenic_transcript ''' f_out = open(file_name1 + '_out.txt', 'w') #1. remove %09 #preprocess= subprocess.Popen("sed s/%09//g "+ file_in +">temp.gff", stdout=subprocess.PIPE,shell=True) #gff_data = pd.read_csv('temp.gff', sep="\t", header = None,comment='#') gff_data = pd.read_csv(file_in, sep="\t", header=None, comment='#') gff = Gff3(gff_file=file_in) #2. remodel pseudogene def Remodel_pseudogenes(): pseudo_list = [] df0 = gff_data.where(gff_data[2].str.contains("pseudogene")) idx0 = df0.dropna().index.tolist() for index in idx0: gff_data.iloc[index, 2] = "gene" # gff_data.iloc[index, 8] = gff_data.loc[index, 8] + ";pseudogene=unknown" # get gene_ID gene_ID = "".join( re.match("^.*ID=([^;]+);.+$", gff_data.iloc[index, 8]).groups()) # print gene_ID
#python intron_length_comparison_with_intersp_protein2genome.py Sapria_longintron.rnd1.protein2genome.gff rnd1_rerun_protein2genome.intron_pos.tsv from gff3 import Gff3 import sys, re #gff = Gff3(sys.argv[1]) #output=open(sys.argv[2],'a') gff = Gff3('Sapria_longintron.rnd1.protein2genome.gff') output = open('rnd1_protein2genome.sum_by_intron.tsv', 'a') output.write('\t'.join([ 'prot_aln_ID', 'scaffold', 'start', 'end', 'type', '5_end_intron_ID', 'protein_hit', 'protein_corrd', 'alignment_note' ]) + '\n') ID = 0 for l in gff.lines: #for protein2genome if l['type'] == 'protein_match': protein_target_id = l['attributes']['Name'] if protein_target_id.startswith( 'Potri') or protein_target_id.startswith('Mane'): ID = ID + 1 #if no intron if len(l['children']) == 1: scaf = l['seqid'] strand = l['strand'] rec = l['children'][0] output.write('\t'.join([ 'ALN_' + ` ID `, scaf, str(rec['start']), str(rec['end']), 'prot_match', '-', protein_target_id,
import sys
# Prefer the gff3 package located two directories above this file.
from os.path import dirname
sys.path.insert(1, dirname(dirname(__file__)))
from gff3 import Gff3

gff = Gff3('annotations.gff')

# Feature types that get a pseudogenic counterpart below a pseudogene.
type_map = dict(exon='pseudogenic_exon', transcript='pseudogenic_transcript')

pseudogenes = []
for record in gff.lines:
    if record['type'] == 'pseudogene':
        pseudogenes.append(record)

for pseudogene in pseudogenes:
    # Rename the descendant types that have a pseudogenic counterpart.
    for descendant in gff.descendants(pseudogene):
        replacement = type_map.get(descendant['type'])
        if replacement is not None:
            descendant['type'] = replacement

    # Collect every gene that overlaps this pseudogene.
    overlapping_genes = []
    for candidate in gff.lines:
        if candidate['type'] == 'gene' and gff.overlap(candidate, pseudogene):
            overlapping_genes.append(candidate)

    # Without an overlapping gene the pseudogene is left where it is.
    if not overlapping_genes:
        continue

    # Hand the pseudogene's children to the first overlapping gene, then
    # drop the pseudogene itself.
    gff.adopt(pseudogene, overlapping_genes[0])
    gff.remove(pseudogene)

gff.write('annotations_fixed.gff')