def main(old_gff, new_gff, output_gff, re_construct_features, tmp_identifier):
    logger.info('Reading original GFF3 file: (%s)...\n', old_gff)
    old_gff3 = Gff3(gff_file=old_gff)
    logger.info('Reading updated GFF3 file: (%s)...\n', new_gff)
    new_gff3 = Gff3(gff_file=new_gff)
    if re_construct_features:
        out_f = open(re_construct_features, 'w')
    else:
        out_f = None
    polypeptide_re_construct(old_gff3=old_gff3, new_gff3=new_gff3,
                             tmp_identifier=tmp_identifier, report=out_f)
    re_construct(old_gff3=old_gff3, new_gff3=new_gff3,
                 tmp_identifier=tmp_identifier, report=out_f)
    logger.info('Generating the re-constructed gff3 file: (%s)...\n', output_gff)
    write_gff3(new_gff3, output_gff)
    if re_construct_features:
        out_f.close()
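# Usage sketch (not part of the original module; file names below are placeholders):
#
#   main(old_gff='original.gff3', new_gff='updated.gff3',
#        output_gff='reconstructed.gff3',
#        re_construct_features='re_construct_report.txt',
#        tmp_identifier=False)
#
# Passing a falsy re_construct_features skips the report file entirely.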
def main(gff, output=None, sorting_order=None, isoform_sort=False, logger=None, reference=False):
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)
    gff3 = Gff3(gff_file=gff, logger=logger_null)
    if output:
        report = open(output, 'w')
    else:
        report = sys.stdout
    logger.info('Sorting and printing out...')
    # Visit the GFF3 object through its root-level features (e.g. gene, pseudogene, etc.)
    roots = []
    gff3_linenum_Set = set()
    for line in gff3.lines:
        if line['line_type'] == 'feature':
            gff3_linenum_Set.add(line['line_index'])
        try:
            if line['line_type'] == 'feature' and 'Parent' not in line['attributes'] and len(line['attributes']) != 0:
                roots.append(line)
        except KeyError:
            logger.warning('[Missing Attributes] Program failed.\n\t\t- Line {0:s}: {1:s}'.format(str(line['line_index'] + 1), line['line_raw']))
    # Sort the root-level features based on the order of the genomic sequences
    roots_sorted = PositionSort(roots, reference)
    wrote_sequence_region = set()
    # Build the ##sequence-region data, preferring the embedded FASTA when present
    sequence_regions = {}
    if gff3.fasta_embedded:
        for seqid in gff3.fasta_embedded:
            sequence_regions[seqid] = (1, len(gff3.fasta_embedded[seqid]['seq']))
    else:
        directives_lines = [line_data for line_data in gff3.lines if line_data['line_type'] == 'directive' and line_data['directive'] == '##sequence-region']
        for sequence_region in directives_lines:
            sequence_regions[sequence_region['seqid']] = (sequence_region['start'], sequence_region['end'])
    # Write out all directives except the ones this program regenerates itself
    ignore_directives = ['##sequence-region', '###', '##FASTA']
    directives_lines = [line_data for line_data in gff3.lines if line_data['line_type'] == 'directive' and line_data['directive'] not in ignore_directives]
    for directives_line in directives_lines:
        report.write(directives_line['line_raw'])
    # Visit every root-level feature
    for root in roots_sorted:
        # Write the ##sequence-region directive once per landmark sequence
        if root['seqid'] not in wrote_sequence_region:
            if root['seqid'] in sequence_regions:
                report.write('##sequence-region %s %d %d\n' % (root['seqid'], sequence_regions[root['seqid']][0], sequence_regions[root['seqid']][1]))
            wrote_sequence_region.add(root['seqid'])
        if sorting_order is None:
            report.write(root['line_raw'])
            gff3_linenum_Set.discard(root['line_index'])
            # Collect the second-level features (e.g. mRNA, ncRNA, etc.)
            children = root['children']
            children_sorted = PositionSort(children, reference)
            otherlines = []
            for child in children_sorted:
                # ID information is stored in child['attributes']['ID']
                gff3_linenum_Set.discard(child['line_index'])
                report.write(child['line_raw'])
                # Collect the third-level features (e.g. exon, CDS, etc.) and group them by type
                grandchildren = child['children']
                gchildgroup = {}
                for grandchild in grandchildren:
                    if str(grandchild['type']) in gchildgroup:
                        gchildgroup[str(grandchild['type'])].append(grandchild)
                    else:
                        gchildgroup[str(grandchild['type'])] = [grandchild]
                    otherlines.extend(gff3.collect_descendants(grandchild))
                # Separate the third-level features into three groups: exons, CDSs, and others
                exons = []
                cdss = []
                others = []
                for k, v in gchildgroup.items():
                    if k == 'exon' or k == 'pseudogenic_exon':
                        exons.extend(v)
                    elif k == 'CDS':
                        cdss.extend(v)
                    else:
                        others.extend(v)
                # Sort exons by considering strand information (StrandSort)
                if len(exons):
                    exons_sorted = StrandSort(exons) or []
                    for exon in exons_sorted:
                        gff3_linenum_Set.discard(exon['line_index'])
                        if 'Parent' in exon['attributes'] and isinstance(exon['attributes']['Parent'], list) and len(exon['attributes']['Parent']) > 1:
                            report.write(TwoParent(child['attributes']['ID'], exon))
                        else:
                            report.write(exon['line_raw'])
                # Sort CDS features by considering strand information (StrandSort)
                if len(cdss):
                    cdss_sorted = StrandSort(cdss) or []
                    for cds in cdss_sorted:
                        gff3_linenum_Set.discard(cds['line_index'])
                        if 'Parent' in cds['attributes'] and isinstance(cds['attributes']['Parent'], list) and len(cds['attributes']['Parent']) > 1:
                            report.write(TwoParent(child['attributes']['ID'], cds))
                        else:
                            report.write(cds['line_raw'])
                # Sort the remaining third-level features by position (PositionSort)
                if len(others):
                    if PositionSort(others, reference):
                        for other in others:
                            gff3_linenum_Set.discard(other['line_index'])
                            if 'Parent' in other['attributes'] and isinstance(other['attributes']['Parent'], list) and len(other['attributes']['Parent']) > 1:
                                report.write(TwoParent(child['attributes']['ID'], other))
                            else:
                                report.write(other['line_raw'])
            # Sort the features beyond the third level by position, de-duplicated by raw line
            unique = {}
            otherlines_sorted = PositionSort(otherlines, reference) or []
            for k in otherlines_sorted:
                gff3_linenum_Set.discard(k['line_index'])
                unique[k['line_raw']] = 1
            for k in unique:
                report.write(k)
        else:
            if not isoform_sort:
                gff3_linenum_Set = write_out_by_level(level=0, report=report, line_data=root, sorting_order=sorting_order, gff3_linenum_Set=gff3_linenum_Set)
            else:
                model = gff3.collect_descendants(root)
                model.insert(0, root)
                strand_set = list(set([line['strand'] for line in model]))
                # Sort in reverse only when every feature of the model is on the minus strand
                reverse = False
                if len(strand_set) == 1 and strand_set[0] == '-':
                    reverse = True
                line_list = TypeSort(model, sorting_order, reverse=reverse)
                for line in line_list:
                    gff3_linenum_Set.discard(line['line_index'])
                    report.write(line['line_raw'])
        report.write('###\n')
    # Any line indexes left in the set belong to features never reached from a root feature
    if len(gff3_linenum_Set) != 0:
        logger.warning('The following lines are omitted from the output file, because there is a problem with the input file. Please review the input file or run gff3_QC to identify the error.\n')
        for line_num in gff3_linenum_Set:
            print('\t\t- Line {0:s}: {1:s}'.format(str(line_num + 1), gff3.lines[line_num]['line_raw']))
    # Write the embedded FASTA, if any
    fasta = gff3.fasta_embedded
    if fasta:
        report.write('##FASTA\n')
        for key in fasta:
            seq = fasta[key]['seq']
            report.write(u'{0:s}\n{1:s}\n'.format(fasta[key]['header'], seq))
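# Usage sketch based on the signature above (file names are placeholders).
# With sorting_order=None the output is ordered purely by position; passing a
# sorting_order dict switches to type-based ordering via write_out_by_level or,
# with isoform_sort=True, TypeSort over each whole gene model:
#
#   import logging
#   main(gff='annotations.gff3', output='annotations_sorted.gff3',
#        sorting_order=None, isoform_sort=False,
#        logger=logging.getLogger('gff3_sort'), reference=False)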
def main(gff1, gff2, fasta, outdir, scode, logger, all_assign=False, user_defined1=None, user_defined2=None):
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    tmpdir = '{0:s}/{1:s}'.format(outdir, 'tmp')
    if not os.path.isdir(tmpdir):
        os.makedirs(tmpdir)
    # Check if there is a non-coding transcript
    transcripts = set()
    transcripts_type = set()
    gff3_1 = Gff3(gff_file=gff1, fasta_external=fasta, logger=logger)
    gff3_2 = Gff3(gff_file=gff2, fasta_external=fasta, logger=logger)
    makeblastdb_path = os.path.join(lib_path, 'ncbi-blast+', 'bin', 'makeblastdb')
    blastn_path = os.path.join(lib_path, 'ncbi-blast+', 'bin', 'blastn')
    if user_defined1 is None:
        roots = []
        for line in gff3_1.lines:
            try:
                if line['line_type'] == 'feature':
                    # remove all the replace attributes
                    if all_assign and 'replace' in line['attributes']:
                        del line['attributes']['replace']
                    if 'Parent' not in line['attributes'] and len(line['attributes']) != 0:
                        roots.append(line)
            except KeyError:
                pass
        for root in roots:
            children = root['children']
            for child in children:
                cid = 'NA'
                if 'ID' in child['attributes']:
                    cid = child['attributes']['ID']
                defline = cid
                gchildren = child['children']
                CDSflag = 0
                for gchild in gchildren:
                    if gchild['type'] == 'CDS':
                        CDSflag += 1
                # transcripts without any CDS child are treated as non-coding
                if CDSflag == 0:
                    transcripts.add(defline)
                if 'type' in child:
                    transcripts_type.add(child['type'])
    else:
        for lines in user_defined1:
            transcripts_type.add(lines[0])
        for line in gff3_1.lines:
            if line['line_type'] == 'feature':
                if all_assign and 'replace' in line['attributes']:
                    del line['attributes']['replace']
                if line['type'] in transcripts_type:
                    id = str()
                    if 'ID' in line['attributes']:
                        id = line['attributes']['ID']
                    transcripts.add(id)
    gff2_transcripts_type = set()
    if user_defined2 is None:
        roots = []
        for line in gff3_2.lines:
            try:
                if line['line_type'] == 'feature':
                    if 'Parent' not in line['attributes'] and len(line['attributes']) != 0:
                        roots.append(line)
            except KeyError:
                pass
        for root in roots:
            for child in root['children']:
                if 'type' in child:
                    gff2_transcripts_type.add(child['type'])
    else:
        for lines in user_defined2:
            gff2_transcripts_type.add(lines[0])
    if all_assign:
        # write out a modified gff1 without any replace attributes
        gff3_1_mod = os.path.join(tmpdir, 'gff1_mod.gff3')
        gff3_1.write(gff3_1_mod)
        gff1 = gff3_1_mod
    out1_type = os.path.join(tmpdir, 'gff1_transcript_type.txt')
    with open(out1_type, 'w') as trans_type:
        for line in transcripts_type:
            trans_type.write(line + '\n')
    cmd = os.path.join(lib_path, 'auto_assignment', 'create_annotation_summaries_nov21-7.pl')
    logger.info('Generate info table for {0:s} by using {1:s}'.format(gff1, cmd))
    summary = os.path.join(tmpdir, 'summary_report.txt')
    subprocess.Popen(['perl', cmd, gff1, fasta, summary, scode, out1_type], stdout=DEVNULL).wait()
    logger.info('Extract sequences from {0:s}...'.format(gff1))
    out1 = os.path.join(tmpdir, 'gff1')
    if user_defined1 is None:
        logger.info('\tExtract CDS sequences...')
        gff3_to_fasta.main(gff_file=gff1, fasta_file=fasta, stype='cds', dline='complete', qc=False, output_prefix=out1, logger=logger_null)
        logger.info('\tExtract premature transcript sequences...')
        gff3_to_fasta.main(gff_file=gff1, fasta_file=fasta, stype='pre_trans', dline='complete', qc=False, output_prefix=out1, logger=logger_null)
        if len(transcripts) > 0:
            logger.info('\tExtract transcript sequences...')
            gff3_to_fasta.main(gff_file=gff1, fasta_file=fasta, stype='trans', dline='complete', qc=False, output_prefix=out1, logger=logger_null)
    else:
        logger.info('\tExtract user_defined_file1 sequences...')
        user_defined_out1 = '{0:s}_{1:s}'.format(out1, 'cds.fa')
        user_defined_pretrans1 = '{0:s}_{1:s}'.format(out1, 'pre_trans.fa')
        user_defined_tmp = '{0:s}_{1:s}'.format(out1, 'user_defined.fa')
        parent_type = set()
        with open(user_defined_out1, 'w') as outfile:
            for lines in user_defined1:
                gff3_to_fasta.main(gff_file=gff1, fasta_file=fasta, stype='user_defined', user_defined=lines, dline='complete', qc=False, output_prefix=out1, logger=logger_null)
                with open(user_defined_tmp, 'r') as fd:
                    shutil.copyfileobj(fd, outfile)
                parent_type.add(lines[0])
        with open(user_defined_pretrans1, 'w') as outfile:
            for line in parent_type:
                seq = gff3_to_fasta.extract_start_end(gff3_1, line, 'complete')
                for k, v in seq.items():
                    if len(k) != 0 and len(v) != 0:
                        outfile.write('{0:s}\n{1:s}\n'.format(k, v))
    logger.info('Extract sequences from {0:s}...'.format(gff2))
    out2 = os.path.join(tmpdir, 'gff2')
    if user_defined2 is None:
        logger.info('\tExtract CDS sequences...')
        gff3_to_fasta.main(gff_file=gff2, fasta_file=fasta, stype='cds', dline='complete', qc=False, output_prefix=out2, logger=logger_null)
        logger.info('\tExtract premature transcript sequences...')
        gff3_to_fasta.main(gff_file=gff2, fasta_file=fasta, stype='pre_trans', dline='complete', qc=False, output_prefix=out2, logger=logger_null)
        if len(transcripts) > 0:
            logger.info('\tExtract transcript sequences...')
            gff3_to_fasta.main(gff_file=gff2, fasta_file=fasta, stype='trans', dline='complete', qc=False, output_prefix=out2, logger=logger_null)
    else:
        logger.info('\tExtract user_defined_file2 sequences...')
        user_defined_out2 = '{0:s}_{1:s}'.format(out2, 'cds.fa')
        user_defined_pretrans2 = '{0:s}_{1:s}'.format(out2, 'pre_trans.fa')
        user_defined_tmp = '{0:s}_{1:s}'.format(out2, 'user_defined.fa')
        parent_type = set()
        with open(user_defined_out2, 'w') as outfile:
            for lines in user_defined2:
                gff3_to_fasta.main(gff_file=gff2, fasta_file=fasta, stype='user_defined', user_defined=lines, dline='complete', qc=False, output_prefix=out2, logger=logger_null)
                with open(user_defined_tmp, 'r') as fd:
                    shutil.copyfileobj(fd, outfile)
                parent_type.add(lines[0])
        with open(user_defined_pretrans2, 'w') as outfile:
            for line in parent_type:
                seq = gff3_to_fasta.extract_start_end(gff3_2, line, 'complete')
                for k, v in seq.items():
                    if len(k) != 0 and len(v) != 0:
                        outfile.write('{0:s}\n{1:s}\n'.format(k, v))
    logger.info('Concatenate {0:s} and {1:s}...'.format(gff1, gff2))
    cgff = os.path.join(tmpdir, 'cat.gff')
    with open(cgff, 'w') as outfile:
        for catfile in [gff1, gff2]:
            with open(catfile, 'r') as fd:
                shutil.copyfileobj(fd, outfile)
    bdb = '{0:s}_{1:s}'.format(out2, 'cds.fa')
    logger.info('Make blastDB for CDS sequences from {0:s}...'.format(bdb))
    subprocess.Popen([makeblastdb_path, '-in', bdb, '-dbtype', 'nucl']).wait()
    print('\n')
    logger.info('Sequence alignment for CDS fasta files between {0:s} and {1:s}...'.format(gff1, gff2))
    binput = '{0:s}_{1:s}'.format(out1, 'cds.fa')
    bout = os.path.join(tmpdir, 'blastn.out')
    subprocess.Popen([blastn_path, '-db', bdb, '-query', binput, '-out', bout, '-evalue', '1e-10', '-penalty', '-15', '-ungapped', '-outfmt', '6']).wait()
    # update out1_type with the transcript types seen in gff2
    transcripts_type.update(gff2_transcripts_type)
    with open(out1_type, 'w') as trans_type:
        for line in transcripts_type:
            trans_type.write(line + '\n')
    logger.info('Find CDS matched pairs between {0:s} and {1:s}...'.format(gff1, gff2))
    cmd = os.path.join(lib_path, 'auto_assignment', 'find_match.pl')
    report1 = os.path.join(tmpdir, 'report1.txt')
    subprocess.Popen(['perl', cmd, cgff, bout, scode, report1, out1_type]).wait()
    # drop every transcript that already found a CDS-level match
    with open(bout, 'r') as bcds:
        for line in bcds:
            try:
                QueryID = re.match(r'^.*ID=([^|]+).+$', line.split('\t')[0]).group(1)
                transcripts.discard(QueryID)
            except AttributeError:
                pass
    if len(transcripts) > 0:
        if user_defined2 is None:
            bdb = '{0:s}_{1:s}'.format(out2, 'trans.fa')
        else:
            bdb = '{0:s}_{1:s}'.format(out2, 'cds.fa')
        logger.info('Make blastDB for transcript sequences from {0:s}...'.format(bdb))
        subprocess.Popen([makeblastdb_path, '-in', bdb, '-dbtype', 'nucl']).wait()
        print('\n')
        logger.info('Sequence alignment for transcript fasta files between {0:s} and {1:s}...'.format(gff1, gff2))
        if user_defined1 is None:
            binput = '{0:s}_{1:s}'.format(out1, 'trans.fa')
        else:
            binput = '{0:s}_{1:s}'.format(out1, 'cds.fa')
        bout = os.path.join(tmpdir, 'blastn.out')
        subprocess.Popen([blastn_path, '-db', bdb, '-query', binput, '-out', bout, '-evalue', '1e-10', '-penalty', '-15', '-ungapped', '-outfmt', '6']).wait()
        logger.info('Find transcript matched pairs between {0:s} and {1:s}...'.format(gff1, gff2))
        cmd = os.path.join(lib_path, 'auto_assignment', 'find_match.pl')
        report1_trans = os.path.join(tmpdir, 'report1_trans.txt')
        subprocess.Popen(['perl', cmd, cgff, bout, scode, report1_trans, out1_type]).wait()
        with open(report1, 'a') as rep1:
            with open(report1_trans, 'r') as rep1_trans:
                for line in rep1_trans:
                    try:
                        transID = line.split('\t')[2]
                        if transID in transcripts:
                            rep1.write(line)
                    except IndexError:
                        pass
    bdb = '{0:s}_{1:s}'.format(out2, 'pre_trans.fa')
    logger.info('Make blastDB for premature transcript sequences from {0:s}...'.format(bdb))
    subprocess.Popen([makeblastdb_path, '-in', bdb, '-dbtype', 'nucl']).wait()
    print('\n')
    logger.info('Sequence alignment for premature transcript fasta files between {0:s} and {1:s}...'.format(gff1, gff2))
    binput = '{0:s}_{1:s}'.format(out1, 'pre_trans.fa')
    bout = os.path.join(tmpdir, 'blastn.out')
    subprocess.Popen([blastn_path, '-db', bdb, '-query', binput, '-out', bout, '-evalue', '1e-10', '-penalty', '-15', '-ungapped', '-outfmt', '6']).wait()
    cmd = os.path.join(lib_path, 'auto_assignment', 'find_match.pl')
    logger.info('Find premature transcript matched pairs between {0:s} and {1:s}...'.format(gff1, gff2))
    report2 = os.path.join(tmpdir, 'report2.txt')
    subprocess.Popen(['perl', cmd, cgff, bout, scode, report2, out1_type]).wait()
    print('\n')
    cmd = os.path.join(lib_path, 'auto_assignment', 'gen_spreadsheet.pl')
    check1 = os.path.join(outdir, 'check1.txt')
    logger.info('Generate {0:s} for Check Point 1 internal reviewing...'.format(check1))
    subprocess.Popen(['perl', cmd, summary, report1, report2, check1]).wait()
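# Usage sketch (assumed invocation; paths are placeholders). The function
# extracts CDS/premature-transcript/transcript FASTA from both GFF3 files,
# BLASTs gff1 against gff2 with the bundled ncbi-blast+ binaries, and leaves
# check1.txt in outdir for review:
#
#   import logging
#   main(gff1='new_models.gff3', gff2='reference_models.gff3',
#        fasta='genome.fa', outdir='auto_replace_tag', scode='TEMP',
#        logger=logging.getLogger('auto_replace_tag'))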
def main(gff_file1, gff_file2, output_gff, report_fh, user_defined1=None, user_defined2=None, logger=None):
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)
    if not logger:
        logger = logger_null
    logger.info('Sorting the WA gff by following the order of Scaffold number and coordinates...')
    gff3_sort.main(gff_file1, output='WA_sorted.gff', logger=logger)
    logger.info('Sorting the other gff by following the order of Scaffold number and coordinates...')
    gff3_sort.main(gff_file2, output='other_sorted.gff', logger=logger)
    logger.info('Reading WA gff3 file...')
    gff3 = Gff3(gff_file='WA_sorted.gff', logger=logger_null)
    logger.info('Reading the other gff3 file...')
    gff3M = Gff3(gff_file='other_sorted.gff', logger=logger_null)  # e.g. a Maker-produced gff
    logger.info('Identifying types of replacement based on replace tag...')
    ReplaceGroups = replace_OGS.Groups(WAgff=gff3, Pgff=gff3M, outsideNum=1, user_defined1=user_defined1, user_defined2=user_defined2, logger=logger_null)
    logger.info('Replacing...')
    u_types = set()
    u1_types = set()
    if user_defined1 is not None:
        for line in user_defined1:
            u1_types.add(line[0])
        u_types |= u1_types
    else:
        u1_types = None
    u2_types = set()
    if user_defined2 is not None:
        for line in user_defined2:
            u2_types.add(line[0])
        u_types |= u2_types
    else:
        u2_types = None
    # Collect the root-level features of the WA gff
    roots = []
    transcripts = []
    unique = set()
    for line in gff3.lines:
        if user_defined1 is None:
            try:
                if line['line_type'] == 'feature' and 'Parent' not in line['attributes']:
                    roots.append(line)
            except KeyError:
                pass
        else:
            if line['type'] in u1_types:
                transcripts.append(line)
                for root in gff3.collect_roots(line):
                    if root['line_raw'] not in unique:
                        roots.append(root)
                        unique.add(root['line_raw'])
    rnum, cnum, changed = 0, 0, 0
    cal_type_children = {}
    changed_rootid = set()
    not_orphan = set()
    for root in roots:
        rnum += 1
        if user_defined1 is None:
            children = root['children']
        else:
            children = []
            unique = set()
            if root['type'] in u1_types:
                children.append(root)
            else:
                for child in gff3.collect_descendants(root):
                    if child['type'] in u1_types and child['line_raw'] not in unique:
                        children.append(child)
                        unique.add(child['line_raw'])
            children = sorted(children, key=lambda k: k['line_index'])
        tags = {}
        cnum += len(children)
        maxisoforms = 0
        for child in children:
            tags[str(child['attributes']['replace'])] = 0
            for tag in child['attributes']['replace']:
                if not tag == 'NA':
                    not_orphan.add(tag)
                    t = gff3M.features[ReplaceGroups.mapName2ID[tag]][0]
                    if user_defined2 is None:
                        tmp = len(t['parents'][0][0]['children'])
                    else:
                        if len(t['parents']) == 0 and t['type'] in u2_types:
                            # this transcript has no parent feature (e.g. gene); count it as a single isoform
                            tmp = 1
                        else:
                            tmp = len(t['parents'][0][0]['children'])
                    if tmp > maxisoforms:
                        maxisoforms = tmp
        if len(tags) <= 1:
            if maxisoforms >= 2:
                root['attributes']['replace_type'] = 'multi-ref'
                for child in children:
                    child['attributes']['replace_type'] = 'multi-ref'
                if user_defined1 is None:
                    ans = ReplaceGroups.replacer_multi(root, ReplaceGroups, gff3M, u1_types, u2_types)
                else:
                    ans = ReplaceGroups.replacer_multi(root, ReplaceGroups, gff3M, u1_types, u2_types, gff3)
                report_fh.write('{0:s}\n'.format(ans))
                changed_rootid.add(root['attributes']['ID'])
                changed += 1
            else:
                ReplaceGroups.replacer(root, ReplaceGroups, gff3M, u1_types, gff3)
                changed_rootid.add(root['attributes']['ID'])
                changed += 1
        else:
            logger.info('[Warning] multiple replace tags in multiple isoforms! {0:s}. This model is not processed\n'.format(root['attributes']['ID']))
            report_fh.write('[Warning] multiple replace tags in multiple isoforms! {0:s}. This model is not processed\n'.format(root['attributes']['ID']))
        for child in children:
            if 'status' in child['attributes'] and child['attributes']['status'] in ('Delete', 'delete'):
                child['attributes']['replace_type'] = 'Delete'
            if child['attributes']['replace_type'] in cal_type_children:
                cal_type_children[child['attributes']['replace_type']] += 1
            else:
                cal_type_children[child['attributes']['replace_type']] = 1
    cal_type = {}
    for i in ReplaceGroups.info:
        tokens = i.split('\t')
        tmp = re.search('(.+?):(.*)', tokens[3])
        if tmp.groups()[0] in cal_type:
            cal_type[tmp.groups()[0]] += 1
        else:
            cal_type[tmp.groups()[0]] = 1
    report_fh.write('# Number of WA loci: {0:d}\n'.format(rnum))
    report_fh.write('# Number of WA transcripts: {0:d}\n'.format(cnum))
    report_fh.write('# Number of WA loci that were used to replace the models in reference gff: {0:d}\n'.format(changed))
    for k, v in cal_type.items():
        if k == 'simple':
            report_fh.write('# Number of loci with {0:s}/Delete replacement: {1:d}\n'.format(k, v))
        else:
            report_fh.write('# Number of loci with {0:s} replacement: {1:d}\n'.format(k, v))
    for k, v in cal_type_children.items():
        report_fh.write('# Number of transcripts with {0:s} replacement: {1:d}\n'.format(k, v))
    report_fh.write('Change_log\tOriginal_gene_name\tOriginal_transcript_ID\tOriginal_transcript_name\tTmp_OGSv0_ID\n')
    # Collect the root-level features of the reference gff
    roots = []
    transcripts = []
    unique = set()
    for line in gff3M.lines:
        if user_defined2 is None:
            try:
                if line['line_type'] == 'feature' and 'Parent' not in line['attributes']:
                    roots.append(line)
            except KeyError:
                pass
        else:
            if line['type'] in u_types:
                transcripts.append(line)
                for root in gff3M.collect_roots(line):
                    if root['line_raw'] not in unique:
                        roots.append(root)
                        unique.add(root['line_raw'])
    for root in roots:
        if root['attributes']['ID'] not in changed_rootid:
            if user_defined2 is None:
                children = root['children']
            else:
                children = []
                unique = set()
                if root['type'] in u_types:
                    children.append(root)
                else:
                    for child in gff3M.collect_descendants(root):
                        if child['type'] in u_types and child['line_raw'] not in unique:
                            children.append(child)
                            unique.add(child['line_raw'])
                children = sorted(children, key=lambda k: k['line_index'])
        elif root['attributes']['ID'] in changed_rootid and user_defined1 is not None:
            children = []
            unique = set()
            if root['type'] in u1_types:
                children.append(root)
            else:
                for child in gff3.collect_descendants(root):
                    if child['type'] in u1_types and child['line_raw'] not in unique:
                        children.append(child)
                        unique.add(child['line_raw'])
            children = sorted(children, key=lambda k: k['line_index'])
        else:
            children = root['children']
        for child in children:
            cflag = 0
            if not child['line_status'] == 'removed':
                if 'replace_type' in child['attributes']:
                    for i in root['attributes']['replace']:
                        tname, tid, gid, tmpid = 'NA', 'NA', 'NA', 'NA'
                        tmpid = child['attributes']['ID']
                        if not i == 'NA':
                            t = gff3M.features[ReplaceGroups.mapName2ID[i]][0]
                            try:
                                tname = t['attributes']['Name']
                            except KeyError:
                                tname = t['attributes']['ID']
                            tid = t['attributes']['ID']
                            gid_list = list()
                            if user_defined2 is None:
                                for tp_line in t['parents']:
                                    for tp in tp_line:
                                        gid_list.append(tp['attributes']['ID'])
                            else:
                                for tp in gff3M.collect_roots(t):
                                    gid_list.append(tp['attributes']['ID'])
                            gid = ','.join(gid_list)
                            if tname not in not_orphan:
                                tmpid = 'NA'
                        report_fh.write('{0:s}\t{1:s}\t{2:s}\t{3:s}\t{4:s}\n'.format(ReplaceGroups.mapType2Log[child['attributes']['replace_type']], gid, tid, tname, tmpid))
                    del child['attributes']['replace_type']
                    cflag += 1
                if 'replace' in child['attributes']:
                    del child['attributes']['replace']
                if cflag == 0:
                    gid_list = list()
                    if user_defined2 is None:
                        for p_line in child['parents']:
                            for p in p_line:
                                gid_list.append(p['attributes']['ID'])
                    else:
                        for p in gff3M.collect_roots(child):
                            gid_list.append(p['attributes']['ID'])
                    gid = ','.join(gid_list)
                    report_fh.write('{0:s}\t{1:s}\t{2:s}\t{3:s}\t{4:s}\n'.format(ReplaceGroups.mapType2Log['other'], gid, child['attributes']['ID'], ReplaceGroups.id2name[child['attributes']['ID']], child['attributes']['ID']))
            else:
                if 'status' in child['attributes'] and child['attributes']['status'] == 'Delete':
                    for i in child['attributes']['replace']:
                        if i == 'NA':
                            sys.exit('The replace tag for Delete replacement cannot be NA: {0:s}'.format(child['line_raw']))
                        t = gff3M.features[ReplaceGroups.mapName2ID[i]][0]
                        tname = t['attributes']['Name']
                        tid = t['attributes']['ID']
                        gid_list = list()
                        if user_defined2 is None:
                            for tp_line in t['parents']:
                                for tp in tp_line:
                                    gid_list.append(tp['attributes']['ID'])
                        else:
                            for tp_line in gff3M.collect_roots(t):
                                gid_list.append(tp_line['attributes']['ID'])
                        gid = ','.join(gid_list)
                        report_fh.write('{0:s}\t{1:s}\t{2:s}\t{3:s}\t{4:s}\n'.format(ReplaceGroups.mapType2Log['Delete'], gid, tid, tname, 'NA'))
                if 'replace' in child['attributes']:
                    del child['attributes']['replace']
        if 'replace' in root['attributes']:
            del root['attributes']['replace']
        if 'replace_type' in root['attributes']:
            del root['attributes']['replace_type']
        if 'modified_track' in root['attributes']:
            del root['attributes']['modified_track']
    ReplaceGroups.name2id(gff3M)
    gff3M.write(output_gff)
    rm_list = ['WA_sorted.gff', 'other_sorted.gff']
    remove_files_from_list(rm_list)
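# Usage sketch (assumed invocation; file names are placeholders). report_fh is
# any open, writable file object that receives the change-log table:
#
#   with open('merge_report.txt', 'w') as report_fh:
#       main(gff_file1='WA_models.gff3', gff_file2='reference_models.gff3',
#            output_gff='merged.gff3', report_fh=report_fh)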
def main(gff_file, revision_file, output_gff, report_file=None, user_defined1=None, auto=True, logger=None):
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)
    if not logger:
        logger = logger_null
    NCRNA = ['rRNA', 'miRNA', 'ncRNA', 'snRNA', 'snoRNA', 'tRNA']
    logger.info('Reading revision file... ({0:s})'.format(revision_file))
    flines = open(revision_file, 'r')
    fflag = 0
    revision = {}
    revision_id = {}
    rtype = {}
    for line_raw in flines:
        fflag += 1
        if fflag == 1:
            # skip the header line
            continue
        if not re.search('\t\n', line_raw):
            line_strip = line_raw.rstrip('\n')
            tokens = line_strip.split('\t')
            key = '{0:s}:{1:s}-{2:s}:{3:s}:{4:s}'.format(tokens[6], tokens[7], tokens[8], tokens[9], tokens[10])
            revision[key] = [tokens[24], line_strip]
            revision_id[tokens[12]] = [tokens[24], line_strip]
            rtype[tokens[10]] = 1
    logger.info('Reading gff3 file... ({0:s})'.format(gff_file))
    gff3 = Gff3(gff_file=gff_file, logger=logger_null)
    if report_file:
        logger.info('Writing summary report ({0:s})...'.format(report_file))
        report_fh = open(report_file, 'w')
    else:
        logger.info('Writing summary report: replace_tag_report.txt...')
        report_fh = open('replace_tag_report.txt', 'w')
    # Validation summary
    report_fh.write('# GFF3 Revision Report ({0:s})'.format(report_file))
    if gff_file and sys.stdin.isatty():
        report_fh.write(': {0:s} and {1:s}'.format(gff_file, revision_file))
    report_fh.write('\n\n')
    report_fh.write('# Summary\n')
    if len(revision_id) == 0:
        report_fh.write('* Found 0 lines to be revised\n')
    else:
        report_fh.write('* Found {0:d} lines of the revision file\n'.format(len(revision_id)))
    match = 0
    for line in gff3.lines:
        if line['type'] in rtype:
            key = '{0:s}:{1:s}-{2:s}:{3:s}:{4:s}'.format(line['seqid'], str(line['start']), str(line['end']), line['strand'], line['type'])
            if line['attributes']['ID'] in revision_id:
                match += 1
                line['attributes']['replace'] = [revision_id[line['attributes']['ID']][0]]
                revision_id[line['attributes']['ID']][1] = 'hit'
            elif key in revision:
                tokens = revision[key][1].split('\t')
                if not revision[key][1] == 'hit':
                    report_fh.write('\t- Same genomic region, but different IDs:\t(Annotator){0:s}\t(Gff){1:s}\n'.format(tokens[12], line['attributes']['ID']))
                    match += 1
                    if 'replace' not in line['attributes']:
                        line['attributes']['replace'] = [revision[key][0]]
                    revision[key][1] = 'hit'
                else:
                    report_fh.write('\t- Same genomic region, but different IDs and duplicate sequences at the same location:\t(Location){0:s}\t(Gff){1:s}\n'.format(key, line['attributes']['ID']))
    if match == 0:
        # no matched lines in the input gff
        print('\n')
    else:
        report_fh.write('* Found {0:d} matched IDs of the revision file\n'.format(match))
    report_fh.write('* Are there IDs that should be revised, but cannot be found in the gff?\n')
    count = 0
    for v in list(revision_id.values()):
        if not v[1] == 'hit':
            tokens = v[1].split('\t')
            key = '{0:s}:{1:s}-{2:s}:{3:s}:{4:s}'.format(tokens[6], tokens[7], tokens[8], tokens[9], tokens[10])
            if not revision[key][1] == 'hit':
                report_fh.write('\t- {0:s}\n'.format(v[1]))
                count += 1
    if count == 0:
        report_fh.write('\t- All IDs are properly found in the gff.\n')
    u_types = set()
    if user_defined1 is not None:
        for line in user_defined1:
            u_types.add(line[0])
    roots = []
    transcripts = []
    unique = set()
    for line in gff3.lines:
        if user_defined1 is None:
            try:
                if line['line_type'] == 'feature' and 'Parent' not in line['attributes']:
                    roots.append(line)
            except KeyError:
                print('WARNING [Missing Attributes] Program failed.\n\t\t- Line {0:s}: {1:s}'.format(str(line['line_index'] + 1), line['line_raw']))
        else:
            if line['type'] in u_types:
                transcripts.append(line)
                for root in gff3.collect_roots(line):
                    if root['line_raw'] not in unique:
                        roots.append(root)
                        unique.add(root['line_raw'])
    for line in roots:
        # Propagate the replace tag from the gene level down to its transcripts
        if 'replace' in line['attributes'] and 'children' in line:
            for index in range(len(line['attributes']['replace'])):
                line['attributes']['replace'][index] = re.sub(r'\s+', '', line['attributes']['replace'][index])
            if user_defined1 is None:
                children = line['children']
            else:
                children = []
                unique = set()
                if line['type'] in u_types:
                    children.append(line)
                else:
                    for child in gff3.collect_descendants(line):
                        if child['type'] in u_types and child['line_raw'] not in unique:
                            children.append(child)
                            unique.add(child['line_raw'])
                children = sorted(children, key=lambda k: k['line_index'])
            flag = 0
            for child in children:
                f = 0
                if 'replace' not in child['attributes']:
                    child['attributes']['replace'] = line['attributes']['replace']
                    flag += 1
                    f += 1
                for index in range(len(child['attributes']['replace'])):
                    child['attributes']['replace'][index] = re.sub(r'\s+', '', child['attributes']['replace'][index])
                if f == 0:
                    # replace tags exist at both gene and mRNA level; check that they agree
                    i = str(sorted(line['attributes']['replace']))
                    j = str(sorted(child['attributes']['replace']))
                    if not i == j:
                        print('[Warning!] replace tag at gene level ({0:s}) is not consistent with that at mRNA level ({1:s})'.format(i, j))
            if user_defined1 is None:
                del line['attributes']['replace']
            else:
                if line['type'] not in u_types:
                    del line['attributes']['replace']
        # Add an exon feature with the same coordinates to an ncRNA feature if the ncRNA does not contain at least one exon
        if user_defined1 is None:
            children = line['children']
        else:
            children = []
            unique = set()
            if line['type'] in u_types:
                children.append(line)
            else:
                for child in gff3.collect_descendants(line):
                    if child['type'] in u_types and child['line_raw'] not in unique:
                        children.append(child)
                        unique.add(child['line_raw'])
            children = sorted(children, key=lambda k: k['line_index'])
        for child in children:
            exonflag = 0
            if child['type'] in NCRNA:
                gchildren = child['children']
                for gchild in gchildren:
                    if gchild['type'] == 'exon':
                        exonflag += 1
                if exonflag == 0:
                    newid = '{0:s}-EXON1'.format(child['attributes']['ID'])
                    newExon = copy.deepcopy(child)
                    eofindex = len(gff3.lines)
                    newExon['line_index'] = eofindex
                    newExon['parents'] = []
                    newExon['attributes']['Parent'] = []
                    newExon['attributes']['ID'] = newid
                    newExon['attributes']['Name'] = newid
                    newExon['type'] = 'exon'
                    if 'replace' in newExon['attributes']:
                        del newExon['attributes']['replace']
                    newExon['parents'].append(child)
                    newExon['attributes']['Parent'].append(child['attributes']['ID'])
                    child['children'].append(newExon)
                    gff3.features[newExon['attributes']['ID']].append(newExon)
                    gff3.lines.append(newExon)
        if line['type'] == 'gene' or line['type'] == 'pseudogene':
            if 'children' not in line:
                gff3.remove(line)
        if auto:
            if 'children' in line:
                if user_defined1 is None:
                    children = line['children']
                else:
                    children = []
                    unique = set()
                    if line['type'] in u_types:
                        children.append(line)
                    else:
                        for child in gff3.collect_descendants(line):
                            if child['type'] in u_types and child['line_raw'] not in unique:
                                children.append(child)
                                unique.add(child['line_raw'])
                tags = {}
                for child in children:
                    tag = ','.join(child['attributes']['replace']).replace(' ', '')
                    tag = tag.split(',')
                    tags[tuple(tag)] = 0
                # multiple isoforms carry different replace tags; merge them unless one is NA
                if len(tags) > 1:
                    flag = 0
                    merged_tag = set()
                    for tag in tags.keys():
                        if 'NA' in tag:
                            flag = 1
                        merged_tag.update(list(tag))
                    if flag == 0:
                        for child in children:
                            child['attributes']['replace'] = list(merged_tag)
    if report_file:
        report_fh.close()
    logger.info('Writing revised gff: ({0:s})...'.format(output_gff))
    gff3.write(output_gff)
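# Usage sketch (assumed invocation; file names are placeholders). The revision
# file is the tab-separated check1.txt-style table read at the top of main():
#
#   main(gff_file='new_models.gff3', revision_file='check1.txt',
#        output_gff='revised.gff3', report_file='replace_tag_report.txt')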
def main(in_gff, merge_report, out_merge_report, out_gff, uuid_on, prefix, digitlen, report, alias):
    logger.info('Reading input gff3 file: (%s)', in_gff)
    gff3 = Gff3(gff_file=in_gff, logger=None)
    if merge_report:
        if not out_merge_report:
            logger.error('-m is given. Please specify the filename of the updated merge report with -om')
            sys.exit(1)
        else:
            logger.info('Reading the update report file generated by gff3_merge program: (%s)', merge_report)
            header_lines, log_lines, merge_report_dict = read_merge_report(gff3, merge_report)
    # generate a table of comparison between old and new IDs
    if report:
        out_report = open(report, 'w')
    # dict of old-to-new ID pairs
    # ID_dict = {old_ID: [newID], 'missing': [newID1, newID2]}
    ID_dict = {'missing': []}
    ID_order = []
    roots = list()
    logger.info('Generate new ID for features in (%s)', in_gff)
    for line in gff3.lines:
        try:
            if line['line_type'] == 'feature':
                if uuid_on:
                    newID = str(uuid.uuid1())
                    if 'ID' in line['attributes']:
                        if line['attributes']['ID'] in ID_dict:
                            ID_dict[line['attributes']['ID']].append(newID)
                            if alias:
                                line['attributes']['Alias'] = line['attributes']['ID']
                            line['attributes']['ID'] = newID
                        else:
                            ID_dict[line['attributes']['ID']] = [newID]
                            ID_order.append(line['attributes']['ID'])
                            if alias:
                                line['attributes']['Alias'] = line['attributes']['ID']
                            line['attributes']['ID'] = newID
                    else:
                        ID_dict['missing'].append(newID)
                        line['attributes']['ID'] = newID
                    if 'Parent' in line['attributes']:
                        for index, parent in enumerate(line['attributes']['Parent']):
                            if parent in ID_dict:
                                line['attributes']['Parent'][index] = ID_dict[parent][0]
                            else:
                                newID = str(uuid.uuid1())
                                ID_dict[parent] = [newID]
                                ID_order.append(parent)
                                line['attributes']['Parent'][index] = newID
                else:
                    if 'Parent' not in line['attributes']:
                        roots.append(line)
        except KeyError:
            logger.warning('[Missing Attributes] Line (%s)', str(line['line_index'] + 1))
    IDnumber = 0
    for root in roots:
        newID = idgenerator(prefix, IDnumber, digitlen)
        IDnumber = newID['maxnum']
        ID_dict[root['attributes']['ID']] = [newID['ID']]
        ID_order.append(root['attributes']['ID'])
        if alias:
            root['attributes']['Alias'] = root['attributes']['ID']
        root['attributes']['ID'] = newID['ID']
        children = root['children']
        alphabets = alphabets_suffix(len(children))
        for child in children:
            for index, parent in enumerate(child['attributes']['Parent']):
                if parent in ID_dict:
                    child['attributes']['Parent'][index] = newID['ID']
            newcID = '%s-R%s' % (newID['ID'], alphabets.pop(0))
            ID_dict[child['attributes']['ID']] = [newcID]
            ID_order.append(child['attributes']['ID'])
            if alias:
                child['attributes']['Alias'] = child['attributes']['ID']
            child['attributes']['ID'] = newcID
            collected_list = descendants_list(line_data=child, level=0)
            levellist = level_list(collected_list)
            IDnumber_dict = dict()
            for item_list in levellist:
                reverse = False
                if len(item_list) > 1:
                    if item_list[0]['strand'] == '-':
                        reverse = True
                descendant_sort = TypeSort(item_list, dict(), reverse)
                for descend in descendant_sort:
                    flag = False
                    if descend['type'] not in IDnumber_dict:
                        IDnumber_dict[descend['type']] = 0
                    for index, parent in enumerate(descend['attributes']['Parent']):
                        if parent in ID_dict:
                            if flag:
                                break
                            if descend['attributes']['ID'] not in ID_dict:
                                deprefix = '%s-%s' % (ID_dict[parent][0], descend['type'])
                                newdID = idgenerator(deprefix, IDnumber_dict[descend['type']], 3)
                                IDnumber_dict[descend['type']] = newdID['maxnum']
                                ID_dict[descend['attributes']['ID']] = [newdID['ID']]
                                ID_order.append(descend['attributes']['ID'])
                                descend['attributes']['ID'] = newdID['ID']
                                flag = True
                            if not flag:
                                deprefix = '%s-%s' % (ID_dict[parent][0], descend['type'])
                                newdID = idgenerator(deprefix, IDnumber_dict[descend['type']], 3)
                                IDnumber_dict[descend['type']] = newdID['maxnum']
                                ID_dict[descend['attributes']['ID']].append(newdID['ID'])
                                descend['attributes']['ID'] = newdID['ID']
                                flag = True
                            descend['attributes']['Parent'][index] = ID_dict[parent][0]
    if merge_report and out_merge_report:
        logger.info('Update report file generated by gff3_merge program with new IDs.')
        with open(out_merge_report, 'w') as out_f:
            for header_line in header_lines:
                out_f.write(header_line + '\n')
            for key in merge_report_dict:
                if key not in ID_order:
                    logger.error('The report file has to correspond to the gff3 file specified with -g')
                    sys.exit(1)
                else:
                    for line_num in merge_report_dict[key]:
                        # update Tmp_OGSv0_ID
                        log_lines[line_num][4] = ID_dict[key][0]
            for log_line in log_lines:
                out_f.write('\t'.join(log_line) + '\n')
    logger.info('Write out gff3 file: (%s)', out_gff)
    write_gff3(gff3, out_gff)
    if report:
        ID_order.append('missing')
        logger.info('Generate a report of comparison between old and new IDs: (%s)', report)
        out_line = 'Old_ID\tNewID'
        out_report.write(out_line + '\n')
        for key in ID_order:
            for value in ID_dict[key]:
                out_line = '%s\t%s' % (key, value)
                out_report.write(out_line + '\n')
        out_report.close()
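# Usage sketch (assumed invocation; names are placeholders). With uuid_on=False,
# root features receive IDs built from prefix/digitlen via idgenerator, children
# get an -R<letter> suffix, and deeper descendants get IDs derived from the
# parent ID plus the feature type:
#
#   main(in_gff='merged.gff3', merge_report=None, out_merge_report=None,
#        out_gff='renamed.gff3', uuid_on=False, prefix='GENE',
#        digitlen=8, report='id_mapping.txt', alias=True)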
def main(gff_file=None, fasta_file=None, embedded_fasta=False, stype=None, user_defined=None, dline=None, qc=True, output_prefix=None, logger=None):
    stderr_handler = logging.StreamHandler()
    stderr_handler.setFormatter(logging.Formatter('%(levelname)-8s %(message)s'))
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)
    if not gff_file or (not fasta_file and not embedded_fasta) or not stype:
        print('Gff file, fasta file, and type of extracted sequences need to be specified')
        sys.exit(1)
    type_set = ['gene', 'exon', 'pre_trans', 'trans', 'cds', 'pep', 'all', 'user_defined']
    if stype not in type_set:
        logger.error('Your sequence type is "{0:s}". Sequence type must be one of {1:s}!'.format(stype, str(type_set)))
        sys.exit(1)
    if stype == 'all' and output_prefix:
        pass
    elif stype != 'all' and output_prefix:
        logger.info('Specifying prefix of output file name: (%s)...', output_prefix)
        fname = '{0:s}_{1:s}.fa'.format(output_prefix, stype)
        report_fh = open(fname, 'w')
    else:
        print('[Error] Please specify the prefix of output file name...')
        sys.exit(1)
    if stype == 'user_defined' and user_defined is not None:
        if len(user_defined) != 2:
            logger.error('Please specify parent and child feature via the -u argument. Format: [parent feature type],[child feature type]')
            sys.exit(1)
    elif stype != 'user_defined' and user_defined is not None:
        logger.warning('Your sequence type is "{0:s}", -u argument will be ignored.'.format(stype))
    elif stype == 'user_defined' and user_defined is None:
        logger.error('-u is needed in combination with -st user_defined.')
        sys.exit(1)
    logger.info('Reading files: {0:s}, {1:s}...'.format(gff_file, fasta_file))
    gff = None
    if qc:
        initial_phase = False
        gff = Gff3(gff_file=gff_file, fasta_external=fasta_file, logger=logger)
        if embedded_fasta and len(gff.fasta_embedded) == 0:
            logger.error('There is no embedded fasta in the GFF3 file.')
            sys.exit(1)
        logger.info('Checking errors...')
        gff.check_parent_boundary()
        gff.check_phase(initial_phase)
        gff.check_reference()
        error_set = function4gff.extract_internal_detected_errors(gff)
        t = intra_model.main(gff, logger=logger)
        if t:
            error_set.extend(t)
        t = single_feature.main(gff, logger=logger)
        if t:
            error_set.extend(t)
        if error_set and len(error_set):
            escaped_error = ['Esf0012', 'Esf0033']
            eSet = list()
            for e in error_set:
                if e['eCode'] not in escaped_error:
                    eSet.append(e)
            if len(eSet):
                logger.warning('The extracted sequences might be wrong for the following features which have formatting errors...')
                print('ID\tError_Code\tError_Tag')
                for e in eSet:
                    tag = '[{0:s}]'.format(e['eTag'])
                    print(e['ID'], e['eCode'], tag)
    else:
        gff = Gff3(gff_file=gff_file, fasta_external=fasta_file, logger=logger_null)
        if embedded_fasta and len(gff.fasta_embedded) == 0:
            logger.error('There is no embedded fasta in the GFF3 file.')
    logger.info('Extract sequences for {0:s}...'.format(stype))
    seq = dict()
    if stype == 'all':
        if output_prefix:
            logger.info('Specifying prefix of output file name: (%s)...', output_prefix)
        else:
            print('[Error] Please specify the prefix of output file name...')
            sys.exit(1)

        def write_fasta(seq, tmp_stype):
            # Write one FASTA file per sequence type under the shared prefix
            if len(seq):
                fname = '{0:s}_{1:s}.fa'.format(output_prefix, tmp_stype)
                fh = open(fname, 'w')
                logger.info('\t\tPrint out extracted sequences: {0:s}_{1:s}.fa...'.format(output_prefix, tmp_stype))
                for k, v in seq.items():
                    if len(k) != 0 and len(v) != 0:
                        fh.write('{0:s}\n{1:s}\n'.format(k, v))
                fh.close()

        tmp_stype = 'pre_trans'
        logger.info('\t- Extract sequences for {0:s}...'.format(tmp_stype))
        write_fasta(extract_start_end(gff, tmp_stype, dline, embedded_fasta), tmp_stype)
        tmp_stype = 'gene'
        logger.info('\t- Extract sequences for {0:s}...'.format(tmp_stype))
        write_fasta(extract_start_end(gff, tmp_stype, dline, embedded_fasta), tmp_stype)
        tmp_stype = 'exon'
        logger.info('\t- Extract sequences for {0:s}...'.format(tmp_stype))
        write_fasta(extract_start_end(gff, tmp_stype, dline, embedded_fasta), tmp_stype)
        tmp_stype = 'trans'
        logger.info('\t- Extract sequences for {0:s}...'.format(tmp_stype))
        write_fasta(splicer(gff, ['exon', 'pseudogenic_exon'], dline, stype, embedded_fasta), tmp_stype)
        tmp_stype = 'cds'
        logger.info('\t- Extract sequences for {0:s}...'.format(tmp_stype))
        write_fasta(splicer(gff, ['CDS'], dline, stype, embedded_fasta), tmp_stype)
        tmp_stype = 'pep'
        logger.info('\t- Extract sequences for {0:s}...'.format(tmp_stype))
        seq = dict()
        tmpseq = splicer(gff, ['CDS'], dline, tmp_stype, embedded_fasta)
        for k, v in tmpseq.items():
            # rename the defline and translate the spliced CDS into peptide
            k = k.replace('|mRNA(CDS)|', '|peptide|')
            seq[k] = translator(v)
        write_fasta(seq, tmp_stype)
    elif stype == 'user_defined':
        feature_type = [user_defined[0], user_defined[1]]
        seq = splicer(gff, feature_type, dline, stype, embedded_fasta)
        if len(seq):
            logger.info('Print out extracted sequences: {0:s}_{1:s}.fa...'.format(output_prefix, stype))
            for k, v in seq.items():
                if len(k) != 0 and len(v) != 0:
                    report_fh.write('{0:s}\n{1:s}\n'.format(k, v))
    else:
        if stype == 'pre_trans' or stype == 'gene' or stype == 'exon':
            seq = extract_start_end(gff, stype, dline, embedded_fasta)
        elif stype == 'trans':
            feature_type = ['exon', 'pseudogenic_exon']
            seq = splicer(gff, feature_type, dline, stype, embedded_fasta)
        elif stype == 'cds':
            feature_type = ['CDS']
            seq = splicer(gff, feature_type, dline, stype, embedded_fasta)
        elif stype == 'pep':
            feature_type = ['CDS']
            tmpseq = splicer(gff, feature_type, dline, stype, embedded_fasta)
            for k, v in tmpseq.items():
                k = k.replace('|mRNA(CDS)|', '|peptide|')
                seq[k] = translator(v)
        if len(seq):
            logger.info('Print out extracted sequences: {0:s}_{1:s}.fa...'.format(output_prefix, stype))
            for k, v in seq.items():
                if len(k) != 0 and len(v) != 0:
                    report_fh.write('{0:s}\n{1:s}\n'.format(k, v))
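# Usage sketch (assumed invocation; file names are placeholders). stype must be
# one of the types validated above; 'all' writes one FASTA per sequence type:
#
#   import logging
#   main(gff_file='annotations.gff3', fasta_file='genome.fa', stype='pep',
#        dline='complete', qc=True, output_prefix='out',
#        logger=logging.getLogger('gff3_to_fasta'))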
def script_main():
    logger_stderr = logging.getLogger(__name__ + 'stderr')
    logger_stderr.setLevel(logging.INFO)
    stderr_handler = logging.StreamHandler()
    stderr_handler.setFormatter(logging.Formatter('%(levelname)-8s %(message)s'))
    logger_stderr.addHandler(stderr_handler)
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)
    import argparse
    from textwrap import dedent
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=dedent("""\
        Testing environment:
        1. Python 2.7

        Inputs:
        1. GFF3: specify the file name with the -g or --gff argument. Note that this program requires gene/pseudogene and mRNA/pseudogenic_transcript features to have an ID attribute in column 9.
        2. Fasta file: specify the file name with the -f or --fasta argument.

        Outputs:
        1. Error report for the input GFF3 file
           * Line_num: line numbers of the problematic models found in the input GFF3 file.
           * Error_code: error codes for the problematic models. See lib/ERROR/ERROR.py for the full list of Error_code and the corresponding Error_tag.
           * Error_tag: details of the errors found for the problematic models. See lib/ERROR/ERROR.py for the full list of Error_code and the corresponding Error_tag.

        Quick start:
        gff3_QC -g example_file/example.gff3 -f example_file/reference.fa -o test
        or
        gff3_QC --gff example_file/example.gff3 --fasta example_file/reference.fa --output test
        """))
    parser.add_argument('-g', '--gff', type=str, help='Genome annotation file, gff3 format')
    parser.add_argument('-f', '--fasta', type=str, help='Genome sequences, fasta format')
    parser.add_argument('-noncg', '--noncanonical_gene', action='store_true',
                        help='gff3 file is not formatted in the canonical gene model format.')
    parser.add_argument('-i', '--initial_phase', action='store_true',
                        help='Check whether initial CDS phase is 0 (default: no check)')
    parser.add_argument('-n', '--allowed_num_of_n', type=int, default=0,
                        help='Max number of Ns allowed in a feature; anything more will be reported as an error (default: 0)')
    parser.add_argument('-t', '--check_n_feature_types', nargs='*', default=['CDS'],
                        help='Count the number of Ns in each feature with the type specified; multiple types may be specified, e.g. -t CDS exon (default: "CDS")')
    parser.add_argument('-o', '--output', type=str, help='output file name (default: report.txt)')
    parser.add_argument('-s', '--statistic', type=str, help='statistic file name (default: statistic.txt)')
    parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__)
    args = parser.parse_args()
    if args.gff:
        logger_stderr.info('Checking gff file (%s)...', args.gff)
    elif not sys.stdin.isatty():
        # STDIN is connected to a pipe or file
        args.gff = sys.stdin
        logger_stderr.info('Reading from STDIN...')
    else:
        # no input
        parser.print_help()
        sys.exit(1)
    if args.fasta:
        logger_stderr.info('Checking genome fasta (%s)...', args.fasta)
    elif not sys.stdin.isatty():
        # STDIN is connected to a pipe or file
        args.fasta = sys.stdin
        logger_stderr.info('Reading from STDIN...')
    else:
        # no input
        parser.print_help()
        sys.exit(1)
    if args.allowed_num_of_n or args.check_n_feature_types:
        check_n = True
    else:
        check_n = False
    logger_stderr.info('Reading gff files: (%s)...\n', args.gff)
    gff3 = Gff3(gff_file=args.gff, fasta_external=args.fasta, logger=logger_null)
    logger_stderr.info('Checking errors in the gff files: (%s)...\n', args.gff)
    if not gff3.check_parent_boundary():
        sys.exit()
    gff3.check_unresolved_parents()
    if not args.noncanonical_gene:
        gff3.check_phase(args.initial_phase)
    gff3.check_reference(fasta_external=args.fasta, check_n=check_n,
                         allowed_num_of_n=args.allowed_num_of_n,
                         feature_types=args.check_n_feature_types)
    logger_stderr.info('\t- Checking missing attributes: (%s)...\n', 'function4gff.FIX_MISSING_ATTR()')
    function4gff.FIX_MISSING_ATTR(gff3, logger=logger_stderr)
    error_set = list()
    cmd = function4gff.extract_internal_detected_errors(gff3)
    if cmd:
        error_set.extend(cmd)
    logger_stderr.info('\t- Checking intra-model errors: (%s)...\n', args.gff)
    cmd = intra_model.main(gff3, logger=logger_stderr, noncanonical_gene=args.noncanonical_gene)
    if cmd:
        error_set.extend(cmd)
    logger_stderr.info('\t- Checking inter-model errors: (%s)...\n', args.gff)
    cmd = inter_model.main(gff3, args.gff, args.fasta, logger=logger_stderr, noncanonical_gene=args.noncanonical_gene)
    if cmd:
        error_set.extend(cmd)
    logger_stderr.info('\t- Checking single-feature errors: (%s)...\n', args.gff)
    cmd = single_feature.main(gff3, logger=logger_stderr)
    if cmd:
        error_set.extend(cmd)
    if args.output:
        logger_stderr.info('Print QC report at {0:s}'.format(args.output))
        report_fh = open(args.output, 'w')
    else:
        logger_stderr.info('Print QC report at {0:s}'.format('report.txt'))
        report_fh = open('report.txt', 'w')
    if args.statistic:
        logger_stderr.info('Print QC statistic report at {0:s}'.format(args.statistic))
        statistic_fh = open(args.statistic, 'w')
    else:
        logger_stderr.info('Print QC statistic report at {0:s}'.format('statistic.txt'))
        statistic_fh = open('statistic.txt', 'w')
    report_fh.write('Line_num\tError_code\tError_tag\n')
    for e in sorted(error_set, key=lambda x: sorted(x.keys())):
        tag = '[{0:s}]'.format(e['eTag'])
        report_fh.write('{0:s}\t{1:s}\t{2:s}\n'.format(str(e['line_num']), str(e['eCode']), str(tag)))
    # statistic file
    error_counts = dict()
    ERROR_INFO = ERROR.INFO
    statistic_fh.write('Error_code\tNumber_of_problematic_models\tError_tag\n')
    for s in sorted(error_set, key=lambda x: sorted(x.keys())):
        if s['eCode'] not in error_counts:
            error_counts[s['eCode']] = {'count': 0, 'etag': ERROR_INFO[s['eCode']]}
        error_counts[s['eCode']]['count'] += 1
    for a in error_counts:
        statistic_fh.write('{0:s}\t{1:s}\t{2:s}\n'.format(str(a), str(error_counts[a]['count']), str(error_counts[a]['etag'])))
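# Typical entry-point hook-up (a sketch; the installed console script may wire
# this up differently):
#
#   if __name__ == '__main__':
#       script_main()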
def main(gff_file1, gff_file2, fasta, report, output_gff, all_assign=False, auto=True, user_defined1=None, user_defined2=None, logger=None):
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)
    if not logger:
        logger = logger_null
    # strip the directory part of gff_file1, keeping only the file name
    if re.search(r'(\S+)/(\S+)$', gff_file1):
        _, gff_file1_name = re.search(r'(\S+)/(\S+)$', gff_file1).groups()
    else:
        gff_file1_name = gff_file1
    if auto:
        autoDIR = 'auto_replace_tag'
        autoFILE = '{0:s}/check1.txt'.format(autoDIR)
        autoReviseGff = '{0:s}/Revised_{1:s}'.format(autoDIR, gff_file1_name)
        autoReviseReport = '{0:s}/replace_tag_report.txt'.format(autoDIR)
        logger.info('========== Auto-assignment of replace tags for each transcript model ==========')
        gff3_merge.auto_replace_tag.main(gff1=gff_file1, gff2=gff_file2, fasta=fasta, outdir=autoDIR, scode='TEMP', all_assign=all_assign, user_defined1=user_defined1, user_defined2=user_defined2, logger=logger)
        gff3_merge.revision.main(gff_file=gff_file1, revision_file=autoFILE, output_gff=autoReviseGff, report_file=autoReviseReport, user_defined1=user_defined1, auto=auto, logger=logger)
        logger.info('========== Check whether there are missing replace tags ==========')
        gff3 = Gff3(gff_file=autoReviseGff, logger=logger_null)
        error_models = check_replace(gff3, user_defined1)
        if error_models:
            logger.error('There are models missing replace tags...')
            logger.error('Please check the models below in {0:s}. Specify the proper replaced models in column 9, e.g. \'replace=[Transcript ID]\'. If a model is newly added, use \'replace=NA\'. Then re-execute the program.\n'.format(autoReviseGff))
            for line in error_models:
                print(line['line_raw'])
            return
        else:
            logger.info('- All models have replace tags.')
        logger.info('========== Merge the two gff files ==========')
        gff3_merge.merge.main(autoReviseGff, gff_file2, output_gff, report, user_defined1, user_defined2, logger)
    else:
        logger.info('========== Check whether there are missing replace tags ==========')
        gff3 = Gff3(gff_file=gff_file1, logger=logger_null)
        error_models = check_replace(gff3)
        if error_models:
            logger.error('There are models missing replace tags...')
            logger.error('Please check the models below in {0:s}. Specify the proper replaced models in column 9, e.g. \'replace=[Transcript ID]\'. If a model is newly added, use \'replace=NA\'. Then re-execute the program.'.format(gff_file1))
            for line in error_models:
                print(line['line_raw'].strip())
            return
        else:
            logger.info('- All models have replace tags.')
        logger.info('========== Merge the two gff files ==========')
        gff3_merge.merge.main(gff_file1, gff_file2, output_gff, report, user_defined1, user_defined2, logger)
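# Usage sketch (assumed invocation; file names are placeholders). With auto=True
# the replace tags are assigned automatically before merging; auto=False expects
# gff_file1 to already carry replace tags:
#
#   main(gff_file1='new_models.gff3', gff_file2='reference_models.gff3',
#        fasta='genome.fa', report='merge_report.txt', output_gff='merged.gff3')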
def script_main():
    logger_stderr = logging.getLogger(__name__ + 'stderr')
    logger_stderr.setLevel(logging.INFO)
    stderr_handler = logging.StreamHandler()
    stderr_handler.setFormatter(logging.Formatter('%(levelname)-8s %(message)s'))
    logger_stderr.addHandler(stderr_handler)
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)
    import argparse
    from textwrap import dedent
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=dedent("""\
        Testing environment:
        1. Python 2.7

        Input:
        1. Error report: error report from gff3_QC.py. Specify the file name with the -qc_r or --qc_report argument.
        2. GFF3: specify the file name with the -g or --gff argument.

        Output:
        1. Corrected GFF3

        Quick start:
        gff3_fix -qc_r error.txt -g example_file/example.gff3 -og corrected.gff3
        """))
    parser.add_argument('-qc_r', '--qc_report', type=str, help='Error report from gff3_QC.py')
    parser.add_argument('-g', '--gff', type=str, help='Genome annotation file, gff3 format')
    parser.add_argument('-og', '--output_gff', type=str, help='output gff3 file name', default='corrected.gff3')
    parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__)
    args = parser.parse_args()
    if args.qc_report:
        logger_stderr.info('Checking QC report file (%s)...', args.qc_report)
    else:
        # no input
        parser.print_help()
        sys.exit()
    if args.gff:
        logger_stderr.info('Checking GFF3 file (%s)...', args.gff)
    else:
        # no input
        parser.print_help()
        sys.exit()
    logger_stderr.info('Reading QC report file: (%s)...\n', args.qc_report)
    # error_dict example: {'Emr0001': [[15, 16], [13]], 'Esf0005': [[17]]}
    error_dict = {}
    # line_num_dict example: {3: {'Emr0001': <tag>, 'Esf0003': <tag>}, 15: {'Emr0026': <tag>}}
    line_num_dict = {}
    try:
        with open(args.qc_report, 'r') as qcr:
            # skip the first line (header)
            next(qcr)
            for line in qcr:
                line = line.strip()
                if line:
                    try:
                        lines = line.split('\t')
                        # wrap map() in list() so the line numbers can be iterated more than once
                        line_num_list = list(map(int, re.findall(r'\d+', lines[0])))
                        if lines[1] not in error_dict:
                            error_dict[lines[1]] = [line_num_list]
                        else:
                            error_dict[lines[1]].append(line_num_list)
                        for line_num in line_num_list:
                            if line_num not in line_num_dict:
                                line_num_dict[line_num] = {lines[1]: lines[2]}
                            else:
                                line_num_dict[line_num][lines[1]] = lines[2]
                    except IndexError:
                        logger_stderr.warning('Failed to recognize - %s', line)
    except IOError:
        logger_stderr.error('Failed to read QC report file!')
    logger_stderr.info('Reading GFF3 file: (%s)...\n', args.gff)
    try:
        gff3 = Gff3(gff_file=args.gff, logger=logger_null)
    except Exception:
        logger_stderr.error('Failed to read GFF3 file!')
        sys.exit(1)
    gff3_fix.fix.main(gff3=gff3, output_gff=args.output_gff, error_dict=error_dict, line_num_dict=line_num_dict, logger=logger_null)
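# Input sketch: the loop above parses the tab-separated gff3_QC report, whose
# header is skipped and whose first column may list several line numbers, e.g.
# (error tags abbreviated as placeholders):
#
#   Line_num        Error_code      Error_tag
#   [15, 16]        Emr0001         <error description>
#   [17]            Esf0005         <error description>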