Example No. 1
def main(old_gff, new_gff, output_gff, re_construct_features, tmp_identifier):
    logger.info('Reading original GFF3 file: (%s)...\n', old_gff)
    old_gff3 = Gff3(gff_file=old_gff)

    logger.info('Reading updated GFF3 file: (%s)...\n', new_gff)
    new_gff3 = Gff3(gff_file=new_gff)

    if re_construct_features:
        out_f = open(re_construct_features, 'w')
    else:
        out_f = None
    polypeptide_re_construct(old_gff3=old_gff3, new_gff3=new_gff3, tmp_identifier=tmp_identifier, report=out_f)
    re_construct(old_gff3=old_gff3, new_gff3=new_gff3, tmp_identifier=tmp_identifier, report=out_f)
    logger.info('Generating the re-constructed gff3 file: (%s)...\n', output_gff)
    write_gff3(new_gff3, output_gff)

    if re_construct_features:
        out_f.close()
Example No. 2
def open_gff_file(gff):
    gff_hd = Gff3(gff)

    cds_recs = []
    for line in gff_hd.lines[4:]:
        if 'seqid' in line and 'type' in line and 'strand' in line:
            if line['type'] == 'CDS' and line['strand'] == '+':
                cds_recs.append(line)

    print "number of cds: {}".format(len(cds_recs))
    return cds_recs
Example No. 3
def run_modifier(args):

    modifier = Modifier(args.annotation)
    print("Reading gff file")
    gff: Gff3 = Gff3(gff_file=args.gff_path)

    # Modify the gff file using the Modifier class
    modifier.modify_gff(gff)

    print("Writing modified gff file")
    # Write the modified gff to the output path
    if args.output_path is None:
        gff.write(sys.stdout)

    else:
        with open(args.output_path, "w") as file_out:
            gff.write(file_out)
Example No. 4
def parse_genes(gff3_file, go_terms_file):
    """parses the genes from gff3 and enriches it with additional information"""
    go_map = _get_go_map(go_terms_file)
    gff = Gff3(gff3_file)
    genes = [
        line for line in gff.lines
        if line['line_type'] == 'feature' and line['type'] == 'gene'
    ]

    genes_map = {}
    for gene in genes:
        gene_id = gene['attributes']['ID']
        symbol = gene['attributes'].get('symbol', None)
        full_name = gene['attributes'].get('full_name', None)
        aliases = gene['attributes'].get('Alias', [])
        aliases = [{
            'symbol': aliases[i],
            'full_name': (None if i >= len(aliases) - 1 else aliases[i + 1])
        } for i in range(0, len(aliases), 2)]

        if symbol and full_name:
            aliases.insert(0, {'symbol': symbol, 'full_name': full_name})

        gene_dict = {
            'positions': {
                'gte': gene['start'],
                'lte': gene['end']
            },
            'chr': gene['seqid'].lower(),
            'type': gene['type'],
            'strand': gene['strand'],
            'name': gene_id,
            'aliases': aliases,
            'isoforms': [],
            'GO': go_map.get(gene_id, [])
        }
        gene_dict['isoforms'] = _parse_isoforms(gff, gene)
        genes_map[gene_id] = gene_dict
        gene_dict['suggest'] = [gene_id]
        gene_dict['suggest'].extend(
            set([alias['symbol'] for alias in gene_dict['aliases']]))
    return genes_map
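The Alias pairing above assumes the attribute arrives as a flat list that alternates symbol and full name. A standalone illustration of that pairing, using made-up values (not taken from any real GFF3 file):

# Hypothetical Alias value as it would appear in gene['attributes']['Alias']
aliases = ['ABC1', 'ATP-binding cassette 1', 'TRX2']
paired = [{
    'symbol': aliases[i],
    'full_name': (None if i >= len(aliases) - 1 else aliases[i + 1])
} for i in range(0, len(aliases), 2)]
# paired == [{'symbol': 'ABC1', 'full_name': 'ATP-binding cassette 1'},
#            {'symbol': 'TRX2', 'full_name': None}]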
Example No. 5
def main():
    arguments = myTools.checkArgs([("gffFile", file)], [], __doc__)

    gff = Gff3(arguments["gffFile"])
    genes = [
        line for line in gff.lines
        if line['line_type'] == 'feature' and line['type'] == 'mRNA'
    ]

    for gene in genes:
        #print >> sys.stdout,  gene['seqid'], gene['start'], gene['end'], gene['strand'], gene['attributes']['ID']
        if gene['strand'] == "+":
            gene['strand'] = '1'
        else:
            gene['strand'] = '-1'
        print(myFile.myTSV.printLine([
            gene['seqid'], gene['start'], gene['end'], gene['strand'],
            gene['attributes']['ID']
        ]),
              file=sys.stdout)
Example No. 6
def main(in_gff, merge_report, out_merge_report, out_gff, uuid_on, prefix,
         digitlen, report, alias):
    logger.info('Reading input gff3 file: (%s)', in_gff)
    gff3 = Gff3(gff_file=in_gff, logger=None)
    if merge_report:
        if not out_merge_report:
            logger.error(
                '-m is given. Please specify the filename of the updated merge report with -om'
            )
            sys.exit(1)
        else:
            logger.info(
                'Reading the update report file generated by gff3_merge program: (%s)',
                merge_report)
            header_lines, log_lines, merge_report_dict = read_merge_report(
                gff3, merge_report)
    # generate a table of comparison between old and new IDs.
    if report:
        out_report = open(report, 'w')

    # dict pairing old IDs with their new IDs
    # ID_dict = {old_ID: [newID, ...], 'missing': [newID1, newID2, ...]}
    ID_dict = {'missing': []}
    ID_order = []
    roots = list()
    logger.info('Generate new ID for features in (%s)', in_gff)
    for line in gff3.lines:
        try:
            if line['line_type'] == 'feature':
                if uuid_on:
                    newID = str(uuid.uuid1())
                    if 'ID' in line['attributes']:
                        if line['attributes']['ID'] in ID_dict:
                            ID_dict[line['attributes']['ID']].append(newID)
                        else:
                            ID_dict[line['attributes']['ID']] = [newID]
                            ID_order.append(line['attributes']['ID'])
                        if alias:
                            line['attributes']['Alias'] = line['attributes']['ID']
                        line['attributes']['ID'] = newID
                    else:
                        ID_dict['missing'].append(newID)
                        line['attributes']['ID'] = newID
                    if 'Parent' in line['attributes']:
                        for index, parent in enumerate(
                                line['attributes']['Parent']):
                            if parent in ID_dict:
                                line['attributes']['Parent'][index] = ID_dict[
                                    parent][0]
                            else:
                                newID = str(uuid.uuid1())
                                ID_dict[parent] = [newID]
                                ID_order.append(parent)
                                line['attributes']['Parent'][index] = newID
                else:
                    if 'Parent' not in line['attributes']:
                        roots.append(line)
        except KeyError:
            logger.warning('[Missing Attributes] Line (%s)',
                           str(line['line_index'] + 1))
    IDnumber = 0
    for root in roots:
        newID = idgenerator(prefix, IDnumber, digitlen)
        IDnumber = newID['maxnum']
        ID_dict[root['attributes']['ID']] = [newID['ID']]
        ID_order.append(root['attributes']['ID'])
        if alias:
            root['attributes']['Alias'] = root['attributes']['ID']
        root['attributes']['ID'] = newID['ID']
        children = root['children']
        alphabets = alphabets_suffix(len(children))
        for child in children:
            for index, parent in enumerate(child['attributes']['Parent']):
                if parent in ID_dict:
                    child['attributes']['Parent'][index] = newID['ID']

            newcID = '%s-R%s' % (newID['ID'], alphabets.pop(0))
            ID_dict[child['attributes']['ID']] = [newcID]
            ID_order.append(child['attributes']['ID'])
            if alias:
                child['attributes']['Alias'] = child['attributes']['ID']
            child['attributes']['ID'] = newcID
            collected_list = descendants_list(line_data=child, level=0)
            levellist = level_list(collected_list)
            IDnumber_dict = dict()
            for item_list in levellist:
                reverse = False
                if len(item_list) > 1:
                    if item_list[0]['strand'] == '-':
                        reverse = True
                descendant_sort = TypeSort(item_list, dict(), reverse)
                for descend in descendant_sort:
                    flag = False
                    if descend['type'] not in IDnumber_dict:
                        IDnumber_dict[descend['type']] = 0
                    for index, parent in enumerate(
                            descend['attributes']['Parent']):
                        if parent in ID_dict:
                            if flag:
                                break
                            if descend['attributes']['ID'] not in ID_dict:
                                deprefix = '%s-%s' % (ID_dict[parent][0],
                                                      descend['type'])
                                newdID = idgenerator(
                                    deprefix, IDnumber_dict[descend['type']],
                                    3)
                                IDnumber_dict[
                                    descend['type']] = newdID['maxnum']
                                ID_dict[descend['attributes']['ID']] = [
                                    newdID['ID']
                                ]
                                ID_order.append(descend['attributes']['ID'])
                                descend['attributes']['ID'] = newdID['ID']
                                flag = True
                            if not flag:
                                deprefix = '%s-%s' % (ID_dict[parent][0],
                                                      descend['type'])
                                newdID = idgenerator(
                                    deprefix, IDnumber_dict[descend['type']],
                                    3)
                                IDnumber_dict[
                                    descend['type']] = newdID['maxnum']
                                ID_dict[descend['attributes']['ID']].append(
                                    newdID['ID'])
                                descend['attributes']['ID'] = newdID['ID']
                                flag = True
                            descend['attributes']['Parent'][index] = ID_dict[
                                parent][0]
    if merge_report and out_merge_report:
        logger.info(
            'Update report file generated by gff3_merge program with new IDs.')
        with open(out_merge_report, 'w') as out_f:
            for header_line in header_lines:
                out_f.write(header_line + '\n')
            for key in merge_report_dict:
                if key not in ID_order:
                    logger.error(
                        'The report file has to correspond to the gff3 file specified with -g'
                    )
                    sys.exit(1)
                else:
                    for line_num in merge_report_dict[key]:
                        # update Tmp_OGSv0_ID
                        log_lines[line_num][4] = ID_dict[key][0]
            for log_line in log_lines:
                out_f.write('\t'.join(log_line) + '\n')
    logger.info('Write out gff3 file: (%s)', out_gff)
    write_gff3(gff3, out_gff)
    if report:
        ID_order.append('missing')
        logger.info(
            'Generate a report of comparison between old and new IDs: (%s)',
            report)
        out_line = 'Old_ID\tNewID'
        out_report.write(out_line + '\n')
        for key in ID_order:
            for value in ID_dict[key]:
                out_line = '%s\t%s' % (key, value)
                out_report.write(out_line + '\n')

        out_report.close()
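The helpers used above (idgenerator, alphabets_suffix, descendants_list, level_list, TypeSort, write_gff3, read_merge_report) belong to the surrounding gff3 tool package and are not shown here. A rough sketch of what the first two might look like, inferred only from the call sites above; the real implementations may differ:

import string

def idgenerator(prefix, idnumber, digitlen):
    # Guess based on the call sites: build a zero-padded ID such as PREFIX000001
    # and return the counter value to pass back in on the next call.
    nextnum = idnumber + 1
    return {'ID': '%s%s' % (prefix, str(nextnum).zfill(digitlen)),
            'maxnum': nextnum}

def alphabets_suffix(count):
    # Guess based on the call sites: one capital letter per child, so that
    # child IDs become <parentID>-RA, <parentID>-RB, ...
    return list(string.ascii_uppercase[:count])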
Example No. 7
#python extract_intron_stat_from_gff.py links18.scaffolds.fa.rnd3_rerun.GAAS.gff test.csv

from gff3 import Gff3
import sys
#usage: python extract_intron_stat_from_gff.py [input gff file] [output csv file]
gff = Gff3(sys.argv[1])
output = sys.argv[2]

intron_num = {}
exon_length = {}
mrna_len = {}
intron_len = {}
single_intron_len = []
CDS_length = {}

for l in gff.lines:
    if l['type'] == 'mRNA':
        #if l['type']=='mRNA' and float(l['attributes']['_AED']) <0.5:
        rna_nam = l['attributes']['Name']
        mrna_len[rna_nam] = l['end'] - l['start']
        exons = []
        CDS_len = 0
        for rec in l['children']:
            exons.append((rec['start'], rec['end']))
            if rec['type'] == 'CDS':
                CDS_len = CDS_len + rec['end'] - rec['start'] + 1
        CDS_length[rna_nam] = CDS_len
        exons_merged = []
        for begin, end in sorted(exons):
            if exons_merged and exons_merged[-1][1] >= begin - 1:
                # extend the current merged exon (max() guards against nested exons)
                exons_merged[-1] = (exons_merged[-1][0], max(exons_merged[-1][1], end))
            else:
                exons_merged.append((begin, end))
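The example is cut off here, before the intron statistics are actually computed. A rough sketch of how introns could be derived from the merged exon intervals, assuming an intron is simply the gap between two consecutive merged exons (a reconstruction, not the original code):

def introns_from_merged_exons(exons_merged):
    # Each intron is the gap between the end of one merged exon and the
    # start of the next one.
    introns = []
    for (start1, end1), (start2, end2) in zip(exons_merged, exons_merged[1:]):
        introns.append((end1 + 1, start2 - 1))
    return introns

# e.g. introns_from_merged_exons([(1, 100), (201, 300)]) == [(101, 200)]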
Example No. 8
# fix_source.py
# =================

# This script uses Han Lin's gff3 class to import a GFF3 file, take a list of
# IDs whose source is currently ManualCuration, and switch the source of these
# models (all parents and children) to I5K.
# Requires: https://github.com/hotdogee/gff3-py
from gff3 import Gff3
gff = Gff3('agla_v1_2-NALmod3.gff3')
id_list = [
    'AGLA014663', 'AGLA003801', 'AGLA017751', 'AGLA003809', 'AGLA000919',
    'AGLA000103'
]
source_map = {'ManualCuration': 'I5K'}

for feature_id in id_list:
    for feature in gff.features[feature_id]:
        # update the feature line itself (even if it has no descendants)
        if feature['source'] in source_map:
            feature['source'] = source_map[feature['source']]
        # update all descendant lines
        for line in gff.descendants(feature):
            if line['source'] in source_map:
                line['source'] = source_map[line['source']]

gff.write('agla_v1_2-NALmod4.gff3')
Example No. 9
def clustersToGFF(clusterspath, gffpath, goldpath, annotpath, source_type):
    gffcontent = Gff3(gffpath)
    clustercontent, goldContent, annotationContent = "", "", ""

    clustercontent = Utils.readFileLines(clusterspath)
    if 'score' in clusterspath:
        clusters = Utils.foldClusterData(clustercontent, "", 0.5)
    else:
        clusters = Utils.foldClusterData(clustercontent, "gold", "")

    goldContent = '\t'.join(Utils.readFileLines(goldpath)) if goldpath else ""
    annotationList = Utils.readFileLines(annotpath) if annotpath else ""
    annotationContent = ('\n').join(annotationList) if annotpath else ""

    # sort dict by key
    clusters = OrderedDict(sorted(clusters.items(), key=lambda x: x[0]))
    gffclusterfile = clusterspath.rsplit('.', 1)[0] + '.percluster.gff3'
    gffgenefile = clusterspath.rsplit('.', 1)[0] + '.pergene.gff3'

    outputcluster, outputgene = "##gff-version 3\n", "##gff-version 3\n"
    # filter only "mRNA" features, return dict {gene name, gff line}
    mRNAdict = {
        line['attributes']['Name'].replace('.1', ''): line
        for line in gffcontent.lines if line['type'] == 'mRNA'
    }

    for key, value in clusters.items():
        for gene in value:
            gene = gene.replace('.1', '')
            thisgene = mRNAdict.get(gene)

            if (thisgene is not None):
                chr = thisgene['seqid']
                position = str(thisgene['start']) + '\t' + str(thisgene['end'])
                score = '?'
                strand = thisgene['strand']
                phase = thisgene['phase']
                info = 'Name=' + gene + ';Note=' + key + '\n'

                if (goldContent):
                    if (gene in annotationContent):
                        annot = [
                            item for item in annotationList if gene in item
                        ]
                        annot = annot[0].split('\t')[1] if annot else ''
                        if ('backbone' in annot):
                            info = info.replace("\n",
                                                ";color=#EE0000\n")  # red
                        elif ('tailor' in annot):
                            info = info.replace("\n",
                                                ";color=#EE9300\n")  # orange
                        elif ('transcript' in annot):
                            info = info.replace(
                                "\n", ";color=#048014\n")  # forest green
                        elif ('transport' in annot):
                            info = info.replace(
                                "\n", ";color=#1888f0\n")  # light blue
                    elif (gene in goldContent):
                        info = info.replace(
                            "\n", ";color=#9931f2\n")  # bright purple
                outputgene += chr + '\t' + source_type + '\t' + position + '\t' + score + '\t' + strand + '\t' + phase + '\t' + info

            else:
                print('gene not found:', gene)

        startID = value[0].replace('.1', '')
        endID = value[-1].replace('.1', '')
        startGene = mRNAdict.get(startID)
        endGene = mRNAdict.get(endID)
        chr = startGene['seqid']
        position = str(startGene['start']) + '\t' + str(endGene['end'])

        strand = startGene['strand']
        phase = startGene['phase']
        score = '?'
        info = 'Name=' + key + ';Note=' + ('|').join(value) + '\n'
        outputcluster += chr + '\t' + source_type + '\t' + position + '\t' + score + '\t' + strand + '\t' + phase + '\t' + info

    Utils.writeFile(gffclusterfile, outputcluster)
    Utils.writeFile(gffgenefile, outputgene)

    return gffcontent
Example No. 10
        args = parser.parse_args(['-g', 'annotations.gff'])
    else:
        args = parser.parse_args()

    if args.gff_file:
        logger_stderr.info('Checking GFF3 file (%s)...', args.gff_file)
    elif not sys.stdin.isatty():  # if STDIN connected to pipe or file
        args.gff_file = sys.stdin
        logger_stderr.info('Reading from STDIN...')
    else:  # no input
        parser.print_help()
        sys.exit(1)

    logger_stderr.info('Checking syntax and formatting...')
    gff3 = Gff3(gff_file=args.gff_file,
                fasta_external=args.fasta_file,
                logger=logger_null)
    logger_stderr.info('Checking reference seqid, bounds and N count...')
    gff3.check_reference(allowed_num_of_n=args.allowed_num_of_n,
                         feature_types=args.check_n_feature_types)
    logger_stderr.info('Checking parent boundaries...')
    gff3.check_parent_boundary()

    gff3.check_phase()

    if args.report_file:
        logger_stderr.info('Writing validation report (%s)...',
                           args.report_file)
        report_fh = open(args.report_file, 'wb')
    else:
        report_fh = sys.stdout
Example No. 11
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'genes_gff',
        help="Input gff3 annotation file including only genes features")
    parser.add_argument('--busco_result',
                        default=None,
                        help="BUSCO full report output")
    parser.add_argument('--ips_result', help="InterProScan tsv output")
    parser.add_argument('--blast_result',
                        help="Blast search result of proteins vs. DB")
    parser.add_argument('--repeats_gff',
                        help="gff3 file with repeats features only")
    parser.add_argument('out_report', help="Output QA report")
    args = parser.parse_args()

    gff_obj = Gff3(args.genes_gff)
    qa_methods = [("Chromosome", mrna_chrom, (gff_obj, )),
                  ("AED", mrna_aed, (gff_obj, )),
                  ("Exons", mrna_exons_count, (gff_obj, )),
                  ("UTR", mrna_utr, (gff_obj, ))]
    if args.repeats_gff:
        rep_gff_obj = Gff3(args.repeats_gff)
        qa_methods.append(('Repeats', repeats_overlap, (gff_obj, rep_gff_obj)))
    if args.busco_result:
        qa_methods.append(('BUSCO', prot_busco, (args.busco_result, )))
    if args.blast_result:
        qa_methods.append(('BLAST', prot_similarity, (args.blast_result, )))
    if args.ips_result:
        qa_methods.append(('IPS', prot_domains, (args.ips_result, )))

    qa_data = [pd.Series(m[1](*m[2]), name=m[0]) for m in qa_methods]
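The snippet ends before the report is written. A plausible continuation (a guess, not the original code) would concatenate the per-check Series into one table and write it to args.out_report:

    # Plausible continuation (an assumption): combine the per-method Series
    # into a single table, one column per QA check, and write the report.
    qa_df = pd.concat(qa_data, axis=1)
    qa_df.to_csv(args.out_report, sep='\t')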
Example No. 12
import sys
# try to import from project first
from os.path import dirname

sys.path.insert(1, dirname(dirname(__file__)))
from gff3 import Gff3

# initialize a Gff3 object
gff = Gff3()
# parse GFF3 file and do syntax checking, this populates gff.lines and gff.features
# if an embedded ##FASTA directive is found, parse the sequences into gff.fasta_embedded
gff.parse('annotations.gff')
# parse the external FASTA file into gff.fasta_external
#gff.parse_fasta_external('annotations.fa')
# Check seqid, bounds and the number of Ns in each feature using one or more reference sources
gff.check_reference(allowed_num_of_n=0, feature_types=['CDS'])
# Checks whether child features are within the coordinate boundaries of parent features
gff.check_parent_boundary()
# Calculates the correct phase and checks if it matches the given phase for CDS features
gff.check_phase()
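As a small follow-up to this walkthrough (not part of the original example), the parsed object can be inspected and written back out; the sketch below only uses calls that already appear in the other examples on this page (gff.lines and gff.write):

from collections import Counter

# Tally how many features of each type were parsed
type_counts = Counter(line['type'] for line in gff.lines
                      if line['line_type'] == 'feature')
for feature_type, count in sorted(type_counts.items()):
    print('%s\t%d' % (feature_type, count))

# Write the parsed (and checked) annotations back out; the output filename is arbitrary
gff.write('annotations_checked.gff')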
Example No. 13
    filemode='w',
    level=logging.INFO)
logging.info('start program')
#gff_data = pd.read_csv('clec_OGS_v1_2_with_pep_CDS.gff3', sep="\t", header = None,comment='#')
'''
remove NOTES attribute
name attributes: gene, mRNA, pseudogenic_transcript

'''

f_out = open(file_name1 + '_out.txt', 'w')
#1. remove %09
#preprocess= subprocess.Popen("sed s/%09//g "+ file_in +">temp.gff", stdout=subprocess.PIPE,shell=True)
#gff_data = pd.read_csv('temp.gff', sep="\t", header = None,comment='#')
gff_data = pd.read_csv(file_in, sep="\t", header=None, comment='#')
gff = Gff3(gff_file=file_in)


#2. remodel pseudogene
def Remodel_pseudogenes():
    pseudo_list = []

    df0 = gff_data.where(gff_data[2].str.contains("pseudogene"))
    idx0 = df0.dropna().index.tolist()
    for index in idx0:
        gff_data.iloc[index, 2] = "gene"
        # gff_data.iloc[index, 8] = gff_data.loc[index, 8] + ";pseudogene=unknown"
        # get gene_ID
        gene_ID = "".join(
            re.match("^.*ID=([^;]+);.+$", gff_data.iloc[index, 8]).groups())
        # print gene_ID
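The gene ID is pulled out of the raw ninth (attributes) column with a regular expression; a standalone illustration with a made-up attributes string:

import re

# Hypothetical column-9 attributes string for a pseudogene line
attributes = 'ID=CLEC000123;Name=CLEC000123;pseudogene=unknown'
gene_ID = ''.join(re.match(r'^.*ID=([^;]+);.+$', attributes).groups())
# gene_ID == 'CLEC000123'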
Example No. 14
#python intron_length_comparison_with_intersp_protein2genome.py Sapria_longintron.rnd1.protein2genome.gff rnd1_rerun_protein2genome.intron_pos.tsv

from gff3 import Gff3
import sys, re

#gff = Gff3(sys.argv[1])
#output=open(sys.argv[2],'a')
gff = Gff3('Sapria_longintron.rnd1.protein2genome.gff')
output = open('rnd1_protein2genome.sum_by_intron.tsv', 'a')

output.write('\t'.join([
    'prot_aln_ID', 'scaffold', 'start', 'end', 'type', '5_end_intron_ID',
    'protein_hit', 'protein_coord', 'alignment_note'
]) + '\n')
ID = 0
for l in gff.lines:
    #for protein2genome
    if l['type'] == 'protein_match':
        protein_target_id = l['attributes']['Name']
        if protein_target_id.startswith(('Potri', 'Mane')):
            ID = ID + 1
            #if no intron
            if len(l['children']) == 1:
                scaf = l['seqid']
                strand = l['strand']
                rec = l['children'][0]
                output.write('\t'.join([
                    'ALN_' + str(ID), scaf,
                    str(rec['start']),
                    str(rec['end']), 'prot_match', '-', protein_target_id,
Example No. 15
import sys
# try to import from project first
from os.path import dirname
sys.path.insert(1, dirname(dirname(__file__)))
from gff3 import Gff3

gff = Gff3('annotations.gff')
type_map = {'exon': 'pseudogenic_exon', 'transcript': 'pseudogenic_transcript'}
pseudogenes = [line for line in gff.lines if line['type'] == 'pseudogene']
for pseudogene in pseudogenes:
    # convert types
    for line in gff.descendants(pseudogene):
        if line['type'] in type_map:
            line['type'] = type_map[line['type']]
    # find overlapping gene
    overlapping_genes = [
        line for line in gff.lines
        if line['type'] == 'gene' and gff.overlap(line, pseudogene)
    ]
    if overlapping_genes:
        # move pseudogene children to overlapping gene
        gff.adopt(pseudogene, overlapping_genes[0])
        # remove pseudogene
        gff.remove(pseudogene)
gff.write('annotations_fixed.gff')