def main(old_gff, new_gff, output_gff, re_construct_features, tmp_identifier):
    """Re-construct features of an updated GFF3 file and write the result.

    Both GFF3 files are parsed with ``Gff3``; ``polypeptide_re_construct`` and
    ``re_construct`` are then run on the pair, and the updated object is
    written to ``output_gff`` with ``write_gff3``.

    Args:
        old_gff: Path of the original GFF3 file.
        new_gff: Path of the updated GFF3 file.
        output_gff: Path the re-constructed GFF3 is written to.
        re_construct_features: Path of a report file handed to both
            re-construct steps; when falsy, ``None`` is passed as the report.
        tmp_identifier: Forwarded unchanged to both re-construct steps.
    """
    logger.info('Reading original GFF3 file: (%s)...\n', old_gff)
    old_gff3 = Gff3(gff_file=old_gff)

    logger.info('Reading updated GFF3 file: (%s)...\n', new_gff)
    new_gff3 = Gff3(gff_file=new_gff)

    out_f = open(re_construct_features, 'w') if re_construct_features else None
    try:
        polypeptide_re_construct(old_gff3=old_gff3,
                                 new_gff3=new_gff3,
                                 tmp_identifier=tmp_identifier,
                                 report=out_f)
        re_construct(old_gff3=old_gff3,
                     new_gff3=new_gff3,
                     tmp_identifier=tmp_identifier,
                     report=out_f)
        logger.info('Generating the re-constructed gff3 file: (%s)...\n',
                    output_gff)
        write_gff3(new_gff3, output_gff)
    finally:
        # Close the report even when one of the steps above raises, so the
        # file handle is not leaked and buffered report text is flushed.
        if out_f is not None:
            out_f.close()
예제 #2
0
def _write_third_level(features, transcript, report, gff3_linenum_Set):
    """Write third-level features of ``transcript`` and mark them as written.

    A feature whose ``Parent`` attribute is a list with more than one entry is
    rewritten through ``TwoParent`` using the ID of ``transcript``; every other
    feature is written as its raw line.
    """
    for feature in features:
        gff3_linenum_Set.discard(feature['line_index'])
        attributes = feature['attributes']
        multi_parent = ('Parent' in attributes
                        and isinstance(attributes['Parent'], list)
                        and len(attributes['Parent']) > 1)
        if multi_parent:
            report.write(TwoParent(transcript['attributes']['ID'], feature))
        else:
            report.write(feature['line_raw'])


def _write_model_default_order(root, gff3, report, reference,
                               gff3_linenum_Set):
    """Write one root feature and its descendants in the default order.

    Order: root, then each second-level feature (position-sorted) followed by
    its exons, CDSs and remaining third-level features, and finally all deeper
    descendants (position-sorted, duplicates by raw line removed).
    """
    report.write(root['line_raw'])
    gff3_linenum_Set.discard(root['line_index'])

    deeper_lines = []
    # Second-level features (eg. mRNA, ncRNA, and etc.)
    for child in PositionSort(root['children'], reference):
        gff3_linenum_Set.discard(child['line_index'])
        report.write(child['line_raw'])

        # Group third-level features (eg. exon, CDS, and etc.) by type, and
        # collect everything below them for the final pass.
        by_type = {}
        for grandchild in child['children']:
            by_type.setdefault(str(grandchild['type']), []).append(grandchild)
            deeper_lines.extend(gff3.collect_descendants(grandchild))

        # Separate the third-level features into three groups
        exons = []
        cdss = []
        others = []
        for feature_type, members in by_type.items():
            if feature_type == 'exon' or feature_type == 'pseudogenic_exon':
                exons.extend(members)
            elif feature_type == 'CDS':
                cdss.extend(members)
            else:
                others.extend(members)

        # Exons and CDSs are ordered with strand information (StrandSort).
        # A falsy sort result writes nothing, so those lines stay in
        # gff3_linenum_Set and are reported as omitted by the caller.
        for group in (exons, cdss):
            if group:
                group_sorted = StrandSort(group)
                if group_sorted:
                    _write_third_level(group_sorted, child, report,
                                       gff3_linenum_Set)

        # Remaining third-level features are ordered by position; the sorted
        # result (not the unsorted input) is what gets written.
        if others:
            others_sorted = PositionSort(others, reference)
            if others_sorted:
                _write_third_level(others_sorted, child, report,
                                   gff3_linenum_Set)

    # Features beyond the third level: position-sorted, each raw line once.
    seen_raw = set()
    for deeper in PositionSort(deeper_lines, reference) or []:
        gff3_linenum_Set.discard(deeper['line_index'])
        if deeper['line_raw'] not in seen_raw:
            seen_raw.add(deeper['line_raw'])
            report.write(deeper['line_raw'])


def main(gff,
         output=None,
         sorting_order=None,
         isoform_sort=False,
         logger=None,
         reference=False):
    """Sort a GFF3 file and write it to ``output`` (or stdout).

    Root-level features (feature lines with attributes but no ``Parent``) are
    ordered with ``PositionSort``. Non-ignored directives are written first;
    each root is preceded by its ``##sequence-region`` line the first time its
    seqid is seen and followed by ``###``. Embedded FASTA, if any, is written
    last.

    Args:
        gff: Path of the GFF3 file to sort.
        output: Path of the output file; stdout is used when falsy.
        sorting_order: Passed to ``write_out_by_level`` / ``TypeSort``. When
            ``None`` the built-in order is used (see
            ``_write_model_default_order``).
        isoform_sort: With a ``sorting_order``, sort each whole gene model
            with ``TypeSort`` instead of level by level.
        logger: Logger for progress and warnings; a null logger is used when
            ``None``.
        reference: Forwarded to ``PositionSort``.
    """
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)
    if logger is None:
        # The signature allows logger=None, so fall back instead of failing
        # on the first logger.info call.
        logger = logger_null

    gff3 = Gff3(gff_file=gff, logger=logger_null)

    report = open(output, 'w') if output else sys.stdout
    try:
        logger.info('Sorting and printing out...')

        # Visit the GFF3 object through root-level features (eg. gene,
        # pseudogene, and etc.). gff3_linenum_Set tracks feature lines that
        # have not been written yet.
        roots = []
        gff3_linenum_Set = set()

        for line in gff3.lines:
            if line['line_type'] == 'feature':
                gff3_linenum_Set.add(line['line_index'])
            try:
                if (line['line_type'] == 'feature'
                        and 'Parent' not in line['attributes']
                        and len(line['attributes']) != 0):
                    roots.append(line)
            except (KeyError, TypeError):
                # Attributes are absent or not a container for this line.
                logger.warning(
                    '[Missing Attributes] Program failed.\n\t\t- Line {0:s}: {1:s}'
                    .format(str(line['line_index'] + 1), line['line_raw']))

        # Sort the root-level features based on the order of the genomic
        # sequences
        roots_sorted = PositionSort(roots, reference)

        wrote_sequence_region = set()
        # Build sequence region data: from the embedded FASTA when present,
        # otherwise from the ##sequence-region directives.
        sequence_regions = {}
        if gff3.fasta_embedded:
            for seqid in gff3.fasta_embedded:
                sequence_regions[seqid] = (
                    1, len(gff3.fasta_embedded[seqid]['seq']))
        else:
            for line_data in gff3.lines:
                if (line_data['line_type'] == 'directive'
                        and line_data['directive'] == '##sequence-region'):
                    sequence_regions[line_data['seqid']] = (
                        line_data['start'], line_data['end'])

        # Write the directives that are not regenerated below.
        ignore_directives = ['##sequence-region', '###', '##FASTA']
        for line_data in gff3.lines:
            if (line_data['line_type'] == 'directive'
                    and line_data['directive'] not in ignore_directives):
                report.write(line_data['line_raw'])

        # Visit every root-level feature
        for root in roots_sorted:
            # write ##sequence-region once per seqid
            if root['seqid'] not in wrote_sequence_region:
                if root['seqid'] in sequence_regions:
                    report.write(
                        '##sequence-region %s %d %d\n' %
                        (root['seqid'], sequence_regions[root['seqid']][0],
                         sequence_regions[root['seqid']][1]))
                wrote_sequence_region.add(root['seqid'])

            if sorting_order is None:
                _write_model_default_order(root, gff3, report, reference,
                                           gff3_linenum_Set)
            elif not isoform_sort:
                gff3_linenum_Set = write_out_by_level(
                    level=0,
                    report=report,
                    line_data=root,
                    sorting_order=sorting_order,
                    gff3_linenum_Set=gff3_linenum_Set)
            else:
                model = gff3.collect_descendants(root)
                model.insert(0, root)
                # Reverse only when every line of the model is on the minus
                # strand. (Comparing the strand list itself to '-' can never
                # be true.)
                strands = set(line['strand'] for line in model)
                reverse = strands == set(['-'])
                for line in TypeSort(model, sorting_order, reverse=reverse):
                    gff3_linenum_Set.discard(line['line_index'])
                    report.write(line['line_raw'])
            report.write('###\n')

        # Feature lines never reached from a root-level feature
        if len(gff3_linenum_Set) != 0:
            logger.warning(
                'The following lines are omitted from the output file, because there is a problem with the input file. Please review the input file or run gff-QC.py to identify the error.\n'
            )
            # NOTE(review): these lines go to stdout via print, which is also
            # the report stream when no output path is given - confirm that
            # this is intended.
            for line_num in gff3_linenum_Set:
                print('\t\t- Line {0:s}: {1:s}'.format(
                    str(line_num + 1), gff3.lines[line_num]['line_raw']))

        # write fasta
        fasta = gff3.fasta_embedded
        if fasta:
            report.write('##FASTA\n')
            for key in fasta:
                seq = fasta[key]['seq']
                report.write(u'{0:s}\n{1:s}\n'.format(fasta[key]['header'],
                                                      seq))
    finally:
        # Only close what this function opened; never close sys.stdout.
        if output:
            report.close()
예제 #3
0
def _make_blastdb(makeblastdb_path, bdb):
    """Build a nucleotide BLAST database from the FASTA file ``bdb``."""
    subprocess.Popen([makeblastdb_path, '-in', bdb, '-dbtype', 'nucl']).wait()


def _run_blastn(blastn_path, bdb, binput, bout):
    """Align ``binput`` against database ``bdb`` and write tabular hits to ``bout``."""
    subprocess.Popen([
        blastn_path, '-db', bdb, '-query', binput, '-out', bout, '-evalue',
        '1e-10', '-penalty', '-15', '-ungapped', '-outfmt', '6'
    ]).wait()


def _run_find_match(cgff, bout, scode, report, type_file):
    """Run the ``find_match.pl`` script on BLAST output ``bout``, writing ``report``."""
    cmd = os.path.join(lib_path, 'auto_assignment', 'find_match.pl')
    subprocess.Popen(['perl', cmd, cgff, bout, scode, report,
                      type_file]).wait()


def _extract_sequences(gff_path, gff3_obj, fasta, out_prefix, user_defined,
                       extract_transcripts, label, logger, logger_null):
    """Write the FASTA files used for alignment for one GFF3 input.

    Without ``user_defined`` this runs ``gff3_to_fasta.main`` for 'cds',
    'pre_trans' and (when ``extract_transcripts``) 'trans'. With
    ``user_defined`` the per-entry 'user_defined' output is concatenated into
    ``<out_prefix>_cds.fa`` and ``<out_prefix>_pre_trans.fa`` is built from
    ``gff3_to_fasta.extract_start_end``. ``label`` ('1' or '2') only appears
    in the log message.
    """
    logger.info('Extract sequences from {0:s}...'.format(gff_path))
    if user_defined is None:
        steps = [('cds', '\tExtract CDS sequences...'),
                 ('pre_trans', '\tExtract premature transcript sequences...')]
        if extract_transcripts:
            steps.append(('trans', '\tExtract transcript sequences...'))
        for stype, message in steps:
            logger.info(message)
            gff3_to_fasta.main(gff_file=gff_path,
                               fasta_file=fasta,
                               stype=stype,
                               dline='complete',
                               qc=False,
                               output_prefix=out_prefix,
                               logger=logger_null)
        return

    logger.info(
        '\tExtract user_defined_file{0:s} sequences...'.format(label))
    user_defined_out = '{0:s}_{1:s}'.format(out_prefix, 'cds.fa')
    user_defined_pretrans = '{0:s}_{1:s}'.format(out_prefix, 'pre_trans.fa')
    user_defined_tmp = '{0:s}_{1:s}'.format(out_prefix, 'user_defined.fa')
    parent_type = set()
    # The source is read as bytes, so the destination must be binary too;
    # copying bytes into a text-mode file fails on Python 3.
    with open(user_defined_out, "wb") as outfile:
        for lines in user_defined:
            gff3_to_fasta.main(gff_file=gff_path,
                               fasta_file=fasta,
                               stype='user_defined',
                               user_defined=lines,
                               dline='complete',
                               qc=False,
                               output_prefix=out_prefix,
                               logger=logger_null)
            with open(user_defined_tmp, 'rb') as fd:
                shutil.copyfileobj(fd, outfile)
            parent_type.add(lines[0])

    with open(user_defined_pretrans, "w") as outfile:
        for feature_type in parent_type:
            seq = gff3_to_fasta.extract_start_end(gff3_obj, feature_type,
                                                  'complete')
            for k, v in seq.items():
                if len(k) != 0 and len(v) != 0:
                    outfile.write('{0:s}\n{1:s}\n'.format(k, v))


def main(gff1,
         gff2,
         fasta,
         outdir,
         scode,
         logger,
         all_assign=False,
         user_defined1=None,
         user_defined2=None):
    """Match models of ``gff1`` against ``gff2`` and write ``check1.txt``.

    Steps, all intermediate files going to ``<outdir>/tmp``:

    1. Collect transcript types of both files and the IDs of ``gff1``
       transcripts to align as transcripts (those without a CDS child, or all
       user-defined features that carry an ID).
    2. Run the annotation summary Perl script on ``gff1``.
    3. Extract CDS / transcript / premature-transcript FASTA from both files.
    4. BLAST ``gff1`` sequences against ``gff2`` sequences and run
       ``find_match.pl`` on each result (CDS, transcripts still unmatched,
       premature transcripts).
    5. Run ``gen_spreadsheet.pl`` to produce ``<outdir>/check1.txt``.

    Args:
        gff1: Path of the first GFF3 file (query side).
        gff2: Path of the second GFF3 file (database side).
        fasta: Path of the genome FASTA file.
        outdir: Output directory; created when missing.
        scode: Passed through to the Perl scripts.
        logger: Logger for progress messages.
        all_assign: When true, ``replace`` attributes are removed from
            ``gff1`` features and a modified copy of ``gff1`` is used.
        user_defined1: Optional iterable of user-defined entries for ``gff1``;
            the first item of each entry is used as a feature type.
        user_defined2: Same as ``user_defined1`` for ``gff2``.
    """
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    tmpdir = '{0:s}/{1:s}'.format(outdir, 'tmp')
    if not os.path.isdir(tmpdir):
        os.makedirs(tmpdir)

    # Check if there is a non-coding transcript
    transcripts = set()
    transcripts_type = set()
    gff3_1 = Gff3(gff_file=gff1, fasta_external=fasta, logger=logger)
    gff3_2 = Gff3(gff_file=gff2, fasta_external=fasta, logger=logger)

    makeblastdb_path = os.path.join(lib_path, 'ncbi-blast+', 'bin',
                                    'makeblastdb')
    blastn_path = os.path.join(lib_path, 'ncbi-blast+', 'bin', 'blastn')

    if user_defined1 is None:
        roots = []
        for line in gff3_1.lines:
            try:
                if line['line_type'] == 'feature':
                    # remove all the replace attributes
                    if all_assign and 'replace' in line['attributes']:
                        del line['attributes']['replace']
                    if 'Parent' not in line['attributes'] and len(
                            line['attributes']) != 0:
                        roots.append(line)
            except (KeyError, TypeError):
                # Lines without usable attributes are not root candidates.
                pass
        for root in roots:
            for child in root['children']:
                if 'ID' in child['attributes']:
                    cid = child['attributes']['ID']
                else:
                    cid = 'NA'
                has_cds = any(gchild['type'] == 'CDS'
                              for gchild in child['children'])
                if not has_cds:
                    transcripts.add(cid)
                if 'type' in child:
                    transcripts_type.add(child['type'])
    else:
        for lines in user_defined1:
            transcripts_type.add(lines[0])
        for line in gff3_1.lines:
            if line['line_type'] == 'feature':
                if all_assign and 'replace' in line['attributes']:
                    del line['attributes']['replace']
            if line['type'] in transcripts_type:
                if 'ID' in line['attributes']:
                    transcripts.add(line['attributes']['ID'])

    gff2_transcripts_type = set()
    if user_defined2 is None:
        roots = []
        for line in gff3_2.lines:
            try:
                if line['line_type'] == 'feature':
                    if 'Parent' not in line['attributes'] and len(
                            line['attributes']) != 0:
                        roots.append(line)
            except (KeyError, TypeError):
                pass
        for root in roots:
            for child in root['children']:
                if 'type' in child:
                    gff2_transcripts_type.add(child['type'])
    else:
        for lines in user_defined2:
            gff2_transcripts_type.add(lines[0])

    if all_assign:
        # modified gff1 without any replace attributes
        gff3_1_mod = os.path.join(tmpdir, 'gff1_mod.gff3')
        gff3_1.write(gff3_1_mod)
        gff1 = gff3_1_mod

    out1_type = os.path.join(tmpdir, 'gff1_transcript_type.txt')
    with open(out1_type, "w") as trans_type:
        for feature_type in transcripts_type:
            trans_type.write(feature_type + "\n")

    cmd = os.path.join(lib_path, 'auto_assignment',
                       'create_annotation_summaries_nov21-7.pl')
    logger.info('Generate info table for {0:s} by using {1:s}'.format(
        gff1, cmd))
    summary = os.path.join(tmpdir, 'summary_report.txt')
    subprocess.Popen(['perl', cmd, gff1, fasta, summary, scode, out1_type],
                     stdout=DEVNULL).wait()

    # Transcript sequences are only extracted (for both files) when gff1 has
    # transcripts that need a transcript-level alignment.
    need_transcripts = len(transcripts) > 0
    out1 = os.path.join(tmpdir, 'gff1')
    _extract_sequences(gff1, gff3_1, fasta, out1, user_defined1,
                       need_transcripts, '1', logger, logger_null)
    out2 = os.path.join(tmpdir, 'gff2')
    _extract_sequences(gff2, gff3_2, fasta, out2, user_defined2,
                       need_transcripts, '2', logger, logger_null)

    logger.info('Catenate {0:s} and {1:s}...'.format(gff1, gff2))
    cgff = os.path.join(tmpdir, 'cat.gff')
    # Binary on both sides: the inputs are read as bytes.
    with open(cgff, "wb") as outfile:
        for catfile in [gff1, gff2]:
            with open(catfile, 'rb') as fd:
                shutil.copyfileobj(fd, outfile)

    # --- CDS alignment ---
    bdb = '{0:s}_{1:s}'.format(out2, 'cds.fa')
    logger.info('Make blastDB for CDS sequences from {0:s}...'.format(bdb))
    _make_blastdb(makeblastdb_path, bdb)
    print('\n')
    logger.info(
        'Sequence alignment for cds fasta files between {0:s} and {1:s}...'.
        format(gff1, gff2))
    binput = '{0:s}_{1:s}'.format(out1, 'cds.fa')
    bout = os.path.join(tmpdir, 'blastn.out')
    _run_blastn(blastn_path, bdb, binput, bout)

    # update out1_type with the transcript types of gff2
    transcripts_type.update(gff2_transcripts_type)
    with open(out1_type, "w") as trans_type:
        for feature_type in transcripts_type:
            trans_type.write(feature_type + "\n")
    logger.info('Find CDS matched pairs between {0:s} and {1:s}...'.format(
        gff1, gff2))
    report1 = os.path.join(tmpdir, 'report1.txt')
    _run_find_match(cgff, bout, scode, report1, out1_type)

    # Transcripts that already produced a CDS hit need no transcript-level
    # alignment.
    with open(bout, "r") as bcds:
        for line in bcds:
            matched = re.match("^.*ID=([^|]+).+$", line.split("\t")[0])
            if matched:
                transcripts.discard(matched.group(1))

    # --- transcript alignment for what is still unmatched ---
    if len(transcripts) > 0:
        if user_defined2 is None:
            bdb = '{0:s}_{1:s}'.format(out2, 'trans.fa')
        else:
            bdb = '{0:s}_{1:s}'.format(out2, 'cds.fa')
        logger.info(
            'Make blastDB for transcript sequences from {0:s}...'.format(bdb))
        _make_blastdb(makeblastdb_path, bdb)
        print('\n')
        logger.info(
            'Sequence alignment for transcript fasta files between {0:s} and {1:s}...'
            .format(gff1, gff2))
        if user_defined1 is None:
            binput = '{0:s}_{1:s}'.format(out1, 'trans.fa')
        else:
            binput = '{0:s}_{1:s}'.format(out1, 'cds.fa')
        _run_blastn(blastn_path, bdb, binput, bout)

        logger.info(
            'Find transcript matched pairs between {0:s} and {1:s}...'.format(
                gff1, gff2))
        report1_trans = os.path.join(tmpdir, 'report1_trans.txt')
        _run_find_match(cgff, bout, scode, report1_trans, out1_type)

        # Append only the rows whose third column is a still-unmatched
        # transcript; rows with fewer than three columns are skipped.
        with open(report1, "a") as rep1:
            with open(report1_trans, "r") as rep1_trans:
                for line in rep1_trans:
                    fields = line.split("\t")
                    if len(fields) > 2 and fields[2] in transcripts:
                        rep1.write(line)

    # --- premature transcript alignment ---
    bdb = '{0:s}_{1:s}'.format(out2, 'pre_trans.fa')
    logger.info(
        'Make blastDB for premature transcript sequences from {0:s}...'.format(
            bdb))
    _make_blastdb(makeblastdb_path, bdb)
    print('\n')
    logger.info(
        'Sequence alignment for premature transcript fasta files between {0:s} and {1:s}...'
        .format(gff1, gff2))
    binput = '{0:s}_{1:s}'.format(out1, 'pre_trans.fa')
    bout = os.path.join(tmpdir, 'blastn.out')
    _run_blastn(blastn_path, bdb, binput, bout)

    logger.info(
        'Find premature transcript matched pairs between {0:s} and {1:s}...'.
        format(gff1, gff2))
    report2 = os.path.join(tmpdir, 'report2.txt')
    _run_find_match(cgff, bout, scode, report2, out1_type)

    print('\n')
    cmd = os.path.join(lib_path, 'auto_assignment', 'gen_spreadsheet.pl')
    check1 = os.path.join(outdir, 'check1.txt')
    logger.info(
        'Generate {0:s} for Check Point 1 internal reviewing...'.format(
            check1))
    subprocess.Popen(['perl', cmd, summary, report1, report2, check1]).wait()
예제 #4
0
def main(gff_file1,
         gff_file2,
         output_gff,
         report_fh,
         user_defined1=None,
         user_defined2=None,
         logger=None):
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)

    if not logger:
        logger = logger_null
    logger.info(
        'Sorting the WA gff by following the order of Scaffold number and coordinates...'
    )
    gff3_sort.main(gff_file1, output='WA_sorted.gff', logger=logger)

    logger.info(
        'Sorting the other gff by following the order of Scaffold number and coordinates...'
    )
    gff3_sort.main(gff_file2, output='other_sorted.gff', logger=logger)

    logger.info('Reading WA gff3 file...')
    gff3 = Gff3(gff_file='WA_sorted.gff', logger=logger_null)

    logger.info('Reading the other gff3 file...')
    gff3M = Gff3(gff_file='other_sorted.gff', logger=logger_null)  #Maker

    logger.info('Identifying types of replacement based on replace tag...')
    ReplaceGroups = replace_OGS.Groups(WAgff=gff3,
                                       Pgff=gff3M,
                                       outsideNum=1,
                                       user_defined1=user_defined1,
                                       user_defined2=user_defined2,
                                       logger=logger_null)

    logger.info('Replacing...')
    u_types = set()
    u1_types = set()
    if user_defined1 is not None:
        for line in user_defined1:
            u1_types.add(line[0])
        u_types |= u1_types
    else:
        u1_types = None
    u2_types = set()
    if user_defined2 is not None:
        for line in user_defined2:
            u2_types.add(line[0])
        u_types |= u2_types
    else:
        u2_types = None
    roots = []
    transcripts = []
    unique = set()
    for line in gff3.lines:

        if user_defined1 is None:
            try:
                if line['line_type'] == 'feature' and not line[
                        'attributes'].has_key('Parent'):
                    roots.append(line)
            except:
                pass
        else:
            if line['type'] in u1_types:
                transcripts.append(line)
                for root in gff3.collect_roots(line):
                    if root['line_raw'] not in unique:
                        roots.append(root)
                        unique.add(root['line_raw'])

    #roots = [line for line in gff3.lines if line['line_type'] == 'feature' and not line['attributes'].has_key('Parent')]
    rnum, cnum, changed = 0, 0, 0
    cal_type_children = {}
    changed_rootid = set()
    not_orphan = set()
    for root in roots:
        rnum += 1
        if user_defined1 is None:
            children = root['children']
        else:
            children = []
            unique = set()
            if root['type'] in u1_types:
                children.append(root)
            else:
                for child in gff3.collect_descendants(root):
                    if child['type'] in u1_types:
                        if child['line_raw'] not in unique:
                            children.append(child)
                            unique.add(child['line_raw'])
            children = sorted(children, key=lambda k: k['line_index'])

        tags = {}
        cnum += len(children)
        maxisoforms = 0
        for child in children:
            tags[str(child['attributes']['replace'])] = 0
            for tag in child['attributes']['replace']:
                if not tag == 'NA':
                    not_orphan.add(tag)
                    t = gff3M.features[ReplaceGroups.mapName2ID[tag]][0]
                    if user_defined2 is None:
                        tmp = len(t['parents'][0][0]['children'])
                    else:
                        if len(t['parents']) == 0 and t['type'] in u2_types:
                            #this transcript don't have parent feature(e.g. gene), set the number of isoform as 1.
                            tmp = 1
                        else:
                            tmp = len(t['parents'][0][0]['children'])

                    if tmp > maxisoforms:
                        maxisoforms = tmp
        if len(tags) <= 1:
            if maxisoforms >= 2:
                root['attributes']['replace_type'] = 'multi-ref'
                for child in children:
                    child['attributes']['replace_type'] = 'multi-ref'
                if user_defined1 is None:
                    ans = ReplaceGroups.replacer_multi(root, ReplaceGroups,
                                                       gff3M, u1_types,
                                                       u2_types)
                else:
                    ans = ReplaceGroups.replacer_multi(root, ReplaceGroups,
                                                       gff3M, u1_types,
                                                       u2_types, gff3)
                report_fh.write('{0:s}\n'.format(ans))
                changed_rootid.add(root['attributes']['ID'])
                changed += 1
            else:
                ReplaceGroups.replacer(root, ReplaceGroups, gff3M, u1_types,
                                       gff3)
                changed_rootid.add(root['attributes']['ID'])
                changed += 1
        else:
            logger.info(
                '[Warning] multiple replace tags in multiple isoforms! {0:s}. This model is not processed\n'
                .format(root['attributes']['ID']))
            report_fh.write(
                '[Warning] multiple replace tags in multiple isoforms! {0:s}. This model is not processed\n'
                .format(root['attributes']['ID']))
        for child in children:
            if child['attributes'].has_key('status') and (
                    child['attributes']['status'] == 'Delete'
                    or child['attributes']['status'] == 'delete'):
                child['attributes']['replace_type'] = 'Delete'
            if cal_type_children.has_key(child['attributes']['replace_type']):
                cal_type_children[child['attributes']['replace_type']] += 1
            else:
                cal_type_children[child['attributes']['replace_type']] = 1

    cal_type = {}
    for i in ReplaceGroups.info:
        tokens = i.split('\t')
        tmp = re.search('(.+?):(.*)', tokens[3])
        if cal_type.has_key(tmp.groups()[0]):
            cal_type[tmp.groups()[0]] += 1
        else:
            cal_type[tmp.groups()[0]] = 1
        #print('{0:s}'.format(i))

    report_fh.write('# Number of WA loci: {0:d}\n'.format(rnum))
    report_fh.write('# Number of WA transcripts: {0:d}\n'.format(cnum))
    report_fh.write(
        '# Number of WA loci that were used to replace the models in reference gff: {0:d}\n'
        .format(changed))

    for k, v in cal_type.items():
        if k == 'simple':
            report_fh.write(
                '# Number of loci with {0:s}/Delete replacement: {1:d}\n'.
                format(k, v))
        else:
            report_fh.write(
                '# Number of loci with {0:s} replacement: {1:d}\n'.format(
                    k, v))
    for k, v in cal_type_children.items():
        report_fh.write(
            '# Number of transcripts with {0:s} replacement: {1:d}\n'.format(
                k, v))

    report_fh.write(
        'Change_log\tOriginal_gene_name\tOriginal_transcript_ID\tOriginal_transcript_name\tTmp_OGSv0_ID\n'
    )

    roots = []
    transcripts = []
    unique = set()
    for line in gff3M.lines:
        if user_defined2 is None:
            try:
                if line['line_type'] == 'feature' and not line[
                        'attributes'].has_key('Parent'):
                    roots.append(line)
            except:
                pass
        else:
            if line['type'] in u_types:
                transcripts.append(line)
                for root in gff3M.collect_roots(line):
                    if root['line_raw'] not in unique:
                        roots.append(root)
                        unique.add(root['line_raw'])

    #roots = [line for line in gff3M.lines if line['line_type'] == 'feature' and not line['attributes'].has_key('Parent')]
    for root in roots:
        if root['attributes']['ID'] not in changed_rootid:
            if user_defined2 is None:
                children = root['children']

            else:
                children = []
                unique = set()
                if root['type'] in u_types:
                    children.append(root)
                else:
                    for child in gff3M.collect_descendants(root):
                        if child['type'] in u_types:
                            if child['line_raw'] not in unique:
                                children.append(child)
                                unique.add(child['line_raw'])
                children = sorted(children, key=lambda k: k['line_index'])
        elif root['attributes'][
                'ID'] in changed_rootid and user_defined1 is not None:
            children = []
            unique = set()
            if root['type'] in u1_types:
                children.append(root)
            else:
                for child in gff3.collect_descendants(root):
                    if child['type'] in u1_types:
                        if child['line_raw'] not in unique:
                            children.append(child)
                            unique.add(child['line_raw'])
            children = sorted(children, key=lambda k: k['line_index'])
        else:
            children = root['children']

        for child in children:
            cflag = 0
            if not child['line_status'] == 'removed':
                #print(child['attributes'])
                if child['attributes'].has_key('replace_type'):
                    for i in root['attributes']['replace']:
                        tname, tid, gid, tmpid = 'NA', 'NA', 'NA', 'NA'
                        tmpid = child['attributes']['ID']
                        if not i == 'NA':
                            t = gff3M.features[ReplaceGroups.mapName2ID[i]][0]
                            try:
                                tname = t['attributes']['Name']
                            except:
                                tname = t['attributes']['ID']
                            tid = t['attributes']['ID']
                            gid_list = list()
                            if user_defined2 is None:
                                for tp_line in t['parents']:
                                    for tp in tp_line:
                                        gid_list.append(tp['attributes']['ID'])
                                gid = ','.join(gid_list)
                            else:
                                for tp in gff3M.collect_roots(t):
                                    gid_list.append(tp['attributes']['ID'])
                                gid = ','.join(gid_list)
                            if tname not in not_orphan:
                                tmpid = 'NA'
                        report_fh.write(
                            '{0:s}\t{1:s}\t{2:s}\t{3:s}\t{4:s}\n'.format(
                                ReplaceGroups.mapType2Log[
                                    child['attributes']['replace_type']], gid,
                                tid, tname, tmpid))
                    del child['attributes']['replace_type']
                    cflag += 1
                if child['attributes'].has_key('replace'):
                    del child['attributes']['replace']
                if cflag == 0:
                    gid = None
                    gid_list = list()
                    if user_defined2 is None:
                        for p_line in child['parents']:
                            for p in p_line:
                                gid_list.append(p['attributes']['ID'])
                    else:
                        for p in gff3M.collect_roots(child):
                            gid_list.append(p['attributes']['ID'])

                    gid = ','.join(gid_list)
                    report_fh.write(
                        '{0:s}\t{1:s}\t{2:s}\t{3:s}\t{4:s}\n'.format(
                            ReplaceGroups.mapType2Log['other'], gid,
                            child['attributes']['ID'],
                            ReplaceGroups.id2name[child['attributes']['ID']],
                            child['attributes']['ID']))
            else:
                if child['attributes'].has_key('status') and child[
                        'attributes']['status'] == 'Delete':
                    for i in child['attributes']['replace']:
                        if i == 'NA':
                            sys.exit(
                                'The replace tag for Delete replacement cannot be NA: {0:s}'
                                .format(child['line_raw']))
                        t = gff3M.features[ReplaceGroups.mapName2ID[i]][0]
                        tname = t['attributes']['Name']
                        tid = t['attributes']['ID']
                        gid_list = list()
                        if user_defined2 is None:
                            for tp_line in t['parents']:
                                for tp in tp_line:
                                    gid_list.append(tp['attributes']['ID'])
                        else:
                            for tp_line in gff3M.collect_roots(t):
                                gid_list.append(tp_line['attributes']['ID'])

                        gid = ','.join(gid_list)

                        report_fh.write(
                            '{0:s}\t{1:s}\t{2:s}\t{3:s}\t{4:s}\n'.format(
                                ReplaceGroups.mapType2Log['Delete'], gid, tid,
                                tname, "NA"))
                    if child['attributes'].has_key('replace'):
                        del child['attributes']['replace']

        if root['attributes'].has_key('replace'):
            del root['attributes']['replace']
        if root['attributes'].has_key('replace_type'):
            del root['attributes']['replace_type']
        if root['attributes'].has_key('modified_track'):
            del root['attributes']['modified_track']

    ReplaceGroups.name2id(gff3M)
    gff3M.write(output_gff)
    rm_list = ['WA_sorted.gff', 'other_sorted.gff']
    remove_files_from_list(rm_list)
예제 #5
0
def _collect_replace_children(gff3, root, u_types, user_defined, ordered=True):
    """Return the features under *root* that are expected to carry replace tags.

    Args:
        gff3: parsed GFF3 object; only ``collect_descendants`` is used.
        root: root-level line dict.
        u_types: set of user-defined feature types.
        user_defined: the caller's ``user_defined1`` argument; ``None`` means
            "use the direct children of the root".
        ordered: when true (and user-defined types are in effect) the result
            is sorted by ``line_index``.

    Returns:
        ``root['children']`` itself when ``user_defined`` is None. Otherwise a
        new list holding *root* (if its type is in ``u_types``) or every
        descendant whose type is in ``u_types``, de-duplicated by raw line.
    """
    if user_defined is None:
        return root['children']
    if root['type'] in u_types:
        children = [root]
    else:
        children = []
        seen = set()
        for child in gff3.collect_descendants(root):
            if child['type'] in u_types and child['line_raw'] not in seen:
                children.append(child)
                seen.add(child['line_raw'])
    if ordered:
        children = sorted(children, key=lambda k: k['line_index'])
    return children


def main(gff_file,
         revision_file,
         output_gff,
         report_file=None,
         user_defined1=None,
         auto=True,
         logger=None):
    """Add ``replace`` attributes to a GFF3 file from a tab-separated revision file.

    The revision file's first line is skipped as a header. For every other
    row, columns 6-10 (0-based) are used as seqid/start/end/strand/type,
    column 12 as the feature ID and column 24 as the replace tag. Features of
    the GFF3 file are matched first by ID and then by location; matches are
    given a ``replace`` attribute and a summary is written to the report.
    Root-level replace tags are then pushed down to the children, ncRNA
    features without an exon get a synthetic one, and (optionally) differing
    tags among the children of one root are merged.

    Args:
        gff_file: path of the GFF3 file to revise.
        revision_file: path of the tab-separated revision table.
        output_gff: path the revised GFF3 is written to.
        report_file: path of the summary report; defaults to
            ``replace_tag_report.txt`` when falsy.
        user_defined1: optional iterable whose items hold a feature type at
            index 0; when given, these types (instead of the direct children
            of each root) carry the replace tags.
        auto: when true, children of one root that carry different replace
            tags all receive the union of the tags, unless any tag is ``NA``.
        logger: logger for progress messages; a null logger is used if falsy.
    """
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)

    if not logger:
        logger = logger_null

    NCRNA = ['rRNA', 'miRNA', 'ncRNA', 'snRNA', 'snoRNA', 'tRNA']
    whitespace = re.compile(r'\s+')
    key_format = '{0:s}:{1:s}-{2:s}:{3:s}:{4:s}'

    logger.info('Reading revision file... ({0:s})'.format(revision_file))
    revision = {}  # location key -> [replace tag, raw row or 'hit']
    revision_id = {}  # feature ID -> [replace tag, raw row or 'hit']
    rtype = {}  # feature types that occur in the revision file
    with open(revision_file, 'r') as flines:
        for line_number, line_raw in enumerate(flines):
            # Skip the header line and any row that ends with a tab
            # directly before the newline.
            if line_number == 0 or '\t\n' in line_raw:
                continue
            line_strip = line_raw.rstrip('\n')
            tokens = line_strip.split('\t')
            key = key_format.format(tokens[6], tokens[7], tokens[8],
                                    tokens[9], tokens[10])
            revision[key] = [tokens[24], line_strip]
            revision_id[tokens[12]] = [tokens[24], line_strip]
            rtype[tokens[10]] = 1

    logger.info('Reading gff3 file... ({0:s})'.format(gff_file))
    gff3 = Gff3(gff_file=gff_file, logger=logger_null)

    if report_file:
        report_path = report_file
        logger.info('Writing summary report ({0:s})...'.format(report_file))
    else:
        report_path = 'replace_tag_report.txt'
        logger.info('Writing summary report: replace_tag_report.txt...')

    # Nothing is written to the report after the matching summary, so the
    # file is scoped to this section and always closed.
    with open(report_path, 'w') as report_fh:
        # Validation Summary. The actual report path is used here:
        # formatting None with '{0:s}' raises TypeError.
        report_fh.write('# GFF3 Revision Report ({0:s})'.format(report_path))
        if gff_file and sys.stdin.isatty():
            report_fh.write(': {0:s} and {1:s}'.format(gff_file,
                                                       revision_file))
        report_fh.write('\n\n')

        report_fh.write('# Summary\n')

        if not revision_id:
            report_fh.write('* Found 0 lines to be revised\n')
        else:
            report_fh.write(
                '* Found {0:d} lines of the revision file\n'.format(
                    len(revision_id)))

        match = 0
        for line in gff3.lines:
            if line['type'] not in rtype:
                continue
            key = key_format.format(line['seqid'], str(line['start']),
                                    str(line['end']), line['strand'],
                                    line['type'])
            feature_id = line['attributes']['ID']
            if feature_id in revision_id:
                # Match by ID: always overwrite the replace tag.
                match += 1
                line['attributes']['replace'] = [revision_id[feature_id][0]]
                revision_id[feature_id][1] = 'hit'
            elif key in revision:
                # Match by location only.
                if not revision[key][1] == 'hit':
                    tokens = revision[key][1].split('\t')
                    report_fh.write(
                        '\t- Same genomic region, but different IDs:\t(Annotator){0:s}\t(Gff){1:s}\n'
                        .format(tokens[12], feature_id))
                    match += 1
                    if 'replace' not in line['attributes']:
                        line['attributes']['replace'] = [revision[key][0]]
                    revision[key][1] = 'hit'
                else:
                    report_fh.write(
                        '\t- Same genomic region, but different IDs and duplicate seuqences at the same location:\t(Location){0:s}\t(Gff){1:s}\n'
                        .format(key, feature_id))

        if match == 0:
            print('\n')
        else:
            report_fh.write(
                '* Found {0:d} matched IDs of the revision file\n'.format(
                    match))
            report_fh.write(
                '* Are there IDs that should be revised, but cannot be found in the gff?\n'
            )
            count = 0
            for v in revision_id.values():
                if not v[1] == 'hit':
                    # Not matched by ID; report it only if it was not
                    # matched by location either.
                    tokens = v[1].split('\t')
                    key = key_format.format(tokens[6], tokens[7], tokens[8],
                                            tokens[9], tokens[10])
                    if not revision[key][1] == 'hit':
                        report_fh.write('\t- {0:s}\n'.format(v[1]))
                        count += 1
            if count == 0:
                report_fh.write('\t- All IDs are properly found in the gff.\n')

    u_types = set()
    if user_defined1 is not None:
        u_types = {entry[0] for entry in user_defined1}

    roots = []
    seen_roots = set()
    for line in gff3.lines:
        if user_defined1 is None:
            try:
                if line['line_type'] == 'feature' and 'Parent' not in line[
                        'attributes']:
                    roots.append(line)
            except KeyError:
                print(
                    'WARNING  [Missing Attributes] Program failed.\n\t\t- Line {0:s}: {1:s}'
                    .format(str(line['line_index'] + 1), line['line_raw']))
        elif line['type'] in u_types:
            for root in gff3.collect_roots(line):
                if root['line_raw'] not in seen_roots:
                    roots.append(root)
                    seen_roots.add(root['line_raw'])

    for line in roots:
        if 'replace' in line['attributes'] and 'children' in line:
            # Strip whitespace in place: a child may later share this list.
            root_tags = line['attributes']['replace']
            for index, tag in enumerate(root_tags):
                root_tags[index] = whitespace.sub('', tag)

            children = _collect_replace_children(gff3, line, u_types,
                                                 user_defined1)
            for child in children:
                inherited = 'replace' not in child['attributes']
                if inherited:
                    child['attributes']['replace'] = line['attributes'][
                        'replace']

                child_tags = child['attributes']['replace']
                for index, tag in enumerate(child_tags):
                    child_tags[index] = whitespace.sub('', tag)

                if not inherited:
                    # Replace tags exist at both levels; warn if they differ.
                    i = str(sorted(line['attributes']['replace']))
                    j = str(sorted(child_tags))
                    if not i == j:
                        print(
                            '[Warning!] replace tag at gene level ({0:s}) is not consistent with that at mRNA level ({1:s})'
                            .format(i, j))

            # The root keeps its tag only when it is itself one of the
            # user-defined tag-carrying types.
            if user_defined1 is None or line['type'] not in u_types:
                del line['attributes']['replace']

            # Add an exon feature with the same coordinates to an ncRNA
            # feature that does not contain at least one exon. The children
            # collected above are reused: nothing that affects the
            # collection has changed since.
            for child in children:
                if child['type'] not in NCRNA:
                    continue
                if any(gchild['type'] == 'exon'
                       for gchild in child['children']):
                    continue
                newid = '{0:s}-EXON1'.format(child['attributes']['ID'])
                newExon = copy.deepcopy(child)
                newExon['line_index'] = len(gff3.lines)
                newExon['parents'] = []
                newExon['attributes']['Parent'] = []
                newExon['attributes']['ID'] = newid
                newExon['attributes']['Name'] = newid
                newExon['type'] = 'exon'
                if 'replace' in newExon['attributes']:
                    del newExon['attributes']['replace']
                newExon['parents'].append(child)
                newExon['attributes']['Parent'].append(
                    child['attributes']['ID'])
                child['children'].append(newExon)
                gff3.features[newExon['attributes']['ID']].append(newExon)
                gff3.lines.append(newExon)

            # NOTE(review): this branch is only entered when 'children' is in
            # line, so the removal below can never run; possibly an empty
            # children list was meant -- confirm before changing.
            if line['type'] == 'gene' or line['type'] == 'pseudogene':
                if 'children' not in line:
                    gff3.remove(line)

        if auto and 'children' in line:
            # Collected again (unsorted, as before) because exons may have
            # been added above. Every child is expected to carry a replace
            # attribute here; one without it raises KeyError.
            children = _collect_replace_children(gff3,
                                                 line,
                                                 u_types,
                                                 user_defined1,
                                                 ordered=False)
            tags = {}
            for child in children:
                tag = ','.join(child['attributes']['replace']).replace(
                    ' ', '')
                tags[tuple(tag.split(','))] = 0
            # multi-isoforms have different replace tags
            if len(tags) > 1:
                has_na = False
                merged_tag = set()
                for tag in tags:
                    if 'NA' in tag:
                        has_na = True
                    merged_tag.update(tag)
                if not has_na:
                    for child in children:
                        child['attributes']['replace'] = list(merged_tag)

    logger.info('Writing revised gff: ({0:s})...'.format(output_gff))
    gff3.write(output_gff)
예제 #6
0
def _assign_uuid_id(line, ID_dict, ID_order, alias):
    """Give one feature line a fresh UUID and re-point its Parent attributes.

    ``ID_dict`` (old ID -> list of new IDs, plus the 'missing' bucket for
    features without an ID) and ``ID_order`` (old IDs in first-seen order)
    are updated in place. A parent that has not been seen yet gets its new
    UUID reserved here so the later feature line appends to that entry.
    """
    newID = str(uuid.uuid1())
    attributes = line['attributes']
    if 'ID' in attributes:
        old_id = attributes['ID']
        if old_id in ID_dict:
            ID_dict[old_id].append(newID)
        else:
            ID_dict[old_id] = [newID]
            ID_order.append(old_id)
        if alias:
            attributes['Alias'] = old_id
        attributes['ID'] = newID
    else:
        ID_dict['missing'].append(newID)
        attributes['ID'] = newID
    if 'Parent' in attributes:
        for index, parent in enumerate(attributes['Parent']):
            if parent in ID_dict:
                attributes['Parent'][index] = ID_dict[parent][0]
            else:
                newID = str(uuid.uuid1())
                ID_dict[parent] = [newID]
                ID_order.append(parent)
                attributes['Parent'][index] = newID


def _rename_descendants(child, ID_dict, ID_order):
    """Assign new IDs to every descendant of *child*, level by level.

    Each descendant is renamed from the new ID of its first parent that is
    already present in ``ID_dict``; only that one Parent entry is rewritten.
    A separate counter is kept per feature type for this *child*.
    """
    collected_list = descendants_list(line_data=child, level=0)
    levellist = level_list(collected_list)
    IDnumber_dict = dict()
    for item_list in levellist:
        # Reverse ordering is requested only for levels with more than one
        # feature whose first feature lies on the minus strand.
        reverse = len(item_list) > 1 and item_list[0]['strand'] == '-'
        descendant_sort = TypeSort(item_list, dict(), reverse)
        for descend in descendant_sort:
            feature_type = descend['type']
            IDnumber_dict.setdefault(feature_type, 0)
            attributes = descend['attributes']
            for index, parent in enumerate(attributes['Parent']):
                if parent not in ID_dict:
                    continue
                old_id = attributes['ID']
                deprefix = '%s-%s' % (ID_dict[parent][0], feature_type)
                newdID = idgenerator(deprefix, IDnumber_dict[feature_type], 3)
                IDnumber_dict[feature_type] = newdID['maxnum']
                if old_id not in ID_dict:
                    ID_dict[old_id] = [newdID['ID']]
                    ID_order.append(old_id)
                else:
                    # The old ID was already renamed once; record the
                    # additional new ID under the same old ID.
                    ID_dict[old_id].append(newdID['ID'])
                attributes['ID'] = newdID['ID']
                attributes['Parent'][index] = ID_dict[parent][0]
                # Only the first known parent is processed.
                break


def main(in_gff, merge_report, out_merge_report, out_gff, uuid_on, prefix,
         digitlen, report, alias):
    """Replace the feature IDs of a GFF3 file with newly generated ones.

    With ``uuid_on`` every feature receives a UUID. Otherwise root features
    (features without a Parent attribute) get IDs from
    ``idgenerator(prefix, ..., digitlen)``, their children get
    ``<root ID>-R<suffix>`` and deeper descendants get IDs derived from their
    parent's new ID and their feature type.

    Args:
        in_gff: path of the input GFF3 file.
        merge_report: optional report produced by the gff3_merge program; if
            given, ``out_merge_report`` is required.
        out_merge_report: path for the merge report with updated IDs.
        out_gff: path the re-identified GFF3 is written to.
        uuid_on: use UUIDs instead of prefix-based hierarchical IDs.
        prefix: prefix passed to ``idgenerator`` for root features.
        digitlen: digit length passed to ``idgenerator`` for root features.
        report: optional path of an old-ID/new-ID table.
        alias: when true, the old ID is stored in the ``Alias`` attribute of
            root, child and (in UUID mode) every feature that had an ID.

    The process exits with status 1 when ``merge_report`` is given without
    ``out_merge_report``, or when the merge report refers to an ID that is not
    in the GFF3 file. In the latter case the updated merge report is not
    written at all.
    """
    logger.info('Reading input gff3 file: (%s)', in_gff)
    gff3 = Gff3(gff_file=in_gff, logger=None)
    if merge_report:
        if not out_merge_report:
            logger.error(
                '-m is given. Please specify the filename of the updated merge report with -om'
            )
            sys.exit(1)
        logger.info(
            'Reading the update report file generated by gff3_merge program: (%s)',
            merge_report)
        header_lines, log_lines, merge_report_dict = read_merge_report(
            gff3, merge_report)

    # Table of comparison between old and new IDs. Opened up front so that an
    # unwritable path fails before any work is done; closed in ``finally``.
    out_report = open(report, 'w') if report else None
    try:
        # old and new IDs pair dict
        # ID_dict = {old_ID: [newID], 'missing': [newID1, newID2]}
        ID_dict = {'missing': []}
        ID_order = []
        roots = list()
        logger.info('Generate new ID for features in (%s)', in_gff)
        for line in gff3.lines:
            try:
                if line['line_type'] != 'feature':
                    continue
                if uuid_on:
                    _assign_uuid_id(line, ID_dict, ID_order, alias)
                elif 'Parent' not in line['attributes']:
                    roots.append(line)
            except KeyError:
                logger.warning('[Missing Attributes] Line (%s)',
                               str(line['line_index'] + 1))

        # Hierarchical IDs; ``roots`` stays empty in UUID mode.
        IDnumber = 0
        for root in roots:
            newID = idgenerator(prefix, IDnumber, digitlen)
            IDnumber = newID['maxnum']
            old_root_id = root['attributes']['ID']
            ID_dict[old_root_id] = [newID['ID']]
            ID_order.append(old_root_id)
            if alias:
                root['attributes']['Alias'] = old_root_id
            root['attributes']['ID'] = newID['ID']
            children = root['children']
            alphabets = alphabets_suffix(len(children))
            for child in children:
                for index, parent in enumerate(child['attributes']['Parent']):
                    if parent in ID_dict:
                        child['attributes']['Parent'][index] = newID['ID']

                newcID = '%s-R%s' % (newID['ID'], alphabets.pop(0))
                old_child_id = child['attributes']['ID']
                ID_dict[old_child_id] = [newcID]
                ID_order.append(old_child_id)
                if alias:
                    child['attributes']['Alias'] = old_child_id
                child['attributes']['ID'] = newcID
                _rename_descendants(child, ID_dict, ID_order)

        if merge_report and out_merge_report:
            logger.info(
                'Update report file generated by gff3_merge program with new IDs.'
            )
            # Validate and update every row before the output file is
            # opened, so a mismatch does not leave a truncated report.
            known_ids = set(ID_order)
            for key in merge_report_dict:
                if key not in known_ids:
                    logger.error(
                        'The report file has to correspond to the gff3 file specified with -g'
                    )
                    sys.exit(1)
                for line_num in merge_report_dict[key]:
                    # update Tmp_OGSv0_ID
                    log_lines[line_num][4] = ID_dict[key][0]
            with open(out_merge_report, 'w') as out_f:
                for header_line in header_lines:
                    out_f.write(header_line + '\n')
                for log_line in log_lines:
                    out_f.write('\t'.join(log_line) + '\n')

        logger.info('Write out gff3 file: (%s)', out_gff)
        write_gff3(gff3, out_gff)

        if report:
            ID_order.append('missing')
            logger.info(
                'Generate a report of comparison between old and new IDs: (%s)',
                report)
            out_report.write('Old_ID\tNewID' + '\n')
            for key in ID_order:
                for value in ID_dict[key]:
                    out_report.write('%s\t%s' % (key, value) + '\n')
    finally:
        if out_report is not None:
            out_report.close()
예제 #7
0
def _write_fasta_records(seq, report_fh):
    """Write each header/sequence pair of ``seq`` to ``report_fh``.

    Pairs whose header or sequence is empty are skipped. Each record is
    written as the header line followed by the sequence line.
    """
    for header, sequence in seq.items():
        if len(header) != 0 and len(sequence) != 0:
            report_fh.write('{0:s}\n{1:s}\n'.format(header, sequence))


def _extract_by_type(gff, seq_type, dline, embedded_fasta, splicer_stype):
    """Return the sequences of one built-in sequence type as a dict.

    ``seq_type`` selects the extraction routine; ``splicer_stype`` is the
    value forwarded as the ``stype`` argument of ``splicer`` (it is ignored
    for the types handled by ``extract_start_end``). Unknown types yield an
    empty dict.
    """
    if seq_type in ('pre_trans', 'gene', 'exon'):
        return extract_start_end(gff, seq_type, dline, embedded_fasta)
    if seq_type == 'trans':
        return splicer(gff, ['exon', 'pseudogenic_exon'], dline,
                       splicer_stype, embedded_fasta)
    if seq_type == 'cds':
        return splicer(gff, ['CDS'], dline, splicer_stype, embedded_fasta)
    if seq_type == 'pep':
        spliced = splicer(gff, ['CDS'], dline, splicer_stype, embedded_fasta)
        peptides = dict()
        for header, sequence in spliced.items():
            # Relabel the CDS header as a peptide and translate the sequence.
            header = header.replace("|mRNA(CDS)|", "|peptide|")
            peptides[header] = translator(sequence)
        return peptides
    return dict()


def main(gff_file=None,
         fasta_file=None,
         embedded_fasta=False,
         stype=None,
         user_defined=None,
         dline=None,
         qc=True,
         output_prefix=None,
         logger=None):
    """Extract sequences described by a GFF3 file and write them as FASTA.

    Output goes to ``<output_prefix>_<type>.fa``. With ``stype == 'all'``
    one file is written for each of pre_trans, gene, exon, trans, cds and
    pep that yields at least one sequence; otherwise a single file for
    ``stype`` is written (created even when no sequence was extracted).
    Output files are opened only when writing starts and are always closed.

    Args:
        gff_file: GFF3 input, handed to ``Gff3``.
        fasta_file: External FASTA input; may be omitted when
            ``embedded_fasta`` is true.
        embedded_fasta: Whether the FASTA is expected inside the GFF3 file.
        stype: One of 'gene', 'exon', 'pre_trans', 'trans', 'cds', 'pep',
            'all' or 'user_defined'.
        user_defined: Two-element sequence (parent feature type, child
            feature type); required when ``stype`` is 'user_defined' and
            ignored (with a warning) otherwise.
        dline: Forwarded unchanged to ``extract_start_end`` / ``splicer``.
        qc: When true, run the GFF3 checks first and print the features
            with formatting errors to stdout.
        output_prefix: Prefix of the output file name(s); required.
        logger: Logger for progress messages. When ``None`` a logger with
            only a ``NullHandler`` attached is used.

    Raises:
        SystemExit: With status 1 when the arguments are invalid or, in QC
            mode, when an embedded FASTA was requested but is absent.
    """
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)
    if logger is None:
        # Every later step logs, so never leave the logger unset.
        logger = logger_null

    if not gff_file or (not fasta_file and not embedded_fasta) or not stype:
        print(
            'Gff file, fasta file, and type of extracted sequences need to be specified'
        )
        sys.exit(1)
    type_set = [
        'gene', 'exon', 'pre_trans', 'trans', 'cds', 'pep', 'all',
        'user_defined'
    ]
    if stype not in type_set:
        logger.error(
            'Your sequence type is "{0:s}". Sequence type must be one of {1:s}!'
            .format(stype, str(type_set)))
        sys.exit(1)

    if not output_prefix:
        print('[Error] Please specify the prefix of output file name...')
        sys.exit(1)
    fname = None
    if stype != 'all':
        logger.info('Specifying prefix of output file name: (%s)...',
                    output_prefix)
        fname = '{0:s}_{1:s}.fa'.format(output_prefix, stype)

    if stype == 'user_defined':
        if user_defined is None:
            logger.error('-u is needed in combination with -st user_defined.')
            sys.exit(1)
        if len(user_defined) != 2:
            logger.error(
                'Please specify parent and child feature via the -u argument. Format: [parent feature type],[child feature type]'
            )
            sys.exit(1)
    elif user_defined is not None:
        logger.warning(
            'Your sequence type is "{0:s}", -u argument will be ignored.'.
            format(stype))

    # fasta_file may legitimately be None when the FASTA is embedded, so use
    # %s logging arguments instead of the '{:s}' format spec.
    logger.info('Reading files: %s, %s...', gff_file, fasta_file)
    gff = None

    if qc:
        initial_phase = False
        gff = Gff3(gff_file=gff_file, fasta_external=fasta_file, logger=logger)
        if embedded_fasta and len(gff.fasta_embedded) == 0:
            logger.error('There is no embedded fasta in the GFF3 file.')
            sys.exit(1)
        logger.info('Checking errors...')
        gff.check_parent_boundary()
        gff.check_phase(initial_phase)
        gff.check_reference()
        error_set = function4gff.extract_internal_detected_errors(gff)
        t = intra_model.main(gff, logger=logger)
        if t:
            error_set.extend(t)
        t = single_feature.main(gff, logger=logger)
        if t:
            error_set.extend(t)

        if error_set:
            # These two error codes are not reported to the user.
            escaped_error = ['Esf0012', 'Esf0033']
            eSet = [e for e in error_set if e['eCode'] not in escaped_error]
            if len(eSet):
                logger.warning(
                    'The extracted sequences might be wrong for the following features which have formatting errors...'
                )
                print('ID\tError_Code\tError_Tag')
                for e in eSet:
                    tag = '[{0:s}]'.format(e['eTag'])
                    print(e['ID'], e['eCode'], tag)
    else:
        gff = Gff3(gff_file=gff_file,
                   fasta_external=fasta_file,
                   logger=logger_null)
        if embedded_fasta and len(gff.fasta_embedded) == 0:
            # NOTE(review): unlike the QC branch this only logs and carries
            # on; confirm whether it should also exit.
            logger.error('There is no embedded fasta in the GFF3 file.')

    logger.info('Extract sequences for {0:s}...'.format(stype))
    if stype == 'all':
        logger.info('Specifying prefix of output file name: (%s)...',
                    output_prefix)
        for tmp_stype in ('pre_trans', 'gene', 'exon', 'trans', 'cds', 'pep'):
            logger.info('\t- Extract sequences for {0:s}...'.format(tmp_stype))
            # NOTE(review): 'trans' and 'cds' have always handed the outer
            # stype ('all') to splicer while 'pep' hands 'pep'; kept as is.
            splicer_stype = tmp_stype if tmp_stype == 'pep' else stype
            seq = _extract_by_type(gff, tmp_stype, dline, embedded_fasta,
                                   splicer_stype)
            if len(seq):
                fname = '{0:s}_{1:s}.fa'.format(output_prefix, tmp_stype)
                with open(fname, 'w') as report_fh:
                    logger.info(
                        '\t\tPrint out extracted sequences: {0:s}_{1:s}.fa...'
                        .format(output_prefix, tmp_stype))
                    _write_fasta_records(seq, report_fh)
    else:
        if stype == 'user_defined':
            feature_type = [user_defined[0], user_defined[1]]
            seq = splicer(gff, feature_type, dline, stype, embedded_fasta)
        else:
            seq = _extract_by_type(gff, stype, dline, embedded_fasta, stype)
        # The output file is created even when nothing was extracted.
        with open(fname, 'w') as report_fh:
            if len(seq):
                logger.info(
                    'Print out extracted sequences: {0:s}_{1:s}.fa...'.format(
                        output_prefix, stype))
                _write_fasta_records(seq, report_fh)
예제 #8
0
def script_main():
    """Command-line entry point of the GFF3 QC program.

    Parses the command line, loads the GFF3 and FASTA inputs into a ``Gff3``
    object, runs the built-in, intra-model, inter-model and single-feature
    checks, then writes two tab-separated files: the error report
    (``-o``, default ``report.txt``) and the per-error-code statistic
    (``-s``, default ``statistic.txt``). Both files are closed before the
    function returns.

    Raises:
        SystemExit: When no input can be determined (status 1, after
            printing the help) or when ``check_parent_boundary`` returns a
            falsy value.
    """
    logger_stderr = logging.getLogger(__name__ + 'stderr')
    logger_stderr.setLevel(logging.INFO)
    stderr_handler = logging.StreamHandler()
    stderr_handler.setFormatter(
        logging.Formatter('%(levelname)-8s %(message)s'))
    logger_stderr.addHandler(stderr_handler)
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)
    import argparse
    from textwrap import dedent
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=dedent("""\

    Testing environment:
    1. Python 2.7

    Inputs:
    1. GFF3: Specify the file name with the -g or --gff argument; Please note that this program requires gene/pseudogene and mRNA/pseudogenic_transcript to have an ID attribute in column 9.
    2. fasta file: Specify the file name with the -f or --fasta argument

    Outputs:
    1. Error report for the input GFF3 file
	* Line_num: Line numbers of the found problematic models in the input GFF3 file.
	* Error_code: Error codes for the found problematic models. Please refer to lib/ERROR/ERROR.py to see the full list of Error_code and the corresponding Error_tag.
        * Error_tag: Detail of the found errors for the problematic models. Please refer to lib/ERROR/ERROR.py to see the full list of Error_code and the corresponding Error_tag.

    Quick start:
    gff3_QC -g example_file/example.gff3 -f example_file/reference.fa -o test
    or
    gff3_QC --gff example_file/example.gff3 --fasta example_file/reference.fa --output test

    """))
    parser.add_argument('-g',
                        '--gff',
                        type=str,
                        help='Genome annotation file, gff3 format')
    parser.add_argument('-f',
                        '--fasta',
                        type=str,
                        help='Genome sequences, fasta format')
    parser.add_argument(
        '-noncg',
        '--noncanonical_gene',
        action="store_true",
        help='gff3 file is not formatted in the canonical gene model format.')
    parser.add_argument(
        '-i',
        '--initial_phase',
        action="store_true",
        help='Check whether initial CDS phase is 0 (default: no check)')
    parser.add_argument(
        '-n',
        '--allowed_num_of_n',
        type=int,
        default=0,
        help=
        'Max number of Ns allowed in a feature, anything more will be reported as an error (default: 0)'
    )
    parser.add_argument(
        '-t',
        '--check_n_feature_types',
        nargs='*',
        default=['CDS'],
        help=
        'Count the number of Ns in each feature with the type specified, multiple types may be specified, ex: -t CDS exon (default: "CDS")'
    )
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        help='output file name (default: report.txt)')
    parser.add_argument('-s',
                        '--statistic',
                        type=str,
                        help='statistic file name (default: statistic.txt)')
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version='%(prog)s ' + __version__)

    args = parser.parse_args()
    if args.gff:
        logger_stderr.info('Checking gff file (%s)...', args.gff)
    elif not sys.stdin.isatty():  # if STDIN connected to pipe or file
        args.gff = sys.stdin
        logger_stderr.info('Reading from STDIN...')
    else:  # no input
        parser.print_help()
        sys.exit(1)
    if args.fasta:
        logger_stderr.info('Checking genome fasta (%s)...', args.fasta)
    elif not sys.stdin.isatty():  # if STDIN connected to pipe or file
        args.fasta = sys.stdin
        logger_stderr.info('Reading from STDIN...')
    else:  # no input
        parser.print_help()
        sys.exit(1)
    # N-counting is enabled as soon as either related option is truthy.
    check_n = bool(args.allowed_num_of_n or args.check_n_feature_types)

    logger_stderr.info('Reading gff files: (%s)...\n', args.gff)
    gff3 = Gff3(gff_file=args.gff,
                fasta_external=args.fasta,
                logger=logger_null)
    logger_stderr.info('Checking errors in the gff files: (%s)...\n', args.gff)
    if not gff3.check_parent_boundary():
        sys.exit()
    gff3.check_unresolved_parents()
    if not args.noncanonical_gene:
        gff3.check_phase(args.initial_phase)
    gff3.check_reference(fasta_external=args.fasta,
                         check_n=check_n,
                         allowed_num_of_n=args.allowed_num_of_n,
                         feature_types=args.check_n_feature_types)
    logger_stderr.info('\t- Checking missing attributes: (%s)...\n',
                       'function4gff.FIX_MISSING_ATTR()')
    function4gff.FIX_MISSING_ATTR(gff3, logger=logger_stderr)

    # Collect the errors of every checker; a checker may return nothing.
    error_set = list()
    cmd = function4gff.extract_internal_detected_errors(gff3)
    if cmd:
        error_set.extend(cmd)
    logger_stderr.info('\t- Checking intra-model errors: (%s)...\n', args.gff)
    cmd = intra_model.main(gff3,
                           logger=logger_stderr,
                           noncanonical_gene=args.noncanonical_gene)
    if cmd:
        error_set.extend(cmd)
    logger_stderr.info('\t- Checking inter-model errors: (%s)...\n', args.gff)
    cmd = inter_model.main(gff3,
                           args.gff,
                           args.fasta,
                           logger=logger_stderr,
                           noncanonical_gene=args.noncanonical_gene)
    if cmd:
        error_set.extend(cmd)
    logger_stderr.info('\t- Checking single-feature errors: (%s)...\n',
                       args.gff)
    cmd = single_feature.main(gff3, logger=logger_stderr)
    if cmd:
        error_set.extend(cmd)

    report_path = args.output if args.output else 'report.txt'
    statistic_path = args.statistic if args.statistic else 'statistic.txt'

    logger_stderr.info('Print QC report at {0:s}'.format(report_path))
    with open(report_path, 'w') as report_fh:
        logger_stderr.info(
            'Print QC statistic report at {0:s}'.format(statistic_path))
        with open(statistic_path, 'w') as statistic_fh:
            # Both outputs walk the errors in the same order.
            ordered_errors = sorted(error_set,
                                    key=lambda x: sorted(x.keys()))

            report_fh.write('Line_num\tError_code\tError_tag\n')
            for e in ordered_errors:
                tag = '[{0:s}]'.format(e['eTag'])
                report_fh.write('{0:s}\t{1:s}\t{2:s}\n'.format(
                    str(e['line_num']), str(e['eCode']), str(tag)))

            #statistic_file
            error_counts = dict()
            ERROR_INFO = ERROR.INFO
            statistic_fh.write(
                'Error_code\tNumber_of_problematic_models\tError_tag\n')
            for s in ordered_errors:
                if s['eCode'] not in error_counts:
                    error_counts[s['eCode']] = {
                        'count': 0,
                        'etag': ERROR_INFO[s['eCode']]
                    }
                error_counts[s['eCode']]['count'] += 1
            for a in error_counts:
                statistic_fh.write('{0:s}\t{1:s}\t{2:s}\n'.format(
                    str(a), str(error_counts[a]['count']),
                    str(error_counts[a]['etag'])))
예제 #9
0
def main(gff_file1,
         gff_file2,
         fasta,
         report,
         output_gff,
         all_assign=False,
         auto=True,
         user_defined1=None,
         user_defined2=None,
         logger=None):
    """Verify the replace tags of ``gff_file1`` and merge it with ``gff_file2``.

    With ``auto`` set, replace tags are first assigned automatically into
    the ``auto_replace_tag`` directory and the revised copy of
    ``gff_file1`` produced there is the file that gets checked and merged;
    otherwise ``gff_file1`` itself is used. When ``check_replace`` reports
    models, they are printed to stdout and the function returns without
    merging.

    Args:
        gff_file1: GFF3 file whose models carry (or receive) replace tags.
        gff_file2: Second GFF3 file handed to the merge step.
        fasta: FASTA file handed to the auto-assignment step.
        report: Report argument handed to ``gff3_merge.merge.main``.
        output_gff: Output argument handed to ``gff3_merge.merge.main``.
        all_assign: Forwarded to ``gff3_merge.auto_replace_tag.main``.
        auto: Whether to run the automatic replace-tag assignment.
        user_defined1: Forwarded to the gff3_merge steps and, in auto mode,
            to ``check_replace``.
        user_defined2: Forwarded to the gff3_merge steps.
        logger: Logger for progress messages; a logger with only a
            ``NullHandler`` attached is used when omitted.
    """
    silent_logger = logging.getLogger(__name__ + 'null')
    silent_logger.addHandler(logging.NullHandler())

    if not logger:
        logger = silent_logger

    # When the path matches '<non-space>/<non-space>' at its end, keep only
    # the second captured part; otherwise use the path unchanged.
    path_match = re.search(r'(\S+)/(\S+)$', gff_file1)
    gff_file1_name = path_match.groups()[1] if path_match else gff_file1

    if auto:
        work_dir = 'auto_replace_tag'
        check_file = '{0:s}/check1.txt'.format(work_dir)
        revised_gff = '{0:s}/Revised_{1:s}'.format(work_dir, gff_file1_name)
        revision_report = '{0:s}/replace_tag_report.txt'.format(work_dir)

        logger.info(
            '========== Auto-assignment of replace tags for each transcript model =========='
        )
        gff3_merge.auto_replace_tag.main(gff1=gff_file1,
                                         gff2=gff_file2,
                                         fasta=fasta,
                                         outdir=work_dir,
                                         scode='TEMP',
                                         all_assign=all_assign,
                                         user_defined1=user_defined1,
                                         user_defined2=user_defined2,
                                         logger=logger)
        gff3_merge.revision.main(gff_file=gff_file1,
                                 revision_file=check_file,
                                 output_gff=revised_gff,
                                 report_file=revision_report,
                                 user_defined1=user_defined1,
                                 auto=auto,
                                 logger=logger)

        # The revised file is the one that is checked and merged.
        checked_gff = revised_gff
        replace_args = (user_defined1, )
        hint_suffix = '\n'
    else:
        checked_gff = gff_file1
        replace_args = ()
        hint_suffix = ''

    logger.info(
        '========== Check whether there are missing replace tags =========='
    )
    gff3 = Gff3(gff_file=checked_gff, logger=silent_logger)
    error_models = check_replace(gff3, *replace_args)
    if error_models:
        logger.error('There are models missing replace tags...')
        hint = (
            "Please check the below models in {0:s}. Please specify the proper replaced models at colulumn 9. For example, 'replace=[Transcript ID]'. If this is a newly added model, please put it as 'replace=NA'. Then, re-excute the program."
            + hint_suffix)
        logger.error(hint.format(checked_gff))
        for line in error_models:
            raw_line = line['line_raw']
            # Only the non-auto mode strips the raw line before printing.
            print(raw_line if auto else raw_line.strip())
        return

    logger.info('- All models have replace tags.')

    logger.info('========== Merge the two gff files ==========')
    gff3_merge.merge.main(checked_gff, gff_file2, output_gff, report,
                          user_defined1, user_defined2, logger)
예제 #10
0
def script_main():
    """Command-line entry point of the GFF3 fix program.

    Parses the command line, reads the tab-separated QC report (header line
    skipped) into two lookup tables, loads the GFF3 file and hands
    everything to ``gff3_fix.fix.main``.

    The tables built from the report are:
      * ``error_dict``: error code -> list of line-number lists, one inner
        list per report row, e.g. ``{'Emr0001': [[15, 16], [13]]}``.
      * ``line_num_dict``: line number -> ``{error code: error tag}``.

    Raises:
        SystemExit: After printing the help when ``-qc_r`` or ``-g`` is
            missing, or with status 1 when the GFF3 file cannot be read.
    """
    logger_stderr = logging.getLogger(__name__ + 'stderr')
    logger_stderr.setLevel(logging.INFO)
    stderr_handler = logging.StreamHandler()
    stderr_handler.setFormatter(
        logging.Formatter('%(levelname)-8s %(message)s'))
    logger_stderr.addHandler(stderr_handler)
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)
    import argparse
    from textwrap import dedent
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=dedent("""\

    Testing environment:
    1. Python 2.7

    Input:
    1. Error report: Error report from gff3_QC.py. Specify the file name with the -qc_r or --qc_report argument;
    2. GFF3: Specify the file name with the -g or --gff argument;

    Output:
    1. Corrected GFF3


    Quick start:
    gff3_fix -qc_r error.txt -g example_file/example.gff3 -og corrected.gff3
    """))

    parser.add_argument('-qc_r',
                        '--qc_report',
                        type=str,
                        help='Error report from gff3_QC.py')
    parser.add_argument('-g',
                        '--gff',
                        type=str,
                        help='Genome annotation file, gff3 format')
    #parser.add_argument('-r', '--report', type=str, help='output report file name')
    parser.add_argument('-og',
                        '--output_gff',
                        type=str,
                        help='output gff3 file name',
                        default='corrected.gff3')
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version='%(prog)s ' + __version__)

    args = parser.parse_args()
    if args.qc_report:
        logger_stderr.info('Checking QC report file (%s)...', args.qc_report)
    else:  # no input
        parser.print_help()
        sys.exit()

    if args.gff:
        logger_stderr.info('Checking GFF3 file (%s)...', args.gff)
    else:  # no input
        parser.print_help()
        sys.exit()

    logger_stderr.info('Reading QC report file: (%s)...\n', args.qc_report)
    #error_dict example: {'Emr0001': [[15,16],[13]],'Esf0005': [[17]]}
    error_dict = {}
    #line_num_dict example: {3: ['Emr0001','Esf0003'], 15: ['Emr0026']}
    line_num_dict = {}
    try:
        with open(args.qc_report, "r") as qcr:
            #ignore the first line (header)
            next(qcr)
            for line in qcr:
                line = line.strip()
                if not line:
                    continue
                try:
                    lines = line.split("\t")
                    # Build a real list: a lazy map() object would be
                    # drained by the loop below, leaving error_dict holding
                    # exhausted iterators on Python 3.
                    line_num_list = [
                        int(num) for num in re.findall(r'\d+', lines[0])
                    ]
                    if lines[1] not in error_dict:
                        error_dict[lines[1]] = [line_num_list]
                    else:
                        error_dict[lines[1]].append(line_num_list)
                    for line_num in line_num_list:
                        if line_num not in line_num_dict:
                            line_num_dict[line_num] = {lines[1]: lines[2]}
                        else:
                            line_num_dict[line_num][lines[1]] = lines[2]
                except IndexError:
                    # Row with fewer than three tab-separated columns.
                    logger_stderr.warning('Failed to recognize - %s', line)

    except Exception:
        # Best effort: report the failure and continue with whatever was
        # parsed so far. SystemExit/KeyboardInterrupt are not swallowed.
        logger_stderr.error('Failed to read QC report file!')
    logger_stderr.info('Reading GFF3 file: (%s)...\n', args.gff)
    try:
        gff3 = Gff3(gff_file=args.gff, logger=logger_null)
    except Exception:
        logger_stderr.error('Failed to read GFF3 file!')
        sys.exit(1)

    gff3_fix.fix.main(gff3=gff3,
                      output_gff=args.output_gff,
                      error_dict=error_dict,
                      line_num_dict=line_num_dict,
                      logger=logger_null)