Python set_column_9_value примеры, biocode.gff.set_column_9_value Python примеры использования

Пример #1

0

Показать файл

Файл: correct_RNAs_missing_genes.py Проект: zhiyongli1995/biocode

def main():
    """Add a gene feature for every parentless RNA feature in a GFF3 file.

    Reads the file given by ``--input`` and writes the result to ``--output``.
    Comment lines (starting with ``#``) and rows that do not have exactly nine
    tab-separated columns are passed through.  For each row whose type ends in
    ``RNA`` and for which ``gff.column_9_value`` returns ``None`` for
    ``Parent``, a copy of the row with type ``gene`` and ID ``<rna id>.gene``
    is written first, followed by the RNA row with ``Parent`` set to that ID.
    All other rows are written unchanged.
    """
    parser = argparse.ArgumentParser(
        description='Adds gene features for RNAs which lack them')

    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    # Context managers close both handles (and flush the output) even when a
    # malformed row raises part-way through the file.
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            # named feat_id rather than id so the builtin is not shadowed
            feat_id = gff.column_9_value(cols[8], 'ID')
            parent = gff.column_9_value(cols[8], 'Parent')

            if cols[2].endswith('RNA') and parent is None:
                gene_id = "{0}.gene".format(feat_id)

                # the synthetic gene reuses the RNA's coordinates and attributes
                gene_cols = list(cols)
                gene_cols[2] = 'gene'
                gene_cols[8] = gff.set_column_9_value(gene_cols[8], 'ID',
                                                      gene_id)
                ofh.write("{0}\n".format("\t".join(gene_cols)))

                cols[8] = gff.set_column_9_value(cols[8], 'Parent', gene_id)
                ofh.write("{0}\n".format("\t".join(cols)))
            else:
                ofh.write("{0}\n".format(line))

Пример #2

0

Показать файл

Файл: correct_gff3_exon_parentage.py Проект: zhiyongli1995/biocode

def main():
    """Re-point exon ``Parent`` attributes at the most recent RNA feature.

    Reads the GFF3 file given by ``--input`` and writes the result to
    ``--output``.  Comment lines (starting with ``#``) and rows that do not
    have exactly nine tab-separated columns are passed through.  The ID of
    each row whose type ends in ``RNA`` is remembered; every following
    ``exon`` row whose ``Parent`` differs from that ID has its ``Parent``
    replaced with it and an INFO message is printed.  All other rows are
    written unchanged.
    """
    parser = argparse.ArgumentParser(
        description=
        'Updates exon Parent attributes to point at the correct RNA feature')

    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    last_rna_id = None

    # Context managers close both handles (and flush the output) even when a
    # malformed row raises part-way through the file.
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            # named feat_id rather than id so the builtin is not shadowed
            feat_id = gff.column_9_value(cols[8], 'ID')
            parent = gff.column_9_value(cols[8], 'Parent')

            if cols[2].endswith('RNA'):
                last_rna_id = feat_id

            elif cols[2] == 'exon' and parent != last_rna_id:
                if last_rna_id is None:
                    # No RNA ID is known yet, so there is nothing valid to
                    # point the exon at.
                    # NOTE(review): presumably set_column_9_value would write
                    # the text "None" as the Parent here - confirm.
                    print(
                        "WARNING: no preceding RNA ID to correct parentage for feature ({0}) type {1}; leaving it unchanged"
                        .format(feat_id, cols[2]))
                else:
                    print(
                        "INFO: correcting unexpected parentage for feature ({0}) type {2}.  Expected ({1})"
                        .format(feat_id, last_rna_id, cols[2]))
                    cols[8] = gff.set_column_9_value(cols[8], 'Parent',
                                                     last_rna_id)
                    line = "\t".join(cols)

            ofh.write("{0}\n".format(line))

Пример #3

0

Показать файл

Файл: correct_RNAs_missing_genes.py Проект: jorvis/biocode

def main():
    """Add a gene feature for every parentless RNA feature in a GFF3 file.

    Reads the file given by ``--input`` and writes the result to ``--output``.
    Comment lines (starting with ``#``) and rows that do not have exactly nine
    tab-separated columns are passed through.  For each row whose type ends in
    ``RNA`` and for which ``gff.column_9_value`` returns ``None`` for
    ``Parent``, a copy of the row with type ``gene`` and ID ``<rna id>.gene``
    is written first, followed by the RNA row with ``Parent`` set to that ID.
    All other rows are written unchanged.
    """
    parser = argparse.ArgumentParser( description='Adds gene features for RNAs which lack them')

    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write' )
    args = parser.parse_args()

    # Context managers close both handles (and flush the output) even when a
    # malformed row raises part-way through the file.
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line) )
                continue

            # named feat_id rather than id so the builtin is not shadowed
            feat_id = gff.column_9_value(cols[8], 'ID')
            parent  = gff.column_9_value(cols[8], 'Parent')

            if cols[2].endswith('RNA') and parent is None:
                gene_id = "{0}.gene".format(feat_id)

                # the synthetic gene reuses the RNA's coordinates and attributes
                gene_cols = list(cols)
                gene_cols[2] = 'gene'
                gene_cols[8] = gff.set_column_9_value(gene_cols[8], 'ID', gene_id)
                ofh.write("{0}\n".format("\t".join(gene_cols)) )

                cols[8] = gff.set_column_9_value(cols[8], 'Parent', gene_id)
                ofh.write("{0}\n".format("\t".join(cols)) )
            else:
                ofh.write("{0}\n".format(line) )

Пример #4

0

Показать файл

Файл: correct_gff3_exon_parentage.py Проект: jorvis/biocode

def main():
    """Re-point exon ``Parent`` attributes at the most recent RNA feature.

    Reads the GFF3 file given by ``--input`` and writes the result to
    ``--output``.  Comment lines (starting with ``#``) and rows that do not
    have exactly nine tab-separated columns are passed through.  The ID of
    each row whose type ends in ``RNA`` is remembered; every following
    ``exon`` row whose ``Parent`` differs from that ID has its ``Parent``
    replaced with it and an INFO message is printed.  All other rows are
    written unchanged.
    """
    parser = argparse.ArgumentParser( description='Updates exon Parent attributes to point at the correct RNA feature')

    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write' )
    args = parser.parse_args()

    last_rna_id = None

    # Context managers close both handles (and flush the output) even when a
    # malformed row raises part-way through the file.
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line) )
                continue

            # named feat_id rather than id so the builtin is not shadowed
            feat_id = gff.column_9_value(cols[8], 'ID')
            parent  = gff.column_9_value(cols[8], 'Parent')

            if cols[2].endswith('RNA'):
                last_rna_id = feat_id

            elif cols[2] == 'exon' and parent != last_rna_id:
                if last_rna_id is None:
                    # No RNA ID is known yet, so there is nothing valid to
                    # point the exon at.
                    # NOTE(review): presumably set_column_9_value would write
                    # the text "None" as the Parent here - confirm.
                    print("WARNING: no preceding RNA ID to correct parentage for feature ({0}) type {1}; leaving it unchanged".format(feat_id, cols[2]) )
                else:
                    print("INFO: correcting unexpected parentage for feature ({0}) type {2}.  Expected ({1})".format(feat_id, last_rna_id, cols[2]) )
                    cols[8] = gff.set_column_9_value(cols[8], 'Parent', last_rna_id)
                    line = "\t".join(cols)

            ofh.write("{0}\n".format(line) )

Пример #5

0

Показать файл

Файл: add_gff3_locus_tags.py Проект: yangjie4546/biocode

def main():
    """Add ``locus_tag`` attributes to the gene and RNA rows of a GFF3 file.

    Rows are read from ``--input_file`` and written to ``--output_file`` (or
    STDOUT when that option is not given).  Rows without exactly nine
    tab-separated columns are passed through.  Each ``gene`` row receives a
    locus tag: the value mapped to its ID in the ``--id_file`` mapping (as
    loaded by ``parse_mapping_file``) when present, otherwise one generated
    from ``--prefix``, a zero-padded counter and, when ``--molecule_map`` is
    given, the token mapped to the row's molecule.  Rows whose type ends in
    ``RNA`` inherit the tag of their ``Parent`` gene.

    Raises:
        Exception: when a ``--custom`` mode is used without ``--molecule_map``
            and ``--id_file``; when a mapped ID does not follow the bmicroti
            convention; when a gene's molecule is missing from
            ``--molecule_map``; when a locus tag taken from ``--id_file`` has
            already been assigned; or when an RNA row precedes its parent gene.
    """
    parser = argparse.ArgumentParser(
        description='Adds locus tag identifiers to GFF3 features')

    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='TA file of source molecules')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Optional output file path (else STDOUT)')
    parser.add_argument('-p',
                        '--prefix',
                        type=str,
                        required=True,
                        help='The prefix portion of IDs to be generated')
    parser.add_argument(
        '-a',
        '--padding',
        type=int,
        required=True,
        help=
        'Specify the minimum with to reserve for the numeric portion of the IDs.  Smaller numbers will be zero-padded.'
    )
    parser.add_argument('-n',
                        '--interval',
                        type=int,
                        required=False,
                        default=1,
                        help='Interval between generated identifiers')
    parser.add_argument(
        '-s',
        '--starting_id',
        type=int,
        required=False,
        default=0,
        help='Initial numeric portion of IDs to be generated (do not zero-pad)'
    )
    parser.add_argument(
        '-d',
        '--id_file',
        type=str,
        required=False,
        help=
        'Pass a 2-column file of IDs to retain (in case you have mapped genes, for example)'
    )
    parser.add_argument(
        '-m',
        '--molecule_map',
        type=str,
        required=False,
        help=
        'Pass a 2-column file of molecule->token identifiers (see documentation)'
    )
    parser.add_argument(
        '-c',
        '--custom',
        type=str,
        required=False,
        help='For custom parsing steps.  Most should ignore this.')

    args = parser.parse_args()
    check_arguments(args)

    # locus tag assigned to each gene ID (so child RNAs can inherit it)
    gene_loci = dict()
    next_id = args.starting_id
    last_molecule = None

    id_mapping = parse_mapping_file(args.id_file)
    mol_mapping = parse_mapping_file(args.molecule_map)
    # a set keeps the duplicate check constant-time however many genes exist
    loci_assigned = set()

    ## if using Joana's custom options, check assumptions
    if args.custom == 'joana':
        if args.molecule_map is None or args.id_file is None:
            raise Exception(
                "ERROR: Expected --molecule_map and --id_file options when using --custom=joana"
            )

        ## need to process the ID map to reformat IDs
        for source_id in id_mapping:
            # TP05_0002 -> TpMuguga_05g00002
            m = re.match(r'TP(\d\d)_(\d+)', id_mapping[source_id])
            if m:
                id_mapping[source_id] = "{0}_{1}g0{2}".format(
                    args.prefix, m.group(1), m.group(2))

    elif args.custom == 'bmicroti':
        microti_map = {'I': '01', 'II': '02', 'III': '03', 'IV': '04'}

        if args.molecule_map is None or args.id_file is None:
            raise Exception(
                "ERROR: Expected --molecule_map and --id_file options when using --custom=bmicroti"
            )

        for source_id in id_mapping:
            m = re.match(r'BBM_(\D+)(\d+)', id_mapping[source_id])
            if m:
                print("Changing id from {0} to ".format(source_id))
                id_mapping[source_id] = "{0}_{1}g{2}".format(
                    args.prefix, microti_map[m.group(1)], m.group(2))
                print(id_mapping[source_id])
            else:
                raise Exception(
                    "ERROR: id ({0}) didn't match expected convention.".
                    format(id_mapping[source_id]))

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    last_number_portion_assigned = 0

    try:
        with open(args.input_file) as infile:
            for line in infile:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    fout.write(line + "\n")
                    continue

                if last_molecule is None or (
                        args.molecule_map is not None
                        and mol_mapping[cols[0]] != mol_mapping[last_molecule]):
                    print("Found molecule {0}, resetting id counter from {1}".
                          format(cols[0], next_id))
                    next_id = args.starting_id
                    last_molecule = cols[0]

                # grab the ID column if any (names avoid shadowing builtins)
                feat_id = gff.column_9_value(cols[8], 'ID')
                parent = gff.column_9_value(cols[8], 'Parent')
                feat_type = cols[2]

                # issue

                # 66F4EEF2E3C863C251F831817FF71233
                # 7F1917E4D81A959078C9A38E15488BC0
                # E22888670919A4A888572155F40F2654
                # B9D9CF1F7A8E5A2E1124F0A6C68840DC -> BBM_I00232
                # gene before is: 6DE6BCCE69CCDC39994A0940B2ED524A - novel

                # errors on: BmicrotiR1_01g00233 -> BBM_I00233
                #5800A4110A62E4EAE57AFAD1F8D65CB3        BBM_I00233

                if feat_type == 'gene':
                    while True:
                        mapped = feat_id in id_mapping

                        if mapped:
                            locus_id = id_mapping[feat_id]
                        else:
                            if args.molecule_map is None:
                                locus_id = "{0}_{1}".format(
                                    args.prefix,
                                    str(next_id).zfill(args.padding))
                            elif cols[0] not in mol_mapping:
                                raise Exception(
                                    "ERROR: --molecule_map passed but {0} wasn't found in it."
                                    .format(cols[0]))
                            elif args.custom == 'bmicroti':
                                locus_id = "{0}_{2}g{1}".format(
                                    args.prefix,
                                    str(int(last_number_portion_assigned) +
                                        1).zfill(args.padding),
                                    mol_mapping[cols[0]])
                            else:
                                locus_id = "{0}_{2}g{1}".format(
                                    args.prefix,
                                    str(next_id).zfill(args.padding),
                                    mol_mapping[cols[0]])

                            next_id += args.interval

                        cols[8] = gff.set_column_9_value(
                            cols[8], 'locus_tag', locus_id)

                        ## make sure this wasn't generated already (possibly conflict
                        #   between --id_file and an auto-generated ID?)
                        if locus_id not in loci_assigned:
                            break

                        if mapped:
                            # A mapped tag is the same on every attempt, so
                            # retrying would loop forever; fail loudly instead.
                            raise Exception(
                                "ERROR: locus tag ({0}) mapped to gene ({1}) by --id_file was already assigned"
                                .format(locus_id, feat_id))

                        print(
                            "DEBUG: Duplicate ID assigned ({0}), trying again."
                            .format(locus_id))

                        if args.custom == 'bmicroti':
                            # bmicroti tags derive from the last assigned number
                            # rather than next_id; advance it so the retry
                            # produces a different tag.
                            last_number_portion_assigned = int(
                                last_number_portion_assigned) + 1

                    loci_assigned.add(locus_id)
                    gene_loci[feat_id] = locus_id

                    # remember the trailing digits for the bmicroti numbering
                    m = re.search(r"(\d+)$", locus_id)
                    if m:
                        last_number_portion_assigned = m.group(1)

                elif feat_type.endswith('RNA'):
                    if parent in gene_loci:
                        cols[8] = gff.set_column_9_value(
                            cols[8], 'locus_tag', gene_loci[parent])
                    else:
                        raise Exception(
                            "ERROR: found RNA {0} whose parent {1} wasn't found yet"
                            .format(feat_id, parent))

                fout.write("\t".join(cols) + "\n")
    finally:
        # close a real output file but never the shared STDOUT stream
        if fout is not sys.stdout:
            fout.close()

Пример #6

0

Показать файл

Файл: convert_glimmerHMM_gff_to_gff3.py Проект: zhiyongli1995/biocode

def main():
    """Convert glimmerHMM GFF output into GFF3 gene/mRNA/exon/CDS rows.

    Reads ``--input_file`` and writes ``--output_file``.  Comment lines
    (starting with ``#``) are copied; rows without exactly nine tab-separated
    columns are dropped.  Each ``mRNA`` row is written twice: first as a
    ``gene`` row with the original attributes, then as the mRNA with its ID
    and Name changed to ``<id>.mRNA``.  Each ``CDS`` row is written as an
    ``exon`` row (ID ``<parent>.exon.<n>``, phase ``.``) followed by the CDS
    (ID ``<parent>.cds``), both with ``Parent`` set to ``<parent>.mRNA``.
    Rows of any other type are not written.

    Raises:
        ValueError: if the start or end column of a nine-column row is not an
            integer.
    """
    parser = argparse.ArgumentParser(
        description='Converts glimmerHMM GFF output to GFF3')

    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input file to parse')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')

    args = parser.parse_args()

    # number of exons emitted so far for each parent ID
    next_exon_num = defaultdict(int)

    # The output is opened first, as before; context managers close both
    # handles even when a malformed row raises part-way through the file.
    with open(args.output_file, 'w') as fout, \
            open(args.input_file, 'r') as infile:
        for line in infile:
            if line.startswith('#'):
                fout.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                continue

            feat_type = cols[2]

            # The coordinates are not used below, but parsing them keeps the
            # existing fail-fast behaviour on non-integer start/end values.
            int(cols[3])
            int(cols[4])

            # named feat_id rather than id so the builtin is not shadowed
            feat_id = gff.column_9_value(cols[8], 'ID')
            parent = gff.column_9_value(cols[8], 'Parent')

            if feat_type == 'mRNA':
                # NOTE(review): the mRNA row is not given a Parent pointing at
                # the gene row created here - confirm that is intended.
                gene_cols = list(cols)
                gene_cols[2] = 'gene'

                mrna_id = "{0}.mRNA".format(feat_id)
                cols[8] = gff.set_column_9_value(cols[8], 'ID', mrna_id)
                cols[8] = gff.set_column_9_value(cols[8], 'Name', mrna_id)
                cols[8] = gff.order_column_9(cols[8])

                # print the gene and mRNA
                fout.write("{0}\n".format("\t".join(gene_cols)))
                fout.write("{0}\n".format("\t".join(cols)))

            elif feat_type == 'CDS':
                # copy before the CDS attributes are rewritten below
                exon_cols = list(cols)

                cds_id = "{0}.cds".format(parent)
                mrna_id = "{0}.mRNA".format(parent)

                cols[8] = gff.set_column_9_value(cols[8], 'ID', cds_id)
                cols[8] = gff.set_column_9_value(cols[8], 'Name', cds_id)
                cols[8] = gff.set_column_9_value(cols[8], 'Parent', mrna_id)
                cols[8] = gff.order_column_9(cols[8])

                exon_id = "{0}.exon.{1}".format(parent, next_exon_num[parent])
                next_exon_num[parent] += 1

                exon_cols[2] = 'exon'
                exon_cols[7] = '.'
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'ID',
                                                      exon_id)
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'Name',
                                                      exon_id)
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'Parent',
                                                      mrna_id)
                exon_cols[8] = gff.order_column_9(exon_cols[8])

                fout.write("{0}\n".format("\t".join(exon_cols)))
                fout.write("{0}\n".format("\t".join(cols)))

Пример #7

0

Показать файл

Файл: add_gff3_locus_tags.py Проект: jorvis/biocode

def main():
    """Add ``locus_tag`` attributes to the gene and RNA rows of a GFF3 file.

    Rows are read from ``--input_file`` and written to ``--output_file`` (or
    STDOUT when that option is not given).  Rows without exactly nine
    tab-separated columns are passed through.  Each ``gene`` row receives a
    locus tag: the value mapped to its ID in the ``--id_file`` mapping (as
    loaded by ``parse_mapping_file``) when present, otherwise one generated
    from ``--prefix``, a zero-padded counter and, when ``--molecule_map`` is
    given, the token mapped to the row's molecule.  Rows whose type ends in
    ``RNA`` inherit the tag of their ``Parent`` gene.

    Raises:
        Exception: when a ``--custom`` mode is used without ``--molecule_map``
            and ``--id_file``; when a mapped ID does not follow the bmicroti
            convention; when a gene's molecule is missing from
            ``--molecule_map``; when a locus tag taken from ``--id_file`` has
            already been assigned; or when an RNA row precedes its parent gene.
    """
    parser = argparse.ArgumentParser( description='Adds locus tag identifiers to GFF3 features')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='TA file of source molecules' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)' )
    parser.add_argument('-p', '--prefix', type=str, required=True, help='The prefix portion of IDs to be generated')
    parser.add_argument('-a', '--padding', type=int, required=True, help='Specify the minimum with to reserve for the numeric portion of the IDs.  Smaller numbers will be zero-padded.' )
    parser.add_argument('-n', '--interval', type=int, required=False, default=1, help='Interval between generated identifiers' )
    parser.add_argument('-s', '--starting_id', type=int, required=False, default=0, help='Initial numeric portion of IDs to be generated (do not zero-pad)' )
    parser.add_argument('-d', '--id_file', type=str, required=False, help='Pass a 2-column file of IDs to retain (in case you have mapped genes, for example)')
    parser.add_argument('-m', '--molecule_map', type=str, required=False, help='Pass a 2-column file of molecule->token identifiers (see documentation)')
    parser.add_argument('-c', '--custom', type=str, required=False, help='For custom parsing steps.  Most should ignore this.')

    args = parser.parse_args()
    check_arguments(args)

    # locus tag assigned to each gene ID (so child RNAs can inherit it)
    gene_loci = dict()
    next_id = args.starting_id
    last_molecule = None

    id_mapping  = parse_mapping_file( args.id_file )
    mol_mapping = parse_mapping_file( args.molecule_map )
    # a set keeps the duplicate check constant-time however many genes exist
    loci_assigned = set()

    ## if using Joana's custom options, check assumptions
    if args.custom == 'joana':
        if args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=joana")

        ## need to process the ID map to reformat IDs
        for source_id in id_mapping:
            # TP05_0002 -> TpMuguga_05g00002
            m = re.match(r'TP(\d\d)_(\d+)', id_mapping[source_id])
            if m:
                id_mapping[source_id] = "{0}_{1}g0{2}".format(args.prefix, m.group(1), m.group(2) )

    elif args.custom == 'bmicroti':
        microti_map = { 'I':'01', 'II':'02', 'III':'03', 'IV':'04' }

        if args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=bmicroti")

        for source_id in id_mapping:
            m = re.match(r'BBM_(\D+)(\d+)', id_mapping[source_id])
            if m:
                print("Changing id from {0} to ".format(source_id))
                id_mapping[source_id] = "{0}_{1}g{2}".format(args.prefix, microti_map[m.group(1)], m.group(2) )
                print(id_mapping[source_id])
            else:
                raise Exception("ERROR: id ({0}) didn't match expected convention.".format(id_mapping[source_id]))

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    last_number_portion_assigned = 0

    try:
        with open(args.input_file) as infile:
            for line in infile:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    fout.write(line + "\n")
                    continue

                if last_molecule is None or (args.molecule_map is not None and mol_mapping[cols[0]] != mol_mapping[last_molecule]):
                    print("Found molecule {0}, resetting id counter from {1}".format(cols[0], next_id) )
                    next_id = args.starting_id
                    last_molecule = cols[0]

                # grab the ID column if any (names avoid shadowing builtins)
                feat_id = gff.column_9_value(cols[8], 'ID')
                parent = gff.column_9_value(cols[8], 'Parent')
                feat_type = cols[2]

                if feat_type == 'gene':
                    while True:
                        mapped = feat_id in id_mapping

                        if mapped:
                            locus_id = id_mapping[feat_id]
                        else:
                            if args.molecule_map is None:
                                locus_id = "{0}_{1}".format(args.prefix, str(next_id).zfill(args.padding))
                            elif cols[0] not in mol_mapping:
                                raise Exception("ERROR: --molecule_map passed but {0} wasn't found in it.".format(cols[0]) )
                            elif args.custom == 'bmicroti':
                                locus_id = "{0}_{2}g{1}".format(args.prefix, str(int(last_number_portion_assigned) + 1).zfill(args.padding), mol_mapping[cols[0]])
                            else:
                                locus_id = "{0}_{2}g{1}".format(args.prefix, str(next_id).zfill(args.padding), mol_mapping[cols[0]])

                            next_id += args.interval

                        cols[8] = gff.set_column_9_value(cols[8], 'locus_tag', locus_id)

                        ## make sure this wasn't generated already (possibly conflict
                        #   between --id_file and an auto-generated ID?)
                        if locus_id not in loci_assigned:
                            break

                        if mapped:
                            # A mapped tag is the same on every attempt, so
                            # retrying would loop forever; fail loudly instead.
                            raise Exception("ERROR: locus tag ({0}) mapped to gene ({1}) by --id_file was already assigned".format(locus_id, feat_id))

                        print("DEBUG: Duplicate ID assigned ({0}), trying again.".format(locus_id) )

                        if args.custom == 'bmicroti':
                            # bmicroti tags derive from the last assigned number
                            # rather than next_id; advance it so the retry
                            # produces a different tag.
                            last_number_portion_assigned = int(last_number_portion_assigned) + 1

                    loci_assigned.add(locus_id)
                    gene_loci[feat_id] = locus_id

                    # remember the trailing digits for the bmicroti numbering
                    m = re.search(r"(\d+)$", locus_id)
                    if m:
                        last_number_portion_assigned = m.group(1)

                elif feat_type.endswith('RNA'):
                    if parent in gene_loci:
                        cols[8] = gff.set_column_9_value(cols[8], 'locus_tag', gene_loci[parent])
                    else:
                        raise Exception("ERROR: found RNA {0} whose parent {1} wasn't found yet".format(feat_id, parent))

                fout.write("\t".join(cols) + "\n")
    finally:
        # close a real output file but never the shared STDOUT stream
        if fout is not sys.stdout:
            fout.close()

Пример #8

0

Показать файл

Файл: convert_glimmerHMM_gff_to_gff3.py Проект: jorvis/biocode

def main():
    """Convert glimmerHMM GFF output into GFF3 gene/mRNA/exon/CDS rows.

    Reads ``--input_file`` and writes ``--output_file``.  Comment lines
    (starting with ``#``) are copied; rows without exactly nine tab-separated
    columns are dropped.  Each ``mRNA`` row is written twice: first as a
    ``gene`` row with the original attributes, then as the mRNA with its ID
    and Name changed to ``<id>.mRNA``.  Each ``CDS`` row is written as an
    ``exon`` row (ID ``<parent>.exon.<n>``, phase ``.``) followed by the CDS
    (ID ``<parent>.cds``), both with ``Parent`` set to ``<parent>.mRNA``.
    Rows of any other type are not written.

    Raises:
        ValueError: if the start or end column of a nine-column row is not an
            integer.
    """
    parser = argparse.ArgumentParser( description='Converts glimmerHMM GFF output to GFF3')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to parse' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )

    args = parser.parse_args()

    # number of exons emitted so far for each parent ID
    next_exon_num = defaultdict(int)

    # The output is opened first, as before; context managers close both
    # handles even when a malformed row raises part-way through the file.
    with open(args.output_file, 'w') as fout, open(args.input_file, 'r') as infile:
        for line in infile:
            if line.startswith('#'):
                fout.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                continue

            feat_type = cols[2]

            # The coordinates are not used below, but parsing them keeps the
            # existing fail-fast behaviour on non-integer start/end values.
            int(cols[3])
            int(cols[4])

            # named feat_id rather than id so the builtin is not shadowed
            feat_id = gff.column_9_value(cols[8], 'ID')
            parent = gff.column_9_value(cols[8], 'Parent')

            if feat_type == 'mRNA':
                # NOTE(review): the mRNA row is not given a Parent pointing at
                # the gene row created here - confirm that is intended.
                gene_cols = list(cols)
                gene_cols[2] = 'gene'

                mrna_id = "{0}.mRNA".format(feat_id)
                cols[8] = gff.set_column_9_value(cols[8], 'ID', mrna_id)
                cols[8] = gff.set_column_9_value(cols[8], 'Name', mrna_id)
                cols[8] = gff.order_column_9(cols[8])

                # print the gene and mRNA
                fout.write( "{0}\n".format("\t".join(gene_cols)) )
                fout.write( "{0}\n".format("\t".join(cols)) )

            elif feat_type == 'CDS':
                # copy before the CDS attributes are rewritten below
                exon_cols = list(cols)

                cds_id = "{0}.cds".format(parent)
                mrna_id = "{0}.mRNA".format(parent)

                cols[8] = gff.set_column_9_value(cols[8], 'ID', cds_id)
                cols[8] = gff.set_column_9_value(cols[8], 'Name', cds_id)
                cols[8] = gff.set_column_9_value(cols[8], 'Parent', mrna_id)
                cols[8] = gff.order_column_9(cols[8])

                exon_id = "{0}.exon.{1}".format(parent, next_exon_num[parent] )
                next_exon_num[parent] += 1

                exon_cols[2] = 'exon'
                exon_cols[7] = '.'
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'ID', exon_id)
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'Name', exon_id)
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'Parent', mrna_id)
                exon_cols[8] = gff.order_column_9(exon_cols[8])

                fout.write( "{0}\n".format("\t".join(exon_cols)) )
                fout.write( "{0}\n".format("\t".join(cols)) )

Пример #9

0

Показать файл

Файл: add_gff3_locus_tags.py Проект: jorvis/biocode

def main():
    """Add ``locus_tag`` attributes to the gene and RNA rows of a GFF3 file.

    Rows are read from ``--input_file`` and written to ``--output_file`` (or
    STDOUT when that option is not given).  Rows without exactly nine
    tab-separated columns are passed through.  Each ``gene`` row receives a
    locus tag: the value mapped to its ID in the ``--id_file`` mapping (as
    loaded by ``parse_mapping_file``) when present, otherwise one generated
    from ``--prefix``, a zero-padded counter and, when ``--molecule_map`` is
    given, the token mapped to the row's molecule.  Rows whose type ends in
    ``RNA`` inherit the tag of their ``Parent`` gene.

    Raises:
        Exception: when a ``--custom`` mode is used without ``--molecule_map``
            and ``--id_file``; when a mapped ID does not follow the bmicroti
            convention; when a gene's molecule is missing from
            ``--molecule_map``; when a locus tag taken from ``--id_file`` has
            already been assigned; or when an RNA row precedes its parent gene.
    """
    parser = argparse.ArgumentParser(description="Adds locus tag identifiers to GFF3 features")

    parser.add_argument("-i", "--input_file", type=str, required=True, help="TA file of source molecules")
    parser.add_argument("-o", "--output_file", type=str, required=False, help="Optional output file path (else STDOUT)")
    parser.add_argument("-p", "--prefix", type=str, required=True, help="The prefix portion of IDs to be generated")
    parser.add_argument(
        "-a",
        "--padding",
        type=int,
        required=True,
        help="Specify the minimum with to reserve for the numeric portion of the IDs.  Smaller numbers will be zero-padded.",
    )
    parser.add_argument(
        "-n", "--interval", type=int, required=False, default=1, help="Interval between generated identifiers"
    )
    parser.add_argument(
        "-s",
        "--starting_id",
        type=int,
        required=False,
        default=0,
        help="Initial numeric portion of IDs to be generated (do not zero-pad)",
    )
    parser.add_argument(
        "-d",
        "--id_file",
        type=str,
        required=False,
        help="Pass a 2-column file of IDs to retain (in case you have mapped genes, for example)",
    )
    parser.add_argument(
        "-m",
        "--molecule_map",
        type=str,
        required=False,
        help="Pass a 2-column file of molecule->token identifiers (see documentation)",
    )
    parser.add_argument(
        "-c", "--custom", type=str, required=False, help="For custom parsing steps.  Most should ignore this."
    )

    args = parser.parse_args()
    check_arguments(args)

    # locus tag assigned to each gene ID (so child RNAs can inherit it)
    gene_loci = dict()
    next_id = args.starting_id
    last_molecule = None

    id_mapping = parse_mapping_file(args.id_file)
    mol_mapping = parse_mapping_file(args.molecule_map)
    # a set keeps the duplicate check constant-time however many genes exist
    loci_assigned = set()

    ## if using Joana's custom options, check assumptions
    if args.custom == "joana":
        if args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=joana")

        ## need to process the ID map to reformat IDs
        for source_id in id_mapping:
            # TP05_0002 -> TpMuguga_05g00002
            m = re.match(r"TP(\d\d)_(\d+)", id_mapping[source_id])
            if m:
                id_mapping[source_id] = "{0}_{1}g0{2}".format(args.prefix, m.group(1), m.group(2))

    elif args.custom == "bmicroti":
        microti_map = {"I": "01", "II": "02", "III": "03", "IV": "04"}

        if args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=bmicroti")

        for source_id in id_mapping:
            m = re.match(r"BBM_(\D+)(\d+)", id_mapping[source_id])
            if m:
                print("Changing id from {0} to ".format(source_id))
                id_mapping[source_id] = "{0}_{1}g{2}".format(args.prefix, microti_map[m.group(1)], m.group(2))
                print(id_mapping[source_id])
            else:
                raise Exception("ERROR: id ({0}) didn't match expected convention.".format(id_mapping[source_id]))

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, "wt")

    last_number_portion_assigned = 0

    try:
        with open(args.input_file) as infile:
            for line in infile:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    fout.write(line + "\n")
                    continue

                if last_molecule is None or (
                    args.molecule_map is not None and mol_mapping[cols[0]] != mol_mapping[last_molecule]
                ):
                    print("Found molecule {0}, resetting id counter from {1}".format(cols[0], next_id))
                    next_id = args.starting_id
                    last_molecule = cols[0]

                # grab the ID column if any (names avoid shadowing builtins)
                feat_id = gff.column_9_value(cols[8], "ID")
                parent = gff.column_9_value(cols[8], "Parent")
                feat_type = cols[2]

                # issue

                # 66F4EEF2E3C863C251F831817FF71233
                # 7F1917E4D81A959078C9A38E15488BC0
                # E22888670919A4A888572155F40F2654
                # B9D9CF1F7A8E5A2E1124F0A6C68840DC -> BBM_I00232
                # gene before is: 6DE6BCCE69CCDC39994A0940B2ED524A - novel

                # errors on: BmicrotiR1_01g00233 -> BBM_I00233
                # 5800A4110A62E4EAE57AFAD1F8D65CB3        BBM_I00233

                if feat_type == "gene":
                    while True:
                        mapped = feat_id in id_mapping

                        if mapped:
                            locus_id = id_mapping[feat_id]
                        else:
                            if args.molecule_map is None:
                                locus_id = "{0}_{1}".format(args.prefix, str(next_id).zfill(args.padding))
                            elif cols[0] not in mol_mapping:
                                raise Exception(
                                    "ERROR: --molecule_map passed but {0} wasn't found in it.".format(cols[0])
                                )
                            elif args.custom == "bmicroti":
                                locus_id = "{0}_{2}g{1}".format(
                                    args.prefix,
                                    str(int(last_number_portion_assigned) + 1).zfill(args.padding),
                                    mol_mapping[cols[0]],
                                )
                            else:
                                locus_id = "{0}_{2}g{1}".format(
                                    args.prefix, str(next_id).zfill(args.padding), mol_mapping[cols[0]]
                                )

                            next_id += args.interval

                        cols[8] = gff.set_column_9_value(cols[8], "locus_tag", locus_id)

                        ## make sure this wasn't generated already (possibly conflict
                        #   between --id_file and an auto-generated ID?)
                        if locus_id not in loci_assigned:
                            break

                        if mapped:
                            # A mapped tag is the same on every attempt, so
                            # retrying would loop forever; fail loudly instead.
                            raise Exception(
                                "ERROR: locus tag ({0}) mapped to gene ({1}) by --id_file was already assigned".format(
                                    locus_id, feat_id
                                )
                            )

                        print("DEBUG: Duplicate ID assigned ({0}), trying again.".format(locus_id))

                        if args.custom == "bmicroti":
                            # bmicroti tags derive from the last assigned number
                            # rather than next_id; advance it so the retry
                            # produces a different tag.
                            last_number_portion_assigned = int(last_number_portion_assigned) + 1

                    loci_assigned.add(locus_id)
                    gene_loci[feat_id] = locus_id

                    # remember the trailing digits for the bmicroti numbering
                    m = re.search(r"(\d+)$", locus_id)
                    if m:
                        last_number_portion_assigned = m.group(1)

                elif feat_type.endswith("RNA"):
                    if parent in gene_loci:
                        cols[8] = gff.set_column_9_value(cols[8], "locus_tag", gene_loci[parent])
                    else:
                        raise Exception(
                            "ERROR: found RNA {0} whose parent {1} wasn't found yet".format(feat_id, parent)
                        )

                fout.write("\t".join(cols) + "\n")
    finally:
        # close a real output file but never the shared STDOUT stream
        if fout is not sys.stdout:
            fout.close()

Python set_column_9_value примеры использования