def main():
    """
    Filters a Trinity FASTA file down to the longest sequence per component.

    Reads the FASTA file given by -i/--input, groups the entries by the
    'compN' token found in each sequence ID, and keeps only the longest
    sequence of every group (on a length tie the entry seen first is kept).
    The surviving entries are written as FASTA, wrapped at 60 columns, to
    -o/--output or to STDOUT when no output path is given.

    Raises:
        Exception: if a sequence ID does not contain a 'compN_' token.
    """
    parser = argparse.ArgumentParser( description='Filters trinity output for longest subcomponents based on naming convention')

    ## input file to be read and output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to an input FASTA file' )
    parser.add_argument('-o', '--output', type=str, required=False, help='Output file to be created.  Default = STDOUT' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output is not None:
        fout = open(args.output, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.input)

        ## maps each component ID to the longest entry seen so far
        components = dict()

        ## raw string: '\d' inside a plain literal is an invalid escape sequence
        component_pattern = re.compile(r"(comp\d+)_")

        for seq_id in seqs:
            m = component_pattern.search(seq_id)
            if m is None:
                raise Exception("ERROR: This ID wasn't in the expected format of compN_cN_seqN: {0}".format(seq_id))

            component_id = m.group(1)

            ## strict '>' means the first entry of a given length is retained
            if component_id not in components or len(seqs[seq_id]['s']) > len(components[component_id]['s']):
                components[component_id] = seqs[seq_id]
                components[component_id]['longest_id'] = seq_id

        for c_id in components:
            seq_wrapped = utils.wrapped_fasta(components[c_id]['s'], every=60)
            fout.write(">{0} {1}\n{2}\n".format(components[c_id]['longest_id'], components[c_id]['h'], seq_wrapped))
    finally:
        ## close the handle we opened ourselves, even on error; never close STDOUT
        if fout is not sys.stdout:
            fout.close()
# Example #2
0
def main():
    """Write the longest sequence of each Trinity component as FASTA.

    Command-line options:
        -i/--input   Path to the input FASTA file (required).
        -o/--output  Path of the file to create; STDOUT is used when omitted.

    Every sequence ID must contain a ``compN_`` token. Entries sharing the
    same ``compN`` are treated as one component and only the longest of them
    is written; when lengths are equal the entry encountered first is kept.
    Sequences are wrapped at 60 characters per line.

    Raises:
        Exception: if a sequence ID does not contain a ``compN_`` token.
    """
    parser = argparse.ArgumentParser(
        description=
        'Filters trinity output for longest subcomponents based on naming convention'
    )

    ## input file to be read and output file to be written
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=False,
                        help='Output file to be created.  Default = STDOUT')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    writes_to_file = args.output is not None
    fout = open(args.output, 'wt') if writes_to_file else sys.stdout

    try:
        seqs = utils.fasta_dict_from_file(args.input)

        ## component ID -> longest entry found so far for that component
        components = dict()

        for seq_id in seqs:
            ## raw string keeps '\d' from being read as a (invalid) string
            ## escape before the pattern reaches the regex engine
            m = re.search(r"(comp\d+)_", seq_id)
            if not m:
                raise Exception(
                    "ERROR: This ID wasn't in the expected format of compN_cN_seqN: {0}"
                    .format(seq_id))

            component_id = m.group(1)
            candidate = seqs[seq_id]
            current = components.get(component_id)

            ## only a strictly longer sequence replaces the current holder
            if current is None or len(candidate['s']) > len(current['s']):
                candidate['longest_id'] = seq_id
                components[component_id] = candidate

        for c_id in components:
            entry = components[c_id]
            seq_wrapped = utils.wrapped_fasta(entry['s'], every=60)
            fout.write(">{0} {1}\n{2}\n".format(entry['longest_id'],
                                                entry['h'], seq_wrapped))
    finally:
        ## release the file handle on success and on error; STDOUT stays open
        if writes_to_file:
            fout.close()
def write_fasta_results(f, polypeptides):
    """
    Writes one FASTA record per entry of polypeptides to the handle f.

    Produces headers like:
    >ID PRODUCT_NAME gene::GENE_SYMBOL ec::EC_NUMBERS go::GO_TERMS

    EC_NUMBERS and GO_TERMS are comma-separated lists; each GO term is
    prefixed with 'GO:'.  The gene:: field is left out when the gene symbol
    is None, and the ec:: / go:: fields are left out when their list comes
    out empty.  The header line is followed by the residues as formatted by
    utils.wrapped_fasta.
    """
    for key in polypeptides:
        record = polypeptides[key]

        # Every item carries a trailing comma; the last one is stripped off.
        go_terms = "".join(
            "GO:{0},".format(annot.go_id)
            for annot in record.annotation.go_annotations).rstrip(',')
        ec_numbers = "".join(
            "{0},".format(annot.number)
            for annot in record.annotation.ec_numbers).rstrip(',')

        header = "{0} {1}".format(key, record.annotation.product_name)

        symbol = record.annotation.gene_symbol
        if symbol is not None:
            header = "{0} gene::{1}".format(header, symbol)

        if ec_numbers:
            header = "{0} ec::{1}".format(header, ec_numbers)

        if go_terms:
            header = "{0} go::{1}".format(header, go_terms)

        f.write(">{0}\n".format(header))
        f.write("{0}\n".format(utils.wrapped_fasta(record.residues)))
def write_fasta_results( f, polypeptides ):
    """
    Emits every polypeptide in polypeptides as a FASTA record on f.

    Produces headers like:
    >ID PRODUCT_NAME gene::GENE_SYMBOL ec::EC_NUMBERS go::GO_TERMS

    ID is the key under which the polypeptide is stored.  The optional
    fields appear only when there is something to show: gene:: when the
    gene symbol is not None, ec:: and go:: when the corresponding
    comma-separated list is non-empty.  The residues follow the header,
    formatted by utils.wrapped_fasta.
    """
    for pid in polypeptides:
        entry = polypeptides[pid]

        # Build "GO:x,GO:y," then drop the trailing comma(s).
        go_parts = ["GO:{0},".format(g.go_id) for g in entry.annotation.go_annotations]
        go_list = "".join(go_parts).rstrip(',')

        # Same approach for the EC numbers.
        ec_parts = ["{0},".format(e.number) for e in entry.annotation.ec_numbers]
        ec_list = "".join(ec_parts).rstrip(',')

        header = "{0} {1}".format(pid, entry.annotation.product_name)

        # Optional fields are appended in a fixed order: gene, ec, go.
        if entry.annotation.gene_symbol is not None:
            header = "{0} gene::{1}".format(header, entry.annotation.gene_symbol)

        header = header if ec_list == "" else "{0} ec::{1}".format(header, ec_list)
        header = header if go_list == "" else "{0} go::{1}".format(header, go_list)

        f.write( ">{0}\n".format( header ) )
        f.write( "{0}\n".format(utils.wrapped_fasta(entry.residues)))