def classify_species(filepath, results, domains, protein_families):
    """Classifies a species using the results from hmmsearch.

    The table at ``filepath`` is parsed, contigs with matching domains are
    loaded and classified, and the collapsed classifications are both
    written to a CSV file and stored in ``results``.

    Arguments
    ---------
    filepath : str
        Path handed to ``hmmer.parse_csv``. Its basename without the
        extension becomes the target name used for the output filename
        and as the key in ``results``.
    results : dict
        Master mapping, updated in place: ``results[target]`` is set to a
        NumPy structured array with fields ``contig``, ``family`` and
        ``type``.
    domains
        Passed through unchanged to ``load_contigs``.
    protein_families
        Passed through unchanged to ``classify_contigs``.

    Side effects
    ------------
    Creates or overwrites
    ``../csv/classifications/classification_<target>.csv`` (the path is
    relative to the current working directory).
    """
    recarray = hmmer.parse_csv(filepath)

    # output base filename
    target = os.path.splitext(os.path.basename(filepath))[0]

    # Load contigs with matching domains
    contigs = load_contigs(target, recarray, domains)

    # open csv file for output; the context manager guarantees the file is
    # flushed and closed even if classification raises part-way through
    filename = "classification_%s.csv" % target
    output_path = os.path.join("../csv/classifications", filename)

    with open(output_path, 'wt') as fp:
        csv_writer = csv.writer(fp)
        csv_writer.writerow(['contig', 'family', 'type'])

        # classify contigs
        classifications = classify_contigs(contigs, protein_families)

        # collapse related contigs; the writer is handed over so the helper
        # can emit rows, which is why this call must stay inside the
        # ``with`` block
        collapsed = collapse_contig_classifications(target, classifications,
                                                    csv_writer)

    # convert to recarray and add to master dict
    datatypes = [('contig', '|S32'), ('family', '|S64'), ('type', '|S2')]
    results[target] = np.array(collapsed, dtype=datatypes)
def classify_species(filepath, results, domains, protein_families):
    """Classifies a species using the results from hmmsearch.

    Parses the table at ``filepath``, loads and classifies the contigs
    with matching domains, writes the collapsed classifications to a CSV
    file and records them in ``results``.

    Arguments
    ---------
    filepath : str
        Path handed to ``hmmer.parse_csv``. The basename with its
        extension stripped is the target name, used both in the output
        filename and as the key written into ``results``.
    results : dict
        Mapping that is mutated in place; ``results[target]`` receives a
        NumPy structured array with the fields ``contig``, ``family`` and
        ``type``.
    domains
        Forwarded as-is to ``load_contigs``.
    protein_families
        Forwarded as-is to ``classify_contigs``.

    Side effects
    ------------
    Writes ``../csv/classifications/classification_<target>.csv``,
    resolved against the current working directory, replacing any
    existing file of that name.
    """
    recarray = hmmer.parse_csv(filepath)

    # output base filename
    target = os.path.splitext(os.path.basename(filepath))[0]

    # Load contigs with matching domains
    contigs = load_contigs(target, recarray, domains)

    filename = "classification_%s.csv" % target
    csv_path = os.path.join("../csv/classifications", filename)

    # Use a context manager so the output file is always closed (and its
    # buffer flushed), including when one of the steps below raises.
    with open(csv_path, 'wt') as fp:
        csv_writer = csv.writer(fp)
        csv_writer.writerow(['contig', 'family', 'type'])

        # classify contigs
        classifications = classify_contigs(contigs, protein_families)

        # collapse related contigs; the helper receives the writer, so it
        # has to run while the file is still open
        collapsed = collapse_contig_classifications(target, classifications,
                                                    csv_writer)

    # convert to recarray and add to master dict
    datatypes = [('contig', '|S32'), ('family', '|S64'), ('type', '|S2')]
    results[target] = np.array(collapsed, dtype=datatypes)
# Example #3
# 0
def analyze_hmmer_table(go_terms, pfam2go, hmmer_table, go_level=1):
    """Analyzes a given HMMER3 search table for Pfam/GO term matches.

    Each row of the parsed table is looked up in ``pfam2go``. Rows whose
    Pfam domain has no mapping are tallied under ``"unknown"``; for every
    mapped GO term, the ancestor at ``go_level`` is located and the count
    for that ancestor's name is incremented.

    Arguments
    ---------
    go_terms : goatools.obo_parser.GODag
        A dictionary of GO terms
    pfam2go : dict
        Pfam/GO mapping. Each value is iterated, and the first element of
        every entry is used as the key into ``go_terms``.
    hmmer_table : str
        filepath to a HMMER3 search output table
    go_level : int
        (Optional) the GO level to summarize

    Returns
    -------
    dict
        Maps GO category name to the number of matches, plus the key
        ``"unknown"`` counting rows whose Pfam domain is not in
        ``pfam2go``.
    """
    summary = {
        "unknown": 0
    }

    # parse HMMER output; rows are indexed positionally below
    contigs = hmmer.parse_csv(hmmer_table)

    # iterate through HMMER output
    for contig in contigs:
        # the third field of each row is used as the Pfam domain key
        pfam_domain = contig[2]

        # domains without a Pfam2GO mapping are only counted
        if pfam_domain not in pfam2go:
            summary['unknown'] += 1
            continue

        # iterate through GO terms associated with the domain
        for t in pfam2go[pfam_domain]:
            node = go_terms[t[0]]

            # Climb towards the root until the desired level is reached.
            # Terms already at or above ``go_level`` are left untouched
            # (the range is empty).
            #
            # The parent with the smallest level is followed instead of
            # ``parents[0]``: ``parents`` may be an unordered set, which
            # cannot be indexed, and an arbitrary parent can sit on a
            # longer path to the root, so a fixed number of steps along it
            # would not end on ``go_level``.
            # NOTE(review): this relies on a term's level being one more
            # than the lowest level among its parents, as goatools
            # computes it -- confirm for the goatools version in use.
            steps_up = node.level - go_level

            for _ in range(steps_up):
                if not node.parents:
                    # reached a root earlier than expected; stop climbing
                    break
                node = min(node.parents, key=lambda parent: parent.level)

            # add to summary table
            category = node.name
            summary[category] = summary.get(category, 0) + 1

    return summary