def write_annotations_to_output(output_dic, output_file, summary_file):
    '''
    Write annotated summary rows to a tab-delimited output file.

    The summary file is read row by row. For every row whose amino acid
    sequence is a key of output_dic, one line is written per annotation
    stored for that sequence: the summary columns followed by the
    annotation columns. Rows without annotations are skipped, so each
    summary row produces 0 or more output lines.

    Args:
        output_dic: dict keyed by amino acid sequence. Each value maps the
            annotation column names ('feature', 'exon_start', 'exon_end'
            and the three names returned by get_uniprot_subkeys()) to
            lists; position i of every list describes the i-th annotation.
        output_file: path of the file to write (overwritten if present).
        summary_file: path of the tab-delimited summary file. Its first
            row is a header that must contain every name returned by
            get_summary_file_colnames().

    Returns:
        Number of annotation rows written (the header is not counted).

    Raises:
        StopIteration: if the summary file has no header row.
        ValueError: if a summary column name is missing from the header.
    '''
    import sys  # local import: only used to pick csv-compatible file modes

    # The csv module expects binary-mode files on Python 2, and text-mode
    # files opened with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        read_mode, write_mode, open_kwargs = 'r', 'w', {'newline': ''}
    else:
        read_mode, write_mode, open_kwargs = 'rb', 'wb', {}

    # summary file colnames
    (gene_name_colname, miso_event_colname, reading_frame_colname,
     nucleotide_seq_colname, amino_acid_seq_colname, gene_id_colname,
     transcript_id_colname, exon_number_colname) = \
        get_summary_file_colnames()
    summary_colnames = [gene_name_colname,
                        miso_event_colname,
                        reading_frame_colname,
                        nucleotide_seq_colname,
                        amino_acid_seq_colname,
                        gene_id_colname,
                        transcript_id_colname,
                        exon_number_colname]

    # annotation colnames
    start_colname, end_colname, descript_colname = get_uniprot_subkeys()
    feature_colname = 'feature'
    annotation_colnames = [feature_colname,
                           start_colname,
                           end_colname,
                           descript_colname,
                           'exon_start',
                           'exon_end']

    writecount = 0

    # Context managers guarantee both files are closed even if a row fails.
    with open(output_file, write_mode, **open_kwargs) as outfile:
        mywriter = csv.writer(outfile, delimiter='\t')

        # Write header to output file. Order matters.
        mywriter.writerow(summary_colnames + annotation_colnames)

        with open(summary_file, read_mode, **open_kwargs) as readfile:
            myreader = csv.reader(readfile, delimiter='\t')
            readheader = next(myreader)

            # Column positions depend only on the header: resolve them once
            # instead of once per written cell.
            aa_seq_index = readheader.index(amino_acid_seq_colname)
            summary_indices = [readheader.index(colname)
                               for colname in summary_colnames]

            for row in myreader:
                # amino acid seq is the key into output_dic
                aa_seq = row[aa_seq_index]

                # if no associated annotations, skip to next.
                if aa_seq not in output_dic:
                    continue
                annotations = output_dic[aa_seq]

                # One output line per annotation; the summary part of the
                # line is the same for all of them.
                summary_values = [row[index] for index in summary_indices]
                for i in range(len(annotations[feature_colname])):
                    annotation_values = [annotations[colname][i]
                                         for colname in annotation_colnames]
                    mywriter.writerow(summary_values + annotation_values)
                    writecount += 1
    return writecount
# Example #2
# 0
def write_annotations_to_output(output_dic, output_file, summary_file):
    '''
    Copy summary rows that have annotations to a tab-delimited output file.

    Every row of the summary file is looked up in output_dic by its amino
    acid sequence. Rows that are not found are dropped. A row that is found
    is written once for each annotation recorded for its sequence, with the
    annotation columns appended after the summary columns.

    Args:
        output_dic: dict keyed by amino acid sequence. Each value maps the
            annotation column names ('feature', 'exon_start', 'exon_end'
            and the three names returned by get_uniprot_subkeys()) to
            lists that are read at the same index for one annotation.
        output_file: path of the file to create or overwrite.
        summary_file: path of the tab-delimited summary file whose header
            row must contain the names from get_summary_file_colnames().

    Returns:
        Count of annotation rows written, excluding the header row.

    Raises:
        StopIteration: if the summary file is empty (no header row).
        ValueError: if the header lacks one of the summary column names.
    '''
    import sys  # local import; used only to choose csv-compatible modes

    def open_tsv(path, mode):
        # csv needs binary files on Python 2 and text files opened with
        # newline='' on Python 3.
        if sys.version_info[0] < 3:
            return open(path, mode + 'b')
        return open(path, mode, newline='')

    # define column names
    # summary file colnames
    gene_name_colname, miso_event_colname, reading_frame_colname, \
    nucleotide_seq_colname, amino_acid_seq_colname, gene_id_colname, \
    transcript_id_colname, exon_number_colname = \
        get_summary_file_colnames()
    summary_colnames = [
        gene_name_colname, miso_event_colname, reading_frame_colname,
        nucleotide_seq_colname, amino_acid_seq_colname, gene_id_colname,
        transcript_id_colname, exon_number_colname
    ]

    # get annotation colnames
    start_colname, end_colname, descript_colname = get_uniprot_subkeys()
    feature_colname = 'feature'
    annotation_colnames = [
        feature_colname, start_colname, end_colname, descript_colname,
        'exon_start', 'exon_end'
    ]

    writecount = 0

    # "with" closes the output file even when reading or writing raises.
    with open_tsv(output_file, 'w') as outfile:
        mywriter = csv.writer(outfile, delimiter='\t')

        # Write header to output file. Order matters.
        mywriter.writerow(summary_colnames + annotation_colnames)

        with open_tsv(summary_file, 'r') as readfile:
            myreader = csv.reader(readfile, delimiter='\t')
            readheader = next(myreader)

            # Map each summary column name to its position once, rather
            # than searching the header again for every written cell.
            column_index = dict((name, readheader.index(name))
                                for name in summary_colnames)
            aa_seq_index = column_index[amino_acid_seq_colname]

            for row in myreader:
                # amino acid seq is the key used to access output_dic
                aa_seq = row[aa_seq_index]

                # if no associated annotations, skip to next.
                if aa_seq not in output_dic:
                    continue
                annotes = output_dic[aa_seq]

                # Expect one or more annotations for each matched row.
                for i in range(len(annotes[feature_colname])):
                    row_to_write = [row[column_index[name]]
                                    for name in summary_colnames]
                    row_to_write.extend(annotes[name][i]
                                        for name in annotation_colnames)
                    mywriter.writerow(row_to_write)
                    writecount += 1
    return writecount
def append_dic_if_feature_within_start_end(exon_start, exon_end,
                                           amino_acid_seq,
                                           uniprot_dic, gene_key, feature,
                                           output_dic):
    '''
    Record every annotation of one feature that overlaps an exon range.

    The start/end/description lists stored under
    uniprot_dic[gene_key][feature] are walked in parallel. An annotation
    is skipped when it lies entirely outside the exon, i.e. when
    exon_start > feature_end or exon_end < feature_start. Annotations that
    merely touch an exon boundary therefore count as matches.

    Matches are appended to output_dic, which is modified in place:
    {amino_acid_seq: {<start subkey>: [...], <end subkey>: [...],
                      <description subkey>: [...], 'exon_start': [...],
                      'exon_end': [...], 'feature': [...]}}
    Position i of each of the six lists belongs to the i-th match stored
    for that amino acid sequence.

    Returns a tuple (output_dic, match_count) where match_count is the
    number of annotations appended by this call.
    '''
    # subkey names used both to read uniprot_dic and to fill output_dic
    start_subkey, end_subkey, descript_subkey = get_uniprot_subkeys()
    all_subkeys = [start_subkey, end_subkey, descript_subkey,
                   'exon_start', 'exon_end', 'feature']

    # parallel lists describing each annotation of this feature
    starts = uniprot_dic[gene_key][feature][start_subkey]
    ends = uniprot_dic[gene_key][feature][end_subkey]
    descriptions = uniprot_dic[gene_key][feature][descript_subkey]

    match_count = 0
    for feature_start, feature_end, descript in zip(starts, ends,
                                                    descriptions):
        # guard clause: annotation does not overlap the exon, try the next
        if exon_start > feature_end or exon_end < feature_start:
            continue

        # first match for this sequence: create one empty list per subkey
        if amino_acid_seq not in output_dic:
            output_dic[amino_acid_seq] = {}
            for subkey in all_subkeys:
                output_dic[amino_acid_seq][subkey] = []

        # append this match; values are listed in the order of all_subkeys
        matched_values = [feature_start, feature_end, descript,
                          exon_start, exon_end, feature]
        for position, subkey in enumerate(all_subkeys):
            output_dic[amino_acid_seq][subkey].append(
                matched_values[position])
        match_count += 1

    return output_dic, match_count
# Example #4
# 0
def append_dic_if_feature_within_start_end(exon_start, exon_end,
                                           amino_acid_seq, uniprot_dic,
                                           gene_key, feature, output_dic):
    '''
    Collect the annotations of a gene feature that fall on an exon.

    Reads the start, end and description lists found at
    uniprot_dic[gene_key][feature] and keeps each annotation unless it is
    completely outside the exon (exon_start > feature_end or
    exon_end < feature_start).

    Each kept annotation is appended to output_dic[amino_acid_seq], a dict
    of six lists keyed by the three subkeys from get_uniprot_subkeys()
    plus 'exon_start', 'exon_end' and 'feature'. The entry is created with
    empty lists the first time a sequence gets a match. output_dic is
    updated in place.

    Returns:
        (output_dic, match_count): the same output_dic object and the
        number of annotations this call appended.
    '''
    start_subkey, end_subkey, descript_subkey = get_uniprot_subkeys()
    # order here must line up with the values zipped in below
    subkeys = (start_subkey, end_subkey, descript_subkey,
               'exon_start', 'exon_end', 'feature')

    # get start, end, description from uniprot dic
    feature_start_list = uniprot_dic[gene_key][feature][start_subkey]
    feature_end_list = uniprot_dic[gene_key][feature][end_subkey]
    descript_list = uniprot_dic[gene_key][feature][descript_subkey]

    # Lazily yield only the annotations that are not outside the exon, so
    # each range check still runs just before its annotation is stored.
    in_range = (
        (feat_start, feat_end, text)
        for feat_start, feat_end, text in zip(feature_start_list,
                                              feature_end_list,
                                              descript_list)
        if not (exon_start > feat_end or exon_end < feat_start)
    )

    match_count = 0
    for feat_start, feat_end, text in in_range:
        if amino_acid_seq not in output_dic:
            # not yet initialized: start every subkey with an empty list
            output_dic[amino_acid_seq] = {}
            for name in subkeys:
                output_dic[amino_acid_seq][name] = []

        # store values into their subkey lists
        values = (feat_start, feat_end, text, exon_start, exon_end, feature)
        for name, value in zip(subkeys, values):
            output_dic[amino_acid_seq][name].append(value)
        match_count += 1
    return output_dic, match_count