# ----- Example 1 -----
def get_mutation_features(map_file, f_mutations_db_location, output_location):
    """
    Write disease-mutation counts per exon region for every map line.

    For each line of ``map_file`` the number of mutation annotations in the
    upstream flank (sstart..start), the alternative region (start..end), the
    downstream flank (end..eend) and across the whole canonical protein is
    written tab-separated to ``output_location``.

    :param map_file: path of the exon/protein index map file
    :param f_mutations_db_location: path of the mutations database file
    :param output_location: path of the feature file to create
    :return: None -- results are written to ``output_location``
    """
    # Context managers guarantee the handles are closed even if parsing
    # raises (the original leaked map_file_obj and mutations_db).
    with open(f_mutations_db_location, 'r') as mutations_db:
        uniprot_to_index_to_disease = build_uniprot_to_index_to_disease(mutations_db)

    with open(map_file, 'r') as map_file_obj, open(output_location, 'w') as write_to:
        for line in map_file_obj:
            tokens = line.split()
            prot = tokens[1]

            sstart = int(tokens[2])
            start = int(tokens[3])
            end = int(tokens[4])
            eend = int(tokens[5])

            if prot not in uniprot_to_index_to_disease:
                c1_count = 0
                a_count = 0
                c2_count = 0
                canonical_absolute = 0
            else:
                c1_count = score_differences(uniprot_to_index_to_disease, prot, sstart, start)
                a_count = score_differences(uniprot_to_index_to_disease, prot, start, end)
                c2_count = score_differences(uniprot_to_index_to_disease, prot, end, eend)
                # Column 7 of the tab-separated line is used as the canonical
                # protein length.
                prot_len = int(line.split("\t")[7].strip())
                canonical_absolute = score_differences(uniprot_to_index_to_disease, prot, 1, prot_len)

            # Same output as the old "print >> write_to", but portable to Python 3.
            write_to.write(tokens[0] + "\t" + prot + "\t" + repr(c1_count) + "\t" +
                           repr(a_count) + "\t" + repr(c2_count) + "\t" +
                           repr(canonical_absolute) + "\n")
# ----- Example 2 -----
def get_uniprot_elm_features(uniprot_exon_indices_location, uniprot_elm_db_location, output_location):
    """
    Reads the uniprot ELM file and generates ELM features.

    Builds a per-protein index of positions covered by an ELM annotation,
    then, for every exon-index line, writes the number of annotated positions
    in the upstream flank (sstart..start), the alternative region
    (start..end), the downstream flank (end..eend) and across the whole
    canonical protein, tab-separated, to ``output_location``.

    :param uniprot_exon_indices_location: path of the exon/protein index map file
    :param uniprot_elm_db_location: path of the uniprot ELM database file
    :param output_location: path of the feature file to create
    :return: None -- results are written to ``output_location``
    """
    uniprot_to_index_to_elm = {}

    # Mark every position of every annotated ELM range with "*".
    with open(uniprot_elm_db_location, 'r') as read_from:
        for line in read_from:
            tokens = line.split("\t")
            try:
                uniprot = tokens[0].strip()
                start = int(tokens[2].strip())
                end = int(tokens[3].strip())
                for i in range(start, end + 1):
                    # setdefault replaces the has_key branching (dict.has_key
                    # does not exist in Python 3).
                    uniprot_to_index_to_elm.setdefault(uniprot, {})[i] = "*"
            except ValueError:
                print("Cannot parse: " + line[0:len(line) - 1])

    # "with" closes both handles; the original never closed write_to, so
    # buffered output could be lost on exit.
    with open(uniprot_exon_indices_location, 'r') as uniprot_exon_indices, \
            open(output_location, 'w') as write_to:
        for line in uniprot_exon_indices:
            tokens = line.split()
            prot = tokens[1]

            sstart = int(tokens[2])
            start = int(tokens[3])
            end = int(tokens[4])
            eend = int(tokens[5])

            if prot not in uniprot_to_index_to_elm:
                c1_count = 0
                a_count = 0
                c2_count = 0
                canonical_absolute = 0
            else:
                c1_count = score_differences(uniprot_to_index_to_elm, prot, sstart, start)
                a_count = score_differences(uniprot_to_index_to_elm, prot, start, end)
                c2_count = score_differences(uniprot_to_index_to_elm, prot, end, eend)
                # Column 7 of the tab-separated line is used as the canonical
                # protein length.
                prot_len = int(line.split("\t")[7].strip())
                canonical_absolute = score_differences(uniprot_to_index_to_elm, prot, 1, prot_len)

            write_to.write(tokens[0] + "\t" + prot + "\t" + repr(c1_count) + "\t" +
                           repr(a_count) + "\t" + repr(c2_count) + "\t" +
                           repr(canonical_absolute) + "\n")
# ----- Example 3 -----
def get_postranscriptional_modification_features(uniprot_exon_indices_location, uniprot_ptm_db_location,
                                                 output_file_location):
    """
    Reads the uniprot PTM file and generates post-translational modification
    site features.

    Builds a per-protein index of annotated PTM positions, then, for every
    exon-index line, writes the number of annotated positions in the upstream
    flank (sstart..start), the alternative region (start..end), the
    downstream flank (end..eend) and across the whole canonical protein,
    tab-separated, to ``output_file_location``.

    :param uniprot_exon_indices_location: path of the exon/protein index map file
    :param uniprot_ptm_db_location: path of the uniprot PTM database file
    :param output_file_location: path of the feature file to create
    :return: None -- results are written to ``output_file_location``
    """
    uniprot_to_index_to_ptm = {}

    with open(uniprot_ptm_db_location, 'r') as read_from:
        for line in read_from:
            tokens = line.split()
            try:
                uniprot = tokens[0]
                index = int(tokens[1])
                # tokens[3] (the PTM type) is read so malformed short lines
                # still fail, but only the position matters for scoring.
                ptm = tokens[3]
                # setdefault replaces the has_key branching (dict.has_key
                # does not exist in Python 3).
                uniprot_to_index_to_ptm.setdefault(uniprot, {})[index] = "*"
            except ValueError:
                print("Cannot parse: " + line[0:len(line) - 1])

    # "with" closes both handles; the original never closed write_to, so
    # buffered output could be lost on exit.
    with open(uniprot_exon_indices_location, 'r') as uniprot_exon_indices, \
            open(output_file_location, 'w') as write_to:
        for line in uniprot_exon_indices:
            tokens = line.split()
            prot = tokens[1]

            sstart = int(tokens[2])
            start = int(tokens[3])
            end = int(tokens[4])
            eend = int(tokens[5])

            if prot not in uniprot_to_index_to_ptm:
                c1_count = 0
                a_count = 0
                c2_count = 0
                canonical_absolute = 0
            else:
                c1_count = score_differences(uniprot_to_index_to_ptm, prot, sstart, start)
                a_count = score_differences(uniprot_to_index_to_ptm, prot, start, end)
                c2_count = score_differences(uniprot_to_index_to_ptm, prot, end, eend)
                # Column 7 of the tab-separated line is used as the canonical
                # protein length.
                prot_len = int(line.split("\t")[7].strip())
                canonical_absolute = score_differences(uniprot_to_index_to_ptm, prot, 1, prot_len)

            write_to.write(tokens[0] + "\t" + prot + "\t" + repr(c1_count) + "\t" +
                           repr(a_count) + "\t" + repr(c2_count) + "\t" +
                           repr(canonical_absolute) + "\n")
# ----- Example 4 -----
def get_mutation_features(map_file, f_mutations_db_location, output_location):
    """
    Write disease-mutation counts per exon region for every map line.

    For each line of ``map_file`` the number of mutation annotations in the
    upstream flank (sstart..start), the alternative region (start..end), the
    downstream flank (end..eend) and across the whole canonical protein is
    written tab-separated to ``output_location``.

    :param map_file: path of the exon/protein index map file
    :param f_mutations_db_location: path of the mutations database file
    :param output_location: path of the feature file to create
    :return: None -- results are written to ``output_location``
    """
    # Context managers guarantee the handles are closed even if parsing
    # raises (the original leaked map_file_obj and mutations_db).
    with open(f_mutations_db_location, 'r') as mutations_db:
        uniprot_to_index_to_disease = build_uniprot_to_index_to_disease(
            mutations_db)

    with open(map_file, 'r') as map_file_obj, open(output_location, 'w') as write_to:
        for line in map_file_obj:
            tokens = line.split()
            prot = tokens[1]

            sstart = int(tokens[2])
            start = int(tokens[3])
            end = int(tokens[4])
            eend = int(tokens[5])

            if prot not in uniprot_to_index_to_disease:
                c1_count = 0
                a_count = 0
                c2_count = 0
                canonical_absolute = 0
            else:
                c1_count = score_differences(uniprot_to_index_to_disease, prot,
                                             sstart, start)
                a_count = score_differences(uniprot_to_index_to_disease, prot,
                                            start, end)
                c2_count = score_differences(uniprot_to_index_to_disease, prot,
                                             end, eend)
                # Column 7 of the tab-separated line is used as the canonical
                # protein length.
                prot_len = int(line.split("\t")[7].strip())
                canonical_absolute = score_differences(uniprot_to_index_to_disease,
                                                       prot, 1, prot_len)

            # Same output as the old "print >> write_to", but portable to Python 3.
            write_to.write(tokens[0] + "\t" + prot + "\t" + repr(c1_count) + "\t" +
                           repr(a_count) + "\t" + repr(c2_count) + "\t" +
                           repr(canonical_absolute) + "\n")
# ----- Example 5 -----
def get_sable_scores(map_file, f_sable_db_location, uniprot_core_output_location):
    """
    Write SABLE "core" coverage counts per exon region for every map line.

    For each line of ``map_file`` the number of core annotations in the
    upstream flank (sstart..start), the alternative region (start..end), the
    downstream flank (end..eend) and across the whole canonical protein is
    written tab-separated to ``uniprot_core_output_location``.

    :param map_file: path of the exon/protein index map file
    :param f_sable_db_location: path of the SABLE database file
    :param uniprot_core_output_location: path of the feature file to create
    :return: None -- results are written to ``uniprot_core_output_location``
    """
    # Context managers guarantee the handles are closed even on error
    # (the original leaked map_file_obj and sable_db_obj).  An unused
    # rough_a_length/asid computation was dropped.
    with open(f_sable_db_location, 'r') as sable_db_obj:
        uniprot_to_index_to_core = build_uniprot_to_index_to_core(sable_db_obj)

    with open(map_file, 'r') as map_file_obj, \
            open(uniprot_core_output_location, 'w') as write_to:
        for line in map_file_obj:
            tokens = line.split()

            prot = tokens[1]
            sstart = int(tokens[2])
            start = int(tokens[3])
            end = int(tokens[4])
            eend = int(tokens[5])

            c1_count = 0
            a_count = 0
            c2_count = 0
            canonical_absolute = 0

            if prot in uniprot_to_index_to_core:
                c1_count = score_differences(uniprot_to_index_to_core, prot, sstart, start)
                a_count = score_differences(uniprot_to_index_to_core, prot, start, end)
                c2_count = score_differences(uniprot_to_index_to_core, prot, end, eend)
                # Column 7 of the tab-separated line is used as the canonical
                # protein length.
                prot_len = int(line.split("\t")[7].strip())
                canonical_absolute = score_differences(uniprot_to_index_to_core, prot, 1, prot_len)

            write_to.write(tokens[0] + "\t" + prot + "\t" + repr(c1_count) + "\t" +
                           repr(a_count) + "\t" + repr(c2_count) + "\t" +
                           repr(canonical_absolute) + "\n")
# ----- Example 6 -----
def get_uniprot_disorder_features(pulse_path, map_file, iupred_output,
                                  canonical_db_location, disorder_read_out_location):
    """
    Generate IUPred disorder features for every exon mapping line.

    For each map line, writes disorder counts for the upstream flank
    (sstart..start), the alternative region (start..end), the downstream
    flank (end..eend), the whole canonical protein, the whole isoform, and
    the isoform's alternative-region window, tab-separated, to
    ``disorder_read_out_location``.

    :param pulse_path: unused in this function; kept for interface compatibility
    :param map_file: path of the exon/protein index map file
    :param iupred_output: IUPred output used to extend the disorder index
    :param canonical_db_location: canonical database queried by fetch_feature
    :param disorder_read_out_location: path of the feature file to create
    :return: None -- results are written to ``disorder_read_out_location``
    """
    anchors_map = build_anchors_map(map_file)
    uniprot_to_index_to_disorder = fetch_feature('uniprot_iupred', anchors_map, canonical_db_location)
    uniprot_to_index_to_disorder = build_uniprot_to_index_to_disorder(iupred_output, uniprot_to_index_to_disorder)

    # "with" closes both handles (the original leaked uniprot_exon_indices).
    with open(map_file, 'r') as uniprot_exon_indices, \
            open(disorder_read_out_location, 'w') as write_to:
        for line in uniprot_exon_indices:
            tokens = line.split('\t')
            asid = tokens[0]
            prot = tokens[1]
            sstart = int(tokens[2])
            start = int(tokens[3])
            end = int(tokens[4])
            eend = int(tokens[5])
            # Rough residue length from the "...=<nt length>" suffix of the id;
            # NOTE(review): ids starting with "I" get length 0 -- presumably
            # inclusion events, confirm against the map-file producer.
            rough_a_length = int(int(tokens[0].split("_")[-1].split("=")[1]) / 3)
            if asid[0] == "I":
                rough_a_length = 0

            c1_count = 0
            a_count = 0
            c2_count = 0
            canonical_absolute = 0
            other_absolute = 0
            other_a = 0
            if prot not in uniprot_to_index_to_disorder:
                # All counts stay at their zero defaults for unknown proteins.
                print("Protein not in uniprot_to_index_to_disorder: " + prot)
            else:
                c1_count = score_differences(uniprot_to_index_to_disorder, prot, sstart, start)
                a_count = score_differences(uniprot_to_index_to_disorder, prot, start, end)
                c2_count = score_differences(uniprot_to_index_to_disorder, prot, end, eend)
                # Columns 7 and 8 hold the canonical and isoform lengths.
                prot_len = int(line.split("\t")[7].strip())
                other_prot = asid  # the isoform is indexed under its full id
                other_prot_len = int(line.split("\t")[8].strip())
                canonical_absolute = score_differences(uniprot_to_index_to_disorder, prot, 1, prot_len)
                other_absolute = score_differences(uniprot_to_index_to_disorder, other_prot, 1, other_prot_len)
                # Clamp the alternative-region window to the isoform length.
                other_a_end = min(start + rough_a_length, other_prot_len)
                other_a = score_differences(uniprot_to_index_to_disorder, other_prot, start, other_a_end)

            write_to.write(tokens[0] + "\t" + prot + "\t" + repr(c1_count) + "\t" +
                           repr(a_count) + "\t" + repr(c2_count) + "\t" +
                           repr(canonical_absolute) + "\t" + repr(other_absolute) + "\t" +
                           repr(other_a) + "\n")
# ----- Example 7 -----
def get_postranscriptional_modification_features(uniprot_exon_indices_location,
                                                 uniprot_ptm_db_location,
                                                 output_file_location):
    """
    Reads the uniprot PTM file and generates post-translational modification
    site features.

    Builds a per-protein index of annotated PTM positions, then, for every
    exon-index line, writes the number of annotated positions in the upstream
    flank (sstart..start), the alternative region (start..end), the
    downstream flank (end..eend) and across the whole canonical protein,
    tab-separated, to ``output_file_location``.

    :param uniprot_exon_indices_location: path of the exon/protein index map file
    :param uniprot_ptm_db_location: path of the uniprot PTM database file
    :param output_file_location: path of the feature file to create
    :return: None -- results are written to ``output_file_location``
    """
    uniprot_to_index_to_ptm = {}

    with open(uniprot_ptm_db_location, 'r') as read_from:
        for line in read_from:
            tokens = line.split()
            try:
                uniprot = tokens[0]
                index = int(tokens[1])
                # tokens[3] (the PTM type) is read so malformed short lines
                # still fail, but only the position matters for scoring.
                ptm = tokens[3]
                # setdefault replaces the has_key branching (dict.has_key
                # does not exist in Python 3).
                uniprot_to_index_to_ptm.setdefault(uniprot, {})[index] = "*"
            except ValueError:
                print("Cannot parse: " + line[0:len(line) - 1])

    # "with" closes both handles; the original never closed write_to, so
    # buffered output could be lost on exit.
    with open(uniprot_exon_indices_location, 'r') as uniprot_exon_indices, \
            open(output_file_location, 'w') as write_to:
        for line in uniprot_exon_indices:
            tokens = line.split()
            prot = tokens[1]

            sstart = int(tokens[2])
            start = int(tokens[3])
            end = int(tokens[4])
            eend = int(tokens[5])

            if prot not in uniprot_to_index_to_ptm:
                c1_count = 0
                a_count = 0
                c2_count = 0
                canonical_absolute = 0
            else:
                c1_count = score_differences(uniprot_to_index_to_ptm, prot,
                                             sstart, start)
                a_count = score_differences(uniprot_to_index_to_ptm, prot,
                                            start, end)
                c2_count = score_differences(uniprot_to_index_to_ptm, prot,
                                             end, eend)
                # Column 7 of the tab-separated line is used as the canonical
                # protein length.
                prot_len = int(line.split("\t")[7].strip())
                canonical_absolute = score_differences(uniprot_to_index_to_ptm,
                                                       prot, 1, prot_len)

            write_to.write(tokens[0] + "\t" + prot + "\t" + repr(c1_count) + "\t" +
                           repr(a_count) + "\t" + repr(c2_count) + "\t" +
                           repr(canonical_absolute) + "\n")
# ----- Example 8 -----
def get_uniprot_disorder_features(pulse_path, map_file, iupred_output,
                                  canonical_db_location,
                                  disorder_read_out_location):
    """
    Generate IUPred disorder features for every exon mapping line.

    For each map line, writes disorder counts for the upstream flank
    (sstart..start), the alternative region (start..end), the downstream
    flank (end..eend), the whole canonical protein, the whole isoform, and
    the isoform's alternative-region window, tab-separated, to
    ``disorder_read_out_location``.

    :param pulse_path: unused in this function; kept for interface compatibility
    :param map_file: path of the exon/protein index map file
    :param iupred_output: IUPred output used to extend the disorder index
    :param canonical_db_location: canonical database queried by fetch_feature
    :param disorder_read_out_location: path of the feature file to create
    :return: None -- results are written to ``disorder_read_out_location``
    """
    anchors_map = build_anchors_map(map_file)
    uniprot_to_index_to_disorder = fetch_feature('uniprot_iupred', anchors_map,
                                                 canonical_db_location)
    uniprot_to_index_to_disorder = build_uniprot_to_index_to_disorder(
        iupred_output, uniprot_to_index_to_disorder)

    # "with" closes both handles (the original leaked uniprot_exon_indices).
    with open(map_file, 'r') as uniprot_exon_indices, \
            open(disorder_read_out_location, 'w') as write_to:
        for line in uniprot_exon_indices:
            tokens = line.split('\t')
            asid = tokens[0]
            prot = tokens[1]
            sstart = int(tokens[2])
            start = int(tokens[3])
            end = int(tokens[4])
            eend = int(tokens[5])
            # Rough residue length from the "...=<nt length>" suffix of the id;
            # NOTE(review): ids starting with "I" get length 0 -- presumably
            # inclusion events, confirm against the map-file producer.
            rough_a_length = int(int(tokens[0].split("_")[-1].split("=")[1]) / 3)
            if asid[0] == "I":
                rough_a_length = 0

            c1_count = 0
            a_count = 0
            c2_count = 0
            canonical_absolute = 0
            other_absolute = 0
            other_a = 0
            if prot not in uniprot_to_index_to_disorder:
                # All counts stay at their zero defaults for unknown proteins.
                print("Protein not in uniprot_to_index_to_disorder: " + prot)
            else:
                c1_count = score_differences(uniprot_to_index_to_disorder, prot,
                                             sstart, start)
                a_count = score_differences(uniprot_to_index_to_disorder, prot,
                                            start, end)
                c2_count = score_differences(uniprot_to_index_to_disorder, prot,
                                             end, eend)
                # Columns 7 and 8 hold the canonical and isoform lengths.
                prot_len = int(line.split("\t")[7].strip())
                other_prot = asid  # the isoform is indexed under its full id
                other_prot_len = int(line.split("\t")[8].strip())
                canonical_absolute = score_differences(
                    uniprot_to_index_to_disorder, prot, 1, prot_len)
                other_absolute = score_differences(uniprot_to_index_to_disorder,
                                                   other_prot, 1, other_prot_len)
                # Clamp the alternative-region window to the isoform length.
                other_a_end = min(start + rough_a_length, other_prot_len)
                other_a = score_differences(uniprot_to_index_to_disorder,
                                            other_prot, start, other_a_end)

            write_to.write(tokens[0] + "\t" + prot + "\t" + repr(c1_count) + "\t" +
                           repr(a_count) + "\t" + repr(c2_count) + "\t" +
                           repr(canonical_absolute) + "\t" + repr(other_absolute) +
                           "\t" + repr(other_a) + "\n")