def get_mutation_features(map_file, f_mutations_db_location, output_location):
    """Generate disease-mutation counts for each exon mapping line.

    For every line of the exon index map, counts disease-annotated positions
    in the C1 (sstart..start), A (start..end) and C2 (end..eend) regions of
    the mapped protein, plus over the whole canonical sequence.

    :param map_file: tab-separated exon index file; whitespace tokens are
        (id, protein, sstart, start, end, eend, ...) and tab column 7 holds
        the canonical protein length.
    :param f_mutations_db_location: mutations database file consumed by
        build_uniprot_to_index_to_disease.
    :param output_location: path of the tab-separated feature file to write.
    :return: None
    """
    # Use context managers so all three handles are closed (the original
    # leaked the two input handles and relied on GC to flush).
    with open(f_mutations_db_location, 'r') as mutations_db:
        uniprot_to_index_to_disease = build_uniprot_to_index_to_disease(
            mutations_db)
    with open(map_file, 'r') as map_file_obj, \
            open(output_location, 'w') as write_to:
        for line in map_file_obj:
            tokens = line.split()
            prot = tokens[1]
            sstart = int(tokens[2])
            start = int(tokens[3])
            end = int(tokens[4])
            eend = int(tokens[5])
            if prot not in uniprot_to_index_to_disease:
                c1_count = a_count = c2_count = canonical_absolute = 0
            else:
                c1_count = score_differences(
                    uniprot_to_index_to_disease, prot, sstart, start)
                a_count = score_differences(
                    uniprot_to_index_to_disease, prot, start, end)
                c2_count = score_differences(
                    uniprot_to_index_to_disease, prot, end, eend)
                # Tab column 7 holds the canonical protein length.
                prot_len = int(line.split("\t")[7].strip())
                canonical_absolute = score_differences(
                    uniprot_to_index_to_disease, prot, 1, prot_len)
            write_to.write(tokens[0] + "\t" + prot + "\t" + repr(c1_count)
                           + "\t" + repr(a_count) + "\t" + repr(c2_count)
                           + "\t" + repr(canonical_absolute) + "\n")
def get_uniprot_elm_features(uniprot_exon_indices_location, uniprot_elm_db_location, output_location):
    """Reads uniprot ELM file and generates ELM features.

    Builds a per-protein map of positions covered by ELM regions, then for
    each exon mapping line counts covered positions in the C1, A and C2
    regions and over the whole canonical protein.

    :param uniprot_exon_indices_location: exon index map file; whitespace
        tokens are (id, protein, sstart, start, end, eend, ...) and tab
        column 7 holds the canonical protein length.
    :param uniprot_elm_db_location: ELM database; tab columns 0, 2 and 3 are
        uniprot id, region start and region end.
    :param output_location: path of the tab-separated feature file to write.
    :return: None
    """
    uniprot_to_index_to_elm = {}
    with open(uniprot_elm_db_location, 'r') as read_from:
        for line in read_from:
            tokens = line.split("\t")
            try:
                uniprot = tokens[0].strip()
                start = int(tokens[2].strip())
                end = int(tokens[3].strip())
                # Mark every position covered by this ELM region.
                for i in range(start, end + 1):
                    uniprot_to_index_to_elm.setdefault(uniprot, {})[i] = "*"
            except (IndexError, ValueError):
                # IndexError added: short/malformed lines are reported
                # instead of crashing (original only caught ValueError).
                print("Cannot parse: " + line[0:len(line) - 1])
    # `with` guarantees the output file is closed and flushed (the original
    # never closed it) and replaces the py2-only `print >> f` / has_key.
    with open(uniprot_exon_indices_location, 'r') as uniprot_exon_indices, \
            open(output_location, 'w') as write_to:
        for line in uniprot_exon_indices:
            tokens = line.split()
            prot = tokens[1]
            sstart = int(tokens[2])
            start = int(tokens[3])
            end = int(tokens[4])
            eend = int(tokens[5])
            c1_count = a_count = c2_count = canonical_absolute = 0
            if prot in uniprot_to_index_to_elm:
                c1_count = score_differences(
                    uniprot_to_index_to_elm, prot, sstart, start)
                a_count = score_differences(
                    uniprot_to_index_to_elm, prot, start, end)
                c2_count = score_differences(
                    uniprot_to_index_to_elm, prot, end, eend)
                # Tab column 7 holds the canonical protein length.
                prot_len = int(line.split("\t")[7].strip())
                canonical_absolute = score_differences(
                    uniprot_to_index_to_elm, prot, 1, prot_len)
            write_to.write(tokens[0] + "\t" + prot + "\t" + repr(c1_count)
                           + "\t" + repr(a_count) + "\t" + repr(c2_count)
                           + "\t" + repr(canonical_absolute) + "\n")
def get_postranscriptional_modification_features(uniprot_exon_indices_location, uniprot_ptm_db_location, output_file_location):
    """Reads uniprot PTM file and generates post translational modification site features.

    Builds a per-protein map of PTM positions, then for each exon mapping
    line counts PTM sites in the C1, A and C2 regions and over the whole
    canonical protein.

    :param uniprot_exon_indices_location: exon index map file; whitespace
        tokens are (id, protein, sstart, start, end, eend, ...) and tab
        column 7 holds the canonical protein length.
    :param uniprot_ptm_db_location: PTM database; whitespace columns 0 and 1
        are uniprot id and modified position.
    :param output_file_location: path of the tab-separated feature file.
    :return: None
    """
    uniprot_to_index_to_ptm = {}
    with open(uniprot_ptm_db_location, 'r') as read_from:
        for line in read_from:
            tokens = line.split()
            try:
                uniprot = tokens[0]
                index = int(tokens[1])
                # NOTE(review): dropped the unused `ptm = tokens[3]` read;
                # only the position is recorded.
                uniprot_to_index_to_ptm.setdefault(uniprot, {})[index] = "*"
            except (IndexError, ValueError):
                # IndexError added: short/malformed lines are reported
                # instead of crashing (original only caught ValueError).
                print("Cannot parse: " + line[0:len(line) - 1])
    # `with` guarantees the output file is closed and flushed (the original
    # never closed it) and replaces the py2-only `print >> f` / has_key.
    with open(uniprot_exon_indices_location, 'r') as uniprot_exon_indices, \
            open(output_file_location, 'w') as write_to:
        for line in uniprot_exon_indices:
            tokens = line.split()
            prot = tokens[1]
            sstart = int(tokens[2])
            start = int(tokens[3])
            end = int(tokens[4])
            eend = int(tokens[5])
            c1_count = a_count = c2_count = canonical_absolute = 0
            if prot in uniprot_to_index_to_ptm:
                c1_count = score_differences(
                    uniprot_to_index_to_ptm, prot, sstart, start)
                a_count = score_differences(
                    uniprot_to_index_to_ptm, prot, start, end)
                c2_count = score_differences(
                    uniprot_to_index_to_ptm, prot, end, eend)
                # Tab column 7 holds the canonical protein length.
                prot_len = int(line.split("\t")[7].strip())
                canonical_absolute = score_differences(
                    uniprot_to_index_to_ptm, prot, 1, prot_len)
            write_to.write(tokens[0] + "\t" + prot + "\t" + repr(c1_count)
                           + "\t" + repr(a_count) + "\t" + repr(c2_count)
                           + "\t" + repr(canonical_absolute) + "\n")
def get_mutation_features(map_file, f_mutations_db_location, output_location):
    """Generate disease-mutation counts for each exon mapping line.

    NOTE(review): this re-definition shadows an identical earlier
    ``get_mutation_features`` in this file — consider deleting one copy.

    :param map_file: tab-separated exon index file; whitespace tokens are
        (id, protein, sstart, start, end, eend, ...) and tab column 7 holds
        the canonical protein length.
    :param f_mutations_db_location: mutations database file consumed by
        build_uniprot_to_index_to_disease.
    :param output_location: path of the tab-separated feature file to write.
    :return: None
    """
    # Context managers close all three handles (the original leaked the two
    # input handles) and the write() calls replace py2-only `print >> f`.
    with open(f_mutations_db_location, 'r') as mutations_db:
        uniprot_to_index_to_disease = build_uniprot_to_index_to_disease(
            mutations_db)
    with open(map_file, 'r') as map_file_obj, \
            open(output_location, 'w') as write_to:
        for line in map_file_obj:
            tokens = line.split()
            prot = tokens[1]
            sstart = int(tokens[2])
            start = int(tokens[3])
            end = int(tokens[4])
            eend = int(tokens[5])
            if prot not in uniprot_to_index_to_disease:
                c1_count = a_count = c2_count = canonical_absolute = 0
            else:
                c1_count = score_differences(
                    uniprot_to_index_to_disease, prot, sstart, start)
                a_count = score_differences(
                    uniprot_to_index_to_disease, prot, start, end)
                c2_count = score_differences(
                    uniprot_to_index_to_disease, prot, end, eend)
                # Tab column 7 holds the canonical protein length.
                prot_len = int(line.split("\t")[7].strip())
                canonical_absolute = score_differences(
                    uniprot_to_index_to_disease, prot, 1, prot_len)
            write_to.write(tokens[0] + "\t" + prot + "\t" + repr(c1_count)
                           + "\t" + repr(a_count) + "\t" + repr(c2_count)
                           + "\t" + repr(canonical_absolute) + "\n")
def get_sable_scores(map_file, f_sable_db_location, uniprot_core_output_location):
    """Generate SABLE core-region counts for each exon mapping line.

    For every line of the exon index map, counts core-annotated positions in
    the C1, A and C2 regions of the mapped protein, plus over the whole
    canonical sequence.

    :param map_file: tab-separated exon index file; whitespace tokens are
        (id, protein, sstart, start, end, eend, ...) and tab column 7 holds
        the canonical protein length.
    :param f_sable_db_location: SABLE database file consumed by
        build_uniprot_to_index_to_core.
    :param uniprot_core_output_location: path of the feature file to write.
    :return: None
    """
    # NOTE(review): dropped the original's dead `asid`/`rough_a_length`
    # computation — neither value was ever used in this function.
    with open(f_sable_db_location, 'r') as sable_db_obj:
        uniprot_to_index_to_core = build_uniprot_to_index_to_core(sable_db_obj)
    with open(map_file, 'r') as map_file_obj, \
            open(uniprot_core_output_location, 'w') as write_to:
        for line in map_file_obj:
            tokens = line.split()
            prot = tokens[1]
            sstart = int(tokens[2])
            start = int(tokens[3])
            end = int(tokens[4])
            eend = int(tokens[5])
            c1_count = a_count = c2_count = canonical_absolute = 0
            if prot in uniprot_to_index_to_core:
                c1_count = score_differences(
                    uniprot_to_index_to_core, prot, sstart, start)
                a_count = score_differences(
                    uniprot_to_index_to_core, prot, start, end)
                c2_count = score_differences(
                    uniprot_to_index_to_core, prot, end, eend)
                # Tab column 7 holds the canonical protein length.
                prot_len = int(line.split("\t")[7].strip())
                canonical_absolute = score_differences(
                    uniprot_to_index_to_core, prot, 1, prot_len)
            write_to.write(tokens[0] + "\t" + prot + "\t" + repr(c1_count)
                           + "\t" + repr(a_count) + "\t" + repr(c2_count)
                           + "\t" + repr(canonical_absolute) + "\n")
def get_uniprot_disorder_features(pulse_path, map_file, iupred_output, canonical_db_location, disorder_read_out_location):
    """Generate IUPred disorder features for canonical and isoform proteins.

    For each exon mapping line, counts disorder-annotated positions in the
    C1, A and C2 regions of the canonical protein, over the whole canonical
    protein, over the whole isoform, and over the isoform's rough A region.

    :param pulse_path: unused; kept for interface compatibility.
    :param map_file: tab-separated exon index file; tab columns are
        (id, protein, sstart, start, end, eend, ...), column 7 is the
        canonical protein length and column 8 the isoform length.
    :param iupred_output: IUPred output consumed by
        build_uniprot_to_index_to_disorder.
    :param canonical_db_location: canonical feature DB for fetch_feature.
    :param disorder_read_out_location: path of the feature file to write.
    :return: None
    """
    anchors_map = build_anchors_map(map_file)
    uniprot_to_index_to_disorder = fetch_feature(
        'uniprot_iupred', anchors_map, canonical_db_location)
    uniprot_to_index_to_disorder = build_uniprot_to_index_to_disorder(
        iupred_output, uniprot_to_index_to_disorder)
    # `with` guarantees both handles are closed (the original leaked them)
    # and the write() calls replace the py2-only `print >> f`.
    with open(map_file, 'r') as uniprot_exon_indices, \
            open(disorder_read_out_location, 'w') as write_to:
        for line in uniprot_exon_indices:
            tokens = line.split('\t')
            asid = tokens[0]
            prot = tokens[1]
            sstart = int(tokens[2])
            start = int(tokens[3])
            end = int(tokens[4])
            eend = int(tokens[5])
            # Rough A-region length in residues, parsed from the id suffix
            # "..._<key>=<nt_len>" (nucleotides / 3); zero for "I..." ids.
            rough_a_length = int(
                int(tokens[0].split("_")[-1].split("=")[1]) / 3)
            if asid[0] == "I":
                rough_a_length = 0
            c1_count = a_count = c2_count = 0
            canonical_absolute = other_absolute = other_a = 0
            if prot not in uniprot_to_index_to_disorder:
                print("Protein not in uniprot_to_index_to_disorder: " + prot)
            else:
                c1_count = score_differences(
                    uniprot_to_index_to_disorder, prot, sstart, start)
                a_count = score_differences(
                    uniprot_to_index_to_disorder, prot, start, end)
                c2_count = score_differences(
                    uniprot_to_index_to_disorder, prot, end, eend)
                # Tab columns 7 and 8: canonical and isoform lengths.
                prot_len = int(line.split("\t")[7].strip())
                other_prot = asid
                other_prot_len = int(line.split("\t")[8].strip())
                canonical_absolute = score_differences(
                    uniprot_to_index_to_disorder, prot, 1, prot_len)
                other_absolute = score_differences(
                    uniprot_to_index_to_disorder, other_prot, 1,
                    other_prot_len)
                # Clamp the isoform A-region end to the isoform length.
                other_a_end = min(start + rough_a_length, other_prot_len)
                other_a = score_differences(
                    uniprot_to_index_to_disorder, other_prot, start,
                    other_a_end)
            write_to.write(tokens[0] + "\t" + prot + "\t" + repr(c1_count)
                           + "\t" + repr(a_count) + "\t" + repr(c2_count)
                           + "\t" + repr(canonical_absolute)
                           + "\t" + repr(other_absolute)
                           + "\t" + repr(other_a) + "\n")
def get_postranscriptional_modification_features(uniprot_exon_indices_location, uniprot_ptm_db_location, output_file_location):
    """Reads uniprot PTM file and generates post translational modification site features.

    NOTE(review): this re-definition shadows an identical earlier definition
    in this file — consider deleting one copy.

    :param uniprot_exon_indices_location: exon index map file; whitespace
        tokens are (id, protein, sstart, start, end, eend, ...) and tab
        column 7 holds the canonical protein length.
    :param uniprot_ptm_db_location: PTM database; whitespace columns 0 and 1
        are uniprot id and modified position.
    :param output_file_location: path of the tab-separated feature file.
    :return: None
    """
    uniprot_to_index_to_ptm = {}
    with open(uniprot_ptm_db_location, 'r') as read_from:
        for line in read_from:
            tokens = line.split()
            try:
                uniprot = tokens[0]
                index = int(tokens[1])
                # NOTE(review): dropped the unused `ptm = tokens[3]` read;
                # only the position is recorded.
                uniprot_to_index_to_ptm.setdefault(uniprot, {})[index] = "*"
            except (IndexError, ValueError):
                # IndexError added: short/malformed lines are reported
                # instead of crashing (original only caught ValueError).
                print("Cannot parse: " + line[0:len(line) - 1])
    # `with` guarantees the output file is closed and flushed (the original
    # never closed it) and replaces the py2-only `print >> f` / has_key.
    with open(uniprot_exon_indices_location, 'r') as uniprot_exon_indices, \
            open(output_file_location, 'w') as write_to:
        for line in uniprot_exon_indices:
            tokens = line.split()
            prot = tokens[1]
            sstart = int(tokens[2])
            start = int(tokens[3])
            end = int(tokens[4])
            eend = int(tokens[5])
            c1_count = a_count = c2_count = canonical_absolute = 0
            if prot in uniprot_to_index_to_ptm:
                c1_count = score_differences(
                    uniprot_to_index_to_ptm, prot, sstart, start)
                a_count = score_differences(
                    uniprot_to_index_to_ptm, prot, start, end)
                c2_count = score_differences(
                    uniprot_to_index_to_ptm, prot, end, eend)
                # Tab column 7 holds the canonical protein length.
                prot_len = int(line.split("\t")[7].strip())
                canonical_absolute = score_differences(
                    uniprot_to_index_to_ptm, prot, 1, prot_len)
            write_to.write(tokens[0] + "\t" + prot + "\t" + repr(c1_count)
                           + "\t" + repr(a_count) + "\t" + repr(c2_count)
                           + "\t" + repr(canonical_absolute) + "\n")
def get_uniprot_disorder_features(pulse_path, map_file, iupred_output, canonical_db_location, disorder_read_out_location):
    """Generate IUPred disorder features for canonical and isoform proteins.

    NOTE(review): this re-definition shadows an identical earlier definition
    in this file — consider deleting one copy.

    :param pulse_path: unused; kept for interface compatibility.
    :param map_file: tab-separated exon index file; tab columns are
        (id, protein, sstart, start, end, eend, ...), column 7 is the
        canonical protein length and column 8 the isoform length.
    :param iupred_output: IUPred output consumed by
        build_uniprot_to_index_to_disorder.
    :param canonical_db_location: canonical feature DB for fetch_feature.
    :param disorder_read_out_location: path of the feature file to write.
    :return: None
    """
    anchors_map = build_anchors_map(map_file)
    uniprot_to_index_to_disorder = fetch_feature(
        'uniprot_iupred', anchors_map, canonical_db_location)
    uniprot_to_index_to_disorder = build_uniprot_to_index_to_disorder(
        iupred_output, uniprot_to_index_to_disorder)
    # `with` guarantees both handles are closed (the original leaked them)
    # and the write() calls replace the py2-only `print >> f`.
    with open(map_file, 'r') as uniprot_exon_indices, \
            open(disorder_read_out_location, 'w') as write_to:
        for line in uniprot_exon_indices:
            tokens = line.split('\t')
            asid = tokens[0]
            prot = tokens[1]
            sstart = int(tokens[2])
            start = int(tokens[3])
            end = int(tokens[4])
            eend = int(tokens[5])
            # Rough A-region length in residues, parsed from the id suffix
            # "..._<key>=<nt_len>" (nucleotides / 3); zero for "I..." ids.
            rough_a_length = int(
                int(tokens[0].split("_")[-1].split("=")[1]) / 3)
            if asid[0] == "I":
                rough_a_length = 0
            c1_count = a_count = c2_count = 0
            canonical_absolute = other_absolute = other_a = 0
            if prot not in uniprot_to_index_to_disorder:
                print("Protein not in uniprot_to_index_to_disorder: " + prot)
            else:
                c1_count = score_differences(
                    uniprot_to_index_to_disorder, prot, sstart, start)
                a_count = score_differences(
                    uniprot_to_index_to_disorder, prot, start, end)
                c2_count = score_differences(
                    uniprot_to_index_to_disorder, prot, end, eend)
                # Tab columns 7 and 8: canonical and isoform lengths.
                prot_len = int(line.split("\t")[7].strip())
                other_prot = asid
                other_prot_len = int(line.split("\t")[8].strip())
                canonical_absolute = score_differences(
                    uniprot_to_index_to_disorder, prot, 1, prot_len)
                other_absolute = score_differences(
                    uniprot_to_index_to_disorder, other_prot, 1,
                    other_prot_len)
                # Clamp the isoform A-region end to the isoform length.
                other_a_end = min(start + rough_a_length, other_prot_len)
                other_a = score_differences(
                    uniprot_to_index_to_disorder, other_prot, start,
                    other_a_end)
            write_to.write(tokens[0] + "\t" + prot + "\t" + repr(c1_count)
                           + "\t" + repr(a_count) + "\t" + repr(c2_count)
                           + "\t" + repr(canonical_absolute)
                           + "\t" + repr(other_absolute)
                           + "\t" + repr(other_a) + "\n")