def main():
    """Translate Croft miRNA-protein pairs from stdin into master-file rows.

    Every stdin line must hold exactly three tab-separated columns: the miRNA
    name, the protein (ENSP) identifier and a third value that is ignored.
    The miRNA is translated with the unique miRNA mapper, the protein with
    the STRING alias mappers built below.  Each pair that could be translated
    on both sides is written once to stdout; pairs that could not be
    translated are collected and reported through ``logger.warning``.
    """
    ############################################################################
    # create string mapping dictionaries
    ############################################################################
    taxon = '9606'
    ensp8_to_ensg = stringrnautils.get_string_to_alias_mapper(
        taxon, 'ENSP', 'ENSG', 8)[taxon]
    ensg_to_ensp10 = stringrnautils.get_alias_to_string_mapper(
        taxon, 'ENSP', 'ENSG', 10)[taxon]
    ensp_to_ensp10 = stringrnautils.get_alias_to_string_mapper(
        taxon, 'ENSP', 'ENSP', 10)[taxon]
    mir_hash = stringrnautils.get_unique_mir_mapper()

    # Historic workaround, kept for reference (one gene died between versions):
    # ensp8_to_ensg['ENSP00000308970'] = 'ENSG00000231924'
    # ensg_to_ensp9['ENSG00000231924'] = 'ENSP00000244296'

    already_written = set()  # croft contains duplicates!!
    mir_failures = set()
    ensp_failures = set()

    for raw_line in sys.stdin:
        mir, ensp, _ = raw_line.rstrip().split('\t')
        pair_label = '({}, {})'.format(ensp, mir)

        # Map miRNA
        new_mir = None
        if mir in mir_hash:
            new_mir = mir_hash[mir]
        else:
            mir_failures.add(pair_label)

        # Map ENSP: prefer the route through the gene id, then fall back to a
        # direct protein-to-protein lookup.
        new_ensp = None
        through_gene = (ensp in ensp8_to_ensg
                        and ensp8_to_ensg[ensp] in ensg_to_ensp10)
        if through_gene:
            new_ensp = ensg_to_ensp10[ensp8_to_ensg[ensp]]
        elif ensp in ensp_to_ensp10:
            new_ensp = ensp_to_ensp10[ensp]
        else:
            ensp_failures.add(pair_label)

        # Skip anything that is unmapped on either side or already emitted.
        if not (new_ensp and new_mir) or (ensp, mir) in already_written:
            continue

        columns = ("9606", new_mir, new_ensp, "0", "database", "0.900",
                   "Croft", "", "")
        sys.stdout.write('%s\n' % '\t'.join(columns))
        already_written.add((ensp, mir))

    if mir_failures:
        logger.warning(
            "Could not map {} miRNAs to IDs used in miRBase. "
            "Respective interactions were: {}".format(
                len(mir_failures), ', '.join(mir_failures)))
    if ensp_failures:
        logger.warning(
            "Could not map {} proteins to ENSPs used in STRING 10. "
            "Respective interactions were: {}".format(
                len(ensp_failures), ', '.join(ensp_failures)))
def run():
    """Write the spliceosome interactions to the database master file.

    For every entry of the module-level ``organisms`` two kinds of rows are
    written to ``master_files/database_spliceosome.tsv``:

    * one row per triple returned by ``get_u_snrna_rna_rna_interactions()``;
    * one row per (U RNA, protein) pair, where the protein comes from
      ``data/<organism>_celluar_component.tsv`` and the U RNAs are looked up
      in ``goterm_to_urna`` for each of the protein's GO terms.

    The output file is managed by a ``with`` block so it is flushed and
    closed even when an error interrupts processing.
    """
    # TODO: UNCOMMENT BEFORE YOU COMMIT!!!
    # download()
    # TODO: map RNAs as well!!!
    with open('master_files/database_spliceosome.tsv', 'w') as master_file:
        for organism in organisms:
            print(' - generating interactions for {}'.format(organism))
            interactions = get_u_snrna_rna_rna_interactions()
            for ent1, ent2, url in interactions:
                _str = "\t".join(
                    (organism, ent1, ent2, "0", "DATABASE", "0.9", url, ''))
                master_file.write("{}\n".format(_str))

            uniprot_to_ensp = stringrnautils.get_alias_to_string_mapper(
                organism, '', '', 10)[organism]
            go_terms_path = 'data/{}_celluar_component.tsv'.format(organism)
            with open(go_terms_path) as go_terms_file:
                go_terms_file.readline()  # skip header
                for line in go_terms_file:
                    try:
                        uniprot_acc, goterms = line.rstrip().split('\t')
                    except ValueError:
                        # there are no go terms for this protein
                        continue

                    # The protein lookup does not depend on the GO term, so
                    # resolve it once per line and skip unmapped proteins.
                    ensp = uniprot_to_ensp.get(uniprot_acc, None)
                    if not ensp:
                        continue

                    url = 'http://www.uniprot.org/uniprot/{}'.format(
                        uniprot_acc)
                    for go_term in goterms.split('; '):
                        for u_rna in goterm_to_urna.get(go_term, []):
                            _line = "\t".join(
                                (organism, u_rna, ensp, "0", "DATABASE",
                                 "0.9", url, ''))
                            master_file.write("{}\n".format(_line))
def run():
    """Parse supplementary tables S3, S4 and S6 and print interaction rows.

    Tables are downloaded on first use (when the local copy is missing) and
    read with ``pd.read_excel``.  Gene symbols are translated with the STRING
    alias mapper; pairs that cannot be translated are reported on stdout and
    skipped.

    * S3 and S4 contribute PubMed ids per protein pair.  One "Experiment"
      row is printed per pair with the number of collected ids as its score
      column.
    * S6 contributes one "Luciferase" row with a fixed score of 0.9 per
      successfully mapped pair.
    """
    protein_mapper = stringrnautils.get_alias_to_string_mapper(
        "9606", 'ENSP', "")["9606"]
    unmapped_msg = ("the interaction between {0} and {1} could not be "
                    "mapped to string ids")

    # parse S3 and S4, use S4 as simply yet another PID supporting them (22365833)
    ############################################################
    # parse S3 and S4 into pmids and benchmark
    ############################################################
    pmid_evidence = collections.defaultdict(list)  # s3 and s4
    hegel_pmid = 22365833

    if not os.path.exists(TABEL_S3_LOCAL):
        urllib.urlretrieve(TABEL_S3_URL, TABEL_S3_LOCAL)
    s3_data = pd.read_excel(TABEL_S3_LOCAL, TABEL_S3_SHEET)
    for i in range(s3_data.shape[0]):
        gene_a = s3_data["SymbolA"][i]
        gene_b = s3_data["SymbolB"][i]
        try:
            prot_a = protein_mapper[gene_a]
            prot_b = protein_mapper[gene_b]
            key = tuple(sorted((prot_a, prot_b)))
            pmid_evidence[key].append(hegel_pmid)
        except KeyError:
            print(unmapped_msg.format(gene_a, gene_b))

    if not os.path.exists(TABEL_S4_LOCAL):
        # Fixed: the original passed (TABEL_S4_LOCAL, TABEL_S4_SHEET), i.e. it
        # tried to "download" the local path into the sheet name.
        # NOTE(review): assumes TABEL_S4_URL is defined next to TABEL_S3_URL
        # and TABEL_S6_URL - confirm.
        urllib.urlretrieve(TABEL_S4_URL, TABEL_S4_LOCAL)
    s4_data = pd.read_excel(TABEL_S4_LOCAL, TABEL_S4_SHEET)
    for i in range(s4_data.shape[0]):
        gene_a = s4_data["SymbolA"][i]
        gene_b = s4_data["SymbolB"][i]
        try:
            prot_a = protein_mapper[gene_a]
            prot_b = protein_mapper[gene_b]
            key = tuple(sorted((prot_a, prot_b)))
            # NOTE(review): range(1, n) stops before column str(n); kept as in
            # the original - confirm whether the last PMID column is meant to
            # be skipped.
            for j in range(1, int(s4_data["#PMID"][i])):
                pmid_evidence[key].append(int(s4_data[str(j)][i]))
        except KeyError:
            print(unmapped_msg.format(gene_a, gene_b))

    # TODO alex or garde: hook this into the combine-miRTarBase-NPinter script, as thise interactions are proteins and
    # therefore have to be scored using the bins from there (as we have no protein positive set
    # Fixed: print the pair stored in the key; the original printed the stale
    # ``prot_a`` left over from the last S4 row for every interaction.
    for (prot_1, prot_2), pmids in pmid_evidence.items():
        print('\t'.join(("9606", prot_1, prot_2, "0", "Experiment",
                         str(len(pmids)), "Litterature", "", "")))

    ############################################################
    # parse S6 into experiments
    ############################################################
    if not os.path.exists(TABEL_S6_LOCAL):
        urllib.urlretrieve(TABEL_S6_URL, TABEL_S6_LOCAL)
    s6_data = pd.read_excel(TABEL_S6_LOCAL, TABEL_S6_SHEET)
    for i in range(s6_data.shape[0]):
        gene_a = s6_data["FireSymbol"][i]
        gene_b = s6_data["PASymbol"][i]
        try:
            prot_a = protein_mapper[gene_a]
            prot_b = protein_mapper[gene_b]
        except KeyError:
            print(unmapped_msg.format(gene_a, gene_b))
            # Do not emit a row for an unmapped pair; otherwise the proteins
            # of a previous row would be printed again.
            continue
        # Fixed: the original re-read the PMID columns of ``s4_data`` here (a
        # copy-paste from the S4 loop) after the PMID counts had already been
        # printed; that dead code is removed.
        # TODO: append this to the experiments file in "combine_experiments"
        # these are proteins and therefore cannot be benchmarked against our gold-standard
        print('\t'.join(
            ("9606", prot_a, prot_b, "0", "Experiment", "0.9", "Luciferase",
             "https://www.ncbi.nlm.nih.gov/pubmed/22365833", "")))
# Create the master-file directory and derive the path of the combined
# knowledge ("database") master file inside it.
os.mkdir(MASTER_PATH)
cat_master_file_name = 'database.tsv'
knowledge_master_file_path = os.path.join(MASTER_PATH, cat_master_file_name)

################################################################################
# NOTE/TODO for future versions of RAIN:
# currently, this script assumes that the curated knowledge channel contains
# only human interactions. If this changes in the future, the ID mapping
# routines must be adjusted.
################################################################################

# Make up for the fact that it was curated for version 9
ENSP2ENSG_v9 = stringrnautils.get_string_to_alias_mapper(
    '9606', 'ENSP', 'ENSG', 9, 'all', True)['9606']
# NOTE(review): the organism is passed as the int 9606 here (and again for
# ENSP2ENSP_v10 below) but as the string '9606' in the call above, while the
# result is always indexed with '9606' - confirm the helper accepts both.
ENSG2ENSP_v10 = stringrnautils.get_alias_to_string_mapper(
    9606, 'ENSP', 'ENSG', 10)['9606']
# Chain the two mappers: version-9 protein -> gene -> version-10 protein,
# keeping only proteins whose gene is present in the version-10 mapper.
ENSP9_to_ENSP10 = dict([(ensp, ENSG2ENSP_v10[ensg])
                        for ensp, ensg in ENSP2ENSG_v9.iteritems()
                        if ensg in ENSG2ENSP_v10])
# Hand-maintained entries; they add to or override the derived mapping.
ENSP9_to_ENSP10.update({
    'ENSP00000403359': 'ENSP00000441000',
    'ENSP00000400867': 'ENSP00000441000',
    'ENSP00000403175': 'ENSP00000393241'
})
ENSP2ENSP_v10 = stringrnautils.get_alias_to_string_mapper(
    9606, 'ENSP', 'ENSP', 10)['9606']
ncrna_mapper = stringrnautils.get_non_coding_rna_alias_mapper()['9606']

def correct_rna_names(ID, ncrna_mapper):
def getSTRINGdic(specie):
    """Return the STRING alias mapper for the organism registered as *specie*.

    ``specie`` is looked up in the module-level ``organismIdMap``; both alias
    filters are left empty.  Used for the ENSG/ENSP conversion of RefSeq NM_
    mRNAs.
    """
    organism_id = organismIdMap[specie]
    return stringrnautils.get_alias_to_string_mapper(
        organisms=organism_id,
        filter_string_alias='',
        filter_string_id='',
    )
import argparse
import gzip
import os

import stringrnautils

# Command-line options: locations of input data, raw scores, master files and
# the gold standard.
parser = argparse.ArgumentParser()
parser.add_argument('-data_path', default='data')
parser.add_argument('-rawscore_path', default='rawscore_files')
parser.add_argument('-master_path', default='master_files')
parser.add_argument('-gold_std', default='data/extended_gold_standard.tsv')
args = parser.parse_args()

string_mapper = stringrnautils.get_alias_to_string_mapper(
    ['9606', '10090', '7955', '10116', '7227', '6239', '3702'],
    '', '', 10, 'all')
mir_mapper = stringrnautils.get_unique_mir_mapper()

# Retrieve tarbase data
#----------------------
# Fixed: a second assignment used to overwrite this path with a hard-coded
# "data" directory, which silently ignored the -data_path option.  With the
# default option value the resulting path is unchanged.
tarbase_file = os.path.join(args.data_path,
                            "Tarbase.6.7.FINAL.mirbase21.download.tsv.gz")
if not os.path.exists(tarbase_file):
    os.system(
        "wget -nv http://rth.dk/~ajunge/Tarbase.6.7.FINAL.mirbase21.download.tsv.gz -O %s"
        % tarbase_file)

# Species names mapped to the taxonomy ids used above.
# Fixed: the human entry was spelled "H**o sapiens", which can never match a
# real species name.
orgn2keep = {
    "Arabidopsis thaliana": '3702',
    "Caenorhabditis elegans": '6239',
    "Danio rerio": '7955',
    "Drosophila melanogaster": '7227',
    "Homo sapiens": '9606',
    "Mus musculus": '10090',
    "Rattus norvegicus": '10116',
}
def integrate_all_prediction_tools():
    """Read all prediction tools and combine them into ``predictions.tsv``.

    Steps, in order:

    1. Build the gene, miRNA and STRING alias mappers.
    2. Read every prediction tool (``read_starmirdb`` / ``read_predictions``).
       RNA22 and RNAhybrid are only read when ``args.run_all`` is set.
    3. Group organisms by the set of tools that cover them, combine the master
       files of each tool combination with
       ``stringrnautils.combine_masterfiles`` and copy the rows of the
       organisms belonging to that combination into ``predictions.tsv``.
    4. Delete the per-tool and per-combination master files.

    Both the output file and each combined subset file are handled by ``with``
    blocks so they are closed (and the output flushed) deterministically.
    """
    # Define dictionaries
    #--------------------
    gene2ensembl = stringrnautils.map_gene_2_enemble(
        os.path.join(LOCAL_DATA_PATH, 'gene2ensembl.gz'))
    stringrnautils.integrate_NM_dictionary(gene2ensembl)
    mir_mapper = stringrnautils.get_unique_mir_mapper()
    string_mapper = stringrnautils.get_alias_to_string_mapper(
        ['9606', '10090', '7955', '10116', '7227', '6239', '3702'],
        '', '', 10, 'all')

    # Read data and benchmark
    #--------------------------
    # starmirdb - may decide to exclude this one
    read_starmirdb(mir_mapper, string_mapper)

    # miRanda
    read_predictions("miRanda_v3.3a.tsv.gz", {}, mir_mapper, string_mapper,
                     tax_idx=0, mir_idx=1, target_idx=2, score_idx=4,
                     increasing=False, window_size=1000, name="miRanda",
                     ignore_fraction=0.7, has_header=True)

    # miRDB
    read_predictions("miRDB_v5.0.tsv.gz", gene2ensembl, mir_mapper,
                     string_mapper,
                     tax_idx=0, mir_idx=1, target_idx=2, score_idx=3,
                     increasing=True, window_size=75, name="miRDB",
                     ignore_fraction=0.0, has_header=True)

    # PITA
    read_predictions("PITA.tsv.gz", {}, mir_mapper, string_mapper,
                     tax_idx=0, mir_idx=2, target_idx=1, score_idx=4,
                     increasing=False, window_size=500, name="PITA",
                     ignore_fraction=0.0, has_header=True)

    # RNA22 - excluded due to poor performance
    if args.run_all:
        read_predictions("RNA22.tsv.gz", {}, mir_mapper, string_mapper,
                         tax_idx=0, mir_idx=1, target_idx=2, score_idx=3,
                         increasing=True, window_size=50, name="RNA22",
                         ignore_fraction=0.2, has_header=True,
                         do_benchmark=False)

    # RNAhybrid - excluded due to poor performance
    if args.run_all:
        read_predictions("RNAhybrid_seed.tsv.gz", {}, mir_mapper,
                         string_mapper,
                         tax_idx=0, mir_idx=1, target_idx=2, score_idx=3,
                         increasing=False, window_size=50,
                         name="RNAhybrid_seed", ignore_fraction=0.2,
                         has_header=False, do_benchmark=False)

    # Targetscan
    read_predictions("targetscan.mammals.tsv.gz", {}, mir_mapper,
                     string_mapper,
                     tax_idx=0, mir_idx=1, target_idx=2, score_idx=4,
                     increasing=False, window_size=50, name="targetscan",
                     ignore_fraction=0.50, has_header=True)

    # integrate prediction tools
    #--------------------
    prediction_tools = ('starmirdb', 'miRanda', 'targetscan', 'miRDB', 'PITA')

    # organism -> list of tools whose master file covers that organism
    organism_to_tool = {}
    for tool in prediction_tools:
        organisms = species_covered(
            os.path.join(MASTER_FILE_DIR, '{0}.tsv'.format(tool)))
        for organism in organisms:
            organism_to_tool.setdefault(organism, []).append(tool)

    # Collapse each organism's tool list into a canonical "a_and_b" name and
    # remember which organisms share each combination.
    tool_combinations = set()
    tool_combinations_to_species = {}
    for organism, tools in list(organism_to_tool.items()):
        tools = '_and_'.join(sorted(tools))
        tool_combinations.add(tools)
        organism_to_tool[organism] = tools
        tool_combinations_to_species.setdefault(tools, set()).add(organism)

    # Combination-specific overrides of the defaults below.
    tool_parameters = {
        "PITA_and_miRDB_and_miRanda_and_targetscan": {
            'negative_evidence': False,
            'rebenchmark_everything': True,
            'ignore_fraction': 0.0,
            'window_size': 110,
            'unlink_master_files': False
        },
        "PITA_and_miRanda": {
            'negative_evidence': False,
            'rebenchmark_everything': True,
            'ignore_fraction': 0.0,
            'window_size': 200,
            'unlink_master_files': False
        }
    }
    default_tool_parameters = {
        'negative_evidence': False,
        'rebenchmark_everything': True,
        'ignore_fraction': 0.60,
        'window_size': 75,
        'unlink_master_files': False
    }

    # generate organism specific callibration curves
    new_master_files = ['{0}.tsv'.format(p) for p in prediction_tools]
    predictions_path = os.path.join(MASTER_FILE_DIR, 'predictions.tsv')
    with open(predictions_path, 'w') as predictions_master_file:
        for tool_combination in tool_combinations:
            source_master_files = ('{0}.tsv'.format(t)
                                   for t in tool_combination.split('_and_'))
            destination_name = 'predictions_subset_{0}'.format(
                tool_combination)
            destination_master_file = 'predictions_subset_{0}.tsv'.format(
                tool_combination)

            parameters = default_tool_parameters.copy()
            if tool_combination in tool_parameters:
                parameters.update(tool_parameters[tool_combination])
            new_master_files.append(destination_master_file)
            stringrnautils.combine_masterfiles(
                source_master_files, destination_master_file,
                gold_standard_file_path, destination_name, **parameters)

            # generate/append relevant species to predictions.tsv
            species = tool_combinations_to_species[tool_combination]
            subset_path = os.path.join(MASTER_FILE_DIR,
                                       destination_master_file)
            with open(subset_path) as subset_file:
                for line in subset_file:
                    # NOTE(review): the first column is compared as an int, so
                    # this presumes species_covered() yields ints - confirm.
                    if int(line.split('\t', 1)[0]) in species:
                        predictions_master_file.write(line)

    # delete all the tmp master files
    for master_file in new_master_files:
        os.unlink(os.path.join(MASTER_FILE_DIR, master_file))
} def get_assay_mapping(assay_mapping_path): assay_dict = {} with open(assay_mapping_path, 'r') as fin: for curr_line in fin: split_cols = curr_line.rstrip().split('\t') assay_dict[split_cols[0]] = split_cols[1] return assay_dict miRNA2Clean = stringrnautils.get_unique_mir_mapper() miRNA2taxonomyID = stringrnautils.get_mir_id_to_tax_id_mapper() targetName2targetID = stringrnautils.get_alias_to_string_mapper( organisms=uniqueSpeciesMap.values(), filter_string_alias='', filter_string_id='') restricted_pmids = stringrnautils.starbase_exp_pmids() assayMappingFile = os.path.join(DATA_PATH, 'miRTarBase_assay_mapping.tsv' ) # Maps assay names to 'cleaned' assay names assay2Clean = get_assay_mapping(assayMappingFile) not_mapped = 0 totalCount = 0 # Maps a certain interaction to a set of experiments supporting that interaction, the PubMedIDs and the evidence levels # according to miRTarBase interaction_to_experiments = {} interaction_to_pubmed_ids = {} interaction_to_evidence_types = {}