########## STRING dic for ENSPlike conversions
def getSTRINGdic(specie):
    """Return the alias -> STRING identifier mapper for one species.

    :param specie: species name used as a key into the module-level
        ``organismIdMap``; a missing name propagates the lookup error.
    :return: whatever ``stringrnautils.get_alias_to_string_mapper`` returns
        when called with empty alias/identifier filters.
    """
    # ENSG ENSP conversion for RefSeq NM_ mRNAs
    # NOTE(review): a single organismIdMap value is passed as ``organisms``;
    # confirm the helper accepts a single ID and not only a collection of IDs.
    return stringrnautils.get_alias_to_string_mapper(
        organisms=organismIdMap[specie],
        filter_string_alias='',
        filter_string_id='')


######### Parse with Specie Name ################################################
# Identifier dictionaries loaded once at import time and shared by the parser
# below.
UP_dic = stringrnautils.getUniProtDic(BIOMART_DIC_PATH)
NM_dic = stringrnautils.getRefSeqNMdic(BIOMART_DIC_PATH)
NR_dic = stringrnautils.getRefSeqNRdic(BIOMART_DIC_PATH, GENENAME_DIC_PATH)
NONCODE_dic = stringrnautils.getNONCODEdic(NONCODE_DIC_PATH, BIOMART_DIC_PATH,
                                           GENENAME_DIC_PATH)
MB_dic = stringrnautils.get_unique_mir_mapper()


def ParseNPINTER(specie='H**o sapiens'):
    # The default species is used when the caller does not choose one.
    # NOTE(review): the default 'H**o sapiens' looks like a masked species
    # name; confirm that it is an actual key of organismIdMap.
    # RNAs or Proteins in NPInter has identifiers from NonCode(NR_, ENST),
    # RefSeq(NM_), MirBase(miR) and UniProt.
    unmapped = {}
    mapped = {}
    unc = 0
    mac = 0
    # Per-species sub-dictionaries; fall back to an empty dict when the
    # species has no entry (dict.get replaces the deprecated dict.has_key).
    UniProtdic = UP_dic.get(organismIdMap[specie], {})
    RefSeqdic = NM_dic.get(organismIdMap[specie], {})
def integrate_all_prediction_tools():
    """Read, benchmark and integrate the miRNA target prediction tools.

    Steps, in order:

    1. Build the identifier mappers (gene -> Ensembl, unique miRNA mapper and
       alias -> STRING mapper for seven fixed taxonomy IDs).
    2. Call ``read_starmirdb`` / ``read_predictions`` once per prediction
       source. RNA22 and RNAhybrid are only read when ``args.run_all`` is
       set, and they are never part of the integration below.
    3. Group organisms by the combination of tools that ``species_covered``
       reports for them and call ``stringrnautils.combine_masterfiles`` once
       per combination, using ``tool_parameters`` overrides where defined.
    4. Copy, from every combined master file, the lines whose first
       tab-separated column is one of that combination's species into
       ``MASTER_FILE_DIR/predictions.tsv``.
    5. Delete the per-tool and per-combination master files.

    Relies on the module-level names ``args``, ``LOCAL_DATA_PATH``,
    ``MASTER_FILE_DIR`` and ``gold_standard_file_path``.
    """
    # Define dictionaries
    #--------------------
    gene2ensembl = stringrnautils.map_gene_2_enemble(
        os.path.join(LOCAL_DATA_PATH, 'gene2ensembl.gz'))
    stringrnautils.integrate_NM_dictionary(gene2ensembl)
    mir_mapper = stringrnautils.get_unique_mir_mapper()
    string_mapper = stringrnautils.get_alias_to_string_mapper(
        ['9606', '10090', '7955', '10116', '7227', '6239', '3702'],
        '', '', 10, 'all')

    # Read data and benchmark
    #--------------------------
    # starmirdb - may decide to exclude this one
    read_starmirdb(mir_mapper, string_mapper)

    # miRanda
    read_predictions("miRanda_v3.3a.tsv.gz", {}, mir_mapper, string_mapper,
                     tax_idx=0, mir_idx=1, target_idx=2, score_idx=4,
                     increasing=False, window_size=1000, name="miRanda",
                     ignore_fraction=0.7, has_header=True)

    # miRDB
    read_predictions("miRDB_v5.0.tsv.gz", gene2ensembl, mir_mapper,
                     string_mapper,
                     tax_idx=0, mir_idx=1, target_idx=2, score_idx=3,
                     increasing=True, window_size=75, name="miRDB",
                     ignore_fraction=0.0, has_header=True)

    # PITA
    read_predictions("PITA.tsv.gz", {}, mir_mapper, string_mapper,
                     tax_idx=0, mir_idx=2, target_idx=1, score_idx=4,
                     increasing=False, window_size=500, name="PITA",
                     ignore_fraction=0.0, has_header=True)

    # RNA22 - excluded due to poor performance
    if args.run_all:
        read_predictions("RNA22.tsv.gz", {}, mir_mapper, string_mapper,
                         tax_idx=0, mir_idx=1, target_idx=2, score_idx=3,
                         increasing=True, window_size=50, name="RNA22",
                         ignore_fraction=0.2, has_header=True,
                         do_benchmark=False)

    # RNAhybrid - excluded due to poor performance
    if args.run_all:
        read_predictions("RNAhybrid_seed.tsv.gz", {}, mir_mapper,
                         string_mapper,
                         tax_idx=0, mir_idx=1, target_idx=2, score_idx=3,
                         increasing=False, window_size=50,
                         name="RNAhybrid_seed", ignore_fraction=0.2,
                         has_header=False, do_benchmark=False)

    # Targetscan
    read_predictions("targetscan.mammals.tsv.gz", {}, mir_mapper,
                     string_mapper,
                     tax_idx=0, mir_idx=1, target_idx=2, score_idx=4,
                     increasing=False, window_size=50, name="targetscan",
                     ignore_fraction=0.50, has_header=True)

    # integrate prediction tools
    #--------------------
    prediction_tools = ('starmirdb', 'miRanda', 'targetscan', 'miRDB', 'PITA')

    # organism -> list of tools whose master file covers that organism
    organism_to_tool = {}
    for tool in prediction_tools:
        organisms = species_covered(
            os.path.join(MASTER_FILE_DIR, '{0}.tsv'.format(tool)))
        for organism in organisms:
            organism_to_tool.setdefault(organism, []).append(tool)

    # A tool combination is named by joining its sorted tool names with
    # '_and_'; organisms covered by the same tools share one combination.
    tool_combinations = set()
    tool_combinations_to_species = {}
    for organism, tools in organism_to_tool.items():
        combination_name = '_and_'.join(sorted(tools))
        tool_combinations.add(combination_name)
        tool_combinations_to_species.setdefault(
            combination_name, set()).add(organism)

    # Combination-specific overrides of default_tool_parameters.
    tool_parameters = {
        "PITA_and_miRDB_and_miRanda_and_targetscan": {
            'negative_evidence': False,
            'rebenchmark_everything': True,
            'ignore_fraction': 0.0,
            'window_size': 110,
            'unlink_master_files': False
        },
        "PITA_and_miRanda": {
            'negative_evidence': False,
            'rebenchmark_everything': True,
            'ignore_fraction': 0.0,
            'window_size': 200,
            'unlink_master_files': False
        }
    }
    default_tool_parameters = {
        'negative_evidence': False,
        'rebenchmark_everything': True,
        'ignore_fraction': 0.60,
        'window_size': 75,
        'unlink_master_files': False
    }

    # generate organism specific callibration curves
    # Every master file listed here is deleted at the end of the function.
    new_master_files = ['{0}.tsv'.format(p) for p in prediction_tools]
    predictions_path = os.path.join(MASTER_FILE_DIR, 'predictions.tsv')
    # The context manager guarantees predictions.tsv is flushed and closed
    # before the temporary master files are removed, also on errors.
    with open(predictions_path, 'w') as predictions_master_file:
        for tool_combination in tool_combinations:
            source_master_files = ('{0}.tsv'.format(t)
                                   for t in tool_combination.split('_and_'))
            destination_name = 'predictions_subset_{0}'.format(
                tool_combination)
            destination_master_file = 'predictions_subset_{0}.tsv'.format(
                tool_combination)

            parameters = default_tool_parameters.copy()
            if tool_combination in tool_parameters:
                parameters.update(tool_parameters[tool_combination])

            new_master_files.append(destination_master_file)
            stringrnautils.combine_masterfiles(
                source_master_files, destination_master_file,
                gold_standard_file_path, destination_name, **parameters)

            # generate/append relevant species to predictions.tsv
            # NOTE(review): the first column is converted with int() before
            # the membership test, so species_covered presumably yields
            # integer taxonomy IDs - confirm.
            species = tool_combinations_to_species[tool_combination]
            combined_path = os.path.join(MASTER_FILE_DIR,
                                         destination_master_file)
            with open(combined_path) as combined_master_file:
                for line in combined_master_file:
                    if int(line.split('\t', 1)[0]) in species:
                        predictions_master_file.write(line)

    # delete all the tmp master files
    for master_file in new_master_files:
        os.unlink(os.path.join(MASTER_FILE_DIR, master_file))
# print taxonomy distribution taxonomy_id_to_occurrences = Counter( [intact._org for intact in gold_standard_interactions]) sorted_taxonomy_ids_occurrences = sorted(taxonomy_id_to_occurrences.items(), key=operator.itemgetter(1), reverse=True) logger.info('Taxonomy identifier\tOccurrences') logger.info('--------------------------------') for tax_id, occ in sorted_taxonomy_ids_occurrences: logger.info('%s\t%d' % (tax_id, occ)) # alias mappers used to determine protein_alias_mapper = get_string_to_alias_mapper( taxonomy_id_to_occurrences.keys(), '', '', 10, 'all', True) mir_alias_mapper = get_unique_mir_mapper() ncrna_alias_mapper = get_non_coding_rna_alias_mapper() def get_entity_type(entity_name, taxonomy_id): tax_protein_alias_mapper = protein_alias_mapper[taxonomy_id] entity_is_ncrna = entity_name in ncrna_alias_mapper[taxonomy_id] if entity_is_ncrna: return EntityType.ncRNA entity_is_protein = entity_name in tax_protein_alias_mapper if entity_is_protein: return EntityType.Protein entity_is_mirna = entity_name in mir_alias_mapper if entity_is_mirna: return EntityType.miRNA
source = cols[4] ent1, ent2 = sorted((ent1_unsort, ent2_unsort)) tax_to_interactions[tax].add((ent1, ent2, source)) evidence_channel_to_table_name = { 'database': 'Curated', 'experimental': 'Experiments', 'prediction': 'Predictions', 'textmining': 'Text mining' } all_taxonomy_ids = list(tax_to_interactions.keys()) protein_alias_mapper = stringrnautils.get_string_to_alias_mapper( all_taxonomy_ids, '', '', 10, 'all', True) mir_alias_mapper = stringrnautils.get_unique_mir_mapper() # load ncRNA dictionary ncrna_mapper = stringrnautils.get_non_coding_rna_alias_mapper( ) # taxonomy ID -> ncRNA alias -> ncRNA identifier tax_interaction_count = [] tax_to_miRNA_mRNA_interactions = collections.defaultdict(set) tax_to_ncRNA_protein_interactions = collections.defaultdict(set) tax_to_ncRNA_ncRNA_interactions = collections.defaultdict(set) tax_to_miRNAs = collections.defaultdict(set) tax_to_ncRNAs = collections.defaultdict(set) tax_to_proteins = collections.defaultdict(set) tax_to_source_to_interactions = {} for tax, interaction_set in tax_to_interactions.iteritems(): tax_interaction_count.append((tax, len(interaction_set)))
'Solanum lycopersicum': '4081', 'Sus scrofa': '9832', 'Taeniopygia guttata': '59729' } def get_assay_mapping(assay_mapping_path): assay_dict = {} with open(assay_mapping_path, 'r') as fin: for curr_line in fin: split_cols = curr_line.rstrip().split('\t') assay_dict[split_cols[0]] = split_cols[1] return assay_dict miRNA2Clean = stringrnautils.get_unique_mir_mapper() miRNA2taxonomyID = stringrnautils.get_mir_id_to_tax_id_mapper() targetName2targetID = stringrnautils.get_alias_to_string_mapper( organisms=uniqueSpeciesMap.values(), filter_string_alias='', filter_string_id='') restricted_pmids = stringrnautils.starbase_exp_pmids() assayMappingFile = os.path.join(DATA_PATH, 'miRTarBase_assay_mapping.tsv' ) # Maps assay names to 'cleaned' assay names assay2Clean = get_assay_mapping(assayMappingFile) not_mapped = 0 totalCount = 0 # Maps a certain interaction to a set of experiments supporting that interaction, the PubMedIDs and the evidence levels
def create_mir_ortholog_lists():
    """
    Determines groups of orthologous miRNAs by the following steps:

    1) miRBase family annotation (miFam.dat.gz) determines which miRNA
       precursors belong to one orthologous group
    2) miRNA precursors in each group are replaced by the mature miRNAs
       processed from them
    3) only those mature miRNAs that are contained in the set of RAIN
       identifiers are retained
    4) 5prime and 3prime mature miRNAs in the same orthologous group are
       sorted into two classes; retained mature miRNAs ending in neither
       '-5p' nor '-3p' are only logged and not returned

    :return: a dict mapping '<family accession>-5p' and
        '<family accession>-3p' to a set of (taxonomy ID, mature miRNA
        identifier) tuples; both keys exist for every family, possibly with
        an empty set
    :raises ValueError: if the family file contains an unknown tag, a family
        without accession, or a precursor assigned to more than one family
    :raises IOError: if a mature miRNA cannot be mapped to a taxonomy ID
    """
    # used to check condition 3)
    unique_mir_mapper = stringrnautils.get_unique_mir_mapper()

    # step 1)
    # a list of miRNA families where each family is represented by a set of
    # miRNA precursor names, e.g.,
    # [{'hsa-mir-17', 'hsa-mir-18a', ...} ,
    #  {'cel-let-7', 'hsa-let-7a-1', 'hsa-let-7a-2' , ...}, .., ]
    precursor_mir_families_list = []
    # a list of miRNA family accessions, parallel to the list above
    mir_family_accessions = []
    # for sanity checking if a mir precursor is assigned to more than one
    # family
    mirs_already_added_to_family = set()
    with gzip.open(MIR_FAMILY_FILE, 'rb') as mir_fam_file:
        current_family_members = set()
        fam_accession = None
        for line in mir_fam_file:
            cols = line.rstrip('\r\n').split()
            tag = cols[0]
            if tag == 'AC':  # family accession
                assert len(cols) == 2
                fam_accession = cols[1]
            elif tag == 'ID':  # family identifier (not used)
                assert len(cols) == 2
            elif tag == 'MI':  # accession and identifier of family member
                assert len(cols) == 3
                # cols[1] is the member accession, cols[2] its identifier
                fam_member_identifier = cols[2]
                current_family_members.add(fam_member_identifier)
            elif line.startswith('//'):  # end of family
                # sanity check
                for mir in current_family_members:
                    if mir in mirs_already_added_to_family:
                        raise ValueError(
                            'miRNA precursor %s is assigned to more than one miRBase family.'
                            % mir)
                mirs_already_added_to_family.update(current_family_members)
                precursor_mir_families_list.append(current_family_members)
                if fam_accession:
                    mir_family_accessions.append(fam_accession)
                else:
                    raise ValueError('Missing family accession.')
                # reset the per-family state for the next record
                current_family_members = set()
                fam_accession = None
            else:
                raise ValueError(
                    'Could not identify tag %s in line %s of file %s.' %
                    (tag, line, MIR_FAMILY_FILE))
    logger.info(
        'Found %i miRNA families in miRBase %s containing a total of %i miRNAs.'
        % (len(precursor_mir_families_list), MIRBASE_VERSION,
           len(mirs_already_added_to_family)))

    # maps each taxonomy ID and precursor identifier to the identifiers of
    # the mature miRNAs being made from it, e.g.,:
    # taxonomy_mir_precursor_to_mature_mapper[('9606', 'hsa-mir-17')]
    #     -> ['hsa-miR-17-5p', 'hsa-miR-17-3p']
    taxonomy_mir_precursor_to_mature_mapper = \
        __get_mir_precursor_to_mature_mapper__()
    # maps each mature miRNA to its taxonomy ID and is needed to produce the
    # final species aware ortholog list. Example:
    # mature_to_taxonomy_mapper['hsa-miR-17-5p'] -> '9606'
    mature_to_taxonomy_mapper = {}
    # precursors in miR families are not linked to taxonomy ID, so create
    # mapping dictionary of following form:
    # mir_precursor_to_mature_mapper['hsa-mir-17']
    #     -> ['hsa-miR-17-5p', 'hsa-miR-17-3p']
    mir_precursor_to_mature_mapper = {}
    for taxonomy_precursor, mature_list in \
            taxonomy_mir_precursor_to_mature_mapper.items():
        taxonomy_id, precursor_id = taxonomy_precursor
        for mature_id in mature_list:
            mature_to_taxonomy_mapper[mature_id] = taxonomy_id
        mir_precursor_to_mature_mapper[precursor_id] = mature_list

    # step 2)
    mir_precursors_not_mapped = set()
    mature_mir_families_list = []
    for mir_family in precursor_mir_families_list:
        mature_mirs_in_family = set()
        for mir_precursor in mir_family:
            if mir_precursor in mir_precursor_to_mature_mapper:
                mature_mirs_in_family.update(
                    mir_precursor_to_mature_mapper[mir_precursor])
            else:
                mir_precursors_not_mapped.add(mir_precursor)
        mature_mir_families_list.append(mature_mirs_in_family)
    logger.debug('Following miRNAs could not be mapped to mature miRNA(s): ' +
                 ', '.join(sorted(mir_precursors_not_mapped)))

    # steps 3) and 4)
    ends_in_5p = set()
    ends_in_3p = set()
    ends_not_in_3p_or_5p = set()
    # Built once as a set so that the membership test in the loop below is a
    # constant-time lookup instead of a scan over all mapper values.
    final_mir_identifiers = set(unique_mir_mapper.values())
    # replace each mature miRNA ID by a tuple containing its taxonomy ID and
    # the mature miRNA ID itself
    taxonomy_five_prime_mature_mir_families_list = []
    taxonomy_three_prime_mature_mir_families_list = []
    fam_members_not_mapped = set()
    for family in mature_mir_families_list:
        five_prime_fam_members = set()
        three_prime_fam_members = set()
        for fam_member in family:
            if fam_member not in mature_to_taxonomy_mapper:
                raise IOError(
                    'Mature miRNA could not be mapped to taxonomy ID: ' +
                    fam_member)
            taxonomy_id = mature_to_taxonomy_mapper[fam_member]
            if fam_member not in final_mir_identifiers:
                # step 3): not a RAIN identifier, so it is dropped
                fam_members_not_mapped.add(fam_member)
            elif fam_member.endswith('-5p'):
                five_prime_fam_members.add((taxonomy_id, fam_member))
                ends_in_5p.add(fam_member)
            elif fam_member.endswith('-3p'):
                three_prime_fam_members.add((taxonomy_id, fam_member))
                ends_in_3p.add(fam_member)
            else:
                ends_not_in_3p_or_5p.add(fam_member)
        # appended for every family, even when empty, so that both lists
        # stay index-aligned with mir_family_accessions
        taxonomy_five_prime_mature_mir_families_list.append(
            five_prime_fam_members)
        taxonomy_three_prime_mature_mir_families_list.append(
            three_prime_fam_members)
    logger.info('Following miRNAs were not found in RAIN identifiers: ' +
                ', '.join(sorted(fam_members_not_mapped)))
    logger.info('Mature miRNAs ending in -5p: %i' % len(ends_in_5p))
    logger.info('Mature miRNAs ending in -3p: %i' % len(ends_in_3p))
    logger.debug('Following miRNAs did neither end in -5p nor on -3p: ' +
                 ', '.join(ends_not_in_3p_or_5p))

    assert (2 * len(mir_family_accessions)) == \
        (len(taxonomy_five_prime_mature_mir_families_list) +
         len(taxonomy_three_prime_mature_mir_families_list))
    assert len(taxonomy_five_prime_mature_mir_families_list) == len(
        taxonomy_three_prime_mature_mir_families_list)

    # finally map all family accessions (split into 5p and 3p) families to
    # the mature miRs belonging to this family.
    # Example: fam_map['MIPF0000001-5p'] = set(('9606', 'hsa-miR-17-5p'), ... )
    fam_map = {}
    for i, mir_fam_acc in enumerate(mir_family_accessions):
        fam_map[mir_fam_acc + '-5p'] = \
            taxonomy_five_prime_mature_mir_families_list[i]
        fam_map[mir_fam_acc + '-3p'] = \
            taxonomy_three_prime_mature_mir_families_list[i]
    return fam_map