Example #1
########## STRING dictionary for ENSP-like conversions
def getSTRINGdic(specie):  # ENSG/ENSP conversion for RefSeq NM_ mRNAs
    STRING_dic = stringrnautils.get_alias_to_string_mapper(
        organisms=organismIdMap[specie],
        filter_string_alias='',
        filter_string_id='')
    return STRING_dic
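
# Hypothetical usage sketch (assumes organismIdMap contains 'Homo sapiens' -> '9606' and that the
# returned mapper resolves aliases such as RefSeq NM_ accessions to ENSP-style STRING identifiers):
#   human_string_dic = getSTRINGdic('Homo sapiens')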


######### Parse with Species Name ################################################
UP_dic = stringrnautils.getUniProtDic(BIOMART_DIC_PATH)
NM_dic = stringrnautils.getRefSeqNMdic(BIOMART_DIC_PATH)
NR_dic = stringrnautils.getRefSeqNRdic(BIOMART_DIC_PATH, GENENAME_DIC_PATH)
NONCODE_dic = stringrnautils.getNONCODEdic(NONCODE_DIC_PATH, BIOMART_DIC_PATH,
                                           GENENAME_DIC_PATH)
MB_dic = stringrnautils.get_unique_mir_mapper()


def ParseNPINTER(specie='Homo sapiens'):
    # defaults to Homo sapiens if no species is given
    # RNAs and proteins in NPInter have identifiers from NONCODE (NR_, ENST), RefSeq (NM_), miRBase (miR) and UniProt.
    unmapped = {}
    mapped = {}
    unc = 0
    mac = 0
    UniProtdic = {}
    if organismIdMap[specie] in UP_dic:
        UniProtdic = UP_dic[organismIdMap[specie]]
    RefSeqdic = {}
    if organismIdMap[specie] in NM_dic:
        RefSeqdic = NM_dic[organismIdMap[specie]]
def integrate_all_prediction_tools():
    # Define dictionaries
    #--------------------
    gene2ensembl = stringrnautils.map_gene_2_enemble(os.path.join(LOCAL_DATA_PATH, 'gene2ensembl.gz'))
    stringrnautils.integrate_NM_dictionary(gene2ensembl)

    mir_mapper = stringrnautils.get_unique_mir_mapper()
    string_mapper = stringrnautils.get_alias_to_string_mapper(
        ['9606', '10090', '7955', '10116', '7227', '6239', '3702'], '', '', 10, 'all')

    # Read data and benchmark
    #--------------------------
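
    # Note: read_predictions (defined elsewhere in this script) is assumed to parse a gzipped TSV of
    # miRNA-target predictions; tax_idx/mir_idx/target_idx/score_idx are assumed to give the 0-based
    # columns holding taxonomy ID, miRNA, target and score, increasing to state whether larger scores
    # mean stronger predictions, window_size and ignore_fraction to tune how the raw scores are
    # benchmarked, and do_benchmark=False to keep a tool out of the benchmarked output.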

    # starmirdb - may decide to exclude this one
    read_starmirdb(mir_mapper, string_mapper)

    # miRanda
    read_predictions( "miRanda_v3.3a.tsv.gz", {}, mir_mapper, string_mapper,
                      tax_idx=0, mir_idx=1,target_idx=2,score_idx=4,
                      increasing=False, window_size=1000, name="miRanda",
                      ignore_fraction=0.7, has_header=True )

    # miRDB
    read_predictions( "miRDB_v5.0.tsv.gz", gene2ensembl, mir_mapper, string_mapper,
                      tax_idx=0, mir_idx=1,target_idx=2,score_idx=3,
                      increasing=True, window_size=75, name="miRDB",
                      ignore_fraction=0.0, has_header=True )

    # PITA
    read_predictions( "PITA.tsv.gz", {}, mir_mapper, string_mapper,
                      tax_idx=0, mir_idx=2,target_idx=1,score_idx=4,
                      increasing=False, window_size=500, name="PITA",
                      ignore_fraction=0.0, has_header=True )

    # RNA22 - excluded due to poor performance
    if args.run_all:
        read_predictions( "RNA22.tsv.gz", {}, mir_mapper, string_mapper,
                          tax_idx=0, mir_idx=1,target_idx=2,score_idx=3,
                          increasing=True, window_size=50, name="RNA22",
                          ignore_fraction=0.2, has_header=True,do_benchmark=False)

    # RNAhybrid - excluded due to poor performance
    if args.run_all:
        read_predictions( "RNAhybrid_seed.tsv.gz", {}, mir_mapper, string_mapper,
                          tax_idx=0, mir_idx=1,target_idx=2,score_idx=3,
                          increasing=False, window_size=50, name="RNAhybrid_seed",
                          ignore_fraction=0.2, has_header=False, do_benchmark=False)

    # Targetscan
    read_predictions( "targetscan.mammals.tsv.gz", {}, mir_mapper, string_mapper,
                      tax_idx=0, mir_idx=1,target_idx=2,score_idx=4,
                      increasing=False, window_size=50, name="targetscan",
                      ignore_fraction=0.50, has_header=True )

    # integrate prediction tools
    #--------------------
    prediction_tools = ('starmirdb', 'miRanda', 'targetscan', 'miRDB', 'PITA')
    organism_to_tool = {}
    for tool in prediction_tools:
        organisms = species_covered(os.path.join(MASTER_FILE_DIR,'{0}.tsv'.format(tool)))
        for organism in organisms:
            organism_to_tool.setdefault(organism, []).append(tool)

    tool_combinations = set()
    tool_combinations_to_species = {}
    for organism, tools in list(organism_to_tool.items()):
        tools = '_and_'.join(sorted(tools))
        tool_combinations.add(tools)
        organism_to_tool[organism] = tools
        tool_combinations_to_species.setdefault(tools, set()).add(organism)
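
    # Hypothetical illustration of the bookkeeping above: if organism '9606' were covered by PITA,
    # miRDB, miRanda and targetscan, then organism_to_tool['9606'] would become
    # 'PITA_and_miRDB_and_miRanda_and_targetscan' and '9606' would be added to
    # tool_combinations_to_species under that same key.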

    tool_parameters = {
        "PITA_and_miRDB_and_miRanda_and_targetscan": {
            'negative_evidence': False,
            'rebenchmark_everything': True,
            'ignore_fraction': 0.0,
            'window_size': 110,
            'unlink_master_files': False
        },
        "PITA_and_miRanda": {
            'negative_evidence': False,
            'rebenchmark_everything': True,
            'ignore_fraction': 0.0,
            'window_size': 200,
            'unlink_master_files': False
        }
    }

    default_tool_parameters = {
        'negative_evidence': False,
        'rebenchmark_everything': True,
        'ignore_fraction': 0.60,
        'window_size': 75,
        'unlink_master_files': False
    }

    # generate organism-specific calibration curves
    predictions_master_file = open(os.path.join(MASTER_FILE_DIR, 'predictions.tsv'), 'w')
    new_master_files = ['{0}.tsv'.format(p) for p in prediction_tools]

    for tool_combination in tool_combinations:
        source_master_files = ('{0}.tsv'.format(t) for t in tool_combination.split('_and_'))
        destination_name = 'predictions_subset_{0}'.format(tool_combination)
        destination_master_file = 'predictions_subset_{0}.tsv'.format(tool_combination)

        parameters = default_tool_parameters.copy()
        if tool_combination in tool_parameters:
            parameters.update(tool_parameters[tool_combination])

        new_master_files.append(destination_master_file)
        stringrnautils.combine_masterfiles(source_master_files, destination_master_file,
                                           gold_standard_file_path, destination_name,
                                           **parameters)

        # generate/append relevant species to predictions.tsv
        species = tool_combinations_to_species[tool_combination]
        for line in open(os.path.join(MASTER_FILE_DIR, destination_master_file)):
            if int(line.split('\t', 1)[0]) in species:
                predictions_master_file.write(line)

    predictions_master_file.close()

    # delete all the tmp master files
    for master_file in new_master_files:
        os.unlink(os.path.join(MASTER_FILE_DIR, master_file))
# print taxonomy distribution
taxonomy_id_to_occurrences = Counter(
    [intact._org for intact in gold_standard_interactions])
sorted_taxonomy_ids_occurrences = sorted(taxonomy_id_to_occurrences.items(),
                                         key=operator.itemgetter(1),
                                         reverse=True)
logger.info('Taxonomy identifier\tOccurrences')
logger.info('--------------------------------')
for tax_id, occ in sorted_taxonomy_ids_occurrences:
    logger.info('%s\t%d' % (tax_id, occ))

# alias mappers used to determine the type of each interacting entity (see get_entity_type below)
protein_alias_mapper = get_string_to_alias_mapper(
    taxonomy_id_to_occurrences.keys(), '', '', 10, 'all', True)
mir_alias_mapper = get_unique_mir_mapper()
ncrna_alias_mapper = get_non_coding_rna_alias_mapper()


def get_entity_type(entity_name, taxonomy_id):
    tax_protein_alias_mapper = protein_alias_mapper[taxonomy_id]
    entity_is_ncrna = entity_name in ncrna_alias_mapper[taxonomy_id]
    if entity_is_ncrna:
        return EntityType.ncRNA
    entity_is_protein = entity_name in tax_protein_alias_mapper
    if entity_is_protein:
        return EntityType.Protein
    entity_is_mirna = entity_name in mir_alias_mapper
    if entity_is_mirna:
        return EntityType.miRNA
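
# Hypothetical usage sketch; get_entity_type falls through to an implicit None when the name is not
# found in any of the alias mappers:
#   entity_type = get_entity_type('hsa-miR-21-5p', '9606')
#   # expected to be EntityType.miRNA, provided the alias is known to the miRBase mapper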
Example #4
        source = cols[4]
        ent1, ent2 = sorted((ent1_unsort, ent2_unsort))
        tax_to_interactions[tax].add((ent1, ent2, source))

evidence_channel_to_table_name = {
    'database': 'Curated',
    'experimental': 'Experiments',
    'prediction': 'Predictions',
    'textmining': 'Text mining'
}

all_taxonomy_ids = list(tax_to_interactions.keys())
protein_alias_mapper = stringrnautils.get_string_to_alias_mapper(
    all_taxonomy_ids, '', '', 10, 'all', True)

mir_alias_mapper = stringrnautils.get_unique_mir_mapper()

# load ncRNA dictionary
# taxonomy ID -> ncRNA alias -> ncRNA identifier
ncrna_mapper = stringrnautils.get_non_coding_rna_alias_mapper()

tax_interaction_count = []
tax_to_miRNA_mRNA_interactions = collections.defaultdict(set)
tax_to_ncRNA_protein_interactions = collections.defaultdict(set)
tax_to_ncRNA_ncRNA_interactions = collections.defaultdict(set)
tax_to_miRNAs = collections.defaultdict(set)
tax_to_ncRNAs = collections.defaultdict(set)
tax_to_proteins = collections.defaultdict(set)
tax_to_source_to_interactions = {}
for tax, interaction_set in tax_to_interactions.items():
    tax_interaction_count.append((tax, len(interaction_set)))
Example #5
    'Solanum lycopersicum': '4081',
    'Sus scrofa': '9823',
    'Taeniopygia guttata': '59729'
}


def get_assay_mapping(assay_mapping_path):
    assay_dict = {}
    with open(assay_mapping_path, 'r') as fin:
        for curr_line in fin:
            split_cols = curr_line.rstrip().split('\t')
            assay_dict[split_cols[0]] = split_cols[1]
    return assay_dict
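
# get_assay_mapping expects a two-column, tab-separated file whose first column holds the raw assay
# name and whose second column holds the cleaned name, e.g. (hypothetical rows):
#   Luciferase reporter assay<TAB>Reporter assay
#   qRT-PCR//Western blot<TAB>qRT-PCR;Western blot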


miRNA2Clean = stringrnautils.get_unique_mir_mapper()
miRNA2taxonomyID = stringrnautils.get_mir_id_to_tax_id_mapper()
targetName2targetID = stringrnautils.get_alias_to_string_mapper(
    organisms=uniqueSpeciesMap.values(),
    filter_string_alias='',
    filter_string_id='')
restricted_pmids = stringrnautils.starbase_exp_pmids()

# Maps assay names to 'cleaned' assay names
assayMappingFile = os.path.join(DATA_PATH, 'miRTarBase_assay_mapping.tsv')
assay2Clean = get_assay_mapping(assayMappingFile)

not_mapped = 0
totalCount = 0

# Maps a certain interaction to a set of experiments supporting that interaction, the PubMedIDs and the evidence levels
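# (hypothetical sketch of that structure; the code that populates it is not part of this excerpt)
#   interaction2experiments = collections.defaultdict(set)
#   # (miRNA ID, target ID) -> {(assay, pubmed_id, evidence_level), ...}
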
def create_mir_ortholog_lists():
    """
    Determines groups of miRNA orthologs via the following steps:
    1) miRBase family annotation (miFam.dat.gz) determines which miRNA precursors belong to one orthologous group
    2) miRNA precursors in each group are replaced by the mature miRNAs processed from them
    3) only those mature miRNAs that are contained in the set of RAIN identifiers are retained
    4) 5' (-5p) and 3' (-3p) mature miRNAs in the same orthologous group are sorted into two separate classes

    :return: a dict mapping strings to sets - keys are names of miRNA orthologous groups (miRBase family accessions
             suffixed with '-5p' or '-3p') and each value is a set of (taxonomy ID, mature miRBase identifier) tuples
             of orthologous mature miRNAs
    """
    # used to check condition 2)
    unique_mir_mapper = stringrnautils.get_unique_mir_mapper()

    # step 1)
    # a list of miRNA families where each family is represented by a set of miRNA precursor names, e.g.,
    # [{'hsa-mir-17', 'hsa-mir-18a', ...} , {'cel-let-7', 'hsa-let-7a-1', 'hsa-let-7a-2' , ...}, .., ]
    precursor_mir_families_list = []
    # a list of miRNA family accessions
    mir_family_accessions = []
    # for sanity checking if a mir precursor is assigned to more than one family
    mirs_already_added_to_family = set()
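
    # The miFam.dat records parsed below are assumed to follow this rough layout (hypothetical excerpt):
    #   AC   MIPF0000002
    #   ID   let-7
    #   MI   MI0000001  cel-let-7
    #   MI   MI0000060  hsa-let-7a-1
    #   //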
    with gzip.open(MIR_FAMILY_FILE, 'rb') as mir_fam_file:
        current_family_members = set()
        fam_accession = None
        for line in mir_fam_file:
            cols = line.rstrip('\r\n').split()
            tag = cols[0]
            if tag == 'AC':  # family accession
                assert len(cols) == 2
                fam_accession = cols[1]
            elif tag == 'ID':  # family identifier
                assert len(cols) == 2
                # fam_identifier = cols[1]
            elif tag == 'MI':  # accession and identifier of family member
                assert len(cols) == 3
                # fam_member_accession = cols[1]
                fam_member_identifier = cols[2]
                current_family_members.add(fam_member_identifier)
            elif line.startswith('//'):  # end of family
                # sanity check
                for mir in current_family_members:
                    if mir in mirs_already_added_to_family:
                        raise ValueError(
                            'miRNA precursor %s is assigned to more than one miRBase family.'
                            % mir)
                mirs_already_added_to_family.update(current_family_members)
                precursor_mir_families_list.append(current_family_members)
                if fam_accession:
                    mir_family_accessions.append(fam_accession)
                else:
                    raise ValueError('Missing family accession.')
                current_family_members = set()
                fam_accession = None
            else:
                raise ValueError(
                    'Could not identify tag %s in line %s of file %s.' %
                    (tag, line, MIR_FAMILY_FILE))
    logger.info(
        'Found %i miRNA families in miRBase %s containing a total of %i miRNAs.'
        % (len(precursor_mir_families_list), MIRBASE_VERSION,
           len(mirs_already_added_to_family)))

    # maps each taxonomy ID and precursor identifier to the identifiers of the mature miRNAs being made from it, e.g.,:
    # mir_precursor_to_mature_mapper[('9606', 'hsa-mir-17')] -> ['hsa-miR-17-5p', 'hsa-miR-17-3p']
    taxonomy_mir_precursor_to_mature_mapper = __get_mir_precursor_to_mature_mapper__()

    # maps each mature miRNA to its taxonomy ID and is needed to produce the final species-aware ortholog list. Example:
    # mature_to_taxonomy_mapper['hsa-miR-17-5p'] -> '9606'
    mature_to_taxonomy_mapper = {}
    # precursors in miR families are not linked to taxonomy ID, so create mapping dictionary of following form:
    # mir_precursor_to_mature_mapper['hsa-mir-17'] -> ['hsa-miR-17-5p', 'hsa-miR-17-3p']
    mir_precursor_to_mature_mapper = {}
    for taxonomy_precursor, mature_list in taxonomy_mir_precursor_to_mature_mapper.items():
        taxonomy_id, precursor_id = taxonomy_precursor
        for mature_id in mature_list:
            mature_to_taxonomy_mapper[mature_id] = taxonomy_id
        mir_precursor_to_mature_mapper[precursor_id] = mature_list

    # step 2)
    mir_precursors_not_mapped = set()
    mature_mir_families_list = []
    for mir_family in precursor_mir_families_list:
        mature_mirs_in_family = set()
        for mir_precursor in mir_family:
            # step 3)
            if mir_precursor in mir_precursor_to_mature_mapper:
                mature_mirs = mir_precursor_to_mature_mapper[mir_precursor]
                for mature_mir in mature_mirs:
                    mature_mirs_in_family.add(mature_mir)
            else:
                mir_precursors_not_mapped.add(mir_precursor)
        mature_mir_families_list.append(mature_mirs_in_family)
    logger.debug('The following miRNA precursors could not be mapped to mature miRNA(s): ' +
                 ', '.join(sorted(mir_precursors_not_mapped)))

    # step 4)
    ends_in_5p = set()
    ends_in_3p = set()
    ends_not_in_3p_or_5p = set()
    final_mir_identifiers = set(unique_mir_mapper.values())  # set for fast membership tests below

    # replace each mature miRNA ID by a tuple containing its taxonomy ID and the mature miRNA ID itself
    taxonomy_five_prime_mature_mir_families_list = []
    taxonomy_three_prime_mature_mir_families_list = []
    fam_members_not_mapped = set()
    for family in mature_mir_families_list:
        five_prime_fam_members = set()
        three_prime_fam_members = set()
        for fam_member in family:
            if fam_member not in mature_to_taxonomy_mapper:
                raise IOError(
                    'Mature miRNA could not be mapped to taxonomy ID: ' +
                    fam_member)
            taxonomy_id = mature_to_taxonomy_mapper[fam_member]

            if fam_member not in final_mir_identifiers:
                fam_members_not_mapped.add(fam_member)
            elif fam_member.endswith('-5p'):
                five_prime_fam_members.add((taxonomy_id, fam_member))
                ends_in_5p.add(fam_member)
            elif fam_member.endswith('-3p'):
                three_prime_fam_members.add((taxonomy_id, fam_member))
                ends_in_3p.add(fam_member)
            else:
                ends_not_in_3p_or_5p.add(fam_member)
        taxonomy_five_prime_mature_mir_families_list.append(five_prime_fam_members)
        taxonomy_three_prime_mature_mir_families_list.append(three_prime_fam_members)
    logger.info('The following miRNAs were not found among the RAIN identifiers: ' +
                ', '.join(sorted(fam_members_not_mapped)))
    logger.info('Mature miRNAs ending in -5p: %i' % len(ends_in_5p))
    logger.info('Mature miRNAs ending in -3p: %i' % len(ends_in_3p))
    logger.debug('The following miRNAs ended in neither -5p nor -3p: ' +
                 ', '.join(ends_not_in_3p_or_5p))

    assert (2 * len(mir_family_accessions)) == \
           (len(taxonomy_five_prime_mature_mir_families_list) + len(taxonomy_three_prime_mature_mir_families_list))
    assert len(taxonomy_five_prime_mature_mir_families_list) == len(
        taxonomy_three_prime_mature_mir_families_list)

    # finally map all family accessions (split into 5p and 3p) families to the mature miRs belonging to this family.
    # Example: fam_map['MIPF0000001-5p'] = set(('9606', 'hsa-miR-17-5p'), ... )
    fam_map = {}
    for i, mir_fam_acc in enumerate(mir_family_accessions):
        fam_map[mir_fam_acc + '-5p'] = taxonomy_five_prime_mature_mir_families_list[i]
        fam_map[mir_fam_acc + '-3p'] = taxonomy_three_prime_mature_mir_families_list[i]
    return fam_map
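
# Hypothetical usage sketch: emit one line per orthologous group, restricted to human mature miRNAs
# (assumes '9606' taxonomy identifiers appear in the returned tuples):
#   ortholog_groups = create_mir_ortholog_lists()
#   for group_name, members in ortholog_groups.items():
#       human_mirs = sorted(mir for tax, mir in members if tax == '9606')
#       if human_mirs:
#           print(group_name + '\t' + ';'.join(human_mirs))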