Exemplo n.º 1
0
def main():
    """Translate Croft miRNA-protein pairs from stdin into master-file rows on stdout.

    Every stdin line is split on tabs into exactly three fields: the miRNA,
    the protein identifier, and a third field that is ignored.  The miRNA is
    looked up in the unique-miRNA mapper; the protein is mapped either via
    its ENSG (version 8 -> version 10 tables) or directly via the
    ENSP -> ENSP version 10 table.  A row is written only when both lookups
    yield a truthy value and the original (protein, miRNA) pair has not been
    written before.  Pairs that could not be mapped are reported through
    ``logger.warning`` once all input has been consumed.
    """
    ############################################################################
    # create string mapping dictionaries
    ############################################################################
    ensp8_to_ensg = stringrnautils.get_string_to_alias_mapper('9606', 'ENSP', 'ENSG', 8)['9606']
    ensg_to_ensp10 = stringrnautils.get_alias_to_string_mapper('9606', 'ENSP', 'ENSG', 10)['9606']
    ensp_to_ensp10 = stringrnautils.get_alias_to_string_mapper('9606', 'ENSP', 'ENSP', 10)['9606']
    mir_hash = stringrnautils.get_unique_mir_mapper()

    # A manual override for one gene that died between versions used to live
    # here; it is kept disabled:
    # ensp8_to_ensg['ENSP00000308970'] = 'ENSG00000231924'
    # ensg_to_ensp9['ENSG00000231924'] = 'ENSP00000244296'
    seen_pairs = set()  # croft contains duplicates!!
    unmapped_mir_pairs = set()
    unmapped_ensp_pairs = set()

    for record in sys.stdin:
        mir, ensp, _ = record.rstrip().split('\t')
        pair_label = '({}, {})'.format(ensp, mir)

        # Map miRNA
        mir_known = mir in mir_hash
        new_mir = mir_hash[mir] if mir_known else None
        if not mir_known:
            unmapped_mir_pairs.add(pair_label)

        # Map ENSP: prefer the route through the gene ID, then the direct one
        new_ensp = None
        if ensp in ensp8_to_ensg and ensp8_to_ensg[ensp] in ensg_to_ensp10:
            new_ensp = ensg_to_ensp10[ensp8_to_ensg[ensp]]
        elif ensp in ensp_to_ensp10:
            new_ensp = ensp_to_ensp10[ensp]
        else:
            unmapped_ensp_pairs.add(pair_label)

        if not (new_ensp and new_mir):
            continue
        if (ensp, mir) in seen_pairs:
            continue

        fields = ("9606", new_mir, new_ensp, "0", "database", "0.900", "Croft", "", "")
        sys.stdout.write('\t'.join(fields) + '\n')
        seen_pairs.add((ensp, mir))

    if unmapped_mir_pairs:
        logger.warning(
            "Could not map {} miRNAs to IDs used in miRBase. Respective interactions were: {}".format(
                len(unmapped_mir_pairs), ', '.join(unmapped_mir_pairs)))

    if unmapped_ensp_pairs:
        logger.warning(
            "Could not map {} proteins to ENSPs used in STRING 10. Respective interactions were: {}".format(
                len(unmapped_ensp_pairs), ', '.join(unmapped_ensp_pairs)))
Exemplo n.º 2
0
def run():
    """Write the spliceosome interaction rows for every organism to the master file.

    For each entry of the module-level ``organisms`` two kinds of rows are
    appended to ``master_files/database_spliceosome.tsv``:

    * the triples returned by ``get_u_snrna_rna_rna_interactions()``;
    * one row per (U RNA, protein) pair, obtained by reading
      ``data/<organism>_celluar_component.tsv``, mapping each UniProt
      accession to a STRING ID and looking up each of its GO terms in the
      module-level ``goterm_to_urna``.

    Lines of the GO-term file that do not split into exactly two
    tab-separated fields are skipped, as are accessions without a mapping.
    """
    # TODO: UNCOMMENT BEFORE YOU COMMIT!!!
    # download()

    # The context manager guarantees the master file is flushed and closed,
    # also when processing one of the organisms raises.
    with open('master_files/database_spliceosome.tsv', 'w') as master_file:
        # TODO: map RNAs as well!!!
        for organism in organisms:
            print(' - generating interactions for {}'.format(organism))
            interactions = get_u_snrna_rna_rna_interactions()
            for ent1, ent2, url in interactions:
                _str = "\t".join(
                    (organism, ent1, ent2, "0", "DATABASE", "0.9", url, ''))
                master_file.write("{}\n".format(_str))

            uniprot_to_ensp = stringrnautils.get_alias_to_string_mapper(
                organism, '', '', 10)[organism]
            with open('data/{}_celluar_component.tsv'.format(
                    organism)) as go_terms_file:
                go_terms_file.readline()  # skip header
                for line in go_terms_file:
                    try:
                        uniprot_acc, goterms = line.rstrip().split('\t')
                    except ValueError:
                        # there are no go terms for this protein
                        continue

                    # The protein lookup does not depend on the GO term, so
                    # do it once per line and skip unmapped proteins early.
                    ensp = uniprot_to_ensp.get(uniprot_acc, None)
                    if not ensp:
                        continue

                    url = 'http://www.uniprot.org/uniprot/{}'.format(uniprot_acc)
                    for go_term in goterms.split('; '):
                        for u_rna in goterm_to_urna.get(go_term, []):
                            _line = "\t".join((organism, u_rna, ensp, "0",
                                               "DATABASE", "0.9", url, ''))
                            master_file.write("{}\n".format(_line))
Exemplo n.º 3
0
def run():
    """Print the interactions from supplementary tables S3, S4 and S6 as tab-separated rows.

    Gene symbols are translated with the human (9606) alias-to-STRING
    mapper; pairs with an unmappable symbol are reported and skipped.

    * S3 and S4 are merged into one PMID list per (sorted) protein pair.
      S3 pairs are credited to the publication itself (PMID 22365833), S4
      pairs to the PMIDs listed in their numbered columns.  One
      "Experiment"/"Litterature" row is printed per pair, carrying the
      number of collected PMIDs.
    * Every mappable S6 pair is printed as a "Luciferase" row with the
      fixed score 0.9.

    Missing local copies of the tables are downloaded first.
    """
    protein_mapper = stringrnautils.get_alias_to_string_mapper(
        "9606", 'ENSP', "")["9606"]

    # parse S3 and S4, use S4 as simply yet another PID supporting them (22365833)

    ############################################################
    # parse S3 and S4 into pmids and benchmark
    ############################################################

    pmid_evidence = collections.defaultdict(list)  # s3 and s4
    hegel_pmid = 22365833

    if not os.path.exists(TABEL_S3_LOCAL):
        urllib.urlretrieve(TABEL_S3_URL, TABEL_S3_LOCAL)
    s3_data = pd.read_excel(TABEL_S3_LOCAL, TABEL_S3_SHEET)

    for i in range(s3_data.shape[0]):
        gene_a = s3_data["SymbolA"][i]
        gene_b = s3_data["SymbolB"][i]

        try:
            prot_a = protein_mapper[gene_a]
            prot_b = protein_mapper[gene_b]

            key = tuple(sorted((prot_a, prot_b)))
            pmid_evidence[key].append(hegel_pmid)
        except KeyError:
            print("the interaction between {0} and {1} could not be mapped to string ids".format(
                gene_a, gene_b))

    if not os.path.exists(TABEL_S4_LOCAL):
        # Download from the URL to the local path, mirroring S3 and S6.  The
        # original passed (local path, sheet name) here, which can never
        # fetch the table.  Assumes TABEL_S4_URL is defined next to
        # TABEL_S3_URL / TABEL_S6_URL -- TODO confirm.
        urllib.urlretrieve(TABEL_S4_URL, TABEL_S4_LOCAL)
    s4_data = pd.read_excel(TABEL_S4_LOCAL, TABEL_S4_SHEET)
    for i in range(s4_data.shape[0]):
        gene_a = s4_data["SymbolA"][i]
        gene_b = s4_data["SymbolB"][i]

        try:
            prot_a = protein_mapper[gene_a]
            prot_b = protein_mapper[gene_b]

            key = tuple(sorted((prot_a, prot_b)))
            # NOTE(review): range(1, n) reads the columns "1" .. "n-1"; if
            # "#PMID" is the number of filled PMID columns the last one is
            # skipped -- confirm against the table layout.
            for j in range(1, int(s4_data["#PMID"][i])):
                pmid_evidence[key].append(int(s4_data[str(j)][i]))

        except KeyError:
            print("the interaction between {0} and {1} could not be mapped to string ids".format(
                gene_a, gene_b))

    # TODO alex or garde: hook this into the combine-miRTarBase-NPinter script, as thise interactions are proteins and
    # therefore have to be scored using the bins from there (as we have no protein positive set
    # Unpack both proteins from the key itself; the original printed a stale
    # prot_a left over from the S4 loop.
    for (prot_a, prot_b), pmids in pmid_evidence.items():
        print('\t'.join(("9606", prot_a, prot_b, "0", "Experiment",
                         str(len(pmids)), "Litterature", "", "")))

    ############################################################
    # parse S6 into experiments
    ############################################################
    if not os.path.exists(TABEL_S6_LOCAL):
        urllib.urlretrieve(TABEL_S6_URL, TABEL_S6_LOCAL)
    s6_data = pd.read_excel(TABEL_S6_LOCAL, TABEL_S6_SHEET)

    for i in range(s6_data.shape[0]):
        gene_a = s6_data["FireSymbol"][i]
        gene_b = s6_data["PASymbol"][i]

        try:
            prot_a = protein_mapper[gene_a]
            prot_b = protein_mapper[gene_b]
        except KeyError:
            print("the interaction between {0} and {1} could not be mapped to string ids".format(
                gene_a, gene_b))
            # Skip the row: printing it would reuse the proteins of an
            # earlier iteration (or fail if none was mapped yet).
            continue

        # The original also indexed the S4 table with S6 row numbers here and
        # appended to the PMID evidence that had already been printed; that
        # copy-paste leftover is dropped.

        # TODO: append this to the experiments file in "combine_experiments"
        # these are proteins and therefore cannot be benchmarked against our gold-standard
        # s6 counts as 0.9
        print('\t'.join(
            ("9606", prot_a, prot_b, "0", "Experiment", "0.9", "Luciferase",
             "https://www.ncbi.nlm.nih.gov/pubmed/22365833", "")))
    os.mkdir(MASTER_PATH)

cat_master_file_name = 'database.tsv'
knowledge_master_file_path = os.path.join(MASTER_PATH, cat_master_file_name)

################################################################################
# NOTE/TODO for future versions of RAIN:
# currently, this script assumes that the curated knowledge channel contains
# only human interactions. If this changes in the future, the ID mapping
# routines must be adjusted.
################################################################################

# The channel was curated against version 9 identifiers, so version 9 ENSPs
# are carried over to version 10 through their shared ENSG.
# NOTE(review): the organism is passed as the int 9606 in two of the calls
# below but as the string '9606' in the first one -- confirm the helper
# treats both alike.
ENSP2ENSG_v9 = stringrnautils.get_string_to_alias_mapper(
    '9606', 'ENSP', 'ENSG', 9, 'all', True)['9606']
ENSG2ENSP_v10 = stringrnautils.get_alias_to_string_mapper(
    9606, 'ENSP', 'ENSG', 10)['9606']

ENSP9_to_ENSP10 = {}
for ensp, ensg in ENSP2ENSG_v9.iteritems():
    if ensg in ENSG2ENSP_v10:
        ENSP9_to_ENSP10[ensp] = ENSG2ENSP_v10[ensg]

# Hand-written overrides, applied after the automatic mapping so they win.
ENSP9_to_ENSP10['ENSP00000403359'] = 'ENSP00000441000'
ENSP9_to_ENSP10['ENSP00000400867'] = 'ENSP00000441000'
ENSP9_to_ENSP10['ENSP00000403175'] = 'ENSP00000393241'

ENSP2ENSP_v10 = stringrnautils.get_alias_to_string_mapper(
    9606, 'ENSP', 'ENSP', 10)['9606']
ncrna_mapper = stringrnautils.get_non_coding_rna_alias_mapper()['9606']


def correct_rna_names(ID, ncrna_mapper):
Exemplo n.º 5
0
def getSTRINGdic(specie):
    """Return the alias-to-STRING mapper for one species.

    Used for the ENSG/ENSP conversion of RefSeq NM_ mRNAs.  ``specie`` is
    looked up in the module-level ``organismIdMap``; the result is passed
    on as ``organisms`` together with empty alias and ID filter strings.
    """
    organism_id = organismIdMap[specie]
    return stringrnautils.get_alias_to_string_mapper(
        organisms=organism_id,
        filter_string_alias='',
        filter_string_id='',
    )
Exemplo n.º 6
0
import gzip, os, argparse, stringrnautils

# Command-line options: the locations of the input data, the raw-score and
# master output directories, and the gold-standard file.
parser = argparse.ArgumentParser()
parser.add_argument('-data_path', default='data')
parser.add_argument('-rawscore_path', default='rawscore_files')
parser.add_argument('-master_path', default='master_files')
parser.add_argument('-gold_std', default='data/extended_gold_standard.tsv')
args = parser.parse_args()

string_mapper = stringrnautils.get_alias_to_string_mapper(
    ['9606', '10090', '7955', '10116', '7227', '6239', '3702'], '', '', 10,
    'all')
mir_mapper = stringrnautils.get_unique_mir_mapper()

# Retrieve tarbase data
#----------------------
# Honour -data_path: the original immediately overwrote this path with one
# under the hard-coded "data" directory, so the option had no effect.
tarbase_file = os.path.join(args.data_path,
                            "Tarbase.6.7.FINAL.mirbase21.download.tsv.gz")

if not os.path.exists(tarbase_file):
    # NOTE(review): the path is interpolated into a shell command unquoted;
    # a -data_path containing spaces or shell metacharacters will break it.
    os.system(
        "wget -nv http://rth.dk/~ajunge/Tarbase.6.7.FINAL.mirbase21.download.tsv.gz -O %s"
        % tarbase_file)

# Organism names mapped to their taxonomy IDs.  The human key must read
# "Homo sapiens"; the original held the censored spelling "H**o sapiens",
# which no organism name can match.
orgn2keep = {
    "Arabidopsis thaliana": '3702',
    "Caenorhabditis elegans": '6239',
    "Danio rerio": '7955',
    "Drosophila melanogaster": '7227',
    "Homo sapiens": '9606',
    "Mus musculus": '10090',
    "Rattus norvegicus": '10116',
}
def integrate_all_prediction_tools():
    """Read all miRNA target prediction tools and merge them into ``predictions.tsv``.

    Steps, in order:

    1. Build the gene-to-Ensembl, miRNA and STRING alias mappers.
    2. Read each tool's predictions via ``read_starmirdb`` /
       ``read_predictions`` (RNA22 and RNAhybrid only when
       ``args.run_all`` is set).
    3. Group organisms by the exact set of tools that cover them and call
       ``stringrnautils.combine_masterfiles`` once per distinct tool
       combination, using combination-specific parameters where defined and
       the defaults otherwise.
    4. Copy from every combined file the lines whose first column is one of
       that combination's organisms into ``predictions.tsv``.
    5. Delete the per-tool and per-combination master files.
    """
    # Define dictionaries
    #--------------------
    gene2ensembl = stringrnautils.map_gene_2_enemble(os.path.join(LOCAL_DATA_PATH, 'gene2ensembl.gz'))
    stringrnautils.integrate_NM_dictionary(gene2ensembl)

    mir_mapper = stringrnautils.get_unique_mir_mapper()
    string_mapper = stringrnautils.get_alias_to_string_mapper(['9606', '10090','7955', '10116', '7227', '6239','3702'], '', '', 10, 'all')

    # Read data and benchmark
    #--------------------------

    # starmirdb - may decide to exclude this one
    read_starmirdb( mir_mapper, string_mapper)

    # miRanda
    read_predictions( "miRanda_v3.3a.tsv.gz", {}, mir_mapper, string_mapper,
                      tax_idx=0, mir_idx=1,target_idx=2,score_idx=4,
                      increasing=False, window_size=1000, name="miRanda",
                      ignore_fraction=0.7, has_header=True )

    # miRDB
    read_predictions( "miRDB_v5.0.tsv.gz", gene2ensembl, mir_mapper, string_mapper,
                      tax_idx=0, mir_idx=1,target_idx=2,score_idx=3,
                      increasing=True, window_size=75, name="miRDB",
                      ignore_fraction=0.0, has_header=True )

    # PITA
    read_predictions( "PITA.tsv.gz", {}, mir_mapper, string_mapper,
                      tax_idx=0, mir_idx=2,target_idx=1,score_idx=4,
                      increasing=False, window_size=500, name="PITA",
                      ignore_fraction=0.0, has_header=True )

    # RNA22 - excluded due to poor performance
    if args.run_all:
        read_predictions( "RNA22.tsv.gz", {}, mir_mapper, string_mapper,
                          tax_idx=0, mir_idx=1,target_idx=2,score_idx=3,
                          increasing=True, window_size=50, name="RNA22",
                          ignore_fraction=0.2, has_header=True,do_benchmark=False)

    # RNAhybrid - excluded due to poor performance
    if args.run_all:
        read_predictions( "RNAhybrid_seed.tsv.gz", {}, mir_mapper, string_mapper,
                          tax_idx=0, mir_idx=1,target_idx=2,score_idx=3,
                          increasing=False, window_size=50, name="RNAhybrid_seed",
                          ignore_fraction=0.2, has_header=False, do_benchmark=False)

    # Targetscan
    read_predictions( "targetscan.mammals.tsv.gz", {}, mir_mapper, string_mapper,
                      tax_idx=0, mir_idx=1,target_idx=2,score_idx=4,
                      increasing=False, window_size=50, name="targetscan",
                      ignore_fraction=0.50, has_header=True )

    # integrate prediction tools
    #--------------------
    prediction_tools = ('starmirdb', 'miRanda', 'targetscan', 'miRDB', 'PITA')
    organism_to_tool = {}
    for tool in prediction_tools:
        organisms = species_covered(os.path.join(MASTER_FILE_DIR,'{0}.tsv'.format(tool)))
        for organism in organisms:
            organism_to_tool.setdefault(organism, []).append(tool)

    # Collapse each organism's tool list into a canonical "a_and_b" name so
    # that organisms sharing the same tools share one calibration.
    tool_combinations = set()
    tool_combinations_to_species = {}
    for organism, tools in list(organism_to_tool.items()):
        tools = '_and_'.join(sorted(tools))
        tool_combinations.add(tools)
        organism_to_tool[organism] = tools
        tool_combinations_to_species.setdefault(tools, set()).add(organism)

    tool_parameters = {
    "PITA_and_miRDB_and_miRanda_and_targetscan":{
        'negative_evidence' : False,
        'rebenchmark_everything' : True,
        'ignore_fraction' : 0.0,
        'window_size' : 110,
        'unlink_master_files' : False
    },
    "PITA_and_miRanda": {
        'negative_evidence' : False,
        'rebenchmark_everything' : True,
        'ignore_fraction' : 0.0,
        'window_size' : 200,
        'unlink_master_files' : False
    }
    }

    default_tool_parameters = {
        'negative_evidence' : False,
        'rebenchmark_everything' : True,
        'ignore_fraction' : 0.60,
        'window_size' : 75,
        'unlink_master_files' : False
    }

    # generate organism specific callibration curves
    new_master_files = ['{0}.tsv'.format(p) for p in prediction_tools]

    # Both the combined output and every subset file are opened through
    # context managers so they are closed (and flushed) before the temporary
    # master files are unlinked below.
    with open(os.path.join(MASTER_FILE_DIR, 'predictions.tsv'), 'w') as predictions_master_file:
        for tool_combination in tool_combinations:
            source_master_files = ('{0}.tsv'.format(t) for t in tool_combination.split('_and_'))
            destination_name = 'predictions_subset_{0}'.format(tool_combination)
            destination_master_file = 'predictions_subset_{0}.tsv'.format(tool_combination)

            parameters = default_tool_parameters.copy()
            if tool_combination in tool_parameters:
                parameters.update(tool_parameters[tool_combination])

            new_master_files.append(destination_master_file)
            stringrnautils.combine_masterfiles(source_master_files, destination_master_file,
                                               gold_standard_file_path, destination_name,
                                               **parameters)

            # generate/append relevant species to predictions.tsv
            # NOTE(review): the membership test compares an int against the
            # values returned by species_covered() -- confirm those are ints.
            species = tool_combinations_to_species[tool_combination]
            with open(os.path.join(MASTER_FILE_DIR, destination_master_file)) as subset_file:
                for line in subset_file:
                    if int(line.split('\t', 1)[0]) in species:
                        predictions_master_file.write(line)

    # delete all the tmp master files
    for master_file in new_master_files:
        os.unlink(os.path.join(MASTER_FILE_DIR, master_file))
Exemplo n.º 8
0
}


def get_assay_mapping(assay_mapping_path):
    """Read a tab-separated mapping file into a dictionary.

    Every non-blank line contributes one entry that maps its first column
    to its second column; any further columns are ignored.  When a first
    column occurs more than once, the last occurrence wins.  Blank and
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of aborting the whole read.

    :param assay_mapping_path: path of the mapping file.
    :return: dict from first-column value to second-column value.
    :raises IndexError: if a non-blank line has fewer than two columns.
    """
    assay_dict = {}
    with open(assay_mapping_path, 'r') as fin:
        for curr_line in fin:
            stripped_line = curr_line.rstrip()
            if not stripped_line:
                continue
            split_cols = stripped_line.split('\t')
            assay_dict[split_cols[0]] = split_cols[1]
    return assay_dict


# Module-level lookup tables, built once when this module is executed.
miRNA2Clean = stringrnautils.get_unique_mir_mapper()
miRNA2taxonomyID = stringrnautils.get_mir_id_to_tax_id_mapper()
# Alias-to-STRING mapper for every organism in uniqueSpeciesMap.  Both filter
# strings are empty -- presumably meaning "no filtering"; confirm against
# stringrnautils.
targetName2targetID = stringrnautils.get_alias_to_string_mapper(
    organisms=uniqueSpeciesMap.values(),
    filter_string_alias='',
    filter_string_id='')
restricted_pmids = stringrnautils.starbase_exp_pmids()

assayMappingFile = os.path.join(DATA_PATH, 'miRTarBase_assay_mapping.tsv'
                                )  # Maps assay names to 'cleaned' assay names
assay2Clean = get_assay_mapping(assayMappingFile)

# Counters, initialised to zero here; presumably updated by the parsing code
# further down in the file (not visible from here).
not_mapped = 0
totalCount = 0

# Maps a certain interaction to a set of experiments supporting that interaction, the PubMedIDs and the evidence levels
# according to miRTarBase
interaction_to_experiments = {}
interaction_to_pubmed_ids = {}
interaction_to_evidence_types = {}