Example #1
File: wesParser.py Project: scarltee/CKG
def parser(projectId):
    data = {}
    config = builder_utils.get_config(config_name="wes.yml", data_type='experiments')
    directory = '../../../data/experiments/PROJECTID/wes/'
    if 'directory' in config:
        directory = config['directory']
    directory = directory.replace('PROJECTID', projectId)
    wes_data = parseWESDataset(projectId, config, directory)
    if wes_data is not None:
        somatic_mutations = pd.DataFrame()
        for sample in wes_data:
            entities, variantRows, sampleRows, geneRows, chrRows = extractWESRelationships(wes_data[sample], config)
            data[('somatic_mutation_known_variant', 'w')] = variantRows
            data[('somatic_mutation_sample', 'w')] = sampleRows
            data[('somatic_mutation_gene', 'w')] = geneRows
            data[('somatic_mutation_chromosome', 'w')] = chrRows
            if somatic_mutations.empty:
                somatic_mutations = entities
            else:
                new = set(entities.index).difference(set(somatic_mutations.index))
                # DataFrame.append was removed in pandas 2.0; concat keeps the original indices
                somatic_mutations = pd.concat([somatic_mutations, entities.loc[new, :]])
        somatic_mutations = somatic_mutations.reset_index()
        data[('somatic_mutation', 'w')] = somatic_mutations
    
    return data
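A minimal usage sketch for the parser above, assuming each value in the returned dictionary is a pandas DataFrame and that the (name, 'w') tuple keys pair a dataset name with a write mode; the output directory and file naming are hypothetical.

import os

def write_wes_output(projectId, output_dir):
    data = parser(projectId)
    for (dataset, mode), df in data.items():
        # one TSV per dataset key, e.g. somatic_mutation.tsv
        df.to_csv(os.path.join(output_dir, dataset + ".tsv"),
                  sep="\t", index=False, mode=mode)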
Example #2
File: smpdbParser.py Project: scarltee/CKG
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="smpdbConfig.yml", data_type='databases')
    urls = config['smpdb_urls']
    entities = set()
    relationships = defaultdict(set)
    entities_header = config['pathway_header']
    relationships_headers = config['relationships_header']
    directory = os.path.join(databases_directory, "SMPDB")
    builder_utils.checkDirectory(directory)

    for dataset in urls:
        url = urls[dataset]
        file_name = url.split('/')[-1]
        if download:
            builder_utils.downloadDB(url, directory)
        zipped_file = os.path.join(directory, file_name)
        with zipfile.ZipFile(zipped_file) as rf:
            if dataset == "pathway":
                entities = parsePathways(config, rf)
            elif dataset == "protein":
                relationships.update(parsePathwayProteinRelationships(rf))
            elif dataset == "metabolite":
                relationships.update(parsePathwayMetaboliteDrugRelationships(rf))

    builder_utils.remove_directory(directory)

    return entities, relationships, entities_header, relationships_headers
Example #3
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="signorConfig.yml",
                                      data_type='databases')

    directory = os.path.join(databases_directory, "SIGNOR")
    builder_utils.checkDirectory(directory)

    url = config['url']
    modifications = config['modifications']
    amino_acids = config['amino_acids']
    accronyms = config['accronyms']
    entities_header = config['entities_header']
    relationships_headers = config['rel_headers']

    filename = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    entities, relationships = parse_substrates(filename, modifications,
                                               accronyms, amino_acids)

    return entities, relationships, entities_header, relationships_headers
Example #4
def parser(projectId, type='clinical'):
    data = {}
    cwd = os.path.abspath(os.path.dirname(__file__))
    config = builder_utils.get_config(config_name="clinical.yml",
                                      data_type='experiments')
    project_directory = os.path.join(
        cwd, '../../../../data/experiments/PROJECTID/project/')
    clinical_directory = os.path.join(
        cwd, '../../../../data/experiments/PROJECTID/clinical/')
    design_directory = os.path.join(
        cwd, '../../../../data/experiments/PROJECTID/experimental_design/')
    separator = config["separator"]
    if 'project_directory' in config:
        project_directory = os.path.join(cwd, config['project_directory'])
    project_directory = project_directory.replace('PROJECTID', projectId)
    if 'clinical_directory' in config:
        clinical_directory = os.path.join(cwd, config['clinical_directory'])
    clinical_directory = clinical_directory.replace('PROJECTID', projectId)
    if 'design_directory' in config:
        design_directory = os.path.join(cwd, config['design_directory'])
    design_directory = design_directory.replace('PROJECTID', projectId)
    if type == 'project':
        project_dfs = project_parser(projectId, config, project_directory,
                                     separator)
        data.update(project_dfs)
    elif type == 'experimental_design':
        design_dfs = experimental_design_parser(projectId, config,
                                                design_directory)
        data.update(design_dfs)
    elif type == 'clinical':
        clinical_dfs = clinical_parser(projectId, config, clinical_directory,
                                       separator)
        data.update(clinical_dfs)

    return data
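The three type branches never overlap, so one call per type collects everything; a short sketch (the project identifier is hypothetical, and merging into one dict assumes the per-type keys are distinct):

all_data = {}
for dtype in ('project', 'experimental_design', 'clinical'):
    all_data.update(parser('P0000001', type=dtype))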
Example #5
def parser(databases_directory, download=True):
    relationships = defaultdict(set)
    config = builder_utils.get_config(config_name="disgenetConfig.yml",
                                      data_type='databases')

    files = config['disgenet_files']
    mapping_files = config['disgenet_mapping_files']
    url = config['disgenet_url']
    directory = os.path.join(databases_directory, "disgenet")
    builder_utils.checkDirectory(directory)
    header = config['disgenet_header']
    output_file = 'disgenet_associated_with.tsv'

    if download:
        for f in files:
            builder_utils.downloadDB(url + files[f], directory)
        for f in mapping_files:
            builder_utils.downloadDB(url + mapping_files[f], directory)

    proteinMapping = readDisGeNetProteinMapping(config, directory)
    diseaseMapping = readDisGeNetDiseaseMapping(config, directory)
    for f in files:
        first = True
        associations = gzip.open(os.path.join(directory, files[f]), 'r')
        dtype, atype = f.split('_')
        if dtype == 'gene':
            idType = "Protein"
            scorePos = 9
        elif dtype == 'variant':
            idType = "Transcript"
            scorePos = 5
        for line in associations:
            if first:
                first = False
                continue
            try:
                data = line.decode('utf-8').rstrip("\r\n").split("\t")
                geneId = str(int(data[0]))
                #disease_specificity_index =  data[2]
                #disease_pleiotropy_index = data[3]
                diseaseId = data[4]
                score = float(data[scorePos])
                pmids = data[13]
                source = data[-1]
                if geneId in proteinMapping:
                    for identifier in proteinMapping[geneId]:
                        if diseaseId in diseaseMapping:
                            for code in diseaseMapping[diseaseId]:
                                code = "DOID:" + code
                                relationships[idType].add(
                                    (identifier, code, "ASSOCIATED_WITH",
                                     score, atype, "DisGeNet: " + source,
                                     pmids))
            except UnicodeDecodeError:
                continue
        associations.close()

    builder_utils.remove_directory(directory)

    return (relationships, header, output_file)
Example #6
def parser(databases_directory):
    directory = os.path.join(databases_directory, "PhosphoSitePlus")
    builder_utils.checkDirectory(directory)
    config = builder_utils.get_config(config_name="pspConfig.yml", data_type='databases')
    modifications = config['modifications']
    annotation_files = config['annotation_files']
    entities_header = config['entities_header']
    relationships_headers = config['rel_headers']
    entities = set()
    relationships = defaultdict(set)
    for site_file in config['site_files']:
        file_name = os.path.join(directory, site_file)
        with gzip.open(file_name, 'r') as f:
            sites, site_relationships = parseSites(f, modifications)
            entities.update(sites)
            for r in site_relationships:
                relationships[r].update(site_relationships[r])
    for er in annotation_files:
        entity, relationship_type = er.split('-')
        file_name = os.path.join(directory, annotation_files[er])
        with gzip.open(file_name, 'r') as f:
            if entity == "disease":
                mapping = mp.getMappingFromOntology(ontology="Disease", source=None)
                relationships[(entity, relationship_type)].update(parseDiseaseAnnotations(f, modifications, mapping))
            elif entity == "biological_process":
                mapping = mp.getMappingFromOntology(ontology="Gene_ontology", source=None)
                relationships[(entity, relationship_type)].update(parseRegulationAnnotations(f, modifications, mapping))
            elif entity == "substrate":
                relationships[(entity, relationship_type)] = parseKinaseSubstrates(f, modifications)

    return entities, relationships, entities_header, relationships_headers
Example #7
def parser(databases_directory, importDirectory, download=True):
    config = builder_utils.get_config(config_name="jensenlabConfig.yml", data_type='databases')
    outputfileName = "Publications.tsv"
    url = config['db_url']
    ifile = config['organisms_file']
    organisms = str(config['organisms'])
    directory = os.path.join(databases_directory, "Jensenlab")
    builder_utils.checkDirectory(os.path.join(directory, "textmining"))

    if download:
        builder_utils.downloadDB(url.replace("FILE", ifile), os.path.join(directory, "textmining"))

    ifile = os.path.join(directory, os.path.join("textmining", ifile))
    valid_pubs = read_valid_pubs(organisms, ifile)
    entities, header = parse_PMC_list(config, os.path.join(directory, "textmining"), download=download, valid_pubs=valid_pubs)
    num_entities = len(entities)
    outputfile = os.path.join(importDirectory, outputfileName)
    builder_utils.write_entities(entities, header, outputfile)
    entities = None

    for qtype in config['db_mentions_types']:
        parse_mentions(config, directory, qtype, importDirectory, download)

    builder_utils.remove_directory(os.path.join(directory, "textmining"))

    return (num_entities, outputfile)
Example #8
def get_configuration(processing_tool, data_type):
    configuration = None
    if processing_tool is not None:
        config = builder_utils.get_config(config_name="proteomics.yml",
                                          data_type='experiments')
        if processing_tool in config:
            tool_configuration = config[processing_tool]
            if data_type in tool_configuration:
                configuration = tool_configuration[data_type]

    return configuration
Example #9
def get_dataset_configuration(processing_format, data_type):
    config = builder_utils.get_config(config_name="proteomics.yml",
                                      data_type='experiments')
    dataset_config = {}
    if processing_format in config:
        if data_type is not None:
            if data_type in config[processing_format]:
                dataset_config = config[processing_format][data_type]
        else:
            dataset_config = config[processing_format]

    return dataset_config
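Examples #8 and #9 walk the same two-level layout in proteomics.yml (processing tool, then data type) but fall back differently: get_configuration returns None on any miss, while get_dataset_configuration returns {} on a miss and the whole tool section when data_type is None. A sketch against an assumed layout (the 'maxquant'/'proteins' keys are illustrative, not confirmed from the config):

# Assumed proteomics.yml shape: {tool: {data_type: {...}}}
tool_level = get_dataset_configuration('maxquant', None)      # whole tool section
one_type = get_configuration('maxquant', 'proteins')          # inner dict, or None
missing = get_configuration('maxquant', 'not_a_data_type')    # None, never KeyError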
Example #10
def parser(databases_directory):
    config = builder_utils.get_config(config_name="drugBankConfig.yml",
                                      data_type='databases')
    directory = os.path.join(databases_directory, "DrugBank")
    builder_utils.checkDirectory(directory)
    drugs = extract_drugs(config, directory)
    build_DrugBank_dictionary(config, directory, drugs)
    relationships = build_relationships_from_DrugBank(config, drugs)
    entities, attributes = build_drug_entity(config, drugs)
    entities_header = ['ID'] + attributes
    relationships_headers = config['relationships_headers']

    return (entities, relationships, entities_header, relationships_headers)
Example #11
def parser(databases_directory, download=True):
    result = {}
    config = builder_utils.get_config(config_name="jensenlabConfig.yml", data_type='databases')
    string_mapping = mp.getSTRINGMapping(download=download)

    for qtype in config['db_types']:
        relationships = parsePairs(config, databases_directory, qtype, string_mapping)
        entity1, entity2 = config['db_types'][qtype]
        outputfileName = entity1+"_"+entity2+"_associated_with_integrated.tsv"
        header = config['header']
        result[qtype] = (relationships, header, outputfileName)

    return result
Example #12
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="gwasCatalogConfig.yml",
                                      data_type='databases')
    url = config['GWASCat_url']
    entities_header = config['entities_header']
    relationships_header = config['relationships_header']
    entities = set()
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "GWAScatalog")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    with open(fileName, 'r', encoding="utf-8") as catalog:
        for line in catalog:
            data = line.rstrip("\r\n").split("\t")
            if len(data) > 36:
                pubmedid = data[1]
                date = data[3]
                title = data[6]
                sample_size = data[8]
                replication_size = data[9]
                #chromosome = data[11]
                #position = data[12]
                #genes_mapped = data[14].split(" - ")
                snp_id = data[20].split('-')[0]
                freq = data[26]
                pval = data[27]
                odds_ratio = data[30]
                trait = data[34]
                exp_factor = data[35]
                study = data[36]

                entities.add((study, "GWAS_study", title, date, sample_size,
                              replication_size, trait))
                if pubmedid != "":
                    relationships["published_in_publication"].add(
                        (study, pubmedid, "PUBLISHED_IN", "GWAS Catalog"))
                if snp_id != "":
                    relationships["variant_found_in_gwas"].add(
                        (re.sub(r"^\W+|\W+$", "",
                                snp_id), study, "VARIANT_FOUND_IN_GWAS", freq,
                         pval, odds_ratio, trait, "GWAS Catalog"))
                if exp_factor != "":
                    exp_factor = exp_factor.split('/')[-1].replace('_', ':')
                    relationships["studies_trait"].add(
                        (study, exp_factor, "STUDIES_TRAIT", "GWAS Catalog"))

    builder_utils.remove_directory(directory)

    return (entities, relationships, entities_header, relationships_header)
Example #13
def parser(projectId, type='proteomics', directory=None):
    data = {}
    cwd = os.path.abspath(os.path.dirname(__file__))
    config = builder_utils.get_config(config_name="proteomics.yml",
                                      data_type='experiments')
    if directory is None:
        directory = os.path.join(
            cwd, '../../../../data/experiments/PROJECTID/' + type)
        if 'directory' in config:
            directory = os.path.join(cwd, config['directory'] + type)
    directory = directory.replace('PROJECTID', projectId)
    data = parse_from_directory(projectId, directory, config)

    return data
Example #14
def get_STRING_mapping_url(db="STRING"):
    """
    Get the URL for downloading the mapping file from either STRING or STITCH.

    :param str db: which database to get the URL for: STRING or STITCH
    :return: URL from which to download the mapping file
    """
    url = None
    config = builder_utils.get_config(config_name="stringConfig.yml", data_type='databases')
    if db.upper() == "STRING":
        url = config['STRING_mapping_url']
    elif db.upper() == "STITCH":
        url = config['STITCH_mapping_url']

    return url
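A quick usage sketch; db is compared case-insensitively, and any other database name falls through to None:

string_url = get_STRING_mapping_url()              # default: STRING
stitch_url = get_STRING_mapping_url(db="stitch")   # upper-cased before comparison
unknown = get_STRING_mapping_url(db="BioGRID")     # neither branch matches: None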
Example #15
File: foodbParser.py Project: scarltee/CKG
def parser(databases_directory, download=True):
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "FooDB")
    builder_utils.checkDirectory(directory)
    config = builder_utils.get_config(config_name="foodbConfig.yml", data_type='databases')

    database_url = config['database_url']
    entities_header = config['entities_header']
    relationships_headers = config['relationships_headers']
    tar_fileName = os.path.join(directory, database_url.split('/')[-1])
    if download:
        builder_utils.downloadDB(database_url, directory)

    contents = {}
    food = set()
    compounds = {}
    mapping = {}  # food id -> aliases, filled when Food.csv is parsed
    try:
        tf = tarfile.open(tar_fileName, 'r')
        file_content = tf.getnames()
        tar_dir = file_content[1]
        tf.extractall(path=directory)
        tf.close()
        for file_name in config['files']:
            path = os.path.join(directory, os.path.join(tar_dir, file_name))
            with open(path, 'r', encoding="utf-8", errors='replace') as f:
                if file_name == "Content.csv":
                    contents = parseContents(f)
                elif file_name == "Food.csv":
                    food, mapping = parseFood(f)
                elif file_name == "Compound.csv":
                    compounds = parseCompounds(f)
        for food_id, compound_id in contents:
            if compound_id in compounds:
                compound_code = compounds[compound_id].replace("HMDB", "HMDB00")
                relationships[("food", "has_content")].add((food_id, compound_code, "HAS_CONTENT") + contents[(food_id, compound_id)])
        mp.reset_mapping(entity="Food")
        with open(os.path.join(directory, "mapping.tsv"), 'w', encoding='utf-8') as out:
            for food_id in mapping:
                for alias in mapping[food_id]:
                    out.write(str(food_id)+"\t"+str(alias)+"\n")

        mp.mark_complete_mapping(entity="Food")
    except tarfile.ReadError as err:
        raise Exception("Error importing database FooDB.\n {}".format(err))

    builder_utils.remove_directory(directory)

    return food, relationships, entities_header, relationships_headers
Example #16
def update_configuration(data_type,
                         processing_tool,
                         value_col='LFQ intensity',
                         columns=None):
    # Use None instead of a mutable default list, which Python shares across calls
    if columns is None:
        columns = []
    configuration = {}
    if processing_tool is not None:
        config = builder_utils.get_config(config_name="proteomics.yml",
                                          data_type='experiments')
        if processing_tool in config:
            tool_configuration = config[processing_tool]
            if data_type in tool_configuration:
                configuration = tool_configuration[data_type]
                configuration['columns'].extend(columns)
                configuration['valueCol'] = value_col

    return configuration
Example #17
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="hmdbConfig.yml",
                                      data_type='databases')
    directory = os.path.join(databases_directory, "HMDB")
    builder_utils.checkDirectory(directory)
    metabolites = extract_metabolites(config, directory, download)
    mapping = mp.getMappingFromOntology(ontology="Disease",
                                        source=config['HMDB_DO_source'])
    mapping.update(mp.getMappingFromOntology(ontology="Tissue", source=None))
    entities, attributes = build_metabolite_entity(config, directory,
                                                   metabolites)
    relationships = build_relationships_from_HMDB(config, metabolites, mapping)
    entities_header = ['ID'] + attributes
    relationships_header = config['relationships_header']

    #builder_utils.remove_directory(directory)

    return (entities, relationships, entities_header, relationships_header)
Example #18
def parser(databases_directory, download=True):
    relationships = set()
    config = builder_utils.get_config(config_name="mutationDsConfig.yml",
                                      data_type='databases')
    header = config['header']
    output_file_name = "mutation_curated_affects_interaction_with.tsv"
    regex = r":(\w+)\("
    url = config['mutations_url']
    directory = os.path.join(databases_directory, "MutationDs")
    builder_utils.checkDirectory(directory)
    file_name = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    with open(file_name, 'r') as mf:
        first = True
        for line in mf:
            if first:
                first = False
                continue
            data = line.rstrip("\r\n").split("\t")
            if len(data) > 12:
                internal_id = data[0]
                pvariant = data[1]
                effect = data[5]
                protein = data[7].split(':')
                organism = data[10]
                interaction = data[11]
                evidence = data[12]

                if organism.startswith("9606") and len(protein) > 1:
                    protein = protein[1]
                    pvariant = protein + "_" + pvariant
                    matches = re.finditer(regex, interaction)
                    for matchNum, match in enumerate(matches, start=1):
                        interactor = match.group(1)
                        relationships.add((pvariant, interactor,
                                           "CURATED_AFFECTS_INTERACTION_WITH",
                                           effect, interaction, evidence,
                                           internal_id, "Intact-MutationDs"))

    builder_utils.remove_directory(directory)

    return (relationships, header, output_file_name)
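The regex r":(\w+)\(" captures every accession that sits between a colon and an opening parenthesis in the interaction column; a self-contained sketch on a made-up IntAct-style string:

import re

interaction = "uniprotkb:P12345(protein) binds uniprotkb:Q67890(protein)"  # hypothetical
print([m.group(1) for m in re.finditer(r":(\w+)\(", interaction)])
# ['P12345', 'Q67890']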
Example #19
def parser(databases_dir, download=True):
    config = builder_utils.get_config(config_name="goaConfig.yml",
                                      data_type='databases')
    url = config['url']
    rel_header = config['header']

    protein_mapping = mp.getMappingForEntity(entity="Protein")
    valid_proteins = list(set(protein_mapping.values()))

    directory = os.path.join(databases_dir, "GOA")
    builder_utils.checkDirectory(directory)
    file_name = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    annotations = parse_annotations_with_pandas(file_name, valid_proteins)

    builder_utils.remove_directory(directory)

    return annotations, rel_header
Example #20
def parser(databases_directory, download=True):
    config = builder_utils.get_config(
        config_name="drugGeneInteractionDBConfig.yml", data_type='databases')
    url = config['DGIdb_url']
    header = config['header']
    output_file = "dgidb_targets.tsv"
    drugmapping = mp.getMappingForEntity("Drug")

    relationships = set()
    directory = os.path.join(databases_directory, "DGIdb")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    with open(fileName, 'r', encoding='utf-8') as associations:
        first = True
        for line in associations:
            if first:
                first = False
                continue
            data = line.rstrip("\r\n").split("\t")
            gene = data[0]
            source = data[3]
            interactionType = data[4] if data[4] != '' else 'unknown'
            drug = data[8].lower()
            if drug == "":
                drug = data[7]
                if drug == "" and data[6] != "":
                    drug = data[6]
                else:
                    continue
            if gene != "":
                if drug in drugmapping:
                    drug = drugmapping[drug]
                    relationships.add((drug, gene, "TARGETS", "NA", "NA", "NA",
                                       interactionType, "DGIdb: " + source))

    builder_utils.remove_directory(directory)

    return (relationships, header, output_file)
Example #21
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="pathwayCommonsConfig.yml",
                                      data_type='databases')
    url = config['pathwayCommons_pathways_url']
    entities = set()
    relationships = set()
    directory = os.path.join(databases_directory, "PathwayCommons")
    builder_utils.checkDirectory(directory)
    fileName = url.split('/')[-1]
    entities_header = config['pathways_header']
    relationships_header = config['relationships_header']

    if download:
        builder_utils.downloadDB(url, directory)
    f = os.path.join(directory, fileName)
    associations = gzip.open(f, 'r')
    for line in associations:
        data = line.decode('utf-8').rstrip("\r\n").split("\t")
        linkout = data[0]
        code = data[0].split("/")[-1]
        ptw_dict = dict([item.split(": ")[0], ":".join(item.split(": ")[1:])]
                        for item in data[1].split("; "))
        proteins = data[2:]
        if "organism" in ptw_dict and ptw_dict["organism"] == "9606":
            name = ptw_dict["name"]
            source = ptw_dict["datasource"]
        else:
            continue

        entities.add((code, "Pathway", name, name, ptw_dict["organism"],
                      source, linkout))
        for protein in proteins:
            relationships.add((protein, code, "ANNOTATED_IN_PATHWAY", linkout,
                               "PathwayCommons: " + source))

    associations.close()

    builder_utils.remove_directory(directory)

    return (entities, relationships, entities_header, relationships_header)
Example #22
def parser(databases_directory, download=True):
    directory = os.path.join(databases_directory, "ExposomeExplorer")
    builder_utils.checkDirectory(directory)
    config = builder_utils.get_config(config_name="exposomeConfig.yml", data_type='databases')
    database_urls = config['database_urls']
    relationships_header = config['relationships_header']
    mapping = mp.getMappingForEntity("Food")
    biomarkers = {}  # guard: correlations.csv may be reached before biomarkers.csv
    correlations = {}
    for url in database_urls:
        zipped_fileName = os.path.join(directory, url.split('/')[-1])
        file_name = '.'.join(url.split('/')[-1].split('.')[0:2])
        if download:
            builder_utils.downloadDB(url, directory)

        with zipfile.ZipFile(zipped_fileName) as z:
            if file_name == "biomarkers.csv":
                biomarkers = parseBiomarkersFile(z, file_name)
            elif file_name == "correlations.csv":
                correlations = parseCorrelationsFile(z, file_name, biomarkers, mapping)

    builder_utils.remove_directory(directory)

    return correlations, relationships_header
Example #23
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="hgncConfig.yml",
                                      data_type='databases')
    url = config['hgnc_url']
    entities = set()
    directory = os.path.join(databases_directory, "HGNC")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    taxid = 9606
    entities_header = config['header']

    if download:
        builder_utils.downloadDB(url, directory)

    with open(fileName, 'r', encoding="utf-8") as df:
        first = True
        for line in df:
            if first:
                first = False
                continue
            data = line.rstrip("\r\n").split("\t")
            geneSymbol = data[1]
            geneName = data[2]
            status = data[5]
            geneFamily = data[12]
            synonyms = data[18:23]
            transcript = data[23]
            if status != "Approved":
                continue

            entities.add((geneSymbol, "Gene", geneName, geneFamily,
                          ",".join(synonyms), taxid))
            #relationships.add((geneSymbol, transcript, "TRANSCRIBED_INTO"))

    builder_utils.remove_directory(directory)

    return entities, entities_header
Example #24
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="hpaConfig.yml",
                                      data_type='databases')
    url = config['hpa_pathology_url']
    disease_mapping = mp.getMappingFromOntology(ontology="Disease",
                                                source=None)
    protein_mapping = mp.getMultipleMappingForEntity("Protein")
    directory = os.path.join(databases_directory, "HPA")
    builder_utils.checkDirectory(directory)
    compressed_fileName = os.path.join(directory, url.split('/')[-1])
    file_name = '.'.join(url.split('/')[-1].split('.')[0:2])
    relationships_headers = config['relationships_headers']

    if download:
        builder_utils.downloadDB(url, directory)

    with zipfile.ZipFile(compressed_fileName) as z:
        if file_name == "pathology.tsv":
            pathology = parsePathologyFile(config, z, file_name,
                                           protein_mapping, disease_mapping)

    builder_utils.remove_directory(directory)

    return (pathology, relationships_headers)
Example #25
def parseActions(databases_directory,
                 importDirectory,
                 proteinMapping,
                 drugMapping=None,
                 download=True,
                 db="STRING"):
    config = builder_utils.get_config(config_name="stringConfig.yml",
                                      data_type='databases')
    url = None
    bool_dict = {
        't': True,
        'T': True,
        'True': True,
        'TRUE': True,
        'f': False,
        'F': False,
        'False': False,
        'FALSE': False
    }
    header = config['header_actions']
    relationship = "COMPILED_ACTS_ON"
    stored = set()
    if db == "STRING":
        url = config['STRING_actions_url']
        outputfile = os.path.join(importDirectory,
                                  "string_protein_acts_on_protein.tsv")
    elif db == "STITCH":
        url = config['STITCH_actions_url']
        outputfile = os.path.join(importDirectory,
                                  "stitch_drug_acts_on_protein.tsv")

    directory = os.path.join(databases_directory, db)
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    f = os.path.join(directory, fileName)
    associations = gzip.open(f, 'r')
    first = True
    with open(outputfile, 'w') as csvfile:
        writer = csv.writer(csvfile,
                            delimiter='\t',
                            escapechar='\\',
                            quotechar='"',
                            quoting=csv.QUOTE_ALL)
        writer.writerow(header)
        for line in associations:
            if first:
                first = False
                continue
            data = line.decode('utf-8').rstrip("\r\n").split()
            intA = data[0]
            intB = data[1]
            action = data[2]
            score = float(data[-1]) / 1000
            directionality = bool_dict[data[-3]] if db == "STRING" else True

            if intB in proteinMapping:
                aliasesA = []
                # drugMapping defaults to None (pure-STRING runs have no drugs)
                if drugMapping is not None and intA in drugMapping:
                    aliasesA = drugMapping[intA]
                elif intA in proteinMapping:
                    aliasesA = proteinMapping[intA]
                for aliasA in aliasesA:
                    for aliasB in proteinMapping[intB]:
                        if (aliasA, aliasB, action) not in stored:
                            row = (aliasA, aliasB, relationship, action,
                                   directionality, score, db)
                            writer.writerow(row)
                            stored.add((aliasA, aliasB, action))
                            stored.add((aliasB, aliasA, action))
    associations.close()
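The stored set records each written pair in both orders, so an undirected action is emitted only once regardless of which direction appears first in the file; the idiom in isolation:

stored = set()
for a, b, action in [("A", "B", "binding"), ("B", "A", "binding")]:
    if (a, b, action) not in stored:
        print(a, b, action)        # reaches here once per undirected pair
        stored.add((a, b, action))
        stored.add((b, a, action))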
Example #26
def parser(databases_directory,
           importDirectory,
           drug_source=None,
           download=True,
           db="STRING"):
    config = builder_utils.get_config(config_name="stringConfig.yml",
                                      data_type='databases')
    mapping = mp.getSTRINGMapping(download=False)
    stored = set()
    relationship = None
    cutoff = config['STRING_cutoff']
    header = config['header']
    drugmapping = {}
    if db == "STITCH":
        evidences = [
            "experimental", "prediction", "database", "textmining", "score"
        ]
        relationship = "COMPILED_INTERACTS_WITH"
        url = config['STITCH_url']
        outputfile = os.path.join(importDirectory,
                                  "stitch_associated_with.tsv")

        drugmapping = mp.getSTRINGMapping(source=drug_source,
                                          download=download,
                                          db=db)

    elif db == "STRING":
        evidences = [
            "Neighborhood in the Genome", "Gene fusions",
            "Co-ocurrence across genomes", "Co-expression",
            "Experimental/biochemical data",
            "Association in curated databases", "Text-mining"
        ]
        relationship = "COMPILED_TARGETS"
        outputfile = os.path.join(importDirectory, "string_interacts_with.tsv")
        url = config['STRING_url']
    directory = os.path.join(databases_directory, db)
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])

    if download:
        builder_utils.downloadDB(url, directory)

    f = os.path.join(directory, fileName)
    associations = gzip.open(f, 'r')
    first = True
    with open(outputfile, 'w') as csvfile:
        writer = csv.writer(csvfile,
                            delimiter='\t',
                            escapechar='\\',
                            quotechar='"',
                            quoting=csv.QUOTE_ALL)
        writer.writerow(header)
        for line in associations:
            if first:
                first = False
                continue
            data = line.decode('utf-8').rstrip("\r\n").split()
            intA = data[0]
            intB = data[1]
            scores = data[2:]
            fscores = [str(float(score) / 1000) for score in scores]
            if db == "STRING":
                if intA in mapping and intB in mapping and float(
                        fscores[-1]) >= cutoff:
                    for aliasA in mapping[intA]:
                        for aliasB in mapping[intB]:
                            if (aliasA, aliasB) not in stored:
                                row = (aliasA, aliasB, relationship,
                                       "association", db, ",".join(evidences),
                                       ",".join(fscores[0:-1]), fscores[-1])
                                stored.add((aliasA, aliasB))
                                stored.add((aliasB, aliasA))
                                writer.writerow(row)
            elif db == "STITCH":
                if intA in drugmapping and intB in mapping and float(
                        fscores[-1]) >= cutoff:
                    for aliasA in drugmapping[intA]:
                        for aliasB in mapping[intB]:
                            if (aliasA, aliasB) not in stored:
                                row = (aliasA, aliasB, relationship,
                                       "association", db, ",".join(evidences),
                                       ",".join(fscores[0:-1]), fscores[-1])
                                stored.add((aliasA, aliasB))
                                stored.add((aliasB, aliasA))
                                writer.writerow(row)
    associations.close()

    return mapping, drugmapping
Example #27
File: corumParser.py Project: scarltee/CKG
def parser(databases_directory, download=True):
    entities = set()
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "CORUM")
    builder_utils.checkDirectory(directory)

    try:
        config = builder_utils.get_config(config_name="corumConfig.yml",
                                          data_type='databases')
    except Exception as err:
        raise Exception("Reading configuration > {}.".format(err))

    database_url = config['database_url']
    entities_header = config['entities_header']
    relationships_headers = config['relationships_headers']
    zipped_fileName = os.path.join(directory, database_url.split('/')[-1])
    fileName = '.'.join(database_url.split('/')[-1].split('.')[0:2])
    if download:
        builder_utils.downloadDB(database_url, directory)
    names = set()
    first = True
    with zipfile.ZipFile(zipped_fileName) as z:
        with z.open(fileName) as f:
            for line in f:
                if first:
                    first = False
                    continue
                data = line.decode("utf-8").rstrip("\r\n").split("\t")
                identifier = data[0]
                name = data[1]
                organism = data[2]
                synonyms = data[3].split(';') if data[3] != "None" else [""]
                cell_lines = data[4].split(';')
                subunits = data[5].split(';')
                evidences = data[7].split(';')
                processes = data[8].split(';')
                pubmedid = data[14]

                if organism == "Human":
                    #ID name organism synonyms source
                    if name not in names:
                        entities.add((identifier, name, "9606",
                                      ",".join(synonyms), "CORUM"))
                        names.add(name)
                    for subunit in subunits:
                        #START_ID END_ID type cell_lines evidences publication source
                        relationships[("Protein", "is_subunit_of")].add(
                            (subunit, identifier, "IS_SUBUNIT_OF",
                             ",".join(cell_lines), ",".join(evidences),
                             pubmedid, "CORUM"))
                    for process in processes:
                        #START_ID END_ID type evidence_type score source
                        relationships["Biological_process",
                                      "associated_with"].add(
                                          (identifier, process,
                                           "ASSOCIATED_WITH", "CURATED", 5,
                                           "CORUM"))

    builder_utils.remove_directory(directory)

    return entities, relationships, entities_header, relationships_headers
Example #28
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="oncokbConfig.yml",
                                      data_type='databases')
    url_actionable = config['OncoKB_actionable_url']
    url_annotation = config['OncoKB_annotated_url']
    amino_acids = config['amino_acids']
    entities_header = config['entities_header']
    relationships_headers = config['relationships_headers']
    mapping = mp.getMappingFromOntology(ontology="Disease", source=None)

    drug_mapping = mp.getMappingForEntity("Drug")
    protein_mapping = mp.getMultipleMappingForEntity("Protein")

    levels = config['OncoKB_levels']
    entities = set()
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "OncoKB")
    builder_utils.checkDirectory(directory)
    acfileName = os.path.join(directory, url_actionable.split('/')[-1])
    anfileName = os.path.join(directory, url_annotation.split('/')[-1])
    if download:
        builder_utils.downloadDB(url_actionable, directory)
        builder_utils.downloadDB(url_annotation, directory)

    variant_regex = r"(\D\d+\D)$"
    with open(anfileName, 'r', errors='replace') as variants:
        first = True
        for line in variants:
            if first:
                first = False
                continue
            data = line.rstrip("\r\n").split("\t")
            gene = data[3]
            variant = data[4]
            oncogenicity = data[5]
            effect = data[6]
            if gene in protein_mapping:
                for protein in protein_mapping[gene]:
                    match = re.search(variant_regex, variant)
                    if match:
                        if variant[0] in amino_acids and variant[
                                -1] in amino_acids:
                            valid_variant = protein + '_p.' + amino_acids[
                                variant[0]] + ''.join(
                                    variant[1:-1]) + amino_acids[variant[-1]]
                            entities.add(
                                (valid_variant, "Clinically_relevant_variant",
                                 "", "", "", "", "", effect, oncogenicity))

    with open(acfileName, 'r', errors='replace') as associations:
        first = True
        for line in associations:
            if first:
                first = False
                continue
            data = line.rstrip("\r\n").split("\t")
            isoform = data[1]
            gene = data[3]
            variant = data[5]
            disease = data[6]
            level = data[7]
            drugs = data[8].split(', ')
            pubmed_ids = data[9].split(',')
            if level in levels:
                level = levels[level]

            valid_variants = []
            if gene in protein_mapping:
                for protein in protein_mapping[gene]:
                    match = re.search(variant_regex, variant)
                    if match:
                        if variant[0] in amino_acids and variant[
                                -1] in amino_acids:
                            valid_variants.append(protein + '_p.' +
                                                  amino_acids[variant[0]] +
                                                  ''.join(variant[1:-1]) +
                                                  amino_acids[variant[-1]])
            for drug in drugs:
                for d in drug.split(' + '):
                    if d.lower() in drug_mapping:
                        drug = drug_mapping[d.lower()]
                        relationships["targets"].add(
                            (drug, gene, "CURATED_TARGETS", "curated", "NA",
                             "NA", "curated", "OncoKB"))
                        for valid_variant in valid_variants:
                            relationships[
                                "targets_clinically_relevant_variant"].add(
                                    (drug, valid_variant,
                                     "TARGETS_KNOWN_VARIANT", level[0],
                                     level[1], disease, "curated", "OncoKB"))
            for valid_variant in valid_variants:
                if disease.lower() in mapping:
                    disease = mapping[disease.lower()]
                    relationships["associated_with"].add(
                        (valid_variant, disease, "ASSOCIATED_WITH", "curated",
                         "curated", "OncoKB", len(pubmed_ids)))
                relationships["known_variant_is_clinically_relevant"].add(
                    (valid_variant, valid_variant,
                     "KNOWN_VARIANT_IS_CLINICALLY_RELEVANT", "OncoKB"))

    builder_utils.remove_directory(directory)

    return (entities, relationships, entities_header, relationships_headers)
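The variant regex only accepts single-letter substitutions at the end of the string (e.g. 'V600E'), which are then expanded through the amino_acids map into the protein_p.Xxx123Yyy form; a sketch with a hypothetical two-entry excerpt of the map and a made-up protein accession:

import re

amino_acids = {'V': 'Val', 'E': 'Glu'}   # hypothetical excerpt of config['amino_acids']
variant, protein = "V600E", "P15056"     # made-up inputs
match = re.search(r"(\D\d+\D)$", variant)
if match and variant[0] in amino_acids and variant[-1] in amino_acids:
    print(protein + '_p.' + amino_acids[variant[0]]
          + variant[1:-1] + amino_acids[variant[-1]])
# P15056_p.Val600Glu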
Example #29
import os

import config.ckg_config as ckg_config
import ckg_utils
from graphdb_connector import connector
from graphdb_builder import builder_utils
from graphdb_builder.builder import loader
from graphdb_builder.experiments import experiments_controller as eh

log_config = ckg_config.report_manager_log
logger = builder_utils.setup_logging(log_config, key="project_creation")

cwd = os.path.abspath(os.path.dirname(__file__))
experimentDir = os.path.join(cwd, '../../../data/experiments')
importDir = os.path.join(cwd, '../../../data/imports/experiments')

try:
    config = builder_utils.get_config(config_name="clinical.yml",
                                      data_type='experiments')
except Exception as err:
    logger.error("Reading configuration > {}.".format(err))


def get_project_creation_queries():
    """
    Reads the YAML file containing the queries relevant to user creation, parses the given stream and \
    returns a Python object (dict[dict]).

    :return: Nested dictionary.
    """
    try:
        cwd = os.path.abspath(os.path.dirname(__file__))
        queries_path = "../queries/project_creation_cypher.yml"
        project_creation_cypher = ckg_utils.get_queries(
            os.path.join(cwd, queries_path))
        return project_creation_cypher
    except Exception as err:
        # Completion assumed from the surrounding context: the snippet is truncated here
        logger.error("Reading project creation queries > {}.".format(err))
Example #30
def parser(databases_directory, download=True):
    config = builder_utils.get_config(config_name="refseqConfig.yml",
                                      data_type='databases')
    url = config['refseq_url']
    ftp_dir = config['refseq_ftp_dir']
    entities = defaultdict(set)
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "RefSeq")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    headers = config['headerEntities']
    taxid = 9606

    if download:
        file_dir = builder_utils.list_ftp_directory(ftp_dir)[0]
        new_file = file_dir.split('/')[-1] + "_feature_table.txt.gz"
        url = ftp_dir + file_dir.split('/')[-1] + "/" + new_file
        builder_utils.downloadDB(url, directory)
        fileName = os.path.join(directory, new_file)

    if os.path.isfile(fileName):
        df = builder_utils.read_gzipped_file(fileName)
        first = True
        for line in df:
            if first:
                first = False
                continue
            data = line.rstrip("\r\n").split("\t")
            tclass = data[1]
            assembly = data[2]
            chrom = data[5]
            geneAcc = data[6]
            start = data[7]
            end = data[8]
            strand = data[9]
            protAcc = data[10]
            name = data[13]
            symbol = data[14]

            if protAcc != "":
                entities["Transcript"].add(
                    (protAcc, "Transcript", name, tclass, assembly, taxid))
                if chrom != "":
                    entities["Chromosome"].add(
                        (chrom, "Chromosome", chrom, taxid))
                    relationships["LOCATED_IN"].add(
                        (protAcc, chrom, "LOCATED_IN", start, end, strand,
                         "RefSeq"))
                if symbol != "":
                    relationships["TRANSCRIBED_INTO"].add(
                        (symbol, protAcc, "TRANSCRIBED_INTO", "RefSeq"))
            elif geneAcc != "":
                entities["Transcript"].add(
                    (geneAcc, "Transcript", name, tclass, assembly, taxid))
                if chrom != "":
                    entities["Chromosome"].add(
                        (chrom, "Chromosome", chrom, taxid))
                    relationships["LOCATED_IN"].add(
                        (protAcc, chrom, "LOCATED_IN", start, end, strand,
                         "RefSeq"))
        df.close()

    builder_utils.remove_directory(directory)

    return (entities, relationships, headers)
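Most of the database parsers above share one return contract: entities, relationships, and their headers. A hedged driver sketch tying it together, reusing builder_utils.write_entities as called in Example #7; the output file name is illustrative:

import os

def import_hgnc(databases_directory, import_directory):
    # Example #23's parser returns (entities, entities_header)
    entities, header = parser(databases_directory, download=True)
    builder_utils.write_entities(entities, header,
                                 os.path.join(import_directory, "Gene.tsv"))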