def parser(projectId):
    """Parse WES (whole-exome sequencing) data for a project.

    Reads the experiment configuration, resolves the project's WES data
    directory, parses every sample dataset and extracts variant, sample,
    gene and chromosome relationship tables plus the somatic-mutation
    entity table.

    :param str projectId: internal project identifier.
    :return: dict keyed by (dataset_name, 'w') tuples with the extracted
        rows/DataFrames as values.
    """
    data = {}
    config = builder_utils.get_config(config_name="wes.yml", data_type='experiments')
    directory = '../../../data/experiments/PROJECTID/wes/'
    if 'directory' in config:
        directory = config['directory']
    directory = directory.replace('PROJECTID', projectId)
    wes_data = parseWESDataset(projectId, config, directory)
    if wes_data is not None:
        somatic_mutations = pd.DataFrame()
        for sample in wes_data:
            entities, variantRows, sampleRows, geneRows, chrRows = extractWESRelationships(wes_data[sample], config)
            # NOTE(review): these keys are overwritten on every iteration, so
            # only the last sample's relationship rows survive — confirm this
            # is intended for multi-sample projects.
            data[('somatic_mutation_known_variant', 'w')] = variantRows
            data[('somatic_mutation_sample', 'w')] = sampleRows
            data[('somatic_mutation_gene', 'w')] = geneRows
            data[('somatic_mutation_chromosome', 'w')] = chrRows
            if somatic_mutations.empty:
                somatic_mutations = entities
            else:
                # Only add variants not yet seen in previous samples.
                new = set(entities.index).difference(set(somatic_mutations.index))
                # DataFrame.append was removed in pandas 2.0; pd.concat is the
                # supported equivalent with identical semantics here.
                somatic_mutations = pd.concat([somatic_mutations, entities.loc[new, :]], ignore_index=False)
        somatic_mutations = somatic_mutations.reset_index()
        data[('somatic_mutation', 'w')] = somatic_mutations
    return data
def parser(databases_directory, download=True):
    """Parse the SMPDB database into pathway entities and relationships.

    Downloads (optionally) each configured SMPDB archive and dispatches its
    contents to the pathway, protein or metabolite/drug sub-parsers.

    :param str databases_directory: root directory where databases are stored.
    :param bool download: whether to download the source archives first.
    :return: (entities, relationships, entities_header, relationships_headers)
    """
    config = builder_utils.get_config(config_name="smpdbConfig.yml", data_type='databases')
    urls = config['smpdb_urls']
    entities_header = config['pathway_header']
    relationships_headers = config['relationships_header']
    entities = set()
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "SMPDB")
    builder_utils.checkDirectory(directory)
    for dataset, url in urls.items():
        if download:
            builder_utils.downloadDB(url, directory)
        zipped_file = os.path.join(directory, url.split('/')[-1])
        with zipfile.ZipFile(zipped_file) as rf:
            if dataset == "pathway":
                entities = parsePathways(config, rf)
            elif dataset == "protein":
                relationships.update(parsePathwayProteinRelationships(rf))
            elif dataset == "metabolite":
                relationships.update(parsePathwayMetaboliteDrugRelationships(rf))
    builder_utils.remove_directory(directory)
    return entities, relationships, entities_header, relationships_headers
def parser(databases_directory, download=True):
    """Parse the SIGNOR database into modified-protein entities and
    substrate relationships.

    :param str databases_directory: root directory where databases are stored.
    :param bool download: whether to download the source file first.
    :return: (entities, relationships, entities_header, relationships_headers)
    """
    config = builder_utils.get_config(config_name="signorConfig.yml", data_type='databases')
    directory = os.path.join(databases_directory, "SIGNOR")
    builder_utils.checkDirectory(directory)
    url = config['url']
    modifications = config['modifications']
    amino_acids = config['amino_acids']
    accronyms = config['accronyms']
    entities_header = config['entities_header']
    relationships_headers = config['rel_headers']
    filename = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    # parse_substrates builds both collections, so the original placeholder
    # initialisations (empty set / defaultdict) were dead code and removed.
    entities, relationships = parse_substrates(filename, modifications, accronyms, amino_acids)
    return entities, relationships, entities_header, relationships_headers
def parser(projectId, type='clinical'):
    """Parse project, experimental-design or clinical data for a project.

    Resolves the relevant data directory (default or config-overridden,
    with PROJECTID substituted) and dispatches to the matching sub-parser.

    :param str projectId: internal project identifier.
    :param str type: one of 'project', 'experimental_design' or 'clinical'.
    :return: dict of parsed DataFrames.
    """
    data = {}
    cwd = os.path.abspath(os.path.dirname(__file__))
    config = builder_utils.get_config(config_name="clinical.yml", data_type='experiments')
    separator = config["separator"]
    # Default locations relative to this module; each may be overridden
    # via the configuration file.
    template = '../../../../data/experiments/PROJECTID/{}/'
    project_directory = os.path.join(cwd, template.format('project'))
    clinical_directory = os.path.join(cwd, template.format('clinical'))
    design_directory = os.path.join(cwd, template.format('experimental_design'))
    if 'project_directory' in config:
        project_directory = os.path.join(cwd, config['project_directory'])
    if 'clinical_directory' in config:
        clinical_directory = os.path.join(cwd, config['clinical_directory'])
    if 'design_directory' in config:
        design_directory = os.path.join(cwd, config['design_directory'])
    project_directory = project_directory.replace('PROJECTID', projectId)
    clinical_directory = clinical_directory.replace('PROJECTID', projectId)
    design_directory = design_directory.replace('PROJECTID', projectId)
    if type == 'project':
        data.update(project_parser(projectId, config, project_directory, separator))
    elif type == 'experimental_design':
        data.update(experimental_design_parser(projectId, config, design_directory))
    elif type == 'clinical':
        data.update(clinical_parser(projectId, config, clinical_directory, separator))
    return data
def parser(databases_directory, download=True):
    """Parse DisGeNet gene-disease and variant-disease association files.

    Maps gene/variant identifiers to proteins/transcripts and diseases to
    DOID codes, producing ASSOCIATED_WITH relationships.

    :param str databases_directory: root directory where databases are stored.
    :param bool download: whether to download the source files first.
    :return: (relationships, header, output_file)
    """
    relationships = defaultdict(set)
    config = builder_utils.get_config(config_name="disgenetConfig.yml", data_type='databases')
    files = config['disgenet_files']
    mapping_files = config['disgenet_mapping_files']
    url = config['disgenet_url']
    directory = os.path.join(databases_directory, "disgenet")
    builder_utils.checkDirectory(directory)
    header = config['disgenet_header']
    output_file = 'disgenet_associated_with.tsv'
    if download:
        for f in files:
            builder_utils.downloadDB(url + files[f], directory)
        for f in mapping_files:
            builder_utils.downloadDB(url + mapping_files[f], directory)
    proteinMapping = readDisGeNetProteinMapping(config, directory)
    diseaseMapping = readDisGeNetDiseaseMapping(config, directory)
    for f in files:
        first = True
        dtype, atype = f.split('_')
        if dtype == 'gene':
            idType = "Protein"
            scorePos = 9
        elif dtype == 'variant':
            idType = "Transcript"
            scorePos = 5
        else:
            # Guard: an unexpected dataset key previously left idType and
            # scorePos unbound (NameError); skip such datasets instead.
            continue
        # Context manager ensures the gzip handle is closed even if
        # parsing raises (the original only closed it on success).
        with gzip.open(os.path.join(directory, files[f]), 'r') as associations:
            for line in associations:
                if first:
                    first = False
                    continue
                try:
                    data = line.decode('utf-8').rstrip("\r\n").split("\t")
                    geneId = str(int(data[0]))
                    #disease_specificity_index = data[2]
                    #disease_pleiotropy_index = data[3]
                    diseaseId = data[4]
                    score = float(data[scorePos])
                    pmids = data[13]
                    source = data[-1]
                    if geneId in proteinMapping:
                        for identifier in proteinMapping[geneId]:
                            if diseaseId in diseaseMapping:
                                for code in diseaseMapping[diseaseId]:
                                    code = "DOID:" + code
                                    relationships[idType].add((identifier, code, "ASSOCIATED_WITH", score, atype, "DisGeNet: " + source, pmids))
                except UnicodeDecodeError:
                    continue
    builder_utils.remove_directory(directory)
    return (relationships, header, output_file)
def parser(databases_directory):
    """Parse the PhosphoSitePlus database files.

    Builds modified-protein (site) entities from the configured site files
    and disease, biological-process and kinase-substrate relationships from
    the annotation files.

    :param str databases_directory: root directory where databases are stored.
    :return: (entities, relationships, entities_header, relationships_headers)
    """
    directory = os.path.join(databases_directory, "PhosphoSitePlus")
    builder_utils.checkDirectory(directory)
    config = builder_utils.get_config(config_name="pspConfig.yml", data_type='databases')
    modifications = config['modifications']
    annotation_files = config['annotation_files']
    entities_header = config['entities_header']
    relationships_headers = config['rel_headers']
    entities = set()
    relationships = defaultdict(set)
    for site_file in config['site_files']:
        file_name = os.path.join(directory, site_file)
        with gzip.open(file_name, 'r') as f:
            sites, site_relationships = parseSites(f, modifications)
            entities.update(sites)
            # merge per-file relationship sets into the accumulated dict
            for r in site_relationships:
                relationships[r].update(site_relationships[r])
    for er in annotation_files:
        # annotation-file keys are '<entity>-<relationship_type>'
        entity, relationship_type = er.split('-')
        file_name = os.path.join(directory, annotation_files[er])
        with gzip.open(file_name, 'r') as f:
            if entity == "disease":
                mapping = mp.getMappingFromOntology(ontology="Disease", source=None)
                relationships[(entity, relationship_type)].update(parseDiseaseAnnotations(f, modifications, mapping))
            elif entity == "biological_process":
                mapping = mp.getMappingFromOntology(ontology="Gene_ontology", source=None)
                relationships[(entity, relationship_type)].update(parseRegulationAnnotations(f, modifications, mapping))
            elif entity == "substrate":
                # note: assignment (not update) — replaces any prior value
                relationships[(entity, relationship_type)] = parseKinaseSubstrates(f, modifications)
    return entities, relationships, entities_header, relationships_headers
def parser(databases_directory, importDirectory, download=True):
    """Parse the Jensenlab text-mining publication data.

    Downloads the organism publications file, writes publication entities
    directly to the import directory, then parses the configured mention
    types and finally removes the working text-mining directory.

    :param str databases_directory: root directory where databases are stored.
    :param str importDirectory: directory where import files are written.
    :param bool download: whether to download the source files first.
    :return: (num_entities, outputfile) — number of publication entities
        written and the path of the output file.
    """
    config = builder_utils.get_config(config_name="jensenlabConfig.yml", data_type='databases')
    outputfileName = "Publications.tsv"
    url = config['db_url']
    ifile = config['organisms_file']
    organisms = str(config['organisms'])
    directory = os.path.join(databases_directory, "Jensenlab")
    builder_utils.checkDirectory(os.path.join(directory, "textmining"))
    if download:
        # FILE placeholder in the URL is replaced by the organisms file name
        builder_utils.downloadDB(url.replace("FILE", ifile), os.path.join(directory, "textmining"))
    ifile = os.path.join(directory, os.path.join("textmining", ifile))
    valid_pubs = read_valid_pubs(organisms, ifile)
    entities, header = parse_PMC_list(config, os.path.join(directory, "textmining"), download=download, valid_pubs=valid_pubs)
    num_entities = len(entities)
    outputfile = os.path.join(importDirectory, outputfileName)
    builder_utils.write_entities(entities, header, outputfile)
    # release the (potentially very large) entity list once written
    entities = None
    for qtype in config['db_mentions_types']:
        parse_mentions(config, directory, qtype, importDirectory, download)
    builder_utils.remove_directory(os.path.join(directory, "textmining"))
    return (num_entities, outputfile)
def get_configuration(processing_tool, data_type):
    """Return the proteomics configuration for a tool/data-type pair.

    :param str processing_tool: name of the processing tool (e.g. a search
        engine); may be None.
    :param str data_type: data type key within the tool's configuration.
    :return: configuration dict, or None when the tool or data type is
        not configured.
    """
    if processing_tool is None:
        return None
    config = builder_utils.get_config(config_name="proteomics.yml", data_type='experiments')
    if processing_tool not in config:
        return None
    tool_configuration = config[processing_tool]
    if data_type not in tool_configuration:
        return None
    return tool_configuration[data_type]
def get_dataset_configuration(processing_format, data_type):
    """Return the configuration for a processing format, optionally
    narrowed to a specific data type.

    :param str processing_format: top-level key in the proteomics config.
    :param str data_type: sub-key to select; when None the whole
        format-level configuration is returned.
    :return: configuration dict (empty when nothing matches).
    """
    config = builder_utils.get_config(config_name="proteomics.yml", data_type='experiments')
    if processing_format not in config:
        return {}
    format_config = config[processing_format]
    if data_type is None:
        return format_config
    return format_config[data_type] if data_type in format_config else {}
def parser(databases_directory):
    """Parse the DrugBank database into drug entities and relationships.

    :param str databases_directory: root directory where databases are stored.
    :return: (entities, relationships, entities_header, relationships_headers)
    """
    config = builder_utils.get_config(config_name="drugBankConfig.yml", data_type='databases')
    directory = os.path.join(databases_directory, "DrugBank")
    builder_utils.checkDirectory(directory)
    drugs = extract_drugs(config, directory)
    # Persist the drug dictionary before deriving graph structures from it.
    build_DrugBank_dictionary(config, directory, drugs)
    relationships = build_relationships_from_DrugBank(config, drugs)
    entities, attributes = build_drug_entity(config, drugs)
    relationships_headers = config['relationships_headers']
    entities_header = ['ID'] + attributes
    return entities, relationships, entities_header, relationships_headers
def parser(databases_directory, download=True):
    """Parse Jensenlab integrated association pairs for each configured
    database type.

    :param str databases_directory: root directory where databases are stored.
    :param bool download: whether to download the STRING mapping first.
    :return: dict mapping each db type to (relationships, header, outputfileName).
    """
    config = builder_utils.get_config(config_name="jensenlabConfig.yml", data_type='databases')
    string_mapping = mp.getSTRINGMapping(download=download)
    header = config['header']
    result = {}
    for qtype, (entity1, entity2) in config['db_types'].items():
        relationships = parsePairs(config, databases_directory, qtype, string_mapping)
        outputfileName = "{}_{}_associated_with_integrated.tsv".format(entity1, entity2)
        result[qtype] = (relationships, header, outputfileName)
    return result
def parser(databases_directory, download=True):
    """Parse the GWAS Catalog associations file.

    Extracts GWAS_study entities plus published-in, variant-found-in and
    studies-trait relationships from the tab-separated catalog.

    :param str databases_directory: root directory where databases are stored.
    :param bool download: whether to download the catalog file first.
    :return: (entities, relationships, entities_header, relationships_header)
    """
    config = builder_utils.get_config(config_name="gwasCatalogConfig.yml", data_type='databases')
    url = config['GWASCat_url']
    entities_header = config['entities_header']
    relationships_header = config['relationships_header']
    entities = set()
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "GWAScatalog")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    with open(fileName, 'r', encoding="utf-8") as catalog:
        for line in catalog:
            data = line.rstrip("\r\n").split("\t")
            # rows shorter than 37 columns lack the study accession; skip
            if len(data) > 36:
                pubmedid = data[1]
                date = data[3]
                title = data[6]
                sample_size = data[8]
                replication_size = data[9]
                #chromosome = data[11]
                #position = data[12]
                #genes_mapped = data[14].split(" - ")
                # only the leading rsID of possibly hyphenated entries
                snp_id = data[20].split('-')[0]
                freq = data[26]
                pval = data[27]
                odds_ratio = data[30]
                trait = data[34]
                exp_factor = data[35]
                study = data[36]
                entities.add((study, "GWAS_study", title, date, sample_size, replication_size, trait))
                if pubmedid != "":
                    relationships["published_in_publication"].add(
                        (study, pubmedid, "PUBLISHED_IN", "GWAS Catalog"))
                if snp_id != "":
                    # strip leading/trailing non-word chars from the rsID
                    relationships["variant_found_in_gwas"].add(
                        (re.sub(r"^\W+|\W+$", "", snp_id), study, "VARIANT_FOUND_IN_GWAS", freq, pval, odds_ratio, trait, "GWAS Catalog"))
                if exp_factor != "":
                    # EFO URI -> CURIE (last path segment, '_' -> ':')
                    exp_factor = exp_factor.split('/')[-1].replace('_', ':')
                    relationships["studies_trait"].add(
                        (study, exp_factor, "STUDIES_TRAIT", "GWAS Catalog"))
    builder_utils.remove_directory(directory)
    return (entities, relationships, entities_header, relationships_header)
def parser(projectId, type='proteomics', directory=None):
    """Parse proteomics experiment files for a project.

    :param str projectId: internal project identifier.
    :param str type: experiment subdirectory name (e.g. 'proteomics').
    :param str directory: optional explicit data directory; when None the
        default (or configured) project directory is used, with PROJECTID
        substituted.
    :return: dict of parsed data keyed by dataset.
    """
    cwd = os.path.abspath(os.path.dirname(__file__))
    config = builder_utils.get_config(config_name="proteomics.yml", data_type='experiments')
    if directory is None:
        directory = os.path.join(cwd, '../../../../data/experiments/PROJECTID/' + type)
        if 'directory' in config:
            directory = os.path.join(cwd, config['directory'] + type)
        directory = directory.replace('PROJECTID', projectId)
    # The original initialised `data = {}` and kept commented-out dead
    # code; both removed — parse_from_directory supplies the result.
    data = parse_from_directory(projectId, directory, config)
    return data
def get_STRING_mapping_url(db="STRING"):
    """
    Get the url for downloading the mapping file from either STRING or STITCH

    :param str db: Which database to get the url from: STRING or STITCH
    :return: url where to download the mapping file
    """
    config = builder_utils.get_config(config_name="stringConfig.yml", data_type='databases')
    database = db.upper()
    if database == "STRING":
        return config['STRING_mapping_url']
    if database == "STITCH":
        return config['STITCH_mapping_url']
    return None
def parser(databases_directory, download=True):
    """Parse the FooDB database into food entities and content relationships.

    Downloads (optionally) and extracts the FooDB tarball, parses the
    Content, Food and Compound tables, builds food->compound HAS_CONTENT
    relationships and writes the food-name mapping file.

    :param str databases_directory: root directory where databases are stored.
    :param bool download: whether to download the source tarball first.
    :return: (food, relationships, entities_header, relationships_headers)
    :raises Exception: if the tarball cannot be read.
    """
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "FooDB")
    builder_utils.checkDirectory(directory)
    config = builder_utils.get_config(config_name="foodbConfig.yml", data_type='databases')
    database_url = config['database_url']
    entities_header = config['entities_header']
    relationships_headers = config['relationships_headers']
    tar_fileName = os.path.join(directory, database_url.split('/')[-1])
    if download:
        builder_utils.downloadDB(database_url, directory)
    contents = {}
    food = set()
    compounds = {}
    # Initialised so a config without Food.csv cannot leave it unbound
    # when the mapping file is written below.
    mapping = {}
    try:
        # Context manager guarantees the tar handle is closed even if
        # extraction fails (the original leaked it on error).
        with tarfile.open(tar_fileName, 'r') as tf:
            file_content = tf.getnames()
            tar_dir = file_content[1]
            tf.extractall(path=directory)
        for file_name in config['files']:
            path = os.path.join(directory, os.path.join(tar_dir, file_name))
            with open(path, 'r', encoding="utf-8", errors='replace') as f:
                if file_name == "Content.csv":
                    contents = parseContents(f)
                elif file_name == "Food.csv":
                    food, mapping = parseFood(f)
                elif file_name == "Compound.csv":
                    compounds = parseCompounds(f)
        for food_id, compound_id in contents:
            if compound_id in compounds:
                # normalise legacy HMDB accessions to the current width
                compound_code = compounds[compound_id].replace("HMDB", "HMDB00")
                relationships[("food", "has_content")].add((food_id, compound_code, "HAS_CONTENT") + contents[(food_id, compound_id)])
        mp.reset_mapping(entity="Food")
        with open(os.path.join(directory, "mapping.tsv"), 'w', encoding='utf-8') as out:
            for food_id in mapping:
                for alias in mapping[food_id]:
                    out.write(str(food_id) + "\t" + str(alias) + "\n")
        mp.mark_complete_mapping(entity="Food")
    except tarfile.ReadError as err:
        # chain the original error for easier debugging
        raise Exception("Error importing database FooDB.\n {}".format(err)) from err
    builder_utils.remove_directory(directory)
    return food, relationships, entities_header, relationships_headers
def update_configuration(data_type, processing_tool, value_col='LFQ intensity', columns=None):
    """Return the tool/data-type configuration extended with extra columns
    and the value column to use.

    :param str data_type: data type key within the tool's configuration.
    :param str processing_tool: name of the processing tool; may be None.
    :param str value_col: name of the quantification value column.
    :param list columns: extra column names to append to the configured
        columns; defaults to no extra columns.
    :return: configuration dict (empty when the tool/data type is unknown).
    """
    # Fix: the original used a mutable default argument (columns=[]), which
    # is shared across calls; use the None-sentinel idiom instead.
    if columns is None:
        columns = []
    configuration = {}
    if processing_tool is not None:
        config = builder_utils.get_config(config_name="proteomics.yml", data_type='experiments')
        if processing_tool in config:
            tool_configuration = config[processing_tool]
            if data_type in tool_configuration:
                configuration = tool_configuration[data_type]
                configuration['columns'].extend(columns)
                configuration['valueCol'] = value_col
    return configuration
def parser(databases_directory, download=True):
    """Parse the HMDB database into metabolite entities and relationships.

    :param str databases_directory: root directory where databases are stored.
    :param bool download: whether to download the source files first.
    :return: (entities, relationships, entities_header, relationships_header)
    """
    config = builder_utils.get_config(config_name="hmdbConfig.yml", data_type='databases')
    hmdb_directory = os.path.join(databases_directory, "HMDB")
    builder_utils.checkDirectory(hmdb_directory)
    metabolites = extract_metabolites(config, hmdb_directory, download)
    # Disease terms first, then extended with tissue terms.
    ontology_mapping = mp.getMappingFromOntology(ontology="Disease", source=config['HMDB_DO_source'])
    ontology_mapping.update(mp.getMappingFromOntology(ontology="Tissue", source=None))
    entities, attributes = build_metabolite_entity(config, hmdb_directory, metabolites)
    relationships = build_relationships_from_HMDB(config, metabolites, ontology_mapping)
    entities_header = ['ID'] + attributes
    relationships_header = config['relationships_header']
    #builder_utils.remove_directory(directory)
    return (entities, relationships, entities_header, relationships_header)
def parser(databases_directory, download=True):
    """Parse the IntAct MutationDs dataset into curated
    variant-affects-interaction relationships (human entries only).

    :param str databases_directory: root directory where databases are stored.
    :param bool download: whether to download the source file first.
    :return: (relationships, header, output_file_name)
    """
    config = builder_utils.get_config(config_name="mutationDsConfig.yml", data_type='databases')
    header = config['header']
    output_file_name = "mutation_curated_affects_interaction_with.tsv"
    # captures the interactor identifier inside ':<id>(' tokens
    regex = r":(\w+)\("
    url = config['mutations_url']
    directory = os.path.join(databases_directory, "MutationDs")
    builder_utils.checkDirectory(directory)
    file_name = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    relationships = set()
    with open(file_name, 'r') as mf:
        next(mf, None)  # skip the header row
        for line in mf:
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) > 12:
                internal_id = fields[0]
                pvariant = fields[1]
                effect = fields[5]
                protein = fields[7].split(':')
                organism = fields[10]
                interaction = fields[11]
                evidence = fields[12]
                # keep human (taxid 9606) rows with a parsable protein id
                if organism.startswith("9606") and len(protein) > 1:
                    pvariant = protein[1] + "_" + pvariant
                    for match in re.finditer(regex, interaction):
                        interactor = match.group(1)
                        relationships.add((pvariant, interactor, "CURATED_AFFECTS_INTERACTION_WITH", effect, interaction, evidence, internal_id, "Intact-MutationDs"))
    builder_utils.remove_directory(directory)
    return (relationships, header, output_file_name)
def parser(databases_dir, download=True):
    """Parse GOA (Gene Ontology Annotation) associations, restricted to
    proteins known to the mapping service.

    :param str databases_dir: root directory where databases are stored.
    :param bool download: whether to download the annotation file first.
    :return: (annotations, rel_header)
    """
    config = builder_utils.get_config(config_name="goaConfig.yml", data_type='databases')
    url = config['url']
    rel_header = config['header']
    protein_mapping = mp.getMappingForEntity(entity="Protein")
    # NOTE(review): `.values` is accessed without parentheses — this only
    # works if the mapping is a pandas Series/DataFrame-like object; for a
    # plain dict it would have to be `.values()`. Confirm against
    # mp.getMappingForEntity.
    valid_proteins = list(set(protein_mapping.values))
    directory = os.path.join(databases_dir, "GOA")
    builder_utils.checkDirectory(directory)
    file_name = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    annotations = parse_annotations_with_pandas(file_name, valid_proteins)
    builder_utils.remove_directory(directory)
    return annotations, rel_header
def parser(databases_directory, download=True):
    """Parse the DGIdb drug-gene interaction file into TARGETS relationships.

    :param str databases_directory: root directory where databases are stored.
    :param bool download: whether to download the source file first.
    :return: (relationships, header, output_file)
    """
    config = builder_utils.get_config(
        config_name="drugGeneInteractionDBConfig.yml", data_type='databases')
    url = config['DGIdb_url']
    header = config['header']
    output_file = "dgidb_targets.tsv"
    drugmapping = mp.getMappingForEntity("Drug")
    relationships = set()
    directory = os.path.join(databases_directory, "DGIdb")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    with open(fileName, 'r', encoding='utf-8') as associations:
        first = True
        for line in associations:
            if first:
                first = False
                continue
            data = line.rstrip("\r\n").split("\t")
            gene = data[0]
            source = data[3]
            interactionType = data[4] if data[4] != '' else 'unknown'
            drug = data[8].lower()
            if drug == "":
                drug = data[7]
                # NOTE(review): the `else: continue` skips the row whenever
                # the first fallback (column 7) is non-empty — i.e. a row is
                # only kept via this path when both columns 7 and 8 are empty
                # and column 6 is not. Verify this is the intended fallback
                # order against the DGIdb TSV column layout.
                if drug == "" and data[6] != "":
                    drug = data[6]
                else:
                    continue
            if gene != "":
                # replace the raw drug name with its mapped identifier when known
                if drug in drugmapping:
                    drug = drugmapping[drug]
                relationships.add((drug, gene, "TARGETS", "NA", "NA", "NA", interactionType, "DGIdb: " + source))
    builder_utils.remove_directory(directory)
    return (relationships, header, output_file)
def parser(databases_directory, download=True):
    """Parse Pathway Commons pathways and protein-pathway annotations.

    Keeps only human (taxid 9606) pathways, producing Pathway entities and
    ANNOTATED_IN_PATHWAY relationships.

    :param str databases_directory: root directory where databases are stored.
    :param bool download: whether to download the source file first.
    :return: (entities, relationships, entities_header, relationships_header)
    """
    config = builder_utils.get_config(config_name="pathwayCommonsConfig.yml", data_type='databases')
    url = config['pathwayCommons_pathways_url']
    entities = set()
    relationships = set()
    directory = os.path.join(databases_directory, "PathwayCommons")
    builder_utils.checkDirectory(directory)
    fileName = url.split('/')[-1]
    entities_header = config['pathways_header']
    relationships_header = config['relationships_header']
    if download:
        builder_utils.downloadDB(url, directory)
    f = os.path.join(directory, fileName)
    associations = gzip.open(f, 'r')
    for line in associations:
        data = line.decode('utf-8').rstrip("\r\n").split("\t")
        linkout = data[0]
        # pathway code is the last path segment of the linkout URI
        code = data[0].split("/")[-1]
        # second column holds '; '-separated 'key: value' metadata pairs
        ptw_dict = dict([item.split(": ")[0], ":".join(item.split(": ")[1:])] for item in data[1].split("; "))
        proteins = data[2:]
        # human pathways only
        if "organism" in ptw_dict and ptw_dict["organism"] == "9606":
            name = ptw_dict["name"]
            source = ptw_dict["datasource"]
        else:
            continue
        entities.add((code, "Pathway", name, name, ptw_dict["organism"], source, linkout))
        for protein in proteins:
            relationships.add((protein, code, "ANNOTATED_IN_PATHWAY", linkout, "PathwayCommons: " + source))
    associations.close()
    builder_utils.remove_directory(directory)
    return (entities, relationships, entities_header, relationships_header)
def parser(databases_directory, download=True):
    """Parse Exposome-Explorer biomarker and correlation files into
    correlation relationships.

    :param str databases_directory: root directory where databases are stored.
    :param bool download: whether to download the source archives first.
    :return: (correlations, relationships_header)
    """
    directory = os.path.join(databases_directory, "ExposomeExplorer")
    builder_utils.checkDirectory(directory)
    config = builder_utils.get_config(config_name="exposomeConfig.yml", data_type='databases')
    database_urls = config['database_urls']
    relationships_header = config['relationships_header']
    mapping = mp.getMappingForEntity("Food")
    correlations = {}
    # NOTE(review): parsing correlations.csv uses `biomarkers`, which is
    # only bound after biomarkers.csv has been processed — the URL order in
    # exposomeConfig.yml must list biomarkers before correlations, otherwise
    # this raises NameError. Verify the config ordering.
    for url in database_urls:
        zipped_fileName = os.path.join(directory, url.split('/')[-1])
        # file name inside the archive: first two dot-separated components
        file_name = '.'.join(url.split('/')[-1].split('.')[0:2])
        if download:
            builder_utils.downloadDB(url, directory)
        with zipfile.ZipFile(zipped_fileName) as z:
            if file_name == "biomarkers.csv":
                biomarkers = parseBiomarkersFile(z, file_name)
            elif file_name == "correlations.csv":
                correlations = parseCorrelationsFile(z, file_name, biomarkers, mapping)
    builder_utils.remove_directory(directory)
    return correlations, relationships_header
def parser(databases_directory, download=True):
    """Parse the HGNC complete gene set into approved Gene entities.

    :param str databases_directory: root directory where databases are stored.
    :param bool download: whether to download the source file first.
    :return: (entities, entities_header)
    """
    config = builder_utils.get_config(config_name="hgncConfig.yml", data_type='databases')
    url = config['hgnc_url']
    directory = os.path.join(databases_directory, "HGNC")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    taxid = 9606
    entities = set()
    entities_header = config['header']
    if download:
        builder_utils.downloadDB(url, directory)
    with open(fileName, 'r', encoding="utf-8") as df:
        next(df, None)  # skip the column-header row
        for line in df:
            fields = line.rstrip("\r\n").split("\t")
            geneSymbol = fields[1]
            geneName = fields[2]
            status = fields[5]
            geneFamily = fields[12]
            synonyms = fields[18:23]
            transcript = fields[23]
            # keep only HGNC-approved symbols
            if status != "Approved":
                continue
            entities.add((geneSymbol, "Gene", geneName, geneFamily, ",".join(synonyms), taxid))
            #relationships.add((geneSymbol, transcript, "TRANSCRIBED_INTO"))
    builder_utils.remove_directory(directory)
    return entities, entities_header
def parser(databases_directory, download=True):
    """Parse the Human Protein Atlas pathology file into protein-disease
    expression relationships.

    :param str databases_directory: root directory where databases are stored.
    :param bool download: whether to download the source archive first.
    :return: (pathology, relationships_headers)
    """
    config = builder_utils.get_config(config_name="hpaConfig.yml", data_type='databases')
    url = config['hpa_pathology_url']
    disease_mapping = mp.getMappingFromOntology(ontology="Disease", source=None)
    protein_mapping = mp.getMultipleMappingForEntity("Protein")
    directory = os.path.join(databases_directory, "HPA")
    builder_utils.checkDirectory(directory)
    compressed_fileName = os.path.join(directory, url.split('/')[-1])
    # file name inside the archive: first two dot-separated components
    file_name = '.'.join(url.split('/')[-1].split('.')[0:2])
    relationships_headers = config['relationships_headers']
    if download:
        builder_utils.downloadDB(url, directory)
    with zipfile.ZipFile(compressed_fileName) as z:
        # NOTE(review): `pathology` is only bound when the configured URL
        # really points at pathology.tsv — any other file name makes the
        # return below raise NameError. Confirm hpaConfig.yml.
        if file_name == "pathology.tsv":
            pathology = parsePathologyFile(config, z, file_name, protein_mapping, disease_mapping)
    builder_utils.remove_directory(directory)
    return (pathology, relationships_headers)
def parseActions(databases_directory, importDirectory, proteinMapping, drugMapping=None, download=True, db="STRING"):
    """Parse STRING/STITCH action files and write COMPILED_ACTS_ON
    relationships directly to the import directory.

    :param str databases_directory: root directory where databases are stored.
    :param str importDirectory: directory where the output TSV is written.
    :param dict proteinMapping: STRING id -> protein alias list.
    :param dict drugMapping: STITCH id -> drug alias list (only for STITCH).
    :param bool download: whether to download the actions file first.
    :param str db: 'STRING' or 'STITCH'.
    """
    config = builder_utils.get_config(config_name="stringConfig.yml", data_type='databases')
    url = None
    bool_dict = {'t': True, 'T': True, 'True': True, 'TRUE': True,
                 'f': False, 'F': False, 'False': False, 'FALSE': False}
    header = config['header_actions']
    relationship = "COMPILED_ACTS_ON"
    stored = set()
    if db == "STRING":
        url = config['STRING_actions_url']
        outputfile = os.path.join(importDirectory, "string_protein_acts_on_protein.tsv")
    elif db == "STITCH":
        url = config['STITCH_actions_url']
        outputfile = os.path.join(importDirectory, "stitch_drug_acts_on_protein.tsv")
    directory = os.path.join(databases_directory, db)
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    f = os.path.join(directory, fileName)
    first = True
    # both handles closed deterministically, even on error
    with open(outputfile, 'w') as csvfile, gzip.open(f, 'r') as associations:
        writer = csv.writer(csvfile, delimiter='\t', escapechar='\\', quotechar='"', quoting=csv.QUOTE_ALL)
        writer.writerow(header)
        for line in associations:
            if first:
                first = False
                continue
            data = line.decode('utf-8').rstrip("\r\n").split()
            intA = data[0]
            intB = data[1]
            action = data[2]
            score = float(data[-1]) / 1000  # STRING scores are scaled x1000
            # STITCH action files carry no directionality column
            directionality = bool_dict[data[-3]] if db == "STRING" else True
            if intB in proteinMapping:
                aliasesA = []
                # Fix: the original tested `intA in drugMapping` without a
                # None guard, raising TypeError for STRING runs where
                # drugMapping keeps its default of None.
                if drugMapping is not None and intA in drugMapping:
                    aliasesA = drugMapping[intA]
                elif intA in proteinMapping:
                    aliasesA = proteinMapping[intA]
                for aliasA in aliasesA:
                    for aliasB in proteinMapping[intB]:
                        if (aliasA, aliasB, action) not in stored:
                            writer.writerow((aliasA, aliasB, relationship, action, directionality, score, db))
                            # mark both orientations to deduplicate the
                            # symmetric pair
                            stored.add((aliasA, aliasB, action))
                            stored.add((aliasB, aliasA, action))
def parser(databases_directory, importDirectory, drug_source=None, download=True, db="STRING"):
    """Parse STRING or STITCH association files and write compiled
    interaction relationships directly to the import directory.

    :param str databases_directory: root directory where databases are stored.
    :param str importDirectory: directory where the output TSV is written.
    :param str drug_source: drug identifier source (STITCH only).
    :param bool download: whether to download the source files first.
    :param str db: 'STRING' or 'STITCH'.
    :return: (mapping, drugmapping) — the STRING mapping and, for STITCH,
        the drug mapping used.
    """
    config = builder_utils.get_config(config_name="stringConfig.yml", data_type='databases')
    mapping = mp.getSTRINGMapping(download=False)
    stored = set()
    relationship = None
    cutoff = config['STRING_cutoff']
    header = config['header']
    drugmapping = {}
    if db == "STITCH":
        evidences = ["experimental", "prediction", "database", "textmining", "score"]
        relationship = "COMPILED_INTERACTS_WITH"
        url = config['STITCH_url']
        outputfile = os.path.join(importDirectory, "stitch_associated_with.tsv")
        drugmapping = mp.getSTRINGMapping(source=drug_source, download=download, db=db)
    elif db == "STRING":
        evidences = ["Neighborhood in the Genome", "Gene fusions", "Co-ocurrence across genomes",
                     "Co-expression", "Experimental/biochemical data",
                     "Association in curated databases", "Text-mining"]
        relationship = "COMPILED_TARGETS"
        outputfile = os.path.join(importDirectory, "string_interacts_with.tsv")
        url = config['STRING_url']
    directory = os.path.join(databases_directory, db)
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)
    f = os.path.join(directory, fileName)
    associations = gzip.open(f, 'r')
    first = True
    with open(outputfile, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t', escapechar='\\', quotechar='"', quoting=csv.QUOTE_ALL)
        writer.writerow(header)
        for line in associations:
            if first:
                first = False
                continue
            data = line.decode('utf-8').rstrip("\r\n").split()
            intA = data[0]
            intB = data[1]
            scores = data[2:]
            # scores come scaled x1000; last entry is the combined score
            fscores = [str(float(score) / 1000) for score in scores]
            if db == "STRING":
                if intA in mapping and intB in mapping and float(fscores[-1]) >= cutoff:
                    for aliasA in mapping[intA]:
                        for aliasB in mapping[intB]:
                            if (aliasA, aliasB) not in stored:
                                row = (aliasA, aliasB, relationship, "association", db,
                                       ",".join(evidences), ",".join(fscores[0:-1]), fscores[-1])
                                stored.add((aliasA, aliasB))
                                # Fix: original added (aliasB, aliasB), so the
                                # reverse pair was never marked and symmetric
                                # duplicates could be written.
                                stored.add((aliasB, aliasA))
                                writer.writerow(row)
            elif db == "STITCH":
                if intA in drugmapping and intB in mapping and float(fscores[-1]) >= cutoff:
                    for aliasA in drugmapping[intA]:
                        for aliasB in mapping[intB]:
                            if (aliasA, aliasB) not in stored:
                                row = (aliasA, aliasB, relationship, "association", db,
                                       ",".join(evidences), ",".join(fscores[0:-1]), fscores[-1])
                                stored.add((aliasA, aliasB))
                                # Fix: same (aliasB, aliasB) typo as above.
                                stored.add((aliasB, aliasA))
                                writer.writerow(row)
    associations.close()
    return mapping, drugmapping
def parser(databases_directory, download=True):
    """Parse the CORUM protein-complex database.

    Produces Complex entities (human only) plus protein-subunit and
    biological-process relationships.

    :param str databases_directory: root directory where databases are stored.
    :param bool download: whether to download the source archive first.
    :return: (entities, relationships, entities_header, relationships_headers)
    :raises Exception: if the configuration cannot be read.
    """
    entities = set()
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "CORUM")
    builder_utils.checkDirectory(directory)
    try:
        config = builder_utils.get_config(config_name="corumConfig.yml", data_type='databases')
    except Exception as err:
        raise Exception("Reading configuration > {}.".format(err))
    database_url = config['database_url']
    entities_header = config['entities_header']
    relationships_headers = config['relationships_headers']
    zipped_fileName = os.path.join(directory, database_url.split('/')[-1])
    fileName = '.'.join(database_url.split('/')[-1].split('.')[0:2])
    if download:
        builder_utils.downloadDB(database_url, directory)
    names = set()
    first = True
    with zipfile.ZipFile(zipped_fileName) as z:
        with z.open(fileName) as f:
            for line in f:
                if first:
                    first = False
                    continue
                data = line.decode("utf-8").rstrip("\r\n").split("\t")
                identifier = data[0]
                name = data[1]
                organism = data[2]
                synonyms = data[3].split(';') if data[3] != "None" else [""]
                # Fix: original was data[4].join(';'), which joins the
                # single-character string ';' using data[4] as separator
                # (always yielding ';'); every other multi-valued column
                # uses split(';'), and cell_lines is later ",".join-ed.
                cell_lines = data[4].split(';')
                subunits = data[5].split(';')
                evidences = data[7].split(';')
                processes = data[8].split(';')
                pubmedid = data[14]
                if organism == "Human":
                    #ID name organism synonyms source
                    if name not in names:
                        entities.add((identifier, name, "9606", ",".join(synonyms), "CORUM"))
                        names.add(name)
                    for subunit in subunits:
                        #START_ID END_ID type cell_lines evidences publication source
                        relationships[("Protein", "is_subunit_of")].add(
                            (subunit, identifier, "IS_SUBUNIT_OF", ",".join(cell_lines), ",".join(evidences), pubmedid, "CORUM"))
                    for process in processes:
                        #START_ID END_ID type evidence_type score source
                        relationships["Biological_process", "associated_with"].add(
                            (identifier, process, "ASSOCIATED_WITH", "CURATED", 5, "CORUM"))
    builder_utils.remove_directory(directory)
    return entities, relationships, entities_header, relationships_headers
def parser(databases_directory, download=True):
    """Parse the OncoKB annotated and actionable variant files.

    Builds Clinically_relevant_variant entities and drug/disease
    relationships for variants that can be mapped to proteins.

    :param str databases_directory: root directory where databases are stored.
    :param bool download: whether to download the source files first.
    :return: (entities, relationships, entities_header, relationships_headers)
    """
    config = builder_utils.get_config(config_name="oncokbConfig.yml", data_type='databases')
    url_actionable = config['OncoKB_actionable_url']
    url_annotation = config['OncoKB_annotated_url']
    amino_acids = config['amino_acids']
    entities_header = config['entities_header']
    relationships_headers = config['relationships_headers']
    mapping = mp.getMappingFromOntology(ontology="Disease", source=None)
    drug_mapping = mp.getMappingForEntity("Drug")
    protein_mapping = mp.getMultipleMappingForEntity("Protein")
    levels = config['OncoKB_levels']
    entities = set()
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "OncoKB")
    builder_utils.checkDirectory(directory)
    acfileName = os.path.join(directory, url_actionable.split('/')[-1])
    anfileName = os.path.join(directory, url_annotation.split('/')[-1])
    if download:
        builder_utils.downloadDB(url_actionable, directory)
        builder_utils.downloadDB(url_annotation, directory)
    # matches simple substitutions like 'V600E' at the end of the string
    variant_regex = r"(\D\d+\D)$"
    with open(anfileName, 'r', errors='replace') as variants:
        first = True
        for line in variants:
            if first:
                first = False
                continue
            data = line.rstrip("\r\n").split("\t")
            gene = data[3]
            variant = data[4]
            oncogenicity = data[5]
            effect = data[6]
            if gene in protein_mapping:
                for protein in protein_mapping[gene]:
                    match = re.search(variant_regex, variant)
                    if match:
                        # only substitutions whose flanking residues are
                        # known one-letter codes can be expanded to p. form
                        if variant[0] in amino_acids and variant[-1] in amino_acids:
                            valid_variant = protein + '_p.' + amino_acids[variant[0]] + ''.join(variant[1:-1]) + amino_acids[variant[-1]]
                            entities.add((valid_variant, "Clinically_relevant_variant", "", "", "", "", "", effect, oncogenicity))
    with open(acfileName, 'r', errors='replace') as associations:
        first = True
        for line in associations:
            if first:
                first = False
                continue
            data = line.rstrip("\r\n").split("\t")
            isoform = data[1]
            gene = data[3]
            variant = data[5]
            disease = data[6]
            level = data[7]
            drugs = data[8].split(', ')
            pubmed_ids = data[9].split(',')
            # rows with an unknown evidence level are skipped entirely
            if level in levels:
                level = levels[level]
                valid_variants = []
                if gene in protein_mapping:
                    for protein in protein_mapping[gene]:
                        match = re.search(variant_regex, variant)
                        if match:
                            if variant[0] in amino_acids and variant[-1] in amino_acids:
                                valid_variants.append(protein + '_p.' + amino_acids[variant[0]] + ''.join(variant[1:-1]) + amino_acids[variant[-1]])
                for drug in drugs:
                    # combination therapies are '+'-separated
                    for d in drug.split(' + '):
                        if d.lower() in drug_mapping:
                            drug = drug_mapping[d.lower()]
                            relationships["targets"].add(
                                (drug, gene, "CURATED_TARGETS", "curated", "NA", "NA", "curated", "OncoKB"))
                            for valid_variant in valid_variants:
                                relationships["targets_clinically_relevant_variant"].add(
                                    (drug, valid_variant, "TARGETS_KNOWN_VARIANT", level[0], level[1], disease, "curated", "OncoKB"))
                for valid_variant in valid_variants:
                    if disease.lower() in mapping:
                        disease = mapping[disease.lower()]
                        relationships["associated_with"].add(
                            (valid_variant, disease, "ASSOCIATED_WITH", "curated", "curated", "OncoKB", len(pubmed_ids)))
                    else:
                        pass
                    relationships["known_variant_is_clinically_relevant"].add(
                        (valid_variant, valid_variant, "KNOWN_VARIANT_IS_CLINICALLY_RELEVANT", "OncoKB"))
    builder_utils.remove_directory(directory)
    return (entities, relationships, entities_header, relationships_headers)
import config.ckg_config as ckg_config import ckg_utils from graphdb_connector import connector from graphdb_builder import builder_utils from graphdb_builder.builder import loader from graphdb_builder.experiments import experiments_controller as eh log_config = ckg_config.report_manager_log logger = builder_utils.setup_logging(log_config, key="project_creation") cwd = os.path.abspath(os.path.dirname(__file__)) experimentDir = os.path.join(cwd, '../../../data/experiments') importDir = os.path.join(cwd, '../../../data/imports/experiments') try: config = builder_utils.get_config(config_name="clinical.yml", data_type='experiments') except Exception as err: logger.error("Reading configuration > {}.".format(err)) def get_project_creation_queries(): """ Reads the YAML file containing the queries relevant to user creation, parses the given stream and \ returns a Python object (dict[dict]). :return: Nested dictionary. """ try: cwd = os.path.abspath(os.path.dirname(__file__)) queries_path = "../queries/project_creation_cypher.yml" project_creation_cypher = ckg_utils.get_queries(
def parser(databases_directory, download=True):
    """Parse the RefSeq feature table into transcript/chromosome entities
    and location/transcription relationships (human only).

    :param str databases_directory: root directory where databases are stored.
    :param bool download: whether to resolve and download the latest
        feature-table file from the FTP directory first.
    :return: (entities, relationships, headers)
    """
    config = builder_utils.get_config(config_name="refseqConfig.yml", data_type='databases')
    url = config['refseq_url']
    ftp_dir = config['refseq_ftp_dir']
    entities = defaultdict(set)
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "RefSeq")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    headers = config['headerEntities']
    taxid = 9606
    if download:
        # resolve the latest release directory on the FTP server
        file_dir = builder_utils.list_ftp_directory(ftp_dir)[0]
        new_file = file_dir.split('/')[-1] + "_feature_table.txt.gz"
        url = ftp_dir + file_dir.split('/')[-1] + "/" + new_file
        builder_utils.downloadDB(url, directory)
        fileName = os.path.join(directory, new_file)
    if os.path.isfile(fileName):
        df = builder_utils.read_gzipped_file(fileName)
        first = True
        for line in df:
            if first:
                first = False
                continue
            data = line.rstrip("\r\n").split("\t")
            tclass = data[1]
            assembly = data[2]
            chrom = data[5]
            geneAcc = data[6]
            start = data[7]
            end = data[8]
            strand = data[9]
            protAcc = data[10]
            name = data[13]
            symbol = data[14]
            if protAcc != "":
                entities["Transcript"].add((protAcc, "Transcript", name, tclass, assembly, taxid))
                if chrom != "":
                    entities["Chromosome"].add((chrom, "Chromosome", chrom, taxid))
                    relationships["LOCATED_IN"].add((protAcc, chrom, "LOCATED_IN", start, end, strand, "RefSeq"))
                if symbol != "":
                    relationships["TRANSCRIBED_INTO"].add((symbol, protAcc, "TRANSCRIBED_INTO", "RefSeq"))
            elif geneAcc != "":
                entities["Transcript"].add((geneAcc, "Transcript", name, tclass, assembly, taxid))
                if chrom != "":
                    entities["Chromosome"].add((chrom, "Chromosome", chrom, taxid))
                    # Fix: this branch only runs when protAcc == "", so the
                    # location must reference geneAcc; the original used
                    # protAcc and emitted relationships with an empty
                    # start node.
                    relationships["LOCATED_IN"].add((geneAcc, chrom, "LOCATED_IN", start, end, strand, "RefSeq"))
        df.close()
    builder_utils.remove_directory(directory)
    return (entities, relationships, headers)