def create_user_from_command_line(args, expiration):
    """
    Creates new user in the graph database and corresponding node, from a terminal window (command line),
    and adds the new user information to the users excel and import files. Arguments as in set_arguments().

    :param args: object. Contains all the parameters necessary to create a user ('username', 'name', 'email',
                 'secondary_email', 'phone_number' and 'affiliation'). It must also expose a 'file' attribute,
                 which is dropped before the user is created.
    :type args: any object with __dict__ attribute
    :param int expiration: number of days users is given access.

    .. note:: This function can be used directly with *python create_user_from_command_line.py -u username
                -n user_name -e email -s secondary_email -p phone_number -a affiliation* .
    """
    usersImportDirectory = ckg_config['imports_users_directory']
    usersFile = os.path.join(usersImportDirectory, uconfig['usersFile'])
    builder_utils.checkDirectory(usersImportDirectory)
    import_file = os.path.join(usersImportDirectory, uconfig['import_file'])

    data = vars(args)
    # One-row frame with one column per argument; 'file' is not a user attribute.
    df = pd.DataFrame.from_dict(data, orient='index').T.drop('file', axis=1)
    create_user(df, import_file, expiration)

    if os.path.exists(usersFile):
        # 'index' is not a read_excel keyword, so it is no longer passed.
        excel = pd.read_excel(usersFile)
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        # The row is built from the raw arguments without 'file', so the sheet
        # keeps the same columns as when it is first created below.
        new_row = pd.DataFrame([data]).drop('file', axis=1)
        excel = pd.concat([excel, new_row], ignore_index=True)
        excel.to_excel(usersFile, index=False)
    else:
        df.to_excel(usersFile, index=False)
def parser(databases_directory, drug_source, download=True):
    """
    Parses the SIDER side-effect file into drug -> phenotype relationships.

    :param str databases_directory: directory under which the "SIDER" folder is created.
    :param str drug_source: source name passed to mp.getSTRINGMapping to select drug aliases.
    :param bool download: whether to download the raw file (and the STITCH mapping) first.
    :return: Tuple of (relationships set, header from the configuration, output file name, \
             drug mapping, phenotype mapping).
    """
    config = builder_utils.get_config(config_name="siderConfig.yml", data_type='databases')
    url = config['SIDER_url']
    header = config['header']
    output_file = 'sider_has_side_effect.tsv'

    drugmapping = mp.getSTRINGMapping(source=drug_source, download=download, db="STITCH")
    phenotypemapping = mp.getMappingFromOntology(ontology="Phenotype", source=config['SIDER_source'])
    relationships = set()
    directory = os.path.join(databases_directory, "SIDER")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    # Context manager guarantees the gzip handle is released even if a line
    # fails to parse.
    with gzip.open(fileName, 'r') as associations:
        for line in associations:
            data = line.decode('utf-8').rstrip("\r\n").split("\t")
            # Rewrite the character after "CID" to "m" so identifiers match
            # the keys used in the drug mapping.
            drug = re.sub(r'CID\d', 'CIDm', data[0])
            se = data[2]
            evidence_from = str(data[3])
            # Columns 4-6 (frequency, lower and upper bound) are not used.
            phenotype_key = se.lower()
            if phenotype_key in phenotypemapping and drug in drugmapping:
                p = phenotypemapping[phenotype_key]
                for d in drugmapping[drug]:
                    relationships.add((d, p, "HAS_SIDE_EFFECT", "SIDER", se, evidence_from))

    return (relationships, header, output_file, drugmapping, phenotypemapping)
def parser(databases_directory, importDirectory, download=True):
    """
    Generates the Jensenlab text-mining import files: the publications file and,
    through parse_mentions, one set of files per configured mention type.

    :param str databases_directory: directory under which the "Jensenlab" folder lives.
    :param str importDirectory: directory where the import files are written.
    :param bool download: whether to download the raw files first.
    :return: Tuple of (number of publication entities, path to the publications file).
    """
    config = builder_utils.get_config(config_name="jensenlabConfig.yml", data_type='databases')
    db_url = config['db_url']
    organisms_file = config['organisms_file']
    organisms = str(config['organisms'])

    jensenlab_dir = os.path.join(databases_directory, "Jensenlab")
    textmining_dir = os.path.join(jensenlab_dir, "textmining")
    builder_utils.checkDirectory(textmining_dir)
    if download:
        builder_utils.downloadDB(db_url.replace("FILE", organisms_file), textmining_dir)

    organisms_path = os.path.join(textmining_dir, organisms_file)
    valid_pubs = read_valid_pubs(organisms, organisms_path)
    publications, header = parse_PMC_list(config, textmining_dir, download=download, valid_pubs=valid_pubs)
    num_entities = len(publications)

    outputfile = os.path.join(importDirectory, "Publications.tsv")
    builder_utils.write_entities(publications, header, outputfile)
    # Drop the reference before the mentions are parsed.
    publications = None

    for qtype in config['db_mentions_types']:
        parse_mentions(config, jensenlab_dir, qtype, importDirectory, download)

    builder_utils.remove_directory(textmining_dir)

    return (num_entities, outputfile)
def create_user_from_file(filepath, expiration):
    """
    Creates new user in the graph database and corresponding node, from an excel file.
    Rows in the file must be users, and columns must follow set_arguments() fields.

    :param str filepath: filepath and filename containing users information.
    :param int expiration: number of days users is given access.

    .. note:: This function can be used directly with *python create_user_from_file.py -f path_to_file* .
    """
    usersImportDirectory = ckg_config['imports_users_directory']
    usersFile = os.path.join(usersImportDirectory, uconfig['usersFile'])
    builder_utils.checkDirectory(usersImportDirectory)
    import_file = os.path.join(usersImportDirectory, uconfig['import_file'])

    # Read from the 'filepath' argument; the previous code looked the path up
    # on an 'args' name that is not defined in this function.
    data = pd.read_excel(filepath).applymap(str)
    create_user(data, import_file, expiration)

    if os.path.exists(usersFile):
        # 'index' is not a read_excel keyword, so it is no longer passed.
        excel = pd.read_excel(usersFile)
        # The input sheet may have no 'file' column, so a missing one is ignored.
        new_rows = data.drop('file', axis=1, errors='ignore')
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        excel = pd.concat([excel, new_rows], ignore_index=True)
        excel.to_excel(usersFile, index=False)
    else:
        data.to_excel(usersFile, index=False)
def parseUniProtAnnotations(config, databases_directory, download=True):
    """
    Parses the UniProt GO annotation file into protein -> GO term relationships.

    :param dict config: configuration providing the 'uniprot_go_annotations' url.
    :param str databases_directory: directory under which the "UniProt" folder is created.
    :param bool download: whether to download the annotation file first.
    :return: Dictionary keyed by (GO root name, 'associated_with') whose values are sets \
             of relationship tuples.
    """
    # One-letter aspect codes found in the file and the names used in the keys.
    roots = {'F': 'Molecular_function',
             'C': 'Cellular_component',
             'P': 'Biological_process'}

    url = config['uniprot_go_annotations']
    relationships = defaultdict(set)

    directory = os.path.join(databases_directory, "UniProt")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    annotation_lines = builder_utils.read_gzipped_file(fileName)
    for line in annotation_lines:
        # Lines starting with '!' are skipped.
        if line.startswith('!'):
            continue
        fields = line.rstrip("\r\n").split("\t")
        identifier = fields[1]
        go = fields[4]
        evidence = fields[6]
        # Unknown aspect codes are kept as they are.
        root = roots.get(fields[8], fields[8])
        relationships[(root, 'associated_with')].add(
            (identifier, go, "ASSOCIATED_WITH", evidence, 5, "UniProt"))

    return relationships
def parser(databases_directory, download=True):
    """
    Parses the SMPDB archives into pathway entities and pathway relationships.

    :param str databases_directory: directory under which the "SMPDB" folder is created.
    :param bool download: whether to download each archive first.
    :return: Tuple of (entities, relationships, entities header, relationships headers).
    """
    config = builder_utils.get_config(config_name="smpdbConfig.yml", data_type='databases')
    urls = config['smpdb_urls']
    entities = set()
    relationships = defaultdict(set)
    entities_header = config['pathway_header']
    relationships_headers = config['relationships_header']

    smpdb_dir = os.path.join(databases_directory, "SMPDB")
    builder_utils.checkDirectory(smpdb_dir)

    for dataset, url in urls.items():
        if download:
            builder_utils.downloadDB(url, smpdb_dir)
        archive_path = os.path.join(smpdb_dir, url.split('/')[-1])
        with zipfile.ZipFile(archive_path) as archive:
            # The dataset key decides which parser handles the archive; other
            # keys are opened but not parsed.
            if dataset == "pathway":
                entities = parsePathways(config, archive)
            elif dataset == "protein":
                relationships.update(parsePathwayProteinRelationships(archive))
            elif dataset == "metabolite":
                relationships.update(parsePathwayMetaboliteDrugRelationships(archive))

    builder_utils.remove_directory(smpdb_dir)

    return entities, relationships, entities_header, relationships_headers
def parsePairs(config, databases_directory, qtype, mapping, download=True):
    """
    Parses a Jensenlab integration file into scored association relationships.

    :param dict config: configuration with 'db_url', 'db_files' and 'db_sources'.
    :param str databases_directory: directory under which the "Jensenlab" folder lives.
    :param str qtype: key selecting the file and source name in the configuration.
    :param dict mapping: maps "9606."-prefixed identifiers to iterables of identifiers.
    :param bool download: whether to download the file first.
    :return: Set of relationship tuples.
    """
    url = config['db_url']
    pairs_file = config['db_files'][qtype]
    source = config['db_sources'][qtype]
    relationships = set()

    integration_dir = os.path.join(databases_directory, "Jensenlab", "integration")
    builder_utils.checkDirectory(integration_dir)
    if download:
        builder_utils.downloadDB(url.replace("FILE", pairs_file), integration_dir)

    pairs_path = os.path.join(integration_dir, pairs_file)
    with open(pairs_path, 'r') as pairs:
        for line in pairs:
            fields = line.rstrip("\r\n").split('\t')
            # The first column is prefixed with "9606." before the lookup.
            id1 = "9606." + fields[0]
            id2 = fields[2]
            score = float(fields[4])
            if id1 not in mapping:
                continue
            for ident in mapping[id1]:
                relationships.add((ident, id2, "ASSOCIATED_WITH_INTEGRATED", source, score, "compiled"))

    return relationships
def parser(databases_directory, download=True):
    """
    Parses the DisGeNET association files into disease association relationships.

    :param str databases_directory: directory under which the "disgenet" folder is created.
    :param bool download: whether to download the association and mapping files first.
    :return: Tuple of (relationships keyed by entity type, header from the configuration, \
             output file name).
    """
    relationships = defaultdict(set)
    config = builder_utils.get_config(config_name="disgenetConfig.yml", data_type='databases')

    files = config['disgenet_files']
    mapping_files = config['disgenet_mapping_files']
    url = config['disgenet_url']
    directory = os.path.join(databases_directory, "disgenet")
    builder_utils.checkDirectory(directory)
    header = config['disgenet_header']
    output_file = 'disgenet_associated_with.tsv'

    if download:
        for f in files:
            builder_utils.downloadDB(url + files[f], directory)
        for f in mapping_files:
            builder_utils.downloadDB(url + mapping_files[f], directory)

    proteinMapping = readDisGeNetProteinMapping(config, directory)
    diseaseMapping = readDisGeNetDiseaseMapping(config, directory)
    for f in files:
        # File keys have the form "<dtype>_<atype>".
        dtype, atype = f.split('_')
        # NOTE(review): for any dtype other than 'gene' or 'variant', idType and
        # scorePos keep the values of the previous file (or are undefined on
        # the first one) -- confirm the configuration only uses these two.
        if dtype == 'gene':
            idType = "Protein"
            scorePos = 9
        if dtype == 'variant':
            idType = "Transcript"
            scorePos = 5
        # Context manager guarantees the gzip handle is released even if a
        # line fails to parse.
        with gzip.open(os.path.join(directory, files[f]), 'r') as associations:
            # Skip the header line.
            next(associations, None)
            for line in associations:
                try:
                    data = line.decode('utf-8').rstrip("\r\n").split("\t")
                except UnicodeDecodeError:
                    # Lines that are not valid UTF-8 are skipped.
                    continue
                geneId = str(int(data[0]))
                diseaseId = data[4]
                score = float(data[scorePos])
                pmids = data[13]
                source = data[-1]
                if geneId in proteinMapping and diseaseId in diseaseMapping:
                    codes = ["DOID:" + code for code in diseaseMapping[diseaseId]]
                    for identifier in proteinMapping[geneId]:
                        for code in codes:
                            relationships[idType].add(
                                (identifier, code, "ASSOCIATED_WITH", score, atype,
                                 "DisGeNet: " + source, pmids))

    builder_utils.remove_directory(directory)

    return (relationships, header, output_file)
def parserIndications(databases_directory, drugMapping, phenotypeMapping, download=True):
    """
    Parses the SIDER indications file into drug -> phenotype relationships.

    :param str databases_directory: directory under which the "SIDER" folder is created.
    :param dict drugMapping: maps drug identifiers to iterables of drug identifiers.
    :param dict phenotypeMapping: maps lower-cased terms to phenotype identifiers.
    :param bool download: whether to download the indications file first.
    :return: Tuple of (relationships set, header from the configuration, output file name).
    """
    config = builder_utils.get_config(config_name="siderConfig.yml", data_type='databases')
    url = config['SIDER_indications']
    header = config['indications_header']
    output_file = 'sider_is_indicated_for.tsv'

    relationships = set()
    directory = os.path.join(databases_directory, "SIDER")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    # Context manager guarantees the gzip handle is released even if a line
    # fails to parse.
    with gzip.open(fileName, 'r') as associations:
        for line in associations:
            data = line.decode('utf-8').rstrip("\r\n").split("\t")
            # Rewrite the character after "CID" to "m" so identifiers match
            # the keys used in the drug mapping.
            drug = re.sub(r'CID\d', 'CIDm', data[0])
            se = data[1]
            evidence = data[2]
            phenotype_key = se.lower()
            if phenotype_key in phenotypeMapping and drug in drugMapping:
                p = phenotypeMapping[phenotype_key]
                for d in drugMapping[drug]:
                    relationships.add((d, p, "IS_INDICATED_FOR", evidence, "SIDER", se))

    builder_utils.remove_directory(directory)

    return (relationships, header, output_file)
def parser(databases_directory, download=True):
    """
    Parses the SIGNOR file into entities and relationships via parse_substrates.

    :param str databases_directory: directory under which the "SIGNOR" folder is created.
    :param bool download: whether to download the file first.
    :return: Tuple of (entities, relationships, entities header, relationships headers).
    """
    config = builder_utils.get_config(config_name="signorConfig.yml", data_type='databases')

    signor_dir = os.path.join(databases_directory, "SIGNOR")
    builder_utils.checkDirectory(signor_dir)

    url = config['url']
    modifications = config['modifications']
    amino_acids = config['amino_acids']
    accronyms = config['accronyms']
    entities_header = config['entities_header']
    relationships_headers = config['rel_headers']

    substrates_file = os.path.join(signor_dir, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, signor_dir)

    entities, relationships = parse_substrates(substrates_file, modifications, accronyms, amino_acids)

    return entities, relationships, entities_header, relationships_headers
def parseUniProtPeptides(config, databases_directory, download=True):
    """
    Parses the UniProt peptide files into peptide entities and peptide -> protein relationships.

    :param dict config: configuration providing the 'uniprot_peptides_files' urls.
    :param str databases_directory: directory under which the "UniProt" folder is created.
    :param bool download: whether to download each file first.
    :return: Tuple of (entities set, relationships dictionary).
    """
    peptide_urls = config['uniprot_peptides_files']
    entities = set()
    relationships = defaultdict(set)

    uniprot_dir = os.path.join(databases_directory, "UniProt")
    builder_utils.checkDirectory(uniprot_dir)

    for url in peptide_urls:
        peptides_path = os.path.join(uniprot_dir, url.split('/')[-1])
        if download:
            builder_utils.downloadDB(url, uniprot_dir)
        with open(peptides_path, 'r', encoding='utf-8') as f:
            # Skip the first line of each file.
            next(f, None)
            for line in f:
                fields = line.rstrip("\r\n").split("\t")
                peptide = fields[0]
                accs = fields[6].split(",")
                # A peptide listed for a single accession is flagged as unique.
                is_unique = not len(accs) > 1
                entities.add((peptide, "Peptide", "tryptic peptide", is_unique))
                for protein in accs:
                    relationships[("Peptide", 'belongs_to_protein')].add(
                        (peptide, protein, "BELONGS_TO_PROTEIN", "UniProt"))

    return entities, relationships
def save_files_in_tmp(content, dataset, prot_tool, prot_file, projectid, uploaded_file):
    """
    Stores an uploaded file in a per-session temporary directory under a
    standardised file name.

    :param content: uploaded content handed to builder_utils.parse_contents.
    :param str dataset: dataset type the file belongs to; nothing is done when None.
    :param str prot_tool: processing tool, used for the proteomics-like datasets.
    :param str prot_file: file type, used for the proteomics-like datasets.
    :param str projectid: project identifier, used in the design and clinical file names.
    :param str uploaded_file: original name of the uploaded file.
    :return: Tuple (uploaded file name, None, '', '') after the file is exported, or \
             ('', None, '', '') when dataset is None.
    :raises PreventUpdate: when dataset is given but there is no content or no uploaded file name.
    """
    if dataset is not None:
        # The upload folder is named after the session cookie of the request.
        session_cookie = flask.request.cookies.get('custom-auth-session')
        temporaryDirectory = os.path.join(ckg_config['tmp_directory'], session_cookie + "upload")
        # NOTE(review): when the tmp root does not exist yet, only the root is
        # created here; the session folder is presumably created by the
        # checkDirectory call below -- confirm.
        if not os.path.exists(ckg_config['tmp_directory']):
            os.makedirs(ckg_config['tmp_directory'])
        elif not os.path.exists(temporaryDirectory):
            os.makedirs(temporaryDirectory)
        directory = os.path.join(temporaryDirectory, dataset)
        # Uploading a file whose name already exists wipes the whole dataset folder.
        if os.path.exists(directory) and uploaded_file is not None:
            if os.path.exists(os.path.join(directory, uploaded_file)):
                shutil.rmtree(directory)
        builder_utils.checkDirectory(directory)
        if dataset in [
                'proteomics', 'interactomics', 'phosphoproteomics'
        ] and prot_tool != '' and (prot_file != '' or prot_tool == 'mzTab'):
            # A configured name for "<tool>-<file>" wins; otherwise one is built.
            selected_file = prot_tool.lower() + "-" + prot_file.lower()
            if selected_file in config['file_proteomics']:
                filename = config['file_proteomics'][selected_file]
            else:
                if prot_tool == 'mzTab':
                    filename = dataset + '_' + prot_tool.lower() + '.mztab'
                else:
                    filename = dataset + '_' + prot_tool.lower(
                    ) + '_' + prot_file.replace(
                        ' ', '').lower() + '.' + uploaded_file.split('.')[-1]
            # These datasets are stored in a sub-folder named after the tool.
            directory = os.path.join(directory, prot_tool.lower())
            if os.path.exists(directory):
                # Replace a previous upload with the same target name.
                if os.path.exists(os.path.join(directory, filename)):
                    os.remove(os.path.join(directory, filename))
            builder_utils.checkDirectory(directory)
        elif dataset == 'experimental_design':
            # "<prefix of the configured design file>_<project>.<uploaded extension>"
            filename = config['file_design'].split(
                '_')[0] + '_' + projectid + '.' + uploaded_file.split('.')[-1]
        elif dataset == 'clinical':
            # "<prefix of the configured clinical file>_<project>.<uploaded extension>"
            filename = config['file_clinical'].split(
                '_')[0] + '_' + projectid + '.' + uploaded_file.split('.')[-1]
        # NOTE(review): 'filename' stays undefined when no branch above matched;
        # the export below would then fail -- confirm callers never hit that case.
        if uploaded_file is None:
            content = None
        if content is not None:
            data = builder_utils.parse_contents(content, filename)
            builder_utils.export_contents(data, directory, filename)
            uploaded = uploaded_file
            uploaded_file = None
            return uploaded, uploaded_file, '', ''
        else:
            raise PreventUpdate
    return '', None, '', ''
def parser(databases_directory, download=True):
    """
    Parses the FooDB archive into food entities and food -> compound relationships,
    and writes the food identifier mapping file.

    :param str databases_directory: directory under which the "FooDB" folder is created.
    :param bool download: whether to download the archive first.
    :return: Tuple of (food entities, relationships, entities header, relationships headers).
    :raises Exception: if the archive cannot be read as a tar file.
    """
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "FooDB")
    builder_utils.checkDirectory(directory)
    config = builder_utils.get_config(config_name="foodbConfig.yml", data_type='databases')

    database_url = config['database_url']
    entities_header = config['entities_header']
    relationships_headers = config['relationships_headers']
    tar_fileName = os.path.join(directory, database_url.split('/')[-1])
    if download:
        builder_utils.downloadDB(database_url, directory)

    # Defaults for the case where a file is not listed in config['files'].
    contents = {}
    food = set()
    compounds = {}
    mapping = {}
    try:
        # Context manager closes the archive even if extraction fails.
        with tarfile.open(tar_fileName, 'r') as tf:
            file_content = tf.getnames()
            # NOTE(review): assumes the second archive member is the folder
            # holding the csv files -- confirm against the archive layout.
            tar_dir = file_content[1]
            # NOTE(security): extractall trusts member paths of a downloaded
            # archive; consider an extraction filter where the Python version
            # supports one.
            tf.extractall(path=directory)
    except tarfile.ReadError as err:
        raise Exception("Error importing database FooDB.\n {}".format(err)) from err

    for file_name in config['files']:
        path = os.path.join(directory, os.path.join(tar_dir, file_name))
        with open(path, 'r', encoding="utf-8", errors='replace') as f:
            if file_name == "Content.csv":
                contents = parseContents(f)
            elif file_name == "Food.csv":
                food, mapping = parseFood(f)
            elif file_name == "Compound.csv":
                compounds = parseCompounds(f)

    for food_id, compound_id in contents:
        if compound_id in compounds:
            compound_code = compounds[compound_id].replace("HMDB", "HMDB00")
            relationships[("food", "has_content")].add(
                (food_id, compound_code, "HAS_CONTENT") + contents[(food_id, compound_id)])

    # Rewrite the food mapping file between the reset and complete markers.
    mp.reset_mapping(entity="Food")
    with open(os.path.join(directory, "mapping.tsv"), 'w', encoding='utf-8') as out:
        for food_id in mapping:
            for alias in mapping[food_id]:
                out.write(str(food_id) + "\t" + str(alias) + "\n")
    mp.mark_complete_mapping(entity="Food")

    builder_utils.remove_directory(directory)

    return food, relationships, entities_header, relationships_headers
def parser(databases_directory):
    """
    Builds the DrugBank drug entities, relationships and dictionary.

    :param str databases_directory: directory under which the "DrugBank" folder is created.
    :return: Tuple of (entities, relationships, entities header, relationships headers).
    """
    config = builder_utils.get_config(config_name="drugBankConfig.yml", data_type='databases')

    drugbank_dir = os.path.join(databases_directory, "DrugBank")
    builder_utils.checkDirectory(drugbank_dir)

    drugs = extract_drugs(config, drugbank_dir)
    build_DrugBank_dictionary(config, drugbank_dir, drugs)
    relationships = build_relationships_from_DrugBank(config, drugs)
    entities, attributes = build_drug_entity(config, drugs)

    # The entity header is the identifier column followed by the attributes.
    entities_header = ['ID'] + attributes
    relationships_headers = config['relationships_headers']

    return (entities, relationships, entities_header, relationships_headers)
def parser(databases_directory, download=True):
    """
    Parses the GWAS Catalog file into study entities and their relationships
    to publications, variants and traits.

    :param str databases_directory: directory under which the "GWAScatalog" folder is created.
    :param bool download: whether to download the catalog first.
    :return: Tuple of (entities, relationships, entities header, relationships header).
    """
    config = builder_utils.get_config(config_name="gwasCatalogConfig.yml", data_type='databases')
    url = config['GWASCat_url']
    entities_header = config['entities_header']
    relationships_header = config['relationships_header']
    entities = set()
    relationships = defaultdict(set)

    gwas_dir = os.path.join(databases_directory, "GWAScatalog")
    builder_utils.checkDirectory(gwas_dir)
    catalog_path = os.path.join(gwas_dir, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, gwas_dir)

    with open(catalog_path, 'r', encoding="utf-8") as catalog:
        for line in catalog:
            fields = line.rstrip("\r\n").split("\t")
            # Rows with fewer than 37 columns are ignored.
            if len(fields) <= 36:
                continue
            pubmedid = fields[1]
            date = fields[3]
            title = fields[6]
            sample_size = fields[8]
            replication_size = fields[9]
            # Only the part before the first '-' is kept as the variant id.
            snp_id = fields[20].split('-')[0]
            freq = fields[26]
            pval = fields[27]
            odds_ratio = fields[30]
            trait = fields[34]
            exp_factor = fields[35]
            study = fields[36]

            entities.add((study, "GWAS_study", title, date, sample_size, replication_size, trait))
            if pubmedid != "":
                relationships["published_in_publication"].add(
                    (study, pubmedid, "PUBLISHED_IN", "GWAS Catalog"))
            if snp_id != "":
                # Strip leading and trailing non-word characters from the id.
                clean_snp = re.sub(r"^\W+|\W+$", "", snp_id)
                relationships["variant_found_in_gwas"].add(
                    (clean_snp, study, "VARIANT_FOUND_IN_GWAS", freq, pval, odds_ratio, trait, "GWAS Catalog"))
            if exp_factor != "":
                # Keep the last path segment and turn '_' into ':'.
                exp_factor = exp_factor.split('/')[-1].replace('_', ':')
                relationships["studies_trait"].add(
                    (study, exp_factor, "STUDIES_TRAIT", "GWAS Catalog"))

    builder_utils.remove_directory(gwas_dir)

    return (entities, relationships, entities_header, relationships_header)
def experimentImport(importDirectory, experimentsDirectory, project):
    """
    Generates all the entities and relationships from the specified Project.
    Called from function experimentsImport.

    The 'project' and 'experimental_design' datasets, when present, are
    imported first and in that order; the remaining datasets follow.

    :param str importDirectory: path to the directory where all the import files are generated.
    :param str experimentsDirectory: path to the directory where all the experiments are located.
    :param str project: identifier of the project to be imported.
    """
    projectPath = os.path.join(importDirectory, project)
    builder_utils.checkDirectory(projectPath)
    projectDirectory = os.path.join(experimentsDirectory, project)
    datasets = builder_utils.listDirectoryFolders(projectDirectory)

    def _import_dataset(dataset):
        # Make sure the dataset folder exists, then generate its import files.
        datasetPath = os.path.join(projectPath, dataset)
        builder_utils.checkDirectory(datasetPath)
        eh.generate_dataset_imports(project, dataset, datasetPath)

    for priority_dataset in ('project', 'experimental_design'):
        if priority_dataset in datasets:
            _import_dataset(priority_dataset)
            datasets.remove(priority_dataset)

    for dataset in datasets:
        _import_dataset(dataset)
def parser(databases_directory, download=True):
    """
    Builds the HMDB metabolite entities and their relationships.

    :param str databases_directory: directory under which the "HMDB" folder is created.
    :param bool download: whether to download the raw data first.
    :return: Tuple of (entities, relationships, entities header, relationships header).
    """
    config = builder_utils.get_config(config_name="hmdbConfig.yml", data_type='databases')

    hmdb_dir = os.path.join(databases_directory, "HMDB")
    builder_utils.checkDirectory(hmdb_dir)
    metabolites = extract_metabolites(config, hmdb_dir, download)

    # One lookup combining the Disease and Tissue ontology mappings.
    mapping = mp.getMappingFromOntology(ontology="Disease", source=config['HMDB_DO_source'])
    tissue_mapping = mp.getMappingFromOntology(ontology="Tissue", source=None)
    mapping.update(tissue_mapping)

    entities, attributes = build_metabolite_entity(config, hmdb_dir, metabolites)
    relationships = build_relationships_from_HMDB(config, metabolites, mapping)

    entities_header = ['ID'] + attributes
    relationships_header = config['relationships_header']

    # The HMDB directory is not removed here.
    return (entities, relationships, entities_header, relationships_header)
def parse_fasta(databases_directory, config, import_directory, download=True, updated_on=None):
    """
    Writes the amino acid sequence entities and the protein -> sequence
    relationships from the UniProt FASTA file.

    :param str databases_directory: directory under which the "UniProt" folder is created.
    :param dict config: configuration providing the 'uniprot_fasta_file' url.
    :param str import_directory: directory where the two tsv files are written.
    :param bool download: whether to download the FASTA file first.
    :param updated_on: value passed through to builder_utils.buildStats.
    :return: Set with the stats of the entity file and the relationship file.
    """
    stats = set()
    url = config['uniprot_fasta_file']
    entities_output_file = os.path.join(import_directory, "Amino_acid_sequence.tsv")
    rel_output_file = os.path.join(import_directory, "Protein_HAS_Sequence_Amino_acid_sequence.tsv")

    uniprot_dir = os.path.join(databases_directory, "UniProt")
    builder_utils.checkDirectory(uniprot_dir)
    fasta_path = os.path.join(uniprot_dir, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, uniprot_dir)

    fasta_handle = builder_utils.read_gzipped_file(fasta_path)
    records = builder_utils.parse_fasta(fasta_handle)

    num_entities = 0
    with open(entities_output_file, 'w', encoding='utf-8') as ef, \
            open(rel_output_file, 'w', encoding='utf-8') as rf:
        ef.write('ID\theader\tsequence\tsize\tsource\n')
        rf.write('START_ID\tEND_ID\tTYPE\tsource\n')
        # Records are consumed in batches of 1000.
        for batch in builder_utils.batch_iterator(records, 1000):
            for record in batch:
                # The identifier is the second '|'-separated field of the id.
                identifier = record.id.split('|')[1]
                sequence = str(record.seq)
                entity_row = [identifier, record.id, sequence, str(len(sequence)), 'UniProt']
                ef.write('\t'.join(entity_row) + '\n')
                rel_row = [identifier, identifier, 'HAS_SEQUENCE', 'UniProt']
                rf.write('\t'.join(rel_row) + '\n')
                num_entities += 1

    # The same count is reported for entities and for relationships.
    stats.add(builder_utils.buildStats(num_entities, "entity", "Amino_acid_sequence",
                                       "UniProt", entities_output_file, updated_on))
    stats.add(builder_utils.buildStats(num_entities, "relationships", "HAS_SEQUENCE",
                                       "UniProt", rel_output_file, updated_on))

    return stats
def ontologiesImport(ontologies=None, download=True, import_type="partial"):
    """
    Generates all the entities and relationships from the provided ontologies and
    writes the resulting import stats. The ontologies list is passed as given to
    the graph file generator.

    :param list ontologies: a list of ontology names to be imported.
    :param bool download: whether database is to be downloaded.
    :param str import_type: type of import ('full' or 'partial').
    """
    import_dir = ckg_config['imports_ontologies_directory']
    builder_utils.checkDirectory(import_dir)

    import_stats = oh.generate_graphFiles(import_dir, ontologies, download)
    stats_frame = generateStatsDataFrame(import_stats)

    setupStats(import_type=import_type)
    writeStats(stats_frame, import_type)
def archiveImportDirectory(archive_type="full"):
    """
    Creates a compressed backup of the imports folder in the archive folder
    defined in the configuration. The backup is named
    "<archive_type>_<date>_<time>", with '-' and ':' removed from date and time.

    :param str archive_type: whether it is a full update or a partial update.
    """
    archive_dir = ckg_config["archive_directory"]
    builder_utils.checkDirectory(archive_dir)
    imports_dir = ckg_config["imports_directory"]

    date, time = builder_utils.getCurrentTime()
    compact_date = date.replace('-', '')
    compact_time = time.replace(':', '')
    file_name = "{}_{}_{}".format(archive_type, compact_date, compact_time)

    logger.info("Archiving {} to file: {}".format(imports_dir, file_name))
    builder_utils.compress_directory(imports_dir, archive_dir, file_name)
    logger.info("New backup created: {}".format(file_name))
def experimentsImport(projects=None, n_jobs=1, import_type="partial"):
    """
    Generates all the entities and relationships from the specified Projects. If the
    projects list is not provided, all the projects found in the experiments directory
    are imported. Calls function experimentImport once per project.

    :param list projects: list of project identifiers to be imported.
    :param int n_jobs: number of jobs to run in parallel. 1 by default when updating one project.
    :param str import_type: type of import ('full' or 'partial'). Not used by this function.
    """
    import_dir = ckg_config['imports_experiments_directory']
    builder_utils.checkDirectory(import_dir)
    experiments_dir = ckg_config['experiments_directory']

    if projects is None:
        projects = builder_utils.listDirectoryFolders(experiments_dir)

    # Nothing to import.
    if not len(projects) > 0:
        return

    jobs = (delayed(experimentImport)(import_dir, experiments_dir, project)
            for project in projects)
    Parallel(n_jobs=n_jobs)(jobs)
def getSTRINGMapping(source="BLAST_UniProt_AC", download=True, db="STRING"):
    """
    Parses database (db) and extracts relationships between identifiers to other databases (source).

    :param str source: name of the source database for selecting aliases.
    :param bool download: whether to download the file or not.
    :param str db: name of the database to be parsed.
    :return: Dictionary of database identifiers (keys) and set of unique aliases to other databases (values).
    """
    url = get_STRING_mapping_url(db=db)
    mapping = defaultdict(set)
    directory = os.path.join(dbconfig["databasesDir"], db)
    # Full path of the mapping file. It is not joined with 'directory' a
    # second time, which produced a wrong path for a relative databasesDir.
    file_name = os.path.join(directory, url.split('/')[-1])
    builder_utils.checkDirectory(directory)
    if download:
        print("Downloading", url, directory)
        builder_utils.downloadDB(url, directory)

    with gzip.open(file_name, 'rb') as mf:
        # Skip the header line.
        next(mf, None)
        for line in mf:
            data = line.decode('utf-8').rstrip("\r\n").split("\t")
            stringID = data[0]
            if db == "STRING":
                alias = data[1]
                sources = data[2].split(' ')
            else:
                # Other databases carry the alias and sources one column
                # later, and only aliases starting with 'DB' are kept.
                alias = data[2]
                sources = data[3].split(' ')
                if not alias.startswith('DB'):
                    continue

            if source in sources:
                mapping[stringID].add(alias)

    return mapping
def databasesImport(databases=None, n_jobs=1, download=True, import_type="partial"):
    """
    Generates all the entities and relationships from the provided databases and
    writes the resulting import stats. The databases list is passed as given to
    the graph file generator.

    :param list databases: a list of database names to be imported.
    :param int n_jobs: number of jobs to run in parallel. 1 by default when updating one database.
    :param bool download: whether the databases are to be downloaded.
    :param str import_type: type of import ('full' or 'partial').
    """
    import_dir = ckg_config['imports_databases_directory']
    builder_utils.checkDirectory(import_dir)

    import_stats = dh.generateGraphFiles(import_dir, databases, download, n_jobs)
    stats_frame = generateStatsDataFrame(import_stats)

    setupStats(import_type=import_type)
    writeStats(stats_frame, import_type)
def parser(databases_dir, download=True):
    """
    Parses the GOA annotation file, restricted to the proteins found in the
    Protein mapping.

    :param str databases_dir: directory under which the "GOA" folder is created.
    :param bool download: whether to download the annotation file first.
    :return: Tuple of (annotations, relationships header from the configuration).
    """
    config = builder_utils.get_config(config_name="goaConfig.yml", data_type='databases')
    url = config['url']
    rel_header = config['header']

    protein_mapping = mp.getMappingForEntity(entity="Protein")
    # values() must be called: passing the bound method itself to set()
    # raises TypeError.
    valid_proteins = list(set(protein_mapping.values()))

    directory = os.path.join(databases_dir, "GOA")
    builder_utils.checkDirectory(directory)
    file_name = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    annotations = parse_annotations_with_pandas(file_name, valid_proteins)

    builder_utils.remove_directory(directory)

    return annotations, rel_header
def parser(databases_directory, download=True):
    """
    Parses the Reactome files into pathway entities and pathway relationships.

    :param str databases_directory: directory under which the "Reactome" folder is created.
    :param bool download: whether to download each file first.
    :return: Tuple of (entities, relationships, entities header, relationships headers).
    """
    config = builder_utils.get_config(config_name="reactomeConfig.yml", data_type='databases')
    urls = config['reactome_urls']
    entities = set()
    relationships = defaultdict(set)
    entities_header = config['pathway_header']
    relationships_headers = config['relationships_header']

    reactome_dir = os.path.join(databases_directory, "Reactome")
    builder_utils.checkDirectory(reactome_dir)
    metabolite_mapping = mp.getMappingForEntity("Metabolite")

    for dataset, url in urls.items():
        if download:
            builder_utils.downloadDB(url, reactome_dir)
        dataset_path = os.path.join(reactome_dir, url.split('/')[-1])
        with open(dataset_path, 'r') as rf:
            # The dataset key decides which parser handles the file; other
            # keys (e.g. drug) are opened but not parsed.
            if dataset == "pathway":
                entities = parsePathways(config, databases_directory, rf)
            elif dataset == "hierarchy":
                relationships[("pathway", "has_parent")] = parsePathwayHierarchy(rf)
            elif dataset == "protein":
                relationships[(dataset, "annotated_to_pathway")] = parsePathwayRelationships(config, rf)
            elif dataset == "metabolite":
                relationships[(dataset, "annotated_to_pathway")] = parsePathwayRelationships(
                    config, rf, metabolite_mapping)

    builder_utils.remove_directory(reactome_dir)

    return entities, relationships, entities_header, relationships_headers
def parser(databases_directory, download=True):
    """
    Parses the MutationDs file into variant -> interactor relationships for
    rows whose organism field starts with "9606".

    :param str databases_directory: directory under which the "MutationDs" folder is created.
    :param bool download: whether to download the file first.
    :return: Tuple of (relationships set, header from the configuration, output file name).
    """
    relationships = set()
    config = builder_utils.get_config(config_name="mutationDsConfig.yml", data_type='databases')
    header = config['header']
    output_file_name = "mutation_curated_affects_interaction_with.tsv"
    # Captures the word between ':' and '(' of each interactor.
    interactor_pattern = re.compile(r":(\w+)\(")
    url = config['mutations_url']

    mutations_dir = os.path.join(databases_directory, "MutationDs")
    builder_utils.checkDirectory(mutations_dir)
    mutations_path = os.path.join(mutations_dir, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, mutations_dir)

    with open(mutations_path, 'r') as mf:
        # Skip the header line.
        next(mf, None)
        for line in mf:
            fields = line.rstrip("\r\n").split("\t")
            # Rows with fewer than 13 columns are ignored.
            if len(fields) <= 12:
                continue
            internal_id = fields[0]
            pvariant = '_'.join(fields[1].split(':'))
            effect = fields[5]
            organism = fields[10]
            interaction = fields[11]
            evidence = fields[12]

            if not organism.startswith("9606"):
                continue
            for match in interactor_pattern.finditer(interaction):
                interactor = match.group(1)
                relationships.add((pvariant, interactor, "CURATED_AFFECTS_INTERACTION_WITH",
                                   effect, interaction, evidence, internal_id, "Intact-MutationDs"))

    builder_utils.remove_directory(mutations_dir)

    return (relationships, header, output_file_name)
def parser(databases_directory, download=True):
    """
    Parses the DGIdb interactions file into drug -> gene TARGETS relationships.

    :param str databases_directory: directory under which the "DGIdb" folder is created.
    :param bool download: whether to download the file first.
    :return: Tuple of (relationships set, header from the configuration, output file name).
    """
    config = builder_utils.get_config(
        config_name="drugGeneInteractionDBConfig.yml", data_type='databases')
    url = config['DGIdb_url']
    header = config['header']
    output_file = "dgidb_targets.tsv"

    drugmapping = mp.getMappingForEntity("Drug")

    relationships = set()
    directory = os.path.join(databases_directory, "DGIdb")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    with open(fileName, 'r', encoding='utf-8') as associations:
        # Skip the header line.
        next(associations, None)
        for line in associations:
            data = line.rstrip("\r\n").split("\t")
            gene = data[0]
            source = data[3]
            interactionType = data[4] if data[4] != '' else 'unknown'
            # Use the first non-empty drug column, in the order 8, 7, 6.
            # The previous nesting skipped the row whenever column 7 was
            # populated, so that fallback was never used.
            drug = data[8].lower() or data[7] or data[6]
            if drug == "" or gene == "":
                continue
            # Only drugs with a known mapping are emitted.
            if drug in drugmapping:
                drug = drugmapping[drug]
                relationships.add((drug, gene, "TARGETS", "NA", "NA", "NA",
                                   interactionType, "DGIdb: " + source))

    builder_utils.remove_directory(directory)

    return (relationships, header, output_file)
def parser(databases_directory):
    """
    Parses the PhosphoSitePlus site and annotation files into modified-site
    entities and their relationships. Files are read from the
    "PhosphoSitePlus" folder; nothing is downloaded here.

    :param str databases_directory: directory under which the "PhosphoSitePlus" folder lives.
    :return: Tuple of (entities, relationships, entities header, relationships headers).
    """
    psp_dir = os.path.join(databases_directory, "PhosphoSitePlus")
    builder_utils.checkDirectory(psp_dir)
    config = builder_utils.get_config(config_name="pspConfig.yml", data_type='databases')
    modifications = config['modifications']
    annotation_files = config['annotation_files']
    entities_header = config['entities_header']
    relationships_headers = config['rel_headers']
    entities = set()
    relationships = defaultdict(set)

    for site_file in config['site_files']:
        with gzip.open(os.path.join(psp_dir, site_file), 'r') as f:
            sites, site_relationships = parseSites(f, modifications)
        entities.update(sites)
        for rel_key, rel_values in site_relationships.items():
            relationships[rel_key].update(rel_values)

    for er, annotation_file in annotation_files.items():
        # Keys have the form "<entity>-<relationship type>".
        entity, relationship_type = er.split('-')
        rel_key = (entity, relationship_type)
        with gzip.open(os.path.join(psp_dir, annotation_file), 'r') as f:
            if entity == "disease":
                mapping = mp.getMappingFromOntology(ontology="Disease", source=None)
                relationships[rel_key].update(parseDiseaseAnnotations(f, modifications, mapping))
            elif entity == "biological_process":
                mapping = mp.getMappingFromOntology(ontology="Gene_ontology", source=None)
                relationships[rel_key].update(parseRegulationAnnotations(f, modifications, mapping))
            elif entity == "substrate":
                # Substrates replace any existing entry instead of extending it.
                relationships[rel_key] = parseKinaseSubstrates(f, modifications)

    return entities, relationships, entities_header, relationships_headers
def parser(databases_directory, download=True):
    """
    Parses the Exposome Explorer archives into biomarker correlations.

    :param str databases_directory: directory under which the "ExposomeExplorer" folder is created.
    :param bool download: whether to download each archive first.
    :return: Tuple of (correlations, relationships header from the configuration).
    """
    exposome_dir = os.path.join(databases_directory, "ExposomeExplorer")
    builder_utils.checkDirectory(exposome_dir)
    config = builder_utils.get_config(config_name="exposomeConfig.yml", data_type='databases')
    database_urls = config['database_urls']
    relationships_header = config['relationships_header']
    mapping = mp.getMappingForEntity("Food")
    correlations = {}

    for url in database_urls:
        archive_name = url.split('/')[-1]
        zipped_fileName = os.path.join(exposome_dir, archive_name)
        # The member name is the first two dot-separated parts of the archive name.
        file_name = '.'.join(archive_name.split('.')[0:2])
        if download:
            builder_utils.downloadDB(url, exposome_dir)

        with zipfile.ZipFile(zipped_fileName) as z:
            # NOTE: the correlations parser uses 'biomarkers', so the
            # biomarkers url must be listed before the correlations url.
            if file_name == "biomarkers.csv":
                biomarkers = parseBiomarkersFile(z, file_name)
            elif file_name == "correlations.csv":
                correlations = parseCorrelationsFile(z, file_name, biomarkers, mapping)

    builder_utils.remove_directory(exposome_dir)

    return correlations, relationships_header
def generate_dataset_imports(projectId, dataType, dataset_import_dir):
    """
    Parses one dataset of a project with the parser matching its data type and
    generates the graph files for every parsed item.

    :param str projectId: identifier of the project being imported.
    :param str dataType: dataset type; selects the parser.
    :param str dataset_import_dir: directory where the graph files are generated.
    :raises Exception: for an unknown data type or any error raised while parsing or \
                       generating files; the error is logged before being re-raised.
    """
    stats = set()
    builder_utils.checkDirectory(dataset_import_dir)
    try:
        if dataType in ('project', 'experimental_design', 'clinical'):
            data = clinicalParser.parser(projectId, dataType)
        elif dataType in ("proteomics", "interactomics", "phosphoproteomics"):
            data = proteomicsParser.parser(projectId, dataType)
        elif dataType == "wes":
            data = wesParser.parser(projectId)
        else:
            # Raised inside the try on purpose: it is logged and wrapped below.
            raise Exception("Error when importing experiment for project {}. Non-existing parser for data type {}".format(projectId, dataType))

        # Keys are (dtype, ot) pairs.
        for dtype, ot in data:
            generate_graph_files(data[(dtype, ot)], dtype, projectId, stats, ot, dataset_import_dir)
    except Exception as err:
        _, _, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Error: {}. Experiment {}: {} file: {}, line: {}".format(err, projectId, sys.exc_info(), fname, exc_tb.tb_lineno))
        raise Exception("Error {}. Importing experiment {}. Data type {}.".format(err, projectId, dataType))