Example #1
def create_new_project(driver, projectId, data, separator='|'):
    """
    Creates a new project in the graph database, following the steps:

    1. Retrieves new project external identifier and creates project node and relationships in the graph database.
    2. Creates subjects, timepoints and intervention nodes.
    3. Saves all the entities and relationships to tab-delimited files.
    4. Returns the number of projects created and the project external identifier.

    :param driver: py2neo driver, which provides the connection to the neo4j graph database.
    :type driver: py2neo driver
    :param str projectId: internal project identifier (CPxxxxxxxxxxxx).
    :param data: pandas DataFrame with the project as a row and its attributes as columns.
    :param str separator: character used to separate multiple entries in a project attribute.
    :return: Number of projects created (1 or 0, None on error) and the project external identifier (str).
    """
    query_name = 'create_project'
    external_identifier = None
    done = None
    try:
        db_project = check_if_node_exists(driver, 'Project', 'name',
                                          data['name'][0])
        if db_project.empty:
            external_identifier = get_new_project_identifier(driver, projectId)
            if external_identifier is None:
                external_identifier = 'P0000001'
            data['external_id'] = external_identifier

            projectDir = os.path.join(experimentDir, external_identifier,
                                      'clinical')
            ckg_utils.checkDirectory(projectDir)
            # 'encoding' is not passed here: recent pandas versions removed
            # that keyword from DataFrame.to_excel, and xlsx output is UTF-8.
            data.to_excel(os.path.join(
                projectDir, 'ProjectData_{}.xlsx'.format(external_identifier)),
                          index=False)

            datasetPath = os.path.join(importDir, external_identifier,
                                       'clinical')
            ckg_utils.checkDirectory(datasetPath)
            eh.generate_dataset_imports(external_identifier, 'clinical',
                                        datasetPath)
            loader.partialUpdate(imports=['project'],
                                 specific=[external_identifier])
            done = 1
        else:
            done = 0
            external_identifier = ''
    except Exception as err:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        logger.error("Reading query {}: {}, file: {},line: {}, err: {}".format(
            query_name, sys.exc_info(), fname, exc_tb.tb_lineno, err))
    return done, external_identifier
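A minimal usage sketch for the function above. The connection helper, the internal project identifier and the DataFrame columns are illustrative assumptions; the module-level names used by the function (experimentDir, importDir, ckg_utils, eh, loader) must already be in scope.

# Hedged sketch: assumes a reachable Neo4j instance and that the CKG
# connector module exposes this connection helper (assumption).
import pandas as pd
from graphdb_connector import connector

driver = connector.getGraphDatabaseConnectionConfiguration()

# One project per row; the column names below are illustrative.
project = pd.DataFrame([{'name': 'My pilot study',
                         'acronym': 'PILOT',
                         'responsible': 'jdoe'}])

done, external_id = create_new_project(driver, 'CP000000000001', project)
print(done, external_id)  # e.g. 1, 'P0000001' when the project is new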
Example #2
def run_minimal_update(user, n_jobs=3):
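    """
    Performs a minimal database build after restoring the graph from a dump:
    re-imports licensed ontologies and databases that cannot be distributed
    with the dump, downloads the mapping ontologies, and reloads the minimal
    set of entities (ontologies, modified proteins, drugs, mentions, side
    effects, clinical variants, projects and experiments).

    :param str user: name of the user requesting the update (used for logging).
    :param int n_jobs: number of parallel jobs used when importing databases.
    :return: True once the update has finished.
    """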
    licensed_dbs = ['phosphositeplus', 'drugbank']
    licensed_ont = ['Clinical_variable']
    mapping_ont = ['Disease', 'Gene_ontology', 'Experimental_factor']
    minimal_load = ['ontologies', 'modified_proteins', 'drugs', 'mentions',
                    'side effects', 'clinical_variants', 'project',
                    'experiment']
    logger.info("The user {} chose to perform a minimal build, after creating the database from a dump".format(user))
    logger.info("Building database > step 1: Importing licensed ontologies and databases")
    importer.ontologiesImport(importDirectory=directories['importDirectory'], ontologies=licensed_ont, download=False)
    importer.ontologiesImport(importDirectory=directories['importDirectory'], ontologies=mapping_ont, download=True)
    importer.databasesImport(importDirectory=directories['importDirectory'], databases=licensed_dbs, n_jobs=n_jobs, download=False)
    logger.info("Building database > step 2: Loading all missing nodes and entities")
    loader.partialUpdate(imports=minimal_load, specific=[])

    return True
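A short usage sketch; the user name only appears in the log and n_jobs bounds the parallel database imports.

# Hedged usage sketch for the minimal update above.
if run_minimal_update(user='adminuser', n_jobs=4):
    print("Minimal update finished")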
Example #3
def run_processing(n_clicks, project_id):
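    """
    Processes the files uploaded for a project: validates the experimental
    design and clinical data files, creates the corresponding identifiers in
    the database, generates the dataset imports, loads them and compresses
    the uploaded files.

    :param int n_clicks: number of times the upload button has been clicked.
    :param str project_id: external project identifier (e.g. P0000001).
    :return: Status message, a display style for the output and a table with the project information.
    """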
    message = None
    style = {'display': 'none'}
    table = None
    if n_clicks > 0:
        session_cookie = flask.request.cookies.get('custom-auth-session')
        destDir = os.path.join(experimentDir, project_id)
        builder_utils.checkDirectory(destDir)
        temporaryDirectory = os.path.join(tmpDirectory,
                                          session_cookie + "upload")
        datasets = builder_utils.listDirectoryFoldersNotEmpty(
            temporaryDirectory)
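        # Counts of subjects/biological/analytical samples already registered
        # for this project in the database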
        res_n = dataUpload.check_samples_in_project(driver, project_id)
        if 'experimental_design' in datasets:
            dataset = 'experimental_design'
            directory = os.path.join(temporaryDirectory, dataset)
            experimental_files = os.listdir(directory)
            if config['file_design'].replace('PROJECTID',
                                             project_id) in experimental_files:
                experimental_filename = config['file_design'].replace(
                    'PROJECTID', project_id)
                designData = builder_utils.readDataset(
                    os.path.join(directory, experimental_filename))
                designData = designData.astype(str)
                required_cols = ['subject external_id',
                                 'biological_sample external_id',
                                 'analytical_sample external_id']
                if all(col in designData.columns for col in required_cols):
                    if (res_n > 0).any().values.sum() > 0:
                        res = dataUpload.remove_samples_nodes_db(
                            driver, project_id)
                        res_n = dataUpload.check_samples_in_project(
                            driver, project_id)
                        if (res_n > 0).any().values.sum() > 0:
                            message = ('ERROR: There is already an experimental design loaded into the '
                                       'database and there was an error when trying to delete it. '
                                       'Contact your administrator.')
                            return message, style, table

                    res_n = None
                    result = create_new_identifiers.apply_async(
                        args=[
                            project_id,
                            designData.to_json(), directory,
                            experimental_filename
                        ],
                        task_id='data_upload_' + session_cookie +
                        datetime.now().strftime('%Y%m-%d%H-%M%S-'))
                    result_output = result.wait(timeout=None,
                                                propagate=True,
                                                interval=0.2)
                    res_n = pd.DataFrame.from_dict(result_output['res_n'])
                else:
                    message = 'ERROR: The Experimental design file provided ({}) is missing some of the required fields: {}'.format(
                        experimental_filename, ', '.join(required_cols))
                    builder_utils.remove_directory(directory)

                    return message, style, table

        if 'clinical' in datasets:
            dataset = 'clinical'
            directory = os.path.join(temporaryDirectory, dataset)
            clinical_files = os.listdir(directory)
            if config['file_clinical'].replace('PROJECTID',
                                               project_id) in clinical_files:
                clinical_filename = config['file_clinical'].replace(
                    'PROJECTID', project_id)
                data = builder_utils.readDataset(
                    os.path.join(directory, clinical_filename))
                external_ids = {}
                if all(col in data for col in ('subject external_id',
                                               'biological_sample external_id',
                                               'analytical_sample external_id')):
                    external_ids['subjects'] = data[
                        'subject external_id'].astype(str).unique().tolist()
                    external_ids['biological_samples'] = data[
                        'biological_sample external_id'].astype(
                            str).unique().tolist()
                    external_ids['analytical_samples'] = data[
                        'analytical_sample external_id'].astype(
                            str).unique().tolist()
                    dataUpload.create_mapping_cols_clinical(
                        driver,
                        data,
                        directory,
                        clinical_filename,
                        separator=separator)
                    if 0 in res_n.values:
                        # Iterating a DataFrame yields column names only; use
                        # items() to pair each sample type with its counts.
                        samples = ', '.join([k for (k, v) in res_n.items()
                                             if (v == 0).any()])
                        message = 'ERROR: No {} for project {} in the database. Please upload first the experimental design (ExperimentalDesign_{}.xlsx)'.format(
                            samples, project_id, project_id)
                        builder_utils.remove_directory(directory)

                        return message, style, table
                    else:
                        db_ids = dataUpload.check_external_ids_in_db(
                            driver, project_id).to_dict()
                        message = ''
                        intersections = {}
                        differences_in = {}
                        differences_out = {}
                        for col in external_ids:
                            intersect = list(
                                set(db_ids[col].values()).intersection(
                                    external_ids[col]))
                            difference_in = list(
                                set(db_ids[col].values()).difference(
                                    external_ids[col]))
                            difference_out = list(
                                set(external_ids[col]).difference(
                                    set(db_ids[col].values())))
                            if len(difference_in) > 0 or len(
                                    difference_out) > 0:
                                intersections[col] = intersect
                                differences_in[col] = difference_in
                                differences_out[col] = difference_out
                        for col in intersections:
                            message += 'WARNING: Some {} identifiers were not matched:\n Matching: {}\n No information provided: {} \n Non-existing in the database: {}\n'.format(
                                col, len(intersections[col]),
                                ','.join(differences_in[col]),
                                ','.join(differences_out[col]))
                else:
                    message = 'ERROR: Format of the Clinical Data file is not correct. Check template in the documentation. Check columns: subject external_id, biological_sample external_id and analytical_sample external_id'
                    builder_utils.remove_directory(directory)

                    return message, style, table
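        # Move validated files from the temporary upload area into the project
        # directory, generate the dataset imports and load them into the graph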
        try:
            for dataset in datasets:
                source = os.path.join(temporaryDirectory, dataset)
                destination = os.path.join(destDir, dataset)
                builder_utils.copytree(source, destination)
                datasetPath = os.path.join(experimentsImportDir, project_id,
                                           dataset)
                if dataset != "experimental_design":
                    eh.generate_dataset_imports(project_id, dataset,
                                                datasetPath)

            loader.partialUpdate(imports=['project', 'experiment'],
                                 specific=[project_id])
            filename = os.path.join(tmpDirectory,
                                    'Uploaded_files_' + project_id)
            utils.compress_directory(filename,
                                     temporaryDirectory,
                                     compression_format='zip')
            style = {'display': 'block'}
            message = 'Files successfully uploaded.'
            table = dataUpload.get_project_information(driver, project_id)
        except Exception as err:
            style = {'display': 'block'}
            message = str(err)

    return message, style, table
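The core of the clinical validation above is a set reconciliation between identifiers in the uploaded file and those already in the database. A self-contained sketch of that bookkeeping, with made-up identifiers:

# Standalone sketch of the identifier reconciliation performed above.
db_ids = {'subjects': {0: 'S1', 1: 'S2', 2: 'S3'}}   # per-column dict from the database
external_ids = {'subjects': ['S2', 'S3', 'S4']}      # from the uploaded file

for col in external_ids:
    in_db = set(db_ids[col].values())
    uploaded = set(external_ids[col])
    matching = in_db & uploaded       # identifiers present in both
    missing_info = in_db - uploaded   # in the database, absent from the file
    unknown = uploaded - in_db        # in the file, absent from the database
    if missing_info or unknown:
        print('WARNING: Some {} identifiers were not matched:\n'
              ' Matching: {}\n No information provided: {}\n'
              ' Non-existing in the database: {}'.format(
                  col, len(matching), ','.join(sorted(missing_info)),
                  ','.join(sorted(unknown))))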
Example #4
                            logger.info("These entities will be imported: {}".format(", ".join(valid_entities)))
                            print("These entities will be loaded into the database: {}".format(", ".join(valid_entities)))
                            importer.ontologiesImport(importDirectory=directories['importDirectory'], ontologies=valid_entities, download=download)
                        else:
                            logger.error("The indicated entities (--data) cannot be imported: {}".format(args.data))
                            print("The indicated entities (--data) cannot be imported: {}".format(args.data))
            else:
                print("Indicate the data to be imported by passing the argument --data and the list to be imported. \
                                Example: python builder.py --build_type import --import_types databases --data UniProt")
    elif args.build_type == 'load':
        logger.info("The build will load data into the database: {}".format("".join(args.load_entities)))
        valid_entities = []
        specific = args.specific
        if len(args.load_entities) > 0:
            valid_entities = [x.lower() for x in args.load_entities if x.lower() in config['graph']]
        else:
            valid_entities = config['graph']
        if len(valid_entities) > 0:
            logger.info("These entities will be loaded into the database: {}".format(", ".join(valid_entities)))
            print("These entities will be loaded into the database: {}".format(", ".join(valid_entities)))
            loader.partialUpdate(imports=valid_entities, specific=specific)
        else:
            logger.error("The indicated entities (--load_entities) cannot be loaded: {}".format(args.load_entities))
            print("The indicated entities (--load_entities) cannot be loaded into the database: {}".format(args.load_entities))
    else:
        print("Indicate the type of build you want to perform, either import (generate csv files to be loaded into the database), \
                                    load (load csv files into the database) or full (import and then load all the data into the database) \
                                    Example: Import > python builder.py --build_type import --import_types databases --data UniProt\n \
                                    Load > python builder.py --build_type load --load_types Mentions\n \
                                    Full > python builder.py --build_type full or simpy python builder.py")
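The branches above and in the next example read build_type, import_types, data, load_entities, specific and n_jobs from a parsed command line. A plausible argparse sketch; the argument names come from the code, while defaults, choices and help texts are assumptions:

# Hedged sketch of the command-line parser these branches rely on.
import argparse

parser = argparse.ArgumentParser(description='CKG graph database builder')
parser.add_argument('--build_type', default='full',
                    choices=['import', 'load', 'full', 'minimal'])
parser.add_argument('--import_types', nargs='+', default=None)
parser.add_argument('--load_entities', nargs='+', default=[])
parser.add_argument('--data', nargs='+', default=None)
parser.add_argument('--specific', nargs='+', default=[])
parser.add_argument('--n_jobs', type=int, default=4)
args = parser.parse_args()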
Example #5
    importer.ontologiesImport(
        importDirectory=directories['importDirectory'],
        ontologies=licensed_ont,
        download=False)
    importer.ontologiesImport(
        importDirectory=directories['importDirectory'],
        ontologies=mapping_ont,
        download=True)
    importer.databasesImport(
        importDirectory=directories['importDirectory'],
        databases=licensed_dbs,
        n_jobs=args.n_jobs,
        download=False)
    logger.info(
        "Building database > step 2: Loading all missing nodes and entities")
    loader.partialUpdate(imports=minimal_load, specific=[])
elif args.build_type == 'import':
    logger.info("The user chose to perform a partial build")
    if args.import_types is not None:
        if args.data is None or len(args.data) > 0:
            logger.info("The build will import data from {}".format(
                ", ".join(args.import_types)))
            for import_type in args.import_types:
                logger.info("Importing {}: {}".format(import_type, args.data))
                if import_type.lower() in ('experiments', 'experiment'):
                    importer.experimentsImport(projects=args.data, n_jobs=1)
                elif import_type.lower() in ('users', 'user'):