Example #1
def map_experiment_files(project_id, datasetPath, mapping):
    '''Applies the provided identifier mapping to every file in the dataset directory and exports the mapped contents back to the same directory.'''
    files = builder_utils.listDirectoryFiles(datasetPath)

    for file in files:
        outputfile = os.path.join(datasetPath, file)
        data = builder_utils.readDataset(outputfile)
        data = map_experimental_data(data, mapping)
        builder_utils.export_contents(data, datasetPath, file)
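A minimal usage sketch (the project id, directory path and mapping dictionary below are purely illustrative; builder_utils and map_experimental_data come from the surrounding project):

# Hypothetical call: remap identifiers in every file of a dataset directory
mapping = {'OLD_ID': 'NEW_ID'}  # assumed identifier mapping
map_experiment_files('P0000001', '/data/imports/experiments/P0000001/proteomics', mapping)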
Example #2
def load_dataset(uri, configuration):
    ''' This function gets the molecular data from a proteomics experiment.
        Input: uri of the processed file resulting from MQ
        Output: pandas DataFrame with the columns and filters defined in config.py '''
    data = None
    regexCols = None
    filters = None
    columns = configuration["columns"]
    # Column names containing '+' are treated as regular expressions; the rest are literal column names
    regexCols = [c.replace("\\\\", "\\") for c in columns if '+' in c]
    columns = set(columns).difference(regexCols)
    generated_columns = []
    if 'generated_columns' in configuration:
        generated_columns = configuration['generated_columns']

    if 'filters' in configuration:
        filters = configuration["filters"]

    indexCol = configuration["indexCol"]
    data = builder_utils.readDataset(uri)
    missing_cols = check_columns(data, columns, generated_columns)
    if len(missing_cols) == 0:
        if filters is not None:
            # Keep only rows where all filter columns are empty (unflagged), then drop the filter columns
            data = data[data[filters].isnull().all(axis=1)]
            data = data.drop(filters, axis=1)
            columns = set(columns).difference(filters)
        if 'numeric filter' in configuration:
            # Keep only rows where each configured numeric column meets the minimum value
            for f in configuration['numeric filter']:
                key = list(f.keys())[0]
                if key in columns:
                    value = f[key]
                    data = data[data[key] >= value]
                else:
                    raise Exception(
                        "Error when applying numeric filter on {}. The column is not in the dataset"
                        .format(key))
        data = data.dropna(subset=[configuration["proteinCol"]], axis=0)
        data = expand_groups(data, configuration)
        columns.remove(indexCol)

        # Expand each regex column into the matching column names present in the dataset
        for regex in regexCols:
            r = re.compile(regex)
            columns.update(set(filter(r.match, data.columns)))

        data = data[list(columns)].replace('Filtered', np.nan)
        value_cols = get_value_cols(data, configuration)
        data[value_cols] = data[value_cols].apply(
            lambda x: pd.to_numeric(x, errors='coerce'))
        data = data.dropna(how='all', subset=value_cols, axis=0)
    else:
        raise Exception(
            "Error when importing proteomics experiment.\n Missing columns: {}"
            .format(",".join(missing_cols)))

    return data, regexCols
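A minimal usage sketch, assuming a MaxQuant proteinGroups.txt output and a configuration similar to the proteomics section of config.py (all keys and values below are illustrative):

# Hypothetical configuration; entries containing '+' are treated as regexes
configuration = {
    'columns': ['Majority protein IDs', 'Gene names', 'LFQ intensity .+'],
    'filters': ['Reverse', 'Potential contaminant'],
    'proteinCol': 'Majority protein IDs',
    'indexCol': 'Majority protein IDs'
}
data, regex_cols = load_dataset('/data/P0000001/proteomics/proteinGroups.txt', configuration)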
Example #3
def parse_dataset(projectId, configuration, dataDir, key='project'):
    '''This function parses clinical data from subjects in the project.
    Input: project identifier, import configuration and directory containing the clinical data file. Format: subjects as rows, clinical variables as columns.
    Output: pandas DataFrame with the same input format but the clinical variables mapped to the
    right ontology (defined in config), i.e. type = -40 -> SNOMED CT'''
    data = None
    if 'file_'+key in configuration:
        data_file = configuration['file_'+key].replace('PROJECTID', projectId)

        filepath = os.path.join(dataDir, data_file)
        if os.path.isfile(filepath):
            data = builder_utils.readDataset(filepath)

    return data
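A minimal usage sketch (the file-name template and paths are illustrative; in the project this configuration normally comes from the clinical section of config.py):

configuration = {'file_project': 'ProjectData_PROJECTID.xlsx'}  # assumed file-name template
clinical_data = parse_dataset('P0000001', configuration, '/data/imports/P0000001/clinical', key='project')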
Example #4
def loadWESDataset(uri, configuration):
    ''' This function gets the molecular data from a Whole Exome Sequencing experiment.
        Input: uri of the processed file resulting from the WES analysis pipeline. The resulting
        Annovar annotated VCF file from Mutect (sampleID_mutect_annovar.vcf)
        Output: pandas DataFrame with the columns and filters defined in config.py '''
    aux = uri.split("/")[-1].split("_")
    sample = aux[0]
    # Get the columns from config
    columns = configuration["columns"]
    # Read the data from file
    data = builder_utils.readDataset(uri)
    if configuration['filter'] in data.columns:
        data = data.loc[data[configuration['filter']], :]
    data = data[columns]
    data["sample"] = aux[0]
    data["variant_calling_method"] = aux[1]
    data["annotated_with"] = aux[2].split('.')[0]
    data["alternative_names"] = data[configuration["alt_names"]]
    data = data.drop(configuration["alt_names"], axis=1)
    data = data.iloc[1:]
    data = data.replace('.', np.nan)
    data["ID"] = data[configuration["id_fields"]].apply(lambda x: str(x[0])+":g."+str(x[1])+str(x[2])+'>'+str(x[3]), axis=1)
    data.columns = configuration['new_columns']
    return sample, data
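A minimal usage sketch, assuming an Annovar-annotated Mutect VCF named following the sampleID_mutect_annovar.vcf convention from the docstring (all configuration values below are illustrative):

# Hypothetical configuration; 'filter' is assumed to be a boolean column used to keep passing variants
configuration = {
    'columns': ['Chr', 'Start', 'Ref', 'Alt', 'Gene.refGene'],
    'filter': 'PASS',
    'alt_names': 'Gene.refGene',
    'id_fields': ['Chr', 'Start', 'Ref', 'Alt'],
    'new_columns': ['chromosome', 'position', 'reference', 'alternative', 'sample',
                    'variant_calling_method', 'annotated_with', 'alternative_names', 'ID']
}
sample, variants = loadWESDataset('/data/P0000001/wes/S1_mutect_annovar.vcf', configuration)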
Example #5
def run_processing(n_clicks, project_id):
    '''Processes the files uploaded for a project: validates the experimental design and clinical
    data files, moves the datasets from the temporary upload folder into the experiment directory,
    generates the dataset imports and partially updates the database for the project.'''
    message = None
    style = {'display': 'none'}
    table = None
    if n_clicks > 0:
        session_cookie = flask.request.cookies.get('custom-auth-session')
        destDir = os.path.join(experimentDir, project_id)
        builder_utils.checkDirectory(destDir)
        temporaryDirectory = os.path.join(tmpDirectory,
                                          session_cookie + "upload")
        datasets = builder_utils.listDirectoryFoldersNotEmpty(
            temporaryDirectory)
        res_n = dataUpload.check_samples_in_project(driver, project_id)
        if 'experimental_design' in datasets:
            dataset = 'experimental_design'
            directory = os.path.join(temporaryDirectory, dataset)
            experimental_files = os.listdir(directory)
            if config['file_design'].replace('PROJECTID',
                                             project_id) in experimental_files:
                experimental_filename = config['file_design'].replace(
                    'PROJECTID', project_id)
                designData = builder_utils.readDataset(
                    os.path.join(directory, experimental_filename))
                designData = designData.astype(str)
                if 'subject external_id' in designData.columns and 'biological_sample external_id' in designData.columns and 'analytical_sample external_id' in designData.columns:
                    if (res_n > 0).any().values.sum() > 0:
                        res = dataUpload.remove_samples_nodes_db(
                            driver, project_id)
                        res_n = dataUpload.check_samples_in_project(
                            driver, project_id)
                        if (res_n > 0).any().values.sum() > 0:
                            message = 'ERROR: There is already an experimental design loaded into the database and there was an error when trying to delete it. Contact your administrator.'
                            return message, style, table

                    res_n = None
                    result = create_new_identifiers.apply_async(
                        args=[
                            project_id,
                            designData.to_json(), directory,
                            experimental_filename
                        ],
                        task_id='data_upload_' + session_cookie +
                        datetime.now().strftime('%Y%m-%d%H-%M%S-'))
                    result_output = result.wait(timeout=None,
                                                propagate=True,
                                                interval=0.2)
                    res_n = pd.DataFrame.from_dict(result_output['res_n'])
                else:
                    message = 'ERROR: The Experimental design file provided ({}) is missing some of the required fields: {}'.format(
                        experimental_filename, ','.join([
                            'subject external_id',
                            'biological_sample external_id',
                            'analytical_sample external_id'
                        ]))
                    builder_utils.remove_directory(directory)

                    return message, style, table

        if 'clinical' in datasets:
            dataset = 'clinical'
            directory = os.path.join(temporaryDirectory, dataset)
            clinical_files = os.listdir(directory)
            if config['file_clinical'].replace('PROJECTID',
                                               project_id) in clinical_files:
                clinical_filename = config['file_clinical'].replace(
                    'PROJECTID', project_id)
                data = builder_utils.readDataset(
                    os.path.join(directory, clinical_filename))
                external_ids = {}
                if 'subject external_id' in data and 'biological_sample external_id' in data and 'analytical_sample external_id' in data:
                    external_ids['subjects'] = data[
                        'subject external_id'].astype(str).unique().tolist()
                    external_ids['biological_samples'] = data[
                        'biological_sample external_id'].astype(
                            str).unique().tolist()
                    external_ids['analytical_samples'] = data[
                        'analytical_sample external_id'].astype(
                            str).unique().tolist()
                    dataUpload.create_mapping_cols_clinical(
                        driver,
                        data,
                        directory,
                        clinical_filename,
                        separator=separator)
                    if 0 in res_n.values:
                        # Columns with a zero count correspond to sample types missing from the database
                        samples = ', '.join([k for k, v in res_n.items() if (v == 0).any()])
                        message = 'ERROR: No {} for project {} in the database. Please upload first the experimental design (ExperimentalDesign_{}.xlsx)'.format(
                            samples, project_id, project_id)
                        builder_utils.remove_directory(directory)

                        return message, style, table
                    else:
                        db_ids = dataUpload.check_external_ids_in_db(
                            driver, project_id).to_dict()
                        message = ''
                        intersections = {}
                        differences_in = {}
                        differences_out = {}
                        for col in external_ids:
                            intersect = list(
                                set(db_ids[col].values()).intersection(
                                    external_ids[col]))
                            difference_in = list(
                                set(db_ids[col].values()).difference(
                                    external_ids[col]))
                            difference_out = list(
                                set(external_ids[col]).difference(
                                    set(db_ids[col].values())))
                            if len(difference_in) > 0 or len(
                                    difference_out) > 0:
                                intersections[col] = intersect
                                differences_in[col] = difference_in
                                differences_out[col] = difference_out
                        for col in intersections:
                            message += 'WARNING: Some {} identifiers were not matched:\n Matching: {}\n No information provided: {} \n Non-existing in the database: {}\n'.format(
                                col, len(intersections[col]),
                                ','.join(differences_in[col]),
                                ','.join(differences_out[col]))
                else:
                    message = 'ERROR: Format of the Clinical Data file is not correct. Check template in the documentation. Check columns: subject external_id, biological_sample external_id and analytical_sample external_id'
                    builder_utils.remove_directory(directory)

                    return message, style, table
        try:
            for dataset in datasets:
                source = os.path.join(temporaryDirectory, dataset)
                destination = os.path.join(destDir, dataset)
                builder_utils.copytree(source, destination)
                datasetPath = os.path.join(
                    os.path.join(experimentsImportDir, project_id), dataset)
                if dataset != "experimental_design":
                    eh.generate_dataset_imports(project_id, dataset,
                                                datasetPath)

            loader.partialUpdate(imports=['project', 'experiment'],
                                 specific=[project_id])
            filename = os.path.join(tmpDirectory,
                                    'Uploaded_files_' + project_id)
            utils.compress_directory(filename,
                                     temporaryDirectory,
                                     compression_format='zip')
            style = {'display': 'block'}
            message = 'Files successfully uploaded.'
            table = dataUpload.get_project_information(driver, project_id)
        except Exception as err:
            style = {'display': 'block'}
            message = str(err)

    return message, style, table
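run_processing has the shape of a Dash callback (it returns a message, a display style and a table). A minimal sketch of how it could be wired up, assuming a Dash app object and hypothetical component ids that are not part of the snippet above:

from dash.dependencies import Input, Output, State

# Hypothetical wiring; 'app' and all component ids below are illustrative
app.callback([Output('upload-message', 'children'),
              Output('upload-result', 'style'),
              Output('upload-result', 'children')],
             [Input('submit-button', 'n_clicks')],
             [State('project-id', 'value')])(run_processing)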