def write_specimen_file(specimen_field,
                        specimen_value,
                        project_short_name,
                        output_dir,
                        entry_id,
                        full_file=None):
    dataset_name = 'FIELD_{}_VALUE_{}'.format(
        specimen_field, specimen_value).replace(' ', '_').replace('/', '_')

    r = send_get('dataset')
    datasets = json.loads(r.text)['_items']
    for dataset in datasets:
        if dataset['name'] == dataset_name:
            bioblocks_log(
                'Dataset with name \'{}\' already exists, skipping!'.format(
                    dataset_name))
            return

    es_query = generate_es_query(project_short_name, specimen_field,
                                 specimen_value)
    r = session.post(
        url=HCA_DSS_URL,
        data=json.dumps(es_query),
        headers={'Content-type': 'application/json'},
        timeout=None,
    )
    time.sleep(5)
    results = json.loads(r.text)['results']
    if len(results) == 0:
        bioblocks_log(
            'Unable to find HCA data for dataset with short name \'{}\' and field:value pair \'{}:{}\''
            .format(project_short_name, specimen_field, specimen_value))
        return

    bioblocks_log('Creating specimen dataset \'{}\''.format(dataset_name))

    specimen_uuid = str(uuid.uuid4())
    create_dataset(specimen_uuid, dataset_name, [entry_id])
    output_dir = '{}/{}'.format(output_dir, specimen_uuid)
    create_directory(output_dir)

    results_file = '{}/{}_fqids.tsv'.format(output_dir, dataset_name)

    with open(results_file, 'w') as file:
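        # The search results are paginated: when a 'Link' response header is
        # present it points at the URL of the next page of results.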
        while True:
            write_bundle_results(results, file, full_file)
            if 'Link' in r.headers:
                link_header = r.headers['Link']
                next_url = link_header[1:link_header.index('&output')]
                r = session.post(url=next_url,
                                 data=json.dumps(es_query),
                                 headers={'Content-type': 'application/json'},
                                 timeout=None)
                results = json.loads(r.text)['results']
                bioblocks_log('Found {} results'.format(len(results)))
            else:
                break
    bioblocks_log('Finished writing results file \'{}\''.format(results_file))
def analyze_dataset(dataset):
    dataset_analyses = dataset['analyses']
    dataset_id = dataset['_id']
    pca_location = ''
    for analysis in dataset_analyses:
        if analysis['processType'] == 'TSNE':
            bioblocks_log(
                'T-SNE process already found for dataset \'{}\', skipping'.
                format(dataset_id))
            return
        elif analysis['processType'] == 'SPRING':
            pca_location = 'files/datasets/{}/analyses/{}/counts_norm.npz'.format(
                dataset_id, analysis['_id'])

    if pca_location == '':
        bioblocks_log(
            'Unable to find counts_norm.npz for dataset \'{}\', skipping T-SNE'
            .format(dataset_id))
        return

    analysis_id = str(uuid.uuid4())
    start_time = datetime.utcnow()
    bioblocks_log(
        'Starting TensorFlow T-SNE analysis \'{}\' for dataset \'{}\''.format(
            analysis_id, dataset_id))

    try:
        npz_file = portal_spring_upload_functions.load_npz(pca_location)
    except Exception as e:
        bioblocks_log(e)
        return

    data = portal_spring_helper.get_pca(npz_file,
                                        keep_sparse=True,
                                        normalize=False)
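    # Assumption: TSNE here is a multicore t-SNE implementation where n_jobs
    # sets the number of worker threads; fit_transform returns an
    # (n_cells, 2) embedding by default.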
    tsne = TSNE(n_jobs=4)
    Y = tsne.fit_transform(data)

    end_time = datetime.utcnow()
    bioblocks_log(
        'Finished TensorFlow T-SNE analysis \'{}\' for dataset \'{}\'. Duration: {}'
        .format(analysis_id, dataset_id, end_time - start_time))

    output_dir = create_analysis_directory(
        'files/datasets/{}'.format(dataset_id), analysis_id)

    np.savetxt('{}/tsne_matrix.csv'.format(output_dir), Y, delimiter=',')

    analysis = {
        '_id': analysis_id,
        'process_type': 'TSNE',
        'name': '{} - {}'.format(dataset['name'], 'TSNE')
    }
    post_bioblocks_analysis(analysis)
    patch_analysis_for_dataset(dataset, analysis_id)
Example No. 3
def send_get(endpoint, timeout=None):
    url = bioblocks_api_url.format(endpoint)
    bioblocks_log('Sending GET to \'{}\''.format(url))
    time.sleep(4)
    r = session.get(
        url=url,
        timeout=timeout,
        verify=verify,
    )
    bioblocks_log('Received status code \'{}\' from GET'.format(r.status_code))
    return r
def start_getting_bundles(args):
    response = session.get(url=HCA_PROJECT_URL, timeout=None)
    hits = json.loads(response.text)['hits']
    output_dir = '{}/files/datasets'.format(path)
    bioblocks_log('Using output_dir: \'{}\''.format(output_dir))

    for hit in hits:
        entry_id = hit['entryId']
        projects = hit['projects']
        specimens = hit['specimens']
        for project in projects:
            process_project(project, specimens, entry_id, args, output_dir)
def start_analysis():
    dataset_url = 'dataset?embedded={"analyses":1}'
    r = send_get(dataset_url)
    if not r.ok:
        bioblocks_log('Error getting dataset analyses: {}'.format(r.text))
    else:
        datasets = json.loads(r.text)['_items']
        for dataset in datasets:
            if 'matrixLocation' in dataset:
                analyze_dataset(dataset)
            else:
                bioblocks_log('Dataset {} has no matrix'.format(
                    dataset['_id']))
def create_dataset(_id, name, derived_from):
    r = send_post(
        'dataset',
        json.dumps({
            '_id': _id,
            'derivedFrom': derived_from,
            'name': name,
        }))
    if (r.ok):
        bioblocks_log('Successfully created dataset \'{}\''.format(_id))
    else:
        bioblocks_log('Error creating dataset \'{}\': \'{}\''.format(
            _id, r.text))
Example No. 7
def send_delete(endpoint, headers, timeout=None):
    url = bioblocks_api_url.format(endpoint)
    time.sleep(4)
    bioblocks_log('Sending DELETE to: \'{}\''.format(url))
    r = session.delete(
        url=url,
        headers=headers,
        timeout=timeout,
        verify=verify,
    )
    bioblocks_log('Received status code \'{}\' from DELETE'.format(
        r.status_code))
    return r
Example No. 8
def send_patch(endpoint, data, headers, timeout=None):
    url = bioblocks_api_url.format(endpoint)
    bioblocks_log('Sending PATCH to \'{}\''.format(url))
    time.sleep(4)
    r = session.patch(
        url=url,
        data=data,
        headers=headers,
        timeout=timeout,
        verify=verify,
    )
    bioblocks_log('Received status code \'{}\' from PATCH'.format(
        r.status_code))
    return r
Example No. 9
def patch_dataset_for_job(request_id, result_location, associated_dataset):
    r = send_patch('{}/{}'.format('dataset', associated_dataset['dataset']),
                   json.dumps({
                       'matrixLocation': result_location,
                   }), {
                       'Content-type': 'application/json',
                       'If-Match': associated_dataset['etag']
                   })

    if (r.ok):
        bioblocks_log('Successfully patched job \'{}\''.format(request_id))
        return json.loads(r.text)['_etag']
    else:
        bioblocks_log('Error patching job \'{}\': \'{}\''.format(
            request_id, r.text))
Example No. 10
def send_post(endpoint,
              data,
              headers=None,
              timeout=None):
    if headers is None:
        headers = {'Content-type': 'application/json'}
    url = bioblocks_api_url.format(endpoint)
    bioblocks_log('Sending POST to \'{}\''.format(url))
    time.sleep(4)
    r = session.post(
        url=url,
        data=data,
        headers=headers,
        timeout=timeout,
        verify=verify,
    )
    bioblocks_log('Received status code \'{}\' from POST'.format(
        r.status_code))
    return r
Example No. 11
def create_bioblocks_job(request_id, dataset):
    r = send_post(
        'job',
        json.dumps({
            '_id': request_id,
            'associatedDataset': {
                'dataset': dataset['_id'],
                'etag': dataset['_etag']
            },
            'link': '{}/{}'.format(MATRIX_SERVICE_V0_URL, request_id),
            'status': 'IN_PROGRESS'
        }))

    if (r.ok):
        bioblocks_log('Successfully created job \'{}\''.format(request_id))
    else:
        bioblocks_log('Error creating job \'{}\': \'{}\''.format(
            request_id, r.text))
Example No. 12
def derive_es_query_match_field(match_key):
    match_dict = {
        'id':
        'files.specimen_from_organism_json.biomaterial_core.biomaterial_id',
        'genusSpecies': 'files.cell_suspension_json.genus_species.text',
        'organ': 'files.specimen_from_organism_json.organ.text',
        'organPart': 'files.specimen_from_organism_json.organ_part.text',
        'organismAge': 'files.donor_organism_json.organism_age',
        'biologicalSex': 'files.donor_organism_json.sex',
        'disease': 'files.donor_organism_json.diseases.text',
        'preservationMethod':
        'files.specimen_from_organism_json.preservation_storage.preservation_method',
        'source': 'files.library_preparation_protocol_json.nucleic_acid_source'
    }

    if match_key in match_dict:
        return match_dict[match_key]
    else:
        bioblocks_log('Unhandled specimen field \'{}\'!'.format(match_key))
        return match_key
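# For reference, a minimal sketch (an assumption on our part; generate_es_query
# is defined elsewhere and may differ) of how these field paths are typically
# combined into a DSS search body:
#
#     {
#         'es_query': {
#             'query': {
#                 'bool': {
#                     'must': [
#                         {'match': {'files.project_json.project_core.project_short_name': project_short_name}},
#                         {'match': {derive_es_query_match_field(specimen_field): specimen_value}},
#                     ]
#                 }
#             }
#         }
#     }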
Example No. 13
def send_hca_matrix_job_request(dataset):
    """ V0
    bundle_fqids_url = '{}/datasets/{}/{}_fqids.tsv'.format(TSV_PUBLIC_URL,
                                                            dataset['_id'], dataset['name'])
    bioblocks_log('POST-ing matrix job with bundle_fqid_url=\'{}\''.format(bundle_fqids_url))
    r = session.post(
        url=MATRIX_SERVICE_V0_URL,
        data=json.dumps({
            'bundle_fqids_url': bundle_fqids_url,
            'format': 'mtx',
        }),
        headers={'Content-type': 'application/json'},
        timeout=None
    )
    """
    r = session.post(url=MATRIX_SERVICE_V1_URL,
                     data=json.dumps({
                         'fields':
                         ['specimen_from_organism.provenance.document_id'],
                         'filter': {
                             'op': '=',
                             'value': dataset['name'],
                             'field': 'project.project_core.project_short_name'
                         },
                         'format':
                         'mtx',
                     }),
                     headers={'Content-type': 'application/json'},
                     timeout=None)
    bioblocks_log('Returned status from matrix service: {}'.format(
        r.status_code))
    if (r.ok):
        result = json.loads(r.text)
        request_id = result['request_id']
        associated_dataset = {
            'dataset': dataset['_id'],
            'etag': dataset['_etag']
        }
        if result['status'] == 'In Progress':
            dataset['_etag'] = patch_dataset_for_job(
                request_id, 'IN_PROGRESS - CHECK JOB {}'.format(request_id),
                associated_dataset)
            create_bioblocks_job(request_id, dataset)
        elif result['status'] == 'Failed':
            bioblocks_log('Matrix job failed with message: \'{}\''.format(
                result['message']))
        else:
            bioblocks_log('Unhandled status \'{}\' from matrix service'.format(
                result['status']))
    else:
        bioblocks_log(r.text)
Example No. 14
def patch_matrix_info_for_dataset(dataset, mtx_info, matrix_location=None):
    dataset_id = dataset['_id']
    dataset_etag = dataset['_etag']

    if matrix_location is None:
        matrix_location = dataset['matrixLocation']

    r = send_patch('{}/{}'.format('dataset', dataset_id), json.dumps({
        'matrixInfo': {
            'colCount': mtx_info[1],
            'rowCount': mtx_info[0]
        },
        'matrixLocation': matrix_location,
    }), {'Content-type': 'application/json',
         'If-Match': dataset_etag})
    bioblocks_log('Returned status from dataset PATCH of matrix info: {}'.format(r.status_code))
    if r.ok is False:
        bioblocks_log(r.text)
        return dataset_etag
    else:
        return json.loads(r.text)['_etag']
Example No. 15
def delete_directory(old_dir):
    bioblocks_log('Deleting directory \'{}\''.format(old_dir))
    try:
        shutil.rmtree(old_dir)
        bioblocks_log('Deleted directory \'{}\''.format(old_dir))
    except Exception:
        bioblocks_log(
            'Directory \'{}\' doesn\'t exist, skipping!'.format(old_dir))
Example No. 16
def create_directory(new_dir):
    bioblocks_log('Creating directory \'{}\''.format(new_dir))
    try:
        os.mkdir(new_dir)
        bioblocks_log('Created new directory \'{}\''.format(new_dir))
    except Exception:
        bioblocks_log(
            'Directory \'{}\' already exists, skipping!'.format(new_dir))
Example No. 17
def check_bioblocks_jobs():
    """
    Determine if each job has either finished or taken too long to finish.
    """
    r = send_get('job')
    jobs = json.loads(r.text)['_items']
    for job in jobs:
        job_id = job['_id']
        try:
            handle_hca_matrix_job_status(job)
            started = job['_created']
            job_age_days = days_between_dates(
                datetime.datetime.utcnow(),
                datetime.datetime.strptime(started, RFC_1123_FORMAT))
            if job_age_days >= MAX_DAYS_JOB_KEEP_ALIVE:
                delete_bioblocks_job(job)
            else:
                bioblocks_log(
                    'Not deleting job \'{}\', less than {} days old'.format(
                        job_id, MAX_DAYS_JOB_KEEP_ALIVE))
        except Exception as e:
            bioblocks_log('Exception checking job id \'{}\': {}'.format(
                job_id, e))
def create_analysis_directory(dataset_dir, analysis_id):
    analysis_dir = '{}/analyses'.format(dataset_dir)
    try:
        os.mkdir(analysis_dir)
        bioblocks_log('Created new directory \'{}\''.format(analysis_dir))
    except Exception:
        bioblocks_log(
            'Analyses directory \'{}\' already exists, skipping!'.format(
                analysis_dir))

    output_dir = '{}/{}'.format(analysis_dir, analysis_id)
    try:
        os.mkdir(output_dir)
        bioblocks_log('Created new directory \'{}\''.format(output_dir))
    except Exception:
        bioblocks_log(
            'Analysis directory \'{}\' already exists, skipping!'.format(
                output_dir))
    return output_dir
Example No. 19
def process_project(project, specimens, entry_id, args, output_dir):
    short_name = project['projectShortname']
    if (args.is_dry_run and (short_name == 'HumanMousePancreas')) or \
            not args.is_dry_run:
        bioblocks_log(
            'Getting bundles for project with shortname \'{}\''.format(
                short_name))
        create_directory('{}/{}'.format(output_dir, entry_id))
        create_dataset(entry_id, short_name, [])

        return

        with open('{}/{}/full_fqids.tsv'.format(output_dir, entry_id),
                  'w') as full_file:
            for specimen in specimens:
                for specimen_field in specimen:
                    for specimen_value in specimen[specimen_field]:
                        if specimen_value is None:
                            bioblocks_log(
                                'Specimen field \'{}\' with value of \'none\''.
                                format(specimen_field))
                        elif specimen_field == 'id':
                            write_specimen_file(specimen_field, specimen_value,
                                                short_name, output_dir,
                                                entry_id, full_file)
                            """
                            elif not args.is_dry_run:
                                write_specimen_file(
                                    specimen_field, specimen_value, short_name, output_dir, entry_id)
                            """
                        else:
                            bioblocks_log(
                                'Skipping field \'{}\' with value \'{}\''.
                                format(specimen_field, specimen_value))
    else:
        bioblocks_log(
            'Skipping getting bundles for project with shortname \'{}\'.'.
            format(short_name))
Example No. 20
def patch_analysis_for_dataset(dataset, analysis_id):
    dataset_id = dataset['_id']
    dataset_etag = dataset['_etag']
    dataset_analyses = dataset['analyses']
    dataset_analyses.append(analysis_id)
    bioblocks_log('PATCHing dataset \'{}\' with analysis \'{}\''.format(dataset_id, analysis_id))
    r = send_patch('{}/{}'.format('dataset', dataset_id), json.dumps({
        'analyses': dataset_analyses,
    }), {'Content-type': 'application/json',
         'If-Match': dataset_etag})

    bioblocks_log('Returned status from dataset PATCH of analysis: {}'.format(r.status_code))
    if r.ok is False:
        bioblocks_log(r.text)
        return dataset_etag
    else:
        return json.loads(r.text)['_etag']
Example No. 21
def analyze_dataset(dataset, dataset_dir, MAX_CELLS_COUNT=200000):
    dataset_analyses = dataset['analyses']
    dataset_id = dataset['_id']
    matrix_location = dataset['matrixLocation']
    # dataset_id = 'f83165c5-e2ea-4d15-a5cf-33f3550bffde'
    # matrix_location =
    # 'https://s3.amazonaws.com/dcp-matrix-service-results-prod/c34ccb0e-e4fa-4c08-8d76-84aca71dfc99.mtx.zip'

    bioblocks_log('analyzing dataset directory \'{}\' with matrix_location \'{}\''.format(dataset_dir, matrix_location))
    ends_with_zip = matrix_location.endswith('.zip')
    ends_with_mtx = matrix_location.endswith('mtx')
    ends_with_mtx_gz = matrix_location.endswith('mtx.gz')

    if (
        ends_with_zip is False and ends_with_mtx is False and ends_with_mtx_gz is False
    ):
        bioblocks_log('Dataset \'{}\' has an invalid matrix location: {}, not running SPRING.'.format(
            dataset_id, matrix_location))
        return

    for analysis in dataset_analyses:
        if analysis['processType'] == 'SPRING':
            bioblocks_log('SPRING process already found for dataset \'{}\', skipping'.format(
                dataset_id))
            return

    if (ends_with_zip is True or ends_with_mtx is True):
        zip_request = session.get(matrix_location)
        zip_location = zip_request.content
    else:
        zip_location = matrix_location

    # Unzip the matrix and write it to the local file system.
    tmp_dir = ''
    with zipfile.ZipFile(io.BytesIO(zip_location)) as z:
        z.extractall(dataset_dir)
        for name in z.namelist():
            tmp_dir = name.split('/')[0]
            final_file_name = name.split('/', 1)[-1][:-3]
            full_path = '{}/{}'.format(dataset_dir, name)
            create_directory('{}/matrix'.format(dataset_dir))
            with gzip.open(full_path, 'rb') as f_in:
                with open('{}/matrix/{}'.format(dataset_dir, final_file_name), 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
    run_spring_analysis(dataset_dir=dataset_dir, dataset_id=dataset_id,
                        dataset=dataset, MAX_CELLS_COUNT=MAX_CELLS_COUNT, tmp_dir=tmp_dir)
Example No. 22
def create_hca_matrix_jobs():
    r = send_get('dataset')

    if (r.ok):
        datasets = json.loads(r.text)['_items']
        for dataset in datasets:
            if 'matrixLocation' in dataset:
                bioblocks_log(
                    'Not creating job for dataset \'{}\', it already has a matrix field'
                    .format(dataset['_id']))
            else:
                bioblocks_log('Creating job for dataset \'{}\'.'.format(
                    dataset['_id']))
                send_hca_matrix_job_request(dataset)
    else:
        bioblocks_log('Unable to get datasets from bioblocks: {}'.format(
            r.text))
Example No. 23
def post_bioblocks_analysis(analysis):
    bioblocks_log('Creating analysis with analysis_id=\'{}\''.format(
        analysis['_id']))
    r = send_post(
        'analysis',
        json.dumps({
            '_id': analysis['_id'],
            'name': analysis['name'],
            'processType': analysis['process_type'],
        }), {'Content-type': 'application/json'})

    bioblocks_log(
        'Returned status from bioblocks analysis request: \'{}\''.format(
            r.status_code))
    if r.ok is False:
        bioblocks_log(r.text)
    else:
        return json.loads(r.text)['_etag']
Example No. 24
def start_analysis():
    dataset_url = 'dataset?embedded={"analyses":1}'
    r = send_get(dataset_url)
    if not r.ok:
        bioblocks_log('Error getting dataset analyses: {}'.format(r.text))
    else:
        datasets = json.loads(r.text)['_items']

        for dataset in datasets:
            dataset_id = dataset['_id']
            dataset_dir = 'files/datasets/{}'.format(dataset_id)
            if 'matrixLocation' in dataset:
                try:
                    analyze_dataset(dataset, dataset_dir)
                except Exception as e:
                    bioblocks_log('Exception found running SPRING: {}'.format(e))
                    traceback.print_exc()

            else:
                bioblocks_log('Dataset {} has no matrix, not running SPRING.'.format(dataset_id))
Example No. 25
def create_subsampled_matrix(large_matrix_location,
                             gene_file=None,
                             MAX_CELLS_COUNT=MAX_CELLS_COUNT):
    mtx_info = mminfo(large_matrix_location)
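    # mminfo (assumed to be scipy.io.mminfo) returns (rows, cols, entries, ...)
    # for the MatrixMarket file, so index 0 is rows and index 1 is columns.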
    num_rows = mtx_info[0]
    num_cols = mtx_info[1]
    num_cells = num_cols

    if gene_file is not None:
        gene_list = load_genes(
            gene_file,
            delimiter='\t' if gene_file.endswith('tsv') else None,
            skip_rows=1 if gene_file.endswith('tsv') else 0)

        if num_rows == len(gene_list):
            bioblocks_log('Genes are rows, Cells are cols')
            num_cells = num_cols
            sample_rows = False
        else:
            bioblocks_log('Cells are rows, Genes are cols')
            num_cells = num_rows
            sample_rows = True
    else:
        gene_list = ['']

    if num_cells > MAX_CELLS_COUNT:
        # Choose which 1-based cell indices to keep (MatrixMarket indices
        # start at 1), sorted so they can be searched with bisect below.
        subsampled_indices = random.sample(range(1, num_cells + 1),
                                           MAX_CELLS_COUNT)
        subsampled_indices.sort()
        subsampled_matrix_location = '{}_sub.mtx'.format(
            large_matrix_location[0:len(large_matrix_location) - 4])
        with open(subsampled_matrix_location, 'w') as subsampled_matrix:
            with open(large_matrix_location) as large_matrix:
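                # Each data line of a MatrixMarket coordinate file is
                # 'row col value'; depending on matrix orientation, cells are
                # addressed by the row index or by the column index.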
                matrix_header = large_matrix.readline()
                num_entries = 0
                output_lines = []
                for line in large_matrix:
                    fields = line.split(' ')
                    matrix_index = -1
                    if sample_rows is True and fields[0].isnumeric():
                        matrix_index = int(fields[0])
                        num_entries += 1
                    elif len(fields) >= 2 and fields[1].isnumeric():
                        matrix_index = int(fields[1])
                        num_entries += 1

                    bisect_index = bisect_left(subsampled_indices,
                                               matrix_index)

                    if bisect_index != len(
                            subsampled_indices
                    ) and subsampled_indices[bisect_index] == matrix_index:
                        output_lines.append(line)

                if sample_rows is True:
                    matrix_header = '{} {} {}\n{}'.format(
                        MAX_CELLS_COUNT, len(gene_list), num_entries,
                        matrix_header)
                else:
                    matrix_header = '{}{} {} {}\n'.format(
                        matrix_header, len(gene_list), MAX_CELLS_COUNT,
                        num_entries)
                subsampled_matrix.write(matrix_header)
                for line in output_lines:
                    subsampled_matrix.write(line)
        os.remove(large_matrix_location)
        os.rename(subsampled_matrix_location, large_matrix_location)
Example No. 26
def handle_hca_matrix_job_status(job):
    """
    Process a matrix job which can either succeed, fail, or still be in progress.
    """
    r = session.get(url=job['link'], timeout=None)
    job_id = job['_id']
    if r.ok is False:
        bioblocks_log('Job \'{}\' link returned error: {}'.format(
            job_id, r.text))
    else:
        result = json.loads(r.text)
        if result['status'] == 'In Progress':
            bioblocks_log('IN PROGRESS')
            bioblocks_log('Job \'{}\' is still in progress'.format(job_id))
        elif result['status'] == 'Failed':
            bioblocks_log('FAILED')
            bioblocks_log('Job \'{}\' failed with message \'{}\''.format(
                job_id, result['message']))
            delete_bioblocks_job(job)
        elif result['status'] == 'Complete':
            bioblocks_log('COMPLETE')
            matrix_location = result['matrix_location']
            bioblocks_log(
                'Job \'{}\' is complete! Storing matrix location of \'{}\''.
                format(job_id, matrix_location))
            patch_dataset_for_job(job_id, result['matrix_location'],
                                  job['associatedDataset'])
            delete_bioblocks_job(job)
        else:
            bioblocks_log('Unhandled status \'{}\' with message \'{}\''.format(
                result['status'], result['message']))
def make_spring_subplot(E,
                        gene_list,
                        save_path,
                        base_ix=None,
                        normalize=True,
                        exclude_dominant_frac=1.0,
                        min_counts=3,
                        min_cells=5,
                        min_vscore_pctl=75,
                        show_vscore_plot=False,
                        exclude_gene_names=None,
                        num_pc=30,
                        sparse_pca=False,
                        pca_norm=True,
                        k_neigh=4,
                        cell_groupings=None,
                        num_force_iter=100,
                        output_spring=True,
                        precomputed_pca=None,
                        gene_filter=None,
                        custom_colors=None,
                        exclude_corr_genes_list=None,
                        exclude_corr_genes_minCorr=0.2,
                        dist_metric='euclidean',
                        use_approxnn=False,
                        run_doub_detector=False,
                        dd_k=50,
                        dd_frac=5,
                        dd_approx=True,
                        tot_counts_final=None):

    out = {}

    # Avoid mutable default arguments: custom_colors is modified below.
    if cell_groupings is None:
        cell_groupings = {}
    if custom_colors is None:
        custom_colors = {}

    E = E.tocsc()
    if base_ix is None:
        base_ix = np.arange(E.shape[0])

    # total counts normalize
    if tot_counts_final is None:
        tot_counts_final = E.sum(1).A.squeeze()
    out['tot_counts_final'] = tot_counts_final

    if normalize:
        # print 'Normalizing'
        E = tot_counts_norm(E, exclude_dominant_frac=exclude_dominant_frac)[0]

    if precomputed_pca is None:
        if gene_filter is None:
            # Get gene stats (above Poisson noise, i.e. V-scores)
            # print 'Filtering genes'
            if (min_counts > 0) or (min_cells > 0) or (min_vscore_pctl > 0):
                gene_filter = filter_genes(E,
                                           base_ix,
                                           min_vscore_pctl=min_vscore_pctl,
                                           min_counts=min_counts,
                                           min_cells=min_cells,
                                           show_vscore_plot=show_vscore_plot)
            else:
                gene_filter = np.arange(E.shape[1])

            if len(gene_filter) == 0:
                return 'Error: No genes passed filter'
                # print 'Error: All genes have mean expression < '+repr(min_exp) + ' or CV < '+repr(min_cv)
            # print 'Using %i genes' %(len(gene_filter))

            if not exclude_corr_genes_list is None:
                gene_filter = remove_corr_genes(
                    E,
                    gene_list,
                    exclude_corr_genes_list,
                    gene_filter,
                    min_corr=exclude_corr_genes_minCorr)
                if len(gene_filter) == 0:
                    return 'Error: No genes passed filter'

            # Remove user-excluded genes from consideration
            if not exclude_gene_names is None:
                keep_ix = np.array([
                    ii for ii, gix in enumerate(gene_filter)
                    if gene_list[gix] not in exclude_gene_names
                ])
                # print 'Excluded %i user-provided genes' %(len(gene_filter)-len(keep_ix))
                gene_filter = gene_filter[keep_ix]
                if len(gene_filter) == 0:
                    return 'Error: No genes passed filter'

        out['gene_filter'] = gene_filter
        # RUN PCA
        # if method == 'sparse': normalize by stdev
        # if method == anything else: z-score normalize
        # print 'Running PCA'
        num_pc = min(len(gene_filter), num_pc)
        bioblocks_log('num_pc: {}'.format(num_pc))
        out['num_pc'] = num_pc
        Epca = get_pca(E[:, gene_filter],
                       base_ix=base_ix,
                       numpc=num_pc,
                       keep_sparse=sparse_pca,
                       normalize=pca_norm)
        out['Epca'] = Epca
    # else:
    #     print 'Using user-supplied PCA coordinates'
    #     Epca = precomputed_pca

    # print 'Building kNN graph'

    links, knn_graph = get_knn_graph(Epca,
                                     k=k_neigh,
                                     dist_metric=dist_metric,
                                     approx=use_approxnn)
    out['knn_graph'] = knn_graph

    if run_doub_detector:
        import doublet_detector as woublet
        # print 'Running woublet'
        doub_score, doub_score_full, doub_labels = woublet.detect_doublets(
            [],
            counts=tot_counts_final,
            doub_frac=dd_frac,
            k=dd_k,
            use_approxnn=dd_approx,
            precomputed_pca=Epca)
        out['doub_score'] = doub_score
        out['doub_score_sim'] = doub_score_full

    if output_spring:

        if not os.path.exists(save_path):
            os.makedirs(save_path)

        # print 'Saving SPRING files to %s' %save_path
        custom_colors['Total Counts'] = tot_counts_final
        np.savez_compressed(save_path + '/intermediates.npz',
                            Epca=Epca,
                            gene_filter=gene_filter,
                            total_counts=tot_counts_final)

        bioblocks_log('--- SAVING PCA ---')
        np.savetxt('{}/pca.csv'.format(save_path),
                   Epca,
                   delimiter=',',
                   fmt='%.3f')
        bioblocks_log('--- FINISHED SAVING PCA ---')

        if run_doub_detector:
            custom_colors['Doublet Score'] = doub_score

        if len(cell_groupings) > 0:
            save_spring_dir_sparse_hdf5(E,
                                        gene_list,
                                        save_path,
                                        list(links),
                                        custom_colors=custom_colors,
                                        cell_groupings=cell_groupings)
        else:
            save_spring_dir_sparse_hdf5(E,
                                        gene_list,
                                        save_path,
                                        list(links),
                                        custom_colors=custom_colors)

    if num_force_iter > 0:
        positions = get_force_layout(links,
                                     Epca.shape[0],
                                     n_iter=num_force_iter,
                                     edgeWeightInfluence=1,
                                     barnesHutTheta=2,
                                     scalingRatio=1,
                                     gravity=0.05,
                                     jitterTolerance=1,
                                     verbose=False)
        positions = positions / 5.0
        positions = positions - \
            np.min(positions, axis=0) - np.ptp(positions, axis=0) / 2.0
        positions[:, 0] = positions[:, 0] + 750
        positions[:, 1] = positions[:, 1] + 250
        out['coordinates'] = positions

    if output_spring:
        if num_force_iter > 0:
            np.savetxt(save_path + '/coordinates.txt',
                       np.hstack(
                           (np.arange(positions.shape[0])[:,
                                                          None], positions)),
                       fmt='%i,%.5f,%.5f')

        info_dict = {}
        info_dict['Date'] = '%s' % datetime.now()
        info_dict['Nodes'] = Epca.shape[0]
        info_dict['Filtered_Genes'] = len(gene_filter)
        info_dict['Gene_Var_Pctl'] = min_vscore_pctl
        info_dict['Min_Cells'] = min_cells
        info_dict['Min_Counts'] = min_counts
        info_dict['Num_Neighbors'] = k_neigh
        info_dict['Num_PCs'] = num_pc
        info_dict['Num_Force_Iter'] = num_force_iter
        with open(save_path + '/run_info.json', 'w') as f:
            f.write(json.dumps(info_dict, indent=4, sort_keys=True))

    return out
Example No. 28
def run_spring_preprocessing(
    # Input files
    mtx_file='matrix.mtx',
    gene_file='gene_id.csv',
    cell_labels_file=None,
    custom_colors_file=None,

    # Main dataset directory (for storing counts matrices, gene names, subplots)
    main_dir='example_dataset',

    # Subplot directory (all cells used for first subplot; subsets of cells
    # can be used to make additional subplots from within SPRING viewer)
    subplot_name='all_cells',

    # Get pre-processing parameters.
    # On our server, these are specified by the user.
    # For now, I've just hard-coded some (probably)
    # reasonable defaults.
    cell_min_counts=0,
    gene_min_cells=3,
    gene_min_counts=3,
    gene_var_pctl=85,
    n_neighbors=5,
    n_prin_comps=30,
    n_force_iter=500,
    subsample_range=[],
    sample_rows=False,
    num_cells=0
):

    #========================================================================================#
    # Initialize some stuff
    cell_labels = {}
    custom_color_tracks = {}
    gene_sets = {}
    subplot_dir = '{}/{}'.format(main_dir, subplot_name)

    bioblocks_log('SPRING parameters: \n\
        \'mtx_file\': {}\n\
        \'gene_file\': {}\n\
        \'cell_labels_file\': {}\n\
        \'main_dir\': {}\n\
        \'subplot_name\': {}\n\
        \'subplot_dir\': {}'
                  .format(mtx_file, gene_file, cell_labels_file, main_dir, subplot_name, subplot_dir))

    #========================================================================================#
    # LOAD DATA

    # Load expression matrix - supporting mtx files, but I also have code for
    # many other formats. Let me know if you want something more flexible.
    mtx_info = mminfo(mtx_file)
    num_rows = mtx_info[0]

    gene_list = load_genes(gene_file,
                           delimiter='\t' if gene_file.endswith('tsv') else None,
                           skip_rows=1 if gene_file.endswith('tsv') else 0)
    E = run_matrix_sampling(num_cells=num_cells, mtx_file=mtx_file,
                            sample_size=subsample_range, sample_rows=sample_rows)

    bioblocks_log('E shape: {}'.format(E.shape))

    # Find dimension of counts matrix that matches number of genes.
    # If necessary, transpose counts matrix with
    # rows=cells and columns=genes.
    if num_rows == len(gene_list):
        E = E.T.tocsc()

    valid_gene_mask = get_valid_gene_mask(gene_list)

    gene_list = [g for iG, g in enumerate(gene_list) if valid_gene_mask[iG]]
    E = E[:, valid_gene_mask]

    # Load cell_labels (categorical variables) if file has been specified
    # if cell_labels_file is not None:
    # cell_labels = load_custom_data(cell_labels_file)

    # Load custom_colors (continuous variables) if file has been specified
    # if custom_colors_file is not None:
    # custom_color_tracks = load_custom_data(custom_colors_file)

    #========================================================================================#
    # PROCESS DATA

    # Make main directory
    if not os.path.exists(main_dir):
        os.makedirs(main_dir)

    # Perform cell filtering, removing cells with less than minimum
    # number of total counts
    total_counts = E.sum(1).A.squeeze()
    if cell_min_counts > 0:
        cell_filter = (total_counts >= cell_min_counts)
    else:
        cell_filter = (total_counts > 0)

    total_counts = total_counts[cell_filter]

    # Save cell filter
    np.save('{}/cell_filter_mask.npy'.format(main_dir), cell_filter)

    # Save gene list
    np.savetxt('{}/genes.txt'.format(main_dir), gene_list, fmt='%s')

    # Calculate stress signature: fraction of counts from mitochondrially
    # encoded genes (genes starting with "mt-" or "MT-")
    mito_ix = np.array([iG for iG, g in enumerate(gene_list)
                        if g.startswith('mt-') or g.startswith('MT-')], dtype=int)
    if len(mito_ix) > 0:
        mito_frac = E[:, mito_ix][cell_filter, :].sum(
            1).A.squeeze() / total_counts.astype(float)
        custom_color_tracks['Mito. frac.'] = mito_frac

    # Normalize counts matrix
    E = tot_counts_norm(E[cell_filter, :])[0]
    # E = tot_counts_norm(E[0])[0]

    # Save counts matrix as hdf5 files for fast loading in SPRING
    save_hdf5_genes(
        E, gene_list, '{}/counts_norm_sparse_genes.hdf5'.format(main_dir))
    save_hdf5_cells(E, '{}/counts_norm_sparse_cells.hdf5'.format(main_dir))
    save_sparse_npz(E, '{}/counts_norm.npz'.format(main_dir))

    # Save total counts per cell
    np.savetxt('{}/total_counts.txt'.format(main_dir), total_counts)

    # Set default cell label - same for all cells
    cell_labels['Default'] = ['All cells' for i in range(E.shape[0])]

    # Calculate gene set signatures if gene sets are provided
    if len(gene_sets) > 0:
        for kk, vv in gene_sets.items():
            custom_color_tracks[kk] = average_profile(E, gene_list, vv)

    # Use Truncated SVD and approximate nearest neighbors if >100,000 cells
    if E.shape[0] < 100000:
        sparse_pca = False
        use_approxnn = False
    else:
        sparse_pca = True
        use_approxnn = True

    # Run SPRING pre-processing
    out = make_spring_subplot(
        E,
        gene_list,
        subplot_dir,
        normalize=False,
        min_counts=gene_min_counts,
        min_cells=gene_min_cells,
        min_vscore_pctl=gene_var_pctl,
        num_pc=n_prin_comps,
        sparse_pca=sparse_pca,
        k_neigh=n_neighbors,
        cell_groupings=cell_labels,
        num_force_iter=n_force_iter,
        custom_colors=custom_color_tracks,
        use_approxnn=use_approxnn,
        tot_counts_final=total_counts
    )

    # Save pre-processing parameters
    new_params = {
        'min_reads': cell_min_counts,
        'min_cells': gene_min_cells,
        'min_counts': gene_min_counts,
        'vscore_pctl': gene_var_pctl,
        'k': n_neighbors,
        'p': n_prin_comps
    }

    print('{}/params.p'.format(subplot_dir))
    pickle.dump(new_params, open('{}/params.p'.format(subplot_dir), 'wb'))

    # Save cell filter files
    np.savetxt('{}/cell_filter.txt'.format(subplot_dir),
               np.arange(E.shape[0]), fmt='%i')
    np.save('{}/cell_filter.npy'.format(subplot_dir), np.arange(E.shape[0]))
Example No. 29
def run_spring_analysis(dataset_dir, dataset_id, dataset, MAX_CELLS_COUNT, tmp_dir):
    mtx_file = '{}/matrix/matrix.mtx'.format(dataset_dir)
    gene_file = '{}/matrix/genes.tsv'.format(dataset_dir)
    bioblocks_log('mtx_file = {}'.format(mtx_file))
    mtx_info = mminfo(mtx_file)
    num_rows = mtx_info[0]
    num_cols = mtx_info[1]
    bioblocks_log(mtx_info)

    gene_list = load_genes(gene_file,
                           delimiter='\t' if gene_file.endswith('tsv') else None,
                           skip_rows=1 if gene_file.endswith('tsv') else 0)

    if num_rows == len(gene_list):
        bioblocks_log('Genes are rows, Cells are cols')
        num_cells = num_cols
        sample_rows = False
    else:
        bioblocks_log('Cells are rows, Genes are cols')
        num_cells = num_rows
        sample_rows = True

    if num_cells > MAX_CELLS_COUNT:
        bioblocks_log('mtx_file: {}'.format(mtx_file))
        create_subsampled_matrix(mtx_file, gene_file, MAX_CELLS_COUNT)
        num_cells = MAX_CELLS_COUNT

    subsample_ranges = get_cell_subsample_ranges(num_cells)
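    # get_cell_subsample_ranges (defined elsewhere) is assumed to return a list
    # of target cell counts; one SPRING analysis is run per subsample size.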

    bioblocks_log('Attempting to run SPRING with subsample ranges {}'.format(subsample_ranges))

    for subsample_range in subsample_ranges:
        analysis_id = str(uuid.uuid4())
        start_time = datetime.utcnow()
        bioblocks_log('Starting SPRING analysis \'{}\' for dataset \'{}\''.format(
            analysis_id, dataset_id))

        main_dir = '{}/analyses/{}'.format(dataset_dir, analysis_id)

        spring_load_preprocess.run_spring_preprocessing(
            mtx_file=mtx_file,
            gene_file=gene_file,
            cell_labels_file='{}/matrix/cells.tsv'.format(
                dataset_dir),
            main_dir=main_dir,
            subplot_name=dataset['name'],
            sample_rows=sample_rows,
            subsample_range=subsample_range,
            num_cells=num_cells
        )

        try:
            dataset['_etag'] = patch_matrix_info_for_dataset(dataset, mtx_info, mtx_file)
            analysis = {
                '_id': analysis_id,
                'process_type': 'SPRING',
                'name': '{} - {}'.format(dataset['name'], get_numeric_shorthand_suffix(subsample_range))
            }
            post_bioblocks_analysis(analysis)
            dataset['_etag'] = patch_analysis_for_dataset(dataset, analysis_id)
        except Exception as e:
            bioblocks_log('Error with compression of matrix file: {}'.format(e))
            return

    try:
        # bioblocks_log('Compressing file: {}'.format(mtx_file))
        # with open(mtx_file, 'rb') as f_in:
        #     with gzip.open('{}.gz'.format(mtx_file), 'wb') as f_out:
        #         shutil.copyfileobj(f_in, f_out)
        # bioblocks_log('Finished compressing file: {}'.format(mtx_file))

        delete_directory('{}/{}'.format(dataset_dir, tmp_dir))
        os.remove(mtx_file)

    except Exception as e:
        bioblocks_log('Error with cleanup of matrix file: {}'.format(e))

    end_time = datetime.utcnow()
    bioblocks_log('Finished SPRING analysis \'{}\' for dataset \'{}\'. Duration: {}'.format(
        analysis_id, dataset_id, end_time - start_time))
Example No. 30
def delete_file(old_file):
    try:
        os.remove(old_file)
        bioblocks_log('Deleted file \'{}\''.format(old_file))
    except Exception:
        bioblocks_log('File \'{}\' doesn\'t exist, skipping!'.format(old_file))