import collections
import logging
import os

import pandas as pd

# Helpers such as load_qc_data, cache_qc_results, filter_by_sample_id, the
# get_*_data functions, caller_map, constants, load_records, init_load,
# load_index, ElasticsearchClient, and AnalysisLoader are defined elsewhere
# in this repo.


def get_colossus_tantalus_data(directory):
    hmmcopy_data = collections.defaultdict(list)

    for table_name, data in load_qc_data(directory).items():
        hmmcopy_data[table_name].append(data)

    for table_name in hmmcopy_data:
        hmmcopy_data[table_name] = pd.concat(
            hmmcopy_data[table_name], ignore_index=True)

    return hmmcopy_data
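# A minimal usage sketch: get_colossus_tantalus_data returns a dict mapping
# each QC table name to a single concatenated DataFrame. The directory path
# below is hypothetical:
#
#     tables = get_colossus_tantalus_data('/path/to/qc_results')
#     for table_name, df in tables.items():
#         print(table_name, df.shape)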
def load_data(directory, dashboard_id, host, port):
    logger.info("LOADING DATA: " + dashboard_id)

    hmmcopy_data = collections.defaultdict(list)

    for table_name, data in load_qc_data(directory).items():
        hmmcopy_data[table_name].append(data)

    for table_name in hmmcopy_data:
        hmmcopy_data[table_name] = pd.concat(
            hmmcopy_data[table_name], ignore_index=True)

    logger.info(f'loading hmmcopy data with tables {hmmcopy_data.keys()}')

    for index_type in constants.DATA_TYPES:
        index_name = f"{dashboard_id.lower()}_{index_type}"
        logger.info(f"Index {index_name}")

        # Dispatch to the matching get_<index_type>_data helper.
        data = eval(f"get_{index_type}_data(hmmcopy_data)")

        logger.info(f"dataframe for {index_name} has shape {data.shape}")
        load_records(data, index_name, host, port)
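# load_data dispatches via eval to the per-type loaders (get_qc_data,
# get_segs_data, get_bins_data, get_gc_bias_data). A sketch of an equivalent,
# eval-free dispatch using an explicit mapping, mirroring load_ticket below:
#
#     index_get_data = {
#         'qc': get_qc_data,
#         'segs': get_segs_data,
#         'bins': get_bins_data,
#         'gc_bias': get_gc_bias_data,
#     }
#     for index_type, get_data in index_get_data.items():
#         data = get_data(hmmcopy_data)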
def load_ticket(
        jira_ticket,
        ip_address,
        local_cache_directory=None,
        ticket_directory=None,
        description=None,
        title=None,
        sample_id=None,
        cell_subset_count=None,
        cell_ids=None,
        experimental_condition_override=None,
):
    # Exactly one of local_cache_directory or ticket_directory must be given.
    if (local_cache_directory is not None) == bool(ticket_directory):
        raise ValueError(
            'must specify one of local_cache_directory or ticket_directory')

    # Treat an empty cell_ids list the same as no filter.
    if not cell_ids:
        cell_ids = None

    if cell_subset_count and cell_ids:
        logging.info(
            'Sorry, --cell_subset_count and --cell_ids arguments cannot be used together')
        return

    logging.info(f'jira ticket {jira_ticket}')

    if local_cache_directory is not None:
        cache_qc_results(jira_ticket, local_cache_directory)
        ticket_directory = [os.path.join(local_cache_directory, jira_ticket)]

    hmmcopy_data = collections.defaultdict(list)

    for d in ticket_directory:
        for table_name, data in load_qc_data(d).items():
            hmmcopy_data[table_name].append(data)

    for table_name in hmmcopy_data:
        hmmcopy_data[table_name] = pd.concat(
            hmmcopy_data[table_name], ignore_index=True)

    if experimental_condition_override is not None:
        for table_name, data in hmmcopy_data.items():
            if 'experimental_condition' in data:
                data['experimental_condition'] = experimental_condition_override

    logging.info(f'loading hmmcopy data with tables {hmmcopy_data.keys()}')

    if sample_id is not None:
        logging.info(f'filtering hmmcopy data by sample={sample_id}')
        filter_by_sample_id(hmmcopy_data, sample_id)

    elasticsearch_client = ElasticsearchClient(host=ip_address)

    if cell_subset_count is not None:
        cell_ids = hmmcopy_data['annotation_metrics']['cell_id'].iloc[:cell_subset_count].values

    index_get_data = {
        'qc': get_qc_data,
        'segs': get_segs_data,
        'bins': get_bins_data,
        'gc_bias': get_gc_bias_data,
    }

    for index_type, get_data in index_get_data.items():
        index_name = f"{jira_ticket.lower()}_{index_type}"
        logging.info(f"Index {index_name}")

        init_load(elasticsearch_client, index_name)

        data = get_data(hmmcopy_data)

        # Subset cells (gc_bias tables are not filtered by cell).
        if cell_ids is not None and index_type != 'gc_bias':
            data = data[data['cell_id'].isin(cell_ids)]

        logging.info(f"dataframe for {index_name} has shape {data.shape}")

        data['caller'] = caller_map[index_type]
        data['sample_id'] = jira_ticket

        load_index(elasticsearch_client, index_name, data)

    logging.info(f"loading published dashboard record {jira_ticket}")
    AnalysisLoader().load_data(jira_ticket, ip_address, 9200)
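# A minimal usage sketch, assuming an Elasticsearch instance is reachable at
# the given address; the ticket id, host, and cache path are hypothetical:
#
#     load_ticket(
#         'SC-1234',
#         'localhost',
#         local_cache_directory='/tmp/qc_cache',
#         sample_id='SA609',
#     )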
def test_load_local_qc_data(results_dir):
    results_tables = load_qc_data(results_dir)
    test_qc_data(results_tables)
    logging.info(f'successfully loaded results from {results_dir}')
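# test_load_local_qc_data expects a results_dir argument, which under pytest
# would typically come from a fixture. A sketch of such a fixture, assuming
# QC results live under a tests/data directory (the path is hypothetical):
#
#     import pytest
#
#     @pytest.fixture
#     def results_dir():
#         return os.path.join(os.path.dirname(__file__), 'data', 'qc_results')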