def get_all_days(self):
    """
    Returns the sorted names of all day directories (dates formatted as:
    YYYY-MM-DD) found under this source's directory.

    The queue directory (QUEUE_DIR) is not a day and is left out.
    """
    subdirectory_names = utils.get_subdirectories(self.directory)
    return sorted(name for name in subdirectory_names if name != QUEUE_DIR)
def load(cls, source_path):
    """
    Build an instance from a room directory stored on disk.

    Reads ``config.json`` for the metadata, gathers the ``*.wav`` files of
    every sub-directory of ``rir`` (one list of paths per sub-directory) and
    gathers the ``*.wav`` files of ``background_noise``. The background list
    is simply empty when nothing matches the pattern.
    """
    # metadata
    with open(os.path.join(source_path, "config.json")) as config_file:
        config = json.load(config_file)

    # responses: one list of wav paths per microphone sub-directory
    # NOTE(review): glob returns paths in arbitrary order -- confirm that
    # callers do not rely on the position of a file inside each list.
    rir_root = os.path.join(source_path, "rir")
    per_mic_files = [
        glob.glob(os.path.join(rir_root, mic_dir, "*.wav"))
        for mic_dir in get_subdirectories(rir_root)
    ]

    # background noise recordings, if any
    noise_files = glob.glob(
        os.path.join(source_path, "background_noise", "*.wav")
    )

    return cls(
        responses=per_mic_files,
        mic_metadata=config["mic_metadata"],
        speaker_metadata=config["speaker_metadata"],
        room_params=config["room_params"],
        room_id=config["id"],
        background_noise=noise_files,
    )
def get_all_provider_names(db_root):
    """
    List the content providers for which data is stored.

    Returns the names of the sub-directories of db_root (one string per
    provider), sorted in ascending order.
    """
    return sorted(utils.get_subdirectories(db_root))
def get_queued_items_count(self):
    """
    Returns the total number of queued items, i.e. the sum of
    get_item_count_for_day() over every day directory found under the
    queue directory of this source.
    """
    queue_root = os.path.join(self.directory, QUEUE_DIR)
    return sum(
        self.get_item_count_for_day(os.path.join(queue_root, day_name))
        for day_name in utils.get_subdirectories(queue_root)
    )
def get_all_batch_hours(self, date_string):
    """
    Returns the sorted list of batch hours (HH.MM.SS strings) stored for
    the given date (YYYY-MM-DD string).

    Raises NonExistentDayError when there is no directory for that date.
    """
    day_path = os.path.join(self.directory, date_string)
    if not os.path.exists(day_path):
        raise NonExistentDayError(self.name, date_string)
    return sorted(utils.get_subdirectories(day_path))
def get_data_per_batch(self, date_string, data_extractor_func):
    """
    Runs data_extractor_func once per batch stored for date_string.

    The extractor is called as data_extractor_func(self, date_string,
    batch_time) for every batch sub-directory of the day. The collected
    results are returned as a list ordered on their first element.

    Raises NonExistentDayError when there is no directory for that date.
    """
    day_path = os.path.join(self.directory, date_string)
    if not os.path.exists(day_path):
        raise NonExistentDayError(self.name, date_string)

    extracted = [
        data_extractor_func(self, date_string, batch_time)
        for batch_time in utils.get_subdirectories(day_path)
    ]
    return sorted(extracted, key=lambda item: item[0])
def get_source_summary_for_all_days(self):
    """
    Returns a list of (date, metainfos) pairs, one per day directory.

    The date is a string (formatted as: YYYY-MM-DD) and metainfos is
    whatever get_cached_metainfos_for_day() returns for that date. The
    queue directory is skipped. The list is sorted on the date (earlier
    date at the front).
    """
    day_names = sorted(
        name for name in utils.get_subdirectories(self.directory)
        if name != QUEUE_DIR
    )
    return [(day, self.get_cached_metainfos_for_day(day)) for day in day_names]
def get_errors_per_batch(self, date_string):
    """
    Returns a list of (time, [errors]) for the given date, sorted on time.

    Raises NonExistentDayError when there is no directory for that date.
    """
    day_path = os.path.join(self.directory, date_string)
    if not os.path.exists(day_path):
        raise NonExistentDayError(self.name, date_string)

    errors_by_batch = [
        (batch_time, self.get_errors_from_batch(date_string, batch_time))
        for batch_time in utils.get_subdirectories(day_path)
    ]
    errors_by_batch.sort(key=lambda entry: entry[0])
    return errors_by_batch
def get_queued_batches_by_day(self):
    """
    Each datasource directory contains a 'queue' directory in which items'
    urls are stored for delayed download. The 'queue' directory holds one
    sub-directory per day.

    Returns a list of (day_string, batches), where batches is the result of
    get_queued_items_by_batch() for that day directory. The list is sorted
    on day_string.
    """
    queue_root = os.path.join(self.directory, QUEUE_DIR)
    per_day = [
        (
            day_name,
            self.get_queued_items_by_batch(os.path.join(queue_root, day_name)),
        )
        for day_name in utils.get_subdirectories(queue_root)
    ]
    return sorted(per_day, key=lambda entry: entry[0])
def get_reprocessed_dates(self, date_string, batch_time_string):
    """
    Returns a list of (date, time) tuples, one for every reprocessed-content
    directory found inside the given batch.

    External tools can use these to rebuild the path and reprocess the raw
    html stored there. The two values are the parts of the directory name
    that follow the first underscore.

    Raises NonExistentBatchError when the batch directory does not exist.
    """
    batch_path = os.path.join(self.directory, date_string, batch_time_string)
    if not os.path.exists(batch_path):
        raise NonExistentBatchError(self.name, date_string, batch_time_string)

    found = []
    for dir_name in utils.get_subdirectories(batch_path):
        if not dir_name.startswith(REPROCESSED_DIR_PREFIX):
            continue
        day_part, time_part = dir_name.split("_")[1:]
        found.append((day_part, time_part))
    return found
def get_articles_and_errorcounts_per_batch(self, date_string):
    """
    Returns a list of (hour_string, [Articles], error_count) for a certain
    date, sorted on hour_string.

    Raises NonExistentDayError when there is no directory for that date.
    """
    day_path = os.path.join(self.directory, date_string)
    if not os.path.exists(day_path):
        raise NonExistentDayError(self.name, date_string)

    batches = []
    for batch_time in utils.get_subdirectories(day_path):
        articles, error_count = self.get_batch_content(date_string, batch_time)
        batches.append((batch_time, articles, error_count))
    return sorted(batches, key=lambda batch: batch[0])
def get_reprocessed_batch_articles(self, date_string, batch_time_string):
    """
    Returns the articles fetched during error handling sessions for a batch,
    as a list of ((date_string, hour_string), articles).

    One entry is produced per reprocessed-content directory of the batch:
    its articles file is read and the articles are sorted on their url.

    Raises NonExistentBatchError when the batch directory does not exist.
    """
    batch_path = os.path.join(self.directory, date_string, batch_time_string)
    if not os.path.exists(batch_path):
        raise NonExistentBatchError(self.name, date_string, batch_time_string)

    results = []
    for dir_name in utils.get_subdirectories(batch_path):
        if not dir_name.startswith(REPROCESSED_DIR_PREFIX):
            continue
        day_part, time_part = dir_name.split("_")[1:]
        articles_path = os.path.join(batch_path, dir_name, ARTICLES_FILENAME)
        with open(articles_path, 'r') as json_file:
            payload = json.load(json_file)
        parsed = sorted(
            (ArticleData.from_json(raw) for raw in payload['articles']),
            key=lambda article: article.url,
        )
        results.append(((day_part, time_part), parsed))
    return results
def simulate_measured_bundle(
    original_dataset,
    air_abs=False,
    ism_order=17,
    ray_tracing=False,
    freq_dep=False,
    software=RoomSimSoftware.PYROOMACOUSTICS,
):
    """
    Create a simulated counterpart of a measured room dataset.

    For every room folder under ``<original_dataset>/data`` the room is
    loaded, its ``config.json`` (and its ``background_noise`` folder, when
    there is one) is copied, and the responses returned by
    ``compute_room_irs`` for each microphone are written to
    ``rir/mic<m>/<n>.wav``. The new dataset is created in the current working
    directory under a name derived from the simulation settings and the
    original dataset metadata. ``dataset_metadata.json`` (extended with
    ``dataset_id`` and ``sim_type``) and ``data_index.json`` are written next
    to the ``data`` folder.

    If the output directory already exists the user is asked for
    confirmation through ``click.confirm(..., abort=True)`` before it is
    deleted.

    Args:
        original_dataset: path of the measured dataset; it must contain
            ``dataset_metadata.json``, ``data_index.json`` and ``data``.
        air_abs: forwarded to ``compute_room_irs`` as ``air_absorption``;
            adds ``_air_abs`` to the simulation type.
        ism_order: forwarded to ``compute_room_irs``; part of the simulation
            type (``ism<order>`` or ``hyb<order>``).
        ray_tracing: enable ray tracing (``hyb<order>`` when combined with a
            non-negative ``ism_order``, ``srt`` otherwise).
        freq_dep: when true the room properties are looked up in
            ``room_materials_registry`` with the room folder name, otherwise
            the microphone's ``t60_estimate`` is used; adds ``_freq_dep`` to
            the simulation type.
        software: ``RoomSimSoftware`` member selecting the simulator. With
            ``PYGSOUND`` the other options are overridden: ray tracing only,
            no ISM order, no air absorption, no frequency dependence.
    """
    if software == RoomSimSoftware.PYGSOUND:
        ism_order = None
        air_abs = False
        freq_dep = False
        ray_tracing = True

    ray_tracing_param = None
    if ray_tracing:
        if software == RoomSimSoftware.PYROOMACOUSTICS:
            ray_tracing_param = {
                "n_rays": int(1e5),
                "receiver_radius": 0.5,
                "time_thres": 10.0,
                # "energy_thres": 1e-7,
                # "hist_bin_size": 0.004
            }
        elif software == RoomSimSoftware.PYGSOUND:
            ray_tracing_param = {
                "diffuse_count": 20000,
                "specular_count": 2000,
                "src_radius": 0.01,
                "mic_radius": 0.01,
            }

    # load dataset
    with open(os.path.join(original_dataset, "dataset_metadata.json")) as json_file:
        metadata = json.load(json_file)
    n_rooms = metadata["n_rooms"]
    dataset_type = metadata["dataset_type"]
    timestamp = metadata["timestamp"]

    # create output dir
    if ray_tracing and ism_order is not None and ism_order >= 0:
        sim_type = f"hyb{ism_order}"
    elif ray_tracing:
        sim_type = "srt"
    else:
        sim_type = f"ism{ism_order}"
    if air_abs:
        sim_type += "_air_abs"
    if freq_dep:
        sim_type += "_freq_dep"
    dataset_id = (f"measured_room_dataset_SIM_{software}_{sim_type}"
                  f"_{dataset_type}_{n_rooms}rooms_{timestamp}")
    if os.path.isdir(dataset_id):
        click.confirm(
            "\n{} exists. Delete and replace?".format(dataset_id),
            default=True,
            abort=True,
        )
        shutil.rmtree(dataset_id)
    os.mkdir(dataset_id)
    data_index_file = "data_index.json"
    data_folder = "data"
    data_path = os.path.join(dataset_id, data_folder)
    print("New dataset ID : {}".format(dataset_id))
    os.mkdir(data_path)

    # loop through rooms
    original_data_folder = os.path.join(original_dataset, data_folder)
    rooms = get_subdirectories(original_data_folder)
    for k, _id in enumerate(rooms):
        original_room_subdir = os.path.join(original_data_folder, _id)
        room = Room.load(original_room_subdir)
        print("room {} / {} : {}".format(k + 1, n_rooms, room.id))

        # get room params
        room_params = room.params
        dimensions = room_params["dimensions"]
        temperature = room_params.get("temperature")
        # use furniture coverage as proxy for average scattering
        scattering = room_params.get("furniture_coverage", 0.5)
        scattering = max(scattering, 0.1)

        # get mic and speaker metadata
        speaker_metadata = room.speaker_metadata
        mic_metadata = room.mic_metadata
        speaker_pos = [
            _speaker["target_location"] for _speaker in speaker_metadata
        ]
        print("\nRoom params : ")
        pprint(room_params)
        print("speaker pos : ")
        print(speaker_pos)

        # create new room dir
        room_subdir = os.path.join(data_path, room.id)
        os.mkdir(room_subdir)
        rir_dir = os.path.join(room_subdir, "rir")
        os.mkdir(rir_dir)
        shutil.copy(
            os.path.join(original_room_subdir, "config.json"),
            os.path.join(room_subdir, "config.json"),
        )
        # Background noise is optional for a room, so only copy the folder
        # when the measured room actually has one (copytree raises on a
        # missing source directory).
        original_noise_dir = os.path.join(
            original_room_subdir, "background_noise"
        )
        if os.path.isdir(original_noise_dir):
            shutil.copytree(
                original_noise_dir,
                os.path.join(room_subdir, "background_noise"),
            )

        for m, _mic in enumerate(mic_metadata):
            mic_subdir = os.path.join(rir_dir, "mic{}".format(m))
            os.mkdir(mic_subdir)
            mic_pos = _mic["mic_location"]
            print(" mic pos : {}".format(mic_pos))

            # room properties: registry materials, or the RT60 estimated
            # from the measured impulse responses
            if freq_dep:
                materials = room_materials_registry[_id]
            else:
                materials = _mic["t60_estimate"]

            if isinstance(materials, list):
                # unique RT60 per mic-speaker pair: simulate one speaker at
                # a time and concatenate the responses
                mic_responses = []
                for spk_idx, rt60 in enumerate(materials):
                    resp, sample_rate = compute_room_irs(
                        room_dim=dimensions,
                        room_properties=rt60,
                        mic_pos=mic_pos,
                        source_pos=[speaker_pos[spk_idx]],
                        ism_order=ism_order,
                        ray_tracing_param=ray_tracing_param,
                        ray_tracing=ray_tracing,
                        air_absorption=air_abs,
                        temperature=temperature,
                        scattering=scattering,
                        software=software,
                    )
                    mic_responses += resp
            else:
                # compute RIR(s) for all speakers at once
                mic_responses, sample_rate = compute_room_irs(
                    room_dim=dimensions,
                    room_properties=materials,
                    mic_pos=mic_pos,
                    source_pos=speaker_pos,
                    ism_order=ism_order,
                    ray_tracing_param=ray_tracing_param,
                    ray_tracing=ray_tracing,
                    air_absorption=air_abs,
                    temperature=temperature,
                    scattering=scattering,
                    software=software,
                )

            # write RIRs
            for n, _rir in enumerate(mic_responses):
                sf.write(
                    os.path.join(mic_subdir, "{}.wav".format(n)),
                    _rir,
                    sample_rate,
                )

    # write bundle metadata
    dataset_metadata = metadata.copy()
    dataset_metadata["dataset_id"] = dataset_id
    dataset_metadata["sim_type"] = sim_type
    output_json = os.path.join(dataset_id, "dataset_metadata.json")
    with open(output_json, "w") as f:
        json.dump(dataset_metadata, f, indent=4)

    # copy data index from measured dataset
    shutil.copy(
        src=os.path.join(original_dataset, data_index_file),
        dst=os.path.join(dataset_id, data_index_file),
    )
def has_reprocessed_content(self, date_string, batch_time_string):
    """
    Tells whether the given batch contains at least one reprocessed-content
    directory (a sub-directory whose name starts with
    REPROCESSED_DIR_PREFIX).

    Raises NonExistentBatchError when the batch directory does not exist.
    """
    batch_path = os.path.join(self.directory, date_string, batch_time_string)
    if not os.path.exists(batch_path):
        raise NonExistentBatchError(self.name, date_string, batch_time_string)
    return any(
        name.startswith(REPROCESSED_DIR_PREFIX)
        for name in utils.get_subdirectories(batch_path)
    )
def download_collate_to_vcf_kinship(
        sample_lists_to_download_directory, out_directory,
        reference_vcf_path, reference_snp_pickle, chromosomes,
        vcf_from_plex_bin, bcftools_bin, baton_bin, baton_metaquery_bin,
        baton_get_bin, akt, info_include_path, nsnps=10, minkin=0.4,
        irods_credentials_path=None, n_max_processes=25, plink_bin=None,
        pipeline_entry_name='download'):
    """
    Run the fingerprint pipeline: download, build handprints, convert to
    VCF, merge with the reference, and compute kinship.

    The pipeline is a sequence of steps; ``pipeline_entry_name`` selects the
    first step to run and every later step runs as well. Steps that are
    re-run delete and recreate their own output directory.

    Args:
        sample_lists_to_download_directory (str): Directory holding the
            sample lists to process.
        out_directory (str): Directory where all results are saved. It is
            created if needed when starting at 'download'; an existing
            directory is reused.
        reference_vcf_path (str): Path to VCF which will be merged against.
        reference_snp_pickle (str): Passed to the handprint generators and
            to the reference subsetting helper.
        chromosomes: Passed to ``utils.run_vcf_from_plex``.
        vcf_from_plex_bin (str): Passed to ``utils.run_vcf_from_plex``.
        bcftools_bin (str): Stored in ``utils.BCFTOOLS_BIN``.
        baton_bin, baton_metaquery_bin, baton_get_bin (str): Forwarded to
            ``digest_sample_lists_directory`` for the download step.
        akt (str): akt executable used to build the kinship shell command.
        info_include_path (str): Region file passed to
            ``utils.include_by_region_file`` to filter the merged VCF.
        nsnps (int): Rows of the akt output are kept only when their 7th
            column is >= this value.
        minkin (float): Value given to ``akt kin --minkin``.
        irods_credentials_path (str, optional): Path to a text file
            containing user's irods password. If not supplied, user will be
            prompted.
        n_max_processes (int): Forwarded to ``digest_sample_lists_directory``.
        plink_bin (str, optional): Stored in ``utils.PLINK_BIN`` and passed
            to the statistics helpers.
        pipeline_entry_name (str): The point at which the pipeline starts:
            -download
            -makehandprints
            -plex2vcf
            -check_sampleids
            -kinship
            -analysis

    Returns:
        None. The following is created under out_directory:
            sample_list_irods.json
            fingerprints/
            handprints/<method>/
            vcfs/<method>/<handprint>.vcf
            vcfs/handprints.vcf.gz
            vcfs/subsetted_reference.vcf.gz
            vcfs/all_merged.vcf.gz
            kinship/kinship_results_minkin_<minkin>_nsnps_<nsnps>.csv
    """
    # TODO
    # 1. Determine how much the handprints stuff actually helps. How many
    #    times does one individual have multiple fingerprints?
    # 2. Plot
    # 3. Prune on LD in reference vcf

    utils.PLINK_BIN = plink_bin
    utils.BCFTOOLS_BIN = bcftools_bin

    PIPELINE_STEPS = {
        'download': 0,
        'makehandprints': 1,
        'plex2vcf': 2,
        'check_sampleids': 3,
        'kinship': 4,
        'analysis': 5,
    }
    # NOTE: under ``python -O`` this assert is skipped and an unknown name
    # falls back to step 0 through the .get() default below.
    assert pipeline_entry_name in PIPELINE_STEPS, \
        'Please specify a valid entry point.'
    pipeline_entry = PIPELINE_STEPS.get(pipeline_entry_name, 0)
    print(f'Starting at {pipeline_entry_name} in pipeline.')

    fingerprints_directory = os.path.join(out_directory, 'fingerprints')
    if pipeline_entry <= PIPELINE_STEPS['download']:
        # The former "out_directory must not already exist" check is
        # disabled: existing directories are reused.
        # Download raw fingerprints from iRODS
        os.makedirs(out_directory, exist_ok=True)
        os.makedirs(fingerprints_directory, exist_ok=True)
        sample_list_irods_db_path = os.path.join(out_directory,
                                                 'sample_list_irods.json')
        digest_sample_lists_directory(
            sample_lists_to_download_directory, sample_list_irods_db_path,
            fingerprints_directory, baton_bin, baton_metaquery_bin,
            baton_get_bin, irods_credentials_path, n_max_processes)

    handprints_directory = os.path.join(out_directory, 'handprints')
    # Generate handprints from downloaded
    if pipeline_entry <= PIPELINE_STEPS['makehandprints']:
        shutil.rmtree(handprints_directory, ignore_errors=True)
        os.mkdir(handprints_directory)
        for fingerprint_method in utils.FINGERPRINT_METHODS:
            fingerprint_method_directory = os.path.join(
                fingerprints_directory, fingerprint_method)
            handprint_method_directory = os.path.join(
                handprints_directory, fingerprint_method)
            if fingerprint_method == 'fluidigm':
                fluidigm.generate(fingerprint_method_directory,
                                  handprint_method_directory,
                                  reference_snp_pickle)
            elif fingerprint_method == 'sequenom':
                sequenom.generate(fingerprint_method_directory,
                                  handprint_method_directory,
                                  reference_snp_pickle)

    vcf_directory = os.path.join(out_directory, 'vcfs')
    print(vcf_directory)
    vcf_merged_path = os.path.join(vcf_directory, 'all_merged.vcf.gz')
    subsetted_reference_vcf = os.path.join(vcf_directory,
                                           'subsetted_reference.vcf.gz')
    vcf_merged_handprints_path = os.path.join(vcf_directory,
                                              'handprints.vcf.gz')
    # Convert handprints to VCFs
    if pipeline_entry <= PIPELINE_STEPS['plex2vcf']:
        shutil.rmtree(vcf_directory, ignore_errors=True)
        os.mkdir(vcf_directory)
        for fingerprint_method in utils.FINGERPRINT_METHODS:
            handprint_method_directory = os.path.join(handprints_directory,
                                                      fingerprint_method)
            handprints = utils.get_subdirectories(
                handprint_method_directory)
            if not handprints:
                print(f'No handprints generated for {fingerprint_method}')
                continue
            vcf_out_dir = os.path.join(vcf_directory, fingerprint_method)
            os.makedirs(vcf_out_dir, exist_ok=True)
            for handprint in handprints:
                filelist = os.path.join(handprint, 'filelist.txt')
                snpset = os.path.join(handprint, 'snpset.tsv')
                handprint_name = os.path.basename(os.path.normpath(handprint))
                vcf_out = os.path.join(vcf_out_dir, f'{handprint_name}.vcf')
                print(vcf_out)
                utils.run_vcf_from_plex(vcf_from_plex_bin, filelist,
                                        chromosomes, fingerprint_method,
                                        snpset, vcf_out)
                utils.convert_sampleids_to_lowercase_vcf(vcf_out)

        # Merging used to be a separate 'mergevcfs' step; it was folded into
        # this one because it is difficult to restart.
        # Merge all VCFs
        handprint_vcfs = glob(os.path.join(vcf_directory, '*', '*'))
        print('Handprint VCFs found:')
        print('\n'.join(handprint_vcfs))
        n_handprint_vcfs = len(handprint_vcfs)
        if not n_handprint_vcfs:
            print('Error: No handprint VCFs found to merge.')
            sys.exit(-1)
        print(f'Found {n_handprint_vcfs} handprint VCFs to merge')

        # Subset reference on only those SNPs seen in all of the
        # handprint_vcfs
        utils.subset_reference_vcfs_on_handprint_snps(
            handprint_vcfs, reference_snp_pickle, reference_vcf_path,
            subsetted_reference_vcf)
        utils.index_vcf(subsetted_reference_vcf)

        # Remove samples in handprints already present in reference - these
        # will be much higher quality in the imputed reference
        handprint_vcfs = utils.filter_duplicate_individuals(
            subsetted_reference_vcf, handprint_vcfs)
        handprint_vcfs = utils.gzip_vcfs(handprint_vcfs)
        utils.index_vcfs(handprint_vcfs)

        # Create a VCF of all handprints to print MAF/LD/missing stats
        utils.merge_vcfs(handprint_vcfs, vcf_merged_handprints_path)
        utils.index_vcf(vcf_merged_handprints_path)

        vcfs_to_merge = [
            subsetted_reference_vcf,
            vcf_merged_handprints_path,
        ]
        # Merge all VCFs
        utils.merge_vcfs(vcfs_to_merge, vcf_merged_path)
        utils.index_vcf(vcf_merged_path)

        # Prune subsetted_reference_vcf by LD - don't trust the
        # fingerprints because of the (most likely) high missingness.
        prune_exclude_path = utils.get_LD_prune_list(
            subsetted_reference_vcf)
        print('Pruning the following:')
        with open(prune_exclude_path) as f:
            print(f.readlines())
        utils.prune_by_rsid(vcf_merged_path, prune_exclude_path)
        # Post-pruning will still have the same filename
        utils.index_vcf(vcf_merged_path)

        # Generate stats
        vcfs = vcfs_to_merge + [vcf_merged_path]
        for vcf in vcfs:
            i = vcf.rfind('.vcf')
            out = vcf[:i]
            generate_missingness_stats(vcf, plink_bin, out)
            generate_LD_stats(vcf, plink_bin, out)
            generate_MAF_stats(vcf, plink_bin, out)

        # Filter out low MAF variants
        i = vcf_merged_path.rfind('.vcf')
        root = vcf_merged_path[:i]
        frq_path = f'{root}.frq'
        maf_out = f'{root}.maf.out'
        utils.get_MAF_exclude_list(frq_path, maf_out)
        utils.prune_by_rsid(vcf_merged_path, maf_out)
        utils.index_vcf(vcf_merged_path)

        # Filter out low INFO variants
        utils.include_by_region_file(vcf_merged_path, info_include_path)
        utils.index_vcf(vcf_merged_path)

    if pipeline_entry <= PIPELINE_STEPS['check_sampleids']:
        print('Checking number of the Sanger sample IDs present in the final merged VCF.')
        sample_ids_for_lookup = get_all_sangerids_from_sample_lists_directory(
            sample_lists_to_download_directory)
        sample_ids_for_lookup = {sampleid.lower()
                                 for sampleid in sample_ids_for_lookup
                                 if sampleid}
        n_sample_ids_for_lookup = len(sample_ids_for_lookup)
        sample_ids_from_vcf = utils.get_sample_ids_from_vcf(vcf_merged_path)
        included = sample_ids_from_vcf & sample_ids_for_lookup
        n_included = len(included)
        not_included = sample_ids_for_lookup - included
        n_not_included = len(not_included)
        print(f'...N included {n_included} / {n_sample_ids_for_lookup}')
        print(f'...N not included {n_not_included} / {n_sample_ids_for_lookup}')
        if n_not_included:
            s = ','.join(not_included)
            print(f'......{s}')

    # Run akt
    kinship_directory = os.path.join(out_directory, 'kinship')
    if pipeline_entry <= PIPELINE_STEPS['kinship']:
        # ignore_errors: the directory does not exist yet on a first run,
        # and rmtree would otherwise raise before the step can start.
        shutil.rmtree(kinship_directory, ignore_errors=True)
        os.makedirs(kinship_directory, exist_ok=True)
        kinship_results = os.path.join(
            kinship_directory,
            f'kinship_results_minkin_{minkin}_nsnps_{nsnps}.csv')
        # akt output is piped through awk, which keeps columns 1, 2, 6 and 7
        # of the rows whose 7th column is at least nsnps.
        command = f'{akt} kin --method 0 {vcf_merged_path} --force --minkin {minkin} |'
        command += f'awk \'{{OFS=","}} {{ if ($7 >= {nsnps}) print $1, $2, $6, $7; }}\' > {kinship_results}'
        utils.run(command)