Example #1
    def get_all_days(self):
        """
        Returns a sorted list of all the dates (formatted as: YYYY-MM-DD) for which there is data available.
        """
        all_days = [d for d in utils.get_subdirectories(self.directory) if d != QUEUE_DIR]
        all_days.sort()
        return all_days
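Every example in this list relies on a get_subdirectories helper (called as utils.get_subdirectories or imported directly) that is not shown here. A minimal sketch of what such a helper plausibly looks like, assuming it returns the names of the immediate subdirectories, which matches how most callers here join the results back onto the parent path (the variant used in Example #15 appears to return full paths instead):

import os

def get_subdirectories(path):
    # Hypothetical reconstruction, not taken from the source: return the names
    # (not full paths) of the immediate subdirectories of `path`.
    return [entry for entry in os.listdir(path)
            if os.path.isdir(os.path.join(path, entry))]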
Example #2
    @classmethod
    def load(cls, source_path):
        """
        Load a room bundle from `source_path`: metadata from config.json, one list of
        RIR WAV files per microphone under rir/, and any background noise recordings.
        """
        metadata_path = os.path.join(source_path, "config.json")
        rir_path = os.path.join(source_path, "rir")

        # load metadata
        with open(metadata_path) as json_file:
            metadata = json.load(json_file)

        # load responses
        mic_responses = get_subdirectories(rir_path)
        responses = []
        for _mic in mic_responses:
            rir_files = glob.glob(os.path.join(rir_path, _mic, "*.wav"))
            responses.append(rir_files)

        # load background if available
        background_noise_path = os.path.join(source_path, "background_noise")
        background_files = glob.glob(
            os.path.join(background_noise_path, "*.wav")
        )

        return cls(
            responses=responses,
            mic_metadata=metadata["mic_metadata"],
            speaker_metadata=metadata["speaker_metadata"],
            room_params=metadata["room_params"],
            room_id=metadata["id"],
            background_noise=background_files,
        )
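A hedged usage sketch of the loader above (the path is a placeholder; the attribute names follow how the loaded object is used in Example #13 below):

# Hypothetical usage; only attributes actually accessed in Example #13 are shown.
room = Room.load("dataset/data/room_000")   # placeholder path
print(room.id)                   # room identifier from config.json
print(room.params)               # room parameters (dimensions, temperature, ...)
print(len(room.mic_metadata))    # one metadata entry per microphone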
Example #3
def get_all_provider_names(db_root):
    """
    Get the list of providers for which we have data.

    Returns a list of strings, one for each content provider.
    """
    subdirs = utils.get_subdirectories(db_root)
    subdirs.sort()
    return subdirs
Example #4
    def get_queued_items_count(self):
        queue_directory = os.path.join(self.directory, QUEUE_DIR)
        batched_days = utils.get_subdirectories(queue_directory)
        item_count = 0

        for day_string in batched_days:
            day_directory = os.path.join(queue_directory, day_string)
            item_count += self.get_item_count_for_day(day_directory)

        return item_count
Example #5
    def get_all_batch_hours(self, date_string):
        """
        For a certain date (YYYY-MM-DD string), returns a list of hours (as HH.MM.SS strings) for which we have data available.
        """
        path = os.path.join(self.directory, date_string)
        if os.path.exists(path):
            all_batches = utils.get_subdirectories(path)
            all_batches.sort()
            return all_batches
        else:
            raise NonExistentDayError(self.name, date_string)
Example #6
    def get_data_per_batch(self, date_string, data_extractor_func):
        day_directory = os.path.join(self.directory, date_string)
        if os.path.exists(day_directory):
            all_batch_times = utils.get_subdirectories(day_directory)
            all_data = []
            for batch_time in all_batch_times:
                extracted_data = data_extractor_func(self, date_string, batch_time)
                all_data.append(extracted_data)
            all_data.sort(key=lambda x: x[0])
            return all_data
        else:
            raise NonExistentDayError(self.name, date_string)
Example #7
    def get_source_summary_for_all_days(self):
        """
        Returns a list of (date, article_count, error_count). The date is a string (formatted as: YYYY-MM-DD),
        and counters are integers.

        The list is sorted on the date (earlier date at the front)
        """

        all_days = [d for d in utils.get_subdirectories(self.directory) if d != QUEUE_DIR]
        all_days.sort()
        result = list()
        for date_string in all_days:
            metainfos = self.get_cached_metainfos_for_day(date_string)
            result.append((date_string, metainfos))
        return result
Example #8
    def get_errors_per_batch(self, date_string):
        """
        Returns a list of (time, [errors]).
        """
        day_directory = os.path.join(self.directory, date_string)
        if os.path.exists(day_directory):
            all_batch_times = utils.get_subdirectories(day_directory)
            all_errors = []
            for batch_time in all_batch_times:
                errors = self.get_errors_from_batch(date_string, batch_time)
                all_errors.append((batch_time, errors))
            all_errors.sort(key=lambda x: x[0])
            return all_errors
        else:
            raise NonExistentDayError(self.name, date_string)
Example #9
    def get_queued_batches_by_day(self):
        """
        Each datasource directory contains a 'queue' directory in which items' urls
        are stored for delayed download.

        Under the 'queue' directory,
        """
        queue_directory = os.path.join(self.directory, QUEUE_DIR)
        batched_days = utils.get_subdirectories(queue_directory)
        batches_by_day = list()
        for day_string in batched_days:
            day_directory = os.path.join(queue_directory, day_string)
            batches_by_day.append((day_string, self.get_queued_items_by_batch(day_directory)))

        batches_by_day.sort(key=lambda day_batches: day_batches[0])
        return batches_by_day
Example #10
    def get_reprocessed_dates(self, date_string, batch_time_string):
        """
        Returns a list of (date, time) tuples for which we have reprocessed content

        This allows external tools to rebuild the path and reprocessed the
        raw html stored there
        """
        batch_dir = os.path.join(self.directory, date_string, batch_time_string)
        if os.path.exists(batch_dir):
            reprocessed_articles_dates = list()
            for reprocessed_data_dir in [i for i in utils.get_subdirectories(batch_dir) if i.startswith(REPROCESSED_DIR_PREFIX)]:
                reprocessed_date, reprocessed_time = reprocessed_data_dir.split("_")[1:]
                reprocessed_articles_dates.append((reprocessed_date, reprocessed_time))
            return reprocessed_articles_dates
        else:
            raise NonExistentBatchError(self.name, date_string, batch_time_string)
Example #11
    def get_articles_and_errorcounts_per_batch(self, date_string):
        """
        Returns a list of (hour_string, [Articles], error_count) for a certain date
        """
        day_directory = os.path.join(self.directory, date_string)
        if os.path.exists(day_directory):
            all_batch_times = utils.get_subdirectories(day_directory)
            all_batches = []
            for batch_time in all_batch_times:
                batch_content = self.get_batch_content(date_string, batch_time)
                articles, batch_error_count = batch_content
                all_batches.append((batch_time, articles, batch_error_count))

            all_batches.sort(key=lambda x: x[0])
            return all_batches
        else:
            raise NonExistentDayError(self.name, date_string)
Example #12
    def get_reprocessed_batch_articles(self, date_string, batch_time_string):
        """
        Returns articles fetched during an error handling session.

        ((date_string, hour_string), articles)
        """
        batch_dir = os.path.join(self.directory, date_string, batch_time_string)
        if os.path.exists(batch_dir):
            reprocessed_articles = list()
            for reprocessed_data_dir in [i for i in utils.get_subdirectories(batch_dir) if i.startswith(REPROCESSED_DIR_PREFIX)]:
                reprocessed_date, reprocessed_time = reprocessed_data_dir.split("_")[1:]

                json_filepath = os.path.join(batch_dir, reprocessed_data_dir, ARTICLES_FILENAME)
                with open(json_filepath, 'r') as f:
                    json_content = json.load(f)
                    articles = [ArticleData.from_json(json_string) for json_string in json_content['articles']]
                    articles.sort(key=lambda art: art.url)
                    reprocessed_articles.append(((reprocessed_date, reprocessed_time), articles))
            return reprocessed_articles
        else:
            raise NonExistentBatchError(self.name, date_string, batch_time_string)
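Taken together, the datasource examples (#1, #4 through #12, and #14 below) imply a per-source directory layout roughly like the following. This is a reconstruction from the code, with the module constants left symbolic rather than guessed:

    <source directory>/
        <QUEUE_DIR>/
            YYYY-MM-DD/              # queued items for that day, grouped by batch
        YYYY-MM-DD/                  # one directory per downloaded day
            HH.MM.SS/                # one directory per batch
                <REPROCESSED_DIR_PREFIX>.../     # named so the reprocessing date and time can be split back out
                    <ARTICLES_FILENAME>          # JSON with the reprocessed articles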
Example #13
def simulate_measured_bundle(
    original_dataset,
    air_abs=False,
    ism_order=17,
    ray_tracing=False,
    freq_dep=False,
    software=RoomSimSoftware.PYROOMACOUSTICS,
):
    if software == RoomSimSoftware.PYGSOUND:
        ism_order = None
        air_abs = False
        freq_dep = False
        ray_tracing = True

    ray_tracing_param = None
    if ray_tracing:
        if software == RoomSimSoftware.PYROOMACOUSTICS:
            ray_tracing_param = {
                "n_rays": int(1e5),
                "receiver_radius": 0.5,
                "time_thres": 10.0,
                # "energy_thres": 1e-7,
                # "hist_bin_size": 0.004
            }
        elif software == RoomSimSoftware.PYGSOUND:
            ray_tracing_param = {
                "diffuse_count": 20000,
                "specular_count": 2000,
                "src_radius": 0.01,
                "mic_radius": 0.01,
            }

    # load dataset
    with open(os.path.join(original_dataset,
                           "dataset_metadata.json")) as json_file:
        metadata = json.load(json_file)

    n_rooms = metadata["n_rooms"]
    dataset_type = metadata["dataset_type"]
    timestamp = metadata["timestamp"]

    # create output dir
    if ray_tracing and ism_order is not None and ism_order >= 0:
        sim_type = f"hyb{ism_order}"
    elif ray_tracing:
        sim_type = f"srt"
    else:
        sim_type = f"ism{ism_order}"
    if air_abs:
        sim_type += "_air_abs"
    if freq_dep:
        sim_type += "_freq_dep"
    dataset_id = (f"measured_room_dataset_SIM_{software}_{sim_type}"
                  f"_{dataset_type}_{n_rooms}rooms_{timestamp}")
    if os.path.isdir(dataset_id):
        click.confirm(
            "\n{} exists. Delete and replace?".format(dataset_id),
            default=True,
            abort=True,
        )
        shutil.rmtree(dataset_id)
    os.mkdir(dataset_id)
    data_index_file = "data_index.json"
    data_folder = "data"
    data_path = os.path.join(dataset_id, data_folder)
    print("New dataset ID : {}".format(dataset_id))
    os.mkdir(data_path)

    # loop through rooms
    original_data_folder = os.path.join(original_dataset, data_folder)
    rooms = get_subdirectories(original_data_folder)
    for k, _id in enumerate(rooms):

        original_room_subdir = os.path.join(original_data_folder, _id)
        room = Room.load(original_room_subdir)
        print("room {} / {} : {}".format(k + 1, n_rooms, room.id))

        # get room params
        room_params = room.params
        dimensions = room_params["dimensions"]
        temperature = room_params.get("temperature")

        # use furniture coverage as proxy for average scattering
        scattering = room_params.get("furniture_coverage", 0.5)
        scattering = max(scattering, 0.1)

        # get mic and speaker metadata, loop over
        speaker_metadata = room.speaker_metadata
        mic_metadata = room.mic_metadata

        speaker_pos = []
        for _speaker in speaker_metadata:
            speaker_pos.append(_speaker["target_location"])

        print("\nRoom params : ")
        pprint(room_params)

        print("speaker pos : ")
        print(speaker_pos)

        # create new room dir
        room_subdir = os.path.join(data_path, room.id)
        os.mkdir(room_subdir)
        rir_dir = os.path.join(room_subdir, "rir")
        os.mkdir(rir_dir)
        shutil.copy(
            os.path.join(original_room_subdir, "config.json"),
            os.path.join(room_subdir, "config.json"),
        )
        shutil.copytree(
            os.path.join(original_room_subdir, "background_noise"),
            os.path.join(room_subdir, "background_noise"),
        )

        for m, _mic in enumerate(mic_metadata):
            mic_subdir = os.path.join(rir_dir, "mic{}".format(m))
            os.mkdir(mic_subdir)

            mic_pos = _mic["mic_location"]
            print("    mic pos : {}".format(mic_pos))

            # room acoustic properties: registry materials if frequency-dependent,
            # otherwise the T60 estimated from the measured impulse response
            if freq_dep:
                materials = room_materials_registry[_id]
            else:
                materials = _mic["t60_estimate"]

            if isinstance(materials, list):
                # unique RT60 per mic-speaker pair
                mic_responses = []
                for spk_idx, rt60 in enumerate(materials):
                    resp, sample_rate = compute_room_irs(
                        room_dim=dimensions,
                        room_properties=rt60,
                        mic_pos=mic_pos,
                        source_pos=[speaker_pos[spk_idx]],
                        ism_order=ism_order,
                        ray_tracing_param=ray_tracing_param,
                        ray_tracing=ray_tracing,
                        air_absorption=air_abs,
                        temperature=temperature,
                        scattering=scattering,
                        software=software,
                    )
                    mic_responses += resp
            else:
                # compute RIR(s)
                mic_responses, sample_rate = compute_room_irs(
                    room_dim=dimensions,
                    room_properties=materials,
                    mic_pos=mic_pos,
                    source_pos=speaker_pos,
                    ism_order=ism_order,
                    ray_tracing_param=ray_tracing_param,
                    ray_tracing=ray_tracing,
                    air_absorption=air_abs,
                    temperature=temperature,
                    scattering=scattering,
                    software=software,
                )

            # write RIRs
            for n, _rir in enumerate(mic_responses):
                sf.write(
                    os.path.join(mic_subdir, "{}.wav".format(n)),
                    _rir,
                    sample_rate,
                )

    # write bundle metadata
    dataset_metadata = metadata.copy()
    dataset_metadata["dataset_id"] = dataset_id
    dataset_metadata["sim_type"] = sim_type
    output_json = os.path.join(dataset_id, "dataset_metadata.json")
    with open(output_json, "w") as f:
        json.dump(dataset_metadata, f, indent=4)

    # copy data index from measured dataset
    shutil.copy(
        src=os.path.join(original_dataset, data_index_file),
        dst=os.path.join(dataset_id, data_index_file),
    )
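A hedged invocation sketch (the dataset path is a placeholder and must point at a measured dataset containing the dataset_metadata.json, data/ and data_index.json layout the function reads above):

# Hypothetical call: re-simulate a measured bundle with a 17th-order image-source
# model and air absorption, using pyroomacoustics.
simulate_measured_bundle(
    "measured_room_dataset_real_test_10rooms_20210101",  # placeholder path
    air_abs=True,
    ism_order=17,
    ray_tracing=False,
    freq_dep=False,
    software=RoomSimSoftware.PYROOMACOUSTICS,
)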
Example #14
    def has_reprocessed_content(self, date_string, batch_time_string):
        batch_dir = os.path.join(self.directory, date_string, batch_time_string)
        if os.path.exists(batch_dir):
            return len([i for i in utils.get_subdirectories(batch_dir) if i.startswith(REPROCESSED_DIR_PREFIX)]) > 0
        else:
            raise NonExistentBatchError(self.name, date_string, batch_time_string)
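A hedged usage sketch combining this check with Example #12 (the provider object and the date/batch strings are placeholders):

# Hypothetical usage: only read reprocessed articles for batches that have any.
if provider.has_reprocessed_content("2011-05-01", "13.00.00"):
    reprocessed = provider.get_reprocessed_batch_articles("2011-05-01", "13.00.00")
    for (re_date, re_time), articles in reprocessed:
        print(re_date, re_time, len(articles))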
Example #15
def download_collate_to_vcf_kinship(
    sample_lists_to_download_directory, out_directory,
    reference_vcf_path, reference_snp_pickle, chromosomes,
    vcf_from_plex_bin, bcftools_bin, baton_bin, baton_metaquery_bin,
    baton_get_bin, akt, info_include_path, nsnps=10, minkin=0.4,
    irods_credentials_path=None, n_max_processes=25, plink_bin=None,
    pipeline_entry_name='download'):
    """
    Args:
        sample_lists_to_download_directory (str): Directory of headerless
            text files listing the Sanger sample IDs to process.
        out_directory (str): Directory where all results will be saved.
            Created if it does not already exist.
        irods_credentials_path (str, optional): Path to a text file
            containing the user's irods password. If not supplied, the
            user will be prompted.
        reference_vcf_path (str): Path to the VCF which will be merged
            against.
        vcf_from_plex_bin (str): Path to the vcf_from_plex executable.
        bcftools_bin (str): Path to the bcftools executable.
        pipeline_entry_name (str): The step at which the pipeline starts;
            one of:
            - download
            - makehandprints
            - plex2vcf
            - check_sampleids
            - kinship
            - analysis

    Returns:
        Creates the following directory structure upon completion:
            out_directory/
                execution_arguments.txt
                fingerprints/sequenom: downloaded CSV files
                fingerprints/fluidigm: downloaded CSV files
                handprints/sequenom/handprint*/
                    sampleid*.csv
                    snpset.tsv
                handprints/fluidigm/handprint*/
                    sampleid*.csv
                    snpset.tsv
                logs/vcf_from_plex_fluidigm_handprint1.o|e
                vcfs/
                    sequenom/
                        handprint*.vcf.gz
                    fluidigm/
                        handprint*.vcf.gz
                    handprints.vcf.gz(.csi)
                    subsetted_reference.vcf.gz(.csi)
                    all_merged.vcf.gz(.csi)
                kinship/kinship_results.csv
                kinship/kinship_results.pickle
    """


    """
    TODO
    1. Determine how much the handprints stuff actually helps. How many
       times does one individual have multiple fingerprints?
    2. Plot
    3. Prune on LD in reference vcf
    """

    utils.PLINK_BIN = plink_bin
    utils.BCFTOOLS_BIN = bcftools_bin

    PIPELINE_STEPS = {
        'download': 0,
        'makehandprints': 1,
        'plex2vcf': 2,
        'check_sampleids': 3,
        'kinship': 4,
        'analysis': 5,
    }

    assert pipeline_entry_name in PIPELINE_STEPS, \
        'Please specify a valid entry point.'

    pipeline_entry = PIPELINE_STEPS.get(pipeline_entry_name, 0)

    print(f'Starting at {pipeline_entry_name} in pipeline.')

    fingerprints_directory = os.path.join(out_directory, 'fingerprints')

    if pipeline_entry <= PIPELINE_STEPS['download']:
        # Make sure directory doesn't already exist - we want to start from
        # fresh.
        # if os.path.exists(out_directory):
        #     print(f'Error: {out_directory} already exists. Please provide '
        #         'a path to nonexistent directory to deposit results.')
        #     sys.exit(-1)

        # Download raw fingerprints from iRODS
        os.makedirs(out_directory, exist_ok=True)
        os.makedirs(fingerprints_directory, exist_ok=True)

        sample_list_irods_db_path = os.path.join(out_directory, 'sample_list_irods.json')

        digest_sample_lists_directory(
            sample_lists_to_download_directory, sample_list_irods_db_path,
            fingerprints_directory, baton_bin,
            baton_metaquery_bin, baton_get_bin, irods_credentials_path,
            n_max_processes)
        # for fingerprint_method in utils.FINGERPRINT_METHODS:
        #     # Create directory for method
        #     fingerprint_method_directory = os.path.join(
        #         fingerprint_directory,fingerprint_method)
        #     os.makedirs((fingerprint_method_directory, exist_ok=True)
        #     download_fingerprints(sample_list_path,
        #         fingerprints_directory, fingerprint_method,
        #         baton_bin, baton_metaquery_bin, baton_get_bin,
        #         n_max_processes)

    handprints_directory = os.path.join(out_directory, 'handprints')
    # Generate handprints from downloaded
    if pipeline_entry <= PIPELINE_STEPS['makehandprints']:
        shutil.rmtree(handprints_directory, ignore_errors=True)
        os.mkdir(handprints_directory)
        for fingerprint_method in utils.FINGERPRINT_METHODS:
            fingerprint_method_directory = os.path.join(
                fingerprints_directory,
                fingerprint_method)
            handprint_method_directory = os.path.join(
                handprints_directory,
                fingerprint_method)
            if fingerprint_method == 'fluidigm':
                fluidigm.generate(fingerprint_method_directory,
                    handprint_method_directory, reference_snp_pickle)
            elif fingerprint_method == 'sequenom':
                sequenom.generate(fingerprint_method_directory,
                    handprint_method_directory, reference_snp_pickle)

    vcf_directory = os.path.join(out_directory, 'vcfs')
    print(vcf_directory)
    vcf_merged_path = os.path.join(vcf_directory, 'all_merged.vcf.gz')
    subsetted_reference_vcf = os.path.join(vcf_directory,
        'subsetted_reference.vcf.gz')
    vcf_merged_handprints_path = os.path.join(vcf_directory,
        'handprints.vcf.gz')
    
    # Convert handprints to VCFs
    if pipeline_entry <= PIPELINE_STEPS['plex2vcf']:
        shutil.rmtree(vcf_directory, ignore_errors=True)
        os.mkdir(vcf_directory)
        # log_directory = os.path.join(out_directory, 'logs')
        # os.mkdir(log_directory)
        for fingerprint_method in utils.FINGERPRINT_METHODS:
            handprint_method_directory = os.path.join(handprints_directory,
                fingerprint_method)
            handprints = utils.get_subdirectories(
                handprint_method_directory)

            n_handprints = len(handprints)
            if not n_handprints:
                print(f'No handprints generated for {fingerprint_method}')
                continue

            vcf_out_dir = os.path.join(vcf_directory, fingerprint_method)
            if not os.path.exists(vcf_out_dir):
                os.makedirs(vcf_out_dir)

            for handprint in handprints:
                filelist = os.path.join(handprint, 'filelist.txt')

                snpset = os.path.join(handprint, 'snpset.tsv')
                handprint = os.path.normpath(handprint)
                handprint = os.path.basename(handprint)
                vcf_filename = f'{handprint}.vcf'
                vcf_out = os.path.join(vcf_out_dir, vcf_filename)
                print(vcf_out)

                # # Save output of vcf_from_plex to log files
                # _, this_handprint_directory = os.path.split(handprint)
                # base = f'{fingerprint_method}_{this_handprint_directory}'
                # stdout = os.path.join(log_directory, base + '.o')
                # stderr = os.path.join(log_directory, base + '.e')

                utils.run_vcf_from_plex(vcf_from_plex_bin, filelist,
                    chromosomes, fingerprint_method, snpset, vcf_out)
                utils.convert_sampleids_to_lowercase_vcf(vcf_out)

        # Removing this because it is difficult to restart:
        #
        # if pipeline_entry <= PIPELINE_STEPS['mergevcfs']:
        #

        # Merge all VCFs
        handprint_vcfs = glob(os.path.join(vcf_directory, '*', '*'))
        print('Handprint VCFs found:')
        print('\n'.join(handprint_vcfs))
        n_handprint_vcfs = len(handprint_vcfs)
        if not n_handprint_vcfs:
            print('Error: No handprint VCFs found to merge.')
            sys.exit(-1)

        print(f'Found {n_handprint_vcfs} handprint VCFs to merge')

        subsetted_reference_vcf = os.path.join(vcf_directory,
            'subsetted_reference.vcf.gz')
        # Subset reference on only those SNPs seen in all of the
        # handprint_vcfs
        utils.subset_reference_vcfs_on_handprint_snps(handprint_vcfs,
            reference_snp_pickle, reference_vcf_path,
            subsetted_reference_vcf)
        utils.index_vcf(subsetted_reference_vcf)

        # utils.concat_vcfs(subsetted_referenence_vcfs, referenence_vcf)


        # Remove samples in handprints already present in reference - these
        # will be much higher quality in the imputed reference
        handprint_vcfs = utils.filter_duplicate_individuals(
            subsetted_reference_vcf, handprint_vcfs)
        handprint_vcfs = utils.gzip_vcfs(handprint_vcfs)
        utils.index_vcfs(handprint_vcfs)

        # Create a VCF of all handprints to print MAF/LD/missing stats
        utils.merge_vcfs(handprint_vcfs, vcf_merged_handprints_path)
        utils.index_vcf(vcf_merged_handprints_path)

        vcfs_to_merge = [
            subsetted_reference_vcf,
            vcf_merged_handprints_path
        ]

        # Merge all VCFs
        utils.merge_vcfs(vcfs_to_merge, vcf_merged_path)
        utils.index_vcf(vcf_merged_path)

        # Prune subsetted_reference_vcf by LD - don't trust the
        # fingerprints because of the (most likely) high missingness.
        prune_exclude_path = utils.get_LD_prune_list(
            subsetted_reference_vcf)
        print('Pruning the following:')
        with open(prune_exclude_path) as f:
            print(f.readlines())
        utils.prune_by_rsid(vcf_merged_path, prune_exclude_path)
        # Post-pruning will still have the same filename
        utils.index_vcf(vcf_merged_path)

        # Generate stats
        vcfs = vcfs_to_merge + [vcf_merged_path]

        for vcf in vcfs:
            i = vcf.rfind('.vcf')
            out = vcf[:i]
            generate_missingness_stats(vcf, plink_bin, out)
            generate_LD_stats(vcf, plink_bin, out)
            generate_MAF_stats(vcf, plink_bin, out)

        # Filter out low MAF variants
        i = vcf_merged_path.rfind('.vcf')
        root = vcf_merged_path[:i]
        frq_path = f'{root}.frq'
        maf_out = f'{root}.maf.out'
        utils.get_MAF_exclude_list(frq_path, maf_out)
        utils.prune_by_rsid(vcf_merged_path, maf_out)
        utils.index_vcf(vcf_merged_path)


        # Filter out low INFO variants
        utils.include_by_region_file(vcf_merged_path, info_include_path)
        utils.index_vcf(vcf_merged_path)

    if pipeline_entry <= PIPELINE_STEPS['check_sampleids']:
        print('Checking number of the Sanger sample IDs present in the final merged VCF.')
        sample_ids_for_lookup = get_all_sangerids_from_sample_lists_directory(sample_lists_to_download_directory)
        sample_ids_for_lookup = {sampleid.lower() for sampleid in sample_ids_for_lookup if sampleid}
        n_sample_ids_for_lookup = len(sample_ids_for_lookup)
        sample_ids_from_vcf = utils.get_sample_ids_from_vcf(vcf_merged_path)
        included = sample_ids_from_vcf & sample_ids_for_lookup
        n_included = len(included)
        not_included = sample_ids_for_lookup - included
        n_not_included = len(not_included)
        print(f'...N included {n_included} / {n_sample_ids_for_lookup}')
        print(f'...N not included {n_not_included} / {n_sample_ids_for_lookup}')
        if n_not_included:
            s = ','.join(not_included)
            print(f'......{s}')


    # Run akt
    kinship_directory = os.path.join(out_directory, 'kinship')
    if pipeline_entry <= PIPELINE_STEPS['kinship']:
        shutil.rmtree(kinship_directory, ignore_errors=True)
        os.makedirs(kinship_directory, exist_ok=True)
        kinship_results = os.path.join(kinship_directory, f'kinship_results_minkin_{minkin}_nsnps_{nsnps}.csv')
        command = f'{akt} kin --method 0 {vcf_merged_path} --force --minkin {minkin} |'
        command += f'awk \'{{OFS=","}} {{ if ($7 >= {nsnps}) print $1, $2, $6, $7; }}\' > {kinship_results}'
        utils.run(command)
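For reference, the entry-point gating above works by mapping the requested step name to an integer and executing every stage whose index is greater than or equal to that entry index. A minimal standalone sketch of the pattern (the stage bodies are placeholders, not the pipeline's real work):

PIPELINE_STEPS = {
    'download': 0,
    'makehandprints': 1,
    'plex2vcf': 2,
    'check_sampleids': 3,
    'kinship': 4,
    'analysis': 5,
}

def run_pipeline(pipeline_entry_name='download'):
    pipeline_entry = PIPELINE_STEPS[pipeline_entry_name]
    if pipeline_entry <= PIPELINE_STEPS['download']:
        print('downloading fingerprints')        # placeholder stage body
    if pipeline_entry <= PIPELINE_STEPS['makehandprints']:
        print('generating handprints')           # placeholder stage body
    if pipeline_entry <= PIPELINE_STEPS['plex2vcf']:
        print('converting handprints to VCFs')   # placeholder stage body
    # ... remaining stages follow the same pattern

run_pipeline('plex2vcf')   # skips download and handprint generation, runs the rest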