Example #1
import logging
from pathlib import Path

import click

# CGConfig, DemultiplexingAPI, TrailblazerAPI, Flowcell, FlowcellError and
# DeleteDemuxAPI are application classes imported from the surrounding
# project (imports not shown in the original snippet).

LOG = logging.getLogger(__name__)


def demultiplex_all(context: CGConfig, bcl_converter: str,
                    flowcells_directory: click.Path, dry_run: bool):
    """Demultiplex all flowcells that are ready under the flowcells_directory"""
    LOG.info("Running cg demultiplex all, using %s.", bcl_converter)
    # click passes the path as a string; fall back to the configured run dir
    flowcells_directory: Path = (
        Path(str(flowcells_directory))
        if flowcells_directory
        else Path(context.demultiplex.run_dir)
    )
    demultiplex_api: DemultiplexingAPI = context.demultiplex_api
    demultiplex_api.set_dry_run(dry_run=dry_run)
    tb_api: TrailblazerAPI = context.trailblazer_api
    LOG.info("Search for flowcells ready to demultiplex in %s",
             flowcells_directory)
    for sub_dir in flowcells_directory.iterdir():
        if not sub_dir.is_dir():
            continue
        LOG.info("Found directory %s", sub_dir)
        try:
            flowcell_obj = Flowcell(flowcell_path=sub_dir,
                                    bcl_converter=bcl_converter)
        except FlowcellError:
            # Skip directories that cannot be parsed as a flowcell run
            continue

        # In dry-run mode, keep going even when demultiplexing is not
        # possible, so the whole flow can be exercised without side effects
        if not demultiplex_api.is_demultiplexing_possible(
                flowcell=flowcell_obj) and not dry_run:
            continue

        if not flowcell_obj.validate_sample_sheet():
            LOG.warning(
                "Malformed sample sheet. Run cg demultiplex samplesheet validate %s",
                flowcell_obj.sample_sheet_path,
            )
            continue

        # Clear leftovers from any previous demultiplexing of this flowcell;
        # sub_dir already holds the full run path yielded by iterdir(), so
        # prefixing it with flowcells_directory again would duplicate the path
        delete_demux_api: DeleteDemuxAPI = DeleteDemuxAPI(
            config=context,
            demultiplex_base=demultiplex_api.out_dir,
            dry_run=dry_run,
            run_path=sub_dir,
        )

        delete_demux_api.delete_flow_cell(
            cg_stats=False,
            demultiplexing_dir=True,
            run_dir=False,
            housekeeper=True,
            init_files=False,
            status_db=False,
        )

        # Submit the demultiplexing job to SLURM and register it in Trailblazer
        slurm_job_id: int = demultiplex_api.start_demultiplexing(
            flowcell=flowcell_obj)
        demultiplex_api.add_to_trailblazer(tb_api=tb_api,
                                           slurm_job_id=slurm_job_id,
                                           flowcell=flowcell_obj)
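
A function with this signature is typically wired up as a click command. The sketch below shows one plausible wiring; the command and option names are assumptions derived from the parameters above, not taken from the original snippet.

import click

# Hypothetical wrapper: option names mirror the parameters of
# demultiplex_all and are not confirmed by the original snippet.
@click.command(name="all")
@click.option("--bcl-converter", default="bcl2fastq",
              help="BCL conversion tool (assumed default)")
@click.option("--flowcells-directory",
              type=click.Path(exists=True, file_okay=False),
              help="Override the configured flowcell run directory")
@click.option("--dry-run", is_flag=True,
              help="Log intended actions without executing them")
@click.pass_obj
def demultiplex_all_command(context, bcl_converter, flowcells_directory, dry_run):
    demultiplex_all(context=context, bcl_converter=bcl_converter,
                    flowcells_directory=flowcells_directory, dry_run=dry_run)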
Example #2
from pathlib import Path
from typing import Optional

import click
import numpy as np
import pandas as pd

# get_file_name is a helper from the surrounding module (not shown here).


def convert(experiment_path: click.Path, group_name: str, world_name: str,
            output_path: Optional[click.Path], with_date: bool) -> None:
    experiment_path = Path(str(experiment_path))
    experiment_id = experiment_path.name
    # click passes paths as strings; normalize the optional output directory
    # too, otherwise the later `/` join would fail on a plain string
    output_base_path = (experiment_path if output_path is None
                        else Path(str(output_path)))

    dataframes = []
    for run_path in experiment_path.iterdir():
        if run_path.is_dir():
            run_id = run_path.name
            # measures2.txt is in long format (one row per generation,
            # genome and measure); pivot it to one column per measure,
            # keeping the last recorded value when duplicates occur
            df = pd.read_csv(run_path / 'measures2.txt', sep=' ')
            df.rename(columns={'genome': 'genome_id'}, inplace=True)
            df = pd.pivot_table(df,
                                values='value',
                                index=['generation', 'genome_id'],
                                columns=['measures'],
                                aggfunc='last')

            df_history = pd.read_csv(run_path / 'history.txt', sep=' ')
            df_history.rename(columns={'idgenome': 'genome_id',
                                       'idparent1': 'parent1_id',
                                       'idparent2': 'parent2_id'},
                              inplace=True)
            df_history.columns.rename('measures', inplace=True)
            df_history.set_index(['generation', 'genome_id'], inplace=True)
            # drop the last column, likely an empty artifact of a trailing
            # separator in the file
            df_history.drop(df_history.columns[-1], axis=1, inplace=True)
            # 'N' marks a missing parent; convert to NaN so the ids are numeric
            for c in ['parent1_id', 'parent2_id']:
                df_history[c] = df_history[c].replace('N', np.nan).astype(float)

            # attach lineage to the measures and tag rows with their run id
            df = df.join(df_history, how='inner')
            df = pd.concat([df], keys=[run_id], names=['run_id'])
            dataframes.append(df)

    df = pd.concat(dataframes)
    n_rows = len(df)
    n_files = len(dataframes)
    print(f'Found {n_rows} rows in {n_files} runs '
          f'({n_rows // n_files} rows per run on average)')

    output_file_path = output_base_path / get_file_name(
        group_name, world_name, experiment_id, with_date)
    # writing HDF5 requires the PyTables package ('tables') to be installed
    df.to_hdf(output_file_path, key='descriptors')
    print(f'Wrote hdf5 file to {output_file_path}')
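
A minimal sketch of reading the converted file back, assuming the 'descriptors' key used above; the file name and run id are illustrative.

import pandas as pd

# Load the experiment into a DataFrame indexed by
# (run_id, generation, genome_id); reading also requires PyTables.
df = pd.read_hdf('experiment1.h5', key='descriptors')

# Example query (assumes a run directory named 'run_1' existed):
# average of every measure per generation within one run
per_generation = df.loc['run_1'].groupby(level='generation').mean()
print(per_generation.head())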