def sample_sheet(sspath):
    Sheet = SampleSheet(sspath)
    data = {}
    for sample in Sheet.samples:
        sample_name = sample['Sample_Name']
        sample_description = sample['Description']

        fields = sample_description.split("_")
        if len(fields) != 15 or fields[-1] == 'null':
            sample_criteria = 7  # 'Information saknas' (information missing)
        else:
            sample_criteria = int(fields[-1])

        # Skip controls and Eurofins samples
        if sample_name.startswith(('PosCtrl', 'PosKon', 'NegCtrl', 'NegKon')):
            continue

        #Skip samples set to runType = 01 (desc. field 3)
        #These are samples which have been re-sequenced, so they have already been uploaded to GENSAM.
        runtype = fields[2]
        if runtype == '01':
            continue
        else:
            # Populate a dictionary with samples and their selection criterion
            sample_id = sample_name.split('_')[0]
            data[sample_id] = sample_criteria

    return data
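A minimal usage sketch (the path and Description values below are made up, based on the parsing above): the function expects a 15-field, underscore-delimited Description whose last field is the numeric selection criterion.

# Hypothetical call; assumes Description values with 15 underscore-separated
# fields ending in the criterion, e.g. "REF123_210101_02_..._5".
criteria = sample_sheet("/path/to/SampleSheet.csv")
# criteria might look like {'SAMPLE1': 5, 'SAMPLE2': 7},
# where 7 means "information missing"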
Example 2
def sample_sheet_criteria(path, inhouse_dict, week, run_name):
    Sheet = SampleSheet(path)
    data = {}
    data['general'] = 0
    data['selection'] = 0
    data['unknown'] = 0
    for sample in Sheet.samples:
        sample_name = sample['Sample_ID']
        description = sample['Description']

        # Skip controls or if not in dict
        if sample_name.lower().startswith(('neg', 'pos')):
            continue
        elif sample_name not in inhouse_dict[week][run_name]:
            continue
        else:
            #Check if the last element is the selection criterion. This is a pretty weak check...
            criterion = description.split("_")[-1]
            if criterion.isdigit() and int(criterion) <= 10:
                value = int(criterion)
                if value in (1, 8, 9):
                    data['general'] += 1
                elif value == 7:
                    data['unknown'] += 1
                else:
                    data['selection'] += 1
            else:
                data['unknown'] += 1

    return data
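A sketch of the nesting this function assumes for inhouse_dict, inferred from the inhouse_dict[week][run_name] lookup above (all names and values are hypothetical):

inhouse = {'2021-W07': {'run42': {'SAMPLE1': 5, 'SAMPLE2': 1}}}
counts = sample_sheet_criteria("/path/to/SampleSheet.csv", inhouse,
                               week='2021-W07', run_name='run42')
# counts -> {'general': ..., 'selection': ..., 'unknown': ...}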
Example 3
def writeSammpleSheets(sample_list, sheet_path, template_sheet):
    samplesheet_name = os.path.basename(sheet_path)
    samplesheet_dir = os.path.dirname(os.path.realpath(sheet_path))
    count = 0
    exit_status = "success"
    for key in sample_list:
        count += 1
        logger.debug(
            f"{len(sample_list[key])} samples with idx lengths {key[1]}/{key[2]} for {key[0]} dataset"
        )

        new_sample_sheet = SampleSheet()
        new_sample_sheet.Header = template_sheet.Header
        new_sample_sheet.Reads = template_sheet.Reads
        new_sample_sheet.Settings = template_sheet.Settings
        for sample in sample_list[key]:
            new_sample_sheet.add_sample(sample)

        new_sample_sheet_file = os.path.join(
            samplesheet_dir,
            samplesheet_name + ".custom." + str(count) + "." + key[0])
        logger.info(f"Creating custom sample sheet: {new_sample_sheet_file}")
        try:
            with open(new_sample_sheet_file, "w") as ss_writer:
                new_sample_sheet.write(ss_writer)
        except Exception as error:
            logger.error(f"Exception writing new sample sheet: {error}")
            exit_status = "failure"

        logger.debug(f"Created custom sample sheet: {new_sample_sheet_file}")

    return exit_status
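A usage sketch, assuming the key structure implied by the key[0]/key[1]/key[2] accesses above: sample_list is keyed by (dataset, index-length, index2-length) tuples. Paths and the sample filter are hypothetical.

template = SampleSheet("/runs/210101_A00123/SampleSheet.csv")
# Hypothetical grouping: all samples with an 8 bp first index
truseq = [s for s in template.samples if s.index and len(s.index) == 8]
status = writeSammpleSheets(sample_list={('truseq', 8, 8): truseq},
                            sheet_path="/runs/210101_A00123/SampleSheet.csv",
                            template_sheet=template)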
Example 4
def main(samplesheet_file_path, check_only):
    logger.info(f"Checking SampleSheet {samplesheet_file_path}")
    original_sample_sheet = SampleSheet(samplesheet_file_path)

    # Run some consistency checks
    #import_library_sheet_from_google('2019')
    # TODO: replace has_error return with enum and expand to error, warning, info?
    #has_header_error = checkSampleSheetMetadata(original_sample_sheet)
    #has_id_error = checkSampleAndLibraryIdFormat(original_sample_sheet)
    #has_index_error = checkSampleSheetForIndexClashes(original_sample_sheet)
    #has_metadata_error = checkMetadataCorrespondence(original_sample_sheet)
    # Only fail on metadata or id errors
    #if has_header_error or has_id_error or has_index_error or has_metadata_error:
    #    raise ValueError(f"Validation detected errors. Please review the error logs!")

    # Split and write individual SampleSheets, based on indexes and technology (10X)
    if not check_only:
        # Sort samples based on technology (truseq/10X and/or index length)
        # Also replace N indexes with ""
        sorted_samples = getSortedSamples(original_sample_sheet)

        # Now that the samples have been sorted, we can write one or more custom sample sheets
        # (which may be the same as the original if no processing was necessary)
        logger.info(f"Writing {len(sorted_samples)} sample sheets.")
        writeSammpleSheets(sample_list=sorted_samples,
                           sheet_path=samplesheet_file_path,
                           template_sheet=original_sample_sheet)

    logger.info("All done.")
Example 5
def sample_sheet(path, run):
    Sheet = SampleSheet(path)
    data = {}
    for sample in Sheet.samples:
        sample_name = sample['Sample_ID']
        description = sample['Description']

        # Skip controls
        if sample_name.startswith(('NegCtrl', 'PosCtrl', 'PosKon', 'NegKon')):
            continue
        else:
            # Split the Description once and map its first eight fields
            fields = description.split("_")
            data[sample_name] = [{
                'referensnummer': fields[0],
                'date': fields[1],
                'runtype': fields[2],
                'age': fields[3],
                'gender': fields[4],
                'lab_reference': fields[5],
                'postalcode': fields[6],
                'ct_value': fields[7]
            }]

    with open(f"/medstore/results/clinical/SARS-CoV-2-typing/nextseq_data/{run}/metadata/{run}_metadata.json", 'w') as outfile:
        json.dump(data, outfile, indent=4)
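A sketch of the Description layout this parser assumes, with made-up values:

# Hypothetical Description: "REF123_210101_02_54_M_LAB1_41234_23" maps to
# referensnummer, date, runtype, age, gender, lab_reference, postalcode, ct_value.
sample_sheet("/path/to/SampleSheet.csv", run="210101_NB501234")
# writes .../nextseq_data/210101_NB501234/metadata/210101_NB501234_metadata.json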
Example 6
def parse_samplesheet(args):
    ss = SampleSheet(args.input)
    df = pd.DataFrame([s.to_json() for s in ss.samples])
    df["is_umi"] = args.is_umi
    df["fwd_adapter"] = args.fwd_adapter
    df["rev_adapter"] = args.rev_adapter
    if args.merge_lanes:
        log.info("Merging samples across all lanes!")
        df["Lane"] = "all"
    else:
        if "Lane" not in df:
            log.error("No lanes specified in SampleSheet.csv; use --merge-lanes or update sample sheet.")
            sys.exit(1)

    df["Sample_Project"] = args.project_name
    df["library_type"] = args.library_type
    
    if "Sample_ID" in df:
        df["Sample_Name"] = df.Sample_ID
    elif "Sample_Name" in df:
        df["Sample_ID"] = df.Sample_Name
    else:
        log.error("Samplesheet must specify Sample_ID or Sample_Name!")
        sys.exit(1)

    df = df.drop_duplicates() # needed in case merge_lanes is true, which can result in duplicates
    return df
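A sketch of the argparse namespace the function reads from; the attribute names come from the body above, the values are hypothetical:

import argparse

args = argparse.Namespace(input="SampleSheet.csv", is_umi=False,
                          fwd_adapter="AGATCGGAAGAGC",
                          rev_adapter="AGATCGGAAGAGC", merge_lanes=True,
                          project_name="ProjectX", library_type="WGS")
df = parse_samplesheet(args)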
Example 7
def create_investigators(samplesheet_path):
    samplesheet_data = SampleSheet(samplesheet_path)
    header_info = samplesheet_data.Header
    investigator_initials_list = header_info['Investigator Name'].split(";")
    investigator_dict = {}
    for investigator_initials in investigator_initials_list:
        investigator_dict[investigator_initials] = Investigator(
            investigator_initials)

    return investigator_dict
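For illustration, with a hypothetical header value: an "Investigator Name" of "AB;CD" yields one Investigator per set of initials.

# Assuming the sheet's Header contains: Investigator Name = "AB;CD"
investigators = create_investigators("/path/to/SampleSheet.csv")
# -> {'AB': Investigator('AB'), 'CD': Investigator('CD')}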
Example 8
    def __parse_sample_sheet(self, path_to_sample_sheet):
        parsed_data = {
            "header": {
                "experiment_name": None,
                "instrument_type": None,
                "investigator_name": None,
                "workflow": None,
                "chemistry": None,
            },
            "reads": [],
            "settings": {},
            "data": [],
        }

        try:
            sample_sheet = json.loads(
                SampleSheet(path_to_sample_sheet).to_json())
        except Exception as e:
            print(e)
            raise  # without a parsed sheet the lookups below would fail

        sample_sheet_keys_to_message_keys_sample_sheet_header = {
            'Experiment Name': 'experiment_name',
            'Instrument Type': 'instrument_type',
            'Investigator Name': 'investigator_name',
            'Workflow': 'workflow',
            'Chemistry': 'chemistry',
        }

        for sample_sheet_key, message_key in sample_sheet_keys_to_message_keys_sample_sheet_header.items(
        ):
            try:
                parsed_data['header'][message_key] = sample_sheet['Header'][
                    sample_sheet_key]
            except Exception as e:
                print(e)

        for read in sample_sheet['Reads']:
            parsed_data['reads'].append(read)

        for key, val in sample_sheet['Settings'].items():
            if key == 'ReverseComplement':
                key = 'reverse_complement'
            else:
                key = key.lower()
            parsed_data['settings'][key] = val

        for sample in sample_sheet['Data']:
            sample_to_append = {}
            for key, val in sample.items():
                sample_to_append[key.lower()] = val
            # Append once per sample, after all of its keys have been lowercased
            parsed_data['data'].append(sample_to_append)

        return parsed_data
Example 9
    def setUp(self):

        qc_config = {
            'name': 'UnidentifiedIndexHandler',
            'significance_threshold': 1,
            'white_listed_indexes': ['.*N.*', 'G{8,}']
        }
        self.unidentifiedIndexHandler = UnidentifiedIndexHandler(qc_config)

        conversion_results_key = "ConversionResults"
        conversion_results = get_stats_json()[conversion_results_key]
        samplesheet_key = "samplesheet"
        self.samplesheet = SampleSheet()
        sample_1 = Sample(
            dict(Lane=1,
                 Sample_ID='1823A',
                 Sample_Name='1823A-tissue',
                 index='AAAA'))
        sample_2 = Sample(
            dict(Lane=2,
                 Sample_ID='1823B',
                 Sample_Name='1823B-tissue',
                 index='TTTT'))
        sample_3 = Sample(
            dict(Lane=3,
                 Sample_ID='1823C',
                 Sample_Name='1823C-tissue',
                 index='AAAA',
                 index2='TTTT'))
        sample_4 = Sample(
            dict(Lane=4,
                 Sample_ID='1823D',
                 Sample_Name='1823D-tissue',
                 index='GGGG',
                 index2='CCCC'))
        sample_5 = Sample(
            dict(Lane=6,
                 Sample_ID='1823E',
                 Sample_Name='1823D-tissue',
                 index='ATCG'))
        self.samplesheet.add_sample(sample_1)
        self.samplesheet.add_sample(sample_2)
        self.samplesheet.add_sample(sample_3)
        self.samplesheet.add_sample(sample_4)
        self.samplesheet.add_sample(sample_5)

        self.unidentifiedIndexHandler.collect(
            (conversion_results_key, conversion_results))
        self.unidentifiedIndexHandler.collect(
            (samplesheet_key, self.samplesheet))

        self.samplesheet_searcher = _SamplesheetSearcher(self.samplesheet)
Example 10
def build_samplesheet(df, args):
    samplesheet = SampleSheet()
    for _, row in df.iterrows():
        s = {
            'Sample_ID': row.Sample_ID,
            'Sample_Name': row.Sample_Name,
            'Sample_Project': row.Sample_Project,
            # bracket access here: `row.index` would return the pandas row index
            'index': row["index"],
            'index2': row.index2
        }
        if not args.merge_lanes:
            s["Lane"] = row.Lane
        samplesheet.add_sample(Sample(s))
    return samplesheet
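build_samplesheet is effectively the inverse of parse_samplesheet above; a hypothetical round trip:

df = parse_samplesheet(args)  # args as in the earlier sketch
sheet = build_samplesheet(df, args)
with open("SampleSheet.custom.csv", "w") as handle:
    sheet.write(handle)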
Example 11
def make_sample_sheet(body: Mapping[str, Any],
                      adapter_result_type=None) -> SampleSheet:
    wfa = body[WORKFLOW_ACTIVITY]
    activity_id = wfa[ID]
    wf = wfa[WORKFLOW]
    samples = wf[SAMPLES]

    sample_sheet = SampleSheet()
    for sample in samples:
        sample_sheet.add_samples(
            sample_records(activity_id,
                           sample,
                           adapter_result_type=adapter_result_type))

    return sample_sheet
Example 12
def get_sample_sheet_info(sample_sheet_path, header_name, data_name):
    sample_sheet = SampleSheet(sample_sheet_path)
    ## Header section
    header = pd.DataFrame(list(sample_sheet.Header.values()),
                          index=list(sample_sheet.Header.keys())).transpose()
    for i in header_name:
        if i not in header.columns:
            header[i] = np.nan
    header = header.rename(columns={'Description': 'Header.Description'})

    ## Reads section
    reads = pd.DataFrame(sample_sheet.Reads, index=['Read1',
                                                    'Read2']).transpose()

    ## Settings section
    if len(sample_sheet.Settings) == 0:
        setting = pd.DataFrame({'Adapter': [np.nan]})
    else:
        setting = pd.DataFrame(list(sample_sheet.Settings.values()),
                               index=list(
                                   sample_sheet.Settings.keys())).transpose()
    setting = setting.rename(columns={'adapter': 'Adapter'})

    ## Data section
    run_name = '_'.join([
        sample_sheet_path.split('/')[3],
        sample_sheet_path.split('/')[4].split('_')[2]
    ])
    # DataFrame.append was removed in pandas 2.0; build the frame in one pass
    data = pd.DataFrame([dict(s) for s in sample_sheet.samples])

    for i in data_name:
        if i not in data.columns:
            data[i] = np.nan
    data['Run_barcode'] = run_name  # + '_' + data['Sample_ID'].astype(str)
    data = data.rename(columns={'Description': 'Data.Description'})

    ## Combine all sections
    data = pd.concat([data, header], axis=1)
    data = pd.concat([data, setting], axis=1)
    data = pd.concat([data, reads], axis=1)
    #     out_header = ['Date', 'Run_barcode', 'Sample_ID', 'Read1', 'Read2', "Adapter", 'Sample_Name', 'Data.Description', 'Index_Plate_Well', 'index', 'I7_Index_ID', 'index2', 'I5_Index_ID',
    #                   'Sample_Project', 'Sample_Plate', 'Sample_Well', 'Local Run Manager Analysis Id', 'IEMFileVersion', 'Experiment Name', 'Module', 'Workflow', 'Application', 'Instrument Type', 'Assay',
    #                   'Index Adapters', 'Library Prep Kit', 'Header.Description', 'Chemistry']
    data = data.fillna('-')
    return data
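A call sketch; header_name and data_name list columns that must appear in the output even when absent from the sheet, and the path needs enough components for the positional split above (everything here is made up):

info = get_sample_sheet_info(
    "/runs/clinical/2021/210101_NB501234_0001_AHXXX/SampleSheet.csv",
    header_name=['Experiment Name', 'Workflow'],
    data_name=['Sample_ID', 'index', 'index2'])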
Example 13
def main(samplesheet_file_path, check_only):
    logger.info(f"Checking SampleSheet {samplesheet_file_path}")
    original_sample_sheet = SampleSheet(samplesheet_file_path)

    # Run some consistency checks
    years = get_years_from_samplesheet(original_sample_sheet)
    logger.info(f"Samplesheet contains IDs from {len(years)} years: {years}")
    for year in years:
        library_tracking_spreadsheet[year] = get_library_sheet_from_google(
            year)
    import_library_sheet_validation_from_google()
    # TODO: replace has_error return with enum and expand to error, warning, info?
    has_header_error = checkSampleSheetMetadata(original_sample_sheet)
    has_id_error = checkSampleAndLibraryIdFormat(original_sample_sheet)
    has_index_error = checkSampleSheetForIndexClashes(original_sample_sheet)
    has_metadata_error = checkMetadataCorrespondence(original_sample_sheet)
    # Only fail on metadata or id errors
    if has_index_error:
        print(
            "Index errors detected. Note: the pipeline will ignore these, so please make sure to review them!"
        )
    if has_header_error or has_id_error or has_metadata_error:
        raise ValueError(
            "Pipeline breaking validation detected errors. Please review the error logs!"
        )

    # Split and write individual SampleSheets, based on indexes and technology (10X)
    if not check_only:
        # Sort samples based on technology (truseq/10X and/or index length)
        # Also replace N indexes with ""
        sorted_samples = getSortedSamples(original_sample_sheet)

        # Now that the samples have been sorted, we can write one or more custom sample sheets
        # (which may be the same as the original if no processing was necessary)
        logger.info(f"Writing {len(sorted_samples)} sample sheets.")
        writeSammpleSheets(sample_list=sorted_samples,
                           sheet_path=samplesheet_file_path,
                           template_sheet=original_sample_sheet)

    logger.info("All done.")
Example 14
                                                'SampleSheet.csv')
    else:
        logger.info(
            "Processing successful run. Using generated sample sheet(s).")
        samplesheet_path_pattern = os.path.join(runfolder_base_dir, runfolder,
                                                'SampleSheet.csv.custom.*')

    samplesheet_paths = glob(samplesheet_path_pattern)
    if len(samplesheet_paths) < 1:
        raise ValueError("No sample sheets found!")
    logger.info(f"Using {len(samplesheet_paths)} sample sheet(s).")

    for samplesheet in samplesheet_paths:
        logger.info(f"Processing samplesheet {samplesheet}")
        name, extension = os.path.splitext(samplesheet)
        samples = SampleSheet(samplesheet).samples
        logger.info(f"Found {len(samples)} samples.")
        for sample in samples:
            logger.debug(
                f"Looking up metadata with {sample.Sample_Name}; "
                + f"samplesheet.Sample_ID (UMCCR SampleID): {sample.Sample_ID} "
                + f"and samplesheet.Sample_Name (UMCCR LibraryID): {sample.Sample_Name}"
            )
            column_values = get_meta_data_by_library_id(sample.Sample_Name)

            fastq_pattern = os.path.join(bcl2fastq_base_dir, runfolder,
                                         sample.Sample_Project,
                                         sample.Sample_ID,
                                         sample.Sample_Name + "*.fastq.gz")
            s3_fastq_pattern = os.path.join(fastq_hpc_base_dir, runfolder,
                                            sample.Sample_Project,
def scan_data(sample_sheet_path):
    sample_sheet = SampleSheet(sample_sheet_path)
    data = dict(sample_sheet.samples[0]).keys()
    return list(data)
def scan_header(sample_sheet_path):
    sample_sheet = SampleSheet(sample_sheet_path)
    # The Header section is a mapping; its keys are the header column names
    return list(sample_sheet.Header.keys())
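Usage of the two scanners (hypothetical path); each returns the column names found in its section:

data_cols = scan_data("/path/to/SampleSheet.csv")      # e.g. ['Sample_ID', 'index', ...]
header_cols = scan_header("/path/to/SampleSheet.csv")  # e.g. ['IEMFileVersion', 'Date', ...]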