Пример #1
0
 def generate_from_directory(
         directory: str,
         regex: str = DEFAULT_REGEX,
         reads_extension: str = DEFAULT_READS_EXTENSION):
     """Generate a sample data array from the reads files of a directory.

     The files are looked up with ``Utilities.find_file_by_tail``, grouped
     with ``Utilities.get_most_similar_word_pairs`` and the grouped result
     is handed over to ``SampleDataArray.generate``.

     :param directory: Directory passed to ``Utilities.find_file_by_tail``.
     :param regex: Forwarded to ``SampleDataArray.generate`` as ``regex``.
     :param reads_extension: Tail used for the file lookup; also forwarded
         to ``SampleDataArray.generate`` as ``extension``.
     :return: Whatever ``SampleDataArray.generate`` returns.
     """
     # NOTE(review): presumably the lookup yields every reads file whose name
     # ends with `reads_extension` -- confirm against `Utilities`.
     reads_files = Utilities.find_file_by_tail(directory, reads_extension)
     paired_reads_files = Utilities.get_most_similar_word_pairs(reads_files)
     return SampleDataArray.generate(
         paired_reads_files, regex=regex, extension=reads_extension)
Пример #2
0
import os
from shutil import copy2

import pandas as pd

from meta.scripts.Utilities import Utilities
from vradchenko.lactobacillus_salivarius.ProjectDescriber import ProjectDescriber

# Scan the project's raw data directory and keep only the entries whose
# names end with "_001.fastq.gz"
raw_reads_files_dir = ProjectDescriber.RAW_DATA_DIR
raw_reads_files_list = list(
    filter(lambda reads_file: reads_file.endswith("_001.fastq.gz"),
           Utilities.scan_whole_dir(raw_reads_files_dir)))

# Strand tags; the code below looks for "_R1_" / "_R2_" in the file names
# to tell the two files of a pair apart
STRANDS = ("R1", "R2")
# NOTE(review): presumably collects one record per sample further down --
# the code that fills it is not fully visible here
raw_reads_list = []
for raw_reads_files_pair in Utilities.get_most_similar_word_pairs(
        raw_reads_files_list):
    # Illumina file names have template '[sample]_[sequence]_[lane]_[strand]_[number].fastq.gz'
    # E.g: '336g_S1_L001_R1_001.fastq.gz'
    sample_name = Utilities.safe_findall(
        "(.+)_S[0-9]+_L[0-9]+_R[0-9]+_[0-9]+",
        os.path.basename(raw_reads_files_pair[0]))
    raw_reads_dict = dict(sample_name=sample_name)
    for raw_reads_file in raw_reads_files_pair:
        for reads_strand in STRANDS:
            if "_{}_".format(reads_strand) in os.path.splitext(
                    os.path.basename(raw_reads_file))[0]:
                raw_reads_dict[reads_strand] = raw_reads_file
    if all([
            raw_reads_dict.get(STRANDS[0]).replace("_{}_".format(
                STRANDS[0]), "_{}_".format(STRANDS[-1])) == raw_reads_dict.get(
                    STRANDS[-1])