def __init__(self, charts_dir, deploy_prefix, nodes_number, threads_number, sampledata_file, refdata_file, output_mask, output_dir):
    """Prepare Kubernetes deployment charts and the template-substitution dict.

    :param charts_dir: directory to place the downloaded chart YAML files into
    :param deploy_prefix: prefix for queue/container/job names (sanitized to [A-Za-z0-9-])
    :param nodes_number: value for the ACTIVE_NODES_NUMBER template key
    :param threads_number: value for the THREADS_NUMBER template key
    :param sampledata_file: value for the SAMPLEDATA template key
    :param refdata_file: value for the REFDATA template key
    :param output_mask: value for the OUTPUT_MASK template key
    :param output_dir: pipeline output directory (slash-terminated into OUTPUT_DIR)
    """
    self.charts_directory = Utilities.ends_with_slash(charts_dir)
    # Raw string: '\-' in a non-raw literal is an invalid escape sequence
    # (SyntaxWarning since Python 3.12); collapse disallowed chars to '-'
    self.deploy_prefix = re.sub(r"[^A-Za-z0-9\-]+", "-", deploy_prefix)
    self.config_chart = Chart(
        file="{}config.yaml".format(self.charts_directory),
        # URL is not supported
        url="https://raw.githubusercontent.com/ivasilyev/biopipelines-docker/master/bwt_filtering_pipeline/templates/bwt-fp-only-coverage/config.yaml")
    # Keys are the placeholders substituted into the downloaded chart templates
    self.cfgDict = {
        "QUEUE_NAME": "{}-queue".format(self.deploy_prefix),
        "MASTER_CONTAINER_NAME": "{}-master".format(self.deploy_prefix),
        "JOB_NAME": "{}-job".format(self.deploy_prefix),
        "WORKER_CONTAINER_NAME": "{}-worker".format(self.deploy_prefix),
        "ACTIVE_NODES_NUMBER": nodes_number,
        "THREADS_NUMBER": threads_number,
        "SAMPLEDATA": sampledata_file,
        "REFDATA": refdata_file,
        "OUTPUT_MASK": output_mask,
        "OUTPUT_DIR": Utilities.ends_with_slash(output_dir)}
    self.master_chart = Chart(
        file="{}master.yaml".format(self.charts_directory),
        url="https://raw.githubusercontent.com/ivasilyev/biopipelines-docker/master/bwt_filtering_pipeline/templates/bwt-fp-only-coverage/master.yaml")
    self.worker_chart = Chart(
        file="{}worker.yaml".format(self.charts_directory),
        url="https://raw.githubusercontent.com/ivasilyev/biopipelines-docker/master/bwt_filtering_pipeline/templates/bwt-fp-only-coverage/worker.yaml")
def set_raw_pvals_dir(self, output_dir: str):
    """Derive and store the raw p-values and digest output directories.

    Both directories share the same <value column>/<group digest> suffix
    under the (slash-terminated) output directory.
    """
    self.output_dir = Utilities.ends_with_slash(output_dir)
    shared_suffix = "{}/{}/".format(self.pivot_value_col_name_abbreviation,
                                    self.groupdata_digest_name)
    self.raw_pvals_dir = self.output_dir + "pvals/" + shared_suffix
    self.digest_dir = self.output_dir + "digest/" + shared_suffix
def split(self, output_dir: str):
    """Export every sample column of the pivot table into its own TSV file.

    Each output file holds the index plus one value column (renamed to the
    shared value column name); created file paths are accumulated in
    ``self._sample_names_list``.
    """
    output_dir = Utilities.ends_with_slash(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    # Note: the dataframe must have only index and value columns
    for sample_col_name in self.pivot_df.columns:
        # Sample ID is the part of the file name before the first underscore
        sample_name = Utilities.filename_only(sample_col_name).split("_")[0]
        sample_file_name = "{}{}.tsv".format(output_dir, sample_name)
        single_sample_df = self.pivot_df[sample_col_name].reset_index()
        single_sample_df = single_sample_df.rename(
            columns={sample_col_name: self.value_col_name})
        single_sample_df.to_csv(sample_file_name, sep="\t", header=True, index=False)
        self._sample_names_list.append(sample_file_name)
def finalize_datasets(self, output_dir):
    """Concatenate, digest and dump the virulence and genera datasets.

    For each association a ``<group digest>_<association>_dataset.tsv`` file
    is written under ``<output_dir>/<group digest>/<association>/``.

    :param output_dir: base output directory (slash-terminated internally)
    """
    self.output_dir = Utilities.ends_with_slash(output_dir)
    self.virulence_dataset = self.digest_ds_values(self.concat_datasets(self.virulence_dss_list))
    self.genera_dataset = self.digest_ds_values(self.concat_datasets(self.genera_dss_list))
    # Deduplicated: the original repeated this block verbatim per association
    for association_name, dataset in (("virulence", self.virulence_dataset),
                                      ("genera", self.genera_dataset)):
        dataset_dir = "{a}{b}/{c}/".format(
            a=self.output_dir, b=self.groupdata_digest_name, c=association_name)
        dataset_file = "{a}{b}_{c}_dataset.tsv".format(
            a=dataset_dir, b=self.groupdata_digest_name, c=association_name)
        os.makedirs(dataset_dir, exist_ok=True)
        self.dump_dataset(dataset, file=dataset_file)
def create_multiboxplots(ds: pd.DataFrame, boxplot_y_col_name, output_dir, keywords_list: list, title_text):
    """Render a 4x4 grid of grouped boxplots (one subplot per keyword) to PNG.

    For each keyword in ``keywords_list`` the matching rows of ``ds`` are
    dumped to a per-keyword TSV and drawn as a boxplot of
    ``boxplot_y_col_name`` split by the ``group_name`` column.
    NOTE(review): at most 16 keywords fit the fixed 4x4 grid; extra keywords
    are silently dropped by ``zip`` — confirm callers never pass more.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt
    from matplotlib.ticker import MaxNLocator
    sns.set(style="whitegrid", font_scale=0.5)
    sns.set_palette("cubehelix")
    # Sanitize the column name into a filesystem-safe alias
    multiboxplot_alias = re.sub("[\W\-]+", "_", boxplot_y_col_name).strip("_")
    multiboxplot_dir = "{}{}/".format(Utilities.ends_with_slash(output_dir), multiboxplot_alias)
    os.makedirs(os.path.dirname(multiboxplot_dir), exist_ok=True)
    #
    fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(10, 5), sharey=False)
    for ax, keyword in zip(axes.flatten(), keywords_list):
        multiboxplot_data = ds.loc[ds["keyword"] == keyword,
                                   ["keyword", boxplot_y_col_name, "group_name"]]
        # Persist the exact data backing this subplot alongside the image
        DataSetsKeeper.dump_dataset(
            multiboxplot_data,
            file="{a}dataset_{b}_{c}.tsv".format(a=multiboxplot_dir, b=multiboxplot_alias, c=keyword))
        sns.boxplot(x="keyword", y=boxplot_y_col_name, hue="group_name",
                    data=multiboxplot_data, orient="v", fliersize=1,
                    linewidth=1, palette="Set3", ax=ax)
        # One shared figure-level legend; the per-axes legend is removed below
        handles, labels = ax.get_legend_handles_labels()
        fig.legend(handles, labels, loc="right", bbox_to_anchor=(0.985, 0.5),
                   title="Group ID", fancybox=True)
        ax.legend_.remove()
        ax.set_title(keyword.replace(" ", "\n"))
        ax.title.set_position([0.5, 0.97])
        ax.axes.get_xaxis().set_visible(False)
        ax.yaxis.label.set_visible(False)
        ax.tick_params(axis="y", which="major", labelrotation=0, pad=-3)
        # Force integer ticks on the y axis
        ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    fig.subplots_adjust(hspace=0.3, wspace=0.3)
    # Invisible full-figure axes used only to place the shared axis label and title
    ax0 = fig.add_axes([0, 0, 1, 1])
    plt.text(0.09, 0.5, boxplot_y_col_name, horizontalalignment="left",
             verticalalignment='center', rotation=90, transform=ax0.transAxes)
    plt.text(0.5, 0.95, title_text, horizontalalignment="center",
             verticalalignment='center', transform=ax0.transAxes,
             fontsize="large", fontstyle="normal", fontweight="bold")
    ax0.set_axis_off()
    multiboxplot_image = "{a}multiboxplot_{b}.png".format(a=multiboxplot_dir, b=multiboxplot_alias)
    fig.savefig(multiboxplot_image, format="png", dpi=900)
    plt.clf()
    plt.close()
def create_sampledata_dict(dirs: list):
    """Scan raw-read directories and map sample names to their read files.

    For every recognized sequence file the sample name is taken as the part
    of the file name before the first underscore; the (up to) two longest
    files sharing that prefix and extension are kept as the sample's reads.
    Illumina reads replace previously collected SOLiD (csfasta) reads for
    the same sample.

    :param dirs: directories to scan (non-recursively)
    :return: dict sorted by sample name: {sanitized sample name: [file paths]}
    """
    output_dict = {}
    for raw_reads_dir in dirs:
        raw_reads_dir = Utilities.ends_with_slash(raw_reads_dir)
        files_list = os.listdir(raw_reads_dir)
        for file_name in files_list:
            if not any(file_name.endswith(i)
                       for i in ["csfasta", "fasta", "fa", "fastq", "fq", "gz"]):
                continue
            sample_name = file_name.split("_")[0].strip()
            file_extension = file_name.split(".")[-1]
            # Fix: the original used re.findall("^{}".format(sample_name), i),
            # which injects the raw sample name into a regex — metacharacters
            # such as '.' caused false prefix matches. startswith is exact.
            candidates = [raw_reads_dir + i for i in files_list
                          if i.startswith(sample_name) and i.endswith(file_extension)]
            # Keep the two longest candidate names (drops shorter partial
            # matches), then sort them back into lexical (R1/R2) order
            sample_files = sorted(sorted(candidates, key=len, reverse=True)[:2])
            sample_name = re.sub(r"_+", "_", re.sub(r"[^A-Za-z0-9]+", "_", sample_name))
            existing_files = output_dict.get(sample_name)
            if not existing_files:
                output_dict[sample_name] = sample_files
            elif existing_files[0].endswith("csfasta") and file_extension != "csfasta":
                print("Replacing SOLID reads with Illumina reads for sample {a}: {b}, {c}"
                      .format(a=sample_name, b=existing_files, c=sample_files))
                output_dict[sample_name] = sample_files
    return dict(sorted(output_dict.items()))
# Create sampledata for Illumina raw reads raw_reads_dict = create_sampledata_dict(raw_reads_dirs) raw_reads_dict = { k: raw_reads_dict[k] for k in raw_reads_dict if "HP" in k and not os.path.isfile( "/data2/bio/Metagenomes/HG19/Unmapped_reads/{}_no_hg19.1.gz".format(k)) } Utilities.dump_2d_array([[k] + raw_reads_dict[k] for k in raw_reads_dict], file=projectDescriber.sampledata) # Prepare deploy charts launchGuideLiner = LaunchGuideLiner( charts_dir="{}{}/charts/".format( Utilities.ends_with_slash(projectDescriber.directory), "hg19"), deploy_prefix=projectDescriber.owner + "-hg19", nodes_number=7, threads_number="half", sampledata_file=projectDescriber.sampledata, refdata_file="/data/reference/homo_sapiens/hg/hg19/hg19.refdata", output_mask="hg19", output_dir="/data2/bio/Metagenomes/HG19") launchGuideLiner.generate_config() launchGuideLiner.get_deploy_guide() """ # Charts directory: '/data1/bio/projects/dsafina/hp_checkpoints/hg19/charts/' # Look for Redis pod & service: kubectl get pods
for i in subprocess.getoutput("find {} -name *R{}*.fastq.gz".format( project_dir, j)).split("\n") if len(i.strip()) > 0 ]) for j in (1, 2) ] max_id = max([ int(re.findall("[0-9]{6}", i)[0]) for i in r1_fastq_archives_list + r2_fastq_archives_list ]) output_files_list = [] for r1_fastq_archive, r2_fastq_archive in zip(r1_fastq_archives_list, r2_fastq_archives_list): for copy_number in [1, 2]: max_id += 1 for fastq_archive in [r1_fastq_archive, r2_fastq_archive]: output_file = "{}{}".format( Utilities.ends_with_slash("/".join( fastq_archive.split("/")[:-1])), re.sub( "\.gz$", "", re.sub("[0-9]{6}", str(max_id).zfill(6), fastq_archive.split("/")[-1]))) print("Loading file '{}'".format(fastq_archive)) fq_array = FASTAArray( subprocess.getoutput("zcat {}".format(fastq_archive))) print("Loaded file '{}'".format(fastq_archive)) fq_array.parse_fastq(output_file) del fq_array output_files_list.append(output_file) print("Saved file '{}'".format(output_file)) gc.collect()