Exemplo n.º 1
0
 def __init__(self, charts_dir, deploy_prefix, nodes_number, threads_number,
              sampledata_file, refdata_file, output_mask, output_dir):
     """Prepare deployment chart descriptors and the config substitution map.

     :param charts_dir: directory where the chart YAML files will be placed
     :param deploy_prefix: prefix for generated Kubernetes resource names;
                           sanitized to alphanumerics and dashes
     :param nodes_number: number of active worker nodes
     :param threads_number: threads per worker (e.g. an int or "half")
     :param sampledata_file: path to the sample data table
     :param refdata_file: path to the reference data descriptor
     :param output_mask: mask used to name pipeline outputs
     :param output_dir: pipeline output directory (slash-terminated)
     """
     # All three template charts live under the same repository directory
     _base_url = ("https://raw.githubusercontent.com/ivasilyev/biopipelines-docker/"
                  "master/bwt_filtering_pipeline/templates/bwt-fp-only-coverage/")
     self.charts_directory = Utilities.ends_with_slash(charts_dir)
     # Raw string avoids the invalid-escape DeprecationWarning; the sanitation
     # keeps only characters valid in Kubernetes resource names
     self.deploy_prefix = re.sub(r"[^A-Za-z0-9\-]+", "-", deploy_prefix)
     self.config_chart = Chart(
         file="{}config.yaml".format(self.charts_directory),
         # URL is not supported
         url="{}config.yaml".format(_base_url)
     )
     # Placeholder -> value map substituted into the config template
     self.cfgDict = {
         "QUEUE_NAME": "{}-queue".format(self.deploy_prefix),
         "MASTER_CONTAINER_NAME": "{}-master".format(self.deploy_prefix),
         "JOB_NAME": "{}-job".format(self.deploy_prefix),
         "WORKER_CONTAINER_NAME": "{}-worker".format(self.deploy_prefix),
         "ACTIVE_NODES_NUMBER": nodes_number,
         "THREADS_NUMBER": threads_number,
         "SAMPLEDATA": sampledata_file,
         "REFDATA": refdata_file,
         "OUTPUT_MASK": output_mask,
         "OUTPUT_DIR": Utilities.ends_with_slash(output_dir)
     }
     self.master_chart = Chart(
         file="{}master.yaml".format(self.charts_directory),
         url="{}master.yaml".format(_base_url)
     )
     self.worker_chart = Chart(
         file="{}worker.yaml".format(self.charts_directory),
         url="{}worker.yaml".format(_base_url)
     )
Exemplo n.º 2
0
 def set_raw_pvals_dir(self, output_dir: str):
     """Derive and store the p-value and digest output directories.

     Both paths share the layout ``<output_dir>/<kind>/<value column>/<groups>/``.
     """
     base_dir = Utilities.ends_with_slash(output_dir)
     self.output_dir = base_dir
     # Common trailing path: abbreviated value column name, then group digest name
     suffix = "{}/{}/".format(self.pivot_value_col_name_abbreviation,
                              self.groupdata_digest_name)
     self.raw_pvals_dir = "{}pvals/{}".format(base_dir, suffix)
     self.digest_dir = "{}digest/{}".format(base_dir, suffix)
Exemplo n.º 3
0
 def split(self, output_dir: str):
     """Write every pivot table column into its own per-sample TSV file.

     Each output file is named after the sample (the part of the column's
     file name before the first underscore) and is appended to
     ``self._sample_names_list``.
     """
     target_dir = Utilities.ends_with_slash(output_dir)
     os.makedirs(target_dir, exist_ok=True)
     # Note: the dataframe must have only index and value columns
     for column in list(self.pivot_df):
         sample = Utilities.filename_only(column).split("_")[0]
         out_file = "{}{}.tsv".format(target_dir, sample)
         per_sample_df = self.pivot_df[column].reset_index()
         per_sample_df = per_sample_df.rename(columns={column: self.value_col_name})
         per_sample_df.to_csv(out_file, sep="\t", header=True, index=False)
         self._sample_names_list.append(out_file)
Exemplo n.º 4
0
 def finalize_datasets(self, output_dir):
     """Concatenate, digest, and dump the virulence and genera datasets.

     Each dataset is written to
     ``<output_dir>/<group digest>/<association>/<group digest>_<association>_dataset.tsv``.

     :param output_dir: base output directory (slash-terminated after normalization)
     """
     self.output_dir = Utilities.ends_with_slash(output_dir)
     self.virulence_dataset = self.digest_ds_values(self.concat_datasets(self.virulence_dss_list))
     self.genera_dataset = self.digest_ds_values(self.concat_datasets(self.genera_dss_list))
     # The two associations were previously handled by duplicated copy-paste
     # blocks; a single loop keeps the path layout in one place
     for association_name, dataset in (("virulence", self.virulence_dataset),
                                       ("genera", self.genera_dataset)):
         dataset_dir = "{a}{b}/{c}/".format(a=self.output_dir, b=self.groupdata_digest_name,
                                            c=association_name)
         dataset_file = "{a}{b}_{c}_dataset.tsv".format(a=dataset_dir, b=self.groupdata_digest_name,
                                                        c=association_name)
         os.makedirs(dataset_dir, exist_ok=True)
         self.dump_dataset(dataset, file=dataset_file)
Exemplo n.º 5
0
 def create_multiboxplots(ds: pd.DataFrame, boxplot_y_col_name, output_dir, keywords_list: list, title_text):
     """Render a 4x4 grid of grouped boxplots, one subplot per keyword.

     For each keyword the matching data subset is dumped to a TSV alongside
     the final PNG in ``<output_dir>/<sanitized y column name>/``.

     :param ds: dataframe with "keyword", the y-value column, and "group_name"
     :param boxplot_y_col_name: name of the numeric column to plot
     :param output_dir: base directory for the per-plot subdirectory
     :param keywords_list: keywords to plot, consumed in order (at most 16 used)
     :param title_text: bold figure title
     """
     import seaborn as sns
     import matplotlib.pyplot as plt
     from matplotlib.ticker import MaxNLocator
     sns.set(style="whitegrid", font_scale=0.5)
     sns.set_palette("cubehelix")
     # Raw string avoids the invalid-escape DeprecationWarning
     multiboxplot_alias = re.sub(r"[\W\-]+", "_", boxplot_y_col_name).strip("_")
     multiboxplot_dir = "{}{}/".format(Utilities.ends_with_slash(output_dir), multiboxplot_alias)
     os.makedirs(os.path.dirname(multiboxplot_dir), exist_ok=True)
     #
     fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(10, 5), sharey=False)
     legend_handles, legend_labels = None, None
     for ax, keyword in zip(axes.flatten(), keywords_list):
         multiboxplot_data = ds.loc[ds["keyword"] == keyword, ["keyword", boxplot_y_col_name, "group_name"]]
         DataSetsKeeper.dump_dataset(multiboxplot_data, file="{a}dataset_{b}_{c}.tsv".format(a=multiboxplot_dir, b=multiboxplot_alias, c=keyword))
         sns.boxplot(x="keyword", y=boxplot_y_col_name, hue="group_name", data=multiboxplot_data, orient="v",
                     fliersize=1, linewidth=1, palette="Set3", ax=ax)
         # Capture the shared legend entries once; previously fig.legend() was
         # re-created on every iteration, stacking duplicate figure legends
         if legend_handles is None:
             legend_handles, legend_labels = ax.get_legend_handles_labels()
         ax.legend_.remove()
         ax.set_title(keyword.replace(" ", "\n"))
         ax.title.set_position([0.5, 0.97])
         ax.axes.get_xaxis().set_visible(False)
         ax.yaxis.label.set_visible(False)
         ax.tick_params(axis="y", which="major", labelrotation=0, pad=-3)
         ax.yaxis.set_major_locator(MaxNLocator(integer=True))
     # One shared legend for the whole figure
     if legend_handles is not None:
         fig.legend(legend_handles, legend_labels, loc="right", bbox_to_anchor=(0.985, 0.5),
                    title="Group ID", fancybox=True)
     fig.subplots_adjust(hspace=0.3, wspace=0.3)
     # Invisible full-figure axes used only to anchor the shared y label and title
     ax0 = fig.add_axes([0, 0, 1, 1])
     plt.text(0.09, 0.5, boxplot_y_col_name, horizontalalignment="left", verticalalignment='center', rotation=90,
              transform=ax0.transAxes)
     plt.text(0.5, 0.95, title_text,
              horizontalalignment="center", verticalalignment='center', transform=ax0.transAxes,
              fontsize="large", fontstyle="normal", fontweight="bold")
     ax0.set_axis_off()
     multiboxplot_image = "{a}multiboxplot_{b}.png".format(a=multiboxplot_dir, b=multiboxplot_alias)
     fig.savefig(multiboxplot_image, format="png", dpi=900)
     plt.clf()
     plt.close()
Exemplo n.º 6
0
def create_sampledata_dict(dirs: list):
    """Collect raw sequencing read files and group them by sample name.

    Scans every directory in *dirs* for read files, pairs up to two files per
    sample (mate reads share the sample-name prefix and extension), and
    prefers Illumina reads over SOLiD ("csfasta") reads when both exist.

    :param dirs: directories to scan for raw read files
    :return: dict sorted by sample name, mapping sanitized sample name to a
             sorted list of up to two absolute file paths
    """
    read_extensions = ("csfasta", "fasta", "fa", "fastq", "fq", "gz")
    output_dict = {}
    for raw_reads_dir in dirs:
        raw_reads_dir = Utilities.ends_with_slash(raw_reads_dir)
        files_list = os.listdir(raw_reads_dir)
        for file_name in files_list:
            # str.endswith accepts a tuple: one call replaces the any([...]) scan
            if not file_name.endswith(read_extensions):
                continue
            sample_name = file_name.split("_")[0].strip()
            file_extension = file_name.split(".")[-1]
            # str.startswith replaces the former unescaped regex
            # re.findall("^{}".format(sample_name), i): sample names containing
            # regex metacharacters (e.g. ".") could over-match other samples
            candidates = [
                raw_reads_dir + i for i in files_list
                if i.startswith(sample_name) and i.endswith(file_extension)
            ]
            # Keep the two longest (most specific) names, then restore lexical order
            sample_files = sorted(sorted(candidates, key=len, reverse=True)[:2])
            sample_name = re.sub("_+", "_",
                                 re.sub("[^A-Za-z0-9]+", "_", sample_name))
            existing_files = output_dict.get(sample_name)
            if not existing_files:
                output_dict[sample_name] = sample_files
            elif existing_files[0].endswith("csfasta") and file_extension != "csfasta":
                # Illumina reads take precedence over previously seen SOLiD reads
                print(
                    "Replacing SOLID reads with Illumina reads for sample {a}: {b}, {c}"
                    .format(a=sample_name,
                            b=existing_files,
                            c=sample_files))
                output_dict[sample_name] = sample_files
    return dict(sorted(output_dict.items()))
Exemplo n.º 7
0
# Create sampledata for Illumina raw reads
raw_reads_dict = create_sampledata_dict(raw_reads_dirs)
# Keep only "HP" samples whose hg19-unmapped archive does not exist yet,
# i.e. samples that still need processing
raw_reads_dict = {
    k: raw_reads_dict[k]
    for k in raw_reads_dict if "HP" in k and not os.path.isfile(
        "/data2/bio/Metagenomes/HG19/Unmapped_reads/{}_no_hg19.1.gz".format(k))
}

# Persist the sampledata table: one row per sample — name followed by its read files
Utilities.dump_2d_array([[k] + raw_reads_dict[k] for k in raw_reads_dict],
                        file=projectDescriber.sampledata)

# Prepare deploy charts for the hg19 read-filtering pipeline
# (7 worker nodes, half of each node's threads, hg19 reference)
launchGuideLiner = LaunchGuideLiner(
    charts_dir="{}{}/charts/".format(
        Utilities.ends_with_slash(projectDescriber.directory), "hg19"),
    deploy_prefix=projectDescriber.owner + "-hg19",
    nodes_number=7,
    threads_number="half",
    sampledata_file=projectDescriber.sampledata,
    refdata_file="/data/reference/homo_sapiens/hg/hg19/hg19.refdata",
    output_mask="hg19",
    output_dir="/data2/bio/Metagenomes/HG19")
launchGuideLiner.generate_config()
launchGuideLiner.get_deploy_guide()
"""
# Charts directory: '/data1/bio/projects/dsafina/hp_checkpoints/hg19/charts/'

# Look for Redis pod & service:
kubectl get pods
Exemplo n.º 8
0
        for i in subprocess.getoutput("find {} -name *R{}*.fastq.gz".format(
            project_dir, j)).split("\n") if len(i.strip()) > 0
    ]) for j in (1, 2)
]
# Highest 6-digit numeric ID found in the existing archive names; new copies
# are numbered onward from it
max_id = max([
    int(re.findall("[0-9]{6}", i)[0])
    for i in r1_fastq_archives_list + r2_fastq_archives_list
])
output_files_list = []
# Duplicate every R1/R2 archive pair twice, giving each copy a fresh ID
# NOTE(review): presumably R1/R2 are paired-end FASTQ mates — both members of
# a pair receive the same new ID; confirm against how the lists are built
for r1_fastq_archive, r2_fastq_archive in zip(r1_fastq_archives_list,
                                              r2_fastq_archives_list):
    for copy_number in [1, 2]:
        max_id += 1
        for fastq_archive in [r1_fastq_archive, r2_fastq_archive]:
            # Output path: same directory, original file name with its 6-digit
            # ID replaced by the new one and the trailing ".gz" stripped
            # (the output is written uncompressed)
            output_file = "{}{}".format(
                Utilities.ends_with_slash("/".join(
                    fastq_archive.split("/")[:-1])),
                re.sub(
                    "\.gz$", "",
                    re.sub("[0-9]{6}",
                           str(max_id).zfill(6),
                           fastq_archive.split("/")[-1])))
            print("Loading file '{}'".format(fastq_archive))
            # Decompress the whole archive into memory via zcat
            fq_array = FASTAArray(
                subprocess.getoutput("zcat {}".format(fastq_archive)))
            print("Loaded file '{}'".format(fastq_archive))
            fq_array.parse_fastq(output_file)
            # Free the (potentially large) in-memory read array before the
            # next decompression to keep peak memory down
            del fq_array
            output_files_list.append(output_file)
            print("Saved file '{}'".format(output_file))
            gc.collect()