def set_groupdata_dict(self, groupdata_file: str):
    """
    Load a groupdata table and index its sample names by group.

    The file is read as tab-separated text whose two columns are taken, in
    order, as the sample name and the group name. The method sets:

    * ``self.groupdata_file`` - the path that was passed in;
    * ``self.groupdata_digest_name`` - the result of
      ``Utilities.filename_only`` for that path with every ".groupdata"
      substring removed;
    * ``self.groupdata_dict`` - mapping of each group name to the sorted
      list of unique sample names belonging to it;
    * ``self.raw_all_sample_names_list`` - sorted list of all unique
      sample names in the table.

    :param groupdata_file: path to the tab-separated groupdata file
    """
    self.groupdata_file = groupdata_file
    self.groupdata_digest_name = Utilities.filename_only(
        self.groupdata_file).replace(".groupdata", "")
    # Because column names are passed explicitly, header="infer" makes pandas
    # treat the first line of the file as data rather than as a header.
    groupdata_df = pd.read_table(
        self.groupdata_file,
        sep="\t",
        header="infer",
        names=["sample_name", "group_name"])
    group_names = sorted(set(groupdata_df["group_name"]))
    # Select the column as a Series ("sample_name", not ["sample_name"]):
    # iterating a DataFrame yields its column labels, which would map every
    # group to the single string "sample_name" instead of its samples.
    self.groupdata_dict = {
        group_name: sorted(set(
            groupdata_df.loc[
                groupdata_df["group_name"] == group_name, "sample_name"]))
        for group_name in group_names
    }
    self.raw_all_sample_names_list = sorted(set(groupdata_df["sample_name"]))
def split(self, output_dir: str):
    """
    Dump every column of ``self.pivot_df`` into its own tab-separated file.

    The output directory is created if it does not exist. For each column
    the index is turned back into a regular column, the value column is
    renamed to ``self.value_col_name``, and the two-column table is written
    with a header line. The path of every written file is appended to
    ``self._sample_names_list``.

    The file name is the part of ``Utilities.filename_only(column)`` that
    precedes the first underscore, plus ".tsv".
    NOTE(review): columns sharing that prefix would be written to the same
    path - confirm the prefixes are unique.

    :param output_dir: directory that receives the per-column files
    """
    output_dir = Utilities.ends_with_slash(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    # Note: the dataframe must have only index and value columns
    column_names = list(self.pivot_df)
    for column_name in column_names:
        name_prefix = Utilities.filename_only(column_name).split("_", 1)[0]
        table_file = "{}{}.tsv".format(output_dir, name_prefix)
        column_df = self.pivot_df[column_name].reset_index()
        column_df = column_df.rename(
            columns={column_name: self.value_col_name})
        column_df.to_csv(table_file, sep="\t", header=True, index=False)
        self._sample_names_list.append(table_file)
def evaluate_sampledata():
    """
    Find R1 FASTQ files and write the project's groupdata and sampledata tables.

    For each hard-coded directory mask, every path matching
    ``<mask>R1*.fastq*`` is collected in sorted order. For each such file:

    * the mate path is the same path with every "R1" replaced by "R2";
    * the sample path is "<R1 path>\\t<R2 path>" if the mate exists as a
      file, otherwise an empty string;
    * the sample name is ``Utilities.filename_only(<R1 path>)`` with
      everything from the first "_S" onwards removed.

    Two header-less, tab-separated files are then written:

    * ``ProjectDescriber.groupdata`` - sample name and the constant
      "group_id";
    * ``ProjectDescriber.sampledata`` - sample name and sample path.

    ``ProjectDescriber.directory`` is created beforehand if missing.
    ``ProjectDescriber`` is not imported here and is expected to be
    available at module level.

    NOTE(review): replacing every "R1" also rewrites an "R1" that occurs
    elsewhere in the path; confirm sample and directory names never
    contain it.
    """
    import glob
    import os
    import re

    import pandas as pd
    from meta.scripts.Utilities import Utilities

    rows = []
    for dir_mask in [
        "/data2/bio/ecoli_komfi/raw_reads/*",
        "/data2/bio/ecoli_komfi/raw_reads2/*",
    ]:
        # Expand the mask in Python instead of shelling out to `ls | sort`:
        # a mask without matches then yields no rows, whereas the captured
        # `ls` error message used to be parsed as if it were a file path.
        for r1_file in sorted(glob.glob("{}R1*.fastq*".format(dir_mask))):
            r2_file = r1_file.replace("R1", "R2")
            if os.path.isfile(r2_file):
                sample_path = "{a}\t{b}".format(a=r1_file, b=r2_file)
            else:
                # Unpaired reads keep their row, but with an empty path.
                sample_path = ""
            sample_name = re.sub(
                "_S.*$", "", Utilities.filename_only(r1_file))
            rows.append((sample_name, sample_path))
    # Build the table once rather than concatenating onto an empty frame
    # on every iteration.
    df = pd.DataFrame(rows, columns=["sample_name", "sample_path"])
    df["group_id"] = "group_id"
    os.makedirs(ProjectDescriber.directory, exist_ok=True)
    df.loc[:, ["sample_name", "group_id"]].to_csv(
        ProjectDescriber.groupdata, sep='\t', index=False, header=False)
    # The sample path holds a tab itself, so a CSV writer would wrap it in
    # double quotes. Writing the lines directly keeps the file quote-free
    # without the former `sed -i` pass.
    with open(ProjectDescriber.sampledata, mode="w",
              encoding="utf-8") as sampledata_handle:
        sampledata_handle.writelines(
            "{}\t{}\n".format(sample_name, sample_path)
            for sample_name, sample_path in rows)