def Pipeline_Files(self):
    """Set the file-path attributes (each stored as ``str``) used by the pipeline.

    Paths are derived from ``self.organism``, ``self.paper_name`` and
    ``self.tmp_dir``.  Three of them (``blast_with_mRNA``, ``final_output``
    and ``blast_no_unique``) are additionally passed through
    ``utils.filename_date_append`` before being stored.
    """
    organism = self.organism
    tmp = self.tmp_dir
    blast_dir = tmp / "blast_files"

    # Biomart inputs and the BLAST database built from them.
    self.blast_db_fasta_biomart = str(
        Path("Biomart") / f"{organism}_biomart_3utr.fasta")
    self.blast_db_fasta_biomart_valid = str(
        tmp / Path(organism + "_biomart_valid.fasta"))
    self.blast_db_biomart = str(blast_dir / "blastdb_biomart")

    # UCSC inputs and the BLAST database built from them.
    self.blast_db_csv_ucsc = str(tmp / Path(organism + "_ucsc.csv"))
    self.blast_db_fasta_ucsc_valid = str(
        tmp / Path(organism + "_ucsc_valid.fasta"))
    self.blast_db_ucsc = str(blast_dir / "blastdb_ucsc")

    # Intermediate BLAST results.
    self.blast_tmp_result = str(tmp / "tmp_blast_result.csv")
    self.blast_with_mRNA = utils.filename_date_append(
        str(tmp / Path(organism + "_mRNA.csv")))

    # Final outputs; both names share the "<organism>_<paper_name>" prefix.
    run_tag = organism + "_" + self.paper_name
    self.final_output = utils.filename_date_append(
        str(Path("Datafiles_Prepare/CSV") / Path(run_tag + "_Data.csv")))
    self.blast_no_unique = utils.filename_date_append(
        str(Path("Datafiles_Prepare/Logs")
            / Path(run_tag + "_Blast_Nonunique.fasta")))
def main():
    """Run the pipeline on the "Mapping the Human miRNA Interactome" paper data.

    The first command-line argument, when present, is parsed with
    ``ast.literal_eval`` and used as the debug flag; when it is absent the
    flag defaults to ``True``.
    """
    cli_args = sys.argv[1:]
    debug = ast.literal_eval(cli_args[0]) if cli_args else True
    if debug:
        banner = ("***************************************\n"
                  "\t\t\t DEBUG \n"
                  "***************************************\n")
        print(banner)

    interaction_file = str(Path("Papers/1-s2.0-S009286741300439X-mmc1.txt"))
    log_dir = "Datafiles_Prepare/Logs/"
    tmp_dir = utils.make_tmp_dir("Datafiles_Prepare/tmp_dir", parents=True)

    for organism in ["Human"]:
        # One JSON log file per organism.
        log_path = Path(log_dir) / f"Mapping_the_Human_miRNA_{organism}.json"
        JsonLog.set_filename(utils.filename_date_append(log_path))

        log_entries = (
            ('file name', interaction_file),
            ('paper',
             "Mapping the Human miRNA Interactome by CLASH Reveals Frequent Noncanonical Binding"),
            ('Organism', organism),
            ('paper_url',
             "https://www.sciencedirect.com/science/article/pii/S009286741300439X"),
        )
        for key, value in log_entries:
            JsonLog.add_to_json(key, value)

        paper_df = df_prepare(read_paper_data(interaction_file, debug))
        pipeline = Pipeline(paper_name="Mapping_the_Human_miRNA",
                            organism=organism,
                            in_df=paper_df,
                            tmp_dir=tmp_dir)
        pipeline.run()
def file_formatting(self):
    """Filter the BLAST result table and write the final CSV.

    Reads the CSV at ``self.blast_with_mRNA``, tags every row with the
    paper name, organism and the JsonLog creation time, then successively
    keeps only rows that

    * have ``biomart_blast_status == "OK"``,
    * have no ``'X'`` in ``'miRNA sequence'``,
    * have no ``'star'`` in ``'microRNA_name'``.

    The kept/dropped counts of every step are recorded through
    ``JsonLog.add_to_json``.  A fixed subset of columns is then selected,
    the ``biomart_*`` columns are renamed, the index is reset, and the
    table is written with ``to_csv``.
    """

    def _log_and_keep(frame, mask, valid_key, invalid_key):
        # Record how many rows pass / fail the mask, then apply it.
        kept = sum(mask)
        JsonLog.add_to_json(valid_key, kept)
        JsonLog.add_to_json(invalid_key, frame.shape[0] - kept)
        return frame[mask]

    def _lacks(token):
        # Predicate factory: True when `token` does not occur in the text.
        return lambda text: text.find(token) == -1

    table = pd.read_csv(self.blast_with_mRNA)
    table['Source'] = self.paper_name
    table['Organism'] = self.organism
    table['Creation_time'] = JsonLog.get_creation_time()

    # Keep only the valid rows: Status=OK
    table = _log_and_keep(table,
                          table['biomart_blast_status'] == "OK",
                          "Pipeline valid blast results",
                          "Pipeline invalid blast results")

    # Drop miRNA whose sequence contains an 'X'
    table = _log_and_keep(table,
                          table['miRNA sequence'].apply(_lacks('X')),
                          "Pipeline valid miRNA_no_xxx",
                          "Pipeline invalid miRNA (xxx)")

    # Drop miRNA whose name contains 'star'
    table = _log_and_keep(table,
                          table['microRNA_name'].apply(_lacks('star')),
                          "Pipeline valid miRNA_no_***",
                          "Pipeline invalid miRNA (star)")

    # Select the output columns, rename the biomart ones, and reset the index.
    wanted_columns = [
        'Source', 'Organism', 'GI_ID', 'microRNA_name', 'miRNA sequence',
        'target sequence', 'number of reads', 'biomart_title',
        'biomart_sbjct_start', 'biomart_sbjct_end', 'biomart_full_mrna'
    ]
    new_names = {
        'biomart_title': 'mRNA_name',
        'biomart_sbjct_start': 'mRNA_start',
        'biomart_sbjct_end': 'mRNA_end',
        'biomart_full_mrna': 'full_mrna'
    }
    result = (table.filter(wanted_columns, axis=1)
              .rename(columns=new_names)
              .reset_index(drop=True))

    # NOTE(review): self.final_output is passed through
    # utils.filename_date_append here; Pipeline_Files already applies the same
    # helper when it sets the attribute -- confirm the second application is
    # intended.
    result.to_csv(utils.filename_date_append(self.final_output))
def main():
    """Run the Darnell miRNA-target chimeras pipeline for Mouse and Human.

    The first command-line argument, when present, is parsed with
    ``ast.literal_eval`` and used as the debug flag; when it is absent the
    flag defaults to ``True``.  Each organism has its own interaction file
    and gets its own JSON log file.
    """
    cli_args = sys.argv[1:]
    debug = ast.literal_eval(cli_args[0]) if cli_args else True
    if debug:
        banner = ("***************************************\n"
                  "\t\t\t DEBUG \n"
                  "***************************************\n")
        print(banner)

    # (organism, interaction file) pairs, processed in this order.
    runs = (
        ("Mouse", "Papers/ncomms9864-s2.xlsx"),
        ("Human", "Papers/ncomms9864-s4.xlsx"),
    )

    tmp_dir = utils.make_tmp_dir("Datafiles_Prepare/tmp_dir", parents=True)
    log_dir = "Datafiles_Prepare/Logs/"

    for organism, interaction_file in runs:
        log_path = Path(log_dir) / f"Darnell_miRNA_target_chimeras_{organism}.json"
        JsonLog.set_filename(utils.filename_date_append(log_path))

        log_entries = (
            ('file name', interaction_file),
            ('paper',
             "miRNA–target chimeras reveal miRNA 3-end pairing as a major determinant of Argonaute target specificity"),
            ('Organism', organism),
            ('paper_url', "https://www.nature.com/articles/ncomms9864"),
        )
        for key, value in log_entries:
            JsonLog.add_to_json(key, value)

        # Paper-specific preparation step.
        chimeras = Darnell_miRNA_target_chimeras(interaction_file,
                                                 tmp_dir,
                                                 organism,
                                                 debug=debug)
        chimeras.run()

        print("Pipeline start")
        prepared_df = chimeras.prepare_for_pipeline()
        pipeline = Pipeline(paper_name="Darnell_miRNA_target_chimeras",
                            organism=organism,
                            in_df=prepared_df,
                            tmp_dir=tmp_dir)
        pipeline.run()
def main():
    """Run the "Pairing beyond the Seed" pipeline for C. elegans.

    The first command-line argument, when present, is parsed with
    ``ast.literal_eval`` and used as the debug flag; when it is absent the
    flag defaults to ``True``.
    """
    cli_args = sys.argv[1:]
    debug = ast.literal_eval(cli_args[0]) if cli_args else True
    if debug:
        banner = ("***************************************\n"
                  "\t\t\t DEBUG \n"
                  "***************************************\n")
        print(banner)

    interaction_file = str(Path("Papers/1-s2.0-S1097276516305214-mmc3.xlsx"))
    log_dir = "Datafiles_Prepare/Logs/"
    tmp_dir = utils.make_tmp_dir("Datafiles_Prepare/tmp_dir", parents=True)

    for organism in ["Celegans"]:
        # One JSON log file per organism.
        log_path = Path(log_dir) / f"Pairing_Beyond_Seed_{organism}.json"
        JsonLog.set_filename(utils.filename_date_append(log_path))

        log_entries = (
            ('file name', interaction_file),
            ('paper',
             "Pairing beyond the Seed Supports MicroRNA Targeting Specificity"),
            ('Organism', organism),
            ('paper_url',
             "https://www.sciencedirect.com/science/article/pii/S1097276516305214#mmc3"),
        )
        for key, value in log_entries:
            JsonLog.add_to_json(key, value)

        # Paper-specific preparation step.
        paper = Pairing_Beyond_Seed(input_file=interaction_file,
                                    organism=organism,
                                    tmp_dir=tmp_dir,
                                    debug=debug)
        paper.run()

        prepared_df = paper.prepare_for_pipeline()
        pipeline = Pipeline(paper_name="Pairing_Beyond_Seed",
                            organism=organism,
                            in_df=prepared_df,
                            tmp_dir=tmp_dir)
        pipeline.run()
def main():
    """Run the "Global mapping of miRNA-target interactions in cattle" pipeline.

    The first command-line argument, when present, is parsed with
    ``ast.literal_eval`` and used as the debug flag; when it is absent the
    flag defaults to ``True``.
    """
    cli_args = sys.argv[1:]
    debug = ast.literal_eval(cli_args[0]) if cli_args else True
    if debug:
        banner = ("***************************************\n"
                  "\t\t\t DEBUG \n"
                  "***************************************\n")
        print(banner)

    interaction_file = str(Path("Papers/41598_2017_7880_MOESM4_ESM.csv"))
    log_dir = "Datafiles_Prepare/Logs/"
    tmp_dir = utils.make_tmp_dir("Datafiles_Prepare/tmp_dir", parents=True)

    for organism in ["Cow"]:
        # One JSON log file per organism.
        log_path = Path(log_dir) / f"Global_Mapping_Cattle_{organism}.json"
        JsonLog.set_filename(utils.filename_date_append(log_path))

        log_entries = (
            ('file name', interaction_file),
            ('paper',
             "Global mapping of miRNA-target interactions in cattle (Bos taurus)"),
            ('Organism', organism),
            ('paper_url',
             "https://www.nature.com/articles/s41598-017-07880-8#MOESM1"),
        )
        for key, value in log_entries:
            JsonLog.add_to_json(key, value)

        # Paper-specific preparation step (takes no organism argument).
        paper = Global_Mapping_Cattle(input_file=interaction_file,
                                      tmp_dir=tmp_dir,
                                      debug=debug)
        paper.run()

        prepared_df = paper.prepare_for_pipeline()
        pipeline = Pipeline(paper_name="Global_Mapping_Cattle",
                            organism=organism,
                            in_df=prepared_df,
                            tmp_dir=tmp_dir)
        pipeline.run()
def main():
    """Run the "Unambiguous Identification" pipeline for three organisms.

    The first command-line argument, when present, is parsed with
    ``ast.literal_eval`` and used as the debug flag; when it is absent the
    flag defaults to ``True``.  The same interaction file is read once per
    organism (Celegans, Human, Mouse), each with its own JSON log file.
    """
    cli_args = sys.argv[1:]
    debug = ast.literal_eval(cli_args[0]) if cli_args else True
    if debug:
        banner = ("***************************************\n"
                  "\t\t\t DEBUG \n"
                  "***************************************\n")
        print(banner)

    interaction_file = str(Path("Papers/1-s2.0-S1097276514003566-mmc3.xls"))
    log_dir = "Datafiles_Prepare/Logs/"
    tmp_dir = utils.make_tmp_dir("Datafiles_Prepare/tmp_dir", parents=True)

    for organism in ["Celegans", "Human", "Mouse"]:
        log_path = Path(log_dir) / f"Unambiguous_Identification_{organism}.json"
        JsonLog.set_filename(utils.filename_date_append(log_path))

        log_entries = (
            ('file name', interaction_file),
            ('paper',
             "Unambiguous Identification of miRNA:Target Site Interactions by Different Types of Ligation Reactions"),
            ('Organism', organism),
            ('paper_url',
             "https://www.sciencedirect.com/science/article/pii/S1097276514003566#app3"),
        )
        for key, value in log_entries:
            JsonLog.add_to_json(key, value)

        # The reader is organism-aware for this paper.
        paper_df = df_prepare(
            read_paper_data(interaction_file, organism, debug))
        pipeline = Pipeline(paper_name="Unambiguous_Identification",
                            organism=organism,
                            in_df=paper_df,
                            tmp_dir=tmp_dir)
        pipeline.run()