def main():
    try:
        debug = ast.literal_eval(sys.argv[1])
    except IndexError:
        debug = True

    if debug:
        print("***************************************\n"
              "\t\t\t DEBUG \n"
              "***************************************\n")

    interaction_file = str(Path("Papers/1-s2.0-S009286741300439X-mmc1.txt"))
    log_dir = "Datafiles_Prepare/Logs/"
    tmp_dir = utils.make_tmp_dir("Datafiles_Prepare/tmp_dir", parents=True)
    organisms = ["Human"]

    for organism in organisms:
        JsonLog.set_filename(
            utils.filename_date_append(
                Path(log_dir) / Path("Mapping_the_Human_miRNA_" + organism + ".json")))
        JsonLog.add_to_json('file name', interaction_file)
        JsonLog.add_to_json('paper',
                            "Mapping the Human miRNA Interactome by CLASH Reveals Frequent Noncanonical Binding")
        JsonLog.add_to_json('Organism', organism)
        JsonLog.add_to_json('paper_url',
                            "https://www.sciencedirect.com/science/article/pii/S009286741300439X")

        p = Pipeline(paper_name="Mapping_the_Human_miRNA",
                     organism=organism,
                     in_df=df_prepare(read_paper_data(interaction_file, debug)),
                     tmp_dir=tmp_dir)
        p.run()
def test_classification_total(self):
    input_list = [
        ['1979.486.1', 'FALSE', 'FALSE', '1', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'Metal'],
        ['1980.2d64.5', 'FALSE', 'FALSE', '2', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'Furniture'],
        ['67.265', 'FALSE', 'FALSE', '3', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'Metal'],
        ['67.265.10', 'FALSE', 'FALSE', '4', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'Gold']
    ]
    output_dict = {}
    for row in input_list:
        Pipeline.running_total(row, output_dict)

    self.assertEqual(2, output_dict['Metal'])
    self.assertEqual(1, output_dict['Furniture'])
    self.assertEqual(1, output_dict['Gold'])

def select_table(self):
    DBUtil.select_table("classification_totals")
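# A minimal sketch of the running_total helper exercised by the test above, inferred
# only from its assertions (the real implementation lives in Pipeline.py and may
# address the Classification column by a fixed index rather than taking the last
# element as done here):
def running_total_sketch(row, totals):
    classification = row[-1]
    if classification:
        totals[classification] = totals.get(classification, 0) + 1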
def menu():
    ini_path = os.path.dirname(os.path.realpath(__file__))
    ini_path = os.path.join(ini_path, 'input.ini')
    Ini = ReadIni(ini_path)
    path = Ini.project_path
    start = Ini.start
    end = Ini.end
    test_begin(end, start)

    now = datetime.now()
    now = now.strftime("%b %d %Y %H:%M:%S")
    mkdir(path)

    rec = 'Project begins.'
    rec += '\n' + '***' * 25
    rename_file(path, 'record')
    record(path, rec, init=True)
    print('***' * 25)
    print(now)
    print(rec)

    try:
        shutil.copy(ini_path, path + '/input.ini')
    except Exception as e:
        print(e)

    Pipeline.pipeline(path, start, end)

def get_tree(self):
    from AnalysedTreeTransforms import AutoTestDictTransform
    # The AutoTestDictTransform creates the statement "__test__ = {}",
    # which when copied into the main ModuleNode overwrites
    # any __test__ in user code; not desired
    excludes = [AutoTestDictTransform]

    import Pipeline, ParseTreeTransforms
    context = CythonUtilityCodeContext(self.name)
    context.prefix = self.prefix
    #context = StringParseContext(self.name)
    tree = parse_from_strings(self.name, self.pyx, context=context)
    pipeline = Pipeline.create_pipeline(context, 'pyx', exclude_classes=excludes)

    transform = ParseTreeTransforms.CnameDirectivesTransform(context)
    # InterpretCompilerDirectives already does a cdef declarator check
    #before = ParseTreeTransforms.DecoratorTransform
    before = ParseTreeTransforms.InterpretCompilerDirectives

    pipeline = Pipeline.insert_into_pipeline(pipeline, transform, before=before)

    (err, tree) = Pipeline.run_pipeline(pipeline, tree)
    assert not err, err
    return tree
def filterAlignments(infile, outfile):
    '''filter alignments to retain only those that have > 99% identity
    to the reference
    '''
    to_cluster = True
    statement = '''delta-filter -q -i 99 %(infile)s > %(outfile)s'''
    P.run()
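# filterAlignments and the other P.run()-based tasks in this collection follow the
# CGAT-pipeline convention: build a shell command in `statement` with %(name)s
# placeholders and let P.run() interpolate them from the caller's local variables
# (and the PARAMS configuration dictionary, e.g. %(scriptsdir)s below). A rough,
# hedged illustration of that mechanism using plain subprocess execution, not the
# actual P.run() implementation:
def run_statement_sketch(statement, values):
    import subprocess
    # interpolate the %(name)s placeholders and run the command through a shell
    subprocess.check_call(statement % values, shell=True)

# e.g. run_statement_sketch('''delta-filter -q -i 99 %(infile)s > %(outfile)s''',
#                           {"infile": "contigs.delta", "outfile": "contigs.filtered.delta"})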
def downloadSCOP(infile, outfile):
    '''download the latest scop sequence set (< 40% identical)'''
    statement = '''
    wget -O %(outfile)s "http://astral.berkeley.edu/seq.cgi?get=scopdom-seqres-gd-sel-gs-bib;ver=1.75;item=seqs;cut=40"
    '''
    P.run()

def buildAlignmentCoordinates(infile, outfile):
    '''build coordinates file from alignment delta file'''
    to_cluster = True
    statement = '''show-coords -T -r %(infile)s > %(outfile)s'''
    P.run()

def process_pxd(self, source_desc, scope, module_name):
    import Pipeline
    if isinstance(source_desc, FileSourceDescriptor) and source_desc._file_type == 'pyx':
        source = CompilationSource(source_desc, module_name, os.getcwd())
        result_sink = create_default_resultobj(source, self.options)
        pipeline = Pipeline.create_pyx_as_pxd_pipeline(self, result_sink)
        result = Pipeline.run_pipeline(pipeline, source)
    else:
        pipeline = Pipeline.create_pxd_pipeline(self, scope, module_name)
        result = Pipeline.run_pipeline(pipeline, source_desc)
    return result
def main():
    # start socket
    TCP_IP = '128.237.198.49'
    TCP_PORT = 2002
    print('Socket Information: %s:%d' % (TCP_IP, TCP_PORT))
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect((TCP_IP, TCP_PORT))
    time.sleep(1e-3)

    # start camera
    vs = VideoStream(src=0).start()
    time.sleep(2.0)

    # calibration and find block
    #Caliberate_camera(vs)
    block_pixel_position = detect_block_grab(vs)
    block_real_position = transfer_to_real(block_pixel_position)
    print(block_pixel_position)
    print(block_real_position)

    # Inverse Kinematics
    inverse_kinematics(block_real_position, s)

    HOME_POSITION = [20, -15, 20]
    roll = -math.pi / 2
    traj = RPC.pipline_position_encoder_roll(HOME_POSITION, [40, -30, 20], roll, s)
    traj = RPC.pipline_position_encoder_roll([40, -30, 20], [40, -30, 12], roll, s)
    Pipeline.C_execute([traj])

    the_block = [42, -26, 1, -1.04]
    ball_position = detect_ball(vs)
    ball_position.reverse()

    K = 0.4
    adj = 0
    for pos in ball_position:
        if pos is None:
            continue
        print(pos)
        if pos[1] > -16:
            if pos[0] > 47:
                err = pos[0] - 47
                adj = err * K
            break
    print(adj)
    #print(ball_position)

    commands = Pipeline.Adjust(the_block, adj, s)
    for command in commands:
        print(command)
    Pipeline.C_execute(commands)
def main():
    try:
        debug = ast.literal_eval(sys.argv[1])
    except IndexError:
        debug = True

    if debug:
        print("***************************************\n"
              "\t\t\t DEBUG \n"
              "***************************************\n")

    mouse_config = {
        "organism": "Mouse",
        "interaction_file": "Papers/ncomms9864-s2.xlsx"
    }
    human_config = {
        "organism": "Human",
        "interaction_file": "Papers/ncomms9864-s4.xlsx"
    }

    tmp_dir = utils.make_tmp_dir("Datafiles_Prepare/tmp_dir", parents=True)
    log_dir = "Datafiles_Prepare/Logs/"

    for cnfg in [mouse_config, human_config]:
        organism = cnfg["organism"]
        interaction_file = cnfg["interaction_file"]

        JsonLog.set_filename(
            utils.filename_date_append(
                Path(log_dir) / Path("Darnell_miRNA_target_chimeras_" + organism + ".json")))
        JsonLog.add_to_json('file name', interaction_file)
        JsonLog.add_to_json(
            'paper',
            "miRNA–target chimeras reveal miRNA 3-end pairing as a major determinant of Argonaute target specificity")
        JsonLog.add_to_json('Organism', organism)
        JsonLog.add_to_json('paper_url', "https://www.nature.com/articles/ncomms9864")

        org = Darnell_miRNA_target_chimeras(interaction_file, tmp_dir, organism, debug=debug)
        org.run()

        print("Pipeline start")
        p = Pipeline(paper_name="Darnell_miRNA_target_chimeras",
                     organism=organism,
                     in_df=org.prepare_for_pipeline(),
                     tmp_dir=tmp_dir)
        p.run()

def __init__(self, audiofile, strings=None, filename=None):
    self.filename = filename
    self.audiofile = audiofile
    self.touched = True

    if not strings:
        strings = [-5, -10, -14, -19, -24, -29]

    self.appsinkpipeline = Pipeline.AppSinkPipeline(self.audiofile)
    self.pipeline = Pipeline.Pipeline(self.audiofile)
    self.timeline = Timeline.Timeline(self, strings)
    self.timeline.show_all()
    self.control = VisualizerControl(self.pipeline)
def get_tree(self, entries_only=False, cython_scope=None):
    from AnalysedTreeTransforms import AutoTestDictTransform
    # The AutoTestDictTransform creates the statement "__test__ = {}",
    # which when copied into the main ModuleNode overwrites
    # any __test__ in user code; not desired
    excludes = [AutoTestDictTransform]

    import Pipeline, ParseTreeTransforms
    context = CythonUtilityCodeContext(self.name)
    context.prefix = self.prefix
    context.cython_scope = cython_scope
    #context = StringParseContext(self.name)
    tree = parse_from_strings(self.name, self.impl, context=context,
                              allow_struct_enum_decorator=True)
    pipeline = Pipeline.create_pipeline(context, 'pyx', exclude_classes=excludes)

    if entries_only:
        # truncate the pipeline once declarations have been analysed
        p = []
        for t in pipeline:
            p.append(t)
            if isinstance(t, ParseTreeTransforms.AnalyseDeclarationsTransform):
                break
        pipeline = p

    transform = ParseTreeTransforms.CnameDirectivesTransform(context)
    # InterpretCompilerDirectives already does a cdef declarator check
    #before = ParseTreeTransforms.DecoratorTransform
    before = ParseTreeTransforms.InterpretCompilerDirectives

    pipeline = Pipeline.insert_into_pipeline(pipeline, transform, before=before)

    if self.from_scope:
        def scope_transform(module_node):
            module_node.scope.merge_in(self.from_scope)
            return module_node

        transform = ParseTreeTransforms.AnalyseDeclarationsTransform
        pipeline = Pipeline.insert_into_pipeline(pipeline, scope_transform,
                                                 before=transform)

    (err, tree) = Pipeline.run_pipeline(pipeline, tree, printtree=False)
    assert not err, err
    return tree
def calculateFalsePositiveRate(infiles, outfile):
    '''calculate the false positive rate in taxonomic abundances'''
    # connect to database
    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()

    true_file = infiles[0]
    true_set = set()
    estimate_set = set()
    for estimate_file in infiles[1:]:
        if os.path.basename(estimate_file)[len("metaphlan_"):] == os.path.basename(true_file):
            tablenames = [P.toTable(os.path.basename(true_file)),
                          P.toTable(os.path.basename(estimate_file))]
            for species in cc.execute("""SELECT species_name FROM %s""" % tablenames[0]).fetchall():
                true_set.add(species[0])
            for species in cc.execute("""SELECT taxon FROM %s WHERE taxon_level == 'species'"""
                                      % tablenames[1]).fetchall():
                if species[0].find("_unclassified") != -1:
                    continue
                estimate_set.add(species[0])

    total_estimate = len(estimate_set)
    total_true = len(true_set)

    E.info("counting false positives and false negatives")
    print(estimate_set.difference(true_set))
    nfp = len(estimate_set.difference(true_set))
    nfn = len(true_set.difference(estimate_set))
    ntp = len(estimate_set.intersection(true_set))

    E.info("writing results")
    track = P.snip(os.path.basename(true_file), ".load")
    outf = open(outfile, "w")
    outf.write("track\ttp_rate\tfp_rate\tfn_rate\n")
    outf.write("\t".join(map(str, [track,
                                   float(ntp) / total_estimate,
                                   float(nfp) / total_estimate,
                                   float(nfn) / total_true])) + "\n")
    outf.close()
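# Worked example of the rates written above (set sizes invented for illustration):
# with 40 estimated species of which 30 appear in a 50-species truth set,
# ntp = 30, nfp = 10 and nfn = 20, so tp_rate = 30/40, fp_rate = 10/40 and fn_rate = 20/50.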
def createAlignmentBedFiles(infile, outfile):
    '''create bed files - the intervals are with respect to the reference
    genome; intervals are merged to form a non-redundant alignment set
    '''
    # has to be output from show-coords in tab format
    # and must be sorted for mergeBed
    to_cluster = True
    statement = '''cat %(infile)s
                   | python %(scriptsdir)s/nucmer2bed.py -t bed4 --log=%(outfile)s.log
                   | mergeBed -i -
                   | gzip > %(outfile)s'''
    P.run()

def alignmentTargets(genome_files, contig_files):
    '''generator object to produce filenames for aligning contigs to
    known ncbi genomes
    '''
    parameters = []
    for genome, contig in itertools.product(genome_files, contig_files):
        outfile = os.path.join(
            "alignment.dir",
            P.snip(contig, ".contigs.fa") + "_vs_" + P.snip(os.path.basename(genome), ".fna")) + ".delta"
        additional_input = add_inputs(contig)
        parameters.append([outfile, genome, contig])
    return parameters
def splitFasta(infiles, outfiles):
    '''split fasta file.'''
    infile = infiles[0]
    chunk_size = 500
    statement = '''cat %(infile)s
                   | perl /ifs/devel/andreas/cgat/split_fasta.pl -a blast.dir/chunk_%%s.fasta %(chunk_size)i
                   > split.log'''
    P.run()

def alignContigsToReference(outfile, param1, param2):
    '''align the contigs to the reference genomes using nucmer'''
    to_cluster = True
    reffile, contigfile = param1, param2
    pattern = P.snip(os.path.basename(outfile), ".delta")
    statement = '''nucmer -p %(pattern)s %(reffile)s %(contigfile)s'''
    P.run()
    outf = os.path.basename(outfile)
    statement = '''mv %(outf)s alignment.dir'''
    P.run()

def buildMask(infile, outfile):
    '''build seg mask for protein sequences.'''
    to_cluster = True
    statement = '''segmasker -in %(infile)s
                             -infmt fasta
                             -parse_seqids
                             -outfmt maskinfo_asn1_bin
                             -out %(outfile)s
                   >& %(outfile)s.log'''
    P.run()

def downloadPFAM(infile, outfiles):
    '''download the latest PFAM domain sequence set'''
    outfile1, outfile2 = outfiles
    statement = '''wget -O %(outfile1)s "ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/Pfam-A.fasta.gz"'''
    P.run()
    statement = '''wget -O %(outfile2)s "ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/Pfam-A.seed.gz"'''
    P.run()
def run(self, typeCode, endTime):
    plInstance, histData = plu.Pipeline(histInterval), None
    endTimeUNIX = utl.dateToUNIX(endTime)
    startDate = utl.getCurrentDateStr()
    priorDate = utl.datetimeDiff(startDate, 30)

    marketData = (self.ticker, self.tradeQuantity)
    systemData = (endTimeUNIX, histLag, systemLag, plInstance)

    if self.ticker in cst.GDAX_TICKERS:
        gdaxTicker = cst.GDAX_TO_POLONIEX[self.ticker]
        histData = plInstance.getCryptoHistoricalData(gdaxTicker, priorDate, startDate)
    else:
        raise ValueError('Bad ticker! Supported tickers are BTC, LTC, ETH.')

    self.generateTechIndObjects(histData)
    sysTuple = (marketData, systemData)

    if typeCode == "BT":
        from Pipeline import indsToDF
        techDF = indsToDF(self.techInds)
        positionData = ()
        return self.loopBacktestLogic(positionData, histData, techDF)

    if typeCode == "PT":
        self.loopPaperTradeLogic(*sysTuple, histData)
        return self.endPaperTrading(endCode, sysTuple)
def loadCoverageData(infile, outfile):
    '''load coverage data into database'''
    to_cluster = True
    tablename = P.toTable(outfile)
    database = os.path.join(PARAMS["results_resultsdir"], PARAMS["database"])
    dbh = sqlite3.connect(database)
    cc = dbh.cursor()

    temp = P.getTempFile()
    temp.write("contig_id\tacoverage\n")
    for data in cc.execute("""SELECT contig_id, AVG(coverage)
                              FROM %s GROUP BY contig_id""" % tablename).fetchall():
        # AVG(coverage) comes back as a float, so cast each field before joining
        temp.write("\t".join(map(str, data)) + "\n")
    temp.close()

    P.load(temp.name, outfile)
    os.unlink(temp.name)
def prepareDatabase(infiles, outfile):
    '''prepare the blast database.'''
    fastafile, maskfile = infiles
    to_cluster = True
    statement = '''makeblastdb -in %(fastafile)s
                               -dbtype prot
                               -parse_seqids
                               -mask_data %(maskfile)s
                               -out nrdb50
                               -title "Uniref Protein Database"
                   >& %(outfile)s'''
    P.run()

def removeBlastUnfinished(infiles, outfile):
    '''remove aborted blast runs.'''
    deleted = 0
    for infile in infiles:
        line = IOTools.getLastLine(infile)
        if not re.search("job finished", line):
            fn = infile[:-len(".log")]
            if os.path.exists(fn):
                P.info("deleting %s" % fn)
                os.unlink(fn)
                deleted += 1
    P.info("deleted %i files" % deleted)

def __init__(self, dbReference, connection):
    super().__init__(dbReference, connection)
    self.formatterInstance = plu.Formatter()
    self.gdaxPublicClient = gdax.PublicClient()
    self.spotDataRef = self.dbReference.table('SpotData')
    self.techIndsRef = self.dbReference.table('TechIndicators')
    self.spotPrice, self.spotVolume = (None,) * 2
def test_normalize_row(self):
    input_list = ["1853", "1901", "1909–27", "1800–1900", "1867", "ca. 1785", "1795–1810"]
    output_dict = {}
    for row in input_list:
        date_range = Pipeline.normalize_row(row)
        self.assertEqual(4, date_range[0])
        self.assertEqual(4, date_range[1])
def main(start_fold, gpu):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
    sess = tf.Session(config=config)
    set_session(sess)  # set this TensorFlow session as the default session for Keras

    GetData = DataGenerator(dataset_mode='lr')
    CV = Pipeline(GetData, DL_model, start_fold, gpu, model_name=MODEL_PATH + 'LSTM_model_lr')
    score = CV.train()
    log.info(f'Model accuracy = {score}')
def run_pipeline(source, options, full_module_name=None, context=None):
    import Pipeline

    source_ext = os.path.splitext(source)[1]
    options.configure_language_defaults(source_ext[1:])  # py/pyx
    if context is None:
        context = options.create_context()

    # Set up source object
    cwd = os.getcwd()
    abs_path = os.path.abspath(source)
    full_module_name = full_module_name or context.extract_module_name(source, options)

    if options.relative_path_in_code_position_comments:
        rel_path = full_module_name.replace('.', os.sep) + source_ext
        if not abs_path.endswith(rel_path):
            rel_path = source  # safety measure to prevent printing incorrect paths
    else:
        rel_path = abs_path
    source_desc = FileSourceDescriptor(abs_path, rel_path)
    source = CompilationSource(source_desc, full_module_name, cwd)

    # Set up result object
    result = create_default_resultobj(source, options)

    if options.annotate is None:
        # By default, decide based on whether an html file already exists.
        html_filename = os.path.splitext(result.c_file)[0] + ".html"
        if os.path.exists(html_filename):
            line = codecs.open(html_filename, "r", encoding="UTF-8").readline()
            if line.startswith(u'<!-- Generated by Cython'):
                options.annotate = True

    # Get pipeline
    if source_ext.lower() == '.py' or not source_ext:
        pipeline = Pipeline.create_py_pipeline(context, options, result)
    else:
        pipeline = Pipeline.create_pyx_pipeline(context, options, result)

    context.setup_errors(options, result)
    err, enddata = Pipeline.run_pipeline(pipeline, source)
    context.teardown_errors(err, options, result)
    return result

def plotRelativeAbundanceCorrelations(infiles, outfile):
    '''plot the correlation between the estimated relative abundance of
    species and the true relative abundances - done on the shared set
    '''
    # connect to database
    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()

    true_file = infiles[0]
    temp = P.getTempFile()
    temp.write("true\testimate\n")
    for estimate_file in infiles[1:]:
        if os.path.basename(estimate_file)[len("metaphlan_"):] == os.path.basename(true_file):
            tablenames = [P.toTable(os.path.basename(true_file)),
                          P.toTable(os.path.basename(estimate_file))]
            # get data
            statement = """SELECT a.relab, b.rel_abundance
                           FROM %s as a, %s as b
                           WHERE b.taxon_level == "species"
                           AND a.species_name == b.taxon""" % (tablenames[0], tablenames[1])
            for data in cc.execute(statement).fetchall():
                true, estimate = data[0], data[1]
                temp.write("%f\t%f\n" % (true, estimate))
    temp.close()

    print(temp.name)
    inf = temp.name
    R('''data <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")''' % inf)
    R('''png("%s")''' % outfile)
    main_name = P.snip(outfile, ".png")
    R('''data$estimate <- data$estimate/100''')
    R('''plot(data$estimate, data$true, pch = 16, main = "%s",
         xlab = "estimated relative abundance",
         ylab = "observed relative abundance")''' % main_name)
    R('''text(0.05, y = 0.35, labels = paste("r = ", round(cor(data$estimate, data$true), 2)), cex = 2)''')
    R["dev.off"]()
    os.unlink(inf)
def filterContigsByCoverage(infiles, outfile):
    '''filter contigs by their average base coverage'''
    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()
    for infile in infiles[1:]:
        print(contig_file, P.snip(os.path.basename(infile), ".load"))

def checkBlastRuns(infiles, outfile):
    '''check if output files are complete.'''
    outf = IOTools.openFile(outfile, "w")
    outf.write("chunkid\tquery_first\tquery_last\tfound_first\tfound_last\tfound_total\tfound_results\thas_finished\tattempts\t%s\n" %
               "\t".join(Logfile.RuntimeInformation._fields))

    for infile in infiles:
        E.debug("processing %s" % infile)
        chunkid = P.snip(os.path.basename(infile), ".blast.gz")
        logfile = infile + ".log"
        chunkfile = P.snip(infile, ".blast.gz") + ".fasta"

        with IOTools.openFile(infile) as inf:
            l = inf.readline()
            ids = set()
            total_results = 0
            for l in inf:
                if l.startswith("#//"):
                    continue
                ids.add(int(l.split("\t")[0]))
                total_results += 1
            found_first = min(ids)
            found_last = max(ids)
            found_total = len(ids)

        l = IOTools.getFirstLine(chunkfile)
        query_first = l[1:-1]
        l2 = IOTools.getLastLine(chunkfile, nlines=2).split("\n")
        query_last = l2[0][1:]

        logresults = Logfile.parse(logfile)

        outf.write("\t".join(map(str, (
            chunkid,
            query_first,
            query_last,
            found_first,
            found_last,
            found_total,
            total_results,
            logresults[-1].has_finished,
            len(logresults),
            "\t".join(map(str, logresults[-1]))))) + "\n")

    outf.close()
def chimeraTargets(alignment_files, contig_files):
    '''generator object to produce filenames for scoring chimericity'''
    parameters = []
    for alignment, contig in itertools.product(alignment_files, contig_files):
        outfile = os.path.join("chimeras.dir", P.snip(alignment, ".bam") + ".chimeras")
        parameters.append([outfile, alignment, contig])
    return parameters
def run_pipeline(self):
    with open(self.data_file, encoding='utf8') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_num = 0
        for row in csv_reader:
            if line_num == 0:
                # Create both tables here, with column names taken from the header row
                self.setup_tables(row, insert_columns)
                line_num += 1
            else:
                # Step 1: data cleanup for the first column
                cleaned_row = Pipeline.data_cleanup(row)
                if cleaned_row is not None:
                    # Step 2: normalize the date column
                    date_range = Pipeline.normalize_row(cleaned_row[21])
                    insert_stmt = "INSERT INTO data_normalized(" + insert_columns_a + ") VALUES(" \
                                  + '"{0}"'.format('", "'.join(cleaned_row)) + ",\"" \
                                  + date_range[0] + "\", \"" + date_range[1] + "\")"
                    self.dbobj.insert_table(insert_stmt)
                    # Step 3: calculate running totals per classification
                    Pipeline.running_total(cleaned_row, self.running_total_dict)
                line_num += 1
                #if line_num == 2000:
                #    break

    # Populate classification_totals table with the calculated running totals
    for classification, totals in self.running_total_dict.items():
        insert_stmt = "INSERT INTO classification_totals(Classification, Totals) VALUES (\"" \
                      + classification + "\",\"" + str(totals) + "\")"
        self.dbobj.insert_table(insert_stmt)

    self.dbobj.commit()
    #self.dbobj.select_table("data_normalized")
    #self.dbobj.select_table("classification_totals")
    self.dbobj.close_connection()
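# The INSERT statements above splice values into SQL by string concatenation, which
# breaks on embedded quotes. A minimal sketch of the parameterized form that sqlite3
# itself supports, assuming direct access to a sqlite3 connection (the dbobj wrapper
# in the original may not expose one, so this is illustrative only):
def insert_normalized_sketch(conn, columns_sql, cleaned_row, date_range):
    placeholders = ", ".join(["?"] * (len(cleaned_row) + 2))
    stmt = "INSERT INTO data_normalized(%s) VALUES (%s)" % (columns_sql, placeholders)
    # values are bound by the driver, so no manual quoting is needed
    conn.execute(stmt, list(cleaned_row) + [date_range[0], date_range[1]])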
def main():
    try:
        debug = ast.literal_eval(sys.argv[1])
    except IndexError:
        debug = True

    if debug:
        print("***************************************\n"
              "\t\t\t DEBUG \n"
              "***************************************\n")

    interaction_file = str(Path("Papers/1-s2.0-S1097276516305214-mmc3.xlsx"))
    log_dir = "Datafiles_Prepare/Logs/"
    tmp_dir = utils.make_tmp_dir("Datafiles_Prepare/tmp_dir", parents=True)
    organisms = ["Celegans"]

    for organism in organisms:
        JsonLog.set_filename(
            utils.filename_date_append(
                Path(log_dir) / Path("Pairing_Beyond_Seed_" + organism + ".json")))
        JsonLog.add_to_json('file name', interaction_file)
        JsonLog.add_to_json('paper',
                            "Pairing beyond the Seed Supports MicroRNA Targeting Specificity")
        JsonLog.add_to_json('Organism', organism)
        JsonLog.add_to_json('paper_url',
                            "https://www.sciencedirect.com/science/article/pii/S1097276516305214#mmc3")

        ce = Pairing_Beyond_Seed(input_file=interaction_file,
                                 organism=organism,
                                 tmp_dir=tmp_dir,
                                 debug=debug)
        ce.run()

        p = Pipeline(paper_name="Pairing_Beyond_Seed",
                     organism=organism,
                     in_df=ce.prepare_for_pipeline(),
                     tmp_dir=tmp_dir)
        p.run()

def main():
    try:
        debug = ast.literal_eval(sys.argv[1])
    except IndexError:
        debug = True

    if debug:
        print("***************************************\n"
              "\t\t\t DEBUG \n"
              "***************************************\n")

    interaction_file = str(Path("Papers/41598_2017_7880_MOESM4_ESM.csv"))
    log_dir = "Datafiles_Prepare/Logs/"
    tmp_dir = utils.make_tmp_dir("Datafiles_Prepare/tmp_dir", parents=True)
    organisms = ["Cow"]

    for organism in organisms:
        JsonLog.set_filename(
            utils.filename_date_append(
                Path(log_dir) / Path("Global_Mapping_Cattle_" + organism + ".json")))
        JsonLog.add_to_json('file name', interaction_file)
        JsonLog.add_to_json('paper',
                            "Global mapping of miRNA-target interactions in cattle (Bos taurus)")
        JsonLog.add_to_json('Organism', organism)
        JsonLog.add_to_json('paper_url',
                            "https://www.nature.com/articles/s41598-017-07880-8#MOESM1")

        cow = Global_Mapping_Cattle(input_file=interaction_file, tmp_dir=tmp_dir, debug=debug)
        cow.run()

        p = Pipeline(paper_name="Global_Mapping_Cattle",
                     organism=organism,
                     in_df=cow.prepare_for_pipeline(),
                     tmp_dir=tmp_dir)
        p.run()
def mergeBlast(infiles, outfile):
    '''merge blast results into a single file.'''
    to_cluster = True

    files = [(int(re.match(".*chunk_(\d+).blast.gz", x).groups()[0]), x) for x in infiles]
    files.sort()
    files = " ".join([x[1] for x in files])
    statement = '''zcat %(files)s
                   | awk '$1 == "query_nid" { if(a){ next;} a=1; } {print}'
                   | gzip > %(outfile)s'''
    P.run()

    files = [(int(re.match(".*chunk_(\d+).blast.gz.log", x).groups()[0]), x) for x in infiles]
    files.sort()
    files = " ".join([x[1] for x in files])
    statement = '''cat %(files)s >> %(outfile)s.log'''
    P.run()

def inverse_kinematics(block_real_position, s):
    put_index = 0
    for key in block_real_position:
        grab_position = block_real_position[key]
        if grab_position[2] > 0:
            grab_roll = grab_position[2] - 180
        else:
            grab_roll = grab_position[2]
        grab_roll = grab_roll / 180 * math.pi
        grab_position = [grab_position[0][0] - 5, grab_position[0][1], 1]

        put_position = block_put_position[put_index]
        put_roll = put_position[3]
        put_position = put_position[0:3]

        commands = Pipeline.classical_combi(grab_position, put_position, grab_roll, put_roll, s)
        Pipeline.C_execute(commands)
        put_index += 1

    HOME_POSITION = [20, -15, 20]
    END_POSITION = [40, 20, 20]
    traj = RPC.pipline_position_encoder(HOME_POSITION, END_POSITION, s)
    Pipeline.C_execute([traj])

def main(start_fold, gpu, batch, add_trend, freq_enc):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
    sess = tf.Session(config=config)
    set_session(sess)  # set this TensorFlow session as the default session for Keras

    if add_trend:
        log.info('Will add trend to XEEK Train data')

    GetData = DataGenerator(add_trend=add_trend, dataset_mode='ud')
    CV = Pipeline(GetData, DL_model, start_fold, gpu, batch, model_name=MODEL_PATH + 'LSTM_model_ud')
    score = CV.train(freq_encoder=freq_enc)
    log.info(f'Model accuracy = {score}')

def __init__(self):
    self.UrlAndIDContr = URLSchedul.UrlManager()
    self.downloader = Download.Downloader()
    self.parser = Html_pareser.HtmlPare()
    self.ProceClean = Pipeline.pinline()
    self.outjson = FeeDExport.FeedExp()

    self.CollectAllData = {}
    self.errGeoGet = []
    msgKeys = ["geo_code", "latitude", "longitude", "nation", "province",
               "city", "district", "street", "street_number"]
    for k in msgKeys:
        self.CollectAllData[k] = []

def mapSCOP(infile, outfile):
    '''map scop against sequence database.'''
    to_cluster = True

    max_evalue = 0.00001
    num_results = 100
    mask = 21
    # upper case is critical, otherwise traceback fails!?
    matrix = "BLOSUM50"
    gop = 12
    gep = 2
    dbname = "/tmp/blast/nrdb50"

    num_jobs = 8
    job_options = '-pe dedicated %i -R y' % num_jobs

    statement = '''
    /ifs/devel/andreas/cgat/run.py --log=%(outfile)s.log
    'blastp -query %(infile)s
            -db %(dbname)s
            -evalue %(max_evalue)f
            -num_alignments %(num_results)i
            -num_descriptions %(num_results)i
            -db_soft_mask %(mask)i
            -matrix %(matrix)s
            -gapopen %(gop)i
            -gapextend %(gep)i
            -num_threads %(num_jobs)i
            -outfmt "6 qseqid qstart qend sseqid sstart send evalue bitscore pident score qseq sseq"
     | python /ifs/devel/andreas/cgat/blast2table.py --alignment-format=blocks
     | gzip > %(outfile)s';
    checkpoint;
    echo "#//" | gzip >> %(outfile)s
    '''
    P.run()

def collectGenomeSizes(infile, outfile):
    '''output the genome sizes for each genome'''
    to_cluster = True

    outf = open(outfile, "w")
    outf.write("genome\tlength\n")
    # assume single fasta entry
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        name = P.snip(os.path.basename(infile), ".fna")
        length = len(list(fasta.sequence))
        outf.write("%s\t%s\n" % (name, str(length)))
    outf.close()

def runTest(self):
    alignments, alignments_score = self.aligner.align(
        self.p_str_tokens, self.h_str_tokens, self.weights)
    #print 'Alignments:\n'
    for a in alignments:
        print(a)
    prediction = Pipeline.get_entailment(self.p_str_tokens, self.h, alignments)
    logging.info('Target: %s' % self.target)
    logging.info('Prediction: %s' % prediction)
    self.assertEqual(prediction, self.target)
def runBlast(infile, outfile):
    '''run blast'''
    to_cluster = True

    max_evalue = 1.0
    num_results = 1000000
    mask = 21
    dbsize = 1500000000
    # upper case is critical, otherwise traceback fails!?
    matrix = "BLOSUM50"
    gop = 12
    gep = 2
    dbname = "/tmp/blast/nrdb50"

    statement = '''
    /ifs/devel/andreas/cgat/run.py --log=%(outfile)s.log
    'blastp -query %(infile)s
            -db %(dbname)s
            -evalue %(max_evalue)f
            -num_alignments %(num_results)i
            -num_descriptions %(num_results)i
            -db_soft_mask %(mask)i
            -matrix %(matrix)s
            -gapopen %(gop)i
            -gapextend %(gep)i
            -outfmt "6 qseqid qstart qend sseqid sstart send evalue bitscore pident score qseq sseq"
     | python /ifs/devel/andreas/cgat/blast2table.py --alignment-format=blocks
     | gzip > %(outfile)s';
    checkpoint;
    echo "#//" | gzip >> %(outfile)s
    '''
    P.run()

def runTest(self):
    start = time()
    alignments, alignments_score = self.aligner.align(
        self.p_str_tokens, self.h_str_tokens, self.weights)
    print("Alignment %s" % (time() - start))
    #print 'Alignments:\n'
    for a in alignments:
        print(a)
    prediction = Pipeline.get_entailment(self.p_str_tokens, self.h, alignments)
    logging.info('Target: %s' % self.target)
    logging.info('Prediction: %s' % prediction)
    print('Answer: %s' % self.answer[prediction])
    self.assertEqual(prediction, self.target)

def buildAlignmentSizes(infiles, outfile):
    '''use bed files to sum the total number of bases
    that are aligned to the genomes
    '''
    outf = open(outfile, "w")
    outf.write("genome\tsize\n")
    for infile in infiles:
        genome = P.snip(os.path.basename(infile), ".bed.gz")
        c = 0
        inf = IOTools.openFile(infile)
        for bed in Bed.iterator(inf):
            c += bed.end - bed.start
        outf.write("%s\t%s\n" % (genome, str(c)))
    outf.close()
###################################################################
###################################################################
###################################################################
# Run configuration script

import glob
import itertools

from SphinxReport.Utils import PARAMS as P

EXPORTDIR = P['medip_exportdir']
DATADIR = P['medip_datadir']
DATABASE = P['medip_backend']

###################################################################
# cf. pipeline_medip.py
# This should be automatically gleaned from pipeline_chipseq.py
###################################################################
import Pipeline
PARAMS_PIPELINE = Pipeline.peekParameters(".", "pipeline_medip.py")

import PipelineTracks

Sample = PipelineTracks.Sample3

suffixes = ["export.txt.gz",
            "sra",
            "fastq.gz",
            "fastq.1.gz",
            "csfasta.gz"]

TRACKS = sum(
    itertools.chain(
        [PipelineTracks.Tracks(Sample).loadFromDirectory(
            [x for x in glob.glob("%s/*.%s" % (DATADIR, s)) if "input" not in x],
            "%s/(\S+).%s" % (DATADIR, s))
         for s in suffixes]),
    PipelineTracks.Tracks(Sample))
def runCharacterize(cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads):
    '''Load Pipeline files from first arg; configure CharacterizeModule;
    run alignments if runaligns; report on those alignments or the xmap
    provided as xmappath.
    '''
    printargs = True

    if not os.path.isfile(os.path.join(cwd, "utilities.py")):
        print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    if not util.checkFile(os.path.join(cwd, "Pipeline.py")):
        print "Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import Pipeline

    if not util.checkFile(os.path.join(cwd, "CharacterizeModule.py")):
        print "CharacterizeModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import CharacterizeModule as cm

    #if not util.checkFile(os.path.join(cwd, "MapClassesRev.py")):
    #    print "MapClassesRev.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
    #    sys.exit(1)
    #import MapClassesRev

    # use Pipeline objects
    varsP = Pipeline.varsPipeline()

    varsP.optArgumentsFileIn = optargs
    varsP.RefAlignerBin = rabin
    varsP.latestMergedCmap = os.path.join(contigdir, contigbase + ".cmap")  # file suffix required to be .cmap
    varsP.contigFolder = contigdir
    varsP.nThreads = nthreads  # necessary otherwise job won't start
    varsP.ref = refcmap
    varsP.stdoutlog = True  # enable -stdout -stderr args to RefAligner
    varsP.curCharacterizeCmaps = [varsP.latestMergedCmap]

    if runaligns:
        varsP.contigAlignTarget = contigdir + "/alignref"  # this is output dir
        varsP.runSV = False
        varsP.groupContigs = False
        varsP.stageComplete = contigbase
        varsP.outputContigFolder = contigdir
        varsP.memoryLogpath = os.path.join(contigdir, "memory_log.txt")
        varsP.pipeReportFile = os.path.join(contigdir, "pipeReport.txt")
        varsP.parseArguments()  # parses optArgumentsFile
        if printargs:
            print "\nRunning Characterization with arguments:\n" + " ".join(varsP.argsListed('characterizeDefault')) + '\n'
        if hasattr(util, "InitStatus"):  # if old version, skip
            util.InitStatus(os.path.join(contigdir, "status.xml"))  # needed otherwise call to status_log fails
        charmod = cm.Characterize(varsP)  # create Characterize object from CharacterizeModule -- this also calls generateJobList
        xmappath = charmod.xmapTarget  # set in Characterize.generateJobList
        charmod.runJobs()
    else:
        #varsP.contigAlignTarget = contigdir  # this is dir in which _q and _r cmaps must be located -- contigdir is from cmap; this should be from xmap
        varsP.contigAlignTarget = os.path.split(xmappath)[0]
        print "Loading alignments from\n" + xmappath + "\n"

    # no longer using this in Pipeline
    #print MapClassesRev.TopLevelCharacterization(varsP, [os.path.join(varsP.contigAlignTarget, contigbase)])
    print cm.characterizeContigs(varsP, xmappath)