def __call__(self, identifier, partner_identifier, row_target, col_target, scale, reduction_axis):
    scale_dir = ensure_dir(os.path.join(self.work_dir, str(scale)))
    next_scale_dir = ensure_dir(os.path.join(self.work_dir, str(scale + 1)))
    temp_dir = ensure_dir(os.path.join(scale_dir, "work"))
    input_img = [identifier, partner_identifier]
    axis_name = ['rows', 'cols']
    print("Partners over {}: {} and {}".format(
        axis_name[reduction_axis],
        os.path.basename(identifier),
        os.path.basename(partner_identifier)))
    target_name = "{}_{}".format(row_target, col_target)
    target_file = "{}.{}".format(target_name, img_data_fmt)
    dir_stitch = setup_directories(os.path.join(temp_dir, target_name))
    stitched_img = os.path.join(next_scale_dir, target_file)
    if reduction_axis == 0:
        success = assemble_row(dir_stitch, input_img, stitched_img)
    else:
        success = assemble_column(dir_stitch, input_img, stitched_img)
    return success, stitched_img
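# --- Editor's note (assumption) ---
# Every snippet in this collection calls an `ensure_dir` helper that is never
# shown. The sketch below is a guess at its contract, for orientation only:
# the projects sampled here each ship their own version, and the details
# differ (single path vs. list of paths, whether the path is returned,
# Python 2 vs. Python 3).
import errno
import os


def ensure_dir(paths):
    """Create the given directory (or directories) if missing.

    Hypothetical reconstruction: accepts a single path or a list of paths,
    ignores "already exists" errors, and returns the (last) path so callers
    can write `d = ensure_dir(...)`.
    """
    if isinstance(paths, (list, tuple)):
        for p in paths:
            ensure_dir(p)
        return paths[-1] if paths else None
    try:
        os.makedirs(paths)
    except OSError as exc:
        if exc.errno != errno.EEXIST:  # only swallow "directory exists"
            raise
    return paths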
def mux_batch_blast(dataset, bin_type):
    """Send batch jobs to Blast. Muxes to multiple reference DBs."""
    # Identify the genome
    nickname = dataset['nickname']
    # Determine the input file root
    root_dir = dirs['parcels_dir']+nickname+"/"
    file_root = root_dir+nickname+bin_type
    # Identify the references to blast against
    ref_nicks = dataset['ref_nicks']
    for ref_nick in ref_nicks:
        # Identify Blast DB
        db_path = dirs['blast_db_dir']+ref_nick
        # Prep output directory
        out_dir = dirs['blast_out_dir']+nickname+"/"+ref_nick+"/"
        ensure_dir(out_dir)
        # Signal process start
        print "--- Blasting", nickname+bin_type, "against", ref_nick, "---"
        print datetime.now()
        index = 1
        while os.path.isfile(file_root+"_"+str(index)+".fas"):
            query_file = file_root+"_"+str(index)+".fas"
            outfile = out_dir+nickname+bin_type+"_"+str(index)+"_blast.out"
            print "\tblasting", query_file
            local_blastn_2file(query_file, db_path, outfile, blast_prefs)
            index += 1
        print "--- Finished BLAST run ---"
        print datetime.now()
        # index overshoots by one when the while loop exits
        print index-1, "parcel files blasted"
    return "OK"
def build_a_bear():
    """
    the output of bear differs between versions, so we build the latest bear
    rather than trying to support multiple versions.
    FIXME: might be better to handle multiple versions instead.
    """
    if os.path.isdir(c.BEAR_PREFIX):
        logging.debug("skipping Bear installation")
        return

    # download
    if not os.path.isfile(c.BEAR_ARCHIVE):
        curl = get_cmd_or_die("curl")
        curl['-s', c.BEAR_URL, '-o', c.BEAR_ARCHIVE] & pb.TEE

    # remove any existing build dir since we don't know if
    # bear was built for the current host environment.
    if os.path.isdir(c.BEAR_SRC):
        shutil.rmtree(c.BEAR_SRC, ignore_errors=True)

    # unpack
    tar = get_cmd_or_die("tar")
    with pb.local.cwd(c.DEPS_DIR):
        tar['xf', c.BEAR_ARCHIVE] & pb.TEE

    # cmake
    bear_build_dir = os.path.join(c.BEAR_SRC, "build")
    bear_install_prefix = "-DCMAKE_INSTALL_PREFIX=" + c.BEAR_PREFIX
    ensure_dir(bear_build_dir)
    with pb.local.cwd(bear_build_dir):
        cmake = get_cmd_or_die("cmake")
        cmake["..", bear_install_prefix] & pb.TEE
        make = get_cmd_or_die("make")
        make["install"] & pb.TEE
def apply_blankfield(files, blank_field_file, dest_dir, percentage=100,
                     blur_alpha=0.7, threads=1, is_dark=False):
    files = sorted(files)
    ensure_dir(dest_dir)
    blank_field = image_load_resize(blank_field_file, percentage)
    if blur_alpha > 0:
        blank_field = omomorphic_shading_extraction(blank_field, blur_alpha)
    blankfield_wgt = get_blankfield_weights(blank_field)
    ims = (image_load_resize(i, percentage) for i in files)
    job_args = [(blankfield_wgt, im_i, f, dest_dir)
                for f, im_i in itertools.izip(files, ims)]
    if threads == 1:
        results = [apply_blankfield_weights(*args) for args in job_args]
    else:
        pool = mp.Pool(processes=threads)
        jobs = [pool.apply_async(apply_blankfield_weights, args)
                for args in job_args]
        pool.close()
        pool.join()
        results = [job.get() for job in jobs]
    success = all(results)
    return success
def generate_write_blankfield(files_in, output_dir, percentage=100, threads=1):
    bfield = generate_blankfield(files_in, int(percentage), threads)
    ensure_dir(output_dir)
    success = cv2.imwrite(os.path.join(output_dir, "blankfield-stat.png"), bfield)
    return success
def simple_q2a(dataset, trim_file):
    """Convert a trimmed FastQ file to multifasta, tracking read IDs."""
    # Identify the genome
    nickname = dataset['nickname']
    # Identify the trim file type
    ttype = trim_file['type']
    # Identify the source file
    source_file = trim_file['name']
    # Prep output files
    dir_root = dirs['mft_dir']+nickname+"/"
    ensure_dir(dir_root)
    out_file = dir_root+nickname+ttype+'.fas'
    track_file = dir_root+nickname+ttype+'_track.txt'
    # Save filenames for later reference
    dataset['mft_files'].append({'type': ttype,
                                 'name': out_file,
                                 'track': track_file})
    # Signal the process start
    print "-- Converting", nickname+ttype, "to multifasta --"
    print datetime.now()
    # Set up iterator
    multifasta = open(out_file, 'w')
    tracker = open(track_file, 'w')
    read_count = 0
    for title, seq, qual in FastqGeneralIterator(open(source_file)):
        read_count += 1
        id_string = nickname+"_"+str(read_count)
        multifasta.write(">"+id_string+"\n"+seq+"\n")
        tracker.write(title+"\t"+id_string+"\n")
        if read_count % 100000 == 0:
            print "\t"+str(read_count), "reads processed"
    multifasta.close()
    tracker.close()
    return read_count
def chop_multifasta(dataset, mft_file):
    """Split a master file into smaller multifasta files.

    This is useful for making reasonably-sized batch BLAST jobs.
    Iterator function adapted from http://biopython.org/wiki/Split_large_file
    """
    # Identify the genome
    nickname = dataset['nickname']
    # Identify the trim file type
    ttype = mft_file['type']
    # Prep output file
    dir_root = dirs['parcels_dir']+nickname+"/"
    out_file_root = dir_root+nickname+ttype
    ensure_dir(dir_root)
    # Save filenames for later reference
    dataset['parcel_files'] = {'root': out_file_root, 'suffix': '.fas'}
    # Unpack chopping parameters
    parceln = chop_param['parceln']
    # Signal the process start
    print "-- Splitting", nickname+ttype, "into batches of", parceln, "reads --"
    print datetime.now()
    # Set up iterator function
    record_iter = SeqIO.parse(open(mft_file['name']), "fasta")
    parcel_files = []
    for i, batch in enumerate(batch_iterator(record_iter, parceln)):
        filename = out_file_root+"_%i.fas" % (i+1)
        handle = open(filename, "w")
        count = SeqIO.write(batch, handle, "fasta")
        handle.close()
        parcel_files.append(filename)
        print "\twrote %i records to %s" % (count, filename)
    print "-- Finished --"
    print datetime.now()
    return len(parcel_files)
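# --- Editor's note (assumption) ---
# The docstring above points at the Biopython wiki's batch_iterator recipe,
# but the helper itself is not part of this collection. The sketch below is
# an approximation of that recipe written from memory, not the project's
# actual implementation.
def batch_iterator(iterator, batch_size):
    """Yield lists of up to batch_size entries from any iterator.

    Works on SeqRecord iterators as used by chop_multifasta above.
    """
    batch = []
    for entry in iterator:
        batch.append(entry)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # emit the final, possibly short, batch
        yield batch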
def _main():
    if on_mac():
        die("Cross-checking is only supported on Linux hosts.")

    setup_logging()
    logging.debug("args: %s", " ".join(sys.argv))

    # earlier plumbum versions are missing features such as TEE
    if pb.__version__ < c.MIN_PLUMBUM_VERSION:
        err = "locally installed version {} of plumbum is too old.\n" \
            .format(pb.__version__)
        err += "please upgrade plumbum to version {} or later." \
            .format(c.MIN_PLUMBUM_VERSION)
        die(err)

    args = _parse_args()
    if args.clean_all:
        logging.info("cleaning all dependencies and previous built files")
        shutil.rmtree(c.CLANG_XCHECK_PLUGIN_BLD, ignore_errors=True)
        make = get_cmd_or_die('make')
        with pb.local.cwd(c.LIBFAKECHECKS_DIR):
            make('clean')

    # clang 3.6.0 is known to work; 3.4.0 known to not work.
    ensure_clang_version([3, 6, 0])
    # NOTE: it seems safe to disable this check now that we use a
    # rust-toolchain file for rustc versioning.
    # ensure_rustc_version(c.CUSTOM_RUST_RUSTC_VERSION)

    ensure_dir(c.CLANG_XCHECK_PLUGIN_BLD)
    ensure_dir(c.BUILD_DIR)
    git_ignore_dir(c.BUILD_DIR)

    build_clang_plugin(args)
def test_js_engine_path(self):
    # Test that running JS commands works for node, d8, and jsc and is not path dependent
    restore_and_set_up()

    sample_script = test_file('print_args.js')

    # Fake some JS engines
    # Note that the path contains 'd8'.
    test_path = self.in_dir('fake', 'abcd8765')
    ensure_dir(test_path)

    jsengines = [('d8', config.V8_ENGINE),
                 ('d8_g', config.V8_ENGINE),
                 ('js', config.SPIDERMONKEY_ENGINE),
                 ('node', config.NODE_JS),
                 ('nodejs', config.NODE_JS)]
    for filename, engine in jsengines:
        try_delete(SANITY_FILE)
        if type(engine) is list:
            engine = engine[0]
        if not engine:
            print('WARNING: Not testing engine %s, not configured.' % (filename))
            continue

        print(filename, engine)
        test_engine_path = os.path.join(test_path, filename)
        with open(test_engine_path, 'w') as f:
            f.write('#!/bin/sh\n')
            f.write('exec %s $@\n' % (engine))
        make_executable(test_engine_path)

        out = self.run_js(sample_script, engine=test_engine_path, args=['--foo'])
        self.assertEqual('0: --foo', out.strip())
def __init__(self, conf, uuid, namespace=None, service=None, pids_path=None,
             default_cmd_callback=None, cmd_addl_env=None, pid_file=None,
             run_as_root=False):

    self.conf = conf
    self.uuid = uuid
    self.namespace = namespace
    self.default_cmd_callback = default_cmd_callback
    self.cmd_addl_env = cmd_addl_env
    self.pids_path = pids_path or self.conf.external_pids
    self.pid_file = pid_file
    self.run_as_root = run_as_root

    if service:
        self.service_pid_fname = 'pid.' + service
        self.service = service
    else:
        self.service_pid_fname = 'pid'
        self.service = 'default-service'

    common_utils.ensure_dir(os.path.dirname(self.get_pid_file_name()))
def _main():
    setup_logging()
    logging.debug("args: %s", " ".join(sys.argv))

    # FIXME: allow env/cli override of LLVM_SRC and LLVM_BLD
    # FIXME: check that cmake and ninja are installed
    # FIXME: option to build LLVM/Clang from master?

    args = _parse_args()

    if args.clean_all:
        logging.info("cleaning all dependencies and previous built files")
        shutil.rmtree(c.LLVM_SRC, ignore_errors=True)
        shutil.rmtree(c.LLVM_BLD, ignore_errors=True)
        shutil.rmtree(c.BUILD_DIR, ignore_errors=True)
        shutil.rmtree(c.AST_EXPO_PRJ_DIR, ignore_errors=True)
        cargo = get_cmd_or_die("cargo")
        with pb.local.cwd(c.ROOT_DIR):
            invoke(cargo, "clean")

    ensure_dir(c.LLVM_BLD)
    ensure_dir(c.BUILD_DIR)
    git_ignore_dir(c.BUILD_DIR)

    download_llvm_sources()
    configure_and_build_llvm(args)
    build_transpiler(args)
    print_success_msg(args)
def _main():
    setup_logging()
    logging.debug("args: %s", " ".join(sys.argv))

    # earlier plumbum versions are missing features such as TEE
    if pb.__version__ < c.MIN_PLUMBUM_VERSION:
        err = "locally installed version {} of plumbum is too old.\n" \
            .format(pb.__version__)
        err += "please upgrade plumbum to version {} or later." \
            .format(c.MIN_PLUMBUM_VERSION)
        die(err)

    args = _parse_args()
    if args.clean_all:
        logging.info("cleaning all dependencies and previous built files")
        shutil.rmtree(c.CLANG_XCHECK_PLUGIN_BLD, ignore_errors=True)

    # prerequisites
    if not have_rust_toolchain(c.CUSTOM_RUST_NAME):
        die("missing rust toolchain: " + c.CUSTOM_RUST_NAME, errno.ENOENT)

    # clang 3.6.0 is known to work; 3.4.0 known to not work.
    ensure_clang_version([3, 6, 0])
    ensure_rustc_version(c.CUSTOM_RUST_RUSTC_VERSION)

    ensure_dir(c.CLANG_XCHECK_PLUGIN_BLD)
    ensure_dir(c.DEPS_DIR)
    git_ignore_dir(c.DEPS_DIR)

    build_clang_plugin(args)
def importNavteq(options): ensure_dir(options.output_dir) netconvert = sumolib.checkBinary('netconvert') polyconvert = sumolib.checkBinary('polyconvert') for idx, config in enumerate(options.config.split(",")): netconvert_call = [netconvert, '--output-file', options.netfile, '-c', config] if idx > 0: tmp_net = os.path.join( options.output_dir, options.net_prefix + "_tmp.net.xml") os.rename(options.netfile, tmp_net) netconvert_call += ['--sumo-net-file', tmp_net] else: netconvert_call += ['--dlr-navteq', options.prefix] if options.verbose: print(' '.join(netconvert_call)) sys.stdout.flush() subprocess.call(netconvert_call) polyconvertCmd = [ polyconvert, '--verbose', '--dlr-navteq-poly-files', options.prefix + '_polygons.txt', #'--dlr-navteq-poi-files', options.prefix + '_points_of_interest.txt', '--output', os.path.join(options.output_dir, "shapes.xml"), '-n', options.netfile ] if options.verbose: print(polyconvertCmd) sys.stdout.flush() subprocess.call(polyconvertCmd)
def annot_ref(ref_name, ctg_fas, prot_db_name, fixed_dirs, project_id, blast_prefs): """Annotate reference contig (predict ORFs and assign function).""" # locate the COG database prot_db = fixed_dirs['ref_dbs_dir']+prot_db_name # set inputs and outputs g_gbk_ctgs_root = fixed_dirs['gbk_contigs_dir']+ref_name+"/" ctg_cds_root = fixed_dirs['ctg_cds_dir']+ref_name+"/" ctg_prot_root = fixed_dirs['ctg_prot_dir']+ref_name+"/" ctg_blast_root = fixed_dirs['ctg_blast_dir']+ref_name+"/" annot_trn_root = fixed_dirs['annot_trn_dir'] ensure_dir([g_gbk_ctgs_root, ctg_cds_root, ctg_prot_root, ctg_blast_root, annot_trn_root]) trn_file = annot_trn_root+ref_name+"_annot.trn" g_ctg_gbk = g_gbk_ctgs_root+ref_name+"_1.gbk" annot_gbk = ctg_cds_root+ref_name+"_1_cds.gbk" annot_aa = ctg_prot_root+ref_name+"_1_aa.fas" blast_out = ctg_blast_root+ref_name+"_1.xml" if path.exists(blast_out) and os.stat(blast_out)[6]==0: os.remove(blast_out) if not path.exists(g_ctg_gbk): l_tag_base = ref_name+"_1" record = annot_ctg(ctg_fas, ctg_fas, annot_gbk, annot_aa, trn_file, prot_db, blast_out, l_tag_base, blast_prefs) record.description = ref_name+"_re-annotated" record.name = ref_name+"_1" record.dbxrefs = ["Project: "+project_id+"/"+ref_name +"-like backbones"] record.seq.alphabet = generic_dna write_genbank(g_ctg_gbk, record) else: record = load_genbank(g_ctg_gbk) return record
def _get_conf_base(cfg_root, uuid, ensure_conf_dir):
    # TODO(mangelajo): separate responsibilities here, ensure_conf_dir
    #                  should be a separate function
    conf_dir = os.path.abspath(os.path.normpath(cfg_root))
    conf_base = os.path.join(conf_dir, uuid)
    if ensure_conf_dir:
        common.ensure_dir(conf_dir)
    return conf_base
def run_fires(site, region): url = site + '/jobs/archive' try: p = getPage(url) except Exception as e: logging.error("Can't load {}".format(url)) logging.error(e) return None a = p.findAll('a') zips = [x.get('href') for x in a if x.get('href').endswith('.zip')] fires = sorted(set([x[x.rindex('/') + 1:x.index('_')] for x in zips])) times = {} recent = {} simtimes = {} dates = [] totaltime = 0 dir_download = common.ensure_dir(os.path.join(DIR, region)) dir_ext = common.ensure_dir(os.path.join(EXT_DIR, region)) logging.debug("Checking {} fires".format(len(fires))) for f in fires: times[f] = [ datetime.datetime.strptime(x[x.rindex('_') + 1:x.rindex('.')], '%Y%m%d%H%M%S%f') for x in zips if x[x.rindex('/') + 1:x.index('_')] == f ] recent[f] = { 'time': max(times[f]), 'url': [ x for x in zips if x[x.rindex('/') + 1:x.index('_')] == f and datetime.datetime.strptime( x[x.rindex('_') + 1:x.rindex('.')], '%Y%m%d%H%M%S%f') == max(times[f]) ][0], } logging.debug('{}: {}'.format(f, recent[f]['time'])) z = common.save_http(dir_download, site + recent[f]['url'], ignore_existing=True) cur_dir = os.path.join(dir_ext, os.path.basename(z)[:-4]) common.unzip(z, cur_dir) fgmj = os.path.join(cur_dir, 'job.fgmj') if os.path.exists(fgmj): try: t0 = timeit.default_timer() log_name = firestarr.do_run(fgmj) t1 = timeit.default_timer() if log_name is not None: simtimes[f] = t1 - t0 totaltime = totaltime + simtimes[f] logging.info("Took {}s to run {}".format(simtimes[f], f)) d = os.path.basename(os.path.dirname(log_name))[:8] if d not in dates: dates.append(d) except Exception as e: logging.error(e) return simtimes, totaltime, dates
def make_genome_DB(genome, fixed_dirs):
    """Make a Blast DB from a genome FastA file."""
    # load inputs
    fas_dir = fixed_dirs['mfas_contigs_dir']
    db_dir = fixed_dirs['blast_db_dir']
    ensure_dir([fas_dir, db_dir])
    g_name = genome['name']
    # make DB
    make_blastDB(db_dir+g_name, fas_dir+g_name+'_contigs.fas', 'nucl')
def _main(): setup_logging() logging.debug("args: %s", " ".join(sys.argv)) # FIXME: allow env/cli override of LLVM_SRC, LLVM_VER, and LLVM_BLD # FIXME: check that cmake and ninja are installed # FIXME: option to build LLVM/Clang from master? # earlier plumbum versions are missing features such as TEE if pb.__version__ < c.MIN_PLUMBUM_VERSION: err = "locally installed version {} of plumbum is too old.\n" \ .format(pb.__version__) err += "please upgrade plumbum to version {} or later." \ .format(c.MIN_PLUMBUM_VERSION) die(err) args = _parse_args() if args.clean_all: logging.info("cleaning all dependencies and previous built files") shutil.rmtree(c.LLVM_SRC, ignore_errors=True) shutil.rmtree(c.LLVM_BLD, ignore_errors=True) shutil.rmtree(c.DEPS_DIR, ignore_errors=True) # prerequisites if not have_rust_toolchain(c.CUSTOM_RUST_NAME): die("missing rust toolchain: " + c.CUSTOM_RUST_NAME, errno.ENOENT) # clang 3.6.0 is known to work; 3.4.0 known to not work. ensure_clang_version([3, 6, 0]) ensure_rustc_version(c.CUSTOM_RUST_RUSTC_VERSION) ensure_dir(c.LLVM_BLD) ensure_dir(c.DEPS_DIR) git_ignore_dir(c.DEPS_DIR) if on_linux(): build_a_bear() if not os.path.isfile(c.BEAR_BIN): die("bear not found", errno.ENOENT) download_llvm_sources() integrate_ast_exporter() cc_db = install_tinycbor() configure_and_build_llvm(args) # NOTE: we're not doing this anymore since it is # faster and takes less space to simply pull the # prebuilt nightly binaries with rustup # download_and_build_custom_rustc(args) build_ast_importer(args.debug) if not on_mac() and args.sanity_test: test_ast_exporter(cc_db)
def make_ref_DB(reference, run_id, fixed_dirs, r_root_dir, run_dirs):
    """Make a Blast DB from a reference FastA file."""
    # load inputs
    fas_dir = r_root_dir+run_id+"/"+run_dirs['ref_fas_dir']
    db_dir = fixed_dirs['blast_db_dir']
    ensure_dir([fas_dir, db_dir])
    g_name = reference['name']
    # make DB
    make_blastDB(db_dir+g_name, fas_dir+g_name+'.fas', 'nucl')
def _main(): setup_logging() logging.debug("args: %s", " ".join(sys.argv)) # FIXME: allow env/cli override of LLVM_SRC, LLVM_VER, and LLVM_BLD # FIXME: check that cmake and ninja are installed # FIXME: option to build LLVM/Clang from master? # earlier plumbum versions are missing features such as TEE if pb.__version__ < c.MIN_PLUMBUM_VERSION: err = "locally installed version {} of plumbum is too old.\n" \ .format(pb.__version__) err += "please upgrade plumbum to version {} or later." \ .format(c.MIN_PLUMBUM_VERSION) die(err) args = _parse_args() # prerequisites if not have_rust_toolchain(c.CUSTOM_RUST_NAME): die("missing rust toolchain: " + c.CUSTOM_RUST_NAME, errno.ENOENT) # clang 3.6.0 is known to work; 3.4.0 known to not work. ensure_clang_version([3, 6, 0]) if args.clean_all: logging.info("cleaning all dependencies and previous built files") shutil.rmtree(c.LLVM_SRC, ignore_errors=True) shutil.rmtree(c.LLVM_BLD, ignore_errors=True) shutil.rmtree(c.DEPS_DIR, ignore_errors=True) shutil.rmtree(c.AST_EXPO_PRJ_DIR, ignore_errors=True) cargo = get_cmd_or_die("cargo") with pb.local.cwd(c.ROOT_DIR): invoke(cargo, "clean") ensure_dir(c.LLVM_BLD) ensure_dir(c.DEPS_DIR) git_ignore_dir(c.DEPS_DIR) download_llvm_sources() update_cmakelists() configure_and_build_llvm(args) build_transpiler(args) # print a helpful message on how to run c2rust bin directly c2rust_bin_path = 'target/debug/c2rust' if args.debug \ else 'target/release/c2rust' c2rust_bin_path = os.path.join(c.ROOT_DIR, c2rust_bin_path) # if os.path.curdir abs_curdir = os.path.abspath(os.path.curdir) common_path = os.path.commonpath([abs_curdir, c2rust_bin_path]) if common_path != "/": c2rust_bin_path = "." + c2rust_bin_path[len(common_path):] print("success! you may now run", c2rust_bin_path)
def make_fake_llc(filename, targets):
    """Create a fake llc that only handles --version and writes target
    list to stdout.
    """
    print('make_fake_llc: %s' % filename)
    ensure_dir(os.path.dirname(filename))
    with open(filename, 'w') as f:
        f.write('#!/bin/sh\n')
        f.write('echo "llc fake output\nRegistered Targets:\n%s"' % targets)
    make_executable(filename)
def extract_natives(self): if not self.metadata: self.get_meta() natives_tmpdir = os.path.join(self.version_directory, 'natives-' + str(int(time.time()))) ensure_dir(natives_tmpdir) for lib in self.metadata['libraries']: skiplib = False # Check Rules if 'rules' in lib: for rule in lib['rules']: if 'action' in rule: if rule['action'] == 'allow' and 'os' in rule: if not rule['os']['name'] == platform(): skiplib = True if rule['action'] == 'noallow' and 'os' in rule: if rule['os']['name'] == platform(): skiplib = True if skiplib: continue # Skip non-download-included for now if not 'downloads' in lib: continue dl = lib['downloads'] if 'natives' in lib and 'extract' in lib: if platform() in lib['natives']: platform_native = lib['natives'][platform()] if platform_native in dl['classifiers']: try: zip_ref = zipfile.ZipFile( os.path.join( self.client_root, 'libraries', dl['classifiers'][platform_native] ['path']), 'r') zip_ref.extractall(natives_tmpdir) zip_ref.close() except Exception as e: print( 'Failed to extract native library %s due to errors.' % (lib['name'])) raise e metainf = os.path.join(natives_tmpdir, 'META-INF') if os.path.exists(metainf): shutil.rmtree(metainf) self.natives = natives_tmpdir
def start_new_game(): # ask for player names try: # ask for name of game game_name = input("Enter name of game (names with the same name will be overwritten): ") # ask for number of worlds num_worlds = int( input("Enter number of worlds (the less, the earlier the complete wavefunction collapse will happen): ")) # i = 1 name = "temp" print( "Entering names of player. Press enter without a name if you're finished. If you need to make changes later," "you can opt to edit the configuration directly") player_names = [] while name != "": print() name = input("Please enter name of player %02d: " % i) if name != "": player_names.append(name) i += 1 # ask for number of villagers, seer, players while True: num_villagers = int(input("Enter number of villagers: ")) num_wolves = int(input("Enter number of wolves: ")) num_seers = int(input("Enter number of seers: ")) if (num_seers + num_wolves + num_villagers) != len(player_names): print("Number of players / number of roles mismatch.") else: break except ValueError: print("Invalid input. Please repeat the setup process.") start_new_game() return # save to configuration game_config = { "name": game_name, "num_worlds": num_worlds, "players": player_names, "num_villagers": num_villagers, "num_wolves": num_wolves, "num_seers": num_seers } game_dir = os.path.join("games", game_name) ensure_dir(game_dir) game_config["game_dir"] = game_dir with open(os.path.join(game_dir, "config"), "w+") as f: json.dump(game_config, f, indent=2) print("Setup completed successfully.") # load game from config game = Game(dict_to_game_config(game_config)) play_game(game, game_config, True, 0)
def make_fake_tool(filename, version, report_name=None):
    if not report_name:
        report_name = os.path.basename(filename)
    print('make_fake_tool: %s' % filename)
    ensure_dir(os.path.dirname(filename))
    with open(filename, 'w') as f:
        f.write('#!/bin/sh\n')
        f.write('echo "%s version %s"\n' % (report_name, version))
        f.write('echo "..."\n')
        f.write('exit 0\n')
    make_executable(filename)
def ensure_directory_exists_without_file(path):
    dirname = os.path.dirname(path)
    if os.path.isdir(dirname):
        try:
            os.unlink(path)
        except OSError:
            with excutils.save_and_reraise_exception() as ctxt:
                if not os.path.exists(path):
                    ctxt.reraise = False
    else:
        common.ensure_dir(dirname)
def make_qiime_reports(dataset, run_id): """Generate HTML output for reporting with Qiime. Makes an interactive heatmap, a Cytoscape network and a summary of community composition. """ # identify inputs and outputs set_id = dataset['set_id'] print " ", set_id run_root = root_dir+set_id+"/"+run_id+"/" otus_dir = run_root+dirs['otus'] table_file = otus_dir+run_id+"_otu_table.txt" map_file = otus_dir+run_id+"_map.txt" heatmap_dir = run_root+dirs['reports']+"otu_heatmap" network_dir = run_root+dirs['reports'] wf_taxa_sum = run_root+dirs['reports']+"communities" ensure_dir(heatmap_dir) ensure_dir(network_dir) # generate a dummy Qiime map file dummy_comps = ["#SampleID Barcode Primer Treat DOB Descript", "#Dummy map file to make Qiime happy"] for sample_id in dataset['samples']: dummy_comps.append("\t".join([sample_id,"NA","NA","NA","NA","NA"])) open(map_file, 'w').write("\n".join(dummy_comps)) # make OTU heatmap comps = ["macqiime", "make_otu_heatmap_html.py", "-i", table_file, "-o", heatmap_dir] cline = " ".join(comps) try: child = subprocess.Popen(str(cline), stdout=subprocess.PIPE, shell=True) output, error = child.communicate() except: raise else: print "\t", "OTU heatmap generated" # make OTU network comps = ["macqiime", "make_otu_network.py", "-i", table_file, "-m", map_file, "-o", network_dir] cline = " ".join(comps) try: child = subprocess.Popen(str(cline), stdout=subprocess.PIPE, shell=True) output, error = child.communicate() except: raise else: print "\t", "OTU network generated" # summarize communities by taxonomic composition comps = ["macqiime", "summarize_taxa_through_plots.py", "-i", table_file, "-o", wf_taxa_sum, "-m", map_file] cline = " ".join(comps) try: child = subprocess.Popen(str(cline), stdout=subprocess.PIPE, shell=True) output, error = child.communicate() except: raise else: print "\t", "Taxonomic composition of communities summarized"
def __init__(self, archive_path, base_mount_dir):
    self.archive_path = archive_path
    self.mount_dir = pth.join(base_mount_dir,
                              "mnt-" + pth.basename(archive_path))
    self.mount_available = False
    if pth.isdir(self.mount_dir):
        raise OSError("Mount directory {} already exists".format(
            self.mount_dir))
    ensure_dir(self.mount_dir)
    self.mount_available = True
def write_object(self, obj: gitobj.GitObject):
    content = obj.bcontent()
    sha = compute_sha1(content)
    dirname, filename = parse_sha(sha)
    dirpath = self.path_in_gitdir('objects', dirname)
    ensure_dir(dirpath)
    path = os.path.join(dirpath, filename)
    with open(path, 'wb') as f:
        f.write(zlib.compress(content))
    return sha
def set_stack_structure(stack_name, work_dir, make_subdirs=True):
    st_name = pth.abspath(stack_name)
    source_dirs = {kind: pth.join(st_name, kind) for kind in kinds}
    base_dir = ensure_dir(pth.join(work_dir, pth.basename(st_name)))
    target_dirs = {kind: pth.join(base_dir, kind) for kind in kinds}
    if make_subdirs and all(pth.exists(src_dir)
                            for src_dir in source_dirs.values()):
        [ensure_dir(tdir) for tdir in target_dirs.values()]
    return source_dirs, target_dirs, base_dir
def main():
    opt = process_command_line()
    print opt
    ensure_dir(opt.work_dir)
    data = gather_images_data(opt.files, opt.crop_size, opt.threads,
                              opt.use_borders)
    success = align_images(data, opt.work_dir, opt.first_image_is_absolute)
    result = "done" if success else "failed"
    debug_log("Registration job", result)
def main():
    opt = process_command_line()
    print opt
    ensure_dir(opt.work_dir)
    success = register_images(opt.files, opt.crop_size, opt.threads,
                              opt.work_dir, opt.use_borders,
                              opt.first_image_is_absolute,
                              make_jpeg=opt.write_also_jpeg)
    result = "done" if success else "failed"
    debug_log("Registration job", result)
def rasterize_perim(run_output, perim, year, name, raster=None): """! Convert a perimeter to a raster @param run_output Folder to save perimeter to @param perim Perimeter to convert to raster @param year Year to find reference raster for projection @param name Name of fire to use for file name @param raster Specific name of file name to output to @return Perimeter that was rasterized @return Path to raster output """ prj = os.path.join(run_output, os.path.basename(perim).replace('.shp', '_NAD1983.shp')) ensure_dir(os.path.dirname(prj)) ref_NAD83 = osr.SpatialReference() ref_NAD83.SetWellKnownGeogCS('NAD83') #~ try: Project(perim, prj, ref_NAD83) del ref_NAD83 r = find_best_raster(Extent(prj).XCenter, year) prj_utm = os.path.join( run_output, os.path.basename(perim).replace('.shp', os.path.basename(r)[9:14] + '.shp')) Delete(prj_utm) zone = GetSpatialReference(r) Project(perim, prj_utm, zone) del zone cellsize = GetCellSize(r) size = 0.0 dataSource = ogr.GetDriverByName('ESRI Shapefile').Open( prj_utm, gdal.GA_ReadOnly) layer = dataSource.GetLayer() for feature in layer: geom = feature.GetGeometryRef() area = geom.GetArea() size += area / (cellsize * cellsize) del geom del feature del layer del dataSource if size < 1: # this is less than one cell in area so don't use perimeter perim = None raster = None else: if not raster: raster = os.path.join(run_output, name + '.tif') Rasterize(prj_utm, raster, r) return perim, raster
def main():
    opt = process_command_line()
    print opt
    ensure_dir(opt.work_dir)
    data = gather_images_data(opt.files, opt.crop_size, opt.threads)
    delta_xp = match_ppl_xpl_opaques(data, opt.crop_size, opt.work_dir)
    debug_log("PPL->XPL mismatch is", delta_xp)
    success = align_images(data, delta_xp, opt.work_dir)
    result = "done" if success else "failed"
    debug_log("Registration job", result)
def __init__(self, name, no_download=False):
    """! Constructor
    @param self Pointer to this
    @param name Name for weather being loaded
    @param no_download Whether or not to not download files
    """
    ## Name for weather being loaded
    self.name = name
    common.ensure_dir(self.DIR_DATA)
    ## Folder to save downloaded weather to
    self.DIR_DATA = os.path.join(self.DIR_DATA, self.name)
    common.ensure_dir(self.DIR_DATA)
    ## Whether or not to download files
    self.no_download = no_download
def get_assets(self):
    if not self.metadata:  # fetch version metadata first if we don't have it yet
        self.get_meta()
    print('Verifying assets..')
    assets_dir = os.path.join(self.client_root, 'assets')
    assets_versions = os.path.join(assets_dir, 'indexes')
    ensure_dir(assets_versions)
    asset_index = self.metadata['assetIndex']
    assets_file = os.path.join(assets_versions, '%s.json' % (asset_index['id']))
    if not os.path.exists(assets_file):
        r = requests.get(asset_index['url'], stream=True)
        try:
            save_to_file_sha1(assets_file, r, asset_index['sha1'])
        except Exception:
            print('Failed to download assets!')
            raise
    with open(assets_file) as json_data:
        assets = json.load(json_data)
    for key, data in assets['objects'].items():
        first = data['hash'][0:2]
        asset_url = 'http://resources.download.minecraft.net/%s/%s' % (
            first, data['hash'])
        asset_dir = os.path.join(assets_dir, 'objects', first)
        ensure_dir(asset_dir)
        asset_file = os.path.join(asset_dir, data['hash'])
        if os.path.exists(asset_file):
            continue
        r = requests.get(asset_url, stream=True)
        try:
            save_to_file(asset_file, r)
        except Exception as e:
            print('Failed to download asset %s!' % (key))
            raise e
    print('All assets verified.')
def pick_otus(dataset, run_id):
    """Pick OTUs using Uclust with Qiime."""
    # identify inputs and outputs
    set_id = dataset['set_id']
    print " ", set_id
    run_root = root_dir+set_id+"/"+run_id+"/"
    otus_dir = run_root+dirs['otus']
    ensure_dir(otus_dir)
    master_file = run_root+dirs['merged']+run_id+".fas"
    # run the command
    comps = ["macqiime", "pick_otus.py", "-i", master_file, "-o", otus_dir]
    cline = " ".join(comps)
    try:
        child = subprocess.Popen(str(cline),
                                 stdout=subprocess.PIPE,
                                 shell=True)
        output, error = child.communicate()
    except:
        raise
    else:
        print "\t", "OTUs picked"
def download_image(img, window=None):
    if os.name == "nt":
        path = LM_CACHE_PATH + os.sep + 'media' + os.sep + os.sep.join(img.split('/'))
    else:
        path = LM_CACHE_PATH + os.sep + 'media' + os.sep + img
    img_path = 'http://www.pirates-caraibes.com/' + img
    try:
        ensure_dir(path)
        try:
            src = urllib2.urlopen(url_fix(img_path))
        except urllib2.HTTPError:
            return False
        else:
            dst = open(path, 'wb')
            shutil.copyfileobj(src, dst)
            return True
    except:
        return False
def download_images(img_list):
    for img in img_list:
        if os.name == "nt":
            path = LM_CACHE_PATH + os.sep + 'media' + os.sep + os.sep.join(img.split('/'))
        else:
            path = LM_CACHE_PATH + os.sep + 'media' + os.sep + img
        img_path = 'http://www.pirates-caraibes.com/' + img
        print "Fetching image '%s' to %s" % (img_path, path)
        try:
            ensure_dir(path)
            try:
                src = urllib2.urlopen(img_path)
            except urllib2.HTTPError:
                pass
            else:
                dst = open(path, 'wb')
                shutil.copyfileobj(src, dst)
                print "Success"
        except:
            (exctype, value, traceback) = sys.exc_info()
            print "Error - %s: %s" % (exctype, value)
def iter_align(coord_array, ref_rec, query_rec, aln_dir, segs_file): """Iterate through array of coordinates to make pairwise alignments.""" # set up the root subdirectories seqs = aln_dir+"input_seqs/" alns = aln_dir+"output_alns/" ensure_dir([seqs, alns]) aln_id = 0 aln_len = 0 # cycle through segments for segment_pair in coord_array: xa, xb, xc, xd = segment_pair # extract the corresponding sequence slices ref_seq = ref_rec[abs(xa):abs(xb)] query_seq = query_rec[abs(xc):abs(xd)] # reverse-complement sequences with negative sign if xa < 0 : ref_seq = ref_seq.reverse_complement() if xc < 0 : query_seq = query_seq.reverse_complement() # write sequences to file mscl_in = seqs+str(xa)+"_"+str(xb)+"_"+str(xc)+"_"+str(xd)+".fas" write_fasta(mscl_in, [ref_seq, query_seq]) # skip segments that are too small to align if abs(abs(xa)-abs(xb)) < 10: idp = 0 else: # set up outfiles mscl_out = alns+str(xa)+"_"+str(xb)+"_"+str(xc)+"_"+str(xd)+".aln" logfile = aln_dir+"muscle_log.txt" # perform alignment align_muscle(mscl_in, mscl_out, logfile) idntot = parse_clustal_idstars(mscl_out) idp = int((float(idntot)/len(query_seq))*100) aln_id += idntot aln_len += len(query_seq) # write details out to segments file line = "\t".join([str(xa), str(xb), str(xc), str(xd), str(idp)+"\n"]) open(segs_file, 'a').write(line) overall_id = int((float(aln_id)/aln_len)*100) return overall_id
def save_parameters(dataset, max_pairs, run_id, timestamp): """Save a copy of the dataset-specific parameters to file.""" set_id = dataset['set_id'] print " ", set_id run_root = root_dir+set_id+"/"+run_id+"/" report_root = run_root+dirs['reports'] param_file = report_root+run_id+"_parameters.txt" ensure_dir(report_root) # primer data primers = dataset['primers'] primers_list = [] for primer_ID in primers: primers_list.append("\t".join([primer_ID, primers[primer_ID]])) primers_str = "\n".join(primers_list) # samples + barcode data samples = dataset['samples'] samples_list = [] for sample_ID in samples: samples_list.append("\t".join([sample_ID, samples[sample_ID][0], samples[sample_ID][1]])) samples_str = "\n".join(samples_list) # text block txt = ["# Run ID", run_id, "# Date generated", dataset['date'], "# Date processing initiated", timestamp, "# Special processing parameters", "# read pair length min threshold (ensures overlap)", str(rp_min_len), "# max number of read pairs to process", str(max_pairs), "# Dataset specifications", "# Illumina FastQ master files", dataset['source_fwd'], dataset['source_rev'], "# Amplification primers", primers_str, "# Sample ID\tLeft tag\tRight tag", samples_str] # write to file open(param_file, 'w').write("\n".join(txt)) print "\t", "Run parameters saved to file"
def map_ref_segs(run_ref, run_id, r_root_dir, run_dirs, min_size, fct_flags, fct_colors, idpt): """Generate map of reference contig with segment details. This provides a comparison of the original reference and the re-annotated version. """ # set inputs and outputs ref_n = run_ref.name run_root = r_root_dir+run_id+"/" ori_file = run_ref.file ref_maps_root = run_root+run_dirs['ref_map_dir'] ensure_dir([ref_maps_root]) gbk_file = run_root+run_dirs['ref_gbk_dir']+ref_n+"_re-annot.gbk" map_file = ref_maps_root+ref_n+"_ref.pdf" # start mapping try: # make mock segment, full-length with 100% id record = load_genbank(gbk_file) length = len(record.seq) segdata = [[1, length, 1, length, 100]] # deactivate offsetting g_offset = (0,0) q_invert = False # generate graphical map pairwise_draw(ref_n+"_ra", ref_n+"_ori", gbk_file, ori_file, segdata, map_file, q_invert, g_offset, 'dual', 'dual', 'm', 'fct', 'product', min_size, fct_flags, fct_colors, idpt) except IOError: msg = "\nERROR: could not load segments data" run_ref.log(msg) print msg except StopIteration: msg = "\nERROR: could not make map" run_ref.log(msg) print msg
def basic_batch_blast(genomes, run_ref, blast_mode, r_root_dir, run_dirs, fixed_dirs, blast_prefs, run_id, timestamp): """Send batch jobs to Blast. Muxes to multiple reference DBs.""" # load inputs ref_n = run_ref.name run_root = r_root_dir+run_id+"/" in_root = run_root+run_dirs['ref_seg_dir']+ref_n+"/" print " ", ref_n # log logstring = "".join(["\n\n# Blast segs to genomes @", timestamp, "\n\n"]) run_ref.log(logstring) # do blast for seg in run_ref.segs: input_file = in_root+ref_n+"_"+seg['name']+".fas" # translate if required if blast_mode == 'tn': record = load_fasta(input_file) record.seq = record.seq.translate() input_file = in_root+ref_n+"_"+seg['name']+"_aa.fas" # substitute write_fasta(input_file, record) out_dir = run_root+run_dirs['blast_out_dir']+ref_n+"/"+seg['name']+"/" ensure_dir([out_dir]) print "\t", seg['name'], for genome in genomes: g_name = genome['name'] db_path = fixed_dirs['blast_db_dir']+g_name outfile = out_dir+g_name+"_out.txt" print ".", if blast_mode == 'n': local_blastn_2file(input_file, db_path, outfile, blast_prefs) elif blast_mode == 'tx': local_tblastx_2file(input_file, db_path, outfile, blast_prefs) elif blast_mode == 'tn': local_tblastn_2file(input_file, db_path, outfile, blast_prefs) print "" run_ref.log("All OK") return "OK"
def annot_genome_contigs(run_ref, prot_db_name, fixed_dirs, r_root_dir, run_id, run_dirs, genomes, project_id, timestamp, blast_prefs): """Annotate genome contigs (predict ORFs and assign function).""" # locate the COG database prot_db = fixed_dirs['ref_dbs_dir']+prot_db_name # TODO: add other DB / pfams? # set inputs and outputs ref_n = run_ref.name run_root = r_root_dir+run_id+"/" fas_ctgs_root = run_root+run_dirs['match_out_dir']+ref_n+"/" ctg_cds_root = fixed_dirs['ctg_cds_dir'] ctg_prot_root = fixed_dirs['ctg_prot_dir'] ctg_blast_root = fixed_dirs['ctg_blast_dir'] g_gbk_ctgs_root = fixed_dirs['gbk_contigs_dir'] r_gbk_ctgs_root = run_root+run_dirs['run_gbk_ctgs_dir']+ref_n+"/" annot_trn_root = fixed_dirs['annot_trn_dir'] print " ", ref_n # log logstring = "".join(["\n\n# Annotate genome contigs @", timestamp, "\n"]) run_ref.log(logstring) # cycle through genomes for genome in genomes: # set inputs g_name = genome['name'] fas_ctgs_dir = fas_ctgs_root+g_name+"/" g_file = fixed_dirs['ori_g_dir']+genome['file'] print '\t', g_name, "...", # log logstring = "".join(["\n", g_name]) run_ref.log(logstring) # set output files training_file = annot_trn_root+g_name+"_annot.trn" # set output dirs ctg_cds_dir = ctg_cds_root+g_name+"/" ctg_prot_dir = ctg_prot_root+g_name+"/" ctg_blast_dir = ctg_blast_root+g_name+"/" g_gbk_ctgs_dir = g_gbk_ctgs_root+g_name+"/" r_gbk_ctgs_dir = r_gbk_ctgs_root+g_name+"/" ensure_dir([ctg_cds_dir, ctg_prot_dir, ctg_blast_dir, g_gbk_ctgs_dir, r_gbk_ctgs_dir]) # list fasta files in matches directory dir_contents = listdir(fas_ctgs_dir) for item in dir_contents: pattern = re.compile(r'.*_(\d*)\.fas$') match = pattern.match(item) if match: ctg_num = match.group(1) print ctg_num, logstring = "".join(["\t", ctg_num]) run_ref.log(logstring) # set inputs and outputs ctg_fas = fas_ctgs_dir+item g_ctg_gbk = g_gbk_ctgs_dir+g_name+"_"+ctg_num+".gbk" r_ctg_gbk = r_gbk_ctgs_dir+g_name+"_"+ctg_num+".gbk" annot_gbk = ctg_cds_dir+g_name+"_"+ctg_num+"_cds.gbk" annot_aa = ctg_prot_dir+g_name+"_"+ctg_num+"_aa.fas" blast_out = ctg_blast_dir+g_name+"_"+ctg_num+".xml" if path.exists(blast_out) and os.stat(blast_out)[6]==0: os.remove(blast_out) if not path.exists(r_ctg_gbk): if not path.exists(g_ctg_gbk): l_tag_base = g_name+"_"+ctg_num record = annot_ctg(g_file, ctg_fas, annot_gbk, annot_aa, training_file, prot_db, blast_out, l_tag_base, blast_prefs) record.description = g_name+"_"+ctg_num record.name = g_name+"_"+ctg_num record.dbxrefs = ["Project: "+project_id+"/"+ref_n +"-like backbones"] record.seq.alphabet = generic_dna write_genbank(g_ctg_gbk, record) copyfile(g_ctg_gbk, r_ctg_gbk) print ""
def batch_contig_annot(dataset): """Extract and annotate contigs.""" # identify dataset contig file contigs_file = dirs['assembly_dir']+dataset['f_nick']+'/'+'contigs.fa' # locate the COG database cog_db = dirs['blast_db_dir']+'Cog_LE/Cog' # make the training file training_file = dirs['annot_dir']+dataset['f_nick']+'/'+'contigs.trn' #train_prodigal(contigs_file, training_file) # set output dirs fas_out_dir = dirs['annot_dir']+dataset['f_nick']+'/fasta/' gbk_out_dir = dirs['annot_dir']+dataset['f_nick']+'/predict/' aa_out_dir = dirs['annot_dir']+dataset['f_nick']+'/aa/' blast_out_dir = dirs['annot_dir']+dataset['f_nick']+'/rpsblast/' solid_out_dir = dirs['annot_dir']+dataset['f_nick']+'/genbank/' maps_out_dir = dirs['annot_dir']+dataset['f_nick']+'/maps/' ensure_dir(fas_out_dir) ensure_dir(gbk_out_dir) ensure_dir(aa_out_dir) ensure_dir(blast_out_dir) ensure_dir(solid_out_dir) # set phage hit collector contig_hits = {} sp_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\ +dataset['f_nick']+'_kw_hits.html' all_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\ +dataset['f_nick']+'_all_hits.html' sp_hit_list_handle = open(sp_hit_list, 'w') all_hit_list_handle = open(all_hit_list, 'w') sp_hit_list_handle.write("<ul>") all_hit_list_handle.write("<ul>") # load all contigs contigs_list = load_multifasta(contigs_file) # cycle through contigs ctg_count = 0 gene_count = 0 for contig in contigs_list: ctg_count +=1 # use regex to acquire relevant record ID info pattern = re.compile(r'NODE_(\d*)_length_(\d*)_cov_(\d*)') match = pattern.match(contig.id) nick = match.group(1)+'_'+match.group(2)+'_'+match.group(3) contig.id = nick fasta_out = fas_out_dir+nick+'.fas' # write record to file write_fasta(fasta_out, contig) # create contig entry in dict contig_hits[nick] = [] # run the annotation annot_gbk = gbk_out_dir+nick+'.gbk' annot_aa = aa_out_dir+nick+'.fas' #run_prodigal(fasta_out, annot_gbk, annot_aa, training_file) # blast the amino acids against COG print '\tblasting', dataset['f_nick'], nick blast_out = blast_out_dir+nick+'.xml' if path.isfile(blast_out): print "\t\talready blasted" else: local_rpsblast_2file(annot_aa, cog_db, blast_out, blast_prefs) # collect best hits rec_cogs = collect_cogs(blast_out) map_file = maps_out_dir+nick+'.pdf' # consolidate annotated genbank file record = load_fasta(fasta_out) aa_defs = load_multifasta(annot_aa) features = [] counter = 1 ctg_flag_1 = 0 ctg_flag_2 = 0 for protein in aa_defs: gene_count +=1 # get feature details from description line # necessary because the prodigal output is not parser-friendly pattern = re.compile(r'\d+_\d+_\d+_\d+_\d+\s+\S+\s+(\d+)\s+\S+\s+(\d+)\s+\S+\s+(\S*\d)') match = pattern.match(protein.description) start_pos = int(match.group(1)) end_pos = int(match.group(2)) strand_pos = int(match.group(3)) feat_loc = FeatureLocation(start_pos, end_pos) annotation = rec_cogs['Query_'+str(counter)] if ctg_flag_1 is 0: all_hit_list_handle.write("</ul><br><a href='" +"../../../../" +map_file +"'>Contig " +nick+"</a><ul>") ctg_flag_1 = 1 all_hit_list_handle.write("<li>"+str(counter) +'. 
'+annotation+"</li>") # detect phage content in annotation phi_pattern = re.compile(r".+(COG\d+).+" "(phage|capsid|muramidase|tail|" "replication|helicase|polymerase|" "integrase|recombinase" "suppressor|hydrolase|transposase).+", re.IGNORECASE) phi_match = phi_pattern.match(annotation) if phi_match: hit_flag = 'on' hit_dict = {'CDS': counter, 'annot': annotation, 'COGs': phi_match.group} contig_hits[nick].append(hit_dict) # write out to summary file if ctg_flag_2 is 0: sp_hit_list_handle.write("</ul><br><a href='" +"../../../../" +map_file +"'>Contig " +nick+"</a><ul>") ctg_flag_2 = 1 sp_hit_list_handle.write("<li>"+str(counter) +'. '+annotation+"</li>") else: hit_flag = 'off' # consolidation feature annotations quals = {'note': protein.description, 'fct': annotation, 'flag': hit_flag} feature = SeqFeature(location=feat_loc, strand=strand_pos, id=protein.id, type='CDS', qualifiers=quals) features.append(feature) counter +=1 record.features = features record.description = dataset['f_nick']+'_contig_'+nick record.name = nick record.dbxrefs = ['Project:np1'] record.seq.alphabet = generic_dna gbk_out = solid_out_dir+nick+'.gbk' write_genbank(gbk_out, record) # generate graphical map ContigDraw(nick, gbk_out, map_file) sp_hit_list_handle.write("</ul>") all_hit_list_handle.write("</ul>") sp_hit_list_handle.close() all_hit_list_handle.close() print "\t", gene_count, "predicted genes in", ctg_count, "contigs"
def demux_illumina(dataset, max_pairs, run_id): """Demultiplex Illumina dataset. From separate forward/reverse read sets, combine read pairs and output to separate files for each sample based on barcode tags. As part of the process, reject read pairs that have mismatching tags or primers and trim the rest, removing primer+tag and low-quality sequences. """ # identify inputs and outputs set_id = dataset['set_id'] print " ", set_id run_root = root_dir+set_id+"/"+run_id+"/" ori_root = root_dir+set_id+"/"+dirs['master'] fwd_file = ori_root+dataset['source_fwd'] rev_file = ori_root+dataset['source_rev'] demux_root = run_root+dirs['demux'] report_root = run_root+dirs['reports'] qc_dir = "qc_details/" qc_main_file = report_root+"quality_control.html" cntsplt = report_root+"sample_counts" ensure_dir(ori_root) ensure_dir(demux_root) ensure_dir(report_root) ensure_dir(report_root+qc_dir) # set up files for reporting html_comps = ["<p><b>Quality control for run "+run_id+"</b></p>", "<p><img src='sample_counts.png' alt='sample_counts'/></p>", "<p><table border='1'><tr>", "<th>Sample</th>", "<th>Accepted</th>", "<th>Rejected</th>", "<th>Total</th>", "<th>% OK</th></tr>"] html_block = "".join(html_comps) open(qc_main_file, 'w').write(html_block) # prepare primers and barcodes info primers = dataset['primers'] samples = dataset['samples'] tag_pairs = samples.values() assert len(primers) >= 2 assert len(samples) >= 1 assert len(tag_pairs) >= 1 # prepare container and files for output batching and reporting hits_dict = {} for sample_id in samples: hits_dict[sample_id] = {'buffer': [], 'countY': 0, 'countN': 0} # add containers for rejected read pairs hits_dict['bad_tags'] = {'buffer': [], 'countY': 0, 'countN': 0} hits_dict['bad_qual'] = {'buffer': [], 'countY': 0, 'countN': 0} # initialize files for sample_id in samples: dmx_out = demux_root+sample_id+"_readpairs.txt" open(dmx_out, 'w').write('') open(demux_root+"bad_tags"+"_readpairs.txt", 'w').write('') open(demux_root+"bad_qual"+"_readpairs.txt", 'w').write('') # iterate through reads pair_count = 0 for titles, seqs, quals in FastqJointIterator(open(fwd_file), open(rev_file)) : F_title = titles[0][0] R_title = titles[0][1] F_seq = seqs[0][0].upper() R_seq = seqs[0][1].upper() F_qual = quals[0][0] R_qual = quals[0][1] flip = False sample_id = False # iterate through barcode tags # TODO: implement more robust solution to ambiguous base problem for tag_pair in tag_pairs: L_tag1 = (tag_pair[0]+primers['fwdRA']).upper() L_tag2 = (tag_pair[0]+primers['fwdRG']).upper() R_tag = (tag_pair[1]+primers['rev']).upper() tag_hit = False while True: # start by checking For R_tag since there's only one if not R_seq.find(R_tag, 0, len(R_tag)) is 0: if not F_seq.find(R_tag, 0, len(R_tag)) is 0: # no R_tag match -> reject break else: # is there an L_tag in R_seq? while True: if not R_seq.find(L_tag1, 0, len(L_tag1)) is 0: if not R_seq.find(L_tag2, 0, len(L_tag2)) is 0: # no L_tag match -> reject break else: R_clip = len(L_tag2) else: R_clip = len(L_tag1) tag_hit = True flip = True F_clip = len(R_tag) break else: # is there an L_tag in F_seq? 
while True: if not F_seq.find(L_tag1, 0, len(L_tag1)) is 0: if not F_seq.find(L_tag2, 0, len(L_tag2)) is 0: # no L_tag match -> reject break else: F_clip = len(L_tag2) else: F_clip = len(L_tag1) tag_hit = True R_clip = len(R_tag) break break if not tag_hit: # continue iterating sample_id = False else: # got it, stop iterating sample_id = key_by_value(samples, tag_pair)[0] break # in case no matches were found with any of the tags if not sample_id: sample_id = 'bad_tags' # for matched read pairs, clip off tag+primer and strip low qual runs else: F_trim = F_qual[F_clip:].find('##') if F_trim > -1: F_seq = F_seq[F_clip:F_clip+F_trim] F_qual = F_qual[F_clip:F_clip+F_trim] else: F_seq = F_seq[F_clip:] F_qual = F_qual[F_clip:] R_trim = R_qual[R_clip:].find('##') if R_trim > -1: R_seq = R_seq[R_clip:R_clip+R_trim] R_qual = R_qual[R_clip:R_clip+R_trim] else: R_seq = R_seq[R_clip:] R_qual = R_qual[R_clip:] if len(F_seq)+len(R_seq) < rp_min_len: # increment sample hit 'No' counter hits_dict[sample_id]['countN'] +=1 sample_id = 'bad_qual' # bundle read data in ordered string readF = str("@%s\n%s\n+\n%s\n" % (F_title, F_seq, F_qual)) readR = str("@%s\n%s\n+\n%s\n" % (R_title, R_seq, R_qual)) if flip: read_pair = readR+readF else: read_pair = readF+readR # output to the appropriate buffer hits_dict[sample_id]['buffer'].append(read_pair) # increment sample 'Yes' hit counter hits_dict[sample_id]['countY'] +=1 # when buffer capacity is reached, output to file and reset buffer if hits_dict[sample_id]['countY']% 100000==0: dmx_out = demux_root+sample_id+"_readpairs.txt" dump_buffer(dmx_out, hits_dict[sample_id]['buffer']) hits_dict[sample_id]['buffer'] = [] # increment counter pair_count +=1 # report on the progress if pair_count%1000000==0: print "\t", pair_count, "reads processed", datetime.now() if pair_count == max_pairs: # for testing purposes break print "\t", "Total", pair_count, "read pairs processed" print "\t", "Counts per sample:" # prepare graphing data containers pcntY = [] pcntN = [] sample_ids = [] # write out whatever remains in each of the samples buffers for sample_id in samples: dmx_out = demux_root+sample_id+"_readpairs.txt" dump_buffer(dmx_out, hits_dict[sample_id]['buffer']) hits_dict[sample_id]['buffer'] = [] acc = hits_dict[sample_id]['countY'] rej = hits_dict[sample_id]['countN'] print "\t\t", sample_id, acc, "pairs", datetime.now() pcntY.append(acc) pcntN.append(rej) sample_ids.append(sample_id) # generate FastQC report (use --noextract to not open zipped reports) run_FastQC(dmx_out, report_root+qc_dir, '--quiet', ' ') #print "see QC report" # add line in QC file link = qc_dir+sample_id+"_readpairs_fastqc/fastqc_report.html" html_comps = ["<tr>", "<th><a href='"+link+"'>"+sample_id+"</a></th>", "<td>", str(acc), "</td>", "<td>", str(rej), "</td>", "<td>", str(acc+rej), "</td>", "<td>", str(int((float(acc)/(acc+rej))*100)), "</td></tr>"] html_block = "".join(html_comps) open(qc_main_file, 'a').write(html_block) # write out whatever remains in the bad_qual buffer dmx_out = demux_root+"bad_qual_readpairs.txt" dump_buffer(dmx_out, hits_dict['bad_qual']['buffer']) hits_dict['bad_qual']['buffer'] = [] print "\t\t", "rejected (low quality)", hits_dict['bad_qual']['countY'],\ datetime.now() # generate FastQC report (use --noextract to not open zipped reports) run_FastQC(dmx_out, report_root+qc_dir, '--quiet', ' ') #print "see QC report" # add line in QC file link = qc_dir+"bad_qual_readpairs_fastqc/fastqc_report.html" html_comps = ["<tr>", "<th><a href='"+link+"'>"+"bad_qual"+"</a></th>", 
"<td>", '0', "</td>", "<td>", str(hits_dict['bad_qual']['countY']), "</td>", "<td>", str(hits_dict['bad_qual']['countY']), "</td>", "<td>", '0',"</td></tr>"] html_block = "".join(html_comps) open(qc_main_file, 'a').write(html_block) # write out whatever remains in the bad_tags buffer dmx_out = demux_root+"bad_tags_readpairs.txt" dump_buffer(dmx_out, hits_dict['bad_tags']['buffer']) hits_dict['bad_tags']['buffer'] = [] print "\t\t", "rejected (bad tags)", hits_dict['bad_tags']['countY'],\ datetime.now() # generate FastQC report (use --noextract to not open zipped reports) run_FastQC(dmx_out, report_root+qc_dir, '--quiet', ' ') #print "see QC report" # add line in QC file link = qc_dir+"bad_tags_readpairs_fastqc/fastqc_report.html" html_comps = ["<tr>", "<th><a href='"+link+"'>"+"bad_tags"+"</a></th>", "<td>", '0', "</td>", "<td>", str(hits_dict['bad_tags']['countY']), "</td>", "<td>", str(hits_dict['bad_tags']['countY']), "</td>", "<td>", '0',"</td></tr>"] html_block = "".join(html_comps) open(qc_main_file, 'a').write(html_block) # close table and add notes line_bq = "rejected after demultiplexing due to low sequence quality \ (top stacks in bar chart)" line_bt = "could not be assigned to a sample due to mismatches in tag \ and/or primer" html_comps = ["</table></p>", "<p><b>", "bad_qual", ": </b>", line_bq, "<br><b>", "bad_tags", ": </b>", line_bt, "</p>",] html_block = "".join(html_comps) open(qc_main_file, 'a').write(html_block) # add bad tags category for counts graphing (switch is on purpose) pcntY.append(hits_dict['bad_tags']['countN']) pcntN.append(hits_dict['bad_tags']['countY']) sample_ids.append('bad_tags')# check that the totals add up assert pair_count == sum(pcntY)+sum(pcntN) # plot the read counts per sample series = pcntY, pcntN legend = 'Accepted', 'Rejected' colors = 'g', 'r' titles = 'Number of read pairs', 'Read pairs per sample' two_storey_bar_chart(series, sample_ids, legend, colors, cntsplt, titles)
def merge_pair_libs(dataset, run_id): """Merge read pairs from Illumina sample libs and output FastA.""" # identify inputs and outputs set_id = dataset['set_id'] print " ", set_id run_root = root_dir+set_id+"/"+run_id+"/" dmx_root = run_root+dirs['demux'] merged_root = run_root+dirs['merged'] report_root = run_root+dirs['reports'] master_file = run_root+dirs['merged']+run_id+".fas" ensure_dir(merged_root) ensure_dir(report_root) merger_file = report_root+"merged_pairs.html" cntsplt = report_root+"merge_counts" samples = dataset['samples'] # set up files for reporting html_comps = ["<p><b>Read pairs merged for run ", run_id, "</b></p>", "<p><img src='merge_counts.png' alt='merge_counts'/></p>", "<p><table border='1'><tr>", "<th>Sample</th>", "<th>Accepted</th>", "<th>Rejected</th>", "<th>Total</th>", "<th>% OK</th></tr>"] html_block = "".join(html_comps) open(merger_file, 'w').write(html_block) # initialize master file open(master_file, 'w').write('') # merge per sample (demuxed) merge_countA = [] merge_countR = [] sample_ids = samples.keys() for sample_id in sample_ids: print "\t", sample_id, lib_file = dmx_root+sample_id+"_readpairs.txt" merge_out = merged_root+sample_id+"_merged.fas" open(merge_out, 'w').write('') # prepare container and files for output batching and reporting buffer = [] countY = 0 countF = 0 countN = 0 # iterate through the read pairs count = 0 for titles, seqs, quals in FastqGGIterator(open(lib_file)): count +=1 seq1 = seqs[0] seq2 = seqs[1] qual1 = quals[0] qual2 = quals[1] # merge reads TODO: better safeguard against merge failure try: merged = merge_overlaps(seq1, qual1, seq2, qual2) except: countF +=1 else: if merged.find('N') > -1: countN +=1 # if there are still N quality must be too low else: countY +=1 # compose string for output mcomps = [">",sample_id,"_",str(count),"\n",merged,"\n"] mstring = "".join(mcomps) # output to buffer buffer.append(mstring) # when buffer capacity is reached, output to file and reset buffer if countY % 10000==0: dump_buffer(merge_out, buffer) dump_buffer(master_file, buffer) buffer = [] # write out whatever remains in the buffer dump_buffer(merge_out, buffer) dump_buffer(master_file, buffer) # sum up assert countY+countF+countN == count print count, "pairs", datetime.now() print "\t\t", str(countY), "merged and accepted" print "\t\t", str(countN), "merged but rejected due to residual Ns" print "\t\t", str(countF), "failed to merge" # add line in QC file html_comps = ["<tr>", "<th>", sample_id, "</b></th>", "<td>", str(countY), "</td>", "<td>", str(countN + countF), "</td>", "<td>", str(count), "</td>", "<td>", str(int((float(countY)/count)*100)), "</td></tr>"] html_block = "".join(html_comps) open(merger_file, 'a').write(html_block) # pass values merge_countA.append(countY) merge_countR.append(countN+countF) # close table and add notes line_N = "either failed to merge or still contained Ns after merging" html_comps = ["</table></p>", "<p><b>", "Rejected", ":</b> ", line_N, "</p>"] html_block = "".join(html_comps) open(merger_file, 'a').write(html_block) # plot the read counts per sample series = merge_countA, merge_countR lgnd = 'Accepted', 'Rejected' colors = 'g', 'r' titles = 'Number of read pairs', 'Read pairs merged per sample' two_storey_bar_chart(series, sample_ids, lgnd, colors, cntsplt, titles)
def process_ref(ref, ref_annot_flag, r_root_dir, fixed_dirs, run_dirs, run_id, timestamp, prot_db_name, project_id): """Re-annotate contig and extract reference segments using coordinates.""" # set inputs and outputs run_root = r_root_dir+run_id+"/" ref_name = ref['name'] in_file = fixed_dirs['ori_g_dir']+ref['file'] seg_out_root = run_root+run_dirs['ref_seg_dir']+ref_name+"/" gen_fas_root = fixed_dirs['fas_contigs_dir']+ref_name+"/" if ref_annot_flag: ref_gbk = run_root+run_dirs['ref_gbk_dir']+ref_name+"_re-annot.gbk" else: ## bypass re-annotated ONLY IF ORIGINAL INPUT IS GBK #todo: fix ref_gbk = in_file ref_fas = run_root+run_dirs['ref_fas_dir']+ref_name+".fas" genome_fas = gen_fas_root+ref_name+"_1.fas" report_root = run_root+run_dirs['reports']+ref_name+"/" ref_log = report_root+run_id+"_"+ref_name+"_log.txt" ensure_dir([seg_out_root, report_root, gen_fas_root]) print " ", ref_name, "...", # initialize run_ref object run_ref = Reference(ref_name, in_file, ref['input'], ref['seg_mode'], ref['capture'], ref_fas, ref_gbk, seg_out_root, ref_log) # initialize reference log cl_header = ["# Console log:", run_id, "/", ref_name, timestamp, "\n\n"] open(ref_log, 'w').write(" ".join(cl_header)) # open record and ensure we have a fasta in the right place if not path.exists(ref_fas): if run_ref.input == 'fas': copyfile(in_file, ref_fas) elif run_ref.input == 'gbk': record = load_genbank(in_file) record.id = ref_name write_fasta(ref_fas, record) else: msg = "ERROR: Input not recognized for "+ref_name run_ref.log(msg) raise Exception(msg) # make a BLAST DB make_ref_DB(ref, run_id, fixed_dirs, r_root_dir, run_dirs) copyfile(ref_fas, genome_fas) # re-annotate ref contig if ref_annot_flag: record = annot_ref(ref_name, ref_fas, prot_db_name, fixed_dirs, project_id) else: ## bypass re-annotation ONLY IF ORIGINAL INPUT IS GBK #todo: fix record = load_genbank(in_file) # load or generate segment definitions if run_ref.seg_mode == 'chop': run_ref.get_segs_from_chop(len(record.seq), ref['chop_size']) elif run_ref.seg_mode == 'list': run_ref.get_segs_from_list(ref['segs']) elif run_ref.seg_mode == 'feats': run_ref.get_segs_from_feats(ref['feat_type']) # extract segment sequences rec_annot = run_ref.extract_segs_seqs(record, seg_out_root) # write re-annotated reference sequence to file write_genbank(ref_gbk, rec_annot) # report results logstring = " ".join([str(len(run_ref.segs)), "segments"]) print logstring run_ref.log(logstring) return run_ref
def unpack_genomes(genome, separator, fixed_dirs, ctg_thresholds):
    """Unpack genome files.

    Here, unpacking means extracting data and producing specific files to
    standardize how the information is made available to downstream analysis.
    Depending on the input file format, different unpacking methods are
    invoked. In all cases, this ensures that for each genome there is a
    multifasta file of all the contigs together as well as a separate FastA
    file for each contig.

    Supported input file formats are the following:
    - mfas: Basic whole genome sequence in a multifasta file of contigs. This
        can also be used to process a finished genome in a single FastA file.
    - cgbk: All contigs concatenated in a single GenBank file (Genoscope,
        French WGS). This can also be used to process a finished genome in a
        single GenBank file.
    # TODO: provide support for other possible input formats

    Unpacking 'cgbk' genomes involves an initial step to detect occurrences
    of the sequence separator and collect the start and stop coordinates of
    each contig. Each pair of coordinates can then be used to extract the
    contig sequence and create a SeqRecord for that contig, as SeqIO
    normally does when it unpacks multifasta files.
    """
    # set up inputs
    infile = genome['file'] #TODO: make GUI input loader (upstream)
    inpath = fixed_dirs['ori_g_dir']+infile
    g_name = genome['name']
    print " ", g_name, "...",
    # prep output destinations
    mfas_dir = fixed_dirs['mfas_contigs_dir']
    fas_dir = fixed_dirs['fas_contigs_dir']+g_name+"/"
    ensure_dir([mfas_dir, fas_dir])
    mfas_file = mfas_dir+g_name+"_contigs.fas"
    records = []
    # select unpacking method
    if genome['input'] == 'fas':
        if not path.exists(inpath):
            raise Exception("Bad input file path: "+inpath)
        genome_recs = load_multifasta(inpath)
        # generate per-contig records
        counter = 0
        for rec in genome_recs:
            counter +=1
            ctg_num = str(counter)
            new_id = g_name+"_"+ctg_num    # workaround for long ids
            new_seq = rec.seq
            new_seq.alphabet = generic_dna
            new_rec = SeqRecord(seq=new_seq, id=new_id)
            records.append(new_rec)    # for multifasta output
            fas_file = fas_dir+new_id+".fas"
            write_fasta(fas_file, new_rec)
    elif genome['input'] == 'gbk':
        # load in genome data
        genome_rec = load_genbank(inpath)
        g_string = genome_rec.seq
        # find split coordinates
        coord_pairs = multisplit_finder(g_string, separator)
        # split record
        counter = 0
        for (start, stop) in coord_pairs:
            counter +=1
            ctg_num = str(counter)
            new_record = genome_rec[start:stop]
            new_record.id = g_name+"_"+ctg_num
            records.append(new_record)    # for multifasta output
            fas_file = fas_dir+g_name+"_"+ctg_num+".fas"
            write_fasta(fas_file, new_record)
    else:
        xmsg = "Input file format "+genome['input']+" unspecified/unsupported"
        raise Exception(xmsg)
    print counter, "contigs"
    # write master file
    write_fasta(mfas_file, records)
    # pass records to stats logger
    ctg_stats(g_name, fixed_dirs, ctg_thresholds, records)

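# The 'cgbk' branch depends on multisplit_finder() to turn separator
# occurrences into (start, stop) contig coordinates. A rough standalone
# sketch of that idea is given below; the actual helper may handle edge
# cases (adjacent separators, trailing separators) differently.
def multisplit_finder_sketch(g_string, separator):
    """Yield (start, stop) pairs for stretches between separator occurrences."""
    text = str(g_string)
    start = 0
    sep_pos = text.find(separator)
    while sep_pos != -1:
        if sep_pos > start:
            yield (start, sep_pos)
        start = sep_pos + len(separator)
        sep_pos = text.find(separator, start)
    if start < len(text):
        yield (start, len(text))
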
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator, genomes,
                    run_id, timestamp, mtype, mode):
    """Build a scaffold of contigs based on the reference.

    This takes contigs that gave positive hits when blasted with reference
    segments. The contigs were aligned against the complete reference in a
    previous step for mapping purposes. Now the output of that step is
    re-used to determine their position. A caveat is that if there are
    natural local rearrangements in the sequence relative to the reference,
    they may not be resolved appropriately. The problem is somewhat moderated
    by the fact that this function takes the best (usually the largest) hit
    region as "anchor" to position the contig within the scaffold. But if the
    rearranged region takes up a significant portion of the contig length,
    the anchoring will probably not be called correctly. Visual inspection of
    the finalized maps should help diagnose any such problems. The order can
    be fixed manually using the Mauve Contig Mover, which is part of Mauve 2.

    Note that not all hit contigs are "real" hits, so filtering should be
    applied before scaffolding to generate constructs. Model-based filtering
    produces a list of contigs that will be passed to the scaffolder. If
    filtering manually by looking at the maps, there are two options
    available: either select exclusively OR exclude a subset of contigs for
    the scaffolding process. This is done by listing their ID number in the
    genome dictionaries in the config file, then resuming the pipeline from
    this step.
    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ctgs_root = run_root+run_dirs['run_gbk_ctgs_dir']+ref_n+"/"
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/"
    scaffolds_dir = run_root+run_dirs['scaffolds_dir']+ref_n+"/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Build scaffold constructs @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        ctgs_dir = ctgs_root+g_name+"/"
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root+g_name+"/"
        ensure_dir([mauve_dir, scaffolds_dir])
        scaff_fas = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.fas"
        scaff_gbk = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.gbk"
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_dir)
        anchors_array = np.zeros(1, dtype=[('ctg', 'i4'),
                                           ('start', 'i4'),
                                           ('end', 'i4'),
                                           ('orient', 'i2')])
        # identify contigs we want to select
        subset = []
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.gbk$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)
                if mode == "exclude":
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = "("+ctg_num+")"
                            print msg,
                            run_ref.log(msg)
                        else:
                            subset.append(ctg_num)
                    except KeyError:
                        msg = "WARNING: no ignored segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)
                elif mode == "select":
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = ctg_num
                            print msg,
                            run_ref.log(msg)
                            subset.append(ctg_num)
                        else:
                            msg = "("+ctg_num+")"
                            print msg,
                            run_ref.log(msg)
                    except KeyError:
                        msg = "WARNING: no selected segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)
        # at this point we should have a subset of contigs selected
        for ctg_num in subset:
            logstring = "".join(["\t", ctg_num])
            run_ref.log(logstring)
            # set inputs
            mauve_file = mauve_dir+ctg_num+".mauve"
            bb_file = mauve_file+".backbone"
            try:
                # parse Mauve output
                coords = mauver_load2_k0(bb_file, prox_D, mtype)
                # determine which segment to use as anchor
                anchor_seg = get_anchor_loc(coords)
                anchors_array = np.insert(anchors_array, 0,
                                          (ctg_num,
                                           anchor_seg['start'],
                                           anchor_seg['end'],
                                           anchor_seg['orient']))
            except IOError:
                msg = "\tERROR: Mauve alignment not found\n\t"
                print msg
                run_ref.log(msg)
            except Exception:
                msg = "\tERROR: Iteration failure\n\t"
                print msg
                run_ref.log(msg)
        # abort if there is no valid contig to proceed with
        try:
            assert len(anchors_array) > 1    # always 1 left from stub
        except AssertionError:
            msg = "\tWARNING: Contig list empty\n\t"
            print msg
            run_ref.log(msg)
        else:
            # order contigs by anchor location
            anchors_array = np.sort(anchors_array, order='start')
            # load contig records from the genbank files in the matches dir
            ctg_list = []
            for ctg_anchor in anchors_array:
                ctg_num = ctg_anchor['ctg']
                if ctg_num > 0:
                    contig_gbk = ctgs_dir+g_name+"_"+str(ctg_num)+".gbk"
                    record = load_genbank(contig_gbk)
                    if ctg_anchor['orient'] == -1:
                        # flip record
                        record = record.reverse_complement(id=True, name=True,
                                                           annotations=True,
                                                           description=True)
                    ctg_list.append(record)
                else:
                    # workaround for having 0 value leftover from stub
                    # having it might come in handy in later dev
                    pass
            # output scaffold files
            write_fasta(scaff_fas, ctg_list)
            scaff_record = SeqRecord('', id='temp')
            scaff_bumper = SeqRecord(separator, id='join')
            for record in ctg_list:
                feat_start = len(scaff_record.seq)
                scaff_record += record
                feat_stop = len(scaff_record.seq)
                scaff_record += scaff_bumper
                feat_loc = FeatureLocation(feat_start, feat_stop)
                pattern = re.compile(r'.*_(\d*)$')
                match = pattern.match(record.id)
                try:
                    ctg_num = match.group(1)
                except Exception:
                    ctg_num = 'N'
                feature = SeqFeature(location=feat_loc, type='contig',
                                     qualifiers={'id': ctg_num})
                scaff_record.features.append(feature)
            scaff_record.description = g_name+" scaffold from "+ref_n
            try:
                scaff_record.id = g_name
                write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper
            except ValueError:
                scaff_record.id = g_name[:10]
                write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper
        print ""

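# Contig ordering above hinges on a small numpy structured array that is
# seeded with a zero stub, filled with one anchor per contig via np.insert,
# and then sorted by anchor start. The self-contained sketch below mirrors
# just that bookkeeping with made-up anchor values, to make the stub/sort
# behaviour easier to inspect in isolation.
import numpy as np

def order_anchors_sketch():
    """Return (ctg, start, end, orient) tuples ordered by anchor start."""
    anchors = np.zeros(1, dtype=[('ctg', 'i4'), ('start', 'i4'),
                                 ('end', 'i4'), ('orient', 'i2')])
    # hypothetical anchors: (contig number, start, end, orientation)
    for anchor in [(3, 5200, 6100, 1), (1, 120, 900, -1), (2, 2500, 3300, 1)]:
        anchors = np.insert(anchors, 0, anchor)
    anchors = np.sort(anchors, order='start')
    # the zero stub sorts first here and is skipped, as in build_scaffolds()
    return [tuple(a) for a in anchors if a['ctg'] > 0]

# expected order: contig 1, then contig 2, then contig 3
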
def glomp_blast_out(dataset, ref_nick):
    """Consolidate Blast output files."""
    # Identify the genome
    nickname = dataset['nickname']
    # Determine the input file root
    root_dir = dirs['blast_out_dir']+nickname+"/"+ref_nick+"/"
    file_root = root_dir+nickname
    # Signal process start
    print "-- Consolidating B_out for", nickname, "against", ref_nick, "--"
    print datetime.now()
    # Cycle through bin types
    series_index = 0
    averages = []    # for comparing series later
    binned_pos = []
    for bin_type in bin_types:
        index = 1
        bin_arrays = []
        while os.path.isfile(file_root+bin_type+"_"+str(index)+"_blast.out"):
            infile = file_root+bin_type+"_"+str(index)+"_blast.out"
            rec_array = read_array(infile, blast_dtypes)
            if len(rec_array) > 0:
                bin_arrays.append(rec_array)
            index +=1
        print "\t\t"+str(len(bin_arrays)), "arrays for", \
            nickname+bin_type, "series"
        if len(bin_arrays) > 0:
            series = numpy.hstack(bin_arrays)
        else:
            series = []
        print "\t\t"+str(len(series)), "total records in", \
            nickname+bin_type, "series"
        # Save to file
        cons_outfile = file_root+bin_type+"_cons_out.npy"
        numpy.save(cons_outfile, series)
        # Evaluate match positions on reference
        positions = []
        match_read = []
        for row in series:
            # collect match read info while we're at it
            # use regex to extract query index
            query_pattern = re.compile(r'\w*_(\d*)')
            query_match = query_pattern.match(row[0])
            query_index = int(query_match.group(1))
            match_read.append(query_index)
            # use regex to extract ref coords
            ref_pattern = re.compile(r'\w*_\d*_(\d*)')
            ref_match = ref_pattern.match(row[1])
            ref_pos = int(ref_match.group(1))
            pos_scaled = ref_pos/cpm['size']    # adjust to db segment length
            positions.append(pos_scaled)
        # uniquify the match read array
        unique_matches = numpy.unique(match_read)
        print "\t"+str(len(unique_matches)), "unique matches for", bin_type
        # write to file for future use
        match_dir_root = dirs['match_dir']+nickname+"/"+ref_nick+"/"
        ensure_dir(match_dir_root)
        match_outfile = match_dir_root+nickname+bin_type+"_match.npy"
        numpy.save(match_outfile, unique_matches)
        # now count occurrences per position
        pos_np = numpy.array(positions)
        binned = numpy.bincount(pos_np)
        binned_pos.append(binned)
        pos_count_average = numpy.average(binned)
        averages.append((pos_count_average, series_index))
        series_index +=1
    # compare series
    averages.sort()
    averages.reverse()
    order_indices = []
    for pair in averages:
        order_indices.append(pair[1])
    # identify reference
    ref_name = [reference['full_name'] for reference in references
                if reference['nickname'] == ref_nick][0]
    # prep directory & file
    fig_root = dirs['reports_dir']+"match_figs/"
    fig_file = fig_root+nickname+"_"+ref_nick+".png"
    ensure_dir(fig_root)
    # generate a figure
    pylot.autoscale(enable=True, axis='both', tight=True)
    pylot.xlabel('Position on the chromosome (/'+str(cpm['size'])+')')
    pylot.ylabel('Number of matches (includes multiples)')
    pylot.title(nickname+' matches to '+ref_name)
    pylot.grid(True)
    for index in order_indices:
        label_root = nickname+bin_types[index]
        label_str = label_root+" ("+str(numpy.sum(binned_pos[index]))+")"
        pylot.plot(binned_pos[index], label=label_str)
    pylot.legend(loc=1)
    pylot.savefig(fig_file, dpi=None, facecolor='w', edgecolor='w',
                  orientation='portrait', papertype=None, format=None)
    pylot.clf()
    print "\t"+str(series_index), "series consolidated and parsed"
    print "-- Done, see plot --"
    print datetime.now()
    return "OK"

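# Match density along the reference is computed above by scaling hit
# coordinates into bins and counting occurrences with numpy.bincount. A tiny
# standalone example of just that binning step, with invented coordinates
# and an arbitrary bin size, to show what the plotted series contain:
import numpy

def bin_positions_sketch(ref_positions, bin_size):
    """Return an array where index i holds the number of hits in bin i."""
    scaled = numpy.array([pos // bin_size for pos in ref_positions])
    return numpy.bincount(scaled)

# bin_positions_sketch([150, 180, 1200, 1250, 1290, 5000], 1000)
# -> array([2, 3, 0, 0, 0, 1])
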
def filter_contigs(run_ref, run_id, genomes, norm_matches, seg_size, threshold,
                   r_root_dir, run_dirs, fixed_dirs, timestamp):
    """Filter contigs."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    fas_root = fixed_dirs['fas_contigs_dir']
    report_root = run_root+run_dirs['reports']+ref_n+"/"
    ensure_dir([report_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Filter contigs @", timestamp, "\n"])
    run_ref.log(logstring)
    # process
    # evaluate segment specificity using negative controls
    neg_controls = [genome['name'] for genome in genomes
                    if ('ctrl' in genome.keys() and genome['ctrl'] == 'neg')]
    neg_dat = [norm_matches[g_name]['ctg_scores'] for g_name in neg_controls]
    neg_RA = np.vstack(neg_dat)
    neg_mean = nanmean(neg_RA, axis=0)
    # process the genomes we're testing
    test_genomes = [genome['name'] for genome in genomes
                    if not ('ctrl' in genome.keys())]
    for g_name in test_genomes:
        print "\t", g_name,
        ctg_hits = norm_matches[g_name]['ctg_scores']
        ctg_stats = {}
        # process individual contigs
        counter = 0
        for ctg_RA in ctg_hits:
            # identify this contig by name
            ctg_name = norm_matches[g_name]['ctg_names'][counter]
            counter += 1
            # subtract background signal from match scores
            recal_ctg_RA = np.subtract(ctg_RA, neg_mean)
            recal_ctg_RA = recal_ctg_RA.clip(min=0)
            # compute total similarity score
            s_score = np.sum(recal_ctg_RA)
            # compute clustering score (primitive)
            streak = False
            c_score = 0
            for hit in recal_ctg_RA:
                if hit == 0:
                    if streak == True:
                        c_score += -1
                        streak = False
                    else:
                        c_score += 0
                elif hit > 0:
                    if streak == True:
                        c_score += 2
                    else:
                        c_score += 1
                    streak = True
            # compute backbone vs. cargo burden
            ctg_rec = load_fasta(fas_root+g_name+"/"+ctg_name+".fas")
            bbone = np.sum(np.ma.make_mask(recal_ctg_RA))*seg_size
            if bbone > len(ctg_rec):
                # workaround for the last segment always being a little short
                bbone = len(ctg_rec)
            cargo = len(ctg_rec) - bbone
            # make inverted array mask (used for redundancy detection)
            ctg_mask = np.ma.getmaskarray(np.ma.masked_equal(recal_ctg_RA, 0))
            # consolidate contig information
            ctg_stats[ctg_name] = {'s_score': s_score, 'c_score': c_score,
                                   'vector': recal_ctg_RA,
                                   'inv_mask': ctg_mask,
                                   'bbone': bbone, 'cargo': cargo}
        # detect redundant contigs
        ### use np.ma.mask_or(m1, m2)
        ### if any element is False there is a redundancy between two contigs
        ### if so, evaluate which has the better c_score and s_score
        # compute overall stats for the genome
        gs_score = sum([ctg_stats[contig]['s_score'] for contig in ctg_stats])
        gc_score = sum([ctg_stats[contig]['c_score'] for contig in ctg_stats])
        g_bbone = sum([ctg_stats[contig]['bbone'] for contig in ctg_stats])
        g_cargo = sum([ctg_stats[contig]['cargo'] for contig in ctg_stats])
        print gs_score, gc_score, g_bbone, g_cargo,
        if gs_score > threshold:
            ## run plotters again
            ## pass the genome on to the next step (others will be dropped)
            print "MATCH"
        else:
            print "(-)"

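# The "detect redundant contigs" step above is only outlined in comments.
# One possible reading of that outline is sketched below, assuming
# 'inv_mask' is True where a contig has NO hit against a reference segment
# (as built in filter_contigs). This is an illustration of the mask_or idea,
# not the pipeline's actual redundancy filter.
import numpy as np

def find_redundant_pairs_sketch(ctg_stats):
    """Return pairs of contig names whose hit profiles overlap on >=1 segment."""
    names = sorted(ctg_stats)
    redundant = []
    for i, name_a in enumerate(names):
        for name_b in names[i+1:]:
            combined = np.ma.mask_or(ctg_stats[name_a]['inv_mask'],
                                     ctg_stats[name_b]['inv_mask'])
            if not combined.all():
                # some segment is hit by both contigs
                redundant.append((name_a, name_b))
    return redundant

# Example (two hypothetical contigs over four reference segments):
# find_redundant_pairs_sketch({
#     'ctgA': {'inv_mask': np.array([False, True, True, True])},
#     'ctgB': {'inv_mask': np.array([False, False, True, True])}})
# -> [('ctgA', 'ctgB')]    # both hit segment 0
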
def align_cstrct2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs, genomes,
                     max_size, chop_mode, mtype, mauve_exec):
    """Align constructs pairwise to the reference contig."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/constructs/"
    segments_root = run_root+run_dirs['aln_seg_dir']+ref_n+"/constructs/"
    scaff_root = run_root+run_dirs['scaffolds_dir']+ref_n+"/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Align scaffold constructs to reference @",
                         timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        scaff_gbk = scaff_root+g_name+"_"+ref_n+"_scaffold.gbk"
        file_list = (ref_ctg_file, scaff_gbk)
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root+g_name+"/"
        aln_segs_dir = segments_root+g_name+"/"
        ensure_dir([mauve_dir, aln_segs_dir])
        mauve_outfile = mauve_dir+g_name+"_"+ref_n+".mauve"
        segfile = aln_segs_dir+g_name+"_"+ref_n+"_segs.txt"
        # abort if the reference file is not found
        try:
            open(ref_ctg_file, 'r')
        except IOError:
            msg = "ERROR: Reference file not found"
            print msg
            run_ref.log(msg)
            raise
        # skip this genome if there is no scaffold construct to align
        try:
            open(scaff_gbk, 'r')
        except IOError:
            msg = "WARNING: No scaffold construct to align"
            print msg
            run_ref.log(msg)
        else:
            # prep segments file
            open(segfile, 'w').write('')
            # purge any pre-existing sslist file
            sslist_file = scaff_gbk+".sslist"
            if os.path.isfile(sslist_file):
                try:
                    os.remove(sslist_file)
                except Exception:
                    raise
            # do Mauve alignment
            align_mauve(file_list, mauve_outfile, mauve_exec)
            try:
                # parse Mauve output (without initial clumping)
                coords = mauver_load2_k0(mauve_outfile+".backbone", 0, mtype)
                print len(coords), '->',
                logstring = "".join(["\t", str(len(coords))])
                run_ref.log(logstring)
                # chop segments that are too long
                chop_array = chop_rows(coords, max_size, chop_mode, mtype)
                print len(chop_array), 'segments <', max_size, 'bp',
                logstring = "".join(["\t", str(len(chop_array))])
                run_ref.log(logstring)
                # make detailed pairwise alignments of the segments
                ref_rec = load_genbank(ref_ctg_file)
                query_rec = load_genbank(scaff_gbk)
                id = iter_align(chop_array, ref_rec, query_rec, aln_segs_dir,
                                segfile)
                print "@", id, "% id. overall"
                logstring = "".join(["\t", str(id)])
                run_ref.log(logstring)
            except IOError:
                msg = "\nERROR: Mauve alignment failed"
                run_ref.log(msg)
                print msg

def glompX_blast_out(genomes, run_ref, blast_mode, r_root_dir, run_dirs,
                     run_id, fixed_dirs, blast_dtypes, references,
                     min_nt_match, min_nt_score, min_nt_idp, min_aa_match,
                     min_aa_score, min_aa_idp, capture_span, timestamp):
    """Collect Blast results and extract match contigs."""
    # load inputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    match_root = run_root+run_dirs['match_out_dir']+ref_n+"/"
    capture_root = run_root+run_dirs['capture_dir']+ref_n+"/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Collect Blast results @", timestamp, "\n\n"])
    run_ref.log(logstring)
    # collect results
    ref_hits = {}
    control_scores = []
    run_ref.log("Segs/Gs\t")
    run_ref.log("\t".join([genome['name'] for genome in genomes]))
    for seg in run_ref.segs:
        seg_n = seg['name']
        print "\t", seg_n, "...",
        run_ref.log("".join(["\n", seg_n]))
        blast_dir = run_root+run_dirs['blast_out_dir']+ref_n+"/"+seg_n+"/"
        capture_dir = capture_root+"/"+seg_n+"/"
        ensure_dir([blast_dir, capture_dir])
        ref_flag = True
        for genome in genomes:
            g_name = genome['name']
            print "|",
            # process
            if g_name not in ref_hits.keys():
                ref_hits[g_name] = {}
            matches_dir = match_root+g_name+"/"
            ensure_dir([matches_dir])
            blast_infile = blast_dir+g_name+"_out.txt"
            genome_ctg_dir = fixed_dirs['fas_contigs_dir']+g_name+"/"
            rec_array = read_array(blast_infile, blast_dtypes)
            if len(rec_array) > 0:
                # take qualified hits
                p_cnt = 0
                n_cnt = 0
                if g_name in [ref['name'] for ref in references]:
                    copyfile(genome_ctg_dir+g_name+"_1.fas",
                             matches_dir+g_name+".fas")
                    if ref_flag:
                        # positive control TODO: better solution
                        control_scores.append(rec_array[0][11])
                        ref_flag = False
                for line in rec_array:
                    idp = line[2]
                    q_start, q_stop = line[8], line[9]
                    score = line[11]
                    length = abs(q_stop-q_start)
                    # check the blast mode to use the right thresholds
                    if blast_mode == 'n' or blast_mode == 'tx':
                        min_match = min_nt_match
                        min_score = min_nt_score
                        min_idp = min_nt_idp
                    elif blast_mode == 'tn':
                        min_match = min_aa_match
                        min_score = min_aa_score
                        min_idp = min_aa_idp
                    else:
                        # default to nucleotide mode
                        min_match = min_nt_match
                        min_score = min_nt_score
                        min_idp = min_nt_idp
                    if length > min_match and score > min_score \
                            and idp > min_idp:
                        print "+",
                        p_cnt +=1
                        contig_id = line[1]
                        if contig_id not in ref_hits[g_name].keys():
                            ref_hits[g_name][contig_id] = {seg_n: score}
                        else:
                            ref_hits[g_name][contig_id][seg_n] = score
                        pattern = re.compile(r'('+contig_id+')\.fas')
                        for item in listdir(genome_ctg_dir):
                            match = re.match(pattern, item)
                            if match:
                                fas_file = matches_dir+match.group(1)+".fas"
                                if not path.exists(fas_file):
                                    copyfile(genome_ctg_dir+item, fas_file)
                        # context capture
                        # (segment names may be numeric or not; try both)
                        try:
                            capture_flag = int(seg_n) in run_ref.capture
                        except ValueError:
                            capture_flag = seg_n in run_ref.capture
                        if capture_flag:
                            # load the sequence
                            contig_file = matches_dir+contig_id+".fas"
                            contig_rec = load_fasta(contig_file)
                            # check orientation
                            if q_start < q_stop:
                                c_start = q_start-capture_span
                                c_stop = q_stop+capture_span
                            else:
                                c_start = q_stop-capture_span
                                c_stop = q_start+capture_span
                            print c_start, c_stop
                            # check limits
                            if c_start < 0:
                                c_start = 1
                            if c_stop > len(contig_rec.seq):
                                c_stop = len(contig_rec.seq)
                            # proceed
                            cxt_file = capture_dir+g_name+"_"+contig_id+".fas"
                            cxt_rec = SeqRecord(id=contig_id+"_"
                                                   +str(c_start)+"_"
                                                   +str(c_stop),
                                                seq=contig_rec.seq
                                                    [c_start:c_stop])
                            write_fasta(cxt_file, cxt_rec)
                    else:
                        print "-",
                        n_cnt +=1
                if n_cnt > 0:
                    logstring = "".join(["\t", str(p_cnt),
                                         " (", str(n_cnt), ")"])
                else:
                    logstring = "".join(["\t", str(p_cnt)])
                run_ref.log(logstring)
            else:
                print "-",
                run_ref.log("".join(["\t", "0"]))
        print ""
    return ref_hits, control_scores

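# The context-capture step above widens a BLAST hit by capture_span on each
# side and clamps the window to the contig boundaries, handling hits
# reported on either strand (q_start > q_stop). A standalone sketch of just
# that window arithmetic, using the same clamping convention (lower bound 1):
def capture_window_sketch(q_start, q_stop, capture_span, contig_length):
    """Return (c_start, c_stop) for the padded, clamped capture window."""
    low, high = min(q_start, q_stop), max(q_start, q_stop)
    c_start = low - capture_span
    c_stop = high + capture_span
    if c_start < 0:
        c_start = 1
    if c_stop > contig_length:
        c_stop = contig_length
    return c_start, c_stop

# capture_window_sketch(350, 120, 500, 2000) -> (1, 850)
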
def prep_maps(run_ref, run_id, timestamp, g_select, r_root_dir, run_dirs,
              genomes, fixed_dirs, segtype, min_size, fct_flags, fct_colors,
              idpt):
    """Set up generation of various maps."""
    # set inputs and outputs
    ref_ctg_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ref_gbk = run_ref.gbk
    cst_root = run_root+run_dirs['scaffolds_dir']+ref_ctg_n+"/"
    segments_root = run_root+run_dirs['aln_seg_dir']+ref_ctg_n+"/"
    ctg_segs_root = segments_root+"contigs/"
    cst_segs_root = segments_root+"constructs/"
    maps_root = run_root+run_dirs['maps_dir']+ref_ctg_n+"/"
    ctg_aln_maps_root = maps_root+"contig_alns/"
    cst_ann_maps_root = maps_root+"constructs_annot/"
    cst_aln_maps_root = maps_root+"constructs_aln/"
    ensure_dir([cst_root, ctg_segs_root, cst_segs_root, maps_root,
                ctg_aln_maps_root, cst_ann_maps_root, cst_aln_maps_root])
    print " ", ref_ctg_n, "...",
    # log
    logstring = "".join(["\n\n# Generate maps @", timestamp, "\n\n"])
    run_ref.log(logstring)
    # map of reference with segment details
    map_ref_segs(run_ref, run_id, r_root_dir, run_dirs, min_size, fct_flags,
                 fct_colors, idpt)
    # log
    logstring = "ref_map"
    run_ref.log(logstring)
    print logstring
    # cycle through genomes
    for genome in genomes:
        # set inputs and outputs
        g_name = genome['name']
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # skip genomes that are not in the selection (if one was given)
        try:
            if g_name not in g_select:
                continue
        except TypeError:
            pass    # no selection list: process all genomes
        print "\t", g_name, "...",
        scaff_gbk = cst_root+g_name+"_"+ref_ctg_n+"_scaffold.gbk"
        ctg_aln_maps_dir = ctg_aln_maps_root+g_name+"/"
        ensure_dir([ctg_aln_maps_dir])
        # maps of contigs aligned to reference
        logstring = "ctg_aln"
        print logstring,
        logstring = "".join(["\t", logstring])
        run_ref.log(logstring)
        map_ctg_alns(run_ref, ref_gbk, genome, ctg_segs_root,
                     ctg_aln_maps_dir, fixed_dirs, segtype, min_size,
                     fct_flags, fct_colors, idpt)
        # map of scaffold construct
        logstring = "cst_ant"
        print logstring,
        logstring = "".join(["\t", logstring])
        run_ref.log(logstring)
        map_cst_annot(run_ref, genome, scaff_gbk, cst_ann_maps_root,
                      fct_flags, fct_colors)
        # map of construct aligned to reference
        logstring = "cst_aln"
        print logstring
        logstring = "".join(["\t", logstring])
        run_ref.log(logstring)
        map_cst_aln(run_ref, ref_gbk, genome, scaff_gbk, cst_segs_root,
                    cst_aln_maps_root, segtype, min_size, fct_flags,
                    fct_colors, idpt)

def align_ctg2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs, genomes,
                  mauve_exec, max_size, chop_mode, mtype):
    """Align contigs pairwise to the reference contig."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/"
    segments_root = run_root+run_dirs['aln_seg_dir']+ref_n+"/contigs/"
    q_ctgs_root = run_root+run_dirs['match_out_dir']+ref_n+"/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Align contigs to ref @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs and outputs
        g_name = genome['name']
        ctgs_fas_dir = q_ctgs_root+g_name+"/"
        mauve_dir = mauve_root+g_name+"/"
        aln_segs_root = segments_root+g_name+"/"
        ensure_dir([mauve_dir])
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # list contig FastA files in the matches directory
        dir_contents = listdir(ctgs_fas_dir)
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.fas$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)
                print ctg_num,
                logstring = "".join(["\t", ctg_num])
                run_ref.log(logstring)
                # set inputs and outputs
                q_contig = ctgs_fas_dir+item
                file_list = (ref_ctg_file, q_contig)
                mauve_outfile = mauve_dir+ctg_num+".mauve"
                aln_segs_dir = aln_segs_root+ctg_num+"/"
                ensure_dir([aln_segs_dir])
                segfile = aln_segs_dir+ctg_num+"_"+ref_n+"_segs.txt"
                open(segfile, 'w').write('')
                # do Mauve alignment
                try:
                    open(ref_ctg_file, 'r')
                    open(q_contig, 'r')
                except IOError:
                    msg = "\nERROR: File missing, cannot align\n\t\t\t"
                    run_ref.log(msg)
                    print msg
                else:
                    align_mauve(file_list, mauve_outfile, mauve_exec)
                    try:
                        # parse Mauve output (without initial clumping)
                        coords = mauver_load2_k0(mauve_outfile+".backbone",
                                                 0, mtype)
                        # chop segments that are too long
                        chop_array = chop_rows(coords, max_size, chop_mode,
                                               mtype)
                        # make detailed pairwise alignments of the segments
                        ref_rec = load_genbank(ref_ctg_file)
                        query_rec = load_fasta(q_contig)
                        iter_align(chop_array, ref_rec, query_rec,
                                   aln_segs_dir, segfile)
                    except IOError:
                        msg = "\nERROR: Mauve alignment failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
                    except Exception:
                        msg = "\nERROR: Iteration failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
        print ""

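# Contig numbers are recovered from file names with the same regex pattern
# used above. A quick standalone check of that pattern on a hypothetical
# file name, to show which group is captured:
import re

def contig_number_sketch(filename):
    """Return the contig number encoded in '<genome>_<n>.fas', or None."""
    match = re.compile(r'.*_(\d*)\.fas$').match(filename)
    return match.group(1) if match else None

# contig_number_sketch("sampleA_contigs_12.fas") -> '12'
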