Example #1
    def __call__(self, identifier, partner_identifier, row_target, col_target,
                 scale, reduction_axis):

        scale_dir = ensure_dir(os.path.join(self.work_dir, str(scale)))
        next_scale_dir = ensure_dir(os.path.join(self.work_dir,
                                                 str(scale + 1)))
        temp_dir = ensure_dir(os.path.join(scale_dir, "work"))

        input_img = [identifier, partner_identifier]
        axis_name = ['rows', 'cols']
        print("Partners over {}: {} and {}".format(
            axis_name[reduction_axis], os.path.basename(identifier),
            os.path.basename(partner_identifier)))

        target_name = "{}_{}".format(row_target, col_target)
        target_file = "{}.{}".format(target_name, img_data_fmt)
        dir_stitch = setup_directories(os.path.join(temp_dir, target_name))
        stitched_img = os.path.join(next_scale_dir, target_file)

        if reduction_axis == 0:
            success = assemble_row(dir_stitch, input_img, stitched_img)
        else:
            success = assemble_column(dir_stitch, input_img, stitched_img)

        return success, stitched_img
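
All of the examples on this page call a project-local ensure_dir helper rather than a shared library function, and the exact contract differs between projects (some variants return the path, others accept a list of paths). The following is only a minimal sketch, assuming create-if-missing semantics; it is illustrative, not any one project's implementation.

import os

def ensure_dir(path):
    # Create the directory (or each directory in a list) if it does not
    # exist yet, then hand the argument back so calls can be chained.
    if isinstance(path, (list, tuple)):
        for p in path:
            ensure_dir(p)
        return path
    if not os.path.isdir(path):
        os.makedirs(path)
    return path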
Example #2
def mux_batch_blast(dataset, bin_type):
    """Send batch jobs to Blast. Muxes to multiple reference DBs."""
    # Identify the genome
    nickname = dataset['nickname']
    # Determine the input file root
    root_dir = dirs['parcels_dir']+nickname+"/"
    file_root = root_dir+nickname+bin_type
    # Identify the references to blast against
    ref_nicks = dataset['ref_nicks']
    for ref_nick in ref_nicks:
        # Identify Blast DB
        db_path = dirs['blast_db_dir']+ref_nick
        # Prep output directory
        out_dir = dirs['blast_out_dir']+nickname+"/"+ref_nick+"/"
        ensure_dir(out_dir)
        # Signal process start
        print "--- Blasting", nickname+bin_type, "against", ref_nick, "---"
        print datetime.now()
        index = 1
        while os.path.isfile(file_root+"_"+str(index)+".fas"):
            query_file = file_root+"_"+str(index)+".fas"
            outfile = out_dir+nickname+bin_type+"_"+str(index)+"_blast.out"
            print "\tblasting", query_file
            local_blastn_2file(query_file, db_path, outfile, blast_prefs)
            index +=1
        print "--- Finished BLAST run ---"
        print datetime.now()
        print index, "parcel files blasted"
    return "OK"
Example #3
def build_a_bear():
    """
    the output of bear differs between versions, so we build the
    latest bear rather than trying to support multiple versions.
    FIXME: might be better to handle multiple versions instead.
    """
    if os.path.isdir(c.BEAR_PREFIX):
        logging.debug("skipping Bear installation")
        return

    # download
    if not os.path.isfile(c.BEAR_ARCHIVE):
        curl = get_cmd_or_die("curl")
        curl['-s', c.BEAR_URL, '-o', c.BEAR_ARCHIVE] & pb.TEE

    # remove any existing build dir since we don't know if
    # bear was built for the current host environment.
    if os.path.isdir(c.BEAR_SRC):
        shutil.rmtree(c.BEAR_SRC, ignore_errors=True)

    # unpack
    tar = get_cmd_or_die("tar")
    with pb.local.cwd(c.DEPS_DIR):
        tar['xf', c.BEAR_ARCHIVE] & pb.TEE

    # cmake
    bear_build_dir = os.path.join(c.BEAR_SRC, "build")
    bear_install_prefix = "-DCMAKE_INSTALL_PREFIX=" + c.BEAR_PREFIX
    ensure_dir(bear_build_dir)
    with pb.local.cwd(bear_build_dir):
        cmake = get_cmd_or_die("cmake")
        cmake["..", bear_install_prefix] & pb.TEE
        make = get_cmd_or_die("make")
        make["install"] & pb.TEE
Example #4
def apply_blankfield(files, blank_field_file, dest_dir, percentage=100,
                     blur_alpha=0.7, threads=1, is_dark=False):

    files = sorted(files)

    ensure_dir(dest_dir)

    blank_field = image_load_resize(blank_field_file, percentage)
    if blur_alpha > 0:
        blank_field = omomorphic_shading_extraction(blank_field, blur_alpha)

    blankfield_wgt = get_blankfield_weights(blank_field)

    ims = (image_load_resize(i, percentage) for i in files)
    job_args = [ (blankfield_wgt, im_i, f, dest_dir) for f, im_i
                in itertools.izip(files, ims) ]
    if threads == 1:
        results = [ apply_blankfield_weights(*args) for args in job_args ]
    else:
        pool = mp.Pool(processes=threads)
        jobs = [ pool.apply_async(apply_blankfield_weights, args) for args
                in job_args ]
        pool.close()
        pool.join()
        results = [ job.get() for job in jobs ]
    success = all(results)

    return success
Example #5
def generate_write_blankfield(files_in, output_dir, percentage=100, threads=1):

    bfield = generate_blankfield(files_in, int(percentage), threads)
    ensure_dir(output_dir)
    success = cv2.imwrite(os.path.join(output_dir, "blankfield-stat.png"),
                          bfield)
    return success
Example #6
def simple_q2a(dataset, trim_file):
    """."""
    # Identify the genome
    nickname = dataset['nickname']
    # Identify the trim file type
    ttype = trim_file['type']
    # Identify the source file
    source_file = trim_file['name']
    # Prep output files
    dir_root = dirs['mft_dir']+nickname+"/"
    ensure_dir(dir_root)
    out_file = dir_root+nickname+ttype+'.fas'
    track_file = dir_root+nickname+ttype+'_track.txt'
    # Save filenames for later reference
    dataset['mft_files'].append({'type': ttype,
                                 'name': out_file,
                                 'track': track_file})
    # Signal the process start
    print "-- Converting ", nickname+ttype, "to multifasta --"
    print datetime.now()
    # Set up iterator
    multifasta = open(out_file, 'w')
    tracker = open(track_file, 'w')
    read_count = 0
    for title, seq, qual in FastqGeneralIterator(open(source_file)) :
        read_count +=1
        id_string = nickname+"_"+str(read_count)
        multifasta.write(">"+id_string+"\n"+seq+"\n")
        tracker.write(title+"\t"+id_string+"\n")
        if read_count%100000==0:
            print "\t"+str(read_count), "reads processed"
    multifasta.close()
    tracker.close()
    return read_count
Example #7
def chop_multifasta(dataset, mft_file):
    """Split a master file into smaller multifasta files.

    This is useful for making reasonably-sized batch BLAST jobs.
    Iterator function adapted from http://biopython.org/wiki/Split_large_file

    """
    # Identify the genome
    nickname = dataset['nickname']
    # Identify the trim file type
    ttype = mft_file['type']
    # Prep output file
    dir_root = dirs['parcels_dir']+nickname+"/"
    out_file_root = dir_root+nickname+ttype
    ensure_dir(dir_root)
    # Save filenames for later reference
    dataset['parcel_files'] = {'root': out_file_root, 'suffix': '.fas'}
    # Unpack chopping parameters
    parceln = chop_param['parceln']
    # Signal the process start
    print "-- Splitting ", nickname+ttype, "into batches of", parceln, "reads --"
    print datetime.now()
    # Set up iterator function
    record_iter = SeqIO.parse(open(mft_file['name']),"fasta")
    parcel_files = []
    for i, batch in enumerate(batch_iterator(record_iter, parceln)) :
        filename = out_file_root+"_%i.fas" % (i+1)
        handle = open(filename, "w")
        count = SeqIO.write(batch, handle, "fasta")
        handle.close()
        parcel_files.append(filename)
        print "\twrote %i records to %s" % (count, filename)
    print "-- Finished --"
    print datetime.now()
    return len(parcel_files)
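
The batch_iterator helper used above is not part of this snippet; a minimal sketch following the generator recipe on the Biopython wiki page cited in the docstring (assumed, not the project's own code):

def batch_iterator(iterator, batch_size):
    # Collect records from the iterator and yield them in lists of at most
    # batch_size items; the final batch may be shorter.
    batch = []
    for entry in iterator:
        batch.append(entry)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch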
Example #8
def _main():
    if on_mac():
        die("Cross-checking is only supported on Linux hosts.")

    setup_logging()
    logging.debug("args: %s", " ".join(sys.argv))

    # earlier plumbum versions are missing features such as TEE
    if pb.__version__ < c.MIN_PLUMBUM_VERSION:
        err = "locally installed version {} of plumbum is too old.\n" \
            .format(pb.__version__)
        err += "please upgrade plumbum to version {} or later." \
            .format(c.MIN_PLUMBUM_VERSION)
        die(err)

    args = _parse_args()
    if args.clean_all:
        logging.info("cleaning all dependencies and previous built files")
        shutil.rmtree(c.CLANG_XCHECK_PLUGIN_BLD, ignore_errors=True)
        make = get_cmd_or_die('make')
        with pb.local.cwd(c.LIBFAKECHECKS_DIR):
            make('clean')

    # clang 3.6.0 is known to work; 3.4.0 known to not work.
    ensure_clang_version([3, 6, 0])
    # NOTE: it seems safe to disable this check now that we use
    # a rust-toolchain file for rustc versioning.
    # ensure_rustc_version(c.CUSTOM_RUST_RUSTC_VERSION)

    ensure_dir(c.CLANG_XCHECK_PLUGIN_BLD)
    ensure_dir(c.BUILD_DIR)
    git_ignore_dir(c.BUILD_DIR)

    build_clang_plugin(args)
Example #9
  def test_js_engine_path(self):
    # Test that running JS commands works for node, d8, and jsc and is not path dependent
    restore_and_set_up()

    sample_script = test_file('print_args.js')

    # Fake some JS engines
    # Note that the path contains 'd8'.
    test_path = self.in_dir('fake', 'abcd8765')
    ensure_dir(test_path)

    jsengines = [('d8',     config.V8_ENGINE),
                 ('d8_g',   config.V8_ENGINE),
                 ('js',     config.SPIDERMONKEY_ENGINE),
                 ('node',   config.NODE_JS),
                 ('nodejs', config.NODE_JS)]
    for filename, engine in jsengines:
      try_delete(SANITY_FILE)
      if type(engine) is list:
        engine = engine[0]
      if not engine:
        print('WARNING: Not testing engine %s, not configured.' % (filename))
        continue

      print(filename, engine)

      test_engine_path = os.path.join(test_path, filename)
      with open(test_engine_path, 'w') as f:
        f.write('#!/bin/sh\n')
        f.write('exec %s $@\n' % (engine))
      make_executable(test_engine_path)

      out = self.run_js(sample_script, engine=test_engine_path, args=['--foo'])

      self.assertEqual('0: --foo', out.strip())
Example #10
    def __init__(self,
                 conf,
                 uuid,
                 namespace=None,
                 service=None,
                 pids_path=None,
                 default_cmd_callback=None,
                 cmd_addl_env=None,
                 pid_file=None,
                 run_as_root=False):

        self.conf = conf
        self.uuid = uuid
        self.namespace = namespace
        self.default_cmd_callback = default_cmd_callback
        self.cmd_addl_env = cmd_addl_env
        self.pids_path = pids_path or self.conf.external_pids
        self.pid_file = pid_file
        self.run_as_root = run_as_root

        if service:
            self.service_pid_fname = 'pid.' + service
            self.service = service
        else:
            self.service_pid_fname = 'pid'
            self.service = 'default-service'

        common_utils.ensure_dir(os.path.dirname(self.get_pid_file_name()))
Example #11
def _main():
    setup_logging()
    logging.debug("args: %s", " ".join(sys.argv))

    # FIXME: allow env/cli override of LLVM_SRC and LLVM_BLD
    # FIXME: check that cmake and ninja are installed
    # FIXME: option to build LLVM/Clang from master?

    args = _parse_args()

    if args.clean_all:
        logging.info("cleaning all dependencies and previous built files")
        shutil.rmtree(c.LLVM_SRC, ignore_errors=True)
        shutil.rmtree(c.LLVM_BLD, ignore_errors=True)
        shutil.rmtree(c.BUILD_DIR, ignore_errors=True)
        shutil.rmtree(c.AST_EXPO_PRJ_DIR, ignore_errors=True)
        cargo = get_cmd_or_die("cargo")
        with pb.local.cwd(c.ROOT_DIR):
            invoke(cargo, "clean")

    ensure_dir(c.LLVM_BLD)
    ensure_dir(c.BUILD_DIR)
    git_ignore_dir(c.BUILD_DIR)

    download_llvm_sources()
    configure_and_build_llvm(args)
    build_transpiler(args)
    print_success_msg(args)
Example #12
def _main():
    setup_logging()
    logging.debug("args: %s", " ".join(sys.argv))

    # earlier plumbum versions are missing features such as TEE
    if pb.__version__ < c.MIN_PLUMBUM_VERSION:
        err = "locally installed version {} of plumbum is too old.\n" \
            .format(pb.__version__)
        err += "please upgrade plumbum to version {} or later." \
            .format(c.MIN_PLUMBUM_VERSION)
        die(err)

    args = _parse_args()
    if args.clean_all:
        logging.info("cleaning all dependencies and previous built files")
        shutil.rmtree(c.CLANG_XCHECK_PLUGIN_BLD, ignore_errors=True)

    # prerequisites
    if not have_rust_toolchain(c.CUSTOM_RUST_NAME):
        die("missing rust toolchain: " + c.CUSTOM_RUST_NAME, errno.ENOENT)

    # clang 3.6.0 is known to work; 3.4.0 known to not work.
    ensure_clang_version([3, 6, 0])
    ensure_rustc_version(c.CUSTOM_RUST_RUSTC_VERSION)

    ensure_dir(c.CLANG_XCHECK_PLUGIN_BLD)
    ensure_dir(c.DEPS_DIR)
    git_ignore_dir(c.DEPS_DIR)

    build_clang_plugin(args)
Example #13
def importNavteq(options):
    ensure_dir(options.output_dir)
    netconvert = sumolib.checkBinary('netconvert')
    polyconvert = sumolib.checkBinary('polyconvert')

    for idx, config in enumerate(options.config.split(",")):
        netconvert_call = [netconvert, '--output-file', options.netfile, '-c', config]
        if idx > 0:
            tmp_net = os.path.join(
                options.output_dir, options.net_prefix + "_tmp.net.xml")
            os.rename(options.netfile, tmp_net)
            netconvert_call += ['--sumo-net-file', tmp_net]
        else:
            netconvert_call += ['--dlr-navteq', options.prefix]
        if options.verbose:
            print(' '.join(netconvert_call))
            sys.stdout.flush()
        subprocess.call(netconvert_call)

    polyconvertCmd = [
        polyconvert,
        '--verbose',
        '--dlr-navteq-poly-files', options.prefix + '_polygons.txt',
        #'--dlr-navteq-poi-files', options.prefix + '_points_of_interest.txt',
        '--output', os.path.join(options.output_dir, "shapes.xml"),
        '-n', options.netfile
    ]
    if options.verbose:
        print(polyconvertCmd)
        sys.stdout.flush()
    subprocess.call(polyconvertCmd)
Example #14
def annot_ref(ref_name, ctg_fas, prot_db_name, fixed_dirs, project_id,
              blast_prefs):
    """Annotate reference contig (predict ORFs and assign function)."""
    # locate the COG database
    prot_db = fixed_dirs['ref_dbs_dir']+prot_db_name
    # set inputs and outputs
    g_gbk_ctgs_root = fixed_dirs['gbk_contigs_dir']+ref_name+"/"
    ctg_cds_root = fixed_dirs['ctg_cds_dir']+ref_name+"/"
    ctg_prot_root = fixed_dirs['ctg_prot_dir']+ref_name+"/"
    ctg_blast_root = fixed_dirs['ctg_blast_dir']+ref_name+"/"
    annot_trn_root = fixed_dirs['annot_trn_dir']
    ensure_dir([g_gbk_ctgs_root, ctg_cds_root, ctg_prot_root,
                ctg_blast_root, annot_trn_root])
    trn_file = annot_trn_root+ref_name+"_annot.trn"
    g_ctg_gbk = g_gbk_ctgs_root+ref_name+"_1.gbk"
    annot_gbk = ctg_cds_root+ref_name+"_1_cds.gbk"
    annot_aa = ctg_prot_root+ref_name+"_1_aa.fas"
    blast_out = ctg_blast_root+ref_name+"_1.xml"
    if path.exists(blast_out) and os.stat(blast_out)[6]==0:
        os.remove(blast_out)
    if not path.exists(g_ctg_gbk):
        l_tag_base = ref_name+"_1"
        record = annot_ctg(ctg_fas, ctg_fas, annot_gbk,
                           annot_aa, trn_file, prot_db,
                           blast_out, l_tag_base, blast_prefs)
        record.description = ref_name+"_re-annotated"
        record.name = ref_name+"_1"
        record.dbxrefs = ["Project: "+project_id+"/"+ref_name
                          +"-like backbones"]
        record.seq.alphabet = generic_dna
        write_genbank(g_ctg_gbk, record)
    else:
        record = load_genbank(g_ctg_gbk)
    return record
Example #15
def _get_conf_base(cfg_root, uuid, ensure_conf_dir):
    #TODO(mangelajo): separate responsibilities here, ensure_conf_dir
    #                 should be a separate function
    conf_dir = os.path.abspath(os.path.normpath(cfg_root))
    conf_base = os.path.join(conf_dir, uuid)
    if ensure_conf_dir:
        common.ensure_dir(conf_dir)
    return conf_base
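
A hypothetical call, just to illustrate the result (the path and uuid are made up):

conf_base = _get_conf_base('/var/run/myservice/conf', 'a1b2c3d4', True)
# -> '/var/run/myservice/conf/a1b2c3d4', with the directory created
#    because ensure_conf_dir was True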
Example #16
def _get_conf_base(cfg_root, uuid, ensure_conf_dir):
    #TODO(mangelajo): separate responsibilities here, ensure_conf_dir
    #                 should be a separate function
    conf_dir = os.path.abspath(os.path.normpath(cfg_root))
    conf_base = os.path.join(conf_dir, uuid)
    if ensure_conf_dir:
        common.ensure_dir(conf_dir)
    return conf_base
Example #17
def run_fires(site, region):
    url = site + '/jobs/archive'
    try:
        p = getPage(url)
    except Exception as e:
        logging.error("Can't load {}".format(url))
        logging.error(e)
        return None
    a = p.findAll('a')
    zips = [x.get('href') for x in a if x.get('href').endswith('.zip')]
    fires = sorted(set([x[x.rindex('/') + 1:x.index('_')] for x in zips]))
    times = {}
    recent = {}
    simtimes = {}
    dates = []
    totaltime = 0
    dir_download = common.ensure_dir(os.path.join(DIR, region))
    dir_ext = common.ensure_dir(os.path.join(EXT_DIR, region))
    logging.debug("Checking {} fires".format(len(fires)))
    for f in fires:
        times[f] = [
            datetime.datetime.strptime(x[x.rindex('_') + 1:x.rindex('.')],
                                       '%Y%m%d%H%M%S%f') for x in zips
            if x[x.rindex('/') + 1:x.index('_')] == f
        ]
        recent[f] = {
            'time':
            max(times[f]),
            'url': [
                x for x in zips
                if x[x.rindex('/') +
                     1:x.index('_')] == f and datetime.datetime.strptime(
                         x[x.rindex('_') +
                           1:x.rindex('.')], '%Y%m%d%H%M%S%f') == max(times[f])
            ][0],
        }
        logging.debug('{}: {}'.format(f, recent[f]['time']))
        z = common.save_http(dir_download,
                             site + recent[f]['url'],
                             ignore_existing=True)
        cur_dir = os.path.join(dir_ext, os.path.basename(z)[:-4])
        common.unzip(z, cur_dir)
        fgmj = os.path.join(cur_dir, 'job.fgmj')
        if os.path.exists(fgmj):
            try:
                t0 = timeit.default_timer()
                log_name = firestarr.do_run(fgmj)
                t1 = timeit.default_timer()
                if log_name is not None:
                    simtimes[f] = t1 - t0
                    totaltime = totaltime + simtimes[f]
                    logging.info("Took {}s to run {}".format(simtimes[f], f))
                    d = os.path.basename(os.path.dirname(log_name))[:8]
                    if d not in dates:
                        dates.append(d)
            except Exception as e:
                logging.error(e)
    return simtimes, totaltime, dates
Example #18
def make_genome_DB(genome, fixed_dirs):
    """Make a Blast DB from a genome FastA file."""
    # load inputs
    fas_dir = fixed_dirs['mfas_contigs_dir']
    db_dir = fixed_dirs['blast_db_dir']
    ensure_dir([fas_dir, db_dir])
    g_name = genome['name']
    # make DB
    make_blastDB(db_dir+g_name, fas_dir+g_name+'_contigs.fas', 'nucl')
Example #19
def _main():
    setup_logging()
    logging.debug("args: %s", " ".join(sys.argv))

    # FIXME: allow env/cli override of LLVM_SRC, LLVM_VER, and LLVM_BLD
    # FIXME: check that cmake and ninja are installed
    # FIXME: option to build LLVM/Clang from master?

    # earlier plumbum versions are missing features such as TEE
    if pb.__version__ < c.MIN_PLUMBUM_VERSION:
        err = "locally installed version {} of plumbum is too old.\n" \
            .format(pb.__version__)
        err += "please upgrade plumbum to version {} or later." \
            .format(c.MIN_PLUMBUM_VERSION)
        die(err)

    args = _parse_args()
    if args.clean_all:
        logging.info("cleaning all dependencies and previous built files")
        shutil.rmtree(c.LLVM_SRC, ignore_errors=True)
        shutil.rmtree(c.LLVM_BLD, ignore_errors=True)
        shutil.rmtree(c.DEPS_DIR, ignore_errors=True)

    # prerequisites
    if not have_rust_toolchain(c.CUSTOM_RUST_NAME):
        die("missing rust toolchain: " + c.CUSTOM_RUST_NAME, errno.ENOENT)

    # clang 3.6.0 is known to work; 3.4.0 known to not work.
    ensure_clang_version([3, 6, 0])
    ensure_rustc_version(c.CUSTOM_RUST_RUSTC_VERSION)

    ensure_dir(c.LLVM_BLD)
    ensure_dir(c.DEPS_DIR)
    git_ignore_dir(c.DEPS_DIR)

    if on_linux():
        build_a_bear()
        if not os.path.isfile(c.BEAR_BIN):
            die("bear not found", errno.ENOENT)

    download_llvm_sources()

    integrate_ast_exporter()

    cc_db = install_tinycbor()

    configure_and_build_llvm(args)

    # NOTE: we're not doing this anymore since it is
    # faster and takes less space to simply pull the
    # prebuilt nightly binaries with rustup
    # download_and_build_custom_rustc(args)

    build_ast_importer(args.debug)

    if not on_mac() and args.sanity_test:
        test_ast_exporter(cc_db)
Example #20
def make_ref_DB(reference, run_id, fixed_dirs, r_root_dir, run_dirs):
    """Make a Blast DB from a reference FastA file."""
    # load inputs
    fas_dir = r_root_dir+run_id+"/"+run_dirs['ref_fas_dir']
    db_dir = fixed_dirs['blast_db_dir']
    ensure_dir([fas_dir, db_dir])
    g_name = reference['name']
    # make DB
    make_blastDB(db_dir+g_name, fas_dir+g_name+'.fas', 'nucl')
Example #21
def _main():
    setup_logging()
    logging.debug("args: %s", " ".join(sys.argv))

    # FIXME: allow env/cli override of LLVM_SRC, LLVM_VER, and LLVM_BLD
    # FIXME: check that cmake and ninja are installed
    # FIXME: option to build LLVM/Clang from master?

    # earlier plumbum versions are missing features such as TEE
    if pb.__version__ < c.MIN_PLUMBUM_VERSION:
        err = "locally installed version {} of plumbum is too old.\n" \
            .format(pb.__version__)
        err += "please upgrade plumbum to version {} or later." \
            .format(c.MIN_PLUMBUM_VERSION)
        die(err)

    args = _parse_args()

    # prerequisites
    if not have_rust_toolchain(c.CUSTOM_RUST_NAME):
        die("missing rust toolchain: " + c.CUSTOM_RUST_NAME, errno.ENOENT)

    # clang 3.6.0 is known to work; 3.4.0 known to not work.
    ensure_clang_version([3, 6, 0])

    if args.clean_all:
        logging.info("cleaning all dependencies and previous built files")
        shutil.rmtree(c.LLVM_SRC, ignore_errors=True)
        shutil.rmtree(c.LLVM_BLD, ignore_errors=True)
        shutil.rmtree(c.DEPS_DIR, ignore_errors=True)
        shutil.rmtree(c.AST_EXPO_PRJ_DIR, ignore_errors=True)
        cargo = get_cmd_or_die("cargo")
        with pb.local.cwd(c.ROOT_DIR):
            invoke(cargo, "clean")

    ensure_dir(c.LLVM_BLD)
    ensure_dir(c.DEPS_DIR)
    git_ignore_dir(c.DEPS_DIR)

    download_llvm_sources()

    update_cmakelists()

    configure_and_build_llvm(args)

    build_transpiler(args)

    # print a helpful message on how to run c2rust bin directly
    c2rust_bin_path = 'target/debug/c2rust' if args.debug \
                      else 'target/release/c2rust'
    c2rust_bin_path = os.path.join(c.ROOT_DIR, c2rust_bin_path)
    # if os.path.curdir
    abs_curdir = os.path.abspath(os.path.curdir)
    common_path = os.path.commonpath([abs_curdir, c2rust_bin_path])
    if common_path != "/":
        c2rust_bin_path = "." + c2rust_bin_path[len(common_path):]
    print("success! you may now run", c2rust_bin_path)
Example #22
def make_fake_llc(filename, targets):
  """Create a fake llc that only handles --version and writes target
  list to stdout.
  """
  print('make_fake_llc: %s' % filename)
  ensure_dir(os.path.dirname(filename))
  with open(filename, 'w') as f:
    f.write('#!/bin/sh\n')
    f.write('echo "llc fake output\nRegistered Targets:\n%s"' % targets)
  make_executable(filename)
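
A hypothetical use in a sanity test (the path and target string are made up, but follow the "name - description" layout that llc --version prints under "Registered Targets:"):

make_fake_llc('/tmp/fake/bin/llc', 'wasm32 - WebAssembly 32-bit')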
Example #23
    def extract_natives(self):
        if not self.metadata:
            self.get_meta()

        natives_tmpdir = os.path.join(self.version_directory,
                                      'natives-' + str(int(time.time())))
        ensure_dir(natives_tmpdir)

        for lib in self.metadata['libraries']:
            skiplib = False

            # Check Rules
            if 'rules' in lib:
                for rule in lib['rules']:
                    if 'action' in rule:
                        if rule['action'] == 'allow' and 'os' in rule:
                            if not rule['os']['name'] == platform():
                                skiplib = True
                        if rule['action'] == 'noallow' and 'os' in rule:
                            if rule['os']['name'] == platform():
                                skiplib = True
            if skiplib:
                continue

            # Skip non-download-included for now
            if not 'downloads' in lib:
                continue

            dl = lib['downloads']

            if 'natives' in lib and 'extract' in lib:
                if platform() in lib['natives']:
                    platform_native = lib['natives'][platform()]

                    if platform_native in dl['classifiers']:
                        try:
                            zip_ref = zipfile.ZipFile(
                                os.path.join(
                                    self.client_root, 'libraries',
                                    dl['classifiers'][platform_native]
                                    ['path']), 'r')
                            zip_ref.extractall(natives_tmpdir)
                            zip_ref.close()
                        except Exception as e:
                            print(
                                'Failed to extract native library %s due to errors.'
                                % (lib['name']))
                            raise e

        metainf = os.path.join(natives_tmpdir, 'META-INF')

        if os.path.exists(metainf):
            shutil.rmtree(metainf)

        self.natives = natives_tmpdir
Example #24
def start_new_game():
    # ask for player names
    try:
        # ask for name of game
        game_name = input("Enter name of game (games with the same name will be overwritten): ")
        # ask for number of worlds
        num_worlds = int(
            input("Enter number of worlds (the less, the earlier the complete wavefunction collapse will happen): "))
        #
        i = 1
        name = "temp"
        print(
            "Entering names of players. Press enter without a name if you're finished. If you need to make changes later, "
            "you can opt to edit the configuration directly")
        player_names = []
        while name != "":
            print()
            name = input("Please enter name of player %02d: " % i)
            if name != "":
                player_names.append(name)
            i += 1
        # ask for number of villagers, seer, players
        while True:
            num_villagers = int(input("Enter number of villagers: "))
            num_wolves = int(input("Enter number of wolves: "))
            num_seers = int(input("Enter number of seers: "))
            if (num_seers + num_wolves + num_villagers) != len(player_names):
                print("Number of players / number of roles mismatch.")
            else:
                break
    except ValueError:
        print("Invalid input. Please repeat the setup process.")
        start_new_game()
        return

    # save to configuration
    game_config = {
        "name": game_name,
        "num_worlds": num_worlds,
        "players": player_names,
        "num_villagers": num_villagers,
        "num_wolves": num_wolves,
        "num_seers": num_seers
    }
    game_dir = os.path.join("games", game_name)
    ensure_dir(game_dir)
    game_config["game_dir"] = game_dir

    with open(os.path.join(game_dir, "config"), "w+") as f:
        json.dump(game_config, f, indent=2)
    print("Setup completed successfully.")

    # load game from config
    game = Game(dict_to_game_config(game_config))
    play_game(game, game_config, True, 0)
Example #25
def make_fake_tool(filename, version, report_name=None):
  if not report_name:
    report_name = os.path.basename(filename)
  print('make_fake_tool: %s' % filename)
  ensure_dir(os.path.dirname(filename))
  with open(filename, 'w') as f:
    f.write('#!/bin/sh\n')
    f.write('echo "%s version %s"\n' % (report_name, version))
    f.write('echo "..."\n')
    f.write('exit 0\n')
  make_executable(filename)
Example #26
def ensure_directory_exists_without_file(path):
    dirname = os.path.dirname(path)
    if os.path.isdir(dirname):
        try:
            os.unlink(path)
        except OSError:
            with excutils.save_and_reraise_exception() as ctxt:
                if not os.path.exists(path):
                    ctxt.reraise = False
    else:
        common.ensure_dir(dirname)
Example #27
def ensure_directory_exists_without_file(path):
    dirname = os.path.dirname(path)
    if os.path.isdir(dirname):
        try:
            os.unlink(path)
        except OSError:
            with excutils.save_and_reraise_exception() as ctxt:
                if not os.path.exists(path):
                    ctxt.reraise = False
    else:
        common.ensure_dir(dirname)
Example #28
def make_qiime_reports(dataset, run_id):
    """Generate HTML output for reporting with Qiime.

    Makes an interactive heatmap, a Cytoscape network and a summary of
    community composition.
    """
    # identify inputs and outputs
    set_id = dataset['set_id']
    print " ", set_id
    run_root = root_dir+set_id+"/"+run_id+"/"
    otus_dir = run_root+dirs['otus']
    table_file = otus_dir+run_id+"_otu_table.txt"
    map_file = otus_dir+run_id+"_map.txt"
    heatmap_dir = run_root+dirs['reports']+"otu_heatmap"
    network_dir = run_root+dirs['reports']
    wf_taxa_sum = run_root+dirs['reports']+"communities"
    ensure_dir(heatmap_dir)
    ensure_dir(network_dir)
    # generate a dummy Qiime map file
    dummy_comps = ["#SampleID Barcode Primer Treat DOB Descript",
                   "#Dummy map file to make Qiime happy"]
    for sample_id in dataset['samples']:
        dummy_comps.append("\t".join([sample_id,"NA","NA","NA","NA","NA"]))
    open(map_file, 'w').write("\n".join(dummy_comps))
    # make OTU heatmap
    comps = ["macqiime", "make_otu_heatmap_html.py", "-i", table_file,
             "-o", heatmap_dir]
    cline = " ".join(comps)
    try:
        child = subprocess.Popen(str(cline), stdout=subprocess.PIPE,
                                 shell=True)
        output, error = child.communicate()
    except: raise
    else: print "\t", "OTU heatmap generated"
    # make OTU network
    comps = ["macqiime", "make_otu_network.py", "-i", table_file,
             "-m", map_file, "-o", network_dir]
    cline = " ".join(comps)
    try:
        child = subprocess.Popen(str(cline), stdout=subprocess.PIPE,
                                 shell=True)
        output, error = child.communicate()
    except: raise
    else: print "\t", "OTU network generated"
    # summarize communities by taxonomic composition
    comps = ["macqiime", "summarize_taxa_through_plots.py", "-i", table_file,
             "-o", wf_taxa_sum, "-m", map_file]
    cline = " ".join(comps)
    try:
        child = subprocess.Popen(str(cline), stdout=subprocess.PIPE,
                                 shell=True)
        output, error = child.communicate()
    except: raise
    else: print "\t", "Taxonomic composition of communities summarized"
Example #29
    def __init__(self, archive_path, base_mount_dir):

        self.archive_path = archive_path

        self.mount_dir = pth.join(base_mount_dir,
                                  "mnt-" + pth.basename(archive_path))
        self.mount_available = False

        if pth.isdir(self.mount_dir):
            raise OSError("Mount directory {} already exists".format(
                self.mount_dir))
        ensure_dir(self.mount_dir)
        self.mount_available = True
Example #30
    def write_object(self, obj: gitobj.GitObject):
        content = obj.bcontent()
        sha = compute_sha1(content)

        dirname, filename = parse_sha(sha)
        dirpath = self.path_in_gitdir('objects', dirname)
        ensure_dir(dirpath)
        path = os.path.join(dirpath, filename)

        with open(path, 'wb') as f:
            f.write(zlib.compress(content))

        return sha
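
compute_sha1 and parse_sha are helpers of the surrounding repository and are not shown; a plausible sketch, assuming the usual loose-object layout where an object with hash abcdef... is stored under objects/ab/cdef... (names and behaviour are assumptions):

import hashlib

def compute_sha1(content):
    # Hex digest of the serialized object bytes.
    return hashlib.sha1(content).hexdigest()

def parse_sha(sha):
    # Split a 40-char hex sha into the directory part and the file part.
    return sha[:2], sha[2:]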
Example #31
def set_stack_structure(stack_name, work_dir, make_subdirs=True):

    st_name = pth.abspath(stack_name)
    source_dirs = {kind: pth.join(st_name, kind) for kind in kinds}

    base_dir = ensure_dir(pth.join(work_dir, pth.basename(st_name)))
    target_dirs = {kind: pth.join(base_dir, kind) for kind in kinds}

    if make_subdirs and all(
            pth.exists(src_dir) for src_dir in source_dirs.values()):
        [ensure_dir(tdir) for tdir in target_dirs.values()]

    return source_dirs, target_dirs, base_dir
Example #32
def main():

    opt = process_command_line()
    print opt

    ensure_dir(opt.work_dir)

    data = gather_images_data(opt.files, opt.crop_size, opt.threads,
                              opt.use_borders)
    success = align_images(data, opt.work_dir, opt.first_image_is_absolute)

    result = "done" if success else "failed"
    debug_log("Registration job", result)
Example #33
def main():

    opt = process_command_line()
    print opt

    ensure_dir(opt.work_dir)

    success = register_images(opt.files, opt.crop_size, opt.threads,
                              opt.work_dir, opt.use_borders,
                              opt.first_image_is_absolute,
                              make_jpeg=opt.write_also_jpeg)

    result = "done" if success else "failed"
    debug_log("Registration job", result)
Example #34
def rasterize_perim(run_output, perim, year, name, raster=None):
    """!
    Convert a perimeter to a raster
    @param run_output Folder to save perimeter to
    @param perim Perimeter to convert to raster
    @param year Year to find reference raster for projection
    @param name Name of fire to use for file name
    @param raster Specific name of file name to output to
    @return Perimeter that was rasterized
    @return Path to raster output
    """
    prj = os.path.join(run_output,
                       os.path.basename(perim).replace('.shp', '_NAD1983.shp'))
    ensure_dir(os.path.dirname(prj))
    ref_NAD83 = osr.SpatialReference()
    ref_NAD83.SetWellKnownGeogCS('NAD83')
    #~ try:
    Project(perim, prj, ref_NAD83)
    del ref_NAD83
    r = find_best_raster(Extent(prj).XCenter, year)
    prj_utm = os.path.join(
        run_output,
        os.path.basename(perim).replace('.shp',
                                        os.path.basename(r)[9:14] + '.shp'))
    Delete(prj_utm)
    zone = GetSpatialReference(r)
    Project(perim, prj_utm, zone)
    del zone
    cellsize = GetCellSize(r)
    size = 0.0
    dataSource = ogr.GetDriverByName('ESRI Shapefile').Open(
        prj_utm, gdal.GA_ReadOnly)
    layer = dataSource.GetLayer()
    for feature in layer:
        geom = feature.GetGeometryRef()
        area = geom.GetArea()
        size += area / (cellsize * cellsize)
        del geom
        del feature
    del layer
    del dataSource
    if size < 1:
        # this is less than one cell in area so don't use perimeter
        perim = None
        raster = None
    else:
        if not raster:
            raster = os.path.join(run_output, name + '.tif')
        Rasterize(prj_utm, raster, r)
    return perim, raster
Example #35
def main():

    opt = process_command_line()
    print opt

    ensure_dir(opt.work_dir)

    data = gather_images_data(opt.files, opt.crop_size, opt.threads)
    delta_xp = match_ppl_xpl_opaques(data, opt.crop_size, opt.work_dir)
    debug_log("PPL->XPL mismatch is", delta_xp)
    success = align_images(data, delta_xp, opt.work_dir)

    result = "done" if success else "failed"
    debug_log("Registration job", result)
Example #36
    def __init__(self, name, no_download=False):
        """!
        Constructor
        @param self Pointer to this
        @param name Name for weather being loaded
        @param no_download Whether or not to not download files
        """
        ## Name for weather being loaded
        self.name = name
        common.ensure_dir(self.DIR_DATA)
        ## Folder to save downloaded weather to
        self.DIR_DATA = os.path.join(self.DIR_DATA, self.name)
        common.ensure_dir(self.DIR_DATA)
        ## Whether or not to download files
        self.no_download = no_download
Example #37
    def get_assets(self):
        if not self.metadata:
            self.get_meta()

        print('Verifying assets..')

        assets_dir = os.path.join(self.client_root, 'assets')
        assets_versions = os.path.join(assets_dir, 'indexes')
        ensure_dir(assets_versions)

        asset_index = self.metadata['assetIndex']
        assets_file = os.path.join(assets_versions,
                                   '%s.json' % (asset_index['id']))

        if not os.path.exists(assets_file):
            r = requests.get(asset_index['url'], stream=True)

            try:
                save_to_file_sha1(assets_file, r, asset_index['sha1'])
            except Exception:
                print('Failed to download assets!')
                raise

        with open(assets_file) as json_data:
            assets = json.load(json_data)

        for key, data in assets['objects'].items():
            first = data['hash'][0:2]
            asset_url = 'http://resources.download.minecraft.net/%s/%s' % (
                first, data['hash'])
            asset_dir = os.path.join(assets_dir, 'objects', first)

            ensure_dir(asset_dir)

            asset_file = os.path.join(asset_dir, data['hash'])

            if os.path.exists(asset_file):
                continue

            r = requests.get(asset_url, stream=True)

            try:
                save_to_file(asset_file, r)
            except Exception as e:
                print('Failed to download asset %s!' % (key))
                raise e

        print('All assets verified.')
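
save_to_file and save_to_file_sha1 are project helpers that are not shown here; a plausible sketch of the checksum-verifying variant, assuming the response is a streaming requests.Response (the signature is an assumption):

import hashlib

def save_to_file_sha1(path, response, expected_sha1, chunk_size=65536):
    # Stream the HTTP body to disk while hashing it, then verify the digest.
    sha1 = hashlib.sha1()
    with open(path, 'wb') as f:
        for chunk in response.iter_content(chunk_size):
            f.write(chunk)
            sha1.update(chunk)
    if sha1.hexdigest() != expected_sha1:
        raise ValueError('SHA-1 mismatch for %s' % path)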
Example #38
def pick_otus(dataset, run_id):
    """Pick OTUs using Uclust with Qiime."""
    # identify inputs and outputs
    set_id = dataset['set_id']
    print " ", set_id
    run_root = root_dir+set_id+"/"+run_id+"/"
    otus_dir = run_root+dirs['otus']
    ensure_dir(otus_dir)
    master_file = run_root+dirs['merged']+run_id+".fas"
    # run the command
    comps = ["macqiime", "pick_otus.py", "-i", master_file, "-o", otus_dir]
    cline = " ".join(comps)
    try:
        child = subprocess.Popen(str(cline), stdout=subprocess.PIPE,
                                 shell=True)
        output, error = child.communicate()
    except: raise
    else: print "\t", "OTUs picked"
Example #39
def download_image(img, window=None) :
	if os.name == "nt" :
		path = LM_CACHE_PATH + os.sep + 'media' + os.sep + os.sep.join(img.split('/'))
	else :
		path = LM_CACHE_PATH + os.sep + 'media' + os.sep + img
	img_path = 'http://www.pirates-caraibes.com/' + img
	try :
		ensure_dir(path)
		try :
			src = urllib2.urlopen(url_fix(img_path))
		except urllib2.HTTPError :
			return False
		else :
			dst = open(path, 'wb')
			shutil.copyfileobj(src, dst)
		return True
	except :
		return False
Example #40
    def __init__(self, conf, uuid, namespace=None, service=None,
                 pids_path=None, default_cmd_callback=None,
                 cmd_addl_env=None, pid_file=None, run_as_root=False):

        self.conf = conf
        self.uuid = uuid
        self.namespace = namespace
        self.default_cmd_callback = default_cmd_callback
        self.cmd_addl_env = cmd_addl_env
        self.pids_path = pids_path or self.conf.external_pids
        self.pid_file = pid_file
        self.run_as_root = run_as_root

        if service:
            self.service_pid_fname = 'pid.' + service
            self.service = service
        else:
            self.service_pid_fname = 'pid'
            self.service = 'default-service'

        common_utils.ensure_dir(os.path.dirname(self.get_pid_file_name()))
Example #41
def download_images(img_list) :
	for img in img_list :
		if os.name == "nt" :
			path = LM_CACHE_PATH + os.sep + 'media' + os.sep + os.sep.join(img.split('/'))
		else :
			path = LM_CACHE_PATH + os.sep + 'media' + os.sep + img
		img_path = 'http://www.pirates-caraibes.com/' + img
		print "Récupération de l'image '%s' vers %s" % (img_path, path)
		try :
			ensure_dir(path)
			try :
				src = urllib2.urlopen(img_path)
			except urllib2.HTTPError :
				pass
			else :
				dst = open(path, 'wb')
				shutil.copyfileobj(src, dst)
			print "Succès"
		except :
			(exctype, value, traceback) = sys.exc_info()
			print "Erreur - %s : %s" % (exctype, value)
Example #42
def iter_align(coord_array, ref_rec, query_rec, aln_dir, segs_file):
    """Iterate through array of coordinates to make pairwise alignments."""
    # set up the root subdirectories
    seqs = aln_dir+"input_seqs/"
    alns = aln_dir+"output_alns/"
    ensure_dir([seqs, alns])
    aln_id = 0
    aln_len = 0
    # cycle through segments
    for segment_pair in coord_array:
        xa, xb, xc, xd = segment_pair
        # extract the corresponding sequence slices
        ref_seq = ref_rec[abs(xa):abs(xb)]
        query_seq = query_rec[abs(xc):abs(xd)]
        # reverse-complement sequences with negative sign
        if xa < 0 :
            ref_seq = ref_seq.reverse_complement()
        if xc < 0 :
            query_seq = query_seq.reverse_complement()
        # write sequences to file
        mscl_in = seqs+str(xa)+"_"+str(xb)+"_"+str(xc)+"_"+str(xd)+".fas"
        write_fasta(mscl_in, [ref_seq, query_seq])
        # skip segments that are too small to align
        if abs(abs(xa)-abs(xb)) < 10:
            idp = 0
        else:
            # set up outfiles
            mscl_out = alns+str(xa)+"_"+str(xb)+"_"+str(xc)+"_"+str(xd)+".aln"
            logfile = aln_dir+"muscle_log.txt"
            # perform alignment
            align_muscle(mscl_in, mscl_out, logfile)
            idntot = parse_clustal_idstars(mscl_out)
            idp = int((float(idntot)/len(query_seq))*100)
            aln_id += idntot
            aln_len += len(query_seq)
        # write details out to segments file
        line = "\t".join([str(xa), str(xb), str(xc), str(xd), str(idp)+"\n"])
        open(segs_file, 'a').write(line)
    overall_id = int((float(aln_id)/aln_len)*100)
    return overall_id
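
parse_clustal_idstars is assumed to count the '*' symbols in the consensus rows of the CLUSTAL-format alignment written by MUSCLE, i.e. the number of fully conserved columns; a rough sketch under that assumption (not the project's actual parser):

def parse_clustal_idstars(aln_file):
    # Consensus rows carry no sequence name, only conservation symbols,
    # so they start with whitespace; sum the '*' marks across all blocks.
    stars = 0
    with open(aln_file) as handle:
        for line in handle:
            if line.startswith((' ', '\t')) and '*' in line:
                stars += line.count('*')
    return stars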
Example #43
def save_parameters(dataset, max_pairs, run_id, timestamp):
    """Save a copy of the dataset-specific parameters to file."""
    set_id = dataset['set_id']
    print " ", set_id
    run_root = root_dir+set_id+"/"+run_id+"/"
    report_root = run_root+dirs['reports']
    param_file = report_root+run_id+"_parameters.txt"
    ensure_dir(report_root)
    # primer data
    primers = dataset['primers']
    primers_list = []
    for primer_ID in primers:
        primers_list.append("\t".join([primer_ID, primers[primer_ID]]))
    primers_str = "\n".join(primers_list)
    # samples + barcode data
    samples = dataset['samples']
    samples_list = []
    for sample_ID in samples:
        samples_list.append("\t".join([sample_ID,
                                       samples[sample_ID][0],
                                       samples[sample_ID][1]]))
    samples_str = "\n".join(samples_list)
    # text block
    txt = ["# Run ID", run_id,
           "# Date generated", dataset['date'],
           "# Date processing initiated", timestamp,
           "# Special processing parameters",
           "# read pair length min threshold (ensures overlap)",
           str(rp_min_len),
           "# max number of read pairs to process",
           str(max_pairs),
           "# Dataset specifications",
           "# Illumina FastQ master files",
           dataset['source_fwd'], dataset['source_rev'],
           "# Amplification primers", primers_str,
           "# Sample ID\tLeft tag\tRight tag", samples_str]
    # write to file
    open(param_file, 'w').write("\n".join(txt))
    print "\t", "Run parameters saved to file"
Example #44
def map_ref_segs(run_ref, run_id, r_root_dir, run_dirs, min_size,
                 fct_flags, fct_colors, idpt): 
    """Generate map of reference contig with segment details.

    This provides a comparison of the original reference and the
    re-annotated version.

    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ori_file = run_ref.file
    ref_maps_root = run_root+run_dirs['ref_map_dir']
    ensure_dir([ref_maps_root])
    gbk_file = run_root+run_dirs['ref_gbk_dir']+ref_n+"_re-annot.gbk"
    map_file = ref_maps_root+ref_n+"_ref.pdf"
    # start mapping
    try:
        # make mock segment, full-length with 100% id
        record = load_genbank(gbk_file)
        length = len(record.seq)
        segdata = [[1, length, 1, length, 100]]
        # deactivate offsetting
        g_offset = (0,0)
        q_invert = False
        # generate graphical map
        pairwise_draw(ref_n+"_ra", ref_n+"_ori", gbk_file, ori_file,
                     segdata, map_file, q_invert, g_offset, 'dual', 'dual',
                     'm', 'fct', 'product', min_size, fct_flags,
                     fct_colors, idpt)
    except IOError:
        msg = "\nERROR: could not load segments data"
        run_ref.log(msg)
        print msg
    except StopIteration:
        msg = "\nERROR: could not make map"
        run_ref.log(msg)
        print msg
Example #45
def basic_batch_blast(genomes, run_ref, blast_mode, r_root_dir, run_dirs,
                      fixed_dirs, blast_prefs, run_id, timestamp):
    """Send batch jobs to Blast. Muxes to multiple reference DBs."""
    # load inputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    in_root = run_root+run_dirs['ref_seg_dir']+ref_n+"/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Blast segs to genomes @", timestamp, "\n\n"])
    run_ref.log(logstring)
    # do blast
    for seg in run_ref.segs:
        input_file = in_root+ref_n+"_"+seg['name']+".fas"
        # translate if required
        if blast_mode == 'tn':
            record = load_fasta(input_file)
            record.seq = record.seq.translate()
            input_file = in_root+ref_n+"_"+seg['name']+"_aa.fas" # substitute
            write_fasta(input_file, record)
        out_dir = run_root+run_dirs['blast_out_dir']+ref_n+"/"+seg['name']+"/"
        ensure_dir([out_dir])
        print "\t", seg['name'],
        for genome in genomes:
            g_name = genome['name']
            db_path = fixed_dirs['blast_db_dir']+g_name
            outfile = out_dir+g_name+"_out.txt"
            print ".",
            if blast_mode == 'n':
                local_blastn_2file(input_file, db_path, outfile, blast_prefs)
            elif blast_mode == 'tx':
                local_tblastx_2file(input_file, db_path, outfile, blast_prefs)
            elif blast_mode == 'tn':
                local_tblastn_2file(input_file, db_path, outfile, blast_prefs)
        print ""
    run_ref.log("All OK")
    return "OK"
Example #46
def annot_genome_contigs(run_ref, prot_db_name, fixed_dirs, r_root_dir,
                         run_id, run_dirs, genomes, project_id, timestamp,
                         blast_prefs): 
    """Annotate genome contigs (predict ORFs and assign function)."""
    # locate the COG database
    prot_db = fixed_dirs['ref_dbs_dir']+prot_db_name
    # TODO: add other DB / pfams?
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    fas_ctgs_root = run_root+run_dirs['match_out_dir']+ref_n+"/"
    ctg_cds_root = fixed_dirs['ctg_cds_dir']
    ctg_prot_root = fixed_dirs['ctg_prot_dir']
    ctg_blast_root = fixed_dirs['ctg_blast_dir']
    g_gbk_ctgs_root = fixed_dirs['gbk_contigs_dir']
    r_gbk_ctgs_root = run_root+run_dirs['run_gbk_ctgs_dir']+ref_n+"/"
    annot_trn_root = fixed_dirs['annot_trn_dir']
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Annotate genome contigs @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        fas_ctgs_dir = fas_ctgs_root+g_name+"/"
        g_file = fixed_dirs['ori_g_dir']+genome['file']
        print '\t', g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set output files
        training_file = annot_trn_root+g_name+"_annot.trn"
        # set output dirs
        ctg_cds_dir = ctg_cds_root+g_name+"/"
        ctg_prot_dir = ctg_prot_root+g_name+"/"
        ctg_blast_dir = ctg_blast_root+g_name+"/"
        g_gbk_ctgs_dir = g_gbk_ctgs_root+g_name+"/"
        r_gbk_ctgs_dir = r_gbk_ctgs_root+g_name+"/"
        ensure_dir([ctg_cds_dir, ctg_prot_dir, ctg_blast_dir,
                    g_gbk_ctgs_dir, r_gbk_ctgs_dir])
        # list fasta files in matches directory
        dir_contents = listdir(fas_ctgs_dir)
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.fas$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)
                print ctg_num,
                logstring = "".join(["\t", ctg_num])
                run_ref.log(logstring)
                # set inputs and outputs
                ctg_fas = fas_ctgs_dir+item
                g_ctg_gbk = g_gbk_ctgs_dir+g_name+"_"+ctg_num+".gbk"
                r_ctg_gbk = r_gbk_ctgs_dir+g_name+"_"+ctg_num+".gbk"
                annot_gbk = ctg_cds_dir+g_name+"_"+ctg_num+"_cds.gbk"
                annot_aa = ctg_prot_dir+g_name+"_"+ctg_num+"_aa.fas"
                blast_out = ctg_blast_dir+g_name+"_"+ctg_num+".xml"
                if path.exists(blast_out) and os.stat(blast_out)[6]==0:
                    os.remove(blast_out)
                if not path.exists(r_ctg_gbk):
                    if not path.exists(g_ctg_gbk):
                        l_tag_base = g_name+"_"+ctg_num
                        record = annot_ctg(g_file, ctg_fas, annot_gbk,
                                           annot_aa, training_file, prot_db,
                                           blast_out, l_tag_base, blast_prefs)
                        record.description = g_name+"_"+ctg_num
                        record.name = g_name+"_"+ctg_num
                        record.dbxrefs = ["Project: "+project_id+"/"+ref_n
                                          +"-like backbones"]
                        record.seq.alphabet = generic_dna
                        write_genbank(g_ctg_gbk, record)
                    copyfile(g_ctg_gbk, r_ctg_gbk)
        print ""
Example #47
def batch_contig_annot(dataset):
    """Extract and annotate contigs."""
    # identify dataset contig file
    contigs_file = dirs['assembly_dir']+dataset['f_nick']+'/'+'contigs.fa'
    # locate the COG database
    cog_db = dirs['blast_db_dir']+'Cog_LE/Cog'
    # make the training file
    training_file = dirs['annot_dir']+dataset['f_nick']+'/'+'contigs.trn'
    #train_prodigal(contigs_file, training_file)
    # set output dirs
    fas_out_dir = dirs['annot_dir']+dataset['f_nick']+'/fasta/'
    gbk_out_dir = dirs['annot_dir']+dataset['f_nick']+'/predict/'
    aa_out_dir = dirs['annot_dir']+dataset['f_nick']+'/aa/'
    blast_out_dir = dirs['annot_dir']+dataset['f_nick']+'/rpsblast/'
    solid_out_dir = dirs['annot_dir']+dataset['f_nick']+'/genbank/'
    maps_out_dir = dirs['annot_dir']+dataset['f_nick']+'/maps/'
    ensure_dir(fas_out_dir)
    ensure_dir(gbk_out_dir)
    ensure_dir(aa_out_dir)
    ensure_dir(blast_out_dir)
    ensure_dir(solid_out_dir)
    # set phage hit collector
    contig_hits = {}
    sp_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\
                                   +dataset['f_nick']+'_kw_hits.html'
    all_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\
                                    +dataset['f_nick']+'_all_hits.html'
    sp_hit_list_handle = open(sp_hit_list, 'w')
    all_hit_list_handle = open(all_hit_list, 'w')
    sp_hit_list_handle.write("<ul>")
    all_hit_list_handle.write("<ul>")
    # load all contigs
    contigs_list = load_multifasta(contigs_file)
    # cycle through contigs
    ctg_count = 0
    gene_count = 0
    for contig in contigs_list:
        ctg_count +=1
        # use regex to acquire relevant record ID info
        pattern = re.compile(r'NODE_(\d*)_length_(\d*)_cov_(\d*)')
        match = pattern.match(contig.id)
        nick = match.group(1)+'_'+match.group(2)+'_'+match.group(3)
        contig.id = nick
        fasta_out = fas_out_dir+nick+'.fas'
        # write record to file
        write_fasta(fasta_out, contig)
        # create contig entry in dict
        contig_hits[nick] = []
        # run the annotation
        annot_gbk = gbk_out_dir+nick+'.gbk'
        annot_aa = aa_out_dir+nick+'.fas'
        #run_prodigal(fasta_out, annot_gbk, annot_aa, training_file)
        # blast the amino acids against COG
        print '\tblasting', dataset['f_nick'], nick
        blast_out = blast_out_dir+nick+'.xml'
        if path.isfile(blast_out):
            print "\t\talready blasted"
        else:
            local_rpsblast_2file(annot_aa, cog_db, blast_out, blast_prefs)
        # collect best hits
        rec_cogs = collect_cogs(blast_out)
        map_file = maps_out_dir+nick+'.pdf'
        # consolidate annotated genbank file
        record = load_fasta(fasta_out)
        aa_defs = load_multifasta(annot_aa)
        features = []
        counter = 1
        ctg_flag_1 = 0
        ctg_flag_2 = 0
        for protein in aa_defs:
            gene_count +=1
            # get feature details from description line
            # necessary because the prodigal output is not parser-friendly
            pattern = re.compile(r'\d+_\d+_\d+_\d+_\d+\s+\S+\s+(\d+)\s+\S+\s+(\d+)\s+\S+\s+(\S*\d)')
            match = pattern.match(protein.description)
            start_pos = int(match.group(1))
            end_pos = int(match.group(2))
            strand_pos = int(match.group(3))
            feat_loc = FeatureLocation(start_pos, end_pos)
            annotation = rec_cogs['Query_'+str(counter)]
            if ctg_flag_1 == 0:
                all_hit_list_handle.write("</ul><br><a href='"
                                          +"../../../../"
                                          +map_file
                                          +"'>Contig "
                                          +nick+"</a><ul>")
                ctg_flag_1 = 1
            all_hit_list_handle.write("<li>"+str(counter)
                                            +'. '+annotation+"</li>")
            # detect phage content in annotation
            phi_pattern = re.compile(r".+(COG\d+).+"
                                      "(phage|capsid|muramidase|tail|"
                                      "replication|helicase|polymerase|"
                                      "integrase|recombinase"
                                      "suppressor|hydrolase|transposase).+",
                                     re.IGNORECASE)
            phi_match = phi_pattern.match(annotation)
            if phi_match:
                hit_flag = 'on'
                hit_dict = {'CDS': counter,
                            'annot': annotation,
                            'COGs': phi_match.group}
                contig_hits[nick].append(hit_dict)
                # write out to summary file
                if ctg_flag_2 == 0:
                    sp_hit_list_handle.write("</ul><br><a href='"
                                             +"../../../../"
                                             +map_file
                                             +"'>Contig "
                                             +nick+"</a><ul>")
                    ctg_flag_2 = 1
                sp_hit_list_handle.write("<li>"+str(counter)
                                          +'. '+annotation+"</li>")
            else:
                hit_flag = 'off'
            # consolidate feature annotations
            quals = {'note': protein.description,
                     'fct': annotation,
                     'flag': hit_flag}
            feature = SeqFeature(location=feat_loc,
                                 strand=strand_pos,
                                 id=protein.id,
                                 type='CDS',
                                 qualifiers=quals)
            features.append(feature)
            counter +=1
        record.features = features
        record.description = dataset['f_nick']+'_contig_'+nick
        record.name = nick
        record.dbxrefs = ['Project:np1']
        record.seq.alphabet = generic_dna
        gbk_out = solid_out_dir+nick+'.gbk'
        write_genbank(gbk_out, record)
        # generate graphical map
        ContigDraw(nick, gbk_out, map_file)
    sp_hit_list_handle.write("</ul>")
    all_hit_list_handle.write("</ul>")
    sp_hit_list_handle.close()
    all_hit_list_handle.close()
    print "\t", gene_count, "predicted genes in", ctg_count, "contigs"
Exemplo n.º 48
0
def demux_illumina(dataset, max_pairs, run_id):
    """Demultiplex Illumina dataset.

    From separate forward/reverse read sets, combine read pairs and output
    to separate files for each sample based on barcode tags. As part of the
    process, reject read pairs that have mismatching tags or primers and trim
    the rest, removing primer+tag and low-quality sequences.
    """
    # identify inputs and outputs
    set_id = dataset['set_id']
    print " ", set_id
    run_root = root_dir+set_id+"/"+run_id+"/"
    ori_root = root_dir+set_id+"/"+dirs['master']
    fwd_file = ori_root+dataset['source_fwd']
    rev_file = ori_root+dataset['source_rev']
    demux_root = run_root+dirs['demux']
    report_root = run_root+dirs['reports']
    qc_dir = "qc_details/"
    qc_main_file = report_root+"quality_control.html"
    cntsplt = report_root+"sample_counts"
    ensure_dir(ori_root)
    ensure_dir(demux_root)
    ensure_dir(report_root)
    ensure_dir(report_root+qc_dir)
    # set up files for reporting
    html_comps = ["<p><b>Quality control for run "+run_id+"</b></p>",
                  "<p><img src='sample_counts.png' alt='sample_counts'/></p>",
                  "<p><table border='1'><tr>",
                  "<th>Sample</th>",
                  "<th>Accepted</th>",
                  "<th>Rejected</th>",
                  "<th>Total</th>",
                  "<th>% OK</th></tr>"]
    html_block = "".join(html_comps)
    open(qc_main_file, 'w').write(html_block)
    # prepare primers and barcodes info
    primers = dataset['primers']
    samples = dataset['samples']
    tag_pairs = samples.values()
    assert len(primers) >= 2
    assert len(samples) >= 1
    assert len(tag_pairs) >= 1
    # prepare container and files for output batching and reporting
    hits_dict = {}
    for sample_id in samples:
        hits_dict[sample_id] = {'buffer': [], 'countY': 0, 'countN': 0}
    # add containers for rejected read pairs
    hits_dict['bad_tags'] = {'buffer': [], 'countY': 0, 'countN': 0}
    hits_dict['bad_qual'] = {'buffer': [], 'countY': 0, 'countN': 0}
    # initialize files
    for sample_id in samples:
        dmx_out = demux_root+sample_id+"_readpairs.txt"
        open(dmx_out, 'w').write('')
    open(demux_root+"bad_tags"+"_readpairs.txt", 'w').write('')
    open(demux_root+"bad_qual"+"_readpairs.txt", 'w').write('')
    # iterate through reads
    pair_count = 0
    for titles, seqs, quals in FastqJointIterator(open(fwd_file),
                                                  open(rev_file)):
        F_title = titles[0][0]
        R_title = titles[0][1]
        F_seq = seqs[0][0].upper()
        R_seq = seqs[0][1].upper()
        F_qual = quals[0][0]
        R_qual = quals[0][1]
        flip = False
        sample_id = False
        # iterate through barcode tags
        # TODO: implement more robust solution to ambiguous base problem
        for tag_pair in tag_pairs:
            L_tag1 = (tag_pair[0]+primers['fwdRA']).upper()
            L_tag2 = (tag_pair[0]+primers['fwdRG']).upper()
            R_tag = (tag_pair[1]+primers['rev']).upper()
            tag_hit = False
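            # NB: each "while True" block below runs at most once; the breaks
            # just bail out of the nested tag/primer checks early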
            while True:
                # start by checking for R_tag since there's only one
                if R_seq.find(R_tag, 0, len(R_tag)) != 0:
                    if F_seq.find(R_tag, 0, len(R_tag)) != 0:
                        # no R_tag match -> reject
                        break
                    else: # is there an L_tag in R_seq?
                        while True:
                            if R_seq.find(L_tag1, 0, len(L_tag1)) != 0:
                                if R_seq.find(L_tag2, 0, len(L_tag2)) != 0:
                                    # no L_tag match -> reject
                                    break
                                else:
                                    R_clip = len(L_tag2)
                            else:
                                R_clip = len(L_tag1)
                            tag_hit = True
                            flip = True
                            F_clip = len(R_tag)
                            break
                else: # is there an L_tag in F_seq?
                    while True:
                        if F_seq.find(L_tag1, 0, len(L_tag1)) != 0:
                            if F_seq.find(L_tag2, 0, len(L_tag2)) != 0:
                                # no L_tag match -> reject
                                break
                            else:
                                F_clip = len(L_tag2)
                        else:
                            F_clip = len(L_tag1)
                        tag_hit = True
                        R_clip = len(R_tag)
                        break
                break
            if not tag_hit:     # continue iterating
                sample_id = False
            else:               # got it, stop iterating
                sample_id = key_by_value(samples, tag_pair)[0]
                break
        # in case no matches were found with any of the tags
        if not sample_id:
            sample_id = 'bad_tags'
        # for matched read pairs, clip off tag+primer and strip low qual runs
        else:
            F_trim = F_qual[F_clip:].find('##')
            if F_trim > -1:
                F_seq = F_seq[F_clip:F_clip+F_trim]
                F_qual = F_qual[F_clip:F_clip+F_trim]
            else:
                F_seq = F_seq[F_clip:]
                F_qual = F_qual[F_clip:]
            R_trim = R_qual[R_clip:].find('##')
            if R_trim > -1:
                R_seq = R_seq[R_clip:R_clip+R_trim]
                R_qual = R_qual[R_clip:R_clip+R_trim]
            else:
                R_seq = R_seq[R_clip:]
                R_qual = R_qual[R_clip:]
            if len(F_seq)+len(R_seq) < rp_min_len:
                # increment sample hit 'No' counter
                hits_dict[sample_id]['countN'] +=1
                sample_id = 'bad_qual'
        # bundle read data in ordered string
        readF = str("@%s\n%s\n+\n%s\n" % (F_title, F_seq, F_qual))
        readR = str("@%s\n%s\n+\n%s\n" % (R_title, R_seq, R_qual))
        if flip:
            read_pair = readR+readF
        else:
            read_pair = readF+readR
        # output to the appropriate buffer
        hits_dict[sample_id]['buffer'].append(read_pair)
        # increment sample 'Yes' hit counter
        hits_dict[sample_id]['countY'] +=1
        # when buffer capacity is reached, output to file and reset buffer
        if hits_dict[sample_id]['countY'] % 100000 == 0:
            dmx_out = demux_root+sample_id+"_readpairs.txt"
            dump_buffer(dmx_out, hits_dict[sample_id]['buffer'])
            hits_dict[sample_id]['buffer'] = []
        # increment counter
        pair_count +=1
        # report on the progress
        if pair_count % 1000000 == 0:
            print "\t", pair_count, "reads processed", datetime.now()
        if pair_count == max_pairs: # for testing purposes
            break
    print "\t", "Total", pair_count, "read pairs processed"
    print "\t", "Counts per sample:"
    # prepare graphing data containers
    pcntY = []
    pcntN = []
    sample_ids = []
    # write out whatever remains in each of the samples buffers
    for sample_id in samples:
        dmx_out = demux_root+sample_id+"_readpairs.txt"
        dump_buffer(dmx_out, hits_dict[sample_id]['buffer'])
        hits_dict[sample_id]['buffer'] = []
        acc = hits_dict[sample_id]['countY']
        rej = hits_dict[sample_id]['countN']
        print "\t\t", sample_id, acc, "pairs", datetime.now()
        pcntY.append(acc)
        pcntN.append(rej)
        sample_ids.append(sample_id)
        # generate FastQC report (use --noextract to not open zipped reports)
        run_FastQC(dmx_out, report_root+qc_dir, '--quiet', ' ')
        #print "see QC report"
        # add line in QC file
        link = qc_dir+sample_id+"_readpairs_fastqc/fastqc_report.html"
        html_comps = ["<tr>",
                      "<th><a href='"+link+"'>"+sample_id+"</a></th>",
                      "<td>", str(acc), "</td>",
                      "<td>", str(rej), "</td>",
                      "<td>", str(acc+rej), "</td>",
                      "<td>", str(int((float(acc)/(acc+rej))*100)),
                      "</td></tr>"]
        html_block = "".join(html_comps)
        open(qc_main_file, 'a').write(html_block)
    # write out whatever remains in the bad_qual buffer
    dmx_out = demux_root+"bad_qual_readpairs.txt"
    dump_buffer(dmx_out, hits_dict['bad_qual']['buffer'])
    hits_dict['bad_qual']['buffer'] = []
    print "\t\t", "rejected (low quality)", hits_dict['bad_qual']['countY'],\
    datetime.now()
    # generate FastQC report (use --noextract to not open zipped reports)
    run_FastQC(dmx_out, report_root+qc_dir, '--quiet', ' ')
    #print "see QC report"
    # add line in QC file
    link = qc_dir+"bad_qual_readpairs_fastqc/fastqc_report.html"
    html_comps = ["<tr>",
                  "<th><a href='"+link+"'>"+"bad_qual"+"</a></th>",
                  "<td>", '0', "</td>",
                  "<td>", str(hits_dict['bad_qual']['countY']), "</td>",
                  "<td>", str(hits_dict['bad_qual']['countY']), "</td>",
                  "<td>", '0',"</td></tr>"]
    html_block = "".join(html_comps)
    open(qc_main_file, 'a').write(html_block)
    # write out whatever remains in the bad_tags buffer
    dmx_out = demux_root+"bad_tags_readpairs.txt"
    dump_buffer(dmx_out, hits_dict['bad_tags']['buffer'])
    hits_dict['bad_tags']['buffer'] = []
    print "\t\t", "rejected (bad tags)", hits_dict['bad_tags']['countY'],\
    datetime.now()
    # generate FastQC report (use --noextract to not open zipped reports)
    run_FastQC(dmx_out, report_root+qc_dir, '--quiet', ' ')
    #print "see QC report"
    # add line in QC file
    link = qc_dir+"bad_tags_readpairs_fastqc/fastqc_report.html"
    html_comps = ["<tr>",
                  "<th><a href='"+link+"'>"+"bad_tags"+"</a></th>",
                  "<td>", '0', "</td>",
                  "<td>", str(hits_dict['bad_tags']['countY']), "</td>",
                  "<td>", str(hits_dict['bad_tags']['countY']), "</td>",
                  "<td>", '0',"</td></tr>"]
    html_block = "".join(html_comps)
    open(qc_main_file, 'a').write(html_block)
    # close table and add notes
    line_bq = "rejected after demultiplexing due to low sequence quality \
    (top stacks in bar chart)"
    line_bt = "could not be assigned to a sample due to mismatches in tag \
    and/or primer"
    html_comps = ["</table></p>",
                  "<p><b>", "bad_qual", ": </b>", line_bq,
                  "<br><b>", "bad_tags", ": </b>", line_bt, "</p>",]
    html_block = "".join(html_comps)
    open(qc_main_file, 'a').write(html_block)
    # add bad tags category for counts graphing (switch is on purpose)
    pcntY.append(hits_dict['bad_tags']['countN'])
    pcntN.append(hits_dict['bad_tags']['countY'])
    sample_ids.append('bad_tags')
    # check that the totals add up
    assert pair_count == sum(pcntY)+sum(pcntN)
    # plot the read counts per sample
    series = pcntY, pcntN
    legend = 'Accepted', 'Rejected'
    colors = 'g', 'r'
    titles = 'Number of read pairs', 'Read pairs per sample'
    two_storey_bar_chart(series, sample_ids, legend, colors, cntsplt, titles)
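
A minimal sketch of the per-read clipping and trimming that demux_illumina does inline once a tag+primer has matched: drop the leading tag+primer, then truncate at the first run of '##' low-quality characters. The helper name clip_and_trim is illustrative, not part of the pipeline:

def clip_and_trim(seq, qual, tag_primer, qual_floor='##'):
    """Clip a leading tag+primer and truncate at the first low-quality run."""
    if not seq.upper().startswith(tag_primer.upper()):
        return None                      # tag+primer not at the read start
    clip = len(tag_primer)
    cut = qual[clip:].find(qual_floor)   # first low-quality run, if any
    if cut > -1:
        return seq[clip:clip + cut], qual[clip:clip + cut]
    return seq[clip:], qual[clip:]

# e.g. clip_and_trim("ACGTNNNNTTTT", "IIIIIIII##II", "ACGT") -> ("NNNN", "IIII")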
Exemplo n.º 49
0
def merge_pair_libs(dataset, run_id):
    """Merge read pairs from Illumina sample libs and output FastA."""
    # identify inputs and outputs
    set_id = dataset['set_id']
    print " ", set_id
    run_root = root_dir+set_id+"/"+run_id+"/"
    dmx_root = run_root+dirs['demux']
    merged_root = run_root+dirs['merged']
    report_root = run_root+dirs['reports']
    master_file = run_root+dirs['merged']+run_id+".fas"
    ensure_dir(merged_root)
    ensure_dir(report_root)
    merger_file = report_root+"merged_pairs.html"
    cntsplt = report_root+"merge_counts"
    samples = dataset['samples']
    # set up files for reporting
    html_comps = ["<p><b>Read pairs merged for run ", run_id, "</b></p>",
                  "<p><img src='merge_counts.png' alt='merge_counts'/></p>",
                  "<p><table border='1'><tr>",
                  "<th>Sample</th>",
                  "<th>Accepted</th>",
                  "<th>Rejected</th>",
                  "<th>Total</th>",
                  "<th>% OK</th></tr>"]
    html_block = "".join(html_comps)
    open(merger_file, 'w').write(html_block)
    # initialize master file
    open(master_file, 'w').write('')
    # merge per sample (demuxed)
    merge_countA = []
    merge_countR = []
    sample_ids = samples.keys()
    for sample_id in sample_ids:
        print "\t", sample_id,
        lib_file = dmx_root+sample_id+"_readpairs.txt"
        merge_out = merged_root+sample_id+"_merged.fas"
        open(merge_out, 'w').write('')
        # prepare container and files for output batching and reporting
        buffer = []
        countY = 0
        countF = 0
        countN = 0
        # iterate through the read pairs
        count = 0
        for titles, seqs, quals in FastqGGIterator(open(lib_file)):
            count +=1
            seq1 = seqs[0]
            seq2 = seqs[1]
            qual1 = quals[0]
            qual2 = quals[1]
            # merge reads   TODO: better safeguard against merge failure
            try: merged = merge_overlaps(seq1, qual1, seq2, qual2)
            except Exception: countF +=1
            else:
                if merged.find('N') > -1:
                    countN +=1  # if there are still N quality must be too low
                else:
                    countY +=1
                    # compose string for output
                    mcomps = [">",sample_id,"_",str(count),"\n",merged,"\n"]
                    mstring = "".join(mcomps)
                    # output to buffer
                    buffer.append(mstring)
            # when buffer capacity is reached, output to file and reset buffer
            if countY % 10000 == 0:
                dump_buffer(merge_out, buffer)
                dump_buffer(master_file, buffer)
                buffer = []
        # write out whatever remains in the buffer
        dump_buffer(merge_out, buffer)
        dump_buffer(master_file, buffer)
        # sum up
        assert countY+countF+countN == count
        print count, "pairs", datetime.now()
        print "\t\t", str(countY), "merged and accepted"
        print "\t\t", str(countN), "merged but rejected due to residual Ns"
        print "\t\t", str(countF), "failed to merge"
        # add line in QC file
        html_comps = ["<tr>",
                      "<th>", sample_id, "</b></th>",
                      "<td>", str(countY), "</td>",
                      "<td>", str(countN + countF), "</td>",
                      "<td>", str(count), "</td>",
                      "<td>", str(int((float(countY)/count)*100)),
                      "</td></tr>"]
        html_block = "".join(html_comps)
        open(merger_file, 'a').write(html_block)
        # pass values
        merge_countA.append(countY)
        merge_countR.append(countN+countF)
    # close table and add notes
    line_N = "either failed to merge or still contained Ns after merging"
    html_comps = ["</table></p>",
                  "<p><b>", "Rejected", ":</b> ", line_N, "</p>"]
    html_block = "".join(html_comps)
    open(merger_file, 'a').write(html_block)
    # plot the read counts per sample
    series = merge_countA, merge_countR
    lgnd = 'Accepted', 'Rejected'
    colors = 'g', 'r'
    titles = 'Number of read pairs', 'Read pairs merged per sample'
    two_storey_bar_chart(series, sample_ids, lgnd, colors, cntsplt, titles)
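
Both demux_illumina and merge_pair_libs rely on the same buffered-output pattern: results accumulate in a list, get flushed to disk every N records, and the remainder is flushed once the loop ends. A hedged, generic sketch of that pattern (dump_buffer here is a stand-in for the pipeline helper of the same name):

def dump_buffer(path, lines):
    with open(path, 'a') as handle:
        handle.writelines(lines)

def write_batched(path, records, batch_size=10000):
    open(path, 'w').close()          # initialize/truncate the output file
    buf = []
    for count, rec in enumerate(records, start=1):
        buf.append(rec)
        if count % batch_size == 0:  # flush a full batch and reset the buffer
            dump_buffer(path, buf)
            buf = []
    dump_buffer(path, buf)           # flush whatever remains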
Exemplo n.º 50
0
def process_ref(ref, ref_annot_flag, r_root_dir, fixed_dirs, run_dirs,
                run_id, timestamp, prot_db_name, project_id):
    """Re-annotate contig and extract reference segments using coordinates."""
    # set inputs and outputs
    run_root = r_root_dir+run_id+"/"
    ref_name = ref['name']
    in_file = fixed_dirs['ori_g_dir']+ref['file']
    seg_out_root = run_root+run_dirs['ref_seg_dir']+ref_name+"/"
    gen_fas_root = fixed_dirs['fas_contigs_dir']+ref_name+"/"
    if ref_annot_flag:
        ref_gbk = run_root+run_dirs['ref_gbk_dir']+ref_name+"_re-annot.gbk"
    else: ## bypass re-annotation ONLY IF ORIGINAL INPUT IS GBK #todo: fix
        ref_gbk = in_file
    ref_fas = run_root+run_dirs['ref_fas_dir']+ref_name+".fas"
    genome_fas = gen_fas_root+ref_name+"_1.fas"
    report_root = run_root+run_dirs['reports']+ref_name+"/"
    ref_log = report_root+run_id+"_"+ref_name+"_log.txt"
    ensure_dir([seg_out_root, report_root, gen_fas_root])
    print " ", ref_name, "...",
    # initialize run_ref object
    run_ref = Reference(ref_name, in_file, ref['input'], ref['seg_mode'],
                        ref['capture'], ref_fas, ref_gbk, seg_out_root,
                        ref_log)
    # initialize reference log
    cl_header = ["# Console log:", run_id, "/", ref_name, timestamp, "\n\n"]
    open(ref_log, 'w').write(" ".join(cl_header))
    # open record and ensure we have a fasta in the right place
    if not path.exists(ref_fas):
        if run_ref.input == 'fas':
            copyfile(in_file, ref_fas)
        elif run_ref.input == 'gbk':
            record = load_genbank(in_file)
            record.id = ref_name
            write_fasta(ref_fas, record)
        else:
            msg = "ERROR: Input not recognized for "+ref_name
            run_ref.log(msg)
            raise Exception(msg)
    # make a BLAST DB
    make_ref_DB(ref, run_id, fixed_dirs, r_root_dir, run_dirs)
    copyfile(ref_fas, genome_fas)
    # re-annotate ref contig
    if ref_annot_flag:
        record = annot_ref(ref_name, ref_fas, prot_db_name, fixed_dirs,
                           project_id)
    else: ## bypass re-annotation ONLY IF ORIGINAL INPUT IS GBK #todo: fix
        record = load_genbank(in_file)
    # load or generate segment definitions
    if run_ref.seg_mode == 'chop':
        run_ref.get_segs_from_chop(len(record.seq), ref['chop_size'])
    elif run_ref.seg_mode == 'list':
        run_ref.get_segs_from_list(ref['segs'])
    elif run_ref.seg_mode == 'feats':
        run_ref.get_segs_from_feats(ref['feat_type'])
    # extract segment sequences
    rec_annot = run_ref.extract_segs_seqs(record, seg_out_root)
    # write re-annotated reference sequence to file
    write_genbank(ref_gbk, rec_annot)
    # report results
    logstring = " ".join([str(len(run_ref.segs)), "segments"])
    print logstring
    run_ref.log(logstring)
    return run_ref
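
The 'chop' segmentation mode is only dispatched here via run_ref.get_segs_from_chop; as a rough illustration of what fixed-size chopping amounts to, assuming the last window is simply shorter (consistent with the "last segment being always a little short" workaround later in the pipeline):

def chop_segments(seq_len, chop_size):
    """Return fixed-size (start, end) windows covering seq_len bases."""
    segs = []
    start = 0
    counter = 1
    while start < seq_len:
        end = min(start + chop_size, seq_len)
        segs.append({'name': str(counter), 'coords': (start, end)})
        start = end
        counter += 1
    return segs

# chop_segments(25, 10) -> windows (0, 10), (10, 20), (20, 25)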
Exemplo n.º 51
0
def unpack_genomes(genome, separator, fixed_dirs, ctg_thresholds):
    """Unpack genome files.

    Here, unpacking means extracting data and producing specific files to
    standardize how the information is made available to downstream analysis.
    Depending on the input file format, different unpacking methods are
    invoked. In all cases, this ensures that for each genome, there is a
    multifasta file of the contigs all together as well as a separate Genbank
    file for each contig.

    Supported input file formats are the following:
    - mfas: Basic whole genome sequence in multifasta file of contigs. This
    can be used to process a finished genome in a single Fasta file as well.
    - cgbk: All contigs concatenated in a single GenBank file (Genoscope,
    French WGS). This can be used to process a finished genome in a single
    GenBank file as well.
    # TODO: provide support for other possible input formats

    Unpacking 'cgbk' genomes involves an initial step to detect occurrences
    of the sequence separator and collect the start and stop coordinates of
    each contig. Each pair of coordinates can then be used to extract the
    contig sequence and create a SeqRecord for that contig, which SeqIO
    normally does when it unpacks multifasta files.

    """
    # set up inputs
    infile = genome['file'] #TODO: make GUI input loader (upstream)
    inpath = fixed_dirs['ori_g_dir']+infile
    g_name = genome['name']
    print " ", g_name, "...",
    # prep output destinations
    mfas_dir = fixed_dirs['mfas_contigs_dir']
    fas_dir = fixed_dirs['fas_contigs_dir']+g_name+"/"
    ensure_dir([mfas_dir, fas_dir])
    mfas_file = mfas_dir+g_name+"_contigs.fas"
    records = []
    # select unpacking method
    if genome['input'] == 'fas':
        if not path.exists(inpath):
            raise Exception("Bad input file path")
        genome_recs = load_multifasta(inpath)
        # generate GenBank files
        counter = 0
        for rec in genome_recs:
            counter +=1
            ctg_num = str(counter)
            new_id = g_name+"_"+ctg_num  # workaround for long ids
            new_seq = rec.seq
            new_seq.alphabet = generic_dna
            new_rec = SeqRecord(seq=new_seq, id=new_id)
            records.append(new_rec)  # for multifasta output
            fas_file = fas_dir+new_id+".fas"
            write_fasta(fas_file, new_rec)
    elif genome['input'] == 'gbk':
        # load in genome data
        genome_rec = load_genbank(inpath)
        g_string = genome_rec.seq
        # find split coordinates
        coord_pairs = multisplit_finder(g_string, separator)
        # split record
        counter = 0
        for (start, stop) in coord_pairs:
            counter +=1
            ctg_num = str(counter)
            new_record = genome_rec[start:stop]
            new_record.id = g_name+"_"+ctg_num
            records.append(new_record)  # for multifasta output
            fas_file = fas_dir+g_name+"_"+ctg_num+".fas"
            write_fasta(fas_file, new_record)
    else:
        xmsg = "Input file format "+genome['input']+" unspecified/unsupported"
        raise Exception(xmsg)
    print counter, "contigs"
    # write master file
    write_fasta(mfas_file, records)
    # pass records to stats logger
    ctg_stats(g_name, fixed_dirs, ctg_thresholds, records)
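
The docstring's 'cgbk' case hinges on finding every occurrence of the separator and turning the stretches in between into (start, stop) coordinate pairs. multisplit_finder is not shown here, so this is only a hedged sketch of that idea:

def find_contig_coords(sequence, separator):
    """Return (start, stop) pairs for the stretches between separator runs."""
    seq = str(sequence)
    coord_pairs = []
    start = 0
    pos = seq.find(separator)
    while pos != -1:
        if pos > start:                  # skip zero-length stretches
            coord_pairs.append((start, pos))
        start = pos + len(separator)
        pos = seq.find(separator, start)
    if start < len(seq):                 # trailing contig after the last separator
        coord_pairs.append((start, len(seq)))
    return coord_pairs

# find_contig_coords("AAANNNCCCNNNGG", "NNN") -> [(0, 3), (6, 9), (12, 14)]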
Exemplo n.º 52
0
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator,
                    genomes, run_id, timestamp, mtype, mode):
    """Build a scaffold of contigs based on the reference.

    This takes contigs that gave positive hits when blasted with reference
    segments. The contigs were aligned against the complete reference in a
    previous step for mapping purposes. Now the output of that step is re-used to
    determine their position. A caveat is that if there are natural local
    rearrangements in the sequence relative to the reference, they may not be
    resolved appropriately. The problem is somewhat moderated by the fact that
    this function takes the best (usually the largest) hit region as "anchor"
    to position the contig within the scaffold. But if the rearranged region
    takes up a significant portion of the contig length, the anchoring will
    probably not be called correctly. Visual inspection of the finalized
    maps should help diagnose any such problems. The order can be fixed
    manually using the Mauve Contig Mover, which is part of Mauve 2.

    Note that not all hit contigs are "real" hits, so filtering should be
    applied before scaffolding to generate constructs.

    Model-based filtering produces a list of contigs that will be passed to
    the scaffolder. If filtering manually by looking at the maps,
    there are two options available: either select exclusively OR exclude a
    subset of contigs for the scaffolding process. This is done by listing
    their ID number in the genome dictionaries in the config file then
    resuming the pipeline from this step.

    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ctgs_root = run_root+run_dirs['run_gbk_ctgs_dir']+ref_n+"/"
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/"
    scaffolds_dir = run_root+run_dirs['scaffolds_dir']+ref_n+"/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Build scaffold constructs @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        ctgs_dir = ctgs_root+g_name+"/"
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root+g_name+"/"
        ensure_dir([mauve_dir, scaffolds_dir])
        scaff_fas = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.fas"
        scaff_gbk = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.gbk"
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_dir)
        anchors_array = np.zeros(1, dtype=[('ctg', 'i4'),
                                           ('start', 'i4'),
                                           ('end', 'i4'),
                                           ('orient', 'i2')])
        # identify contigs we want to select
        subset = []
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.gbk$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)

                if mode == "exclude":
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = "("+ctg_num+")"
                            print msg,
                            run_ref.log(msg)
                        else:
                            subset.append(ctg_num)
                    except KeyError:
                        msg = "WARNING: no ignored segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)
                        
                elif mode == "select":
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = ctg_num
                            print msg,
                            run_ref.log(msg)
                            subset.append(ctg_num)
                        else:
                            msg = "("+ctg_num+")"
                            print msg,
                            run_ref.log(msg)
                    except KeyError:
                        msg = "WARNING: no selected segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)
                        
        # at this point we should have a subset of contigs selected
        for ctg_num in subset:
            logstring = "".join(["\t", ctg_num])
            run_ref.log(logstring)
            # set inputs
            mauve_file = mauve_dir+ctg_num+".mauve"
            bb_file = mauve_file+".backbone"
            try:
                # parse Mauve output
                coords = mauver_load2_k0(bb_file, prox_D, mtype)
                # determine which segment to use as anchor
                anchor_seg = get_anchor_loc(coords)
                anchors_array = np.insert(anchors_array, 0,
                                          (ctg_num,
                                           anchor_seg['start'],
                                           anchor_seg['end'],
                                           anchor_seg['orient']))
            except IOError:
                msg = "\tERROR: Mauve alignment not found\n\t"
                print msg
                run_ref.log(msg)
            except Exception:
                msg = "\tERROR: Iteration failure\n\t"
                print msg
                run_ref.log(msg)

        # abort if there is no valid contig to proceed with
        try:
            assert len(anchors_array) > 1 # always 1 left from stub
        except AssertionError:
            msg = "\tWARNING: Contig list empty\n\t"
            print msg
            run_ref.log(msg)
        else:
            # order contigs by anchor location
            anchors_array = np.sort(anchors_array, order='start')
            # load contig records from the genbank files in the matches directory
            ctg_list = []
            for ctg_anchor in anchors_array:
                ctg_num = ctg_anchor['ctg']
                if ctg_num > 0:
                    contig_gbk = ctgs_dir+g_name+"_"+str(ctg_num)+".gbk"
                    record = load_genbank(contig_gbk)
                    if ctg_anchor['orient'] == -1: # flip record
                        record = record.reverse_complement(id=True, name=True,
                            annotations=True, description=True)
                    ctg_list.append(record)
                else: # workaround for having 0 value leftover from stub
                    pass # having it might come in handy in later dev
            # output scaffold files
            write_fasta(scaff_fas, ctg_list)
            scaff_record = SeqRecord('', id='temp')
            scaff_bumper = SeqRecord(separator, id='join')
            for record in ctg_list:
                feat_start = len(scaff_record.seq)
                scaff_record += record
                feat_stop = len(scaff_record.seq)
                scaff_record += scaff_bumper
                feat_loc = FeatureLocation(feat_start, feat_stop)
                pattern = re.compile(r'.*_(\d*)$')
                match = pattern.match(record.id)
                try: ctg_num = match.group(1)
                except Exception: ctg_num = 'N'
                feature = SeqFeature(location=feat_loc,
                                     type='contig',
                                     qualifiers={'id': ctg_num})
                scaff_record.features.append(feature)
            scaff_record.description = g_name+" scaffold from "+ref_n
            try:
                scaff_record.id = g_name
                write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper
            except ValueError:
                scaff_record.id = g_name[:10]
                write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper
            print ""
Exemplo n.º 53
0
def glomp_blast_out(dataset, ref_nick):
    """Consolidate Blast output files."""
    # Identify the genome
    nickname = dataset['nickname']
    # Determine the input file root
    root_dir = dirs['blast_out_dir']+nickname+"/"+ref_nick+"/"
    file_root = root_dir+nickname
    # Signal process start
    print "-- Consolidating B_out for", nickname, "against", ref_nick, "--"
    print datetime.now()
    # Cycle through bin types
    series_index = 0
    averages = []       # for comparing series later
    binned_pos = []
    for bin_type in bin_types:
        index = 1
        bin_arrays = []
        while os.path.isfile(file_root+bin_type+"_"+str(index)+"_blast.out"):
            infile = file_root+bin_type+"_"+str(index)+"_blast.out"
            rec_array = read_array(infile, blast_dtypes)
            if len(rec_array) > 0:
                bin_arrays.append(rec_array)
            index +=1
        print "\t\t"+str(len(bin_arrays)), "arrays for", \
        nickname+bin_type, "series"
        if len(bin_arrays) > 0:
            series = numpy.hstack(bin_arrays)
        else:
            series = []
        print "\t\t"+str(len(series)), "total records in", \
        nickname+bin_type, "series"
        # Save to file
        cons_outfile = file_root+bin_type+"_cons_out.npy"
        numpy.save(cons_outfile, series)
        # Evaluate match positions on reference
        positions = []
        match_read = []
        for row in series:
            # collect match read info while we're at it
            # use regex to extract query index
            query_pattern = re.compile(r'\w*_(\d*)')
            query_match = query_pattern.match(row[0])
            query_index = int(query_match.group(1))
            match_read.append(query_index)
            # use regex to extract ref coords
            ref_pattern = re.compile(r'\w*_\d*_(\d*)')
            ref_match = ref_pattern.match(row[1])
            ref_pos = int(ref_match.group(1))
            pos_scaled = ref_pos/cpm['size']   # adjust to db segment length
            positions.append(pos_scaled)
        # uniquify the match read array
        unique_matches = numpy.unique(match_read)
        print "\t"+str(len(unique_matches)), "unique matches for", bin_type
        # write to file for future use
        match_dir_root = dirs['match_dir']+nickname+"/"+ref_nick+"/"
        ensure_dir(match_dir_root)
        match_outfile = match_dir_root+nickname+bin_type+"_match.npy"
        numpy.save(match_outfile, unique_matches)
        # now count occurrences per position
        pos_np = numpy.array(positions)
        binned = numpy.bincount(pos_np)
        binned_pos.append(binned)
        pos_count_average = numpy.average(binned)
        averages.append((pos_count_average, series_index))
        series_index +=1
    # compare series
    averages.sort()
    averages.reverse()
    order_indices = []
    for pair in averages:
        order_indices.append(pair[1])
    # identify reference
    ref_name = [reference['full_name'] for reference in references
                if reference['nickname'] == ref_nick][0]
    # prep directory & file
    fig_root = dirs['reports_dir']+"match_figs/"
    fig_file = fig_root+nickname+"_"+ref_nick+".png"
    ensure_dir(fig_root)
    # generate a figure
    pylot.autoscale(enable=True, axis='both', tight=True)
    pylot.xlabel('Position on the chromosome (/'+str(cpm['size'])+')')
    pylot.ylabel('Number of matches (includes multiples)')
    pylot.title(nickname+' matches to '+ref_name)
    pylot.grid(True)
    for index in order_indices:
        label_root = nickname+bin_types[index]
        label_str = label_root+" ("+str(numpy.sum(binned_pos[index]))+")"
        pylot.plot(binned_pos[index], label=label_str)
    pylot.legend(loc=1)
    pylot.savefig(fig_file, dpi=None, facecolor='w', edgecolor='w',
                  orientation='portrait', papertype=None, format=None)
    pylot.clf()
    print "\t"+str(series_index), "series consolidated and parsed"
    print "-- Done, see plot --"
    print datetime.now()
    return "OK"
Exemplo n.º 54
0
def filter_contigs(run_ref, run_id, genomes, norm_matches, seg_size, threshold, r_root_dir, run_dirs, fixed_dirs, timestamp):
    """Filter contigs."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    fas_root = fixed_dirs['fas_contigs_dir']
    report_root = run_root+run_dirs['reports']+ref_n+"/"
    ensure_dir([report_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Filter contigs @", timestamp, "\n"])
    run_ref.log(logstring)
    # process
    
    # evaluate segment specificity using negative controls
    neg_controls = [genome['name'] for genome in genomes if ('ctrl' in genome.keys() and genome['ctrl'] == 'neg')]
    neg_dat = [norm_matches[g_name]['ctg_scores'] for g_name in neg_controls]
    neg_RA = np.vstack(neg_dat)
    neg_mean = nanmean(neg_RA, axis=0)
    # process the genomes we're testing
    test_genomes = [genome['name'] for genome in genomes if not ('ctrl' in genome.keys())]
    for g_name in test_genomes: 
        print "\t", g_name,
        ctg_hits = norm_matches[g_name]['ctg_scores']
        ctg_stats = {}
        #process individual contigs
        counter = 0
        for ctg_RA in ctg_hits:
            # identify this contig by name
            ctg_name = norm_matches[g_name]['ctg_names'][counter]
            counter += 1
            # subtract background signal from match scores
            recal_ctg_RA = np.subtract(ctg_RA, neg_mean)
            recal_ctg_RA = recal_ctg_RA.clip(min=0)
            # compute total similarity score
            s_score = np.sum(recal_ctg_RA)
            # compute clustering score (primitive)
            streak = False
            c_score = 0
            for hit in recal_ctg_RA:
                if hit == 0:
                    if streak:
                        c_score -= 1
                        streak = False
                elif hit > 0:
                    if streak:
                        c_score += 2
                    else:
                        c_score += 1
                        streak = True
            # compute backbone vs. cargo burden
            ctg_rec = load_fasta(fas_root+g_name+"/"+ctg_name+".fas")
            bbone = np.sum(np.ma.make_mask(recal_ctg_RA))*seg_size
            if bbone > len(ctg_rec):
                bbone = len(ctg_rec)    # workaround for last segment being always a little short
            cargo = len(ctg_rec) - bbone
            # make inverted array mask (used for redundancy detection)
            ctg_mask = np.ma.getmaskarray(np.ma.masked_equal(recal_ctg_RA,0))
            # consolidate contig information
            ctg_stats[ctg_name] = {'s_score': s_score, 
                                    'c_score': c_score, 
                                    'vector': recal_ctg_RA, 
                                    'inv_mask':ctg_mask,
                                    'bbone': bbone,
                                    'cargo': cargo}
        # detect redundant contigs
        ### use np.ma.mask_or(m1, m2)
        ### if any element comes back False there is a redundancy between two contigs
        ### if so evaluate which has better c_score and s_score
        
        # compute overall stats for the genome
        gs_score = sum([ctg_stats[contig]['s_score'] for contig in ctg_stats])
        gc_score = sum([ctg_stats[contig]['c_score'] for contig in ctg_stats])
        g_bbone = sum([ctg_stats[contig]['bbone'] for contig in ctg_stats])
        g_cargo = sum([ctg_stats[contig]['cargo'] for contig in ctg_stats])
        print gs_score, gc_score, g_bbone, g_cargo,
        # 
        if gs_score > threshold:
            ## run plotters again 
            ## pass the genome on to the next step (others will be dropped)
            print "MATCH"
        else:
            print "(-)"
Exemplo n.º 55
0
def align_cstrct2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs,
                     genomes, max_size, chop_mode, mtype, mauve_exec):
    """Align constructs pairwise to the reference contig."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/constructs/"
    segments_root = run_root+run_dirs['aln_seg_dir']+ref_n+"/constructs/"
    scaff_root = run_root+run_dirs['scaffolds_dir']+ref_n+"/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Align scaffold constructs to reference @",
                         timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        scaff_gbk = scaff_root+g_name+"_"+ref_n+"_scaffold.gbk"
        file_list = (ref_ctg_file, scaff_gbk)
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root+g_name+"/"
        aln_segs_dir = segments_root+g_name+"/"
        ensure_dir([mauve_dir, aln_segs_dir])
        mauve_outfile = mauve_dir+g_name+"_"+ref_n+".mauve"
        segfile = aln_segs_dir+g_name+"_"+ref_n+"_segs.txt"
        # abort if the reference file is not found
        try: open(ref_ctg_file, 'r')
        except IOError:
            msg = "ERROR: Reference file not found"
            print msg
            run_ref.log(msg)
            raise
        # abort if there is no scaffold construct
        try: open(scaff_gbk, 'r')
        except IOError:
            msg = "WARNING: No scaffold construct to align"
            print msg
            run_ref.log(msg)
        else:
            # prep segments file
            open(segfile, 'w').write('')
            # purge any pre-existing sslist file
            sslist_file = scaff_gbk+".sslist"
            if os.path.isfile(sslist_file):
                os.remove(sslist_file)
            # do Mauve alignment
            align_mauve(file_list, mauve_outfile, mauve_exec)
            try:
                # parse Mauve output (without initial clumping)
                coords = mauver_load2_k0(mauve_outfile+".backbone", 0, mtype)
                print len(coords), '->',
                logstring = "".join(["\t", str(len(coords))])
                run_ref.log(logstring)
                # chop segments that are too long
                chop_array = chop_rows(coords, max_size, chop_mode, mtype)
                print len(chop_array), 'segments <', max_size, 'bp',
                logstring = "".join(["\t", str(len(chop_array))])
                run_ref.log(logstring)
                # make detailed pairwise alignments of the segments
                ref_rec = load_genbank(ref_ctg_file)
                query_rec = load_genbank(scaff_gbk)
                overall_id = iter_align(chop_array, ref_rec, query_rec,
                                        aln_segs_dir, segfile)
                print "@", overall_id, "% id. overall"
                logstring = "".join(["\t", str(overall_id)])
                run_ref.log(logstring)
            except IOError:
                msg = "\nERROR: Mauve alignment failed"
                run_ref.log(msg)
                print msg
Exemplo n.º 56
0
def glompX_blast_out(genomes, run_ref, blast_mode, r_root_dir, run_dirs,
                     run_id, fixed_dirs, blast_dtypes, references,
                     min_nt_match, min_nt_score, min_nt_idp, min_aa_match,
                     min_aa_score, min_aa_idp, capture_span, timestamp):
    """Collect Blast results and extract match contigs."""
    # load inputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    match_root = run_root+run_dirs['match_out_dir']+ref_n+"/"
    capture_root = run_root+run_dirs['capture_dir']+ref_n+"/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Collect Blast results @", timestamp, "\n\n"])
    run_ref.log(logstring)
    # collect results
    ref_hits = {}
    control_scores = []
    run_ref.log("Segs/Gs\t")
    run_ref.log("\t".join([genome['name'] for genome in genomes]))
    for seg in run_ref.segs:
        seg_n = seg['name']
        print "\t", seg_n, "...",
        run_ref.log("".join(["\n", seg_n]))
        blast_dir = run_root+run_dirs['blast_out_dir']+ref_n+"/"+seg_n+"/"
        capture_dir = capture_root+"/"+seg_n+"/"
        ensure_dir([blast_dir, capture_dir])
        ref_flag = True
        for genome in genomes:
            g_name = genome['name']
            print "|",
            # process
            if g_name not in ref_hits.keys():
                ref_hits[g_name] = {}
            matches_dir = match_root+g_name+"/"
            ensure_dir([matches_dir])
            blast_infile = blast_dir+g_name+"_out.txt"
            genome_ctg_dir = fixed_dirs['fas_contigs_dir']+g_name+"/"
            rec_array = read_array(blast_infile, blast_dtypes)
            if len(rec_array) > 0:  # take qualified hits
                p_cnt = 0
                n_cnt = 0
                if g_name in [ref['name'] for ref in references]:
                    copyfile(genome_ctg_dir+g_name+"_1.fas",
                             matches_dir+g_name+".fas")
                    if ref_flag:
                        # positive control TODO: better solution
                        control_scores.append(rec_array[0][11])
                        ref_flag = False
                for line in rec_array:
                    idp = line[2]
                    q_start, q_stop = line[8], line[9]
                    score = line[11]
                    length = abs(q_stop-q_start)
                    # check the blast mode to use the right thresholds
                    if blast_mode == 'n' or blast_mode == 'tx':
                        min_match = min_nt_match
                        min_score = min_nt_score
                        min_idp = min_nt_idp
                    elif blast_mode == 'tn':
                        min_match = min_aa_match
                        min_score = min_aa_score
                        min_idp = min_aa_idp
                    else: # default to nucleotide mode
                        min_match = min_nt_match
                        min_score = min_nt_score
                        min_idp = min_nt_idp
                    if length > min_match and score > min_score and idp > min_idp:
                        print "+",
                        p_cnt +=1
                        contig_id = line[1]
                        if contig_id not in ref_hits[g_name].keys():
                            ref_hits[g_name][contig_id] = {seg_n: score}
                        else:
                            ref_hits[g_name][contig_id][seg_n] = score
                        pattern = re.compile(r'('+contig_id+')\.fas')
                        for item in listdir(genome_ctg_dir):
                            match = re.match(pattern, item)
                            if match:
                                fas_file = matches_dir+match.group(1)+".fas"
                                if not path.exists(fas_file):
                                    copyfile(genome_ctg_dir+item, fas_file)
                        # context capture
                        capture_flag = False
                        # segment names may be numeric or not; check both forms
                        try:
                            if int(seg_n) in run_ref.capture:
                                capture_flag = True
                        except ValueError:
                            if seg_n in run_ref.capture:
                                capture_flag = True
                        if capture_flag:
                            # load the sequence
                            contig_file = matches_dir+contig_id+".fas"
                            contig_rec = load_fasta(contig_file)
                            # check orientation
                            if q_start < q_stop:
                                c_start = q_start-capture_span
                                c_stop = q_stop+capture_span
                            else:
                                c_start = q_stop-capture_span
                                c_stop = q_start+capture_span
                            print c_start, c_stop
                            # check limits
                            if c_start < 0:
                                c_start = 1
                            if c_stop > len(contig_rec.seq):
                                c_stop = len(contig_rec.seq)
                            # proceed
                            cxt_file = capture_dir+g_name+"_"+contig_id+".fas"
                            cxt_rec = SeqRecord(id=contig_id+"_"
                                                    +str(c_start)+"_"
                                                    +str(c_stop),
                                                seq=contig_rec.seq
                                                    [c_start:c_stop])
                            write_fasta(cxt_file, cxt_rec)
                    else:
                        print "-",
                        n_cnt +=1
                if n_cnt > 0:
                    logstring = "".join(["\t", str(p_cnt), " (",
                                         str(n_cnt), ")"])
                else:
                    logstring = "".join(["\t", str(p_cnt)])
                run_ref.log(logstring)
            else:
                print "-",
                run_ref.log("".join(["\t", "0"]))
        print ""
    return ref_hits, control_scores
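
The context-capture coordinates above are just the hit interval, ordered, padded by capture_span on each side, and clamped to the contig; a compact sketch of that arithmetic (the original clamps the lower bound to 1 rather than 0):

def capture_window(q_start, q_stop, capture_span, contig_len):
    """Return (start, stop) of the padded, clamped capture region."""
    start, stop = sorted((q_start, q_stop))
    start = max(start - capture_span, 0)
    stop = min(stop + capture_span, contig_len)
    return start, stop

# capture_window(1200, 800, 500, 1500) -> (300, 1500)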
Exemplo n.º 57
0
def prep_maps(run_ref, run_id, timestamp, g_select, r_root_dir, run_dirs,
              genomes, fixed_dirs, segtype, min_size, fct_flags,
              fct_colors, idpt): 
    """Set up generation of various maps."""
    # set inputs and outputs
    ref_ctg_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ref_gbk = run_ref.gbk
    cst_root = run_root+run_dirs['scaffolds_dir']+ref_ctg_n+"/"
    segments_root = run_root+run_dirs['aln_seg_dir']+ref_ctg_n+"/"
    ctg_segs_root = segments_root+"contigs/"
    cst_segs_root = segments_root+"constructs/"
    maps_root = run_root+run_dirs['maps_dir']+ref_ctg_n+"/"
    ctg_aln_maps_root = maps_root+"contig_alns/"
    cst_ann_maps_root = maps_root+"constructs_annot/"
    cst_aln_maps_root = maps_root+"constructs_aln/"
    ensure_dir([cst_root, ctg_segs_root, cst_segs_root, maps_root,
                ctg_aln_maps_root, cst_ann_maps_root, cst_aln_maps_root])
    print " ", ref_ctg_n, "...",
    # log
    logstring = "".join(["\n\n# Generate maps @", timestamp, "\n\n"])
    run_ref.log(logstring)
    # map of reference with segment details
    map_ref_segs(run_ref, run_id, r_root_dir, run_dirs, min_size,
                 fct_flags, fct_colors, idpt)
    # log
    logstring = "ref_map"
    run_ref.log(logstring)
    print logstring
    # cycle through genomes
    for genome in genomes:
        # set inputs and outputs
        g_name = genome['name']
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
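        # NB: the while/try construct below skips genomes absent from g_select;
        # a TypeError (g_select is None) means every genome is included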
        while True:
            try:
                if g_name in g_select: pass
                else: break
            except TypeError:
                pass
            print "\t", g_name, "...",
            scaff_gbk = cst_root+g_name+"_"+ref_ctg_n+"_scaffold.gbk"
            ctg_aln_maps_dir = ctg_aln_maps_root+g_name+"/"
            ensure_dir([ctg_aln_maps_dir])
            # maps of contigs aligned to reference
            logstring = "ctg_aln"
            print logstring,
            logstring = "".join(["\t", logstring])
            run_ref.log(logstring)
            map_ctg_alns(run_ref, ref_gbk, genome, ctg_segs_root,
                         ctg_aln_maps_dir, fixed_dirs, segtype, min_size,
                         fct_flags, fct_colors, idpt) 
            # map of scaffold construct
            logstring = "cst_ant"
            print logstring,
            logstring = "".join(["\t", logstring])
            run_ref.log(logstring)
            map_cst_annot(run_ref, genome, scaff_gbk, cst_ann_maps_root,
                          fct_flags, fct_colors)
            # map of construct aligned to reference
            logstring = "cst_aln"
            print logstring
            logstring = "".join(["\t", logstring])
            run_ref.log(logstring)
            map_cst_aln(run_ref, ref_gbk, genome, scaff_gbk, cst_segs_root,
                        cst_aln_maps_root, segtype, min_size, fct_flags,
                        fct_colors, idpt)
            break
Exemplo n.º 58
0
def align_ctg2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs,
                  genomes, mauve_exec, max_size, chop_mode, mtype):
    """Align contigs pairwise to the reference contig."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/"
    segments_root = run_root+run_dirs['aln_seg_dir']+ref_n+"/contigs/"
    q_ctgs_root = run_root+run_dirs['match_out_dir']+ref_n+"/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Align contigs to ref @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs and outputs
        g_name = genome['name']
        ctgs_fas_dir = q_ctgs_root+g_name+"/"
        mauve_dir = mauve_root+g_name+"/"
        aln_segs_root = segments_root+g_name+"/"
        ensure_dir([mauve_dir])
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_fas_dir)
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.fas$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)
                print ctg_num,
                logstring = "".join(["\t", ctg_num])
                run_ref.log(logstring)
                # set inputs and outputs
                q_contig = ctgs_fas_dir+item
                file_list = (ref_ctg_file, q_contig)
                mauve_outfile = mauve_dir+ctg_num+".mauve"
                aln_segs_dir = aln_segs_root+ctg_num+"/"
                ensure_dir([aln_segs_dir])
                segfile = aln_segs_dir+ctg_num+"_"+ref_n+"_segs.txt"
                open(segfile, 'w').write('')
                # do Mauve alignment
                try:
                    open(ref_ctg_file, 'r')
                    open(q_contig, 'r')
                except IOError:
                    msg = "\nERROR: File missing, cannot align\n\t\t\t"
                    run_ref.log(msg)
                    print msg
                else:
                    align_mauve(file_list, mauve_outfile, mauve_exec)
                    try:
                        # parse Mauve output (without initial clumping)
                        coords = mauver_load2_k0(mauve_outfile+".backbone",
                                                 0, mtype)
                        # chop segments that are too long
                        chop_array = chop_rows(coords, max_size, chop_mode,
                                               mtype)
                        # make detailed pairwise alignments of the segments
                        ref_rec = load_genbank(ref_ctg_file)
                        query_rec = load_fasta(q_contig)
                        iter_align(chop_array, ref_rec, query_rec,
                                   aln_segs_dir, segfile)
                    except IOError:
                        msg = "\nERROR: Mauve alignment failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
                    except Exception:
                        msg = "\nERROR: Iteration failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
        print ""