def gen_qsubs():
    """Write one qsub shell script per read-1 FASTQ split, plus a command list.

    Scans ``inp_dir`` for files named ``<basename>_R<d>_001_<split>.fastq``.
    For each read-1 file a shell script running
    ``<NAME>.py <split> <basename>`` is written to
    ``_config.QSUBS_DIR + NAME + '/'`` and the matching ``qsub`` command is
    recorded. All commands are saved, newline separated, to
    ``_commands.txt`` in the same directory.

    Directory entries that do not follow the naming pattern are skipped
    instead of aborting the whole run.
    """
    print('Generating qsub scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(qsubs_dir)
    qsub_commands = []

    # Raw string so that \d is a regex class rather than an invalid string
    # escape; compiled once instead of once per directory entry.
    fastq_pattern = re.compile(r"(.*)_R(\d)_001_(\d+)\.fastq")
    script_id = NAME.split('_')[0]

    num_scripts = 0
    for fn in os.listdir(inp_dir):
        found = fastq_pattern.search(fn)
        if found is None:
            # Not a split FASTQ file (e.g. a stray directory or log file).
            continue
        basename, rnum, snum = found.groups()
        if int(rnum) != 1:
            # Only read 1 spawns a job; no job is generated for other reads.
            continue

        command = '/cluster/bh0085/anaconda27/envs/py3/bin/python %s.py %s %s' % (
            NAME, snum, basename)

        # Write shell scripts
        sh_fn = qsubs_dir + 'q_%s_%s_%s.sh' % (script_id, basename, snum)
        with open(sh_fn, 'w') as f:
            f.write('#!/bin/bash\n%s\n' % (command))
        num_scripts += 1

        # Write qsub commands
        qsub_commands.append('qsub -m e -wd %s %s' %
                             (_config.SRC_DIR, sh_fn))

    # Save commands
    with open(qsubs_dir + '_commands.txt', 'w') as f:
        f.write('\n'.join(qsub_commands))

    print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
    return
def gen_qsubs():
    """Emit a qsub shell script for every (split, barcode-substring offset) pair.

    Offsets run over ``range(0, bc_substr_max, bc_substr_inc)`` for each of
    the ``N_SPLITS`` splits. Each script runs ``./<NAME>.py <split> <offset>``.
    The scripts and a ``_commands.txt`` file listing the matching ``qsub``
    invocations are written to ``QSUBS_DIR + NAME + '/'``.
    """
    print('Generating qsub scripts...')
    target_dir = QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(target_dir)

    # Enumerate the jobs in the same order as a split-major nested loop.
    jobs = [(split_idx, offset)
            for split_idx in range(N_SPLITS)
            for offset in range(0, bc_substr_max, bc_substr_inc)]

    submit_lines = []
    written = 0
    for split_idx, offset in jobs:
        job_cmd = f'./{NAME}.py {split_idx} {offset}'
        prefix = NAME.split('_')[0]

        # Shell script wrapping the job command
        script_path = target_dir + f'q_{prefix}_{offset}_{split_idx}.sh'
        with open(script_path, 'w') as handle:
            handle.write(f'#!/bin/bash\n{job_cmd}\n')
        written += 1

        # Submission command for this script
        submit_lines.append(
            f'qsub -m e -wd {_config.SRC_DIR} {script_path}')

    # Persist the submission commands next to the scripts
    with open(target_dir + '_commands.txt', 'w') as handle:
        handle.write('\n'.join(submit_lines))

    print(f'Wrote {written} shell scripts to {target_dir}')
    return
예제 #3
0
def gen_qsubs():
    """Emit one qsub shell script per row of ``SEQUENCING_INFO``.

    Each script runs ``<NAME>.py <Name> 0 <len(LIBRARY_DF)>``, i.e. a single
    batch starting at 0 for every row. Scripts and the ``_commands.txt``
    list of ``qsub`` invocations go to ``_config.QSUBS_DIR + NAME + '/'``.
    """
    print('Generating qsub scripts...')
    target_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(target_dir)

    interpreter = '/cluster/bh0085/anaconda27/envs/py3/bin/python'
    submit_lines = []
    written = 0

    for _, row in SEQUENCING_INFO.iterrows():
        barcode = row.Name
        # Only one batch per row: it starts at 0 and ends at the library size.
        first_idx = 0
        last_idx = len(LIBRARY_DF)

        job_cmd = ' '.join([
            interpreter,
            f'{NAME!s}.py',
            str(barcode),
            str(first_idx),
            str(last_idx),
        ])
        prefix = NAME.split('_')[0]

        # Shell script wrapping the job command
        script_path = target_dir + f'q_{prefix!s}_{barcode!s}_{first_idx!s}.sh'
        with open(script_path, 'w') as handle:
            handle.write('#!/bin/bash\n' + job_cmd + '\n')
        written += 1

        # Submission command for this script
        submit_lines.append(
            f'qsub -m e -wd {_config.SRC_DIR!s} {script_path}')

    # Persist the submission commands next to the scripts
    with open(target_dir + '_commands.txt', 'w') as handle:
        handle.write('\n'.join(submit_lines))

    print(f'Wrote {written!s} shell scripts to {target_dir!s}')
    return
예제 #4
0
def gen_qsubs():
    """Emit qsub shell scripts that process the oligo library in batches of 10.

    Batch starts run over ``range(0, max(oligo_lib.index), 10)``; each script
    runs ``./<NAME>.py <start> <start + 10>``. Scripts and the
    ``_commands.txt`` list of ``qsub`` invocations are written to the
    ``NAME`` sub-directory of ``QSUBS_DIR``.
    """
    print('Generating qsub scripts...')
    target_dir = os.path.join(QSUBS_DIR, NAME + '/')
    util.ensure_dir_exists(target_dir)

    batch_size = 10
    # NOTE(review): the range stops before max(oligo_lib.index), so when that
    # maximum is an exact multiple of the batch size no batch starts at it.
    # Confirm whether the worker treats `end` as inclusive.
    batch_starts = range(0, max(oligo_lib.index), batch_size)

    submit_lines = []
    written = 0
    for first in batch_starts:
        last = first + batch_size
        job_cmd = './%s.py %s %s' % (NAME, first, last)
        prefix = NAME.split('_')[0]

        # Shell script wrapping the job command
        script_path = target_dir + 'q_%s_%s.sh' % (prefix, first)
        with open(script_path, 'w') as handle:
            handle.write('#!/bin/bash\n%s\n' % (job_cmd,))
        written += 1

        # Submission command for this script
        submit_lines.append(
            'qsub -m e -wd %s %s' % (_config.SRC_DIR, script_path))

    # Persist the submission commands next to the scripts
    with open(target_dir + '_commands.txt', 'w') as handle:
        handle.write('\n'.join(submit_lines))

    print('Wrote %s shell scripts to %s' % (written, target_dir))
    return
예제 #5
0
def main(nm='', start='', end=''):
    """Re-master alignments for experiments ``start`` .. ``end - 1`` of ``nm``.

    Called with no arguments it only generates the qsub scripts. Otherwise
    it prepares the output directories, sets the module-level
    ``expected_cutsite`` to 30 and, for every experiment index, feeds
    ``<inp_place><nm>/<split>/<index>.txt`` from each split directory
    (skipping the entry named ``aligns``) through ``remaster_aligns`` before
    saving the combined result with ``save_alignments``.

    Args:
        nm: name of the input/output sub-directory.
        start: first experiment index (anything ``int()`` accepts).
        end: exclusive upper bound of the processing loop.
    """
    global expected_cutsite

    if all(arg == '' for arg in (nm, start, end)):
        gen_qsubs()
        return

    start = int(start)
    end = int(end)
    out_dir = out_place + nm + '/'
    util.ensure_dir_exists(out_dir)

    print('Preparing alignment output directories...')
    prepare_align_outdirs(out_dir, start, end)
    print('Done')

    expected_cutsite = 30

    inp_dir = inp_place + nm + '/'

    # NOTE(review): the timer total is end - start + 1 while the loop below
    # runs end - start times -- confirm which bound is intended.
    progress = util.Timer(total=end - start + 1)
    for exp_idx in range(start, end):
        collected = defaultdict(list)
        split_dirs = (entry for entry in os.listdir(inp_dir)
                      if entry != 'aligns')
        for split_dir in split_dirs:
            remaster_aligns(inp_dir + f'{split_dir!s}/{exp_idx!s}.txt',
                            collected)
        save_alignments(collected, out_dir, exp_idx)
        progress.update()

    return
예제 #6
0
def prepare_align_outdirs(out_plc, start, end):
    """Create an empty output directory for every index in ``start`` .. ``end``.

    For each index (``end`` inclusive) the directory
    ``<out_plc><index>/`` is created if needed and its existing non-hidden
    contents are removed.

    Args:
        out_plc: output root; it is concatenated directly with the index, so
            it is expected to end with a path separator.
        start: first index.
        end: last index (inclusive).

    Raises:
        OSError: if an existing entry cannot be removed.
    """
    # Local imports: only this function needs them.
    import glob
    import shutil

    util.ensure_dir_exists(out_plc)
    timer = util.Timer(total=end - start + 1)
    for exp in range(start, end + 1):
        out_idx_dir = out_plc + str(exp) + '/'
        util.ensure_dir_exists(out_idx_dir)
        # Delete in-process rather than via `rm -rf <dir>*` through a shell,
        # which breaks (or worse) on paths containing spaces or shell
        # metacharacters. glob's '*' skips dot-files just like the shell
        # wildcard did, so the same set of entries is removed.
        for stale_path in glob.glob(glob.escape(out_idx_dir) + '*'):
            if os.path.isdir(stale_path) and not os.path.islink(stale_path):
                shutil.rmtree(stale_path)
            else:
                # Regular files and symlinks (including links to directories).
                os.remove(stale_path)
        timer.update()
    return
예제 #7
0
def main(inp_dir, out_dir, nm='', start='', end=''):
    """Run genotyping for one job, or generate qsub scripts when no job is given.

    Args:
        inp_dir: input root; ``nm`` and a trailing slash are appended to it.
        out_dir: output directory, created if missing.
        nm, start, end: job arguments forwarded to ``genotype_data``. When
            all three are empty strings only ``gen_qsubs`` is run.

    Returns:
        ``out_dir`` after genotyping, or ``None`` in script-generation mode.
    """
    print(NAME)
    inp_dir = str(inp_dir) + str(nm) + '/'
    util.ensure_dir_exists(out_dir)

    # Function calls
    has_job_args = not all(arg == '' for arg in (nm, start, end))
    if has_job_args:
        genotype_data(inp_dir, out_dir, nm, start, end)
        return out_dir

    gen_qsubs()
    return
예제 #8
0
def gen_qsubs():
    """Write one qsub shell script per read-1 FASTQ split, with per-job logs.

    Scans ``inp_dir`` for files named ``<basename>_R<d>_001_<split>.fastq``.
    For each read-1 file a shell script running
    ``<NAME>.py <split> <basename>`` is written to
    ``_config.QSUBS_DIR + NAME + '/'``. The recorded ``qsub`` command sends
    stdout and stderr to ``o_``/``e_`` files in ``logs_dir`` whose names
    carry the integer timestamp taken when this function started. All
    commands are saved, newline separated, to ``_commands.txt``.

    Directory entries that do not follow the naming pattern are skipped
    instead of aborting the whole run.
    """
    print('Generating qsub scripts...')
    qsubs_dir = _config.QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(qsubs_dir)
    qsub_commands = []

    # Raw string so that \d is a regex class rather than an invalid string
    # escape; compiled once instead of once per directory entry.
    fastq_pattern = re.compile(r"(.*)_R(\d)_001_(\d+)\.fastq")
    script_id = NAME.split('_')[0]

    num_scripts = 0
    # One timestamp for the whole batch so its log files share a prefix.
    runtime = int(datetime.datetime.now().timestamp())
    for fn in os.listdir(inp_dir):
        found = fastq_pattern.search(fn)
        if found is None:
            # Not a split FASTQ file (e.g. a stray directory or log file).
            continue
        basename, rnum, snum = found.groups()
        print(basename, rnum, snum)
        print(inp_dir)
        if int(rnum) != 1:
            # Only read 1 spawns a job; no job is generated for other reads.
            continue

        command = '/cluster/bh0085/anaconda27/envs/py3/bin/python %s.py %s %s' % (
            NAME, snum, basename)

        # Write shell scripts
        sh_fn = qsubs_dir + 'q_%s_%s_%s.sh' % (script_id, basename, snum)
        with open(sh_fn, 'w') as f:
            f.write('#!/bin/bash\n%s\n' % (command))
        num_scripts += 1

        # Write qsub commands
        outfile = os.path.join(logs_dir,
                               f"o_{runtime}_{basename}_{snum}.txt")
        errorfile = os.path.join(logs_dir,
                                 f"e_{runtime}_{basename}_{snum}.txt")

        qsub_commands.append(
            f'qsub -o {outfile} -e {errorfile} -wd {_config.SRC_DIR} {sh_fn}'
        )

    # Save commands
    with open(qsubs_dir + '_commands.txt', 'w') as f:
        f.write('\n'.join(qsub_commands))

    print('Wrote %s shell scripts to %s' % (num_scripts, qsubs_dir))
    return
def gen_qsubs():
    """Emit one qsub shell script per experiment in ``EXP_NAMES``.

    Each script runs ``<NAME>.py <exp>``. The recorded ``qsub`` command
    sends stdout and stderr to ``o_``/``e_`` files in ``logs_dir`` whose
    names carry the integer timestamp taken when this function started.
    Scripts and ``_commands.txt`` are written to ``QSUBS_DIR + NAME + '/'``.
    """
    print('Generating qsub scripts...')
    target_dir = QSUBS_DIR + NAME + '/'
    util.ensure_dir_exists(target_dir)

    submit_lines = []
    written = 0

    # Shared by every log file name produced in this call.
    stamp = int(datetime.datetime.now().timestamp())

    for exp in EXP_NAMES:
        job_cmd = f'/cluster/bh0085/anaconda27/envs/py3/bin/python {NAME!s}.py {exp!s}'
        prefix = NAME.split('_')[0]

        # Shell script wrapping the job command
        script_path = target_dir + f'q_{prefix}_{exp}.sh'
        with open(script_path, 'w') as handle:
            handle.write('#!/bin/bash\n' + job_cmd + '\n')
        written += 1

        # Submission command with dedicated stdout / stderr log files
        stdout_log = os.path.join(logs_dir, f"o_{stamp}_{exp}.txt")
        stderr_log = os.path.join(logs_dir, f"e_{stamp}_{exp}.txt")
        submit_lines.append(
            f'qsub -o {stdout_log} -e {stderr_log} '
            f'-wd {_config.SRC_DIR} {script_path}')

    # Persist the submission commands next to the scripts
    with open(target_dir + '_commands.txt', 'w') as handle:
        handle.write('\n'.join(submit_lines))

    print(f'Wrote {written} shell scripts to {target_dir}')
    return
##
import pandas as pd
import numpy as np
import itertools as it
import os, re, sys
from collections import defaultdict

sys.path.append("/cluster/bh0085")
from mybio import util
from _config import DATA_DIR, OUT_PLACE, N_SPLITS, QSUBS_DIR, OLIGO_LIBRARY, POSITIVE_CONTROLS_FILE, EXP_NAMES
import _config

# I/O directory configuration.
# NAME comes from util.get_fn(__file__) and names this step's output
# sub-directory under OUT_PLACE; the directory is created at import time.
NAME = util.get_fn(__file__)
OUT_DIR = os.path.join(OUT_PLACE, NAME)
util.ensure_dir_exists(OUT_DIR)

##
# CUSTOM CODE
##

# Load the oligo library from the experimental design.
# NOTE: `oligos_lib` is the same object as OLIGO_LIBRARY (no copy is made),
# so adding the "id" column below also modifies the object held by _config.
oligos_lib = OLIGO_LIBRARY
oligos_lib["id"] = oligos_lib.index
positive_controls = pd.read_csv(POSITIVE_CONTROLS_FILE)

from _config import DATA_DIR, OUT_PLACE, N_SPLITS, QSUBS_DIR
# Input directories, located under OUT_PLACE.
# NOTE(review): the directory names look like outputs of earlier pipeline
# steps -- confirm.
TX_INP_DIR = os.path.join(OUT_PLACE, "c0_bin_transcript_umis")
OLIGO_INP_DIR = os.path.join(OUT_PLACE, "b1_demultiplex_oligos")

#load oligo library from experimental design
import _config
import sys, os, fnmatch, datetime, subprocess, imp
import numpy as np
from collections import defaultdict

sys.path.append('/cluster/bh0085/')
from mybio import util

import pandas as pd
import gzip

# Default params
# Two input directories taken from _config.
inp_dirs = [_config.SHE2955_DIR, _config.SHE3447_DIR]
# NAME comes from util.get_fn(__file__) and names the output sub-directory.
NAME = util.get_fn(__file__)
# Built by plain concatenation, so _config.OUT_PLACE is expected to end with
# a path separator; the directory is created at import time.
out_dir = _config.OUT_PLACE + NAME + '/'
util.ensure_dir_exists(out_dir)


##
# Functions
##
def split(inp_fn, out_nm):
    """Compute and report the per-chunk line count for splitting ``inp_fn``.

    The line count of ``inp_fn`` is divided into 30 chunks, rounded up so
    that 30 chunks cover every line, and then rounded up again to a multiple
    of four (presumably so 4-line FASTQ records are never cut in half --
    TODO confirm). The visible body only prints the resulting size;
    ``out_nm`` is not used here.
    """
    total_lines = util.line_count(inp_fn)

    n_chunks = 30
    lines_per_chunk = int(total_lines / n_chunks)
    # Round up when the truncated quotient would leave lines uncovered.
    if lines_per_chunk * n_chunks < total_lines:
        lines_per_chunk += 1
    # Pad to the next multiple of four (no-op when already aligned).
    lines_per_chunk += -lines_per_chunk % 4
    print('Using split size %s' % (lines_per_chunk))
예제 #12
0
import _config

from scipy.stats import entropy
import scipy.stats as stats


import matplotlib.pyplot as plt
#%matplotlib inline

# Directory holding the aggregated report CSVs read below.
DEFAULT_INP_DIR = _config.OUT_PLACE + 'f1_agg_reports/'
print(DEFAULT_INP_DIR)

# Fall back to a placeholder file name when __file__ is not defined
# (presumably when run interactively, e.g. in a notebook -- confirm).
if not "__file__" in vars(): __file__="f_test"
NAME = util.get_fn(__file__)
# Figure directory for this script, created at import time.
FIG_DIR = os.path.join(FIGS_PLACE, NAME)
util.ensure_dir_exists(FIG_DIR)

# Both tables are read with the first CSV column as index and then
# transposed, so the CSV's columns become the rows used below.
agg_results = pd.read_csv(os.path.join(DEFAULT_INP_DIR,f"{PRJ_NAME}_aggregate_stats.csv"),index_col=0).T
lib_results = pd.read_csv(os.path.join(DEFAULT_INP_DIR,f"{PRJ_NAME}_library_stats.csv"),index_col=0).T

# One row of three panels, 12 x 4 inches overall.
f,subs = plt.subplots(1,3)
f.set_size_inches(12,4)

# Panel 1: bar chart of frac_crispr_of_mapped per row of lib_results.
plt.sca(subs[0])
ax = plt.gca()
plt.bar(lib_results.index,lib_results.frac_crispr_of_mapped)
ax.set_title("CRISPR cutting efficiency\n(Fraction non-wildtype reads)")
ax.set_ylabel("cutting efficiency")
ax.set_xlabel("library ID")
# Put a tick on every tenth index value only.
ax.set_xticks(lib_results[::10].index)
예제 #13
0
def demultiplex(split, filename):
    """Demultiplex one split of a read-1/read-2 FASTQ pair into FASTA files.

    The experiment design is chosen from ``filename`` (the ``2955`` design
    when it contains ``"AH3W5GBGX9"``, otherwise the ``3447`` design). Output
    files ``<out_dir><filename>/<name>/R{1,2}_<split>.fa`` are created empty
    for every design name plus ``'other'``. The files in ``inp_dir`` whose
    split number equals ``split`` and whose name contains ``filename`` are
    then read in lockstep, four lines per record. A record is dropped when
    the mean of (ASCII code - 33) over either quality line is below 30.
    Surviving records are assigned to the first experiment whose test string
    occurs in the read-1 header (``'other'`` when none does) and appended to
    that experiment's R1/R2 FASTA files.

    A summary line and a ``<json>...</json>`` block with the counters are
    printed at the end. All counters are zero when no input matched.

    Args:
        split: split number as a string, compared against the digits
            preceding ``.fastq`` in the input file names.
        filename: run/sample substring selecting the input files; also used
            as the output sub-directory name.

    Raises:
        KeyError: if the selected files lack a read-1 or read-2 member.
        Exception: if a header/read pair would span more than two lines.
    """
    if "AH3W5GBGX9" in filename:
        print()
        exp_design = exp_design_2955
        exp_test_strs = exp_test_strs_2955
    else:
        exp_design = exp_design_3447
        exp_test_strs = exp_test_strs_3447

    for name in list(exp_design["Name"]) + ['other']:
        util.ensure_dir_exists(os.path.join(out_dir, '%s' % (filename), name))
        util.exists_empty_fn(
            os.path.join(out_dir, '%s/%s/R1_%s.fa' % (filename, name, split)))
        util.exists_empty_fn(
            os.path.join(out_dir, '%s/%s/R2_%s.fa' % (filename, name, split)))

    # `name` still holds the last value of the loop above, i.e. 'other'.
    print(os.path.join(out_dir, name, '%s' % (filename)))

    # Raw-string patterns, compiled once per call.
    split_num_re = re.compile(r"(\d+)\.fastq")
    read_num_re = re.compile(r"R(\d+)")

    def split_key(fn):
        # Split number (as a string) embedded in a FASTQ file name.
        return split_num_re.search(fn).groups()[0]

    def match(r1, r2, h1, h2):
        # First experiment whose test string occurs in the read-1 header.
        for k, v in exp_test_strs.items():
            if v in h1:
                return k, r1
        return "other", r1

    # Initialised up front so the summary below still works when no file
    # matches `split` (previously a NameError).
    num_bad_q, num_tot, num_other, num_mapped = 0, 0, 0, 0

    # Entries without a split number cannot be grouped; ignore them rather
    # than crash on the failed regex match.
    fastq_fns = [fn for fn in os.listdir(inp_dir) if split_num_re.search(fn)]

    for snum, sgroup in it.groupby(sorted(fastq_fns, key=split_key),
                                   key=split_key):
        if snum != split:
            continue
        fns = [sf for sf in sgroup if filename in sf]

        print(("LANE: {0}, FILES: {1}".format(snum, fns)))
        # Map read number (1 or 2) to its file name.
        read_files = {int(read_num_re.search(e).group(1)): e for e in fns}

        inp_fn1 = os.path.join(inp_dir, read_files[1])
        inp_fn2 = os.path.join(inp_dir, read_files[2])

        lc = util.line_count(inp_fn1)
        # Currently unused: the per-line timer update is disabled.
        timer = util.Timer(total=lc)
        i = -1

        with open(inp_fn1) as f1, open(inp_fn2) as f2:
            print(inp_fn1)
            print(inp_fn2)
            while True:
                i += 1
                if i % 10000 == 0:
                    # Guard against an empty input file (lc == 0).
                    pct_done = 100 * float(i) / lc if lc else 0.0
                    print((
                        "{0} records, ({1}%) [{2} bad] [{3} other]".format(
                            i / 4, pct_done, num_bad_q, num_other)))

                try:
                    line1 = next(f1)
                    line2 = next(f2)
                except StopIteration:
                    break

                phase = i % 4
                if phase == 0:
                    # Header line
                    h1 = line1.strip()
                    h2 = line2.strip()
                elif phase == 1:
                    # Sequence line
                    r1 = line1.strip()
                    r2 = line2.strip()
                elif phase == 3:
                    # Quality line: the record is now complete.
                    num_tot += 1
                    qs1 = line1.strip()
                    qs2 = line2.strip()

                    markbad = any(
                        np.mean([ord(s) - 33 for s in qs]) < 30
                        for qs in (qs1, qs2))
                    if markbad:
                        num_bad_q += 1
                        continue

                    demultiplex_id, trimmed_read = match(r1, r2, h1, h2)
                    if demultiplex_id == 'other':
                        num_other += 1

                    record1 = '>' + h1[1:] + '\n' + r1 + '\n'
                    record2 = '>' + h2[1:] + '\n' + r2 + '\n'

                    # A FASTA record must be exactly header + sequence; any
                    # embedded line break would corrupt the output file.
                    if len(record1.splitlines()) > 2:
                        print(record1)
                        raise Exception(
                            'read-1 record spans more than two lines')

                    out1_fn = out_dir + '%s/%s/R1_%s.fa' % (
                        filename, demultiplex_id, split)
                    with open(out1_fn, 'a') as f:
                        f.write(record1)

                    out2_fn = out_dir + '%s/%s/R2_%s.fa' % (
                        filename, demultiplex_id, split)
                    with open(out2_fn, 'a') as f:
                        f.write(record2)
                    num_mapped += 1

    # Avoid ZeroDivisionError when no record was read.
    rejected_frac = num_bad_q / num_tot if num_tot else 0.0
    print(('Rejected %s fraction of reads' % (rejected_frac)))
    print("<json>" + json.dumps({
        "num_bad_q": num_bad_q,
        "num_tot": num_tot,
        "num_other": num_other,
        "num_mapped": num_mapped,
    }) + "</json>")

    return
예제 #14
0
import _config
import sys, os, fnmatch, datetime, subprocess
sys.path.append('/cluster/bh0085/')
import numpy as np
from collections import defaultdict
from mybio import util
import pickle

from _config import LIBRARY_DF, SEQUENCING_INFO

# Default params
# Input root for this step, located under _config.OUT_PLACE.
inp_place = _config.OUT_PLACE + 'b_alignment_rev/'
# NAME comes from util.get_fn(__file__) and names the output sub-directory.
NAME = util.get_fn(__file__)
# Built by plain concatenation, so _config.OUT_PLACE is expected to end with
# a path separator; the directory is created at import time.
out_place = _config.OUT_PLACE + NAME + '/'
util.ensure_dir_exists(out_place)

# NOTE: a `global` statement at module level has no effect and does not
# create the name; expected_cutsite only exists once something assigns it.
global expected_cutsite


##
# Left or right
##
def left_or_right_event(seq, cutsite):
    """Tell on which side of ``cutsite`` the single gap ('-') event lies.

    ``seq`` is expected to contain exactly one internal gap run, plus
    possible end gaps. The prefix ``seq[:cutsite + 1]`` and the suffix
    ``seq[cutsite - 1:]`` are each broken into gap-free pieces; the side
    that is split into two pieces while the other stays whole holds the
    event.

    Returns:
        ``'left'``, ``'right'``, or ``None`` when neither pattern applies.
    """
    def count_pieces(fragment):
        # Number of gap-free stretches in the fragment.
        return len(fragment.replace('-', ' ').split())

    piece_counts = (count_pieces(seq[:cutsite + 1]),
                    count_pieces(seq[cutsite - 1:]))
    side_by_counts = {(2, 1): 'left', (1, 2): 'right'}
    return side_by_counts.get(piece_counts)
예제 #15
0
def matchmaker(nm, split):
    """Align the read-2 sequences of one FASTQ split to their designed targets.

    Reads ``<inp_dir><nm>_R2_<split>.fastq`` four lines per record. A record
    is skipped when the mean of (ASCII code - 33) over its quality line is
    below 28. Each remaining read is reverse-complemented, candidate targets
    are looked up with ``find_best_designed_target``, and the read is aligned
    against them. Alignments are collected in a buffer that is flushed to
    ``<out_root_dir>/<nm>/<split>`` roughly every 1% of the input lines and
    once more at the end. Progress statistics are appended to
    ``nh_c_<nm>_<split>.out`` in ``_config.LOGS_DIR`` at each periodic flush.

    Args:
        nm: sample name used in the input file name and output path.
        split: split identifier (string) used the same way.
    """
    print(nm, split)
    stdout_fn = os.path.join(_config.LOGS_DIR, 'nh_c_%s_%s.out' % (nm, split))
    util.exists_empty_fn(stdout_fn)
    out_dir = os.path.join(out_root_dir, nm, split)
    util.ensure_dir_exists(out_dir)

    inp_fn = inp_dir + '%s_R2_%s.fastq' % (nm, split)

    lsh_dict = build_targets_better_lsh()
    alignment_buffer = init_alignment_buffer()

    prepare_outfns(out_dir)

    qf = 0  # number of reads rejected by the quality filter
    print(inp_fn)
    tot_reads = util.line_count(inp_fn)
    # Currently unused: the per-line timer update is disabled.
    timer = util.Timer(total=tot_reads)

    # Lines between periodic flushes (~1% of the file). Clamped to at least 1
    # so inputs with fewer than 100 lines no longer hit a modulo by zero;
    # such inputs are only flushed by the final flush below.
    flush_interval = max(1, int(tot_reads / 100))

    with open(inp_fn) as f:
        for i, line in enumerate(f):
            if i % 4 == 1:
                # Sequence line
                l2 = line.strip()
            if i % 4 == 3:
                # Quality filter
                q2 = line.strip()
                qs = [ord(s) - 33 for s in q2]
                if np.mean(qs) < 28:
                    qf += 1
                    continue

                l2 = reverse_complement(l2)

                align_header = '>1'

                # Try to find designed target from LSH
                cand_idxs = find_best_designed_target(l2, lsh_dict)
                if len(cand_idxs) == 0:
                    continue

                # Run alignment
                best_idx, align = alignment(l2, cand_idxs)
                align = align.decode("utf-8")

                # Store alignment into buffer
                store_alignment(alignment_buffer, best_idx, align_header,
                                align)

            if i % flush_interval == 1 and i > 1:
                # Flush alignment buffer
                alignment_buffer = flush_alignments(alignment_buffer, out_dir)

                # Stats for the curious
                with open(stdout_fn, 'a') as outf:
                    outf.write('Time: %s\n' % (datetime.datetime.now()))
                    outf.write('Progress: %s\n' % (i / flush_interval))
                    outf.write('Quality filtered pct: %s\n' % (qf / (i / 4)))

    # Final flush
    alignment_buffer = flush_alignments(alignment_buffer, out_dir)

    return