def prepare_g2p(test_case):
    # Load the G2P scoring files from whichever location exists: relative
    # paths when tests run from the test directory, package paths otherwise.
    if os.path.exists('../g2p/g2p_fpr.txt'):
        test_case.pssm = Pssm(path_to_lookup='../g2p/g2p_fpr.txt',
                              path_to_matrix='../g2p/g2p.matrix')
    else:
        test_case.pssm = Pssm(path_to_lookup='micall/g2p/g2p_fpr.txt',
                              path_to_matrix='micall/g2p/g2p.matrix')
    test_case.g2p_csv = DummyFile()
    test_case.g2p_summary_csv = DummyFile()
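# Usage sketch (illustrative, not from the original module): prepare_g2p is a
# shared fixture, so a test case would typically call it from setUp. The class
# name below is hypothetical.
#
#     class FastqG2PTest(unittest.TestCase):
#         def setUp(self):
#             prepare_g2p(self)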
def test_multiple_sequences(self):
    pssm = Pssm()
    nucs = [('TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAG'
             'AGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT'),
            ('TGTACAAGTCCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAG'
             'AGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT')]
    expected_aa = None  # Not returned when submitting more than one seq.
    expected_scores = [0.06775, 0.06486]

    scores, aligned_aa = pssm.run_g2p(nucs)
    rounded_scores = [round(score, 5) for score in scores]

    self.assertEqual(expected_aa, aligned_aa)
    self.assertEqual(expected_scores, rounded_scores)
def process_run(run_info, args):
    pssm = Pssm()
    # Clear out leftovers from any previous run, keeping the scratch
    # directory itself.
    for filename in os.listdir(run_info.scratch_path):
        filepath = os.path.join(run_info.scratch_path, filename)
        if os.path.isdir(filepath):
            shutil.rmtree(filepath)
        else:
            os.remove(filepath)
    if run_info.interop_path is None:
        run_summary = None
    else:
        logger.info('Summarizing run.')
        run_summary = summarize_run(run_info)

    with ProcessPoolExecutor(max_workers=args.max_active) as pool:
        # Consume each map() iterator fully so worker exceptions surface
        # here, and so all samples finish before resistance processing.
        for _ in pool.map(functools.partial(process_sample,
                                            args=args,
                                            pssm=pssm,
                                            use_denovo=run_info.is_denovo),
                          run_info.get_all_samples()):
            pass

        for _ in pool.map(functools.partial(process_resistance,
                                            run_info=run_info),
                          run_info.sample_groups):
            pass

    collate_samples(run_info)
    if run_summary is not None:
        summarize_samples(run_info, run_summary)
    if not args.keep_scratch:
        shutil.rmtree(run_info.scratch_path, ignore_errors=True)
    logger.info('Done.')
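# The clear-but-keep-directory loop above reappears in several entry points
# below; a minimal sketch of an equivalent shared helper (clear_directory is
# a hypothetical name, not part of MiCall; it relies on the same os and
# shutil imports the surrounding code already uses):
def clear_directory(path):
    """Delete every file and subdirectory under path, keeping path itself."""
    for filename in os.listdir(path):
        filepath = os.path.join(path, filename)
        if os.path.isdir(filepath):
            shutil.rmtree(filepath)
        else:
            os.remove(filepath)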
def main(): logger.info("Starting on %s with %d CPU's.", socket.gethostname(), multiprocessing.cpu_count()) args = parse_args() if args.link_run is not None: json = link_json(args.link_run, args.data_path) else: json_path = os.path.join(args.data_path, 'input', 'AppSession.json') with open(json_path, 'rU') as json_file: json = parse_json(json_file) pssm = Pssm() scratch_path = os.path.join(args.data_path, 'scratch') makedirs(scratch_path) for filename in os.listdir(scratch_path): filepath = os.path.join(scratch_path, filename) if os.path.isdir(filepath): shutil.rmtree(filepath) else: os.remove(filepath) if json.run_id is not None: logger.info('Summarizing run.') run_summary = summarize_run(args, json) pool = Pool() pool.map( functools.partial(try_sample, run_info=json, data_path=args.data_path, pssm=pssm), range(len(json.samples))) if json.run_id is not None: summarize_samples(args, json, run_summary) logger.info('Done.')
def test_single_sequence(self):
    pssm = Pssm()
    nucs = ('TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAG'
            'AGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT')
    expected_aa = [['C'], ['T'], ['R'], ['P'], ['N'], ['-'], ['N'], ['N'],
                   ['T'], ['-'], ['-'], ['R'], ['K'], ['S'], ['I'], ['H'],
                   ['I'], ['-'], ['-'], ['-'], ['G'], ['P'], ['G'], ['R'],
                   ['-'], ['-'], ['-'], ['A'], ['F'], ['Y'], ['A'], ['T'],
                   ['-'], ['-'], ['-'], ['-'], ['G'], ['E'], ['I'], ['I'],
                   ['G'], ['D'], ['I'], ['-'], ['-'], ['R'], ['Q'], ['A'],
                   ['H'], ['C']]
    expected_score = 0.067753

    score, aligned_aa = pssm.run_g2p(nucs)

    self.assertEqual(expected_aa, aligned_aa)
    self.assertAlmostEqual(expected_score, score, places=5)
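# Return-shape note (drawn from this test and test_multiple_sequences above):
# run_g2p(str) returns (score, aligned_aa), while run_g2p(list) returns
# (scores, None); the amino-acid alignment is only reported for a single
# sequence.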
def setUp(self):
    super(SamG2PTest, self).setUp()
    if os.path.exists('../g2p/g2p_fpr.txt'):
        self.pssm = Pssm(path_to_lookup='../g2p/g2p_fpr.txt',
                         path_to_matrix='../g2p/g2p.matrix')
    else:
        self.pssm = Pssm(path_to_lookup='micall/g2p/g2p_fpr.txt',
                         path_to_matrix='micall/g2p/g2p.matrix')
    self.nuc_csv = StringIO("""\
seed,region,q-cutoff,query.nuc.pos,refseq.nuc.pos,A,C,G,T
HIV1B-env-seed,V3LOOP,15,877,1,0,0,0,100
HIV1B-env-seed,V3LOOP,15,981,105,0,0,0,100
""")
    self.g2p_csv = DummyFile()
    self.g2p_summary_csv = DummyFile()
    self.addTypeEqualityFunc(str, self.assertMultiLineEqual)
def test_ambiguous_sequence(self):
    pssm = Pssm()
    nucs = ('TGTACAAGWCCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAG'
            'AGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT')
    expected_aa = [['C'], ['T'], ['R', 'S'], ['P'], ['N'], ['-'], ['N'],
                   ['N'], ['T'], ['-'], ['-'], ['R'], ['K'], ['S'], ['I'],
                   ['H'], ['I'], ['-'], ['-'], ['-'], ['G'], ['P'], ['G'],
                   ['R'], ['-'], ['-'], ['-'], ['A'], ['F'], ['Y'], ['A'],
                   ['T'], ['-'], ['-'], ['-'], ['-'], ['G'], ['E'], ['I'],
                   ['I'], ['G'], ['D'], ['I'], ['-'], ['-'], ['R'], ['Q'],
                   ['A'], ['H'], ['C']]
    # Average of the two possible scores (see test_multiple_sequences).
    expected_score = (0.06775 + 0.06486) / 2

    score, aligned_aa = pssm.run_g2p(nucs)

    self.assertEqual(expected_aa, aligned_aa)
    self.assertAlmostEqual(expected_score, score, places=5)
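# Worked check (illustrative): W is the IUPAC code for A or T, so this
# sequence expands to exactly the two variants in test_multiple_sequences,
# and the third codon aligns to both R (from AGA) and S (from AGT). The
# expected score is the mean of the two variant scores:
#
#     (0.06775 + 0.06486) / 2 == 0.066305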
def main():
    args = parse_args()
    # Deferred import: pssm_lib is only needed once the script actually runs.
    from micall.g2p.pssm_lib import Pssm
    pssm = Pssm()
    sam_g2p(pssm=pssm,
            remap_csv=args.remap_csv,
            nuc_csv=args.nuc_csv,
            g2p_csv=args.g2p_csv,
            g2p_summary_csv=args.g2p_summary_csv,
            min_count=DEFAULT_MIN_COUNT)
def main():
    logging.basicConfig(level=logging.WARN)
    args = parse_args()
    sample = load_sample(args)
    pssm = Pssm()
    sample.process(pssm, force_gzip=True)  # dataset files change .gz to .raw
    with tarfile.open(args.coverage_maps_tar, mode='w') as tar:
        for image_name in os.listdir(sample.coverage_maps):
            image_path = os.path.join(sample.coverage_maps, image_name)
            archive_path = os.path.join('coverage_maps', image_name)
            tar.add(image_path, archive_path)
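# Quick verification sketch (illustrative, not part of the original script):
# tarfile can list what landed in the archive.
#
#     with tarfile.open(args.coverage_maps_tar) as tar:
#         print(tar.getnames())  # ['coverage_maps/<image>.png', ...]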
def main():
    args = parse_args()
    from micall.g2p.pssm_lib import Pssm
    pssm = Pssm()
    fastq_g2p(pssm=pssm,
              fastq1=args.fastq1,
              fastq2=args.fastq2,
              g2p_csv=args.g2p_csv,
              g2p_summary_csv=args.g2p_summary_csv,
              unmapped1=args.unmapped1,
              unmapped2=args.unmapped2,
              aligned_csv=args.aligned_csv,
              min_count=DEFAULT_MIN_COUNT,
              min_valid=MIN_VALID,
              min_valid_percent=MIN_VALID_PERCENT)
def __init__(self, filename1, bad_cycles_filename):
    super(MicallDD, self).__init__()
    # Debugging short circuit: the `True or` makes this branch always win,
    # so filter_fastqs() is currently skipped.
    if True or 'filter' in filename1:
        self.filename1 = filename1
    else:
        self.filename1 = self.filter_fastqs(filename1)
    self.bad_cycles_filename = bad_cycles_filename
    self.pssm = Pssm()
    reads = defaultdict(list)
    read_fastq(self.filename1, reads)
    read_count = len(reads)
    read_fastq(get_reverse_filename(self.filename1), reads)
    added_count = len(reads) - read_count
    # Every reverse read should pair with a forward read already seen.
    if added_count > 0:
        raise RuntimeError('Found {} new reads.'.format(added_count))
    self.reads = reads.values()
def main(): logger.info("Starting on %s with %d CPU's.", socket.gethostname(), multiprocessing.cpu_count()) args = parse_args() if args.link_run is not None: run_info = link_samples(args.link_run, args.data_path) else: run_info = load_samples(args.data_path) pssm = Pssm() for filename in os.listdir(run_info.scratch_path): filepath = os.path.join(run_info.scratch_path, filename) if os.path.isdir(filepath): shutil.rmtree(filepath) else: os.remove(filepath) if run_info.interop_path is None: run_summary = None else: logger.info('Summarizing run.') run_summary = summarize_run(run_info) pool = Pool() pool.map(functools.partial(process_sample, args=args, pssm=pssm), run_info.get_all_samples()) pool.close() pool.join() pool = Pool() pool.map(functools.partial(process_resistance, run_info=run_info), run_info.sample_groups) pool.close() pool.join() collate_samples(run_info) if run_summary is not None: summarize_samples(run_info, run_summary) logger.info('Done.')
                        simple_prefix,
                        pssm,
                        ruby_script,
                        delete_results=False)
    if not txtfilename.endswith('.txt'):
        with open(simple_prefix + '.txt', 'w') as simplefile:
            for line in simple_remap_lines:
                simplefile.write(line)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Find the simplest test failure by trimming SAM files.')
    parser.add_argument('workdir', help='path to folder holding SAM files')
    parser.add_argument('ruby_script', help='path to Ruby version of G2P')
    parser.add_argument('--pattern',
                        default='*.remap.csv',
                        help='File name pattern to match SAM files')
    args = parser.parse_args()
    logger = init_logging_console_only(logging.INFO)
    pssm = Pssm(path_to_lookup='../g2p/g2p_fpr.txt',
                path_to_matrix='../g2p/g2p.matrix')
    for txtfilename in sorted(
            glob.glob(os.path.join(args.workdir, args.pattern))):
        logger.info(os.path.basename(txtfilename))
        compare_conseqs(txtfilename, args.ruby_script, pssm)
    logger.info('Done.')
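# Example invocation (illustrative; the script name and paths are
# placeholders, built from the argparse definitions above):
#
#     python this_script.py /path/to/workdir /path/to/g2p.rb --pattern '*.remap.csv'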
def __init__(self, parent, *args, **kwargs):
    self.pssm = Pssm(
        path_to_lookup=AssetWrapper('micall/g2p/g2p_fpr.txt').path,
        path_to_matrix=AssetWrapper('micall/g2p/g2p.matrix').path)
    tk.Frame.__init__(self, parent, *args, **kwargs)
    self.parent = parent
    parent.report_callback_exception = self.report_callback_exception
    self.rundir = None  # path to MiSeq run folder containing data
    self.workdir = gettempdir()  # default to temp directory
    os.chdir(self.workdir)
    self.line_counter = LineCounter()
    self.run_info = None
    self.target_files = []
    self.button_frame = tk.Frame(self)
    self.button_frame.pack(side='top')
    self.console_frame = tk.Frame(self)
    self.console_frame.pack(side='top', fill='both', expand=True)
    try:
        with open(MiCall.CONFIG_FILE, 'r') as f:
            self.config = json.load(f)
    except (OSError, ValueError):
        # Missing or corrupt config file: start with defaults.
        self.config = {}
    self.nthreads = self.config.get('threads', None)
    if not self.nthreads:
        self.nthreads = int(round(cpu_count() * 0.5))
        self.config['threads'] = self.nthreads
        self.write_config()
    self.button_run = tk.Button(self.button_frame,
                                text="Run",
                                command=self.process_files)
    self.button_run.grid(row=0, column=1, sticky='W')
    self.progress_bar = Progressbar(self.button_frame,
                                    orient='horizontal',
                                    length=500,
                                    mode='determinate')
    self.progress_bar.grid(row=1, columnspan=5)
    scrollbar = tk.Scrollbar(self.console_frame)
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
    self.console = tk.Text(self.console_frame,
                           bg='black',
                           fg='white',
                           yscrollcommand=scrollbar.set)
    self.console.pack(side=tk.LEFT, fill=tk.BOTH)
    self.console.tag_configure('ERROR', foreground="red")
    scrollbar.config(command=self.console.yview)
    # redirect stderr to Text widget
    # sys.stderr = Redirector(self.console)
    self.write('Welcome to MiCall v{}, running with {} threads.\n'.format(
        pipeline_version, self.nthreads))
def main(): logger.info("Starting on %s with %d CPU's.", socket.gethostname(), multiprocessing.cpu_count()) args = parse_args() if args.link_run is not None: run_json = link_json(args.link_run, args.data_path) run_json.has_runinfo = True else: json_path = os.path.join(args.data_path, 'input', 'AppSession.json') try: with open(json_path, 'r') as json_file: run_json = parse_json(json_file) except: if os.path.exists(json_path): # copy the input file to the output dir for postmortem analysis logger.error("Error occurred while parsing '%s'" % json_path) with open(json_path, 'r') as json_file: file_cont = json_file.read() out_path = os.path.join(args.data_path, 'logs', 'AppSession.json') with open(out_path, 'w') as json_file: json_file.write(file_cont) else: logger.error("Error: no such file as '%s'" % json_path) raise # Do we have run_ids for all sample_ids ? if run_json.run_id is None: run_json.has_runinfo = False else: bs = BSrequest() sample_id_set = bs.check_run_sample_ids( [run_json.run_id], [s["Id"] for s in run_json.samples]) run_json.has_runinfo = (len(sample_id_set) == len( run_json.samples)) logger.info("setting json.has_run_info to %s" % run_json.has_runinfo) pssm = Pssm() scratch_path = os.path.join(args.data_path, 'scratch') makedirs(scratch_path) for filename in os.listdir(scratch_path): filepath = os.path.join(scratch_path, filename) if os.path.isdir(filepath): shutil.rmtree(filepath) else: os.remove(filepath) args.g2p_path = args.qc_path = create_app_result(args.data_path, run_json, suffix='results') if run_json.run_id is None: run_summary = None else: logger.info('Summarizing run.') run_summary = summarize_run(args, run_json) pool = Pool() pool.map( functools.partial(try_sample, run_info=run_json, args=args, pssm=pssm), range(len(run_json.samples))) pool.close() pool.join() collate_samples(args, run_json) if run_json.run_id is not None: summarize_samples(args, run_json, run_summary) logger.info('Done.')