def test_seq_pipeline_parallel_run_with_fasta_qual(self):
    '''The pipeline runs in parallel with fasta and qual.

    Three random sequences with quality are dumped to temporary fasta/qual
    files, sent through the 'sanger_with_qual' pipeline with four processes,
    and the fasta output is expected to contain three records.
    '''
    pipeline = 'sanger_with_qual'

    # The adaptors are handed to the pipeline by file name, so they must be
    # flushed to disk before the pipeline runs.
    fhand_adaptors = NamedTemporaryFile()
    fhand_adaptors.write(ADAPTORS)
    fhand_adaptors.flush()

    arabidopsis_genes = 'arabidopsis_genes+'
    univec = os.path.join(TEST_DATA_DIR, 'blast', arabidopsis_genes)
    configuration = {'remove_vectors': {'vectors': univec},
                     'remove_adaptors': {'adaptors': fhand_adaptors.name}}

    seq1 = create_random_seqwithquality(500, qual_range=50)
    seq2 = create_random_seqwithquality(500, qual_range=51)
    seq3 = create_random_seqwithquality(500, qual_range=52)
    seqs = [seq1, seq2, seq3]
    inseq_fhand, inqual_fhand = create_temp_seq_file(seqs, format='qual')

    in_fhands = {}
    try:
        in_fhands['in_seq'] = open(inseq_fhand.name)
        in_fhands['in_qual'] = open(inqual_fhand.name)

        outseq_fhand = NamedTemporaryFile()
        outqual_fhand = NamedTemporaryFile()
        writer = SequenceWriter(outseq_fhand, qual_fhand=outqual_fhand,
                                file_format='fasta')
        writers = {'seq': writer}

        seq_pipeline_runner(pipeline, configuration, in_fhands,
                            processes=4, writers=writers)
    finally:
        # Release the input handles whether or not the pipeline succeeded.
        for in_fhand in in_fhands.values():
            in_fhand.close()

    # Read the result through a fresh handle that is closed deterministically.
    with open(outseq_fhand.name, 'r') as out_fhand:
        result_seq = out_fhand.read()
    assert result_seq.count('>') == 3
def test_seq_pipeline_parallel_run(self):
    '''It tests that the pipeline runs ok.

    The 'sanger_without_qual' pipeline is run with four processes over the
    seq.fasta test file; six records are expected in the output and the
    'mdust' text must still be present in it.
    '''
    pipeline = 'sanger_without_qual'

    # The adaptors are handed to the pipeline by file name, so they must be
    # flushed to disk before the pipeline runs.
    fhand_adaptors = NamedTemporaryFile()
    fhand_adaptors.write(ADAPTORS)
    fhand_adaptors.flush()

    arabidopsis_genes = 'arabidopsis_genes+'
    univec = os.path.join(TEST_DATA_DIR, 'blast', arabidopsis_genes)
    configuration = {'remove_vectors': {'vectors': univec},
                     'remove_adaptors': {'adaptors': fhand_adaptors.name}}

    out_fhand = NamedTemporaryFile()
    writer = SequenceWriter(out_fhand, file_format='fasta')
    writers = {'seq': writer}

    in_fhands = {}
    in_fhands['in_seq'] = open(os.path.join(TEST_DATA_DIR, 'seq.fasta'), 'r')
    try:
        seq_pipeline_runner(pipeline, configuration, in_fhands,
                            processes=4, writers=writers)
    finally:
        # Release the input handle whether or not the pipeline succeeded.
        in_fhands['in_seq'].close()

    # Use a separate name for the reading handle so the temporary file
    # object is not shadowed, and close it as soon as the text is read.
    with open(out_fhand.name, 'r') as result_fhand:
        result_seq = result_fhand.read()
    assert result_seq.count('>') == 6
    #are we keeping the description?
    assert 'mdust' in result_seq
def _run_annotation(self, pipeline, configuration, inputs, output_dir):
    '''It runs the analysis.

    For every path returned by self._get_seq_or_pickle_path the given
    pipeline is run, its result is written as a pickle to a temporary file
    and that file is then moved to the next version of
    <output_dir>/<basename>.pickle.

    pipeline -- passed untouched to seq_pipeline_runner.
    configuration -- either one configuration shared by all the files or a
        mapping holding one configuration per file basename.
    inputs -- mapping with a 'pickle' entry and an optional 'input' entry.
    output_dir -- directory where the versioned pickles are stored.

    Any exception raised while running the pipeline is propagated after the
    temporary pickle has been removed.
    '''
    self._log({'analysis_started':True})
    pickle_fpaths = inputs['pickle']
    try:
        seqs_fpaths = inputs['input']
    except KeyError:
        seqs_fpaths = []
    seqs_paths = self._get_seq_or_pickle_path(seqs_fpaths, pickle_fpaths)

    for seq_path in seqs_paths:
        seq_fpath = seq_path.last_version
        # delete=False: the file has to outlive its handle because it is
        # moved to its final location once the pipeline is done.
        temp_pickle = NamedTemporaryFile(suffix='.pickle', mode='a',
                                         delete=False)
        try:
            # The input handle is closed as soon as the pipeline is done.
            with open(seq_fpath) as in_fhand:
                in_fhands = {'in_seq': in_fhand}
                writer = SequenceWriter(fhand=temp_pickle,
                                        file_format='pickle')
                if seq_path.basename in configuration:
                    #there is a different configuration for every file to
                    #annotate
                    config = configuration[seq_path.basename]
                else:
                    config = configuration
                seq_pipeline_runner(pipeline, configuration=config,
                                    in_fhands=in_fhands,
                                    processes=self.threads,
                                    writers={'repr': writer})
        except Exception:
            # Do not leave a half written temporary pickle behind.
            temp_pickle.close()
            os.remove(temp_pickle.name)
            raise
        temp_pickle.close()

        repr_path = VersionedPath(os.path.join(output_dir,
                                               seq_path.basename + '.pickle'))
        repr_fpath = repr_path.next_version
        shutil.move(temp_pickle.name, repr_fpath)
    self._log({'analysis_finished':True})
def run(self):
    '''It runs the analysis.

    It checks if the analysis is already done per input file: an input whose
    output file already exists is logged and skipped. Every other input is
    cleaned with the pipeline and the configuration guessed from the
    information scraped from its file name.

    If the cleaning of a file does not complete, its output file is removed
    (otherwise the next run would consider it already cleaned) and the
    exception is propagated.
    '''
    logger = logging.getLogger("franklin")
    self._log({'analysis_started': True})
    input_paths = self._get_input_fpaths()['reads']
    output_dir = self._create_output_dirs()['reads']

    for input_path in input_paths:
        input_fpath = str(input_path)
        fname = os.path.split(input_fpath)[-1]
        output_fpath = os.path.join(output_dir, fname)
        if os.path.exists(output_fpath):
            msg = '%s already cleaned. Not cleaned again' % output_fpath
            logger.info(msg)
            continue

        file_info = scrape_info_from_fname(input_path)
        # Resolve the pipeline and its configuration before the output file
        # is created: a failure here must not leave an empty output behind.
        pipeline = self._guess_cleaning_pipepile(file_info)
        configuration = self.create_cleaning_configuration(
                                               platform=file_info['pl'],
                                               library=file_info['lb'])

        input_fhand = open(input_fpath)
        try:
            output_fhand = open(output_fpath, 'w')
            cleaned = False
            try:
                infhands = {'in_seq': input_fhand}
                writer = SequenceWriter(output_fhand,
                                        file_format=file_info['format'])
                seq_pipeline_runner(pipeline, configuration, infhands,
                                    file_info['format'],
                                    processes=self.threads,
                                    writers={'seq': writer})
                cleaned = True
            finally:
                output_fhand.close()
                if not cleaned:
                    # A partial output would be taken as an already
                    # cleaned file by the next run.
                    os.remove(output_fpath)
        finally:
            input_fhand.close()
    self._log({'analysis_finished': True})
def run(self):
    '''It runs the analysis.

    For every input pickle one output file per extension listed in
    output_files is created inside the result directory (any previous file
    with the same name is removed first), a writer is built for every kind
    of output and seq_pipeline_runner is run without pipeline nor
    configuration, just to feed the writers.

    Once the runner is done the output files are closed through
    self._close_and_remove_files, using the per kind counts returned by the
    runner, and the vcf file, if it still exists, is compressed and indexed.
    '''
    output_dir = self._create_output_dirs()['result']
    inputs = self._get_input_fpaths()
    pickle_paths = inputs['pickle']

    output_files = {'vcf': ('vcf',),
                    'orf': ('orf_seq.fasta', 'orf_pep.fasta'),
                    'ssr': ('ssr',),
                    'gff': ('gff3',),
                    'orthologs': ('orthologs',),
                    }

    for seq_path in pickle_paths:
        outputs = {}
        for kind, extensions in output_files.items():
            outputs[kind] = []
            for extension in extensions:
                output_fpath = os.path.join(output_dir,
                                        seq_path.basename + '.' + extension)
                if os.path.exists(output_fpath):
                    os.remove(output_fpath)
                output_fhand = open(output_fpath, 'a')
                outputs[kind].append(output_fhand)

        # Kinds with just one file are stored as a bare fhand, not a list.
        # Iterate over a snapshot because the dict is modified in the loop.
        for kind, output in list(outputs.items()):
            if len(output) == 1:
                outputs[kind] = output[0]

        in_fhands = {'in_seq': open(seq_path.last_version)}
        try:
            writers = {}
            if 'pickle' in outputs:
                writers['pickle'] = SequenceWriter(fhand=outputs['pickle'],
                                                   file_format='pickle')
            if 'vcf' in outputs:
                ref_name = os.path.basename(in_fhands['in_seq'].name)
                fhand = outputs['vcf']
                grouping = self._project_settings['Snvs']['vcf_grouping']
                writers['vcf'] = VariantCallFormatWriter(fhand=fhand,
                                                     reference_name=ref_name,
                                                     grouping=grouping)
            if 'gff' in outputs:
                default_type = None
                writers['gff'] = SeqGffWriter(fhand=outputs['gff'],
                                              default_type=default_type)
            if 'orf' in outputs:
                fhand, pep_fhand = outputs['orf']
                writers['orf'] = OrfWriter(fhand=fhand, pep_fhand=pep_fhand)
            if 'ssr' in outputs:
                writers['ssr'] = SsrWriter(fhand=outputs['ssr'])
            if 'orthologs' in outputs:
                writers['orthologs'] = \
                                OrthologWriter(fhand=outputs['orthologs'])
            if 'snv_illumina' in outputs:
                writers['snv_illumina'] = \
                            SnvIlluminaWriter(fhand=outputs['snv_illumina'])

            feature_counter = seq_pipeline_runner(pipeline=None,
                                                  configuration=None,
                                                  in_fhands=in_fhands,
                                                  writers=writers)
        finally:
            # The input is not needed any more, whatever the outcome.
            in_fhands['in_seq'].close()

        # We need to close fhands and remove void files.
        # sequence writer could have a qual fhand
        # orf writer has a pep_fhand
        for kind, fhands in outputs.items():
            kind_key = 'sequence' if kind == 'quality' else kind
            # Test the same key that is used for the lookup.
            if kind_key in feature_counter:
                self._close_and_remove_files(fhands,
                                             feature_counter[kind_key])
            else:
                # The runner reported nothing for this kind: there is no
                # count to decide on removal, but the files still have to
                # be closed.
                if not isinstance(fhands, (list, tuple)):
                    fhands = [fhands]
                for fhand in fhands:
                    fhand.close()

        if 'vcf' in outputs and os.path.exists(outputs['vcf'].name):
            compress_and_index_vcf(outputs['vcf'].name)