def test_seq_pipeline_parallel_run_with_fasta_qual(self):
        '''The pipeline runs in parallel with fasta and qual.

        Three random sequences with quality are written to temporary
        sequence/quality files, run through the 'sanger_with_qual' pipeline
        with four processes, and the fasta output is expected to hold three
        records.
        '''
        pipeline = 'sanger_with_qual'

        # The adaptors are handed to the pipeline by file name, so the
        # content has to be flushed to disk before the pipeline runs.
        fhand_adaptors = NamedTemporaryFile()
        fhand_adaptors.write(ADAPTORS)
        fhand_adaptors.flush()
        arabidopsis_genes = 'arabidopsis_genes+'
        univec = os.path.join(TEST_DATA_DIR, 'blast', arabidopsis_genes)
        configuration = {'remove_vectors': {'vectors': univec},
                         'remove_adaptors': {'adaptors': fhand_adaptors.name}}

        seqs = [create_random_seqwithquality(500, qual_range=50),
                create_random_seqwithquality(500, qual_range=51),
                create_random_seqwithquality(500, qual_range=52)]
        inseq_fhand, inqual_fhand = create_temp_seq_file(seqs, format='qual')

        outseq_fhand = NamedTemporaryFile()
        outqual_fhand = NamedTemporaryFile()
        writer = SequenceWriter(outseq_fhand, qual_fhand=outqual_fhand,
                                file_format='fasta')
        writers = {'seq': writer}

        # The input files are reopened by name; the with blocks guarantee
        # that these extra handles are closed even if the pipeline fails.
        with open(inseq_fhand.name) as in_seq:
            with open(inqual_fhand.name) as in_qual:
                in_fhands = {'in_seq': in_seq, 'in_qual': in_qual}
                seq_pipeline_runner(pipeline, configuration, in_fhands,
                                    processes=4, writers=writers)

        # Read the result through a fresh handle that is closed afterwards.
        with open(outseq_fhand.name, 'r') as out_fhand:
            result_seq = out_fhand.read()
        assert result_seq.count('>') == 3
    def test_seq_pipeline_parallel_run(self):
        '''It tests that the pipeline runs ok.

        The 'sanger_without_qual' pipeline is run with four processes over
        the seq.fasta test file; the fasta output is expected to hold six
        records and to keep the sequence descriptions.
        '''
        pipeline = 'sanger_without_qual'

        # The adaptors are handed to the pipeline by file name, so the
        # content has to be flushed to disk before the pipeline runs.
        fhand_adaptors = NamedTemporaryFile()
        fhand_adaptors.write(ADAPTORS)
        fhand_adaptors.flush()
        arabidopsis_genes = 'arabidopsis_genes+'
        univec = os.path.join(TEST_DATA_DIR, 'blast', arabidopsis_genes)
        configuration = {'remove_vectors': {'vectors': univec},
                         'remove_adaptors': {'adaptors': fhand_adaptors.name}}

        out_fhand = NamedTemporaryFile()
        writer = SequenceWriter(out_fhand, file_format='fasta')
        writers = {'seq': writer}

        # The with block closes the input handle even if the pipeline fails.
        with open(os.path.join(TEST_DATA_DIR, 'seq.fasta'), 'r') as in_seq:
            in_fhands = {'in_seq': in_seq}
            seq_pipeline_runner(pipeline, configuration, in_fhands,
                                processes=4, writers=writers)

        # A separate name is used for the read handle so the temporary file
        # object is not shadowed, and the handle is closed after reading.
        with open(out_fhand.name, 'r') as result_fhand:
            result_seq = result_fhand.read()
        assert result_seq.count('>') == 6
        #are we keeping the description?
        assert 'mdust' in result_seq
Пример #3
0
    def _run_annotation(self, pipeline, configuration, inputs, output_dir):
        '''It runs the analysis.

        For every path returned by _get_seq_or_pickle_path the pipeline is
        run over the last version of the file, the result is written as a
        pickle into a temporary file and that file is finally moved to the
        next version of <output_dir>/<basename>.pickle.

        pipeline -- passed unchanged to seq_pipeline_runner.
        configuration -- pipeline configuration. When it has an entry keyed
            by the basename of the input, that entry is used for that input;
            otherwise the whole configuration is used.
        inputs -- mapping with a mandatory 'pickle' entry and an optional
            'input' entry.
        output_dir -- directory where the resulting pickles are stored.

        Any exception raised while running the pipeline is re-raised after
        the temporary pickle has been removed.
        '''

        self._log({'analysis_started':True})
        pickle_fpaths = inputs['pickle']
        try:
            seqs_fpaths = inputs['input']
        except KeyError:
            seqs_fpaths = []
        seqs_paths = self._get_seq_or_pickle_path(seqs_fpaths, pickle_fpaths)
        for seq_path in seqs_paths:
            seq_fpath = seq_path.last_version

            if seq_path.basename in configuration:
                #there is a different configuration for every file to annotate
                config = configuration[seq_path.basename]
            else:
                config = configuration

            # delete=False because the file is moved to its final location
            # once the pipeline has finished.
            temp_pickle = NamedTemporaryFile(suffix='.pickle', mode='a',
                                             delete=False)
            try:
                writer = SequenceWriter(fhand=temp_pickle,
                                        file_format='pickle')
                # The with block closes the input handle in every case.
                with open(seq_fpath) as seq_fhand:
                    seq_pipeline_runner(pipeline, configuration=config,
                                        in_fhands={'in_seq': seq_fhand},
                                        processes=self.threads,
                                        writers={'repr': writer})
            except Exception:
                # The temporary file would never be deleted automatically,
                # so do not leave a partial pickle behind on failure.
                temp_pickle.close()
                os.remove(temp_pickle.name)
                raise
            temp_pickle.close()
            repr_path = VersionedPath(os.path.join(output_dir,
                                                 seq_path.basename + '.pickle'))
            repr_fpath = repr_path.next_version
            shutil.move(temp_pickle.name, repr_fpath)
        self._log({'analysis_finished':True})
Пример #4
0
    def run(self):
        '''It runs the analysis. It checks if the analysis is already done per
        input file.

        Every 'reads' input whose output file does not exist yet is run
        through the cleaning pipeline guessed from the information scraped
        from its file name. Inputs whose output file already exists are
        skipped and reported in the "franklin" log.

        If anything fails once the output file has been created, the
        partial output is removed, so that the input is not taken as already
        cleaned in a later run, and the exception is re-raised.
        '''
        logger = logging.getLogger("franklin")
        self._log({'analysis_started': True})
        input_paths = self._get_input_fpaths()['reads']
        output_dir = self._create_output_dirs()['reads']
        for input_path in input_paths:
            input_fpath = str(input_path)
            fname = os.path.split(input_fpath)[-1]
            output_fpath = os.path.join(output_dir, fname)
            if os.path.exists(output_fpath):
                msg = '%s already cleaned. Not cleaned again' % output_fpath
                logger.info(msg)
                continue
            file_info = scrape_info_from_fname(input_path)

            # Everything that can fail without touching the output is done
            # before the output file is created.
            pipeline = self._guess_cleaning_pipepile(file_info)
            configuration = self.create_cleaning_configuration(
                                                      platform=file_info['pl'],
                                                      library=file_info['lb'])

            input_fhand = open(input_fpath)
            try:
                output_fhand = open(output_fpath, 'w')
                try:
                    infhands = {'in_seq': input_fhand}
                    writer = SequenceWriter(output_fhand,
                                            file_format=file_info['format'])
                    seq_pipeline_runner(pipeline, configuration, infhands,
                                        file_info['format'],
                                        processes=self.threads,
                                        writers={'seq': writer})
                except Exception:
                    output_fhand.close()
                    os.remove(output_fpath)
                    # A bare raise keeps the original traceback.
                    raise
                output_fhand.close()
            finally:
                # The input handle is closed on success and on failure.
                input_fhand.close()

        self._log({'analysis_finished': True})
        return
Пример #5
0
    def run(self):
        '''It runs the analysis.

        For every 'pickle' input a fresh set of output files (vcf, orf
        sequence and peptide, ssr, gff3 and orthologs) is created in the
        'result' output directory, a writer is built for each of them and
        the input is passed through seq_pipeline_runner with no pipeline
        and no configuration. Afterwards the output files are handed to
        _close_and_remove_files along with the feature count reported for
        their kind, and the vcf file, if it still exists, is compressed and
        indexed.
        '''
        output_dir = self._create_output_dirs()['result']
        inputs = self._get_input_fpaths()
        pickle_paths = inputs['pickle']

        # kind of output -> extensions of the files written for that kind
        output_files = {'vcf': ('vcf',),
                        'orf':('orf_seq.fasta', 'orf_pep.fasta'),
                        'ssr':('ssr',),
                        'gff':('gff3',),
                        'orthologs':('orthologs',), }

        for seq_path in pickle_paths:
            outputs = {}
            for kind, extensions in output_files.items():
                outputs[kind] = []
                for extension in extensions:
                    output_fpath = os.path.join(output_dir,
                                            seq_path.basename + '.' + extension)
                    # Results of a previous run are discarded.
                    if os.path.exists(output_fpath):
                        os.remove(output_fpath)
                    output_fhand = open(output_fpath, 'a')
                    outputs[kind].append(output_fhand)

            # Kinds with just one file are stored as the bare file handle.
            # A snapshot is iterated because the values are replaced.
            for kind, output in list(outputs.items()):
                if len(output) == 1:
                    outputs[kind] = output[0]

            in_fhand = open(seq_path.last_version)
            try:
                in_fhands = {'in_seq': in_fhand}

                writers = {}
                if 'pickle' in outputs:
                    writers['pickle'] = SequenceWriter(fhand=outputs['pickle'],
                                                     file_format='pickle')

                if 'vcf' in outputs:
                    ref_name = os.path.basename(in_fhand.name)
                    fhand = outputs['vcf']
                    grouping = self._project_settings['Snvs']['vcf_grouping']
                    writers['vcf'] = VariantCallFormatWriter(fhand=fhand,
                                                        reference_name=ref_name,
                                                        grouping=grouping)
                if 'gff' in outputs:
                    default_type = None
                    writers['gff'] = SeqGffWriter(fhand=outputs['gff'],
                                               default_type=default_type)
                if 'orf' in outputs:
                    fhand, pep_fhand = outputs['orf']
                    writers['orf'] = OrfWriter(fhand=fhand,
                                               pep_fhand=pep_fhand)

                if 'ssr' in outputs:
                    writers['ssr'] = SsrWriter(fhand=outputs['ssr'])

                if 'orthologs' in outputs:
                    writers['orthologs'] = \
                                      OrthologWriter(fhand=outputs['orthologs'])

                if 'snv_illumina' in outputs:
                    writers['snv_illumina'] = \
                                SnvIlluminaWriter(fhand=outputs['snv_illumina'])

                feature_counter = seq_pipeline_runner(pipeline=None,
                                                      configuration=None,
                                                      in_fhands=in_fhands,
                                                      writers=writers)
            finally:
                # The input handle is closed on success and on failure.
                in_fhand.close()

            # We need to close fhands and remove void files.
            # sequence writer could have a qual fhand
            # orf writer has a pep_fhand
            for kind, fhands in outputs.items():
                # Quality files are counted under the 'sequence' key, so the
                # membership test uses the same key as the lookup.
                kind_key = 'sequence' if kind == 'quality' else kind
                if kind_key in feature_counter:
                    self._close_and_remove_files(fhands,
                                                 feature_counter[kind_key])
            if 'vcf' in outputs and os.path.exists(outputs['vcf'].name):
                compress_and_index_vcf(outputs['vcf'].name)