Пример #1
0
def create_irs_data(step_data, annotation_step, params):
    """Create a ChloroplastSSCBlast step and prepare its query and subject files.

    The first IR returned by find_chloroplast_irs() for the referent genome
    is written as 'run_dir/query.fa' (named 'ira'). Every sequence without an
    existing 'run_dir/<seq_ident>.xml' gets a fasta file and is listed in
    finish.yml; for newly written fasta files the SSC location
    ([length, end of first IR, irb_start() of second IR], or [length, -1, -1]
    when no IRs are found) is stored in the step description.

    Raises ZCItoolsValueError when the referent genome has no IRs.
    Returns the step object.
    """
    SeqIO = import_bio_seq_io()

    seq_idents = annotation_step.all_sequences()  # set
    ref_ident = find_referent_genome(seq_idents, params.referent_genome)

    step = annotation_step.project.new_step(ChloroplastSSCBlast, step_data)
    referent_record = annotation_step.get_sequence_record(ref_ident)
    ssc_location = step.get_type_description_elem('ssc_location', default={})
    ensure_directory(step.step_file('run_dir'))

    # Query: IRa of the referent genome, written only once
    query_file = step.step_file('run_dir', 'query.fa')
    if not os.path.isfile(query_file):
        referent_irs = find_chloroplast_irs(referent_record)
        if not referent_irs:
            raise ZCItoolsValueError(
                f"Referent genome ({ref_ident}) doesn't have IRS!")
        ira_sequence = str(referent_irs[0].extract(referent_record).seq)
        write_fasta(query_file, [('ira', ira_sequence)])

    files_to_zip = [query_file]
    calc_seq_idents = []

    # Subjects: every sequence that has no result (.xml) file yet
    for seq_ident in sorted(seq_idents):
        if os.path.isfile(step.step_file('run_dir', f'{seq_ident}.xml')):
            continue

        fa_file = step.step_file('run_dir', f'{seq_ident}.fa')
        files_to_zip.append(fa_file)
        calc_seq_idents.append(seq_ident)
        if os.path.isfile(fa_file):
            continue

        record = annotation_step.get_sequence_record(seq_ident)
        SeqIO.write([record], fa_file, 'fasta')

        # SSC position is stored only together with a freshly written fasta
        found_irs = find_chloroplast_irs(record)
        if found_irs:
            ssc_location[seq_ident] = [
                len(record),
                int(found_irs[0].location.end),
                irb_start(found_irs[1]),
            ]
        else:
            ssc_location[seq_ident] = [len(record), -1, -1]

    if not calc_seq_idents:
        # Nothing to calculate; re-parse existing results only on request
        if params.force_blast_parse:
            finish_irs_data(step)
        return step

    # Store finish.yml
    finish_f = step.step_file('finish.yml')
    write_yaml(dict(calc_seq_idents=calc_seq_idents), finish_f)

    run = True  # ToDo: ...
    step.save(dict(ssc_location=ssc_location), completed=False)
    if run:
        run_module_script(run_irs_blast, step)
        finish_irs_data(step)
    else:
        files_to_zip.append(finish_f)
        set_run_instructions(run_irs_blast, step, files_to_zip,
                             _instructions)

    return step
Пример #2
0
 def save_description(self, type_description, create=True, completed=True):
     """Write the step's description.yml.

     Records `completed` in self._step_data, then stores a copy of that
     data (as 'project') together with the step type and the given
     type-specific description. With create=True the copy gets a fresh
     'created' timestamp and 'updated' is reset to None; otherwise only
     'updated' is stamped.
     """
     self._step_data['completed'] = completed
     project_data = dict(self._step_data)
     timestamp = datetime.datetime.now().isoformat()
     if create:
         project_data.update(created=timestamp, updated=None)
     else:
         project_data['updated'] = timestamp

     description = dict(data_type=self._STEP_TYPE,
                        data=type_description,
                        project=project_data)
     write_yaml(description, self.step_file('description.yml'))
Пример #3
0
def create_ogdraw(step_data,
                  image_format,
                  annotations_step,
                  common_db,
                  sequences=None):
    """Create an images step holding OGDraw images of annotated sequences.

    `sequences` is an optional ';'-separated list of sequence identifiers;
    when it is empty all sequences of `annotations_step` are used.
    For 'jpg' format, images already present in the annotation step's
    job-results zip files are extracted directly. Sequences still missing
    an image are exported in batches of 30 into sequences_<n>.gbff files,
    together with INSTRUCTIONS.txt and finish.yml.

    Returns the step; it is saved as completed only if nothing is left to fetch.
    """
    step = ImagesStep(annotations_step.project, step_data, remove_data=True)
    if sequences:
        wanted = sequences.split(';')
    else:
        wanted = annotations_step.all_sequences()
    all_images = sorted(wanted)

    # Fetch common db sequences
    to_fetch = step.get_common_db_records(common_db, all_images, info=True)

    # If OGDraw is done on GeSeq data, then jpg images are already in
    if image_format == 'jpg':
        # Extract jpg files job-results-<num>/GeSeqJob-<num>-<num>_<seq_ident>_OGDRAW.jpg
        zip_names = annotations_step.step_files(
            matches='^job-results-[0-9]*.zip')
        for filename in zip_names:
            with ZipFile(annotations_step.step_file(filename), 'r') as zip_f:
                for z_i in zip_f.infolist():
                    m = _re_zip_jpg.search(z_i.filename)
                    if not m:
                        continue
                    seq_ident = m.group(1)
                    if seq_ident not in to_fetch:
                        continue
                    to_fetch.remove(seq_ident)
                    extract_from_zip(zip_f, z_i.filename,
                                     step.step_file(seq_ident + '.jpg'))

    # Store sequences that still need an image
    if to_fetch:
        # Note: it is important that file has extension gbff (multiple sequence data)
        batches = {}
        for i, d in enumerate(split_list(to_fetch, 30)):
            gbff_file = step.step_file(f'sequences_{i + 1}.gbff')
            annotations_step.concatenate_seqs_genbank(gbff_file, d)
            batches[i + 1] = d

        # Store instructions
        instructions_text = _instructions.format(
            step_name=step_data['step_name'], image_format=image_format)
        write_str_in_file(step.step_file('INSTRUCTIONS.txt'),
                          instructions_text)
        # Store image format used
        write_yaml(dict(image_format=image_format, sequences=batches),
                   step.step_file('finish.yml'))

    #
    step.set_images(all_images)
    step.save(completed=not to_fetch)
    return step
Пример #4
0
def create_new_hybrids_data(project, step_data, params):
    """Create a NewHybrids step: copy inputs, create seed directories, run or instruct.

    Both input files (params.data_file, params.gtyp_cat_file) must exist,
    otherwise ZCItoolsValueError is raised. params.num_runs distinct seed
    pairs are sampled and one directory is created per pair. The run
    settings are stored in finish.yml. With params.run the calculation is
    started right away, otherwise run instructions are written.

    Returns the step object.
    """
    # Check input files
    if not os.path.isfile(params.data_file):
        raise ZCItoolsValueError(
            f"Input data file {params.data_file} doesn't exist!")
    if not os.path.isfile(params.gtyp_cat_file):
        raise ZCItoolsValueError(
            f"Input genotype category probabilities {params.gtyp_cat_file} doesn't exist!"
        )
    data_name = os.path.basename(params.data_file)
    gtyp_cat_name = os.path.basename(params.gtyp_cat_file)

    step = NewHybridsStep(project, step_data, remove_data=True)
    step.set_data(data_name, gtyp_cat_name, params.theta_prior,
                  params.pi_prior, params.burn_in, params.num_sweeps)

    # Copy input files into the step directory
    local_data = step.step_file(data_name)
    local_gtyp_cat = step.step_file(gtyp_cat_name)
    copy_file(params.data_file, local_data)
    copy_file(params.gtyp_cat_file, local_gtyp_cat)
    files_to_zip = [local_data, local_gtyp_cat]

    # One run directory per sampled pair of seeds
    seed_pairs = list(
        itertools.product(range(1, _MAX_SMALL_NUMBER + 1), repeat=2))
    for seed in random.sample(seed_pairs, params.num_runs):
        seed_path = step.step_file(step.seed_dir(seed))
        files_to_zip.append(seed_path)
        ensure_directory(seed_path)

    finish_f = step.step_file('finish.yml')
    files_to_zip.append(finish_f)
    run_settings = dict(data_file=data_name,
                        gtyp_cat_file=gtyp_cat_name,
                        theta_prior=params.theta_prior,
                        pi_prior=params.pi_prior,
                        burn_in=params.burn_in,
                        num_sweeps=params.num_sweeps)
    write_yaml(run_settings, finish_f)

    # Stores description.yml
    step.save(completed=params.run)

    # Run or set instructions
    if params.run:
        run_module_script(run_new_hybrids, step)
    else:
        set_run_instructions(run_new_hybrids, step, files_to_zip,
                             _instructions)
    #
    return step
Пример #5
0
def create_irs_data(step_data, input_step, params, common_db):  # , run):
    """Create an Annotations step from input sequences using a mummer-based IRs search.

    The step's 'run_dir' subdirectory contains input and output calculation
    files. A sequence without 'run_dir/<seq_ident>.out' gets a fasta file
    and has to be calculated; a sequence with an output file but without
    '<seq_ident>.gb' only has to be finished.

    Returns the step object.
    """
    SeqIO = import_bio_seq_io()
    files_to_zip = []
    calc_seq_idents = []

    step = input_step.project.new_step(AnnotationsStep, step_data)
    # Set sequences
    step.set_sequences(input_step.all_sequences())
    ensure_directory(step.step_file('run_dir'))

    for seq_ident in input_step.all_sequences():
        out_file = step.step_file('run_dir', f'{seq_ident}.out')
        if os.path.isfile(out_file):
            # Calculated already; check only whether the result was stored
            if not os.path.isfile(step.step_file(f'{seq_ident}.gb')):
                calc_seq_idents.append(seq_ident)
            continue

        # Set fasta file for calculation
        seq_rec = input_step.get_sequence_record(seq_ident)
        fasta_file = step.step_file('run_dir', f'{seq_ident}.fa')
        files_to_zip.append(fasta_file)
        SeqIO.write([seq_rec], fasta_file, 'fasta')
        calc_seq_idents.append(seq_ident)

    if not files_to_zip:
        # No calculation needed: finish pending sequences or re-parse on request
        if calc_seq_idents:
            finish_irs_data(step, common_db, calc_seq_idents=calc_seq_idents)
        elif params.force_mummer_parse:
            finish_irs_data(step, common_db)
        return step

    # Store finish.yml
    finish_f = step.step_file('finish.yml')
    write_yaml(dict(fa_files=files_to_zip), finish_f)

    run = True  # ToDo: ...
    step.save(completed=False)
    if run:
        run_module_script(run_irs_mummer, step)
        finish_irs_data(step, common_db, calc_seq_idents=calc_seq_idents)
    else:
        files_to_zip.append(finish_f)
        set_run_instructions(run_irs_mummer, step, files_to_zip,
                             _instructions)

    #
    return step
Пример #6
0
def create_raxml_data(step_data, alignment_step, partitions_obj, run_threads):
    """Create RAxML step(s) from an alignment step or a collection of them.

    For a collection, one substep is created per alignment step. The list
    of files to process (dicts with attrs: filename, short, partitions
    (filename or None)) is stored in finish.yml with filenames relative to
    the step directory. With a truthy `run_threads` the calculation is
    started, otherwise run instructions are written.

    Returns the (top-level) step object.
    """
    # This data is used to optimize calculation
    files_to_proc = []

    def _fill(target_step, source_step):
        # Copies sequence info and the alignment file from source into target
        target_step.set_sequences(source_step.all_sequences())
        target_step.seq_sequence_type(source_step.get_sequence_type())
        _copy_alignment_file(source_step, target_step, files_to_proc,
                             partitions_obj)

    if alignment_step._IS_COLLECTION:
        step = RAxMLSteps(alignment_step.project, step_data, remove_data=True)
        for align_step in alignment_step.step_objects():
            substep = step.create_substep(align_step.get_local_name())
            _fill(substep, align_step)
            substep.save(completed=False)
    else:
        step = RAxMLStep(alignment_step.project, step_data, remove_data=True)
        _fill(step, alignment_step)

    # Files to zip: alignment files followed by existing partition files
    files_to_zip = [desc['filename'] for desc in files_to_proc]
    files_to_zip += [
        desc['partitions'] for desc in files_to_proc if desc['partitions']
    ]

    # Remove step directory from files since run script is called from step directory
    for desc in files_to_proc:
        desc['filename'] = step.strip_step_dir(desc['filename'])
    finish_f = step.step_file('finish.yml')
    write_yaml(files_to_proc, finish_f)

    # Stores description.yml
    step.save(completed=bool(run_threads))

    if run_threads:
        run_module_script(run_raxml, step, threads=run_threads)
    else:
        files_to_zip.append(finish_f)
        set_run_instructions(run_raxml, step, files_to_zip, _instructions)
    #
    return step
Пример #7
0
def init_project(project, dirname, project_desc, workflow,
                 workflow_parameters):
    """Create a new project directory with settings, an empty log and a README.

    Nothing is created when 'project_log.yml' exists in the current working
    directory (the command was called from inside an existing project);
    only warnings are printed in that case.

    Args:
        project: object providing get_workflow_cls(workflow).
        dirname: directory of the new project; passed to
            ensure_directory(dirname, check_empty=True).
        project_desc: optional free-text description put into README.txt.
        workflow: optional workflow name stored into settings.yml.
        workflow_parameters: optional string of the form
            'name=value;name=value'. Only the first '=' of an item separates
            the name from the value, so values may contain '='. Empty items
            (e.g. caused by a trailing ';') are ignored.

    Raises:
        ZCItoolsValueError: if a parameter required by the workflow is missing.
        ValueError: if a non-empty parameter item has no '='.
    """
    if os.path.isfile('project_log.yml'):
        print('Warning: init project called on existing project!')
        print(f'Warning: project {dirname} was not created!')

    elif ensure_directory(dirname, check_empty=True):
        # Add setting file
        settings = dict(settings_defaults)
        if workflow:
            if workflow_parameters:
                # maxsplit=1 keeps '=' characters that are part of a value
                w_pars = dict(
                    x.split('=', 1) for x in workflow_parameters.split(';')
                    if x)
            else:
                w_pars = dict()
            wf_cls = project.get_workflow_cls(workflow)
            if (not_in :=
                [p for p in wf_cls.required_parameters() if p not in w_pars]):
                raise ZCItoolsValueError(
                    f"Workflow's parameters not specified: {', '.join(not_in)}!"
                )

            settings['workflow'] = workflow
            settings['workflow_parameters'] = wf_cls.format_parameters(w_pars)
        write_yaml(settings, os.path.join(dirname, 'settings.yml'))

        # Create empty project_log.yml file
        with open(os.path.join(dirname, 'project_log.yml'), 'w'):
            pass

        # Set README.txt file
        with open(os.path.join(dirname, 'README.txt'), 'w') as r:
            if project_desc:
                r.write(f'Project description:\n{project_desc}\n')
            r.write(_readme)
            if workflow:
                r.write(_wf_readme.format(workflow=workflow))
Пример #8
0
    def _run_command(self, command, args, cmd_args=None):
        """Instantiate the command registered under `command` and run it.

        Commands without a command type are simply run (project commands
        only if the project is valid). Commands of type 'new_step' or
        'new_steps' are run with a step_data dict describing the call;
        if they return step(s), the call is appended to project_log.yml
        unless it equals the last logged entry. Any other command type
        only prints a warning.
        """
        self._args = args  # Store commands args
        command_obj = self.commands_map[command](self, args)
        command_type = command_obj.get_command_type()

        # General work
        if not command_type:
            if command_obj._PROJECT_COMMAND and not self._check_is_project_valid():
                return
            command_obj.run()
            return

        if command_type not in ('new_step', 'new_steps'):
            print(f"Warning: not supported command_type {command_type}?!")
            return

        # Create new step(s)
        if not self._check_is_project_valid():
            return

        ignored_args = ('command', 'step_num', 'step_description')
        command_args = {
            k: v for k, v in vars(args).items() if k not in ignored_args
        }
        db_id = command_obj.common_db_identifier()
        step_data = dict(
            prev_steps=command_obj.prev_steps(),
            common_db_identifier=list(db_id) if db_id else None,
            command=command,
            command_args=command_args,
            cmd=' '.join(cmd_args or sys.argv[1:]))

        if command_type == 'new_step':
            step_data['step_name'] = self.new_step_name(command_obj, args)
            ret = command_obj.run(step_data)
            if not ret:
                print(
                    "Warning: create step command didn't return step object!"
                )
            elif not ret.is_completed():
                print(
                    f'Step is not finished, check instruction ({ret.directory}/INSTRUCTIONS.txt)!'
                )
        else:
            ret = command_obj.run(step_data)
            if ret is None:
                print(
                    "Warning: create steps command didn't return any step object!"
                )
            else:
                for s in ret:
                    if not s.is_completed():
                        print(
                            f'Step is not finished, check instruction ({s.directory}/INSTRUCTIONS.txt)!'
                        )

        if not ret:
            return

        # Store log data into project_log.yml
        log_entry = {
            k: v for k, v in step_data.items() if k in ('cmd', 'step_name')
        }
        # Do not store if the entry is equal to the one from the last command
        log = read_yaml('project_log.yml')
        if not log or log[-1] != log_entry:
            write_yaml([log_entry], 'project_log.yml',
                       mode='a')  # Appends yml list
Пример #9
0
def create_permutations(project,
                        step_data,
                        raw_file,
                        permutations,
                        num_traits=None,
                        run=False):
    """Create a QTL Cartographer permutations step.

    Copies the Windows QTL Cartographer files (tmp.00m, tmp.00c, tmp.00r)
    located next to `raw_file` into the step, creates one directory with a
    qtlcart.rc file per trait and stores the run description in finish.yml.
    With `run` the calculation is started, otherwise run instructions are
    written.

    All inputs are validated before the step is created, so an invalid call
    does not touch existing step data.

    Args:
        project: project object passed to QTLCartStep.
        step_data: step description dict passed to QTLCartStep.
        raw_file: path of the MapMaker .raw file; the .map file is expected
            next to it with the same base name.
        permutations: number of permutations; stored in the step and finish.yml.
        num_traits: number of traits; has to be a positive number.
        run: run the calculation right away.

    Raises:
        ZCItoolsValueError: if an input file is missing or num_traits is
            not a positive number.

    Returns the step object.
    """
    # Check input files
    raw_ext = '.raw'
    if raw_file.endswith(raw_ext):
        # Swap only the trailing extension; a plain str.replace() would also
        # rewrite '.raw' occurring inside directory names.
        map_file = raw_file[:-len(raw_ext)] + '.map'
    else:
        map_file = raw_file.replace(raw_ext, '.map')
    data_dir, base_raw_file = os.path.split(raw_file)
    tmp_files = ('tmp.00m', 'tmp.00c', 'tmp.00r')
    for mf in (raw_file, map_file):
        if not os.path.isfile(mf):
            raise ZCItoolsValueError(
                f"Input MapMaker file {mf} doesn't exist!")
    for qf in tmp_files:
        f = os.path.join(data_dir, qf)
        if not os.path.isfile(f):
            raise ZCItoolsValueError(
                f"Input Windows QTL Cartographer file {qf} doesn't exist!")

    # ToDo: find max traits and fix it/set default
    # Checked before the step is created (remove_data=True) and with a real
    # exception, since assert statements are skipped under python -O.
    if not (num_traits and num_traits > 0):
        raise ZCItoolsValueError(
            f"Number of traits has to be a positive number, got {num_traits}!")

    #
    step = QTLCartStep(project, step_data, remove_data=True)
    step.set_data(num_traits, permutations)

    # Copy input files
    files_to_zip = []
    for qf in tmp_files:
        files_to_zip.append(step.step_file(qf))
        copy_file(os.path.join(data_dir, qf), files_to_zip[-1])

    # Create trait directories
    trait_dirs = []
    for t_idx in range(1, num_traits + 1):
        trait_dirs.append(step.trait_dir(t_idx))
        t_dir = step.step_file(trait_dirs[-1])
        ensure_directory(t_dir)
        files_to_zip.append(os.path.join(t_dir, 'qtlcart.rc'))
        write_str_in_file(
            files_to_zip[-1],
            _qtlcart_rc.format(trait=t_idx, num_traits=num_traits))

    files_to_zip.append(step.step_file('finish.yml'))
    write_yaml(dict(permutations=permutations, trait_dirs=trait_dirs),
               files_to_zip[-1])

    # Stores description.yml
    step.save(completed=run)

    # Run or set instructions
    if run:
        run_module_script(run_qtl_cart_perm, step)
    else:
        set_run_instructions(run_qtl_cart_perm, step, files_to_zip,
                             _instructions)
    #
    return step
Пример #10
0
 def save_summary_data(self, data):
     """Store the summary dict `data` into the step's summary.yml file."""
     assert isinstance(data, dict), data
     summary_file = self.step_file('summary.yml')
     write_yaml(data, summary_file)
Пример #11
0
 def write_data(self, filename):
     """Write this object's reads and paired_reads as YAML into `filename`."""
     content = {'reads': self.reads, 'paired_reads': self.paired_reads}
     write_yaml(content, filename)
Пример #12
0
def create_irs_data(step_data, input_step, params):
    """Create an Annotations step by locating IRs with mummer and extending them with mafft.

    The step's 'run_dir' subdirectory contains input and output calculation
    files. Work is done in two phases:

    1. For every sequence without a stored mummer result, 'repeat-match' is
       run on its fasta file and the found repeat ([length, start_1,
       start_2]) is kept in `mummer_results`.
    2. For every sequence whose repeat is shorter than 23000, the flanks of
       both repeat copies are written as '<seq_ident>_right.fa' and
       '<seq_ident>_left.fa' for alignment, unless both alignment results
       ('<seq_ident>_left_align.fa' and '<seq_ident>_right_align.fa')
       already exist.

    Raises ZCItoolsValueError when no repeat is found for a sequence.
    Returns the step object.
    """
    SeqIO = import_bio_seq_io()
    seq_idents = input_step.all_sequences()

    step = input_step.project.new_step(AnnotationsStep, step_data)
    step.set_sequences(seq_idents)
    # seq_ident -> mummer data ([length, start_1, start_2])
    mummer_results = step.get_type_description_elem('mummer_results', default=dict())
    #
    ensure_directory(step.step_file('run_dir'))
    calc_mummer = []  # tuples (seq_ident, fasta file, mummer output file)

    # Mummer
    for seq_ident in sorted(seq_idents - set(mummer_results)):
        fa_file = step.step_file('run_dir', f'{seq_ident}.fa')
        mummer_res_file = step.step_file('run_dir', f'{seq_ident}.out')
        if not os.path.isfile(fa_file):
            seq_rec = input_step.get_sequence_record(seq_ident)
            SeqIO.write([seq_rec], fa_file, 'fasta')
            calc_mummer.append((seq_ident, fa_file, mummer_res_file))
        elif not os.path.isfile(mummer_res_file):
            calc_mummer.append((seq_ident, fa_file, mummer_res_file))

    # Run mummer
    if calc_mummer:
        mummer_exe = 'repeat-match'  # ToDo:
        n = 3000
        threads = multiprocessing.cpu_count()
        # Leaving the with-block waits for all submitted jobs.
        # NOTE(review): returned futures are dropped, so an exception raised
        # inside _run_single() is not reported here.
        with ThreadPoolExecutor(max_workers=threads) as executor:
            for seq_ident, fa_file, mummer_res_file in calc_mummer:
                executor.submit(_run_single, mummer_exe, n, fa_file, mummer_res_file)

        for seq_ident, _, mummer_res_file in calc_mummer:
            rep = _read_mummer_repeat(mummer_res_file)
            if not rep:
                raise ZCItoolsValueError(f'No repeat for sequence {seq_ident}!')
            mummer_results[seq_ident] = rep

    # Find sequences to extend with alignment
    files_to_zip = []
    calc_mafft = []
    for seq_ident in sorted(seq_idents):
        length, s1, s2 = mummer_results[seq_ident]
        if length >= 23000:
            continue

        # Skip only when BOTH sides are aligned already.
        # (The left side was not checked before; the right one was tested twice.)
        if step.is_file('run_dir', f'{seq_ident}_left_align.fa') and \
           step.is_file('run_dir', f'{seq_ident}_right_align.fa'):
            continue

        #
        calc_mafft.append(seq_ident)
        _seq = input_step.get_sequence_record(seq_ident).seq
        seq = str(_seq)
        comp_seq = str(_seq.complement())
        missing = 26000 - length

        # Right side
        p1 = _extract_subseq_plus(seq, s1 + length, missing)
        p2 = _extract_subseq_minus(comp_seq, s2 - length, missing)
        assert len(p1) == len(p2), (length, s1, s2, missing, (len(p1), len(p2)))
        files_to_zip.append(step.step_file('run_dir', f'{seq_ident}_right.fa'))
        write_fasta(files_to_zip[-1], [('p1', p1), ('p2', p2)])

        # Left side
        p1 = _extract_subseq_minus(comp_seq, s1 - 1, missing)
        p2 = _extract_subseq_plus(seq, s2 + 1, missing)
        assert len(p1) == len(p2), (length, s1, s2, missing, (len(p1), len(p2)))
        files_to_zip.append(step.step_file('run_dir', f'{seq_ident}_left.fa'))
        write_fasta(files_to_zip[-1], [('p1', p1), ('p2', p2)])

    # Mafft
    if calc_mafft:
        finish_f = step.step_file('finish.yml')
        write_yaml(dict(calc_seq_idents=calc_mafft), finish_f)

        run = True  # ToDo: ...
        step.save(additional_data=dict(mummer_results=mummer_results), completed=False)
        if run:
            run_module_script(run_mafft_irs, step)
            finish_irs_data(step)
        else:
            files_to_zip.append(finish_f)
            set_run_instructions(run_mafft_irs, step, files_to_zip, _instructions)
    #
    elif params.force_parse:
        finish_irs_data(step)

    return step
Пример #13
0
def orientate_chloroplast_start(step_data, annotation_step, params):
    """Prepare (and currently also run) the alignments used to check chloroplast part orientation.

    For every sequence, per-part gene strand counts and part lengths are
    collected into `sequence_data`, and input alignment files are written
    for every part/version whose result file 'align_<part>_<version>.fa'
    is still missing. Each input file pairs the first `length` bases of the
    referent's part with a same-sized piece of the sequence's part.

    Parameters read from `params`: referent_genome, length_to_check,
    complement, output_file_prefix, force_parse.

    Returns the step object.
    """
    # Find referent genome
    # For each sequence a directory named <seq_ident> is created.
    # NOTE(review): the loop below does not skip the referent itself.
    # It contains files:
    #  - {lsc|ira|ss}_{plus|minus}.fa       : input alignment files, contain 2 sequences.
    #  - align_{lsc|ira|ss}_{plus|minus}.fa : result alignment files.
    # Part names come from _part_names (defined outside this function).
    seq_idents = annotation_step.all_sequences()  # set
    ref_ident = find_referent_genome(seq_idents, params.referent_genome)
    #
    length = params.length_to_check
    step = annotation_step.project.new_step(ChloroplastOrientateStep, step_data, remove_data=False)
    # Data stored by a previous call is reused; only missing sequences are analysed again
    sequence_data = step.get_type_description_elem('sequence_data', default=dict())
    #
    seq_rec = annotation_step.get_sequence_record(ref_ident)
    partition = find_chloroplast_partition(seq_rec)
    # First `length` bases of each part of the referent, in _part_names order
    ref_parts = [str(partition.get_part_by_name(n).extract(seq_rec).seq)[:length] for n in _part_names]
    files_to_zip = []
    align_files = []  # tuples (seq_ident, part name, version) that still have to be aligned

    #
    all_versions = ('plus', 'minus', 'plus_c', 'minus_c') if params.complement else ('plus', 'minus')
    for seq_ident in sorted(seq_idents):
        # Sequence record and partition are loaded lazily, only when needed below
        seq_rec = None
        if seq_ident not in sequence_data:
            seq_rec = annotation_step.get_sequence_record(seq_ident)
            partition = find_chloroplast_partition(seq_rec)

            # Count gene orientation
            l_seq = len(seq_rec)
            in_parts = partition.put_features_in_parts(
                Feature(l_seq, feature=f) for f in seq_rec.features if f.type == 'gene')

            # Strand sums: LSC counts only genes with 'rpl'/'rps' in the name, IRa only 'rrn' genes,
            # SSC counts all genes
            lsc_count = sum(f.feature.strand if any(x in f.name for x in ('rpl', 'rps')) else 0
                            for f in in_parts.get('lsc', []))
            ssc_count = sum(f.feature.strand for f in in_parts.get('ssc', []))
            ira_count = sum(f.feature.strand if 'rrn' in f.name else 0 for f in in_parts.get('ira', []))

            # Boolean flags are derived from the sign of the counts (<= 0 for lsc/ssc, >= 0 for ira)
            sequence_data[seq_ident] = dict(
                length=len(seq_rec),
                lsc=(lsc_count <= 0), lsc_count=lsc_count, lsc_length=len(partition.get_part_by_name('lsc')),
                ssc=(ssc_count <= 0), ssc_count=ssc_count, ssc_length=len(partition.get_part_by_name('ssc')),
                ira=(ira_count >= 0), ira_count=ira_count, ira_length=len(partition.get_part_by_name('ira')))

        # Nothing to do if all result alignment files exist
        if all(all(step.is_file(seq_ident, f'align_{n}_{v}.fa') for v in all_versions) for n in _part_names):
            continue
        #
        if seq_rec is None:
            seq_rec = annotation_step.get_sequence_record(seq_ident)
            partition = find_chloroplast_partition(seq_rec)
        for n, ref_p in zip(_part_names, ref_parts):
            # Find missing output files
            _num = len(align_files)
            for x in all_versions:
                if not step.is_file(seq_ident, f'align_{n}_{x}.fa'):
                    files_to_zip.append(step.step_file(seq_ident, f'{n}_{x}.fa'))
                    align_files.append((seq_ident, n, x))
            # No version of this part is missing
            if _num == len(align_files):
                continue

            # Store input files
            # NOTE(review): this check looks redundant - at least one result file
            # was just found missing above, so all(...) should be False here.
            if all(step.is_file(seq_ident, f'align_{n}_{v}.fa') for v in all_versions):
                continue
            ensure_directory(step.step_file(seq_ident))
            part_s = partition.get_part_by_name(n).extract(seq_rec)

            # NOTE(review): the *_c input files are written even when params.complement is not set.
            # The slice [:(-length-1):-1] takes the last `length` characters in reversed order.
            # plus:   start of the part as it is
            # plus_c: reversed end of the reverse complement
            #         (presumably the complement of the part's start - confirm)
            f_p = step.step_file(seq_ident, f'{n}_plus.fa')
            f_p_c = step.step_file(seq_ident, f'{n}_plus_c.fa')
            if not os.path.isfile(f_p):
                write_fasta(f_p, [(ref_ident, ref_p), (seq_ident, str(part_s.seq)[:length])])
            if not os.path.isfile(f_p_c):
                write_fasta(f_p_c, [(ref_ident, ref_p),
                                    (seq_ident, str(part_s.reverse_complement().seq)[:(-length-1):-1])])

            # minus:   start of the reverse complement of the part
            # minus_c: reversed end of the part
            f_m = step.step_file(seq_ident, f'{n}_minus.fa')
            f_m_c = step.step_file(seq_ident, f'{n}_minus_c.fa')
            if not os.path.isfile(f_m):
                write_fasta(f_m, [(ref_ident, ref_p), (seq_ident, str(part_s.reverse_complement().seq)[:length])])
            if not os.path.isfile(f_m_c):
                write_fasta(f_m_c, [(ref_ident, ref_p), (seq_ident, str(part_s.seq)[:(-length-1):-1])])

    #
    output_file = f"{params.output_file_prefix}_{length}{'_c' if params.complement else ''}.xlsx"
    data = dict(sequence_data=sequence_data, check_length=length, output_file=output_file, complement=params.complement)
    if align_files:
        # Store finish.yml
        finish_f = step.step_file('finish.yml')
        write_yaml(dict(align_files=align_files), finish_f)

        run = True  # ToDo: ...
        step.save(data, completed=False)
        if run:
            run_module_script(run_orientate, step)
            orientate_chloroplast_finish(step)  # , common_db, calc_seq_idents=calc_seq_idents)
        else:
            files_to_zip.append(finish_f)
            set_run_instructions(run_orientate, step, files_to_zip, _instructions)
    #
    # All alignments exist: results are parsed again only on request
    elif params.force_parse:
        step.save(data)
        orientate_chloroplast_finish(step)  # , common_db, calc_seq_idents=calc_seq_idents)
    #
    else:
        step.save(data, completed=False)

    return step
Пример #14
0
def create_mr_bayes_data(step_data, alignment_step, args, partitions_obj,
                         run_threads):
    """Create MrBayes step(s) from an alignment step or a collection of them.

    A collection gets one substep per alignment step (args.num_runs is not
    supported there and only causes a warning). A single alignment gets
    one substep per run when args.num_runs > 1, otherwise a plain step.
    The list of files to process is stored in finish.yml with paths
    relative to the step directory. With a truthy `run_threads` the
    calculation is started, otherwise run instructions are written.

    Returns the (top-level) step object.
    """
    # List of dicts with attrs: filename, short
    # This data is used to optimize calculation
    # ToDo: almost the same as raxml.py. Differs in class types, _copy_alignment_file() and file formats
    files_to_proc = []

    def _fill(target_step, source_step):
        # Copies sequence info and the alignment file from source into target
        target_step.set_sequences(source_step.all_sequences())
        target_step.seq_sequence_type(source_step.get_sequence_type())
        _copy_alignment_file(source_step, target_step, files_to_proc, args,
                             partitions_obj)

    def _more_runs_requested():
        return args.num_runs and args.num_runs > 1

    if alignment_step._IS_COLLECTION:
        step = MrBayesSteps(alignment_step.project,
                            step_data,
                            remove_data=True)
        for align_step in alignment_step.step_objects():
            substep = step.create_substep(align_step.get_local_name())
            _fill(substep, align_step)
            substep.save(completed=False)
        if _more_runs_requested():
            print(
                'Warning: number of runs for collection of alignments is not supported.'
            )
    elif _more_runs_requested():
        step = MrBayesSteps(alignment_step.project,
                            step_data,
                            remove_data=True)
        for run_idx in range(args.num_runs):
            substep = step.create_substep(f'RUN_{run_idx + 1}')
            # ToDo: make symbolic links?
            _fill(substep, alignment_step)
            substep.save(completed=False)
    else:
        step = MrBayesStep(alignment_step.project,
                           step_data,
                           remove_data=True)
        _fill(step, alignment_step)

    # Files to zip are collected before paths are made relative
    files_to_zip = [desc['filename'] for desc in files_to_proc]
    # Remove step directory from files since run script is called from step directory
    for desc in files_to_proc:
        desc['filename'] = step.strip_step_dir(desc['filename'])
        desc['result_prefix'] = step.strip_step_dir(desc['result_prefix'])
    finish_f = step.step_file('finish.yml')
    write_yaml(files_to_proc, finish_f)

    # Stores description.yml
    step.save(completed=bool(run_threads))

    if run_threads:
        run_module_script(run_mr_bayes,
                          step,
                          threads=run_threads,
                          use_mpi=(not args.no_mpi))
    else:
        files_to_zip.append(finish_f)
        set_run_instructions(run_mr_bayes, step, files_to_zip, _instructions)
    #
    return step