Python ensure_directory примеры, common_utils.file_utils.ensure_directory Python примеры использования

Пример #1

0

Показать файл

def _run_align(seq_ident,
               seq_data,
               close_data,
               seq_2_result_object,
               match_length=100,
               first_nice=True):
    """Align IRa ends of close sequences against the sequence seq_ident.

    The sequence is stored as FASTA in the step's 'find_irs/<seq_ident>'
    directory. For every close sequence (largest first_date first) a query
    FASTA is written holding the first and the last match_length bases of
    its IRa ('end1' and 'end2'), and it is aligned against the stored
    sequence with run_align_cmd.

    When first_nice is set and _get_nice_irs accepts an alignment, the
    result (align_seq_ident, ira, irb) is stored in
    seq_2_result_object[seq_ident] and the function returns immediately.
    Otherwise each alignment is tagged with the ident of the close sequence
    and collected in a local list.
    """
    # Store fasta
    f_dir = seq_data._analyse.step.step_file('find_irs', seq_ident)
    ensure_directory(f_dir)
    seq_fasta = os.path.join(f_dir, f"{seq_ident}.fa")
    write_fasta(seq_fasta, [(seq_ident, seq_data._seq.seq)])

    # It is (probably) better first to prefer newer sequences!
    close_data = sorted(close_data, reverse=True, key=lambda d: d.first_date)
    all_aligns = []
    for d in close_data:
        # get_part_by_name is a method: it has to be called, not subscripted.
        ira = d._partition.get_part_by_name('ira')
        rec = ira.extract(d._seq)
        qry_fasta = os.path.join(f_dir, f"qry_{d.seq_ident}.fa")
        write_fasta(qry_fasta, [('end1', rec.seq[:match_length]),
                                ('end2', rec.seq[-match_length:])])
        align = run_align_cmd(seq_fasta, qry_fasta, f"res_{d.seq_ident}")
        if first_nice and (irs := _get_nice_irs(align)):
            seq_2_result_object[seq_ident] = (d.seq_ident, *irs)
            return
        align.seq_ident = d.seq_ident
        all_aligns.append(align)

Пример #2

0

Показать файл

Файл: commands.py Проект: CroP-BioDiv/zcitools

    def run(self):
        """Regroup input sequences into one FASTA file per gene.

        Every input record id is split on its first underscore into
        <gene>_<species>. For each gene a file <gene>.fasta is written into
        the output directory, with (species, sequence) entries sorted.
        """
        import os.path
        from collections import defaultdict
        from ..utils.import_methods import import_bio_seq_io
        from ..utils.helpers import get_bio_io_type
        from common_utils.file_utils import ensure_directory, write_fasta

        options = self.args
        out_dir = options.output_directory
        SeqIO = import_bio_seq_io()
        ensure_directory(out_dir)

        # gene -> {species -> sequence}
        per_gene = defaultdict(dict)
        for in_path in options.input_files:
            io_type = get_bio_io_type(in_path, options.input_format)
            for record in SeqIO.parse(in_path, io_type):
                # Raises ValueError when the id has no underscore
                gene, species = record.id.split('_', 1)
                per_gene[gene][species] = record.seq

        for gene, species_2_seq in per_gene.items():
            gene_file = os.path.join(out_dir, f'{gene}.fasta')
            write_fasta(gene_file, sorted(species_2_seq.items()))

Пример #3

0

Показать файл

def create_irs_data(step_data, annotation_step, params):
    """Prepare (and currently also run) the IRs BLAST calculation step.

    The IRa of the referent genome is stored as the query ('run_dir/query.fa').
    For every sequence without a result file 'run_dir/<seq_ident>.xml' a FASTA
    file is registered for calculation; when that FASTA file has to be
    created, the entry ssc_location[seq_ident] is set too. Returns the step.
    """
    SeqIO = import_bio_seq_io()

    seq_idents = annotation_step.all_sequences()  # set
    ref_ident = find_referent_genome(seq_idents, params.referent_genome)

    step = annotation_step.project.new_step(ChloroplastSSCBlast, step_data)
    referent_rec = annotation_step.get_sequence_record(ref_ident)
    ssc_location = step.get_type_description_elem('ssc_location', default={})
    ensure_directory(step.step_file('run_dir'))

    # Query data is the referent's IRa; written only once
    query_file = step.step_file('run_dir', 'query.fa')
    if not os.path.isfile(query_file):
        referent_irs = find_chloroplast_irs(referent_rec)
        if not referent_irs:
            raise ZCItoolsValueError(
                f"Referent genome ({ref_ident}) doesn't have IRS!")
        ira_seq = str(referent_irs[0].extract(referent_rec).seq)
        write_fasta(query_file, [('ira', ira_seq)])

    files_to_zip = [query_file]
    calc_seq_idents = []

    # Sequences that still miss the BLAST result
    for seq_ident in sorted(seq_idents):
        if os.path.isfile(step.step_file('run_dir', f'{seq_ident}.xml')):
            continue

        fa_file = step.step_file('run_dir', f'{seq_ident}.fa')
        files_to_zip.append(fa_file)
        calc_seq_idents.append(seq_ident)
        if os.path.isfile(fa_file):
            continue

        seq_rec = annotation_step.get_sequence_record(seq_ident)
        SeqIO.write([seq_rec], fa_file, 'fasta')
        # SSC position: [sequence length, IRa end, IRb start], -1 if no IRs
        found_irs = find_chloroplast_irs(seq_rec)
        if found_irs:
            ssc_location[seq_ident] = [
                len(seq_rec),
                int(found_irs[0].location.end),
                irb_start(found_irs[1]),
            ]
        else:
            ssc_location[seq_ident] = [len(seq_rec), -1, -1]

    if not calc_seq_idents:
        # Nothing to calculate; parse existing results only on request
        if params.force_blast_parse:
            finish_irs_data(step)
        return step

    # Store finish.yml
    finish_f = step.step_file('finish.yml')
    write_yaml({'calc_seq_idents': calc_seq_idents}, finish_f)

    run_now = True  # ToDo: ...
    step.save({'ssc_location': ssc_location}, completed=False)
    if run_now:
        run_module_script(run_irs_blast, step)
        finish_irs_data(step)
    else:
        files_to_zip.append(finish_f)
        set_run_instructions(run_irs_blast, step, files_to_zip,
                             _instructions)
    return step

Пример #4

0

Показать файл

Файл: steps.py Проект: CroP-BioDiv/zcitools

 def set_group_rows(self, group, rows):
     """Store rows of a group, or register the group as having no data.

     Non-empty rows are written as CSV into the data subdirectory, in a
     file named after the group, using all columns but the first one.
     For empty rows the group is appended to the 'no data' file.
     """
     # ToDo: check rows?
     if not rows:
         append_line_to_file(self._no_data_filename(), group)
         return

     # Cached get_rows() result is stale from now on
     self._rows = None
     target_dir = self._data_subdirectory()
     ensure_directory(target_dir)
     csv_path = os.path.join(target_dir, group)
     write_csv(csv_path, self._columns[1:], rows)

Пример #5

0

Показать файл

def calculate_and_add_irs_to_seq_rec(step, seq_ident, seq_rec):
    """Run MUMmer on one sequence record and annotate it with the result.

    The record is written to 'run_dir/<seq_ident>.fa', run_one() is called
    on that file, and the output 'run_dir/<seq_ident>.out' is wrapped in
    _MUMmerResult. Returns whatever its set_annotation() returns for the
    target file '<seq_ident>.gb'.
    """
    SeqIO = import_bio_seq_io()

    # Input fasta file for the calculation
    ensure_directory(step.step_file('run_dir'))
    fasta_path = step.step_file('run_dir', f'{seq_ident}.fa')
    SeqIO.write([seq_rec], fasta_path, 'fasta')

    # MUMmer run and result parsing
    run_one(fasta_path)
    output_path = step.step_file('run_dir', f'{seq_ident}.out')
    result = _MUMmerResult(output_path, seq_ident, len(seq_rec))

    gb_path = step.step_file(f'{seq_ident}.gb')
    return result.set_annotation(seq_ident, seq_rec, gb_path)

Пример #6

0

Показать файл

def create_new_hybrids_data(project, step_data, params):
    """Create a NewHybrids step: copy inputs, make seed dirs, run or instruct.

    Both input files have to exist. They are copied into a freshly created
    step, one run directory is made per randomly sampled seed pair
    (params.num_runs of them), and 'finish.yml' stores the run parameters.
    Depending on params.run the calculation is started or run instructions
    are set. Returns the step.
    """
    # Input files have to exist
    if not os.path.isfile(params.data_file):
        raise ZCItoolsValueError(
            f"Input data file {params.data_file} doesn't exist!")
    if not os.path.isfile(params.gtyp_cat_file):
        raise ZCItoolsValueError(
            f"Input genotype category probabilities {params.gtyp_cat_file} doesn't exist!"
        )
    data_file = os.path.basename(params.data_file)
    gtyp_cat_file = os.path.basename(params.gtyp_cat_file)

    step = NewHybridsStep(project, step_data, remove_data=True)
    step.set_data(data_file, gtyp_cat_file, params.theta_prior,
                  params.pi_prior, params.burn_in, params.num_sweeps)

    # Input files are copied into the step directory
    step_data_file = step.step_file(data_file)
    step_gtyp_file = step.step_file(gtyp_cat_file)
    files_to_zip = [step_data_file, step_gtyp_file]
    copy_file(params.data_file, step_data_file)
    copy_file(params.gtyp_cat_file, step_gtyp_file)

    # One run directory per sampled pair of seeds
    all_seed_pairs = list(
        itertools.product(range(1, _MAX_SMALL_NUMBER + 1), repeat=2))
    for seed in random.sample(all_seed_pairs, params.num_runs):
        seed_path = step.step_file(step.seed_dir(seed))
        files_to_zip.append(seed_path)
        ensure_directory(seed_path)

    finish_f = step.step_file('finish.yml')
    files_to_zip.append(finish_f)
    write_yaml(
        {
            'data_file': data_file,
            'gtyp_cat_file': gtyp_cat_file,
            'theta_prior': params.theta_prior,
            'pi_prior': params.pi_prior,
            'burn_in': params.burn_in,
            'num_sweeps': params.num_sweeps,
        },
        finish_f)

    # Stores description.yml
    step.save(completed=params.run)

    # Run or set instructions
    if not params.run:
        set_run_instructions(run_new_hybrids, step, files_to_zip,
                             _instructions)
    else:
        run_module_script(run_new_hybrids, step)
    return step

Пример #7

0

Показать файл

def create_irs_data(step_data, input_step, params, common_db):  # , run):
    """Create an Annotations step with IRs found by MUMmer.

    The step's subdirectory 'run_dir' holds calculation input (.fa) and
    output (.out) files. A sequence without an output file gets its FASTA
    written and is scheduled for calculation; a sequence with output but
    without '<seq_ident>.gb' is only scheduled for finishing.
    Returns the step.
    """
    SeqIO = import_bio_seq_io()
    fa_files = []
    calc_seq_idents = []

    step = input_step.project.new_step(AnnotationsStep, step_data)
    # Set sequences
    step.set_sequences(input_step.all_sequences())
    ensure_directory(step.step_file('run_dir'))

    for seq_ident in input_step.all_sequences():
        has_output = os.path.isfile(
            step.step_file('run_dir', f'{seq_ident}.out'))
        if has_output:
            # Calculated already; is the annotation file still missing?
            if not os.path.isfile(step.step_file(f'{seq_ident}.gb')):
                calc_seq_idents.append(seq_ident)
            continue

        seq_rec = input_step.get_sequence_record(seq_ident)
        # Fasta file for the calculation
        fa_path = step.step_file('run_dir', f'{seq_ident}.fa')
        fa_files.append(fa_path)
        SeqIO.write([seq_rec], fa_path, 'fasta')
        calc_seq_idents.append(seq_ident)

    if not fa_files:
        # No calculation needed, only (maybe) finishing
        if calc_seq_idents:
            finish_irs_data(step, common_db, calc_seq_idents=calc_seq_idents)
        elif params.force_mummer_parse:
            finish_irs_data(step, common_db)
        return step

    # Store finish.yml
    finish_f = step.step_file('finish.yml')
    write_yaml({'fa_files': fa_files}, finish_f)

    run_now = True  # ToDo: ...
    step.save(completed=False)
    if run_now:
        run_module_script(run_irs_mummer, step)
        finish_irs_data(step, common_db, calc_seq_idents=calc_seq_idents)
    else:
        fa_files.append(finish_f)
        set_run_instructions(run_irs_mummer, step, fa_files,
                             _instructions)
    return step

Пример #8

0

Показать файл

Файл: base_step.py Проект: CroP-BioDiv/zcitools

    def __init__(self,
                 project,
                 step_data,
                 remove_data=False,
                 update_mode=False,
                 no_check=False,
                 step_directory=None):
        """Initialise the step object and its directory.

        project        -- owning project; asserted to be a 'RunCommand'.
        step_data      -- dict with project data of the step; its
                          'step_name' is a string or a list of strings
                          (substeps).
        remove_data    -- if set, the step directory is removed and
                          recreated, and no stored description is read.
        update_mode    -- stored as is; data checking is skipped in it.
        no_check       -- if set, data checking is skipped.
        step_directory -- if given, used instead of step_data['step_name']
                          as the list of directory parts.

        Raises ZCItoolsValueError if the stored description has a different
        data type than this class.
        """
        assert project.__class__.__name__ == 'RunCommand', self.__class__.__name__  # For now
        self.project = project
        self._step_data = step_data
        # step_data['step_name'] is string or list of strings for substeps
        self._step_name_list = step_directory or \
            ([step_data['step_name']] if isinstance(step_data['step_name'], str) else step_data['step_name'])
        self.directory = os.path.join(*self._step_name_list)
        self._update_mode = update_mode

        # Call init data method
        if remove_data:
            remove_directory(self.directory, create=True)
            d = None
        else:
            d = self.get_description()
            if not d:
                # No stored description; make sure the directory exists
                ensure_directory(self.directory)
        if d:
            if d['data_type'] != self._STEP_TYPE:
                raise ZCItoolsValueError(
                    f"Step class of type '{self._STEP_TYPE}' created with data of type '{d['data_type']}'!"
                )
            type_desc = d['data']
            # Update project data: stored values only fill in missing keys
            self._step_data.update((k, v) for k, v in d['project'].items()
                                   if k not in self._step_data)
        else:
            type_desc = None
        #
        self._init_data(type_desc)

        # Check data if exists and step not set in update mode
        if type_desc and not self._update_mode and self.is_completed(
        ) and not no_check:
            self._check_data()

Пример #9

0

Показать файл

Файл: commands.py Проект: CroP-BioDiv/zcitools

    def run(self):
        """Extract features of one type from each input file into FASTA.

        Every input file has to contain exactly one sequence (SeqIO.read).
        For each of them 'extract_<input basename>.fasta' is written into
        the output directory, with one entry per feature that has a
        location, is of type args.filter_type and has a 'gene' qualifier.
        """
        import os.path
        from ..utils.import_methods import import_bio_seq_io
        from ..utils.helpers import get_bio_io_type, feature_qualifiers_to_desc
        from common_utils.file_utils import ensure_directory, write_fasta, basename_no_ext

        args = self.args
        od = args.output_directory
        SeqIO = import_bio_seq_io()
        ensure_directory(od)
        type_ = args.filter_type

        for i_filename in args.input_files:
            # Note: One sequence in one file!
            seq_rec = SeqIO.read(
                i_filename, get_bio_io_type(i_filename, args.input_format))
            # ToDo: filter by something?
            # ToDo: sort by something?
            # The filter has to test the feature's own location; a bare
            # `location` is not a defined name here.
            write_fasta(
                os.path.join(od,
                             f"extract_{basename_no_ext(i_filename)}.fasta"),
                ((feature_qualifiers_to_desc(f), str(f.extract(seq_rec).seq))
                 for f in seq_rec.features
                 if f.location and f.type == type_ and 'gene' in f.qualifiers))

Пример #10

0

Показать файл

def init_project(project, dirname, project_desc, workflow,
                 workflow_parameters):
    """Create a new project directory with its initial files.

    Nothing is created (only warnings are printed) if 'project_log.yml'
    exists in the current directory. Otherwise, if ensure_directory()
    accepts dirname as an empty directory, these files are written into it:
    'settings.yml', an empty 'project_log.yml' and 'README.txt'.

    workflow_parameters is a string of the form 'name=value;name=value'.
    Only the first '=' of each item separates name and value, so values
    may contain '=' themselves.

    Raises ZCItoolsValueError if a required parameter of the workflow is
    not specified.
    """
    if os.path.isfile('project_log.yml'):
        print('Warning: init project called on existing project!')
        print(f'Warning: project {dirname} was not created!')

    elif ensure_directory(dirname, check_empty=True):
        # Add setting file
        settings = dict(settings_defaults)
        if workflow:
            if workflow_parameters:
                # maxsplit=1 keeps '=' characters that belong to the value
                w_pars = dict(
                    x.split('=', 1) for x in workflow_parameters.split(';'))
            else:
                w_pars = dict()
            wf_cls = project.get_workflow_cls(workflow)
            if (not_in :=
                [p for p in wf_cls.required_parameters() if p not in w_pars]):
                raise ZCItoolsValueError(
                    f"Workflow's parameters not specified: {', '.join(not_in)}!"
                )

            settings['workflow'] = workflow
            settings['workflow_parameters'] = wf_cls.format_parameters(w_pars)
        write_yaml(settings, os.path.join(dirname, 'settings.yml'))

        # Create empty project.log file
        with open(os.path.join(dirname, 'project_log.yml'), 'w'):
            pass

        # Set README.txt file
        with open(os.path.join(dirname, 'README.txt'), 'w') as r:
            if project_desc:
                r.write(f'Project description:\n{project_desc}\n')
            r.write(_readme)
            if workflow:
                r.write(_wf_readme.format(workflow=workflow))

Пример #11

0

Показать файл

def create_permutations(project,
                        step_data,
                        raw_file,
                        permutations,
                        num_traits=None,
                        run=False):
    """Create a QTL Cartographer permutations step.

    Requires the MapMaker files (raw_file and the matching '.map' file) and
    the Windows QTL Cartographer files 'tmp.00m', 'tmp.00c', 'tmp.00r' next
    to raw_file. The tmp files are copied into a freshly created step, one
    directory with a 'qtlcart.rc' file is made per trait, and 'finish.yml'
    stores permutations and trait directories. Depending on run the
    calculation is started or run instructions are set. Returns the step.
    """
    # Check input files
    map_file = raw_file.replace('.raw', '.map')
    data_dir = os.path.dirname(raw_file)
    tmp_files = ('tmp.00m', 'tmp.00c', 'tmp.00r')
    for mf in (raw_file, map_file):
        if os.path.isfile(mf):
            continue
        raise ZCItoolsValueError(
            f"Input MapMaker file {mf} doesn't exist!")
    for qf in tmp_files:
        if os.path.isfile(os.path.join(data_dir, qf)):
            continue
        raise ZCItoolsValueError(
            f"Input Windows QTL Cartographer file {qf} doesn't exist!")

    step = QTLCartStep(project, step_data, remove_data=True)
    step.set_data(num_traits, permutations)

    # Copy input files into the step
    files_to_zip = []
    for qf in tmp_files:
        copied_to = step.step_file(qf)
        files_to_zip.append(copied_to)
        copy_file(os.path.join(data_dir, qf), copied_to)

    # Create trait directories
    # ToDo: find max traits and fix it/set default
    assert num_traits and num_traits > 0, num_traits
    trait_dirs = []
    for t_idx in range(1, num_traits + 1):
        trait_name = step.trait_dir(t_idx)
        trait_dirs.append(trait_name)
        t_dir = step.step_file(trait_name)
        ensure_directory(t_dir)

        rc_file = os.path.join(t_dir, 'qtlcart.rc')
        files_to_zip.append(rc_file)
        rc_content = _qtlcart_rc.format(trait=t_idx, num_traits=num_traits)
        write_str_in_file(rc_file, rc_content)

    finish_f = step.step_file('finish.yml')
    files_to_zip.append(finish_f)
    write_yaml({'permutations': permutations, 'trait_dirs': trait_dirs},
               finish_f)

    # Stores description.yml
    step.save(completed=run)

    # Run or set instructions
    if not run:
        set_run_instructions(run_qtl_cart_perm, step, files_to_zip,
                             _instructions)
    else:
        run_module_script(run_qtl_cart_perm, step)
    return step

Пример #12

0

Показать файл

def create_circos_correlation(project, step_data, params):
    """Create Circos input files for a correlation matrix and run circos.

    The matrix is read from params.input_filename. Into a freshly created
    ImagesStep these files are written: 'data/karyotype.txt',
    'data/tiles.txt', 'data/links.txt' and 'etc/circos.conf'. Then the
    'circos' executable is run in the step directory; its return code is
    not checked. If params.show_image is set and an 'image_viewer' setting
    exists, the viewer is started on 'circos.png' without waiting for it.

    Raises ZCItoolsValueError if there is no input data or the matrix has
    fewer than 2 columns. Nothing is returned explicitly.
    """
    # Read correlation data
    cm = None
    if params.input_filename:
        cm = CorrelationMatrix.from_file(params.input_filename)

    if not cm:
        raise ZCItoolsValueError('No correlation input data!')
    num_c = cm.num_columns()
    if num_c < 2:
        raise ZCItoolsValueError('Not much of a matrix!')

    step = ImagesStep(project, step_data, remove_data=True)
    one_width = params.one_width
    gap_correlations = params.gap_correlations
    ow_2 = one_width // 2  # half of one slot, used as the slot's centre offset
    one_plus_gap = one_width + gap_correlations  # distance between slot starts

    # Note: column lowercase names are used as column identifiers
    data_dir = step.step_file('data')
    etc_dir = step.step_file('etc')
    ensure_directory(data_dir)
    ensure_directory(etc_dir)

    # Colour per column (default green) plus colours of positive/negative
    # correlations. params.group_color items are '<column>,<color>' strings.
    colors = dict(
        (lc, 'green') for lc in cm._columns_lower)  # ToDo: some defaults
    colors['plus_'] = 'blue'
    colors['minus_'] = 'red'
    for col_def in params.group_color:
        col_fields = col_def.split(',', 1)
        if len(col_fields) == 2 and cm.check_column(col_fields[0]):
            colors[cm.check_column(col_fields[0])] = col_fields[1]
        else:
            print(f"Warning: '{col_def}' is not column color definition!")

    # data directory
    # karyotype.txt: defines groups (as chromosomes)
    # chr - <name> <label> <start> <end> <color>
    # ...
    # A group holds one slot (of width one_width) for each of the other
    # num_c - 1 columns, with a gap between neighbouring slots.
    gl = (num_c - 1) * one_width + (num_c -
                                    2) * gap_correlations  # group length
    write_str_in_file(
        os.path.join(data_dir, 'karyotype.txt'),
        '\n'.join(f"chr - {lc} {c} 0 {gl} color_{lc}"
                  for lc, c in zip(cm._columns_lower, cm._columns)))

    # tiles.txt: defines abs(correlation) == 1 interval, as tiles
    # <name> <start> <end> [options]
    # One tile is written for every ordered pair of different columns: it
    # lies on group c1 and is filled with the colour of c2.
    with open(os.path.join(data_dir, 'tiles.txt'), 'w') as out:
        for idx1, c1 in enumerate(cm._columns_lower):
            for idx2, c2 in enumerate(cm._columns_lower):
                if idx1 == idx2:
                    continue
                # Slot index of c2 inside group c1, in range 0 .. num_c - 2
                pos = (idx1 - idx2 - 1) if idx1 > idx2 else (idx1 - idx2 +
                                                             (num_c - 1))
                start = pos * one_plus_gap
                out.write(
                    f"{c1} {start} {start + one_width} fill_color=color_{c2}\n"
                )

    # links.txt: defines correlations as links, two lines per link
    # <cell_idx> <group_1> <start_1> <end_1> color=color_{plus|minus}_,dist={int}
    # <cell_idx> <group_2> <start_2> <end_2> color=color_{plus|minus}_,dist={int}
    # ...
    with open(os.path.join(data_dir, 'links.txt'), 'w') as out:
        cell_idx = 0
        for idx1, c1 in enumerate(cm._columns_lower):
            # Only columns after c1: each unordered pair is visited once
            rest_c = cm._columns_lower[idx1 + 1:]
            for idx2, c2 in enumerate(rest_c):
                corr = cm.get(c1, c2)
                if corr is not None:
                    # Link width is proportional to abs(correlation)
                    w = round(abs(corr) * one_width)
                    w_1 = w // 2
                    w_2 = w - w_1  # - 1?
                    centar = ow_2 + idx2 * one_plus_gap
                    color = 'plus_' if corr >= 0 else 'minus_'
                    # Smaller of the two index distances between c1 and c2
                    # when the columns are laid out on a circle (assuming
                    # cm._columns_lower has num_c items)
                    dist = min(idx2 + 1, idx1 + (len(rest_c) - idx2))
                    atts = f"color=color_{color},dist={dist}"
                    # End on c1 is mirrored with respect to the group length
                    out.write(
                        f"cell_{cell_idx} {c1} {gl - centar - w_2} {gl - centar + w_1} {atts}\n"
                    )
                    out.write(
                        f"cell_{cell_idx} {c2} {centar - w_1} {centar + w_2} {atts}\n"
                    )
                    cell_idx += 1

    # etc directory
    write_str_in_file(
        os.path.join(etc_dir, 'circos.conf'),
        _circos_conf.format(colors='\n'.join(f"color_{lc} = {c}"
                                             for lc, c in colors.items())))

    # The conf path is relative to the step directory, which is used as cwd
    subprocess.run(['circos', '-conf', 'etc/circos.conf'], cwd=step.directory)

    # View it
    if params.show_image:
        image_viewer = get_settings().get('image_viewer')
        if image_viewer:
            subprocess.Popen([image_viewer, step.step_file('circos.png')])

Пример #13

0

Показать файл

            ]
            rows.append(row)

            taxid = ncbi_2_taxid[seq_ident]
            search_in = set(all_taxids)
            search_in.discard(taxid)
            close_taxids = ncbi_tax.find_close_taxids(
                taxid, ncbi_2_max_taxid[seq_ident], search_in)
            if not close_taxids:
                print(
                    f"Warning: sequence {seq_ident} doesn't have close relative in accession set!"
                )
                continue

            f_dir = step.step_file('repair_ns', seq_ident)
            ensure_directory(f_dir)
            # seq_fasta = os.path.join(f_dir, f"{seq_ident}.fa")
            # write_fasta(seq_fasta, [(seq_ident, seq_data['_seq'].seq)])

            # executor.submit(_run_manage_ns, seq_ident, sequences, ns, f_dir, [taxid_2_ncbi[t] for t in close_taxids])
            _run_manage_ns(seq_ident, sequences, ns, f_dir,
                           [taxid_2_ncbi[t] for t in close_taxids])
        #
        columns = [('seq_ident', 'seq_ident'), ('length', 'int'),
                   ('num_ns_parts', 'int'), ('ns_length', 'int'),
                   ('close_seq_idents', 'str'), ('fix', 'str')]
        step.set_table_data(rows, columns)
    else:
        step.set_columns([('seq_ident', 'seq_ident')])  # Dummy table

    step.save()

Пример #14

0

Показать файл

def create_irs_data(step_data, input_step, params):
    """Create an Annotations step with IRs found by MUMmer and MAFFT.

    The step's subdirectory 'run_dir' contains input and output
    calculation files. The work is done in two phases:

    1. For every sequence without a stored MUMmer result, repeat-match is
       run (in a thread pool) and the repeat read from its output is stored
       in mummer_results as (length, start_1, start_2).
    2. For every sequence whose repeat is shorter than 23000, and which
       does not have both alignment results ('<seq_ident>_right_align.fa'
       and '<seq_ident>_left_align.fa') yet, the pairs of subsequences on
       the right and on the left side of the repeat are written as
       alignment inputs ('<seq_ident>_right.fa', '<seq_ident>_left.fa').

    Raises ZCItoolsValueError if no repeat is found for a sequence.
    Returns the step.
    """
    SeqIO = import_bio_seq_io()
    seq_idents = input_step.all_sequences()

    step = input_step.project.new_step(AnnotationsStep, step_data)
    step.set_sequences(seq_idents)
    # seq_ident -> mummer data ([length, start_1, start_2])
    mummer_results = step.get_type_description_elem('mummer_results', default=dict())
    #
    ensure_directory(step.step_file('run_dir'))
    calc_mummer = []  # tuples (seq_ident, fasta file, mummer output file)

    # Mummer
    for seq_ident in sorted(seq_idents - set(mummer_results)):
        fa_file = step.step_file('run_dir', f'{seq_ident}.fa')
        mummer_res_file = step.step_file('run_dir', f'{seq_ident}.out')
        if not os.path.isfile(fa_file):
            seq_rec = input_step.get_sequence_record(seq_ident)
            SeqIO.write([seq_rec], fa_file, 'fasta')
            calc_mummer.append((seq_ident, fa_file, mummer_res_file))
        elif not os.path.isfile(mummer_res_file):
            calc_mummer.append((seq_ident, fa_file, mummer_res_file))

    # Run mummer
    if calc_mummer:
        mummer_exe = 'repeat-match'  # ToDo:
        n = 3000
        threads = multiprocessing.cpu_count()
        # Leaving the with block waits for all submitted runs to finish
        with ThreadPoolExecutor(max_workers=threads) as executor:
            for seq_ident, fa_file, mummer_res_file in calc_mummer:
                executor.submit(_run_single, mummer_exe, n, fa_file, mummer_res_file)

        for seq_ident, _, mummer_res_file in calc_mummer:
            rep = _read_mummer_repeat(mummer_res_file)
            if not rep:
                raise ZCItoolsValueError(f'No repeat for sequence {seq_ident}!')
            mummer_results[seq_ident] = rep

    # Find sequences extend with alignment
    files_to_zip = []
    calc_mafft = []
    for seq_ident in sorted(seq_idents):
        length, s1, s2 = mummer_results[seq_ident]
        if length >= 23000:
            continue

        # Skip only when BOTH side alignments exist. Testing the right one
        # twice would skip sequences whose left alignment is still missing.
        if step.is_file('run_dir', f'{seq_ident}_right_align.fa') and \
           step.is_file('run_dir', f'{seq_ident}_left_align.fa'):
            continue

        #
        calc_mafft.append(seq_ident)
        _seq = input_step.get_sequence_record(seq_ident).seq
        seq = str(_seq)
        comp_seq = str(_seq.complement())
        missing = 26000 - length

        # Right side
        p1 = _extract_subseq_plus(seq, s1 + length, missing)
        p2 = _extract_subseq_minus(comp_seq, s2 - length, missing)
        assert len(p1) == len(p2), (length, s1, s2, missing, (len(p1), len(p2)))
        files_to_zip.append(step.step_file('run_dir', f'{seq_ident}_right.fa'))
        write_fasta(files_to_zip[-1], [('p1', p1), ('p2', p2)])

        # Left side
        p1 = _extract_subseq_minus(comp_seq, s1 - 1, missing)
        p2 = _extract_subseq_plus(seq, s2 + 1, missing)
        assert len(p1) == len(p2), (length, s1, s2, missing, (len(p1), len(p2)))
        files_to_zip.append(step.step_file('run_dir', f'{seq_ident}_left.fa'))
        write_fasta(files_to_zip[-1], [('p1', p1), ('p2', p2)])

    # Mafft
    if calc_mafft:
        finish_f = step.step_file('finish.yml')
        write_yaml(dict(calc_seq_idents=calc_mafft), finish_f)

        run = True  # ToDo: ...
        step.save(additional_data=dict(mummer_results=mummer_results), completed=False)
        if run:
            run_module_script(run_mafft_irs, step)
            finish_irs_data(step)
        else:
            files_to_zip.append(finish_f)
            set_run_instructions(run_mafft_irs, step, files_to_zip, _instructions)
    #
    elif params.force_parse:
        finish_irs_data(step)

    return step

Пример #15

0

Показать файл

def orientate_chloroplast_start(step_data, annotation_step, params):
    """Prepare alignment inputs used to check chloroplast part orientation.

    For every sequence the gene-strand counts and the lengths of its parts
    (lsc, ssc, ira) are stored in sequence_data. For every part name in
    _part_names and every version ('plus', 'minus' and, if
    params.complement is set, 'plus_c', 'minus_c') whose result file
    'align_<part>_<version>.fa' is missing, an input alignment is
    registered; the input files pair the first params.length_to_check bases
    of the referent part with a same-sized slice of the sequence's part.

    Then the calculation is run and finished, or, when nothing is missing,
    results are only parsed again if params.force_parse is set.
    Returns the step.
    """
    # Find referent genome
    # For each sequence, different than referent, directory is created named <seq_ident>.
    # It contains files:
    #  - {lsc|ira|ss}_{plus|minus}.fa       : input alignment files, contain 2 sequences.
    #  - align_{lsc|ira|ss}_{plus|minus}.fa : result alignment files.
    seq_idents = annotation_step.all_sequences()  # set
    ref_ident = find_referent_genome(seq_idents, params.referent_genome)
    #
    length = params.length_to_check
    step = annotation_step.project.new_step(ChloroplastOrientateStep, step_data, remove_data=False)
    sequence_data = step.get_type_description_elem('sequence_data', default=dict())
    #
    seq_rec = annotation_step.get_sequence_record(ref_ident)
    partition = find_chloroplast_partition(seq_rec)
    # First `length` bases of each referent part, in _part_names order
    ref_parts = [str(partition.get_part_by_name(n).extract(seq_rec).seq)[:length] for n in _part_names]
    files_to_zip = []
    align_files = []  # tuples (seq_ident, part name, version)

    #
    all_versions = ('plus', 'minus', 'plus_c', 'minus_c') if params.complement else ('plus', 'minus')
    for seq_ident in sorted(seq_idents):
        # Sequence record is loaded lazily: only if data or files are missing
        seq_rec = None
        if seq_ident not in sequence_data:
            seq_rec = annotation_step.get_sequence_record(seq_ident)
            partition = find_chloroplast_partition(seq_rec)

            # Count gene orientation
            l_seq = len(seq_rec)
            in_parts = partition.put_features_in_parts(
                Feature(l_seq, feature=f) for f in seq_rec.features if f.type == 'gene')

            # Strand sums: LSC counts only genes with 'rpl' or 'rps' in the
            # name, IRa only genes with 'rrn' in the name, SSC all genes
            lsc_count = sum(f.feature.strand if any(x in f.name for x in ('rpl', 'rps')) else 0
                            for f in in_parts.get('lsc', []))
            ssc_count = sum(f.feature.strand for f in in_parts.get('ssc', []))
            ira_count = sum(f.feature.strand if 'rrn' in f.name else 0 for f in in_parts.get('ira', []))

            sequence_data[seq_ident] = dict(
                length=len(seq_rec),
                lsc=(lsc_count <= 0), lsc_count=lsc_count, lsc_length=len(partition.get_part_by_name('lsc')),
                ssc=(ssc_count <= 0), ssc_count=ssc_count, ssc_length=len(partition.get_part_by_name('ssc')),
                ira=(ira_count >= 0), ira_count=ira_count, ira_length=len(partition.get_part_by_name('ira')))

        # All result files exist for this sequence: nothing to prepare
        if all(all(step.is_file(seq_ident, f'align_{n}_{v}.fa') for v in all_versions) for n in _part_names):
            continue
        #
        if seq_rec is None:
            seq_rec = annotation_step.get_sequence_record(seq_ident)
            partition = find_chloroplast_partition(seq_rec)
        for n, ref_p in zip(_part_names, ref_parts):
            # Find missing output files
            _num = len(align_files)
            for x in all_versions:
                if not step.is_file(seq_ident, f'align_{n}_{x}.fa'):
                    files_to_zip.append(step.step_file(seq_ident, f'{n}_{x}.fa'))
                    align_files.append((seq_ident, n, x))
            # Nothing was added, so no result of this part is missing
            if _num == len(align_files):
                continue

            # Store input files
            # NOTE(review): this check looks redundant, since at least one
            # result file was found missing just above - confirm.
            if all(step.is_file(seq_ident, f'align_{n}_{v}.fa') for v in all_versions):
                continue
            ensure_directory(step.step_file(seq_ident))
            part_s = partition.get_part_by_name(n).extract(seq_rec)

            # NOTE(review): all four input files are written if missing,
            # the '_c' ones also when params.complement is not set - confirm.
            f_p = step.step_file(seq_ident, f'{n}_plus.fa')
            f_p_c = step.step_file(seq_ident, f'{n}_plus_c.fa')
            if not os.path.isfile(f_p):
                write_fasta(f_p, [(ref_ident, ref_p), (seq_ident, str(part_s.seq)[:length])])
            if not os.path.isfile(f_p_c):
                # [:(-length-1):-1] takes the last `length` characters in reversed order
                write_fasta(f_p_c, [(ref_ident, ref_p),
                                    (seq_ident, str(part_s.reverse_complement().seq)[:(-length-1):-1])])

            f_m = step.step_file(seq_ident, f'{n}_minus.fa')
            f_m_c = step.step_file(seq_ident, f'{n}_minus_c.fa')
            if not os.path.isfile(f_m):
                write_fasta(f_m, [(ref_ident, ref_p), (seq_ident, str(part_s.reverse_complement().seq)[:length])])
            if not os.path.isfile(f_m_c):
                write_fasta(f_m_c, [(ref_ident, ref_p), (seq_ident, str(part_s.seq)[:(-length-1):-1])])

    #
    output_file = f"{params.output_file_prefix}_{length}{'_c' if params.complement else ''}.xlsx"
    data = dict(sequence_data=sequence_data, check_length=length, output_file=output_file, complement=params.complement)
    if align_files:
        # Store finish.yml
        finish_f = step.step_file('finish.yml')
        write_yaml(dict(align_files=align_files), finish_f)

        run = True  # ToDo: ...
        step.save(data, completed=False)
        if run:
            run_module_script(run_orientate, step)
            orientate_chloroplast_finish(step)  # , common_db, calc_seq_idents=calc_seq_idents)
        else:
            files_to_zip.append(finish_f)
            set_run_instructions(run_orientate, step, files_to_zip, _instructions)
    #
    elif params.force_parse:
        step.save(data)
        orientate_chloroplast_finish(step)  # , common_db, calc_seq_idents=calc_seq_idents)
    #
    else:
        step.save(data, completed=False)

    return step

Python ensure_directory примеры использования