Exemplo n.º 1
0
# -*- coding: utf-8 -*-

import sys
import os
import errno
import time

# global configuration
import vprimer.glv as glv
import vprimer.utils as utl

from vprimer.logging_config import LogConf
log = LogConf.open_log(__name__)

import vcfpy

from vprimer.allele_select import AlleleSelect

class Variant(object):
    """Holder for the variant-picking step.

    NOTE(review): only the beginning of ``pick_variant`` is visible in this
    excerpt, so the rest of the class contract cannot be described from here.
    """

    def __init__(self):
        # No per-instance state is created.
        pass


    def pick_variant(self):
        """Open the VCF file named by ``glv.conf.vcf_file`` with vcfpy.

        NOTE(review): the visible body ends right after the reader is
        created; whatever consumes ``reader`` is outside this excerpt.
        """

        # open vcf through vcfpy
        reader = vcfpy.Reader.from_path(glv.conf.vcf_file)
Exemplo n.º 2
0
    def open_log(self):
        """Rebind the module-level ``log`` to a logger opened for this module."""

        # ``log`` is a module global, so the rebinding is visible to every
        # function in this module, not only to this instance.
        global log
        log = LogConf.open_log(__name__)
Exemplo n.º 3
0
    def open_log_enzyme(self):
        """Rebind the module-level ``log`` and install the built-in enzyme table.

        ``self.default_enzyme_names`` becomes a list of rows, each row a list
        of strings. Data rows hold four fields:
        ``[name, recognition sequence, marked sequence, length]`` where the
        fourth field is the length of the second one, written as a string.
        NOTE(review): the ``^`` and ``_`` marks in the third field presumably
        give the cut positions on the two strands -- confirm against the code
        that parses this table.
        """

        # in glv
        global log
        log = LogConf.open_log(__name__)

        # Rows holding a single '# X' string act as alphabetical headings and
        # [''] rows as blank separators; presumably the table mirrors the
        # lines of an enzyme list file -- TODO confirm with the consumer.
        self.default_enzyme_names = [
            ['# A'],
            ['AluI', 'AGCT', 'AG^_CT', '4'],
            ['ApaI', 'GGGCCC', 'G_GGCC^C', '6'],
            ['AscI', 'GGCGCGCC', 'GG^CGCG_CC', '8'],
            ['AvrII', 'CCTAGG', 'C^CTAG_G', '6'],
            [''],
            ['# B'],
            ['BamHI', 'GGATCC', 'G^GATC_C', '6'],
            ['BbsI', 'GAAGAC', 'GAAGACNN^NNNN_N', '6'],
            ['BclI', 'TGATCA', 'T^GATC_A', '6'],
            ['BglII', 'AGATCT', 'A^GATC_T', '6'],
            ['BsaI', 'GGTCTC', 'GGTCTCN^NNNN_N', '6'],
            ['BsiWI', 'CGTACG', 'C^GTAC_G', '6'],
            ['BsmFI', 'GGGAC', 'GGGACNNNNNNNNNN^NNNN_N', '5'],
            ['BspHI', 'TCATGA', 'T^CATG_A', '6'],
            ['BssHII', 'GCGCGC', 'G^CGCG_C', '6'],
            ['Bst1107I', 'GTATAC', 'GTA^_TAC', '6'],
            ['BstBI', 'TTCGAA', 'TT^CG_AA', '6'],
            ['BstEII', 'GGTNACC', 'G^GTNAC_C', '7'],
            ['BstXI', 'CCANNNNNNTGG', 'CCAN_NNNN^NTGG', '12'],
            [''],
            ['# C'],
            ['ClaI', 'ATCGAT', 'AT^CG_AT', '6'],
            [''],
            ['# D'],
            ['DdeI', 'CTNAG', 'C^TNA_G', '5'],
            ['DpnI', 'GATC', 'GA^_TC', '4'],
            ['DraI', 'TTTAAA', 'TTT^_AAA', '6'],
            ['DraIII', 'CACNNNGTG', 'CAC_NNN^GTG', '9'],
            [''],
            ['# E'],
            ['Eco52I', 'CGGCCG', 'C^GGCC_G', '6'],
            ['EcoO109I', 'RGGNCCY', 'RG^GNC_CY', '7'],
            ['EcoO65I', 'GGTNACC', 'G^GTNAC_C', '7'],
            ['EcoRI', 'GAATTC', 'G^AATT_C', '6'],
            ['EcoRV', 'GATATC', 'GAT^_ATC', '6'],
            ['EcoT14I', 'CCWWGG', 'C^CWWG_G', '6'],
            [''],
            ['# F'],
            ['FseI', 'GGCCGGCC', 'GG_CCGG^CC', '8'],
            [''],
            ['# H'],
            ['HaeII', 'RGCGCY', 'R_GCGC^Y', '6'],
            ['HincII', 'GTYRAC', 'GTY^_RAC', '6'],
            ['HindIII', 'AAGCTT', 'A^AGCT_T', '6'],
            ['HinfI', 'GANTC', 'G^ANT_C', '5'],
            ['HpaI', 'GTTAAC', 'GTT^_AAC', '6'],
            ['HphI', 'GGTGA', 'GGTGANNNNNNN_N^N', '5'],
            [''],
            ['# K'],
            ['KpnI', 'GGTACC', 'G_GTAC^C', '6'],
            [''],
            ['# M'],
            ['MluI', 'ACGCGT', 'A^CGCG_T', '6'],
            ['MseI', 'TTAA', 'T^TA_A', '4'],
            [''],
            ['# N'],
            ['NcoI', 'CCATGG', 'C^CATG_G', '6'],
            ['NdeI', 'CATATG', 'CA^TA_TG', '6'],
            ['NheI', 'GCTAGC', 'G^CTAG_C', '6'],
            ['NlaIII', 'CATG', '_CATG^', '4'],
            ['NotI', 'GCGGCCGC', 'GC^GGCC_GC', '8'],
            ['NruI', 'TCGCGA', 'TCG^_CGA', '6'],
            ['NsiI', 'ATGCAT', 'A_TGCA^T', '6'],
            [''],
            ['# P'],
            ['PacI', 'TTAATTAA', 'TTA_AT^TAA', '8'],
            ['PmeI', 'GTTTAAAC', 'GTTT^_AAAC', '8'],
            ['PmlI', 'CACGTG', 'CAC^_GTG', '6'],
            ['Psp1406I', 'AACGTT', 'AA^CG_TT', '6'],
            ['PstI', 'CTGCAG', 'C_TGCA^G', '6'],
            ['PvuII', 'CAGCTG', 'CAG^_CTG', '6'],
            [''],
            ['# R'],
            ['RsaI', 'GTAC', 'GT^_AC', '4'],
            [''],
            ['# S'],
            ['SacI', 'GAGCTC', 'G_AGCT^C', '6'],
            ['SacII', 'CCGCGG', 'CC_GC^GG', '6'],
            ['SalI', 'GTCGAC', 'G^TCGA_C', '6'],
            ['SapI', 'GCTCTTC', 'GCTCTTCN^NNN_N', '7'],
            ['SbfI', 'CCTGCAGG', 'CC_TGCA^GG', '8'],
            ['ScaI', 'AGTACT', 'AGT^_ACT', '6'],
            ['SfiI', 'GGCCNNNNNGGCC', 'GGCCN_NNN^NGGCC', '13'],
            ['SmaI', 'CCCGGG', 'CCC^_GGG', '6'],
            ['SnaBI', 'TACGTA', 'TAC^_GTA', '6'],
            ['SpeI', 'ACTAGT', 'A^CTAG_T', '6'],
            ['SphI', 'GCATGC', 'G_CATG^C', '6'],
            ['SspI', 'AATATT', 'AAT^_ATT', '6'],
            ['StuI', 'AGGCCT', 'AGG^_CCT', '6'],
            ['SwaI', 'ATTTAAAT', 'ATTT^_AAAT', '8'],
            [''],
            ['# T'],
            ['TaqI', 'TCGA', 'T^CG_A', '4'],
            ['Tth111I', 'GACNNNGTC', 'GACN^N_NGTC', '9'],
            [''],
            ['# X'],
            ['XbaI', 'TCTAGA', 'T^CTAG_A', '6'],
            ['XhoI', 'CTCGAG', 'C^TCGA_G', '6'],
            ['XmaI', 'CCCGGG', 'C^CCGG_G', '6'],
            [''],
            ['# end'],
        ]
Exemplo n.º 4
0
    def __init__(self):
        """Create every configuration attribute with a placeholder value.

        Numbers start at 0, strings at '' (the two yes/no switches at 'no'),
        containers empty. The only work done here is creating the ini parser,
        creating the ``LogConf`` object and recording the current directory.
        """
        self.init_version = 0.0

        # ini file name, its resolved path, and the parser that will read it
        self.ini_file = self.ini_file_path = ''
        self.ini = configparser.ConfigParser()
        # keep option names exactly as written (no lower-casing)
        self.ini.optionxform = str

        # logging front end
        self.log = LogConf()

        # thread / parallelism bookkeeping
        self.thread = 0
        self.parallel_blast_cnt = self.parallele_full_thread = 0
        self.blast_num_threads = 0
        self.blast_word_size = 0
        self.use_joblib_threading = 'no'
        self.parallel = False

        # indel length and product size limits
        self.min_indel_len = self.max_indel_len = 0
        self.min_product_size = self.max_product_size = 0

        # regions, distinguish groups and group members
        self.regions_dict = {}
        self.distin_g_list = []
        self.g_members_dict = {}

        self.cwd = os.getcwd()

        # directories
        self.ref_dir = self.out_dir = ''
        self.out_bak_dir = self.log_dir = ''

        self.pick_mode = self.progress = ''
        self.stop = 'no'

        self.ref = self.vcf = ''

        self.fragment_pad_len = 0

        # primer3 settings are kept as strings
        for primer3_key in (
                'PRIMER_THERMODYNAMIC_PARAMETERS_PATH',
                'PRIMER_PRODUCT_SIZE_RANGE',
                'PRIMER_NUM_RETURN',
                'PRIMER_MIN_SIZE',
                'PRIMER_OPT_SIZE',
                'PRIMER_MAX_SIZE',
                'PRIMER_MIN_GC',
                'PRIMER_OPT_GC',
                'PRIMER_MAX_GC',
                'PRIMER_MIN_TM',
                'PRIMER_OPT_TM',
                'PRIMER_MAX_TM',
                'PRIMER_MAX_POLY_X',
                'PRIMER_PAIR_MAX_DIFF_TM'):
            setattr(self, primer3_key, '')

        self.alternate_distance = 0

        # reference fasta related paths
        self.ref_fasta = self.ref_fasta_chrom_list = ''
        self.ref_fasta_fai = self.ref_fasta_pickle = ''
        self.ref_fasta_slink_system = self.ref_fasta_user = ''

        # vcf related paths
        self.vcf_file_user = self.vcf_file_slink_system = ''
        self.vcf_file = ''

        # sample name lookup tables (each one needs its own dict)
        self.vcf_basename_to_fullname = {}
        self.vcf_nickname_to_fullname = {}
        self.nickname_to_basename = {}
        self.basename_to_nickname = {}

        # enzyme files
        self.enzyme_files_list = []
        self.enzyme_files_str = ''

        self.blastdb_title = self.blastdb = ''
Exemplo n.º 5
0
class Conf(object):
    def __init__(self):
        """Create every configuration attribute with a placeholder value.

        Numbers start at 0, strings at '' (the two yes/no switches at 'no'),
        containers empty. The real values are filled in later by
        ``read_ini``. The only work done here is creating the ini parser,
        creating the ``LogConf`` object and recording the current directory.
        """
        self.init_version = 0.0

        # ini file name, its resolved path, and the parser that will read it
        self.ini_file = self.ini_file_path = ''
        self.ini = configparser.ConfigParser()
        # keep option names exactly as written (no lower-casing)
        self.ini.optionxform = str

        # logging front end
        self.log = LogConf()

        # thread / parallelism bookkeeping
        self.thread = 0
        self.parallel_blast_cnt = self.parallele_full_thread = 0
        self.blast_num_threads = 0
        self.blast_word_size = 0
        self.use_joblib_threading = 'no'
        self.parallel = False

        # indel length and product size limits
        self.min_indel_len = self.max_indel_len = 0
        self.min_product_size = self.max_product_size = 0

        # regions, distinguish groups and group members
        self.regions_dict = {}
        self.distin_g_list = []
        self.g_members_dict = {}

        self.cwd = os.getcwd()

        # directories
        self.ref_dir = self.out_dir = ''
        self.out_bak_dir = self.log_dir = ''

        self.pick_mode = self.progress = ''
        self.stop = 'no'

        self.ref = self.vcf = ''

        self.fragment_pad_len = 0

        # primer3 settings are kept as strings
        for primer3_key in (
                'PRIMER_THERMODYNAMIC_PARAMETERS_PATH',
                'PRIMER_PRODUCT_SIZE_RANGE',
                'PRIMER_NUM_RETURN',
                'PRIMER_MIN_SIZE',
                'PRIMER_OPT_SIZE',
                'PRIMER_MAX_SIZE',
                'PRIMER_MIN_GC',
                'PRIMER_OPT_GC',
                'PRIMER_MAX_GC',
                'PRIMER_MIN_TM',
                'PRIMER_OPT_TM',
                'PRIMER_MAX_TM',
                'PRIMER_MAX_POLY_X',
                'PRIMER_PAIR_MAX_DIFF_TM'):
            setattr(self, primer3_key, '')

        self.alternate_distance = 0

        # ------------------------
        # reference fasta related paths
        self.ref_fasta = self.ref_fasta_chrom_list = ''
        self.ref_fasta_fai = self.ref_fasta_pickle = ''
        self.ref_fasta_slink_system = self.ref_fasta_user = ''

        # vcf related paths
        self.vcf_file_user = self.vcf_file_slink_system = ''
        self.vcf_file = ''

        # sample name lookup tables (each one needs its own dict)
        self.vcf_basename_to_fullname = {}
        self.vcf_nickname_to_fullname = {}
        self.nickname_to_basename = {}
        self.basename_to_nickname = {}

        # enzyme files
        self.enzyme_files_list = []
        self.enzyme_files_str = ''

        self.blastdb_title = self.blastdb = ''
        # ------------------------

    def read_ini(self):
        """Read the ini file given on the command line and build the config.

        The ini file name comes from ``glv.param.p.config``. A relative name
        is resolved against the directory recorded in ``self.cwd``; an
        absolute name is used as is. After parsing, the values are
        normalised, copied into attributes, merged with the command line
        parameters and logged.

        The process exits with status 1 when the ini file does not exist, and
        also right after printing the sample names when
        ``glv.param.p.vcf_samples`` is set.

        Returns:
            This object, so the call can be chained.
        """
        # ini_file from param.p['config'] absolute or relative path
        self.ini_file = glv.param.p.config
        print("ini_file = {}".format(self.ini_file))

        # os.path.join drops self.cwd when ini_file is already absolute, so
        # both forms promised above work. str() keeps a missing (None) name
        # on the "Not found" path below instead of raising TypeError.
        self.ini_file_path = os.path.join(self.cwd, str(self.ini_file))
        print("self.ini_file_path = {}".format(self.ini_file_path))

        if not os.path.exists(self.ini_file_path):
            print("Not found {}, exit.".format(self.ini_file_path))
            sys.exit(1)

        print("found {}".format(self.ini_file_path))

        # https://docs.python.org/ja/3/library/configparser.html
        # The file is only needed for parsing, so it is closed before the
        # rest of the set-up runs.
        with open(self.ini_file_path, encoding='utf-8') as fp:
            self.ini.read_file(fp)

        # adjustment of variable format
        self._rectify_variable()
        # into_variable
        self._ini_into_variable()

        # if want to know vcf: print the sample names and stop
        if (glv.param.p.vcf_samples == True):
            sample_names = utl.get_vcf_sample_name_list(
                self.vcf_file_user)
            print('{}\n'.format('\n'.join(sample_names)))
            sys.exit(1)

        # several path
        self._set_path_and_all_start()

        # thread
        self._thread_adjusting()

        # nickname
        self._get_nickname()
        # regions
        self._get_regions()
        # distin_g
        self._get_distinguish_groups()
        # g_members
        self._get_members()

        # merge
        self._merge_conf()
        # log all conf
        self._print_conf()

        return self

    def _print_conf(self):
        """Log every adopted configuration value, grouped by ini section.

        Each value is written as ``name=value`` under a ``[section]`` header,
        with one empty log line before and after the whole listing.
        """
        # (section header, attribute names) in the order they are logged
        layout = (
            ('global', (
                'init_version',
                'thread',
                'use_joblib_threading',
                'parallel',
                'ref',
                'vcf',
                'min_indel_len',
                'max_indel_len',
                'min_product_size',
                'max_product_size',
                'ref_dir',
                'log_dir',
                'out_dir',
                'out_bak_dir',
                'pick_mode',
                'progress',
            )),
            # the header used to be logged as "[vprimer']" (stray quote)
            ('vprimer', (
                'fragment_pad_len',
            )),
            ('caps', (
                'enzyme_files_str',
            )),
            ('primer3', (
                'PRIMER_THERMODYNAMIC_PARAMETERS_PATH',
                'PRIMER_PRODUCT_SIZE_RANGE',
                'PRIMER_NUM_RETURN',
                'PRIMER_MIN_SIZE',
                'PRIMER_OPT_SIZE',
                'PRIMER_MAX_SIZE',
                'PRIMER_MIN_GC',
                'PRIMER_OPT_GC',
                'PRIMER_MAX_GC',
                'PRIMER_MIN_TM',
                'PRIMER_OPT_TM',
                'PRIMER_MAX_TM',
                'PRIMER_MAX_POLY_X',
                'PRIMER_PAIR_MAX_DIFF_TM',
            )),
            ('blast', (
                'alternate_distance',
                'blast_word_size',
            )),
        )

        log.info("")
        log.info("Adopted configurations:")

        for section, names in layout:
            log.info("[{}]".format(section))
            for name in names:
                log.info("{}={}".format(name, getattr(self, name)))

        log.info("")

    def _ini_into_variable(self):

        sect = 'global'

        self.init_version = \
            float(self._set_default(sect, 'init_version', 1.0))

        self.use_joblib_threading = \
             str(self._set_default(sect, 'use_joblib_threading', 'yes'))

        self.parallel = \
            self._conv_joblib(self.use_joblib_threading)

        #log.info("{} {} {}".format(
        #    self.ini[sect]['use_joblib_threading'],
        #    self.use_joblib_threading,
        #    self.parallel))

        self.thread = \
            int(self._set_default(sect, 'thread', 2))

        self.ref = \
            str(self._set_default(sect, 'ref', ''))

        # user's fasta: convert relative path to absolute path based on cwd
        if self.ref.startswith('/'):
            # originally absolute path
            self.ref_fasta_user = self.ref
        else:
            # cwd + relative path
            self.ref_fasta_user = "******".format(self.cwd, self.ref)

        self.vcf = \
            str(self._set_default(sect, 'vcf', ''))

        # user's vcf: convert relative path to absolute path based on cwd
        if self.vcf.startswith('/'):
            # originally absolute path
            self.vcf_file_user = self.vcf
        else:
            # cwd + relative path
            self.vcf_file_user = "******".format(self.cwd, self.vcf)

        self.min_indel_len = \
            int(self._set_default(sect, 'min_indel_len', 50))
        self.max_indel_len = \
            int(self._set_default(sect, 'max_indel_len', 200))
        self.min_product_size = \
            int(self._set_default(sect, 'min_product_size', 200))
        self.max_product_size = \
            int(self._set_default(sect, 'max_product_size', 500))

        self.ref_dir = \
            str(self._set_default(sect, 'ref_dir', 'refs'))
        self.log_dir = \
            str(self._set_default(sect, 'log_dir', 'logs'))
        self.out_dir = \
            str(self._set_default(sect, 'out_dir', 'out_dir'))
        self.out_bak_dir = ''

        self.pick_mode = \
            str(self._set_default(sect, 'pick_mode', 'all'))
        self.progress = \
            str(self._set_default(sect, 'progress', 'all'))

        #sect = 'regions'
        #sect = 'sample_nickname'
        #sect = 'groups'

        sect = 'vprimer'
        self.fragment_pad_len = \
            int(self._set_default(sect, 'fragment_pad_len', 500))

        sect = 'caps'

        self.enzyme_files_str = \
            (self._set_default(sect, 'enzyme_files', ''))

        sect = 'primer3'

        self.PRIMER_THERMODYNAMIC_PARAMETERS_PATH = \
            str(self._set_default(
                sect, 'PRIMER_THERMODYNAMIC_PARAMETERS_PATH', ''))
        self.PRIMER_PRODUCT_SIZE_RANGE = "{}-{}".format(
            self.min_product_size, self.max_product_size)

        self.PRIMER_NUM_RETURN = str(1)

        self.PRIMER_MIN_SIZE = \
            str(self._set_default(sect, 'PRIMER_MIN_SIZE', 23))

        self.PRIMER_OPT_SIZE = \
            str(self._set_default(sect, 'PRIMER_OPT_SIZE', 25))
        self.PRIMER_MAX_SIZE = \
            str(self._set_default(sect, 'PRIMER_MAX_SIZE', 27))
        self.PRIMER_MIN_GC = \
            str(self._set_default(sect, 'PRIMER_MIN_GC', 40))
        self.PRIMER_OPT_GC = \
            str(self._set_default(sect, 'PRIMER_OPT_GC', 50))
        self.PRIMER_MAX_GC = \
            str(self._set_default(sect, 'PRIMER_MAX_GC', 60))
        self.PRIMER_MIN_TM = \
            str(self._set_default(sect, 'PRIMER_MIN_TM', 57.0))
        self.PRIMER_OPT_TM = \
            str(self._set_default(sect, 'PRIMER_OPT_TM', 60.0))
        self.PRIMER_MAX_TM = \
            str(self._set_default(sect, 'PRIMER_MAX_TM', 63.0))
        self.PRIMER_MAX_POLY_X = \
            str(self._set_default(sect, 'PRIMER_MAX_POLY_X', 4))
        self.PRIMER_PAIR_MAX_DIFF_TM = \
            str(self._set_default(
                sect, 'PRIMER_PAIR_MAX_DIFF_TM', 4))

        sect = 'blast'

        self.alternate_distance = \
            int(self._set_default(sect, 'alternate_distance', 10000))
        self.blast_word_size = \
            int(self._set_default(
                sect, 'blast_word_size', self.PRIMER_MIN_SIZE))

    def _set_default(self, sect, key, value):

        if key in self.ini[sect]:
            return self.ini[sect][key]
        else:
            return value

    def _conv_joblib(self, string):

        bool_joblib = False
        if string == 'yes':
            bool_joblib = True

        return bool_joblib

    def _thread_adjusting2(self):

        #log.info("thread={} parallel={}".format(
        #    self.thread, self.parallel))

        self.parallel = \
            self._conv_joblib(self.use_joblib_threading)

        if self.thread < 6:
            self.parallel = False

        if self.parallel == True:

            # unit is 4+1=5
            parallel_base = self.thread
            self.parallele_full_thread = parallel_base

            # blast = 4
            self.parallel_blast_cnt = int(parallel_base / 5)
            self.blast_num_threads = 4

        else:
            # 6 = 5
            full_thread = self.thread - 1
            self.parallele_full_thread = full_thread
            self.blast_num_threads = full_thread

        #log.info("thread={}, parallel={}, parallel_blast_cnt={}".format(
        #    self.parallel, self.thread, self.parallel_blast_cnt))
        #log.info("parallele_full_thread={}, blast_num_threads={}".format(
        #    self.parallele_full_thread, self.blast_num_threads))

    def _thread_adjusting(self):
        ''' in Palallel, if there are 10 threads blast cmd will use at least
            2 cores so par 1 parallel.
            So main python always use 1,
            parallel use 1 thread, blast use 2 threads
        '''

        #self.parallel_blast_cnt = 1
        #self.parallele_full_thread = 1
        #self.blast_num_threads = 1

        #self._thread_adjusting2()
        #return

        #log.info("thread={} use_joblib_threading={}, parallel={}".format(
        #    self.thread, self.use_joblib_threading, self.parallel))

        self.parallel = \
            self._conv_joblib(self.use_joblib_threading)

        if self.thread < 6:
            self.parallel = False

        if self.parallel == True:
            #(7) = 6
            parallel_base = self.thread
            self.parallele_full_thread = parallel_base

            # 6/3=2
            self.parallel_blast_cnt = int(parallel_base / 3)
            self.blast_num_threads = 2

        else:
            # 6 = 5
            full_thread = self.thread - 1
            self.parallele_full_thread = full_thread
            self.blast_num_threads = full_thread

        #log.info("thread={}, use_joblib_threading={}".format(
        #    self.thread, self.use_joblib_threading))
        #log.info("parallel={}, parallel_blast_cnt={}".format(
        #    self.parallel, self.parallel_blast_cnt))
        #log.info("parallele_full_thread={}, blast_num_threads={}".format(
        #    self.parallele_full_thread, self.blast_num_threads))

    def _get_nickname(self):
        """Fill the nickname/basename lookup tables from [sample_nickname].

        Each option of the section maps a nickname (option name) to a
        basename (option value), e.g.::

            [sample_nickname]
            sample1 = sample1_sorted.bam

        Both directions are stored and every pair is logged.
        """
        log.info("[sample_nickname]")

        # fullname is filled in elsewhere (see get_sample_name_list)
        for nickname, basename in self.ini['sample_nickname'].items():
            self.nickname_to_basename[nickname] = basename
            self.basename_to_nickname[basename] = nickname

            log.info("{} => {}".format(nickname, basename))

    def _merge_conf(self):
        """Override ini values with the command line parameters that were given.

        A parameter left at ``None`` keeps the ini value. Changing ``thread``
        or ``joblib`` recomputes the thread layout, and the primer3 product
        size range is rebuilt from the final product size limits.

        Returns:
            This object.
        """
        p = glv.param.p

        # debug by param
        # Values are written to self: _thread_adjusting() below reads
        # self.thread, so writing through glv.conf only worked when glv.conf
        # happened to be this very object.
        if p.stop is not None:
            self.stop = p.stop

        # update by param
        if p.thread is not None:
            self.thread = p.thread
            self.use_joblib_threading = 'yes'
            self._thread_adjusting()

        if p.joblib is not None:
            self.use_joblib_threading = p.joblib
            self._thread_adjusting()

        if p.ref is not None:
            self.ref = p.ref

        if p.vcf is not None:
            self.vcf = p.vcf

        if p.progress is not None:
            self.progress = p.progress

        # indel_len
        if p.min_indel is not None:
            self.min_indel_len = p.min_indel

        if p.max_indel is not None:
            self.max_indel_len = p.max_indel

        # product_size
        if p.min_product is not None:
            self.min_product_size = p.min_product

        if p.max_product is not None:
            # was "self.conf.max_product_size", which raised AttributeError
            self.max_product_size = p.max_product

        self.PRIMER_PRODUCT_SIZE_RANGE = "{}-{}".format(
            self.min_product_size, self.max_product_size)

        log.info("glv.param.p={}".format(p))

        return self

    def _set_path_and_all_start(self):
        """Resolve the working directories, create them and start logging.

        The output directory (optionally overridden by the ``out_dir``
        command line parameter) and the reference directory are placed under
        ``self.cwd``; the log and backup directories are placed under the
        output directory. The module-level ``log`` is then rebound to the
        started logger and the ini file is copied into the output directory.
        """
        global log

        under = "{}/{}".format

        # command line out_dir wins over the ini value
        requested_out_dir = glv.param.p.out_dir
        if requested_out_dir is not None:
            self.out_dir = requested_out_dir

        # result out dir
        self.out_dir = under(self.cwd, self.out_dir)
        # logs dir under out dir
        self.log_dir = under(self.out_dir, self.log_dir)
        # system reference dir
        self.ref_dir = under(self.cwd, self.ref_dir)
        # backup dir under out dir
        self.out_bak_dir = under(self.out_dir, 'bak')

        self._make_dir_tree()

        # hand the directories to LogConf and keep the logger it returns
        log = self.log.start(__name__, self.out_dir, self.log_dir)

        # log for utils
        utl.start_log()

        # cp ini file to outdir
        self._copy_ini_file()

    def _copy_ini_file(self):
        """Copy the ini file in use into the output directory.

        The copy keeps the ini file's base name. ``utl.save_to_tmpfile`` is
        called on the destination first.
        NOTE(review): presumably that moves an existing copy out of the way
        -- confirm in ``utl``.
        """
        import shlex

        ini_base = os.path.basename(self.ini_file_path)
        out_dir_ini_file = "{}/{}".format(self.out_dir, ini_base)

        # back up
        utl.save_to_tmpfile(out_dir_ini_file)

        # Quote both paths so blanks or shell metacharacters in a directory
        # name cannot split or alter the command. Paths made only of safe
        # characters are left unchanged by shlex.quote.
        cmd = "cp {} {}".format(
            shlex.quote(self.ini_file_path),
            shlex.quote(out_dir_ini_file))
        utl.try_exec(cmd)

    def _make_dir_tree(self):
        """Create the output, log, backup and reference directories.

        The paths must already be resolved; they are created in that order
        through ``utl.makedirs``.
        """
        for path in (
                self.out_dir,
                self.log_dir,
                self.out_bak_dir,
                self.ref_dir):
            utl.makedirs(path)

    def _get_regions(self):
        """Parse the [regions] section into ``self.regions_dict``.

        Each option value is ``chrom`` or ``chrom:start-end``
        (e.g. ``chrom:1-1000``). The stored entry has the keys ``chr``,
        ``start``, ``end`` and ``reg`` (the original text); ``start`` and
        ``end`` are ``None`` when only the chromosome is given.

        The process exits with status 1 when the chromosome part is empty.
        """
        # header once for the whole section, like the other sections
        log.info("[regions]")

        for rg in self.ini['regions']:
            reg = self.ini['regions'][rg]
            rg_list = reg.split(':')

            if rg_list[0] == '':
                # say why we stop instead of exiting silently
                log.error("error, region={} is empty, exit.".format(rg))
                sys.exit(1)

            start = None
            end = None
            if len(rg_list) > 1:
                # start-end
                pos = rg_list[1].split('-')
                start = int(pos[0])
                end = int(pos[1])

            self.regions_dict[rg] = {
                'chr': rg_list[0],
                'start': start,
                'end': end,
                'reg': reg
            }

            log.info("{}: {}".format(rg, self.regions_dict[rg]))

    def _get_distinguish_groups(self):
        """Parse [groups] distinguish_groups into ``self.distin_g_list``.

        The value is a ';' separated list of entries of the form
        ``groupA<>groupB[/region1,region2][:pick_mode]``. For every entry a
        dict is appended with the keys ``0`` and ``1`` (the two group
        names), ``'region'`` (list of region names, ``['']`` when none is
        given) and ``'pick_mode'`` (the entry's own mode, else the [global]
        pick_mode, else 'all').

        The process exits with status 1 when an entry does not name two
        non-empty groups.
        """
        distins = self.ini['groups']['distinguish_groups'].split(';')

        for distin in distins:

            # [global][pick_mode]; falls back to the same default as
            # _ini_into_variable when the ini file does not set it
            pick_mode = self._set_default('global', 'pick_mode', 'all')

            # group / reg1, reg2 : pick_mode
            g_pick_mode = distin.split(':')
            if len(g_pick_mode) > 1 and g_pick_mode[1] != '':
                # an explicit, non-empty pick_mode overrides the global one
                pick_mode = g_pick_mode[1]

            # split into group and regions
            g_region = g_pick_mode[0].split('/')

            # groups are separated by <>
            groups = g_region[0].split('<>')
            # the length check turns an entry without '<>' into the same
            # clean exit as an empty group name (it used to be IndexError)
            if len(groups) < 2 or groups[0] == '' or groups[1] == '':
                log.error(
                    "error, empty distinguish_groups {} exit.".format(
                        distin))
                sys.exit(1)

            # no '/' part, or an empty first region, means "no region"
            regions = ['']
            if len(g_region) > 1:
                listed = g_region[1].split(',')
                if len(listed[0]) != 0:
                    regions = listed

            self.distin_g_list.append({
                0: groups[0],
                1: groups[1],
                'region': regions,
                'pick_mode': pick_mode
            })

        log.info("[groups]")
        log.info("{}".format(self.distin_g_list))

    def _get_members(self):
        """Parse [groups] group_members into ``self.g_members_dict``.

        Expected ini layout::

            group_members =
                gHitomebore     : Hitomebore, Sasanishiki
                gArroz_da_terra : Arroz_da_terra, Kasalath

        The value is rebuilt into ``group:m1,m2;group:m3,...`` form, logged,
        and stored as ``{group: [member, ...]}``. The process exits with
        status 1 when a piece has no ':' or has an empty group name or an
        empty member list.
        """
        raw_members = self.ini['groups']['group_members']
        tokens = re.sub(r";", ",", raw_members).split(',')

        # a token holding ':' opens a new group (';' in front), any other
        # token is one more member of the group opened last
        group_line = "".join(
            ";{},".format(token) if ':' in token else ",{}".format(token)
            for token in tokens)

        # tidy the separators left over by the joining above
        for pattern, replacement in (
                (r"^;", ""),
                (r",+", ","),
                (r",;", ";"),
                (r",$", "")):
            group_line = re.sub(pattern, replacement, group_line)

        log.info("{}".format(group_line))

        for g_member in group_line.split(';'):
            gmem_list = g_member.split(':')

            # no ':' at all, or an empty name / member part
            if (len(gmem_list) == 1
                    or gmem_list[0] == ''
                    or gmem_list[1] == ''):
                sys.exit(1)

            self.g_members_dict[gmem_list[0]] = gmem_list[1].split(',')

        log.info("[members]")
        log.info("{}".format(self.g_members_dict))

    def _rectify_variable(self):
        """Normalise every value of the parsed ini file in place.

        Each value is passed through ``utl.strip_hash_comment``, loses its
        leading whitespace, and has its newlines replaced by ';'. For
        ``group_members`` the remaining whitespace is turned into single ','
        separators (none around ':'); for every other option all whitespace
        is removed.
        """
        # applied in this order to group_members only; the result looks like
        # 'group0:TDr2946A_Mal,TDr1489A_Fem;TDr2284A_Mon,...;group1:...'
        member_rules = (
            (r"\s+", " "),
            (r"\s*:\s*", ":"),
            (r" ", ","),
            (r",+", ","),
            (r",;", ";"),
        )

        for section in self.ini.sections():
            for key in self.ini[section]:

                # hash comment remove
                cleaned = utl.strip_hash_comment(self.ini[section][key])

                # drop leading whitespace, then internal \n -> semicolons
                cleaned = cleaned.lstrip().replace('\n', ';')

                if key == 'group_members':
                    for pattern, replacement in member_rules:
                        cleaned = re.sub(pattern, replacement, cleaned)
                else:
                    # remove white spaces
                    cleaned = re.sub(r"\s+", "", cleaned)

                # reset
                self.ini[section][key] = cleaned
Exemplo n.º 6
0
    def __init__(self):
        '''Set up the parameter table and blank out every attribute.

        conf_dict maps each parameter name to its 'dtype' and, for all
        names but 'group_members', its 'default'.  The attributes that
        follow are empty placeholders only.
        '''

        def spec(dtype, default):
            # conf_dict entry that carries a default
            return {'dtype': dtype, 'default': default}

        self.ini = None
        self.param = None

        # parameter name -> data type and default
        self.conf_dict = {
            # no / gt / int
            'show_genotype': spec('str', 'no'),
            # debug
            'analyse_caps': spec('bool', 'False'),

            # ini file, output and threading
            'ini_version': spec('str', glv.ini_version),
            'ini_file': spec('str', ''),
            'out_dir': spec('str', 'out_vprimer'),
            'thread': spec('int', '10'),
            'use_joblib_threading': spec('str', 'yes'),

            # input files
            'vcf': spec('str', ''),
            'ref': spec('str', ''),

            # picking
            'pick_mode': spec('str', 'all'),
            'indel_size': spec('str', '20-200'),
            'product_size': spec('str', '200-500'),

            # enzymes (enzyme_file e.g. refs/enzyme_names.txt)
            'enzyme_file': spec('str', 'no_enzyme'),
            'enzyme': spec('str', ''),

            # samples
            'target': spec('str', ''),
            'a_sample': spec('str', ''),
            'b_sample': spec('str', ''),

            # regions and groups
            'regions': spec('str', ''),
            'distinguish_groups': spec('str', ''),
            # deliberately without 'default': that key is meant to be
            # filled later from the group members string read from vcf
            'group_members': {'dtype': 'str'},

            # primer3 and blast
            'p3_params': spec('str', 'no_p3_params'),
            'fragment_pad_len': spec('int', '500'),
            'blast_distance': spec('int', '10000'),
            # to be set again once p3 PRIMER_MIN_SIZE is known
            'blast_word_size': spec('int', '23'),

            # display and flow control
            'show_samples': spec('bool', 'False'),
            'show_fasta': spec('bool', 'False'),
            'progress': spec('str', 'all'),
            'stop': spec('str', 'none'),
        }

        # working directory and logger object
        self.cwd = glv.cwd
        self.log = LogConf()

        # debug switch
        self.analyse_caps = False

        # genotype display mode
        self.show_genotype = ""

        # ini file
        self.user_ini_file = self.ini_file_path = ""
        self.ini_version_user = self.ini_version_system = ""

        # output directories
        self.user_out_dir = self.out_dir_path = ""
        self.log_dir_path = self.out_bak_dir_path = ""

        # reference directory
        self.ref_dir_path = ""

        # file that receives the current settings
        self.curr_setting_file = self.curr_setting_file_path = ""

        # threading
        self.thread = 0
        self.use_joblib_threading = "yes"
        self.parallel = True
        self.parallel_full_thread = 0
        self.parallel_blast_cnt = 0
        self.blast_num_threads = 0

        # vcf
        self.user_vcf_file = self.user_vcf_file_path = ""
        self.vcf_file_slink_system = self.vcf_file_path = ""

        # vcf sample names
        self.vcf_sample_name_file = ""
        self.vcf_sample_nickname_list = []
        self.vcf_sample_basename_list = []
        self.vcf_sample_fullname_list = []
        self.vcf_sample_nickname_dict = {}
        self.vcf_sample_basename_dict = {}
        self.vcf_sample_fullname_dict = {}
        self.group_members_vcf_str = ""

        # display-only switches
        self.show_samples = False
        self.show_fasta = False

        # reference fasta; the system side is filled in later by main
        self.user_ref_fasta = self.user_ref_fasta_path = ""
        self.ref_fasta_slink_system = self.ref_fasta_path = ""
        self.ref_fasta_chrom_dict_list = []
        self.ref_fasta_chrom_list = []
        self.ref_fasta_chrom_region_list = []
        self.ref_fasta_fai = self.ref_fasta_chrom_txt = ""
        self.ref_fasta_pickle = ""

        # pick mode
        self.pick_mode = ""

        # indel length range
        self.indel_size = ""
        self.min_indel_len = self.max_indel_len = 0

        # product size range
        self.product_size = ""
        self.min_product_size = self.max_product_size = 0

        # enzymes
        self.enzyme_files_user_str = ""
        self.enzyme_files_user_list = []
        self.enzyme_files_list = []
        self.enzyme_str = ""
        self.enzyme_name_list = []

        # regions / groups: the chosen strings first, then the
        # structures built from them
        self.regions_str = self.group_members_str = ""
        self.distinguish_groups_str = ""
        self.region_name_list = []
        self.group_name_list = []
        self.regions_dict = {}
        self.group_members_dict = {}
        self.distinguish_groups_list = []

        # primer3
        self.p3_params_file = self.p3_params_file_path = ""
        self.primer3_header_dict = {}
        self.fragment_pad_len = 0
        self.p3key = {
            'PRIMER_MIN_SIZE': 23,
            'PRIMER_OPT_SIZE': 25,
            'PRIMER_MAX_SIZE': 27,
            'PRIMER_MIN_GC': 40,
            'PRIMER_OPT_GC': 50,
            'PRIMER_MAX_GC': 60,
            'PRIMER_MIN_TM': 57.0,
            'PRIMER_OPT_TM': 60.0,
            'PRIMER_MAX_TM': 63.0,
            'PRIMER_MAX_POLY_X': 4,
            'PRIMER_PAIR_MAX_DIFF_TM': 4,
        }

        # blast; word size and database names are not set yet
        self.blast_distance = 0
        self.blast_word_size = 0
        self.blastdb_title = self.blastdb = ""

        # start / stop
        self.progress = self.stop = ""
Exemplo n.º 7
0
class ConfBase(object):
    def __init__(self):
        '''Set up the parameter table and blank out every attribute.

        conf_dict maps each parameter name to its 'dtype' and, for all
        names but 'group_members', its 'default'.  The attributes that
        follow are empty placeholders only.
        '''

        def spec(dtype, default):
            # conf_dict entry that carries a default
            return {'dtype': dtype, 'default': default}

        self.ini = None
        self.param = None

        # parameter name -> data type and default
        self.conf_dict = {
            # no / gt / int
            'show_genotype': spec('str', 'no'),
            # debug
            'analyse_caps': spec('bool', 'False'),

            # ini file, output and threading
            'ini_version': spec('str', glv.ini_version),
            'ini_file': spec('str', ''),
            'out_dir': spec('str', 'out_vprimer'),
            'thread': spec('int', '10'),
            'use_joblib_threading': spec('str', 'yes'),

            # input files
            'vcf': spec('str', ''),
            'ref': spec('str', ''),

            # picking
            'pick_mode': spec('str', 'all'),
            'indel_size': spec('str', '20-200'),
            'product_size': spec('str', '200-500'),

            # enzymes (enzyme_file e.g. refs/enzyme_names.txt)
            'enzyme_file': spec('str', 'no_enzyme'),
            'enzyme': spec('str', ''),

            # samples
            'target': spec('str', ''),
            'a_sample': spec('str', ''),
            'b_sample': spec('str', ''),

            # regions and groups
            'regions': spec('str', ''),
            'distinguish_groups': spec('str', ''),
            # deliberately without 'default': that key is meant to be
            # filled later from the group members string read from vcf
            'group_members': {'dtype': 'str'},

            # primer3 and blast
            'p3_params': spec('str', 'no_p3_params'),
            'fragment_pad_len': spec('int', '500'),
            'blast_distance': spec('int', '10000'),
            # to be set again once p3 PRIMER_MIN_SIZE is known
            'blast_word_size': spec('int', '23'),

            # display and flow control
            'show_samples': spec('bool', 'False'),
            'show_fasta': spec('bool', 'False'),
            'progress': spec('str', 'all'),
            'stop': spec('str', 'none'),
        }

        # working directory and logger object
        self.cwd = glv.cwd
        self.log = LogConf()

        # debug switch
        self.analyse_caps = False

        # genotype display mode
        self.show_genotype = ""

        # ini file
        self.user_ini_file = self.ini_file_path = ""
        self.ini_version_user = self.ini_version_system = ""

        # output directories
        self.user_out_dir = self.out_dir_path = ""
        self.log_dir_path = self.out_bak_dir_path = ""

        # reference directory
        self.ref_dir_path = ""

        # file that receives the current settings
        self.curr_setting_file = self.curr_setting_file_path = ""

        # threading
        self.thread = 0
        self.use_joblib_threading = "yes"
        self.parallel = True
        self.parallel_full_thread = 0
        self.parallel_blast_cnt = 0
        self.blast_num_threads = 0

        # vcf
        self.user_vcf_file = self.user_vcf_file_path = ""
        self.vcf_file_slink_system = self.vcf_file_path = ""

        # vcf sample names
        self.vcf_sample_name_file = ""
        self.vcf_sample_nickname_list = []
        self.vcf_sample_basename_list = []
        self.vcf_sample_fullname_list = []
        self.vcf_sample_nickname_dict = {}
        self.vcf_sample_basename_dict = {}
        self.vcf_sample_fullname_dict = {}
        self.group_members_vcf_str = ""

        # display-only switches
        self.show_samples = False
        self.show_fasta = False

        # reference fasta; the system side is filled in later by main
        self.user_ref_fasta = self.user_ref_fasta_path = ""
        self.ref_fasta_slink_system = self.ref_fasta_path = ""
        self.ref_fasta_chrom_dict_list = []
        self.ref_fasta_chrom_list = []
        self.ref_fasta_chrom_region_list = []
        self.ref_fasta_fai = self.ref_fasta_chrom_txt = ""
        self.ref_fasta_pickle = ""

        # pick mode
        self.pick_mode = ""

        # indel length range
        self.indel_size = ""
        self.min_indel_len = self.max_indel_len = 0

        # product size range
        self.product_size = ""
        self.min_product_size = self.max_product_size = 0

        # enzymes
        self.enzyme_files_user_str = ""
        self.enzyme_files_user_list = []
        self.enzyme_files_list = []
        self.enzyme_str = ""
        self.enzyme_name_list = []

        # regions / groups: the chosen strings first, then the
        # structures built from them
        self.regions_str = self.group_members_str = ""
        self.distinguish_groups_str = ""
        self.region_name_list = []
        self.group_name_list = []
        self.regions_dict = {}
        self.group_members_dict = {}
        self.distinguish_groups_list = []

        # primer3
        self.p3_params_file = self.p3_params_file_path = ""
        self.primer3_header_dict = {}
        self.fragment_pad_len = 0
        self.p3key = {
            'PRIMER_MIN_SIZE': 23,
            'PRIMER_OPT_SIZE': 25,
            'PRIMER_MAX_SIZE': 27,
            'PRIMER_MIN_GC': 40,
            'PRIMER_OPT_GC': 50,
            'PRIMER_MAX_GC': 60,
            'PRIMER_MIN_TM': 57.0,
            'PRIMER_OPT_TM': 60.0,
            'PRIMER_MAX_TM': 63.0,
            'PRIMER_MAX_POLY_X': 4,
            'PRIMER_PAIR_MAX_DIFF_TM': 4,
        }

        # blast; word size and database names are not set yet
        self.blast_distance = 0
        self.blast_word_size = 0
        self.blastdb_title = self.blastdb = ""

        # start / stop
        self.progress = self.stop = ""

    # --- before log file open
    def collect_param_ini(self, param, ini):
        ''' aggregate all parameters into one dictionary
        '''

        self.param = param
        self.ini = ini

        for vname in self.conf_dict:

            param_value = self._get_param_value(vname, param)
            ini_value = None

            if param.p['ini_file'] is not None:
                # ini
                ini_value = self._get_ini_value(vname, ini)

            self.conf_dict[vname]['param'] = param_value
            self.conf_dict[vname]['ini'] = ini_value

    def _get_param_value(self, vname, param):
        ''' get data from parameter
        param handles values with the correct data type
        '''

        ret = None

        # a_sample="DRS_013.all.rd DRS_084.all.rd,DRS_099.all.rd ref"
        # b_sample="DRS_025.all.rd, DRS_061.all.rd DRS_101.all.rd"

        # {'a_sample': ['DRS_013.all.rd',
        #   'DRS_084.all.rd,DRS_099.all.rd', 'ref'],
        # 'b_sample': ['DRS_025.all.rd,', 'DRS_061.all.rd',
        #   'DRS_101.all.rd'],

        if vname in param.p:

            val = param.p[vname]

            if type(val) == list:
                # 'DRS_084.all.rd,DRS_099.all.rd'
                # 'DRS_025.all.rd,'
                #print("{}={}".format(vname, val))

                mod_list = list()
                for item in val:
                    sep_list = item.split(",")
                    for sep in sep_list:
                        if sep != "":
                            mod_list.append(sep)

                #ret = ','.join(val)
                ret = ','.join(mod_list)
                #print("\t{}={}".format(vname, ret))

            else:
                ret = val

        return ret

    def _get_ini_value(self, vname, ini):
        ''' get data from ini file
        ini handles values as strings
        '''

        ret = None

        if vname in ini.ini['vprimer']:

            val = ini.ini['vprimer'][vname]
            if type(val) == list:
                ret = ','.join(val)
            else:
                ret = self._cast_val(val, self.conf_dict[vname]['dtype'])

        return ret

    def _cast_val(self, value, dtype):
        ''' for ini file data, casting data to fit data type
        '''

        #print("_cast_val: value={}, dtype={}".format(value, dtype))
        #print("type(value={}".format(type(value)))
        #print("type(dtype={}".format(type(dtype)))

        if dtype == 'int':
            #print("int")
            return int(value)

        elif dtype == 'float':
            #print("float")
            return float(value)

        elif dtype == 'bool':

            # for param
            if type(value) == bool:
                return value
            else:
                # form ini
                if value == "True":
                    return True
                elif value == "False":
                    return False
                else:
                    return None

        elif dtype == 'str':
            #print("str")
            return str(value)

        else:
            #print("else")
            return str(value)

    def _make_out_dir_tree(self):
        '''Create the output, log and backup directories when missing.

        Each directory is reported through utl.prelog, because the log
        file may not be open yet, and created with utl.makedirs if it
        does not exist.
        '''

        wanted = (self.out_dir_path,
                  self.log_dir_path,
                  self.out_bak_dir_path)

        for path in wanted:
            if not os.path.isdir(path):
                utl.prelog("not exist dir {}.".format(path), __name__)
                utl.makedirs(path)
                continue

            utl.prelog("exist dir {}.".format(path), __name__)

    def out_dir_logging_start(self):
        '''Resolve the output directories, create them and open the log.

        The chosen 'out_dir' is expanded with utl.full_path; "logs" and
        "bak" are placed below it.  After the directories exist the
        module level log is replaced by the one returned from
        self.log.logging_start, and utl.open_log() is called.
        '''

        global log

        # directory as given by the user, then its expanded form
        self.user_out_dir = self._value_choice('out_dir')
        base_dir = utl.full_path(self.user_out_dir)

        self.out_dir_path = base_dir
        self.log_dir_path = "{}/logs".format(base_dir)
        self.out_bak_dir_path = "{}/bak".format(base_dir)

        # the directories have to exist before logging starts
        self._make_out_dir_tree()

        # logger of this module
        log = self.log.logging_start(
            __name__, self.out_dir_path, self.log_dir_path)

        # logger of utl
        utl.open_log()

    # --- after log file open
    def choice_variables(self):
        ''' decide variable values

        Fills the instance attributes with the values returned by
        self._value_choice, derives the paths below the "refs" and the
        output directory, prepares the vcf and the sample name
        variables taken from it, and finally sets the regions,
        group_members and distinguish_groups strings.

        Calls sys.exit(1) when out_dir, vcf or ref is empty, when
        show_genotype or use_joblib_threading holds a value that is
        not accepted, and in show_samples mode when show_fasta is not
        set as well.
        '''

        # print param and ini variables
        self._print_param_ini()

        # for debug
        self.analyse_caps = self._value_choice('analyse_caps')

        # out_dir ---------------------------------------------
        self.user_out_dir = self._value_choice('out_dir')
        self.out_dir_path = utl.full_path(self.user_out_dir)

        # vcf -------------------------------------------------
        self.user_vcf_file = self._value_choice('vcf')
        self.user_vcf_file_path = utl.full_path(self.user_vcf_file)

        # ref -------------------------------------------------
        self.user_ref_fasta = self._value_choice('ref')
        self.user_ref_fasta_path = utl.full_path(self.user_ref_fasta)

        # thread ----------------------------------------------
        self.thread = self._value_choice('thread')


        # out_dir, vcf and ref are mandatory
        # NOTE(review): the test is made on the values returned by
        # utl.full_path - confirm that it returns "" for an empty input
        if self.out_dir_path == "" or \
            self.user_vcf_file_path == "" or \
            self.user_ref_fasta_path == "":
            err_mes = "out_dir={} and vcf={} and ref={} ".format(
                self.out_dir_path, self.user_vcf_file_path,
                self.user_ref_fasta_path)
            err_mes += "are all required. exit."
            log.error(err_mes)
            sys.exit(1)

        log.info("thread={}".format(self.thread))

        # out_dir ---------------------------------------------
        # out_dir_path is computed a second time here, together with
        # the "logs" and "bak" directories below it
        self.out_dir_path = utl.full_path(self.user_out_dir)
        self.log_dir_path = "{}/{}".format(self.out_dir_path, "logs")
        self.out_bak_dir_path = "{}/{}".format(self.out_dir_path, "bak")

        #pprint.pprint(self.conf_dict)
        # INI show_genotype
        self.show_genotype = self._value_choice('show_genotype')
        # an empty choice falls back to "gt"
        if self.show_genotype == "":
            self.show_genotype = "gt"

        if not self.show_genotype in glv.show_genotype_list:
            err_mes = "show_genotype is selected from one of "
            err_mes += ", ".join(glv.show_genotype_list)
            log.error("{}. exit.".format(err_mes))
            log.error("show_genotype={}".format(self.show_genotype))
            sys.exit(1)

        # ini_file --------------------------------------------
        # INI
        # the system version is the 'default' entry of conf_dict, and
        # the ini file name is read from the 'param' entry only
        self.ini_version_user = self._value_choice('ini_version')
        self.ini_version_system = self.conf_dict['ini_version']['default']

        self.user_ini_file = self.conf_dict['ini_file']['param']
        self.ini_file_path = utl.full_path(self.user_ini_file)

        # ref_dir ---------------------------------------------
        self.ref_dir_path = utl.full_path("refs")
        # make ref_dir
        utl.makedirs(self.ref_dir_path)

        # out_curr_setting ------------------------------------
        self.curr_setting_file = "current_setting_ini.txt"
        self.curr_setting_file_path = "{}/{}".format(self.out_dir_path,
                                                     self.curr_setting_file)

        # thread ----------------------------------------------
        self.use_joblib_threading = self._value_choice('use_joblib_threading')

        # only the lower case strings 'yes' and 'no' are accepted
        if not self.use_joblib_threading in ['yes', 'no']:
            err_mes = "use_joblib_threading Choose from Yes or No."
            log.error("{} exit.".format(err_mes))
            log.error("use_joblib_threading={}".format(
                self.use_joblib_threading))
            sys.exit(1)

        # thread adjust
        self.parallel, \
        self.parallel_full_thread, \
        self.parallel_blast_cnt, \
        self.blast_num_threads \
            = self._thread_adjusting()

        # vcf -------------------------------------------------
        # both vcf names live in ref_dir and are built from the base
        # name of the user's vcf file
        basename_user_vcf = os.path.basename(self.user_vcf_file_path)
        self.vcf_file_slink_system = "{}/{}{}".format(self.ref_dir_path,
                                                      'slink_',
                                                      basename_user_vcf)

        # gtonly.gz
        self.vcf_file_path = "{}/{}{}".format(self.ref_dir_path,
                                              basename_user_vcf,
                                              "_GTonly.vcf.gz")

        # read
        self.prepare_vcf()

        # sample_nickname -------------------------------------
        basename_vcf_file = os.path.basename(self.vcf_file_path)
        self.vcf_sample_name_file = "{}/sample_name_{}.txt".format(
            self.ref_dir_path, basename_vcf_file)
        self.save_vcf_sample_name_txt()

        self.vcf_sample_nickname_list, \
        self.vcf_sample_basename_list, \
        self.vcf_sample_fullname_list, \
        self.vcf_sample_nickname_dict, \
        self.vcf_sample_basename_dict, \
        self.vcf_sample_fullname_dict, \
        self.group_members_vcf_str \
            = self.make_vcf_sample_variable()

        # illegal
        # the 'default' of group_members is written into conf_dict
        # only here, from the string built out of the vcf samples
        self.conf_dict['group_members']['default'] = \
            self.group_members_vcf_str

        # show_fasta-------------------------------------------
        self.show_fasta = self._value_choice('show_fasta')

        # show_samples-----------------------------------------
        self.show_samples = self._value_choice('show_samples')

        # Because it stops at show_fasta
        # NOTE(review): this is a regular end of the program but the
        # exit status is 1 - confirm that callers expect a non-zero
        # status here
        if self.show_fasta != True and self.show_samples == True:
            log.info("only show_samples mode, exit.")
            log.info("program finished {}\n".format(
                utl.elapsed_time(time.time(), glv.now_epochtime)))
            sys.exit(1)

        # pick_mode
        self.pick_mode = self._value_choice('pick_mode')

        # indel len
        # the value is split on '-' into exactly two integers, min-max
        self.indel_size = self._value_choice('indel_size')
        self.min_indel_len, self.max_indel_len = \
            [int(i) for i in self.indel_size.split('-')]

        # product size
        # same min-max form as indel_size
        self.product_size = self._value_choice('product_size')
        self.min_product_size, self.max_product_size = \
            [int(i) for i in self.product_size.split('-')]

        # ref -------------------------------------------------
        # It will be set in main later
        # glv.ref = glv.ref.prepare_ref()
        self.ref_fasta_slink_system = ""
        self.ref_fasta_path = ""
        self.ref_fasta_chrom_list = []
        self.ref_fasta_fai = ""
        self.ref_fasta_chrom_txt = ""
        self.ref_fasta_pickle = ""

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # enzyme
        self.enzyme_files_user_str = self._value_choice('enzyme_file')
        #self.enzyme_files_user_list = list()
        #self.enzyme_files_list = list()
        self.enzyme_str = self._value_choice('enzyme')
        #self.enzyme_name_list = list()

        # start stop
        self.progress = self._value_choice('progress')
        self.stop = self._value_choice('stop')

        # primer3
        self.fragment_pad_len = self._value_choice('fragment_pad_len')
        self.p3_params_file = self._value_choice('p3_params')

        # blast
        self.blast_distance = self._value_choice('blast_distance')

        # not set now
        self.blast_word_size = 0
        self.blastdb_title = ""
        self.blastdb = ""

        # region group member string ---------------------------------
        # select string by priority, next make dict or list variables

        self.regions_str = self.set_regions_str()
        self.group_members_str = self.set_group_members_str()
        self.distinguish_groups_str = self.set_distinguish_groups_str()

    def setup_variables(self):
        '''Derive the structured settings from the chosen raw strings.

        Steps, in this order:
          1) regions_str -> regions_dict, region_name_list
          2) group_members_str -> group_members_dict, group_name_list
          3) distinguish_groups_str -> distinguish_groups_list
             (only built when show_genotype is "no")
          4) primer3 parameter file path and header dictionary
          5) enzyme file lists and enzyme names
          6) validation of the progress / stop settings

        The process exits with status 1 when progress is neither "all"
        nor a key of glv.outlist.outf_prefix, or when stop is neither
        "no" nor a key of glv.outlist.outf_prefix.
        '''

        # setup only regions and members
        if self.show_genotype != "no":
            self._setup_genotype_variables()

        # Satisfy three structural variables
        #   1) regions_dict
        #   2) distinguish_groups_list
        #   3) group_members_dict

        # 1.1) Adjusting regions_str in easy mode --------------------------
        #   Easy mode lacks the region name, so make up for it here.
        if '<EASY_MODE>' in self.regions_str:
            # <EASY_MODE>chrom_01:1-200000,chrom_02,chrom_03:all
            #   -> easy_region1:chrom_01:1-200000,easy_region2:chrom_02,...
            named_regions = [
                "easy_region{}:{}".format(
                    rg_cnt, re.sub(r"^<EASY_MODE>", "", region))
                for rg_cnt, region in enumerate(
                    self.regions_str.split(','), 1)]

            # complete
            self.regions_str = ",".join(named_regions)

        # 1.2) make regions dictionary from regions_str ....................
        self.regions_str, \
        self.regions_dict, \
        self.region_name_list \
            = self.set_regions_dict(self.regions_str)

        self._set_chosen_value('regions', self.regions_str)
        log.info("chosen self.regions_str={}".format(self.regions_str))

        # 2) make group_members dictionary from group_members_str ..........
        self.group_members_dict, \
        self.group_name_list \
            = self.set_group_members_dict(self.group_members_str)

        self._set_chosen_value('group_members', self.group_members_str)
        log.info("chosen self.group_members_str={}".format(
            self.group_members_str))

        # 3.1) distinguish_groups_str --------------------------------------
        #   Easy mode lacks the region name, so make up for it here:
        #   the placeholder is replaced by every region name joined by "+".
        if "<EASY_MODE>" in self.distinguish_groups_str:
            region_names_str = "+".join(self.region_name_list)
            self.distinguish_groups_str = re.sub(r"<EASY_MODE>",
                                                 region_names_str,
                                                 self.distinguish_groups_str)

        # 3.2) make distinguish_groups_list from distinguish_groups_str ....
        # Avoid checking when show_genotype
        if self.show_genotype == "no":
            self.distinguish_groups_str, \
            self.distinguish_groups_list \
                = self.set_distinguish_groups_list(
                    self.distinguish_groups_str)

        self._set_chosen_value('distinguish_groups',
                               self.distinguish_groups_str)
        log.info("chosen self.distinguish_groups_str={}".format(
            self.distinguish_groups_str))

        # 4.1) Create a file to set the environment of primer3
        if self.p3_params_file == "no_p3_params":
            self.p3_params_file = "{}/{}".format(self.ref_dir_path,
                                                 "p3_params.txt")

        self.p3_params_file_path = utl.full_path(self.p3_params_file)

        # primer3, make and read parameters
        self.primer3_header_dict = self._set_primer3_header_dict()

        # 4.2) blast_word_size = PRIMER_MIN_SIZE
        #   (the value is kept exactly as stored in the header dictionary)
        self.blast_word_size = self.primer3_header_dict['PRIMER_MIN_SIZE']

        # 5.1) File that describes the enzyme name to be handled
        #   enzyme_files_user_list : full paths of the user supplied files
        #   enzyme_files_list      : "slink_<basename>" paths in ref_dir_path
        #   NOTE(review): enzyme_files_list is appended to, so it is
        #   expected to exist already -- confirm where it is initialised.
        self.enzyme_files_user_list = list()
        for user_file in self.enzyme_files_user_str.split(','):
            # user file full path list
            file_path_user = utl.full_path(user_file)
            self.enzyme_files_user_list.append(file_path_user)

            basename_user = os.path.basename(file_path_user)
            enzyme_file_slink_system = "{}/{}{}".format(
                self.ref_dir_path, "slink_", basename_user)

            self.enzyme_files_list.append(enzyme_file_slink_system)

        # 5.2) enzyme
        self.enzyme_files_list, self.enzyme_name_list \
            = self.read_enzyme_file()

        log.info("enzyme_files_list={}".format(self.enzyme_files_list))
        log.info("enzyme_name_list={}".format(self.enzyme_name_list))

        # 6) progress, stop
        #   Each setting is judged on its own: it must be its sentinel
        #   ("all" for progress, "no" for stop) or a known output prefix
        #   key.  Judging them separately keeps one valid setting from
        #   hiding an invalid value in the other.
        if not self._check_progress_stop():
            valid_keys = glv.outlist.outf_prefix.keys()
            progress_ok = self.progress == "all" or \
                self.progress in valid_keys
            stop_ok = self.stop == "no" or self.stop in valid_keys

            if not (progress_ok and stop_ok):
                err_mes = "The progress or stop settings are incorrect."
                log.error("{} exit.".format(err_mes))
                log.error("progress={}, stop={}".format(
                    self.progress, self.stop))
                log.error("{}".format(", ".join(
                    glv.outlist.outf_prefix.keys())))
                sys.exit(1)

    def _setup_genotype_variables(self):
        '''
        '''

        if self.regions_str == "all":
            self.regions_str = "{}".format(",".join(self.ref_fasta_chrom_list))

        if self.group_members_str == "all":
            self.group_members_str = "all:{}".format(",".join(
                self.vcf_sample_nickname_list))
        '''
        print("regions_str={}".format(
            self.regions_str))
        print("group_members_str={}".format(
            self.group_members_str))
        print("distinguish_groups_str={}".format(
            self.distinguish_groups_str))


        print("region_name_list={}".format(
            self.region_name_list))
        print("group_name_list={}".format(
            self.group_name_list))

        print("regions_dict={}".format(
            self.regions_dict))
        print("group_members_dict={}".format(
            self.group_members_dict))
        print("distinguish_groups_list={}".format(
            self.distinguish_groups_list))
        #sys.exit(1)
        '''

    def _check_progress_stop(self):
        '''
        '''

        ret = True

        if self.progress == "all" and self.stop == "no":
            ret = True

        elif self.progress not in glv.outlist.outf_prefix.keys():
            ret = False

        elif self.stop not in glv.outlist.outf_prefix.keys():
            ret = False

        return ret

    def _thread_adjusting(self):
        ''' in Parallel, if there are 10 threads blast cmd will use at least
            2 cores so par 1 parallel.
            main python always use 1,
            parallel use 1 thread, blast use 2 threads
        '''

        parallel = True

        parallel_full_thread = 0
        parallel_blast_cnt = 0
        blast_num_threads = 0

        if self.thread < 6 or self.use_joblib_threading != "yes":
            parallel = False

        if parallel == True:

            # unit is 4+1=5
            parallel_base = self.thread
            parallel_full_thread = parallel_base

            # blast = 4
            parallel_blast_cnt = int(parallel_base / 5)
            blast_num_threads = 4

        else:
            # 6 = 5
            full_thread = self.thread - 1
            parallel_full_thread = full_thread
            blast_num_threads = full_thread

        return parallel, parallel_full_thread,\
            parallel_blast_cnt, blast_num_threads

    def _value_choice(self, vname):
        '''
        '''

        chosen_value = ""

        # If neither param nor ini has a key, use the default value
        if self.conf_dict[vname]['param'] is None and \
            self.conf_dict[vname]['ini'] is None:
            #print("cv 1")
            chosen_value = self.conf_dict[vname]['default']

        elif self.conf_dict[vname]['param'] is not None:
            #print("cv 2")
            chosen_value = self.conf_dict[vname]['param']

        elif self.conf_dict[vname]['ini'] is not None:
            #print("cv 3")
            chosen_value = self.conf_dict[vname]['ini']

        elif self.conf_dict[vname]['default'] is not None:
            #print("cv 4")
            chosen_value = self.conf_dict[vname]['default']

        else:
            utl.prelog("not found value of key {}.".format(vname), __name__)
            sys.exit(1)

        #print("vname={}".format(vname))
        #print("dtype={}".format(self.conf_dict[vname]['dtype']))
        #print(chosen_value)
        #print(type(chosen_value))

        # cast by dtype
        chosen_value = self._cast_val(chosen_value,
                                      self.conf_dict[vname]['dtype'])

        #print(type(chosen_value))

        # chosen value
        self._set_chosen_value(vname, chosen_value)
        #print("{}={}".format(vname, self.conf_dict[vname]['chosen']))

        return chosen_value

    def _set_chosen_value(self, vname, chosen_value):

        # Assuming the chosen key does not exist
        self.conf_dict[vname]['chosen'] = chosen_value

    def _is_chrom_name(self, chrom_name):
        '''
        '''
        ret = False
        #print("_is_chrom_name, {}, {}".format(
        #    chrom_name, self.ref_fasta_chrom_list))
        if chrom_name in self.ref_fasta_chrom_list:
            ret = True
        #print("{}".format(ret))
        return ret

    def _is_region_name(self, region_name):
        '''
        '''
        ret = False
        #print("_is_region_name, {}, {}".format(
        #    region_name, self.region_name_list))
        if region_name in self.region_name_list:
            ret = True
        #print("{}".format(ret))
        return ret

    def _is_group_name(self, group_name):
        '''
        '''
        ret = False
        #print("_is_group_name, {}, {}".format(
        #    group_name, self.group_name_list))
        if group_name in self.group_name_list:
            ret = True
        #print("{}".format(ret))
        return ret

    def _is_valid_int_range(self, range_str):
        '''
        '''
        ret = True

        if "-" not in range_str:
            ret = False
        else:
            min_size, max_size = range_str.split("-")

            if not min_size.isdecimal() or \
                not max_size.isdecimal():
                ret = False
            elif int(min_size) > int(max_size):
                ret = False

        return ret

    def _is_valid_chrom_range(self, range_str):

        ret = True

        #print("_is_valid_chrom_range, range_str={}".format(range_str))
        chrom_name, rg_str = range_str.split(':')
        #print("_is_valid_chrom_range, chrom_name={}, rg_str={}".format(
        #    chrom_name, rg_str))

        min_pos, max_pos = [int(i) for i in rg_str.split('-')]
        #print("_is_valid_chrom_range, min_pos={}, max_pos={}".format(
        #    min_pos, max_pos))

        region_str, start, end, length = self._get_chrom_info(chrom_name)
        #print("_is_valid_chrom_range, _get_chrom_info={}, {}, {}, {}".format(
        #    region_str, start, end, length))

        if min_pos < start or end < max_pos:
            ret = False
        #print("_is_valid_chrom_range, ret={}".format(ret))

        return ret

    def _get_chrom_info(self, chrom_name):
        '''
        '''

        # glv.conf.ref_fasta_chrom_dict_list
        start = 0
        end = 0
        length = 0

        for d in self.ref_fasta_chrom_dict_list:
            #end': 30583384, 'length': 30583384, 'start': 1
            if chrom_name == d.get('chrom'):
                start = d.get('start')
                end = d.get('end')
                length = d.get('length')

        region_str = chrom_name

        if start is not None:
            region_str = "{}:{}-{}".format(region_str, start, end)

        #pprint.pprint(glv.conf.ref_fasta_chrom_dict_list)
        #print("{}, {}".format(region_str, length))

        return region_str, start, end, length

    def _is_easy_mode(self):
        ''' On the command line, determine if we are currently in easy mode
        '''

        easy_mode = False
        a_sample = False
        b_sample = False

        #print("self.conf_dict['a_sample']['param']={}".format(
        #    self.conf_dict['a_sample']['param']))
        #print("self.conf_dict['b_sample']['param']={}".format(
        #    self.conf_dict['b_sample']['param']))

        if self.conf_dict['a_sample']['param'] is not None and \
            self.conf_dict['a_sample']['param'] != "":
            a_sample = True

        if self.conf_dict['b_sample']['param'] is not None and \
            self.conf_dict['b_sample']['param'] != "":
            b_sample = True

        #print("a_sample={}".format(a_sample))
        #print("b_sample={}".format(b_sample))

        if a_sample == True or b_sample == True:
            easy_mode = True

        #print("easy_mode={}".format(easy_mode))

        return easy_mode

    def _print_param_ini(self):
        ''' for debug

        Log, at info level, the command line parameters (self.param.p)
        and, when an ini file was given, the content of its 'vprimer'
        section.
        '''

        params = self.param.p

        log.info("\n======== param.p ====================")
        log.info("self.param.p=\n\n{}\n".format(pprint.pformat(params)))

        log.info("\n======== ini.ini['vprimer'] =========")

        ini_file = params['ini_file']
        if ini_file is None:
            log.info("ini_file not specified.")
        else:
            ini_section = dict(self.ini.ini['vprimer'])
            log.info("\nini_file={}\n\n{}\n".format(
                ini_file, pprint.pformat(ini_section)))

        log.info("\n=====================================\n")

    def _set_primer3_header_dict(self):
        '''Build the primer3 header dictionary from the parameter file.

        When self.p3_params_file_path exists it is first handed to
        utl.save_to_tmpfile; otherwise it is created from self.p3key as
        "KEY=VALUE" lines under a "#PARAM=VALUE" header line.

        The file is then read line by line.  Blank lines and lines
        starting with "#" are skipped, utl.strip_hash_comment is applied,
        and each remaining line is split at its first "=" into name and
        value (so a value may itself contain "=").  All values stay
        strings.

        PRIMER_PRODUCT_SIZE_RANGE and PRIMER_NUM_RETURN from the file are
        ignored; they, and PRIMER_FIRST_BASE_INDEX, are always set here.

        Returns:
            dict: primer3 tag name -> string value.

        Raises:
            ValueError: when a parameter line contains no "=".
        '''

        primer3_header_dict = dict()

        if os.path.isfile(self.p3_params_file_path):
            log.info("found {}.".format(self.p3_params_file_path))
            # This file may have been edited by the user, so copy it
            utl.save_to_tmpfile(self.p3_params_file_path, True, True)

        else:
            log.info("not found {}.".format(self.p3_params_file_path))
            with open(self.p3_params_file_path, mode='w') as f:
                f.write("{}={}\n".format('#PARAM', 'VALUE'))

                for key, value in self.p3key.items():
                    f.write("{}={}\n".format(key, value))

        # tags whose value is always decided by this method
        fixed_here = ('PRIMER_PRODUCT_SIZE_RANGE', 'PRIMER_NUM_RETURN')

        # 1.1) open and read parameters
        with open(self.p3_params_file_path, mode='r') as f:
            for r_liner in f:
                r_line = r_liner.strip()  # cr, ws

                if r_line.startswith('#') or r_line == '':
                    continue

                r_line = utl.strip_hash_comment(r_line)

                # split at the first "=" only
                vname, sep, value = r_line.partition('=')
                if not sep:
                    raise ValueError(
                        "no '=' in parameter line '{}' of {}".format(
                            r_line, self.p3_params_file_path))

                if vname in fixed_here:
                    continue

                primer3_header_dict[vname] = value

        # constant value for primer3

        # PRIMER_FIRST_BASE_INDEX=1
        primer3_header_dict['PRIMER_FIRST_BASE_INDEX'] = str(1)
        # PRIMER_PRODUCT_SIZE_RANGE=???-???
        primer3_header_dict['PRIMER_PRODUCT_SIZE_RANGE'] = \
            "{}-{}".format(self.min_product_size, self.max_product_size)
        # PRIMER_NUM_RETURN=1
        primer3_header_dict['PRIMER_NUM_RETURN'] = str(1)

        return primer3_header_dict
Exemplo n.º 8
0
    def open_log_vcffile(self):

        # in glv
        global log
        log = LogConf.open_log(__name__)