Пример #1
0
def parse_sample(sampleList):
    """Parse a whitespace-delimited sample list into a nested bundle.

    Expected per-line fields:
        field[0]  -> sample name
        field[2]  -> read-group library (LB)
        field[3]  -> read-group ID suffix (combined with the sample name)
        field[-1] -> directory searched for *1.fq.gz fastq files

    Returns:
        bundle: sampleInfo[sampleName][dataTag] with keys 'fq1',
        optionally 'fq2' (paired-end only), 'rg' and 'gender'.

    Exits with status 3 when no fq1 file is found for a sample.
    """
    total_number = 0
    with open(sampleList, 'r') as f:
        sampleInfo = bundle()
        for line in f:
            total_number += 1
            line = line.strip()
            field = line.split()
            sampleName = field[0]
            rg_LB = field[2]
            rg_ID = "{}-{}".format(sampleName, field[3])
            fq_dir = field[-1].strip()
            fq1s = glob.glob("%s/*1.fq.gz" % fq_dir)
            if fq1s:
                fq1 = fq1s[0].strip()
            else:
                logger.error("fq1 under %s don't exists." % sampleName)
                exit(3)

            # Literal "\t" (backslash-t) is intended: the RG string is passed
            # on to external tools which expand it themselves.
            rg = "@RG\\tID:%s\\tPL:COMPLETE\\tLB:%s\\tSM:%s\\tCN:BGI" % (
                rg_ID, rg_LB, sampleName)
            fq_lib_name = rg_ID

            # `in` instead of the deprecated dict.has_key (removed in Python 3).
            # Lane counter restarts at 0 for each newly seen sample.
            if sampleName not in sampleInfo:
                sampleInfo[sampleName] = bundle()
                sample_lane_counter = 0
            else:
                sample_lane_counter = len(sampleInfo[sampleName])

            dataTag = 'data' + str(sample_lane_counter)
            if dataTag not in sampleInfo[sampleName]:
                sampleInfo[sampleName][dataTag] = bundle()

            sampleInfo[sampleName][dataTag]['fq1'] = fq1

            # fq2 is derived from fq1; a missing file means single-end data.
            fq2 = fq1.replace("1.fq.gz", "2.fq.gz")
            if os.path.exists(fq2):
                sampleInfo[sampleName][dataTag]['fq2'] = fq2
            else:
                logger.warning("%s of line: %d is SE data!" %
                               (sampleName, total_number))

            sampleInfo[sampleName][dataTag]['rg'] = rg
            #sampleInfo[sampleName][dataTag]['libname'] = fq_lib_name
            sampleInfo[sampleName][dataTag]['gender'] = 'male'

    return sampleInfo
Пример #2
0
def parse_sample(sampleList):
    """Parse a whitespace-delimited sample list into a nested bundle.

    Per-line fields: field[0]=sample name, field[2]=read-group library,
    field[3]=read-group ID suffix, field[-1]=directory containing *1.fq.gz.

    Returns:
        bundle: sampleInfo[sampleName][dataTag] -> 'fq1', optional 'fq2'
        (paired-end only), 'rg', 'gender'.

    Exits with status 3 when no fq1 file is found for a sample.
    """
    total_number = 0
    with open(sampleList, 'r') as f:
        sampleInfo = bundle()
        for line in f:
            total_number += 1
            line = line.strip()
            field = line.split()
            sampleName = field[0]
            rg_LB = field[2]
            rg_ID = "{}-{}".format(sampleName, field[3])
            fq_dir = field[-1].strip()
            fq1s = glob.glob("%s/*1.fq.gz" % fq_dir)
            if fq1s:
                fq1 = fq1s[0].strip()
            else:
                logger.error("fq1 under %s don't exists." % sampleName)
                exit(3)

            # Literal "\t" is intended: downstream tools expand it themselves.
            rg = "@RG\\tID:%s\\tPL:COMPLETE\\tLB:%s\\tSM:%s\\tCN:BGI" % (rg_ID, rg_LB, sampleName)
            fq_lib_name = rg_ID

            # `in` replaces the deprecated dict.has_key (removed in Python 3).
            if sampleName not in sampleInfo:
                sampleInfo[sampleName] = bundle()
                sample_lane_counter = 0
            else:
                sample_lane_counter = len(sampleInfo[sampleName])

            dataTag = 'data' + str(sample_lane_counter)
            if dataTag not in sampleInfo[sampleName]:
                sampleInfo[sampleName][dataTag] = bundle()

            sampleInfo[sampleName][dataTag]['fq1'] = fq1

            # fq2 derived from fq1; absence of the file means SE data.
            fq2 = fq1.replace("1.fq.gz", "2.fq.gz")
            if os.path.exists(fq2):
                sampleInfo[sampleName][dataTag]['fq2'] = fq2
            else:
                logger.warning("%s of line: %d is SE data!" % (sampleName, total_number))

            sampleInfo[sampleName][dataTag]['rg'] = rg
            #sampleInfo[sampleName][dataTag]['libname'] = fq_lib_name
            sampleInfo[sampleName][dataTag]['gender'] = 'male'

    return sampleInfo
Пример #3
0
def parse_sample(sampleList):
    """Read a two-column sample list: maps the first token of each line
    to the second token."""
    sampleInfo = bundle()
    with open(sampleList, 'r') as handle:
        for raw in handle:
            tokens = raw.strip().split()
            sampleInfo[tokens[0]] = tokens[1]
    return sampleInfo
Пример #4
0
def parse_sample(sampleList):
    """Build a bundle mapping each line's first whitespace-separated
    token to its second token."""
    sampleInfo = bundle()
    with open(sampleList, 'r') as fh:
        for entry in fh:
            parts = entry.strip().split()
            key, value = parts[0], parts[1]
            sampleInfo[key] = value
    return sampleInfo
Пример #5
0
def parse_sample(sampleList):
    """Parse a ">sample"-sectioned sample list into nested bundles.

    File format:
        >sampleName    starts a new data section for that sample
        key = value    property line stored under the current section
        #...           comment line, skipped
        blank line     skipped

    Returns:
        bundle: sampleInfo[sampleName]['dataN'][key] = value, where N
        counts repeated sections of the same sample.
    """
    with open(sampleList, 'r') as f:
        sampleInfo = bundle()
        sample_lane_counter = 0

        for line in f:
            line = line.strip()
            # Bug fix: test for emptiness BEFORE indexing line[0]; the
            # original order raised IndexError on blank lines.
            if not line:
                continue
            if line[0] == '#':
                continue
            m = re.match(r"^>(\S+)$", line)
            if m:
                sampleName = m.group(1)
                # `in` replaces deprecated dict.has_key (removed in Py3).
                if sampleName not in sampleInfo:
                    sampleInfo[sampleName] = bundle()
                    sample_lane_counter = 0
                else:
                    sample_lane_counter += 1
                dataTag = 'data' + str(sample_lane_counter)
                sampleInfo[sampleName][dataTag] = bundle()

                # Consume "key = value" lines until a bare ">" marker ends
                # the section (shares the outer file iterator on purpose).
                for info in f:
                    info = info.strip()
                    if not info:
                        continue
                    if info[0] == '#':
                        continue
                    m2 = re.match(r"^(\S+)\s*=\s*(\S+)", info)
                    if m2:
                        sampleInfo[sampleName][dataTag][m2.group(
                            1)] = m2.group(2)
                    if re.match(r"^>\s*$", info):
                        break

    return sampleInfo
Пример #6
0
def parse_sample(sampleList):
    """Parse a ">sample"-sectioned sample list into nested bundles.

    Lines beginning with '#' and blank lines are ignored. A line of the
    form ">name" opens a new 'dataN' section for that sample; subsequent
    "key = value" lines populate it until a bare ">" line closes it.

    Returns:
        bundle: sampleInfo[sampleName]['dataN'][key] = value.
    """
    with open(sampleList, 'r') as f:
        sampleInfo = bundle()
        sample_lane_counter = 0

        for line in f:
            line = line.strip()
            # Bug fix: check for an empty line before indexing line[0],
            # which previously raised IndexError on blank lines.
            if not line:
                continue
            if line[0] == '#':
                continue
            m = re.match(r"^>(\S+)$", line)
            if m:
                sampleName = m.group(1)
                # `in` replaces deprecated dict.has_key (removed in Py3).
                if sampleName not in sampleInfo:
                    sampleInfo[sampleName] = bundle()
                    sample_lane_counter = 0
                else:
                    sample_lane_counter += 1
                dataTag = 'data' + str(sample_lane_counter)
                sampleInfo[sampleName][dataTag] = bundle()

                # Inner loop advances the same file iterator until a bare
                # ">" marker terminates the current section.
                for info in f:
                    info = info.strip()
                    if not info:
                        continue
                    if info[0] == '#':
                        continue
                    m2 = re.match(r"^(\S+)\s*=\s*(\S+)", info)
                    if m2:
                        sampleInfo[sampleName][dataTag][m2.group(1)] = m2.group(2)
                    if re.match(r"^>\s*$", info):
                        break

    return sampleInfo
Пример #7
0
    def parse(self, mode):
        """Validate the parsed sample info and load it into self.config.

        For mode 3/4 only checks that steps unavailable in that mode are
        absent from the analysis list; for other modes validates fastq
        paths, RG.ID uniqueness per sample, and SE/PE consistency while
        filling self.config.sample and self.config.info counters.

        Returns the raw sampleInfo mapping produced by the mode parser.
        """
        self.config.info = bundle()
        self.config.info.female_counter = 0
        self.config.info.male_counter = 0
        self.config.sample = bundle()

        total_number = 0
        male_total_number = 0
        female_total_number = 0
        # Tracks every fq path seen so far to reject reused fastq files.
        fq_file_set = set()

        sampleInfo = self.sampleParser(mode)
        if mode == 3 or mode == 4:
            self.config.sample = sampleInfo
            # Drop the leading 'init' step if present.
            if self.config.analysisList[0] == 'init':
                self.config.analysisList = self.config.analysisList[1:]

            if mode == 3:
                unavailableStep = ['filter', 'alignment']
                for step in unavailableStep:
                    if step in self.config.analysisList:
                        logger.error("Cann't run this step (%s) in mode3" % step)
                        exit(0)
                    #                         printtime("WARNING: step %s is dropped in mode 3."% step)
                    #                         self.config.analysisList.remove(step)
            elif mode == 4:
                unavailableStep = ['filter', 'alignment', 'rmdup', 'realignment', 'baserecal', 'genotype', 'bamSort']
                for step in unavailableStep:
                    if step in self.config.analysisList:
                        logger.error("Cann't run this step (%s) in mode4" % step)
                        exit(0)
                    #                         printtime("WARNING: step %s is dropped in mode 4."% step)
                    #                         self.config.analysisList.remove(step)
        else:
            fastq_necessary_property = ("id", "fq1", "rg")
            for sampleName in sampleInfo:
                rg_id_dict = bundle()
                sampleIsSE = bundle()
                self.config.sample[sampleName] = bundle(rg=bundle(), lane=bundle())

                # NOTE(review): rectify_gender appears to always return a
                # non-empty value, which would make the 'normal' fallback in
                # the outer else unreachable — confirm intended behavior.
                sampleGender = self.rectify_gender(self.check_gender(sampleInfo[sampleName], sampleName))
                if sampleGender:
                    if self.config.ref.gender_mode != 'normal':
                        self.config.sample[sampleName].gender = sampleGender
                        if sampleGender == 'male':
                            self.config.info.male_counter += 1
                        else:
                            self.config.info.female_counter += 1
                    else:
                        self.config.sample[sampleName].gender = 'normal'
                else:
                    self.config.sample[sampleName].gender = 'normal'

                # The first non-empty 'pool' value among lanes wins.
                pool = ''
                for dataTag in sampleInfo[sampleName]:
                    if not pool:
                        pool = sampleInfo[sampleName][dataTag].get('pool')

                    # In 'both' gender mode, ids are numbered per gender;
                    # otherwise a single running counter is used.
                    if self.config.ref.gender_mode == 'both':
                        gender = self.rectify_gender(sampleInfo[sampleName][dataTag].get('gender'))
                        if gender == 'female':
                            sampleInfo[sampleName][dataTag]['id'] = female_total_number
                            female_total_number += 1
                        if gender == 'male':
                            sampleInfo[sampleName][dataTag]['id'] = male_total_number
                            male_total_number += 1
                    else:
                        sampleInfo[sampleName][dataTag]['id'] = total_number
                        total_number += 1

                    self.config.sample[sampleName]['rg'][dataTag] = sampleInfo[sampleName][dataTag]['rg']

                    # RG.ID must be unique among the lanes of one sample.
                    rg_id = sampleInfo[sampleName][dataTag]['rg'].split('ID:')[1].split('\\t')[0]
                    if not rg_id_dict.has_key(rg_id):
                        rg_id_dict[rg_id] = True
                    else:
                        logger.error('The same RG.ID in the different data (%s) of %s' % (dataTag, sampleName))

                    #                     if not sampleInfo[sampleName][dataTag].has_key('gender'):
                    #                         logger.error("No gender info in %s %s!" % (sampleName,dataTag))

                    for prop in fastq_necessary_property:
                        if not sampleInfo[sampleName][dataTag].has_key(prop):
                            logger.error(
                                "fastq prperty: %s is not exists in the %s of %s. You must set it in your sample file." % (
                                prop, dataTag, sampleName))

                    if not os.path.exists(sampleInfo[sampleName][dataTag]['fq1']):
                        raise RuntimeError("%s don't exists!" % sampleInfo[sampleName][dataTag]['fq1'])

                    # 'null' is treated the same as a missing fq2 (SE lane).
                    if sampleInfo[sampleName][dataTag].get('fq2') and sampleInfo[sampleName][dataTag]['fq2'] != 'null':
                        if not os.path.exists(sampleInfo[sampleName][dataTag]['fq2']):
                            raise RuntimeError("%s don't exists!" % sampleInfo[sampleName][dataTag]['fq2'])

                        # fq file must not be referenced by more than one lane
                        if sampleInfo[sampleName][dataTag]['fq2'] not in fq_file_set:
                            fq_file_set.add(sampleInfo[sampleName][dataTag]['fq2'])
                        else:
                            raise RuntimeError("%s used more than once!" % sampleInfo[sampleName][dataTag]['fq2'])

                        # All lanes of one sample must agree on SE vs PE.
                        if sampleIsSE.has_key('isSE') and sampleIsSE['isSE'] == True:
                            logger.error("%s: Have an error abort the prperty:fq2 in your sample file." % sampleName)
                        sampleIsSE['isSE'] = False
                    else:
                        if sampleIsSE.has_key('isSE') and sampleIsSE['isSE'] == False:
                            logger.error("%s: Have an error abort the prperty:fq2 in your sample file." % sampleName)
                        sampleIsSE['isSE'] = True

                    # fq file must not be referenced by more than one lane
                    if sampleInfo[sampleName][dataTag]['fq1'] not in fq_file_set:
                        fq_file_set.add(sampleInfo[sampleName][dataTag]['fq1'])
                    else:
                        raise RuntimeError("%s used more than once!" % sampleInfo[sampleName][dataTag]['fq1'])

                    self.config.sample[sampleName]['lane'][dataTag] = sampleInfo[sampleName][dataTag]

                self.config.sample[sampleName].pool = pool
                # isSE of the last processed sample also becomes the global
                # config.init.isSE value.
                self.config.init.isSE = sampleIsSE['isSE']
                self.config.sample[sampleName].isSE = sampleIsSE['isSE']
        return sampleInfo
Пример #8
0
class ParseSampleList(object):
    '''
    Parse a sample list file and load the result into the pipeline
    configuration, validating fastq paths, read groups and SE/PE
    consistency along the way.
    '''

    # Class-level default; replaced by the constructor's config argument.
    config = bundle()

    def __init__(self, sampleList, config):
        '''
        Constructor

        sampleList -- path of the sample list file
        config     -- shared pipeline configuration bundle (mutated in place)
        '''
        self.sampleList = sampleList
        self.config = config

    def rectify_gender(self, gender):
        # Normalize a gender label: 'F'/'female' -> 'female'; everything
        # else (including '' or None) -> 'male'.
        if gender == 'F' or gender == 'female':
            return 'female'
        else:
            return 'male'

    def check_gender(self, sampleinfo, sampleName):
        # Return the first gender recorded among the sample's lanes ('' if
        # none); log an error when a later lane disagrees with it.
        sampleGender = ''
        for dataTag in sampleinfo:
            if not sampleGender:
                if sampleinfo[dataTag].get('gender'):
                    sampleGender = sampleinfo[dataTag]['gender']
            elif sampleGender != sampleinfo[dataTag]['gender']:
                logger.error("gender in %s is different in each lane!" %
                             sampleName)
        return sampleGender

    def sampleParser(self, modetype):
        # Dynamically load the mode-specific module (mode1, mode2, ...) and
        # run its parse_sample over the sample list.
        modname = 'mode' + str(modetype)
        parse_sample = getattr(search_mod(modname, self.config.Path.modeDir),
                               'parse_sample')
        sampleInfo = parse_sample(self.sampleList)
        return sampleInfo

    def parse(self, mode):
        """Validate the parsed sample info and store it into self.config.

        Modes 3/4 only reject analysis steps unavailable in that mode;
        other modes verify fastq existence/uniqueness, per-sample RG.ID
        uniqueness, gender bookkeeping and SE/PE consistency.

        Returns the raw sampleInfo mapping from the mode parser.
        """
        self.config.info = bundle()
        self.config.info.female_counter = 0
        self.config.info.male_counter = 0
        self.config.sample = bundle()

        total_number = 0
        male_total_number = 0
        female_total_number = 0
        # Every fq path seen so far, used to reject reused fastq files.
        fq_file_set = set()

        sampleInfo = self.sampleParser(mode)
        if mode == 3 or mode == 4:
            self.config.sample = sampleInfo
            if self.config.analysisList[0] == 'init':
                self.config.analysisList = self.config.analysisList[1:]

            # The two modes differ only in which steps are unavailable;
            # the check itself is shared (message text is preserved:
            # "mode3"/"mode4").
            if mode == 3:
                unavailableStep = ['filter', 'alignment']
            else:
                unavailableStep = [
                    'filter', 'alignment', 'rmdup', 'realignment', 'baserecal',
                    'genotype', 'bamSort'
                ]
            for step in unavailableStep:
                if step in self.config.analysisList:
                    logger.error("Cann't run this step (%s) in mode%d" %
                                 (step, mode))
                    exit(0)
        else:
            fastq_necessary_property = ("id", "fq1", "rg")
            for sampleName in sampleInfo:
                rg_id_dict = bundle()
                sampleIsSE = bundle()
                self.config.sample[sampleName] = bundle(rg=bundle(),
                                                        lane=bundle())

                # NOTE(review): rectify_gender never returns an empty value,
                # so the 'normal' fallback in the outer else below is
                # unreachable — preserved as-is, confirm intended behavior.
                sampleGender = self.rectify_gender(
                    self.check_gender(sampleInfo[sampleName], sampleName))
                if sampleGender:
                    if self.config.ref.gender_mode != 'normal':
                        self.config.sample[sampleName].gender = sampleGender
                        if sampleGender == 'male':
                            self.config.info.male_counter += 1
                        else:
                            self.config.info.female_counter += 1
                    else:
                        self.config.sample[sampleName].gender = 'normal'
                else:
                    self.config.sample[sampleName].gender = 'normal'

                # First non-empty 'pool' value among lanes wins.
                pool = ''
                for dataTag in sampleInfo[sampleName]:
                    lane = sampleInfo[sampleName][dataTag]
                    if not pool:
                        pool = lane.get('pool')

                    # In 'both' gender mode ids are numbered per gender;
                    # otherwise one running counter covers all lanes.
                    if self.config.ref.gender_mode == 'both':
                        gender = self.rectify_gender(lane.get('gender'))
                        if gender == 'female':
                            lane['id'] = female_total_number
                            female_total_number += 1
                        if gender == 'male':
                            lane['id'] = male_total_number
                            male_total_number += 1
                    else:
                        lane['id'] = total_number
                        total_number += 1

                    self.config.sample[sampleName]['rg'][dataTag] = lane['rg']

                    # RG.ID must be unique among the lanes of one sample.
                    # (`in` replaces deprecated dict.has_key, removed in Py3.)
                    rg_id = lane['rg'].split('ID:')[1].split('\\t')[0]
                    if rg_id not in rg_id_dict:
                        rg_id_dict[rg_id] = True
                    else:
                        logger.error(
                            'The same RG.ID in the different data (%s) of %s' %
                            (dataTag, sampleName))

                    for prop in fastq_necessary_property:
                        if prop not in lane:
                            logger.error(
                                "fastq prperty: %s is not exists in the %s of %s. You must set it in your sample file."
                                % (prop, dataTag, sampleName))

                    if not os.path.exists(lane['fq1']):
                        raise RuntimeError("%s don't exists!" % lane['fq1'])

                    # 'null' counts the same as a missing fq2 (SE lane).
                    if lane.get('fq2') and lane['fq2'] != 'null':
                        if not os.path.exists(lane['fq2']):
                            raise RuntimeError(
                                "%s don't exists!" % lane['fq2'])

                        # No fq file may be referenced by more than one lane.
                        if lane['fq2'] not in fq_file_set:
                            fq_file_set.add(lane['fq2'])
                        else:
                            raise RuntimeError(
                                "%s used more than once!" % lane['fq2'])

                        # All lanes of one sample must agree on SE vs PE.
                        if 'isSE' in sampleIsSE and sampleIsSE['isSE'] == True:
                            logger.error(
                                "%s: Have an error abort the prperty:fq2 in your sample file."
                                % sampleName)
                        sampleIsSE['isSE'] = False
                    else:
                        if 'isSE' in sampleIsSE and sampleIsSE['isSE'] == False:
                            logger.error(
                                "%s: Have an error abort the prperty:fq2 in your sample file."
                                % sampleName)
                        sampleIsSE['isSE'] = True

                    # No fq file may be referenced by more than one lane.
                    if lane['fq1'] not in fq_file_set:
                        fq_file_set.add(lane['fq1'])
                    else:
                        raise RuntimeError(
                            "%s used more than once!" % lane['fq1'])

                    self.config.sample[sampleName]['lane'][dataTag] = lane

                self.config.sample[sampleName].pool = pool
                # isSE of the last processed sample also becomes the global
                # config.init.isSE value (preserved behavior).
                self.config.init.isSE = sampleIsSE['isSE']
                self.config.sample[sampleName].isSE = sampleIsSE['isSE']
        return sampleInfo
Пример #9
0
    def run(self, impl, sampleInfo):
        """Generate the data-upload scripts for the init step.

        Builds a per-lane data.list and a perl streaming mapper that
        converts fastq pairs to unmapped BAMs via GATK FastqToSam, then
        writes an 'init' shell script wrapping the hadoop streaming job.

        Returns a bundle with 'output' (per sample/lane bam paths) and
        'script' (generated script path keyed by the multi-sample name).
        """
        mode = self.option.mode
        result = bundle(output=bundle(), script=bundle())

        # extend program path
        self.init.multiUploader = self.expath('init.multiUploader')
        self.init.gzUploader = self.expath('init.gzUploader')
        self.init.check_log = self.expath('init.check_log')
        self.init.bgzip = self.expath('init.bgzip', False)
        self.init.samtools = self.expath('init.samtools', False)

        sampleName = self.option.multiSampleName
        scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
        # Drop the leading analysis step (assumed to be this init step).
        self.analysisList = self.analysisList[1:]
        hdfs_gz_tmp = os.path.join(self.option.dirHDFS, sampleName, 'data', 'gz_tmp')
        tmp = impl.mkdir(self.option.workdir, "temp", sampleName, 'ubam')
        rawData = impl.mkdir(self.option.workdir, "ubam", sampleName)

        ubam = []
        DataParam = []
        output = bundle()
        cmd = []
        # One data.list row (KEY1..KEY5) per sample lane: fq1, fq2, target
        # bam path, sample name and "<sample>_<dataTag>" read-group label.
        for sample_name in sampleInfo.keys():
            sample = sampleInfo[sample_name]
            output[sample_name] = bundle()
            for dataTag in sample.keys():
                output[sample_name][dataTag] = bundle()
                filename = '{}_{}.bam'.format(sample_name, dataTag)
                output[sample_name][dataTag]['bam'] = os.path.join(rawData, filename)
                ubam.append(output[sample_name][dataTag]['bam'])

                DataParam.append({
                    "KEY1": sample[dataTag]['fq1'],
                    "KEY2": sample[dataTag]['fq2'],
                    "KEY3": output[sample_name][dataTag]['bam'],
                    "KEY4": sample_name,
                    "KEY5": sample_name + "_" + dataTag
                })

        if DataParam:
            impl.write_file(
                fileName='data.list',
                scriptsdir=scriptsdir,
                commands=["${KEY1}\t${KEY2}\t${KEY3}\t${KEY4}\t${KEY5}"],
                JobParamList=DataParam)

            # Perl mapper read by hadoop streaming: one data.list line per
            # map task, each running GATK FastqToSam on that lane.
            mapper = []
            mapper.append("#!/usr/bin/perl -w")
            mapper.append("use strict;\n")
            mapper.append("while(<STDIN>)\n{")
            mapper.append("\tchomp;\n\tmy @tmp = split(/\\t/);")
            mapper.append("\tif(!-e $tmp[1])\n\t{")
            mapper.append("\t\tprint \"$tmp[1] don't exist.\\n\";")
            mapper.append("\t\texit 1;\n\t}")
            mapper.append("\tsystem(\"%s FastqToSam -F1 $tmp[1] -F2 $tmp[2] -O $tmp[3] -SM $tmp[4] -RG $tmp[5] --TMP_DIR %s -PL illumina\");\n}" % (self.init.gatk, tmp ))
            impl.write_file(
                fileName='upload_mapper.pl',
                scriptsdir=scriptsdir,
                commands=mapper)

            # Map-only streaming job: one map task per data.list line.
            hadoop_parameter = ' -D mapred.job.name="upload data" '
            if self.hadoop.get('queue'):
                hadoop_parameter += '-D mapreduce.job.queuename={} '.format(self.hadoop.queue)
            hadoop_parameter += ' -D mapred.map.tasks=%d ' % len(DataParam)
            hadoop_parameter += '-D mapreduce.map.memory.mb=10240 '
            hadoop_parameter += ' -D mapred.reduce.tasks=0 '
            hadoop_parameter += ' -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat '
            ParamDict = {
                "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.hadoop.streamingJar),
                "MAPPER": os.path.join(scriptsdir, 'upload_mapper.pl'),
                "INPUT": 'file://' + os.path.join(scriptsdir, 'data.list'),
                "OUTPUT": hdfs_gz_tmp,
                "HADOOPPARAM": hadoop_parameter
            }

            # Clear any stale job output dir before running the job.
            cmd.append('%s ${OUTPUT}' % self.fs_cmd.delete)
            cmd.append('${PROGRAM} ${HADOOPPARAM} -input ${INPUT} -output ${OUTPUT} -mapper "perl ${MAPPER}"')

            # cmd.append('%s jar %s GzUploader -i %s -l' % (
            # self.hadoop.bin, self.init.gzUploader, os.path.join(scriptsdir, 'data.list')))

            # write script
            scriptPath = \
                impl.write_shell(
                    name='init',
                    scriptsdir=scriptsdir,
                    commands=cmd,
                    paramDict=ParamDict)
            result.script[sampleName] = scriptPath

        result.output = output

        # Auto-detect the fastq quality system when not configured.
        if self.init.qualitySystem == '':
            self.check_qs(sampleInfo)
            print "[INFO   ]  -- qualitySystem is %s (autocheck)--" % self.init.qualitySystem
        else:
            print "[INFO   ]  -- qualitySystem is %s --" % self.init.qualitySystem

        return result
Пример #10
0
def parse_sample(sampleList):
    """Parse a sample list into nested bundles, one entry per fq1 file.

    Two line layouts are supported:
      * field[3] contains a comma -> the text before the comma is the fq1
        path and the fields shift: name=field[0], LB=field[1], PU=field[2];
      * otherwise the last field is a directory searched one or two levels
        deep for *1.fq.gz files, with name=field[1], LB=field[2],
        PU=field[3].

    Returns:
        bundle: sampleInfo[sampleName][dataTag] with keys 'fq1', optional
        'fq2' (paired-end only), 'rg', 'libname' and 'gender'.

    Exits with status 3 when no readable fq1 is found.
    """
    total_number = 0
    sampleInfo = bundle()
    with open(sampleList, 'r') as f:
        for line in f:
            fq1s = []
            line = line.strip()
            field = line.split()
            rg_LB = field[2]
            rg_PU = field[3]
            sampleName = field[1]

            if field[3].find(',') != -1:
                # Explicit fastq layout: field[3] carries the fq1 path.
                fq1s.append(field[3].split(',')[0])
                rg_LB = field[1]
                rg_PU = field[2]
                sampleName = field[0]
            else:
                # Directory layout: glob for fq1 files, one level deep first.
                fq_dir = field[-1].strip()
                fq1s = glob.glob("%s/*1.fq.gz" % fq_dir)
                if not fq1s:
                    fq1s = glob.glob("%s/*/*1.fq.gz" % fq_dir)

            if len(fq1s) == 0 or not os.path.exists(fq1s[0]):
                logger.error("fq1 under %s don't exists." % sampleName)
                exit(3)

            for fq1 in fq1s:
                total_number += 1
                # `in` replaces deprecated dict.has_key (removed in Py3);
                # the lane counter restarts for each newly seen sample.
                if sampleName not in sampleInfo:
                    sampleInfo[sampleName] = bundle()
                    sample_lane_counter = 0
                else:
                    sample_lane_counter = len(sampleInfo[sampleName])

                rg_ID = "{}_{}".format(sampleName, sample_lane_counter)
                # Literal "\t" is intended: downstream tools expand it.
                rg = "@RG\\tID:%s\\tPL:COMPLETE\\tPU:%s\\tLB:%s\\tSM:%s\\tCN:BGI" % (rg_ID, rg_PU, rg_LB, sampleName)
                fq_lib_name = rg_ID

                dataTag = 'data' + str(sample_lane_counter)
                if dataTag not in sampleInfo[sampleName]:
                    sampleInfo[sampleName][dataTag] = bundle()

                sampleInfo[sampleName][dataTag]['fq1'] = fq1

                # fq2 derived from fq1; a missing file means SE data.
                fq2 = fq1.replace("1.fq.gz", "2.fq.gz")
                if os.path.exists(fq2):
                    sampleInfo[sampleName][dataTag]['fq2'] = fq2
                else:
                    logger.warning("%s of line: %d is SE data!" % (sampleName, total_number))

                sampleInfo[sampleName][dataTag]['rg'] = rg
                sampleInfo[sampleName][dataTag]['libname'] = fq_lib_name
                sampleInfo[sampleName][dataTag]['gender'] = 'male'

    return sampleInfo
Пример #11
0
class init(Workflow):
    """ init data, init data path """

    INIT = bundle(hadoop=bundle(), init=bundle())
    INIT.init.multiUploader = 'multi_uploader.pl'
    INIT.init.gzUploader = "gaeatools.jar"
    INIT.init.bgzip = 'bgzip'
    INIT.init.perl = 'perl'
    INIT.init.samtools = 'samtools'
    INIT.init.qualitysystem = ''
    INIT.init.check_log = '%s' % os.path.join(os.environ['GAEA_HOME'], 'bin',
                                              'check_log.pl')
    INIT.init.check_state_param = ''
    INIT.hadoop.ishadoop2 = False
    INIT.hadoop.is_at_TH = False
    INIT.hadoop.fs_mode = 'hdfs'
    INIT.hadoop.input_format = 'hdfs'
    INIT.hadoop.mapper_num = '112'
    INIT.hadoop.reducer_num = '112'

    def check_qs(self, sampleInfo):
        for sample_name in sampleInfo:
            for dataTag in sampleInfo[sample_name]:
                fq = sampleInfo[sample_name][dataTag]['fq1']
                self.init.qualitysystem = qualitysystem.getqualitysystem(fq)
                if self.init.qualitysystem != '-1':
                    return self.init.qualitysystem

        if self.init.qualitysystem == '-1':
            raise RuntimeError('qualitysystem is wrong, the value is -1')

    def run(self, impl, sampleInfo):
        mode = self.option.mode
        result = bundle(output=bundle(), script=bundle())

        # extend program path
        self.init.multiUploader = self.expath('init.multiUploader')
        self.init.gzUploader = self.expath('init.gzUploader')
        self.init.check_log = self.expath('init.check_log')
        self.init.bgzip = self.expath('init.bgzip', False)
        self.init.samtools = self.expath('init.samtools', False)

        mapper = []
        mapper.append("#!/usr/bin/perl -w")
        mapper.append("use strict;\n")
        mapper.append("while(<STDIN>)\n{")
        mapper.append("\tchomp;\n\tmy @tmp = split(/\\t/);")
        mapper.append("\tif(!-e $tmp[1])\n\t{")
        mapper.append("\t\tprint \"$tmp[1] don't exist.\\n\";")
        mapper.append("\t\texit 1;\n\t}")
        mapper.append(
            "\tsystem(\"%s jar %s GzUploader -i $tmp[1] -o $tmp[2] -n $tmp[3]\");\n}"
            % (self.hadoop.bin, self.init.gzUploader))

        # self.analysisList = self.analysisList[1:]

        output = bundle()
        for sample_name in sampleInfo.keys():
            raw_data = os.path.join(self.option.dirHDFS, sample_name, 'fq')
            scriptsdir = impl.mkdir(self.gaeaScriptsDir, sample_name)
            DataParam = []
            sample = sampleInfo[sample_name]
            output[sample_name] = bundle()
            # output[sample_name]['outdir'] = bundle()
            for dataTag in sample.keys():
                output[sample_name][dataTag] = bundle()
                pathTup = impl.splitext(sample[dataTag]['fq1'])
                filename = '{}_{}_{}'.format(sample_name, dataTag, pathTup[0])
                DataParam.append({
                    "KEY": sample[dataTag]['fq1'],
                    "VALUE": raw_data,
                    "VALUE2": filename
                })
                output[sample_name][dataTag]['fq1'] = os.path.join(
                    raw_data, filename)

                if not self.init.isSE:
                    pathTup = impl.splitext(sample[dataTag]['fq2'])
                    filename = '{}_{}_{}'.format(sample_name, dataTag,
                                                 pathTup[0])
                    DataParam.append({
                        "KEY": sample[dataTag]['fq2'],
                        "VALUE": raw_data,
                        "VALUE2": filename
                    })
                    output[sample_name][dataTag]['fq2'] = os.path.join(
                        raw_data, filename)
                    # output[sample_name]['outdir'] = raw_data

            impl.write_file(fileName='data.list',
                            scriptsdir=scriptsdir,
                            commands=["${KEY}\t${VALUE}\t${VALUE2}"],
                            JobParamList=DataParam)

            ParamDict = {
                "PROGRAM":
                "%s jar %s GzUploader" %
                (self.hadoop.bin, self.init.gzUploader),
                "INPUT":
                os.path.join(scriptsdir, 'data.list'),
            }

            # write script
            scriptPath = \
                impl.write_shell(
                    name='init',
                    scriptsdir=scriptsdir,
                    commands=['${PROGRAM} -i ${INPUT} -l'],
                    paramDict=ParamDict)

            result.script[sample_name] = scriptPath
        result.output = output

        if self.init.qualitysystem == '':
            self.check_qs(sampleInfo)
            print "[INFO   ]  -- qualitysystem is %s (autocheck)--" % self.init.qualitysystem
        else:
            print "[INFO   ]  -- qualitysystem is %s --" % self.init.qualitysystem

        return result
Пример #12
0
    def run(self, impl, sampleInfo):
        """Stage input data for the pipeline and return its locations.

        Behaviour branches on two switches:
          * ``self.hadoop.input_format`` -- 'hdfs' uploads gz files into
            HDFS via multiUploader/GzUploader; anything else decompresses
            gz files into a local ``raw_data`` tree with a Hadoop
            streaming job.
          * ``mode`` -- for modes 3 and 4 the values of *sampleInfo* are
            treated as ready-made BAM paths that are only symlinked or
            published (no fastq staging).  NOTE(review): mode numbering
            semantics inferred from the branches here; confirm against
            the caller.

        Returns a bundle with ``output`` (per-sample staged paths or
        sample-list files) and, where upload scripts were written,
        ``script``.
        """
        mode = self.option.mode
        result = bundle(output=bundle())

        # extend program path
        self.init.multiUploader = self.expath('init.multiUploader')
        self.init.gzUploader = self.expath('init.gzUploader')
        self.init.check_log = self.expath('init.check_log')
        self.init.bgzip = self.expath('init.bgzip', False)
        self.init.samtools = self.expath('init.samtools', False)

        if self.hadoop.input_format == 'hdfs':
            if mode != 3 and mode != 4:
                if self.option.multiSample:
                    # Multi-sample: no upload scripts; instead write
                    # sampleinfo list files (one tab-separated record per
                    # lane) that downstream steps consume directly.
                    sampleName = self.option.multiSampleName
                    scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
                    self.analysisList = self.analysisList[1:]
                    output = bundle()

                    line = ["${ID}\t${RG}\t${FQ1}\t${FQ2}\t${ADP1}\t${ADP2}"]
                    if self.ref.gender_mode == 'both' and mode != 5:
                        # Gender-split reference: separate list per gender.
                        output.female = os.path.join(scriptsdir, "femalesampleinfo.list")
                        output.male = os.path.join(scriptsdir, "malesampleinfo.list")
                        MSLF = open(output.female, 'w')
                        MSLM = open(output.male, 'w')
                        for sample_name in sampleInfo.keys():
                            sample = sampleInfo[sample_name]
                            LineParam = []
                            for dataTag in sampleInfo[sample_name].keys():
                                # Missing fq2/adapters are encoded as the
                                # literal string 'null'.
                                LineParam.append({
                                    "ID": sample[dataTag]['id'],
                                    "RG": sample[dataTag]['rg'],
                                    "FQ1": 'file://' + sample[dataTag]['fq1'],
                                    "FQ2": sample[dataTag].has_key('fq2') and 'file://' + sample[dataTag][
                                        'fq2'] or 'null',
                                    "ADP1": sample[dataTag].has_key('adp1') and 'file://' + sample[dataTag][
                                        'adp1'] or 'null',
                                    "ADP2": sample[dataTag].has_key('adp2') and 'file://' + sample[dataTag][
                                        'adp2'] or 'null'
                                })

                            gender = self.sample[sample_name]["gender"]
                            impl.fileAppend(
                                fh=gender == 'female' and MSLF or MSLM,
                                commands=line,
                                JobParamList=LineParam)
                    else:
                        # Single combined sample list.
                        output.normal = os.path.join(scriptsdir, "sampleinfo.list")
                        MSL = open(output.normal, 'w')
                        for sample_name in sampleInfo.keys():
                            sample = sampleInfo[sample_name]
                            LineParam = []
                            for dataTag in sample.keys():
                                LineParam.append({
                                    "ID": sample[dataTag]['id'],
                                    "RG": sample[dataTag]['rg'],
                                    "FQ1": 'file://' + sample[dataTag]['fq1'],
                                    "FQ2": sample[dataTag].has_key('fq2') and 'file://' + sample[dataTag][
                                        'fq2'] or 'null',
                                    "ADP1": sample[dataTag].has_key('adp1') and 'file://' + sample[dataTag][
                                        'adp1'] or 'null',
                                    "ADP2": sample[dataTag].has_key('adp2') and 'file://' + sample[dataTag][
                                        'adp2'] or 'null'
                                })

                            impl.fileAppend(
                                fh=MSL,
                                commands=line,
                                JobParamList=LineParam)

                    result.output[sampleName] = output
                else:
                    # Per-sample: build a data.list of gz files and an
                    # 'init' shell script that uploads them to HDFS with
                    # multiUploader; non-gz files are passed through as-is.
                    result.script = bundle()
                    for sampleName in sampleInfo.keys():
                        scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
                        sample = sampleInfo[sampleName]
                        output = bundle()
                        DataParam = []
                        cmd = []
                        cmd.append("source %s/bin/activate" % self.GAEA_HOME)
                        cmd.append("check.py -s %s/state.json -n %s %s" % (
                        self.stateDir, sampleName, self.init.check_state_param))
                        for dataTag in sample.keys():
                            laneData = os.path.join(self.option.dirHDFS, sampleName, 'fq', dataTag)
                            cmd.append('{} -p {}'.format(self.fs_cmd.mkdir, laneData))
                            output[dataTag] = bundle()
                            pathTup = impl.splitext(sample[dataTag]['fq1'])
                            # Only .gz files are uploaded; splitext drops
                            # the .gz suffix for the HDFS target name.
                            if pathTup and pathTup[1] == '.gz':
                                DataParam.append({
                                    "KEY": sample[dataTag]['fq1'],
                                    "VALUE": laneData
                                })
                                output[dataTag]['fq1'] = os.path.join(laneData, pathTup[0])
                            else:
                                output[dataTag]['fq1'] = sample[dataTag]['fq1']

                            if self.init.isSE == False:
                                pathTup = impl.splitext(sample[dataTag]['fq2'])
                                if pathTup and pathTup[1] == '.gz':
                                    DataParam.append({
                                        "KEY": sample[dataTag]['fq2'],
                                        "VALUE": laneData
                                    })
                                    output[dataTag]['fq2'] = os.path.join(laneData, pathTup[0])
                                else:
                                    output[dataTag]['fq2'] = sample[dataTag]['fq2']

                            # Optional adapter files get the same treatment.
                            if sample[dataTag].has_key('adp1'):
                                pathTup = impl.splitext(sample[dataTag]['adp1'])
                                if pathTup and pathTup[1] == '.gz':
                                    DataParam.append({
                                        "KEY": sample[dataTag]['adp1'],
                                        "VALUE": laneData
                                    })
                                    output[dataTag]['adp1'] = os.path.join(laneData, pathTup[0])
                                else:
                                    output[dataTag]['adp1'] = sample[dataTag]['adp1']

                            if sample[dataTag].has_key('adp2'):
                                pathTup = impl.splitext(sample[dataTag]['adp2'])
                                if pathTup and pathTup[1] == '.gz':
                                    DataParam.append({
                                        "KEY": sample[dataTag]['adp2'],
                                        "VALUE": laneData
                                    })
                                    output[dataTag]['adp2'] = os.path.join(laneData, pathTup[0])
                                else:
                                    output[dataTag]['adp2'] = sample[dataTag]['adp2']
                                    #                 print DataParam
                        # Write the script only when something needs upload.
                        if DataParam:
                            impl.write_file(
                                fileName='data.list',
                                scriptsdir=scriptsdir,
                                commands=["${KEY}\t${VALUE}"],
                                JobParamList=DataParam)

                            ParamDict = {
                                "PROGRAM": self.init.multiUploader,
                                "HADOOP": self.hadoop.bin,
                                "UPLOAD": self.init.gzUploader,
                                "INPUT": os.path.join(scriptsdir, 'data.list')
                            }

                            cmd.append(
                                '%s ${PROGRAM} -b ${HADOOP} -d ${INPUT} -u ${UPLOAD}' % self.init.perl)

                            # write script
                            scriptPath = \
                                impl.write_shell(
                                    name='init',
                                    scriptsdir=scriptsdir,
                                    commands=cmd,
                                    paramDict=ParamDict)
                            result.script[sampleName] = scriptPath

                        result.output[sampleName] = output

                # Empty string requests autodetection from the fastq data.
                if self.init.qualitysystem == '':
                    self.check_qs(sampleInfo)
                    print "[INFO   ]  -- qualitysystem is %s (autocheck)--" % self.init.qualitysystem
                else:
                    print "[INFO   ]  -- qualitysystem is %s --" % self.init.qualitysystem

                    # self.init.qualitysystem = 0

            else:
                # Modes 3/4: sampleInfo maps sample -> existing BAM path.
                sampleName = self.option.multiSampleName

                startStep = self.analysisList[0]
                fs_type = ''
                # Hadoop-platform steps need an explicit file:// scheme.
                if self.analysisDict[startStep].platform == 'H':
                    fs_type = 'file://'

                if self.option.multiSample:
                    # Symlink BAMs into batched bams_<index> directories,
                    # at most multisample_num BAMs per batch.
                    n = 0
                    index = 0

                    inputDir = os.path.join(self.option.workdir, 'raw_data', 'bams_' + str(index))
                    result.output[sampleName + '_' + str(index)] = fs_type + inputDir
                    if os.path.exists(inputDir):
                        shutil.rmtree(inputDir)
                    impl.mkdir(inputDir)
                    print inputDir

                    for sample_name in sampleInfo:
                        if n == int(self.init.multisample_num):
                            # Current batch is full; start the next one.
                            n = 0
                            index += 1
                            inputDir = os.path.join(self.option.workdir, 'raw_data', 'bams_' + str(index))
                            result.output[sampleName + '_' + str(index)] = fs_type + inputDir
                            if os.path.exists(inputDir):
                                shutil.rmtree(inputDir)
                            impl.mkdir(inputDir)
                            print inputDir
                        bam = os.path.basename(sampleInfo[sample_name])
                        ln_bam = os.path.join(inputDir, bam)
                        os.symlink(sampleInfo[sample_name], ln_bam)
                        n += 1
                else:
                    for sample_name in sampleInfo:
                        result.output[sample_name] = fs_type + sampleInfo[sample_name]
        else:
            # Non-HDFS input: same overall structure as the branch above,
            # but gz files are decompressed locally via Hadoop streaming
            # instead of being uploaded.
            if mode != 3 and mode != 4:
                if self.option.multiSample:
                    sampleName = self.option.multiSampleName
                    scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
                    self.analysisList = self.analysisList[1:]
                    output = bundle()

                    line = ["${ID}\t${RG}\t${FQ1}\t${FQ2}\t${ADP1}\t${ADP2}"]
                    if self.ref.gender_mode == 'both' and mode != 5:
                        output.female = os.path.join(scriptsdir, "femalesampleinfo.list")
                        output.male = os.path.join(scriptsdir, "malesampleinfo.list")
                        MSLF = open(output.female, 'w')
                        MSLM = open(output.male, 'w')
                        for sample_name in sampleInfo.keys():
                            sample = sampleInfo[sample_name]
                            LineParam = []
                            for dataTag in sampleInfo[sample_name].keys():
                                LineParam.append({
                                    "ID": sample[dataTag]['id'],
                                    "RG": sample[dataTag]['rg'],
                                    "FQ1": 'file://' + sample[dataTag]['fq1'],
                                    "FQ2": sample[dataTag].has_key('fq2') and 'file://' + sample[dataTag][
                                        'fq2'] or 'null',
                                    "ADP1": sample[dataTag].has_key('adp1') and 'file://' + sample[dataTag][
                                        'adp1'] or 'null',
                                    "ADP2": sample[dataTag].has_key('adp2') and 'file://' + sample[dataTag][
                                        'adp2'] or 'null'
                                })

                            gender = self.sample[sample_name]["gender"]
                            impl.fileAppend(
                                fh=gender == 'female' and MSLF or MSLM,
                                commands=line,
                                JobParamList=LineParam)
                    else:
                        output.normal = os.path.join(scriptsdir, "sampleinfo.list")
                        MSL = open(output.normal, 'w')
                        for sample_name in sampleInfo.keys():
                            sample = sampleInfo[sample_name]
                            LineParam = []
                            for dataTag in sample.keys():
                                LineParam.append({
                                    "ID": sample[dataTag]['id'],
                                    "RG": sample[dataTag]['rg'],
                                    "FQ1": 'file://' + sample[dataTag]['fq1'],
                                    "FQ2": sample[dataTag].has_key('fq2') and 'file://' + sample[dataTag][
                                        'fq2'] or 'null',
                                    "ADP1": sample[dataTag].has_key('adp1') and 'file://' + sample[dataTag][
                                        'adp1'] or 'null',
                                    "ADP2": sample[dataTag].has_key('adp2') and 'file://' + sample[dataTag][
                                        'adp2'] or 'null'
                                })

                            impl.fileAppend(
                                fh=MSL,
                                commands=line,
                                JobParamList=LineParam)

                    result.output[sampleName] = output
                else:
                    result.script = bundle()
                    for sampleName in sampleInfo.keys():
                        scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
                        hdfs_gz_tmp = os.path.join(self.option.dirHDFS, sampleName, 'data', 'gz_tmp')
                        sample = sampleInfo[sampleName]
                        output = bundle()
                        DataParam = []
                        cmd = []
                        for dataTag in sample.keys():
                            rawData = impl.mkdir(self.option.workdir, 'fq', 'raw_data', sampleName)
                            laneData = os.path.join(rawData, dataTag)
                            cmd.append("mkdir -p -m 777 %s" % laneData)
                            output[dataTag] = bundle()
                            pathTup = impl.splitext(sample[dataTag]['fq1'])
                            # gz files get decompressed to laneData; the
                            # manifest VALUE here is the full target path
                            # (unlike the HDFS branch, where it is a dir).
                            if pathTup and pathTup[1] == '.gz':
                                DataParam.append({
                                    "KEY": sample[dataTag]['fq1'],
                                    "VALUE": os.path.join(laneData, pathTup[0])
                                })
                                output[dataTag]['fq1'] = os.path.join(laneData, pathTup[0])
                            else:
                                output[dataTag]['fq1'] = sample[dataTag]['fq1']

                            if self.init.isSE == False:
                                pathTup = impl.splitext(sample[dataTag]['fq2'])
                                if pathTup and pathTup[1] == '.gz':
                                    DataParam.append({
                                        "KEY": sample[dataTag]['fq2'],
                                        "VALUE": os.path.join(laneData, pathTup[0])
                                    })
                                    output[dataTag]['fq2'] = os.path.join(laneData, pathTup[0])
                                else:
                                    output[dataTag]['fq2'] = sample[dataTag]['fq2']

                            if sample[dataTag].has_key('adp1'):
                                pathTup = impl.splitext(sample[dataTag]['adp1'])
                                if pathTup and pathTup[1] == '.gz':
                                    DataParam.append({
                                        "KEY": sample[dataTag]['adp1'],
                                        "VALUE": os.path.join(laneData, pathTup[0])
                                    })
                                    output[dataTag]['adp1'] = os.path.join(laneData, pathTup[0])
                                else:
                                    output[dataTag]['adp1'] = sample[dataTag]['adp1']

                            if sample[dataTag].has_key('adp2'):
                                pathTup = impl.splitext(sample[dataTag]['adp2'])
                                if pathTup and pathTup[1] == '.gz':
                                    DataParam.append({
                                        "KEY": sample[dataTag]['adp2'],
                                        "VALUE": os.path.join(laneData, pathTup[0])
                                    })
                                    output[dataTag]['adp2'] = os.path.join(laneData, pathTup[0])
                                else:
                                    output[dataTag]['adp2'] = sample[dataTag]['adp2']
                                    #                 print DataParam
                        if DataParam:
                            impl.write_file(
                                fileName='data.list',
                                scriptsdir=scriptsdir,
                                commands=["${KEY}\t${VALUE}"],
                                JobParamList=DataParam)

                            # Perl streaming mapper: each input line is
                            # "src<TAB>dest"; gzip -cd decompresses src
                            # into dest, failing the task if src is absent.
                            mapper = []
                            mapper.append("#!/usr/bin/perl -w")
                            mapper.append("use strict;\n")
                            mapper.append("while(<STDIN>)\n{")
                            mapper.append("\tchomp;\n\tmy @tmp = split(/\\t/);")
                            mapper.append("\tif(!-e $tmp[1])\n\t{")
                            mapper.append("\t\tprint \"$tmp[1] don't exist.\\n\";")
                            mapper.append("\t\texit 1;\n\t}")
                            mapper.append("\tsystem(\"gzip -cd $tmp[1] >$tmp[2]\");\n}")
                            impl.write_file(
                                fileName='upload_mapper.pl',
                                scriptsdir=scriptsdir,
                                commands=mapper)

                            # NLineInputFormat gives each mapper one line
                            # of data.list; map-only job (0 reducers).
                            hadoop_parameter = ' -D mapred.job.name="gzip input data" '
                            hadoop_parameter += ' -D mapred.map.tasks=%d ' % len(DataParam)
                            hadoop_parameter += ' -D mapred.reduce.tasks=0 '
                            hadoop_parameter += ' -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat '
                            ParamDict = {
                                "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.hadoop.streamingJar),
                                "MAPPER": os.path.join(scriptsdir, 'upload_mapper.pl'),
                                "INPUT": 'file://' + os.path.join(scriptsdir, 'data.list'),
                                "OUTPUT": hdfs_gz_tmp,
                                "HADOOPPARAM": hadoop_parameter
                            }

                            cmd.append('%s ${OUTPUT}' % self.fs_cmd.delete)
                            cmd.append(
                                '${PROGRAM} ${HADOOPPARAM} -input ${INPUT} -output ${OUTPUT} -mapper "perl ${MAPPER}"')

                            # write script
                            scriptPath = \
                                impl.write_shell(
                                    name='init',
                                    scriptsdir=scriptsdir,
                                    commands=cmd,
                                    paramDict=ParamDict)
                            result.script[sampleName] = scriptPath

                        result.output[sampleName] = output

                if self.init.qualitysystem == '':
                    self.check_qs(sampleInfo)
                    print "[INFO   ]  -- qualitysystem is %s (autocheck)--" % self.init.qualitysystem
                else:
                    print "[INFO   ]  -- qualitysystem is %s --" % self.init.qualitysystem

                    # self.init.qualitysystem = 0

            else:
                # Modes 3/4 with local input: link all BAMs into a single
                # raw_data/bams directory (no batching here, unlike the
                # HDFS branch above).
                sampleName = self.option.multiSampleName

                startStep = self.analysisList[0]
                fs_type = ''
                if self.analysisDict[startStep].platform == 'H':
                    fs_type = 'file://'

                if self.option.multiSample:
                    inputDir = os.path.join(self.option.workdir, 'raw_data', 'bams')
                    result.output[sampleName] = fs_type + inputDir

                    if os.path.exists(inputDir):
                        shutil.rmtree(inputDir)
                    impl.mkdir(inputDir)

                    for sample_name in sampleInfo.keys():
                        bam = os.path.basename(sampleInfo[sample_name])
                        # Prefix with the sample name to avoid collisions
                        # between identically-named BAMs.
                        ln_bam = os.path.join(inputDir, sample_name + "_" + bam)
                        os.symlink(sampleInfo[sample_name], ln_bam)
                else:
                    for sample_name in sampleInfo.keys():
                        result.output[sample_name] = fs_type + sampleInfo[sample_name]

        # return
        return result
Пример #13
0
class Workflow(object):
    '''
    Base class for every APP (pipeline step).

    An instance adopts a shared ``state`` mapping wholesale (see ``__init__``)
    and, depending on the hadoop configuration in that state, binds the
    filesystem command prefixes in ``fs_cmd``. ``main`` drives a subclass's
    ``run`` and then writes the generated scripts.
    '''
    # NOTE(review): these are class-level mutable containers, shared by all
    # Workflow instances/subclasses unless rebound -- appears intentional
    # (one pipeline state per process), but confirm before refactoring.
    INIT = bundle()
    fs_cmd = bundle()  # filesystem command prefixes: delete/mkdir/put/cp/ls

    result = bundle(output=bundle(), script=bundle())  # step outputs + script paths
    ParamDict = bundle()   # ${KEY} substitutions shared by all commands
    JobParamList = []      # per-job ${KEY} substitution dicts
    cmd = []               # command templates for this step

    def __init__(self, state):
        # Adopt the shared pipeline state wholesale: every key of ``state``
        # becomes an attribute of this instance (previous attrs are dropped).
        self.__dict__.clear()
        self.__dict__.update(state)
        hadoop = self.hadoop.bin
        if self.hadoop.has_key('fs_mode') and self.hadoop.fs_mode == 'hdfs':

            # Normalize a string 'ishadoop2' flag to bool ('FALSE' -> False,
            # any other string -> True).
            if self.hadoop.has_key('ishadoop2'):
                if isinstance(self.hadoop.ishadoop2, str):
                    if self.hadoop.ishadoop2.upper() == 'FALSE':
                        self.hadoop.ishadoop2 = False
                    else:
                        self.hadoop.ishadoop2 = True

            if self.hadoop.ishadoop2:
                # Hadoop 2.x CLI: "hadoop fs -rm -r", "fs -mkdir", ...
                self.fs_cmd.delete = "%s fs -rm -r -skipTrash " % hadoop
                self.fs_cmd.mkdir = "%s fs -mkdir " % hadoop
                self.fs_cmd.put = "%s fs -put " % hadoop
                self.fs_cmd.cp = "%s fs -copyToLocal " % hadoop
                self.fs_cmd.ls = "%s fs -ls " % hadoop
            else:
                # Hadoop 1.x CLI: "hadoop dfs -rmr", ...
                self.fs_cmd.delete = "%s dfs -rmr -skipTrash " % hadoop
                self.fs_cmd.mkdir = "%s dfs -mkdir " % hadoop
                self.fs_cmd.put = "%s dfs -put " % hadoop
                self.fs_cmd.cp = "%s dfs -copyToLocal " % hadoop
                self.fs_cmd.ls = "%s dfs -ls " % hadoop
        elif self.hadoop.has_key('is_at_TH') and self.hadoop.is_at_TH:
            # Local filesystem variant ("TH" cluster): plain shell commands.
            self.fs_cmd.delete = "rm -rf "
            self.fs_cmd.mkdir = "mkdir -p "
            self.fs_cmd.put = "ln -s "
            self.fs_cmd.cp = "cp -r "
            self.fs_cmd.ls = "ls -l "
        else:
            # Default: local filesystem commands (cp -rf instead of cp -r).
            self.fs_cmd.delete = "rm -rf "
            self.fs_cmd.mkdir = "mkdir -p "
            self.fs_cmd.put = "ln -s "
            self.fs_cmd.cp = "cp -rf "
            self.fs_cmd.ls = "ls -l "
        # Publish the bound commands back into the shared state.
        state.fs_cmd = self.fs_cmd

    def expath(self, paramName, mustBe=True):
        '''
        Expand a dotted state key (e.g. "init.samtools") to an existing path.

        Looks the value up in the instance state (up to 5 dots deep); returns
        it directly if it exists on disk, otherwise searches every entry of
        the colon-separated ``self.Path.prgDir``. Raises RuntimeError when the
        value is empty or not found and ``mustBe`` is True; otherwise returns
        '' (empty value) or the raw value (not found).
        '''
        field = paramName.split('.')
        state = self.__dict__
        if len(field) == 1:
            path_tmp = state[field[0]]
        elif len(field) == 2:
            path_tmp = state[field[0]][field[1]]
        elif len(field) == 3:
            path_tmp = state[field[0]][field[1]][field[2]]
        elif len(field) == 4:
            path_tmp = state[field[0]][field[1]][field[2]][field[3]]
        elif len(field) == 5:
            path_tmp = state[field[0]][field[1]][field[2]][field[3]][field[4]]
        else:
            raise RuntimeError('paramName (%s) is wrong!' % paramName)

        if not path_tmp:
            if mustBe:
                raise RuntimeError('Program is not exists: %s' % paramName)
            else:
                return ''

        if os.path.exists(path_tmp):
            return path_tmp
        else:
            # Fall back to searching the configured program directories.
            for p in self.Path.prgDir.split(':'):
                if os.path.exists(os.path.join(p, path_tmp)):
                    return os.path.join(p, path_tmp)

        if mustBe:
            raise RuntimeError('Program is not exists: %s = %s' %
                               (paramName, path_tmp))
        else:
            return path_tmp

    def main(self, impl, dependList):
        '''
        Run the step and persist its scripts.

        Calls the subclass ``run`` (which fills ``self.cmd`` /
        ``self.JobParamList`` / ``self.ParamDict``), writes the scripts via
        ``impl.write_scripts`` and records their paths in ``self.result``.
        '''
        self.run(impl, dependList)

        print self.__class__.__name__
        #write script

        scriptPath = \
        impl.write_scripts(
                name = self.__class__.__name__,
                commands=self.cmd,
                JobParamList=self.JobParamList,
                paramDict=self.ParamDict)

        #result
        self.result.script.update(scriptPath)
        return self.result
Пример #14
0
    def run(self, impl, sampleInfo):
        '''
        Build an 'init' script that converts each sample/lane fastq pair to
        an unmapped BAM (GATK FastqToSam) via a Hadoop-streaming map-only job.

        Writes <scriptsdir>/data.list (one tab-separated job line per lane),
        a perl mapper (upload_mapper.pl) that checks fq1 exists and invokes
        FastqToSam, and the shell script that launches the streaming job.
        Returns a bundle with .script[multiSampleName] -> script path and
        .output[sample][dataTag]['bam'] -> local uBAM path.
        '''
        mode = self.option.mode  # NOTE(review): unused in this method
        result = bundle(output=bundle(), script=bundle())

        # extend program path (required tools raise if missing; bgzip and
        # samtools are optional)
        self.init.multiUploader = self.expath('init.multiUploader')
        self.init.gzUploader = self.expath('init.gzUploader')
        self.init.check_log = self.expath('init.check_log')
        self.init.bgzip = self.expath('init.bgzip', False)
        self.init.samtools = self.expath('init.samtools', False)

        sampleName = self.option.multiSampleName
        scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
        # Drop the first analysis step (this init step) from the remaining list.
        self.analysisList = self.analysisList[1:]
        hdfs_gz_tmp = os.path.join(self.option.dirHDFS, sampleName, 'data',
                                   'gz_tmp')
        tmp = impl.mkdir(self.option.workdir, "temp", sampleName, 'ubam')
        rawData = impl.mkdir(self.option.workdir, "ubam", sampleName)

        ubam = []  # NOTE(review): collected but never read afterwards
        DataParam = []
        output = bundle()
        cmd = []
        for sample_name in sampleInfo.keys():
            sample = sampleInfo[sample_name]
            output[sample_name] = bundle()
            for dataTag in sample.keys():
                output[sample_name][dataTag] = bundle()
                filename = '{}_{}.bam'.format(sample_name, dataTag)
                output[sample_name][dataTag]['bam'] = os.path.join(
                    rawData, filename)
                ubam.append(output[sample_name][dataTag]['bam'])

                # One streaming-job input line per lane:
                # fq1, fq2, output bam, sample (SM), read group (RG).
                DataParam.append({
                    "KEY1": sample[dataTag]['fq1'],
                    "KEY2": sample[dataTag]['fq2'],
                    "KEY3": output[sample_name][dataTag]['bam'],
                    "KEY4": sample_name,
                    "KEY5": sample_name + "_" + dataTag
                })

        if DataParam:
            impl.write_file(
                fileName='data.list',
                scriptsdir=scriptsdir,
                commands=["${KEY1}\t${KEY2}\t${KEY3}\t${KEY4}\t${KEY5}"],
                JobParamList=DataParam)

            # Perl mapper: for each input line, verify fq1 exists, then run
            # GATK FastqToSam with the line's fields.
            # NOTE(review): self.init.gatk is used here but is not resolved
            # via expath above -- confirm it is set elsewhere.
            mapper = []
            mapper.append("#!/usr/bin/perl -w")
            mapper.append("use strict;\n")
            mapper.append("while(<STDIN>)\n{")
            mapper.append("\tchomp;\n\tmy @tmp = split(/\\t/);")
            mapper.append("\tif(!-e $tmp[1])\n\t{")
            mapper.append("\t\tprint \"$tmp[1] don't exist.\\n\";")
            mapper.append("\t\texit 1;\n\t}")
            mapper.append(
                "\tsystem(\"%s FastqToSam -F1 $tmp[1] -F2 $tmp[2] -O $tmp[3] -SM $tmp[4] -RG $tmp[5] --TMP_DIR %s -PL illumina\");\n}"
                % (self.init.gatk, tmp))
            impl.write_file(fileName='upload_mapper.pl',
                            scriptsdir=scriptsdir,
                            commands=mapper)

            # Map-only streaming job: one mapper per data.list line (NLine
            # input format), no reducers, 10 GB per map task.
            hadoop_parameter = ' -D mapred.job.name="upload data" '
            if self.hadoop.get('queue'):
                hadoop_parameter += '-D mapreduce.job.queuename={} '.format(
                    self.hadoop.queue)
            hadoop_parameter += ' -D mapred.map.tasks=%d ' % len(DataParam)
            hadoop_parameter += '-D mapreduce.map.memory.mb=10240 '
            hadoop_parameter += ' -D mapred.reduce.tasks=0 '
            hadoop_parameter += ' -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat '
            ParamDict = {
                "PROGRAM":
                "%s jar %s" % (self.hadoop.bin, self.hadoop.streamingJar),
                "MAPPER":
                os.path.join(scriptsdir, 'upload_mapper.pl'),
                "INPUT":
                'file://' + os.path.join(scriptsdir, 'data.list'),
                "OUTPUT":
                hdfs_gz_tmp,
                "HADOOPPARAM":
                hadoop_parameter
            }

            # Clean any stale job output dir, then launch the streaming job.
            cmd.append('%s ${OUTPUT}' % self.fs_cmd.delete)
            cmd.append(
                '${PROGRAM} ${HADOOPPARAM} -input ${INPUT} -output ${OUTPUT} -mapper "perl ${MAPPER}"'
            )

            # write script
            scriptPath = \
                impl.write_shell(
                    name='init',
                    scriptsdir=scriptsdir,
                    commands=cmd,
                    paramDict=ParamDict)
            result.script[sampleName] = scriptPath

        result.output = output

        # Auto-detect the fastq quality encoding when not configured.
        if self.init.qualitysystem == '':
            self.check_qs(sampleInfo)
            print "[INFO   ]  -- qualitysystem is %s (autocheck)--" % self.init.qualitysystem
        else:
            print "[INFO   ]  -- qualitysystem is %s --" % self.init.qualitysystem

        return result
Пример #15
0
    def run(self, impl, sampleInfo):
        '''
        Build an 'init' script that gunzips .gz inputs (fq1/fq2/adp1/adp2)
        into per-lane raw_data directories via a Hadoop-streaming map-only
        job; non-gz inputs are passed through unchanged.

        Returns a bundle with .script[multiSampleName] -> script path and
        .output[sample][dataTag][key] -> final (decompressed or original)
        path for each of 'fq1', 'fq2', 'adp1', 'adp2'.
        '''
        mode = self.option.mode  # NOTE(review): unused in this method
        result = bundle(output=bundle(), script=bundle())

        # extend program path (required tools raise if missing; bgzip and
        # samtools are optional)
        self.init.multiUploader = self.expath('init.multiUploader')
        self.init.gzUploader = self.expath('init.gzUploader')
        self.init.check_log = self.expath('init.check_log')
        self.init.bgzip = self.expath('init.bgzip', False)
        self.init.samtools = self.expath('init.samtools', False)

        sampleName = self.option.multiSampleName
        scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
        # Drop the first analysis step (this init step) from the remaining list.
        self.analysisList = self.analysisList[1:]
        hdfs_gz_tmp = os.path.join(self.option.dirHDFS, sampleName, 'data', 'gz_tmp')

        DataParam = []
        output = bundle()
        cmd = []
        for sample_name in sampleInfo.keys():
            sample = sampleInfo[sample_name]
            output[sample_name] = bundle()
            for dataTag in sample.keys():
                # Per-lane destination dir; created (mode 777) by the shell
                # script at run time, not here.
                rawData = impl.mkdir(self.option.workdir, 'fq', 'raw_data', sample_name)
                laneData = os.path.join(rawData, dataTag)
                cmd.append("mkdir -p -m 777 %s" % laneData)
                output[sample_name][dataTag] = bundle()
                # fq1: schedule for decompression only when it is a .gz file.
                pathTup = impl.splitext(sample[dataTag]['fq1'])
                if pathTup and pathTup[1] == '.gz':
                    DataParam.append({
                        "KEY": sample[dataTag]['fq1'],
                        "VALUE": os.path.join(laneData, pathTup[0])
                    })
                    output[sample_name][dataTag]['fq1'] = os.path.join(laneData, pathTup[0])
                else:
                    output[sample_name][dataTag]['fq1'] = sample[dataTag]['fq1']

                # fq2: only for paired-end runs.
                if self.init.isSE == False:
                    pathTup = impl.splitext(sample[dataTag]['fq2'])
                    if pathTup and pathTup[1] == '.gz':
                        DataParam.append({
                            "KEY": sample[dataTag]['fq2'],
                            "VALUE": os.path.join(laneData, pathTup[0])
                        })
                        output[sample_name][dataTag]['fq2'] = os.path.join(laneData, pathTup[0])
                    else:
                        output[sample_name][dataTag]['fq2'] = sample[dataTag]['fq2']

                # Optional adapter lists, same .gz handling.
                if sample[dataTag].has_key('adp1'):
                    pathTup = impl.splitext(sample[dataTag]['adp1'])
                    if pathTup and pathTup[1] == '.gz':
                        DataParam.append({
                            "KEY": sample[dataTag]['adp1'],
                            "VALUE": os.path.join(laneData, pathTup[0])
                        })
                        output[sample_name][dataTag]['adp1'] = os.path.join(laneData, pathTup[0])
                    else:
                        output[sample_name][dataTag]['adp1'] = sample[dataTag]['adp1']

                if sample[dataTag].has_key('adp2'):
                    pathTup = impl.splitext(sample[dataTag]['adp2'])
                    if pathTup and pathTup[1] == '.gz':
                        DataParam.append({
                            "KEY": sample[dataTag]['adp2'],
                            "VALUE": os.path.join(laneData, pathTup[0])
                        })
                        output[sample_name][dataTag]['adp2'] = os.path.join(laneData, pathTup[0])
                    else:
                        output[sample_name][dataTag]['adp2'] = sample[dataTag]['adp2']
        # Only generate the streaming job when something needs decompressing.
        if DataParam:
            impl.write_file(
                fileName='data.list',
                scriptsdir=scriptsdir,
                commands=["${KEY}\t${VALUE}"],
                JobParamList=DataParam)

            # Perl mapper: verify the .gz exists, then "gzip -cd src > dst".
            mapper = []
            mapper.append("#!/usr/bin/perl -w")
            mapper.append("use strict;\n")
            mapper.append("while(<STDIN>)\n{")
            mapper.append("\tchomp;\n\tmy @tmp = split(/\\t/);")
            mapper.append("\tif(!-e $tmp[1])\n\t{")
            mapper.append("\t\tprint \"$tmp[1] don't exist.\\n\";")
            mapper.append("\t\texit 1;\n\t}")
            mapper.append("\tsystem(\"gzip -cd $tmp[1] >$tmp[2]\");\n}")
            impl.write_file(
                fileName='upload_mapper.pl',
                scriptsdir=scriptsdir,
                commands=mapper)

            # Map-only streaming job, one mapper per data.list line.
            hadoop_parameter = ' -D mapred.job.name="gzip input data" '
            if self.hadoop.get('queue'):
                hadoop_parameter += '-D mapreduce.job.queuename={} '.format(self.hadoop.queue)
            hadoop_parameter += ' -D mapred.map.tasks=%d ' % len(DataParam)
            hadoop_parameter += ' -D mapred.reduce.tasks=0 '
            hadoop_parameter += ' -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat '
            ParamDict = {
                "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.hadoop.streamingJar),
                "MAPPER": os.path.join(scriptsdir, 'upload_mapper.pl'),
                "INPUT": 'file://' + os.path.join(scriptsdir, 'data.list'),
                "OUTPUT": hdfs_gz_tmp,
                "HADOOPPARAM": hadoop_parameter
            }

            # Clean any stale job output dir, then launch the streaming job.
            cmd.append('%s ${OUTPUT}' % self.fs_cmd.delete)
            cmd.append('${PROGRAM} ${HADOOPPARAM} -input ${INPUT} -output ${OUTPUT} -mapper "perl ${MAPPER}"')

            # write script
            scriptPath = \
                impl.write_shell(
                    name='init',
                    scriptsdir=scriptsdir,
                    commands=cmd,
                    paramDict=ParamDict)
            result.script[sampleName] = scriptPath

        result.output = output

        # Auto-detect the fastq quality encoding when not configured.
        if self.init.qualitySystem == '':
            self.check_qs(sampleInfo)
            print "[INFO   ]  -- qualitySystem is %s (autocheck)--" % self.init.qualitySystem
        else:
            print "[INFO   ]  -- qualitySystem is %s --" % self.init.qualitySystem

        return result
Пример #16
0
    def run(self, impl, sampleInfo):
        '''
        Build per-sample 'init' scripts that upload fastq files to HDFS
        through GzUploader, driven by one shared data.list.

        A single data.list (written under gaeaScriptsDir after the loop)
        accumulates "<local fq>\\t<hdfs dir>\\t<renamed file>" lines for ALL
        samples; each sample gets its own shell script. Returns a bundle with
        .script[sample] -> script path and .output[sample][dataTag]
        ['fq1'/'fq2'] -> destination HDFS paths.
        '''
        mode = self.option.mode  # NOTE(review): unused in this method
        result = bundle(output=bundle(), script=bundle())

        # extend program path (required tools raise if missing; bgzip and
        # samtools are optional)
        self.init.multiUploader = self.expath('init.multiUploader')
        self.init.gzUploader = self.expath('init.gzUploader')
        self.init.check_log = self.expath('init.check_log')
        self.init.bgzip = self.expath('init.bgzip', False)
        self.init.samtools = self.expath('init.samtools', False)

        # Perl mapper: verify the fastq exists, then upload it via GzUploader.
        # NOTE(review): this mapper is written per sample but the generated
        # shell command below never references it -- confirm intent.
        mapper = []
        mapper.append("#!/usr/bin/perl -w")
        mapper.append("use strict;\n")
        mapper.append("while(<STDIN>)\n{")
        mapper.append("\tchomp;\n\tmy @tmp = split(/\\t/);")
        mapper.append("\tif(!-e $tmp[1])\n\t{")
        mapper.append("\t\tprint \"$tmp[1] don't exist.\\n\";")
        mapper.append("\t\texit 1;\n\t}")
        mapper.append(
            "\tsystem(\"%s jar %s GzUploader -i $tmp[1] -o $tmp[2] -n $tmp[3]\");\n}"
            % (self.hadoop.bin, self.init.gzUploader))

        # self.analysisList = self.analysisList[1:]

        output = bundle()
        DataParam = []  # shared across all samples (one global data.list)
        for sample_name in sampleInfo.keys():
            raw_data = os.path.join(self.option.dirHDFS, sample_name, 'fq')
            scriptsdir = impl.mkdir(self.gaeaScriptsDir, sample_name)
            hdfs_gz_tmp = os.path.join(self.option.dirHDFS, sample_name,
                                       'data', 'gz_tmp')
            sample = sampleInfo[sample_name]
            output[sample_name] = bundle()
            for dataTag in sample.keys():
                output[sample_name][dataTag] = bundle()
                # Destination name is "<sample>_<lane>_<original basename>".
                pathTup = impl.splitext(sample[dataTag]['fq1'])
                filename = '{}_{}_{}'.format(sample_name, dataTag, pathTup[0])
                DataParam.append({
                    "KEY": sample[dataTag]['fq1'],
                    "VALUE": raw_data,
                    "VALUE2": filename
                })
                output[sample_name][dataTag]['fq1'] = os.path.join(
                    raw_data, filename)

                # Paired-end data: schedule fq2 as well.
                if not self.init.isSE:
                    pathTup = impl.splitext(sample[dataTag]['fq2'])
                    filename = '{}_{}_{}'.format(sample_name, dataTag,
                                                 pathTup[0])
                    DataParam.append({
                        "KEY": sample[dataTag]['fq2'],
                        "VALUE": raw_data,
                        "VALUE2": filename
                    })
                    output[sample_name][dataTag]['fq2'] = os.path.join(
                        raw_data, filename)

            impl.write_file(fileName='upload_mapper.pl',
                            scriptsdir=scriptsdir,
                            commands=mapper)

            # NOTE(review): only PROGRAM and INPUT are used by the command
            # below; MAPPER/OUTPUT/HADOOPPARAM are dead entries here, and
            # PROGRAM is the streaming jar rather than GzUploader as in the
            # sibling variant -- looks inconsistent, confirm.
            hadoop_parameter = ' -D mapred.job.name="upload data" '
            if self.hadoop.get('queue'):
                hadoop_parameter += '-D mapreduce.job.queuename={} '.format(
                    self.hadoop.queue)
            hadoop_parameter += ' -D mapred.map.tasks=%d ' % len(DataParam)
            hadoop_parameter += ' -D mapred.reduce.tasks=0 '
            hadoop_parameter += ' -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat '
            ParamDict = {
                "PROGRAM":
                "%s jar %s" % (self.hadoop.bin, self.hadoop.streamingJar),
                "MAPPER":
                os.path.join(scriptsdir, 'upload_mapper.pl'),
                "INPUT":
                'file://' + os.path.join(self.gaeaScriptsDir, 'data.list'),
                "OUTPUT":
                hdfs_gz_tmp,
                "HADOOPPARAM":
                hadoop_parameter
            }

            # write script
            scriptPath = \
                impl.write_shell(
                    name='init',
                    scriptsdir=scriptsdir,
                    commands=['${PROGRAM} -i ${INPUT} -l'],
                    paramDict=ParamDict)

            result.script[sample_name] = scriptPath

        # The shared upload list is written once, after all samples are seen.
        impl.write_file(fileName='data.list',
                        scriptsdir=self.gaeaScriptsDir,
                        commands=["${KEY}\t${VALUE}\t${VALUE2}"],
                        JobParamList=DataParam)
        result.output = output

        # Auto-detect the fastq quality encoding when not configured.
        if self.init.qualitysystem == '':
            self.check_qs(sampleInfo)
            print "[INFO   ]  -- qualitysystem is %s (autocheck)--" % self.init.qualitysystem
        else:
            print "[INFO   ]  -- qualitysystem is %s --" % self.init.qualitysystem

        return result
Пример #17
0
    def run(self, impl, sampleInfo):
        mode = self.option.mode
        result = bundle(output=bundle(), script=bundle())

        # extend program path
        self.init.multiUploader = self.expath('init.multiUploader')
        self.init.gzUploader = self.expath('init.gzUploader')
        self.init.check_log = self.expath('init.check_log')
        self.init.bgzip = self.expath('init.bgzip', False)
        self.init.samtools = self.expath('init.samtools', False)

        mapper = []
        mapper.append("#!/usr/bin/perl -w")
        mapper.append("use strict;\n")
        mapper.append("while(<STDIN>)\n{")
        mapper.append("\tchomp;\n\tmy @tmp = split(/\\t/);")
        mapper.append("\tif(!-e $tmp[1])\n\t{")
        mapper.append("\t\tprint \"$tmp[1] don't exist.\\n\";")
        mapper.append("\t\texit 1;\n\t}")
        mapper.append(
            "\tsystem(\"%s jar %s GzUploader -i $tmp[1] -o $tmp[2] -n $tmp[3]\");\n}"
            % (self.hadoop.bin, self.init.gzUploader))

        # self.analysisList = self.analysisList[1:]

        output = bundle()
        for sample_name in sampleInfo.keys():
            raw_data = os.path.join(self.option.dirHDFS, sample_name, 'fq')
            scriptsdir = impl.mkdir(self.gaeaScriptsDir, sample_name)
            DataParam = []
            sample = sampleInfo[sample_name]
            output[sample_name] = bundle()
            # output[sample_name]['outdir'] = bundle()
            for dataTag in sample.keys():
                output[sample_name][dataTag] = bundle()
                pathTup = impl.splitext(sample[dataTag]['fq1'])
                filename = '{}_{}_{}'.format(sample_name, dataTag, pathTup[0])
                DataParam.append({
                    "KEY": sample[dataTag]['fq1'],
                    "VALUE": raw_data,
                    "VALUE2": filename
                })
                output[sample_name][dataTag]['fq1'] = os.path.join(
                    raw_data, filename)

                if not self.init.isSE:
                    pathTup = impl.splitext(sample[dataTag]['fq2'])
                    filename = '{}_{}_{}'.format(sample_name, dataTag,
                                                 pathTup[0])
                    DataParam.append({
                        "KEY": sample[dataTag]['fq2'],
                        "VALUE": raw_data,
                        "VALUE2": filename
                    })
                    output[sample_name][dataTag]['fq2'] = os.path.join(
                        raw_data, filename)
                    # output[sample_name]['outdir'] = raw_data

            impl.write_file(fileName='data.list',
                            scriptsdir=scriptsdir,
                            commands=["${KEY}\t${VALUE}\t${VALUE2}"],
                            JobParamList=DataParam)

            ParamDict = {
                "PROGRAM":
                "%s jar %s GzUploader" %
                (self.hadoop.bin, self.init.gzUploader),
                "INPUT":
                os.path.join(scriptsdir, 'data.list'),
            }

            # write script
            scriptPath = \
                impl.write_shell(
                    name='init',
                    scriptsdir=scriptsdir,
                    commands=['${PROGRAM} -i ${INPUT} -l'],
                    paramDict=ParamDict)

            result.script[sample_name] = scriptPath
        result.output = output

        if self.init.qualitysystem == '':
            self.check_qs(sampleInfo)
            print "[INFO   ]  -- qualitysystem is %s (autocheck)--" % self.init.qualitysystem
        else:
            print "[INFO   ]  -- qualitysystem is %s --" % self.init.qualitysystem

        return result
Пример #18
0
def parse_sample(sampleList):
    '''
    Parse a sample-list file into a nested bundle.

    Each non-empty line is whitespace-separated: "<sampleName> <fq1> [fq2]".
    The read-group fields are derived from the fq1 file name, which must
    follow the "date_md_flowcell_laneID_lib" convention, e.g.
    100920_I126_FC801V9ABXX_L6_HUMlatXAOIDCBAPEI-8_2.fq

    Returns: bundle mapping sampleName -> dataN -> {'fq1', 'fq2'?, 'adp1',
    'adp2'?, 'rg', 'libname'}. Exits with status 3 when fq1 is missing on
    disk; logs a warning (SE data) when no mate file is found.
    '''
    total_number = 0
    with open(sampleList, 'r') as f:
        sampleInfo = bundle()
        for line in f:
            total_number += 1
            line = line.strip()
            field = line.split()
            sampleName = field[0]
            fq1 = field[1]
            fq2 = ''
            if len(field) >= 3:
                fq2 = field[2]
            # BUGFIX: the existence check was inverted -- it aborted when
            # fq1 DID exist. Abort only when the fastq is missing.
            if not os.path.exists(fq1):
                logger.error("%s under %s don't exists." % (fq1, sampleName))
                exit(3)
            fq_dir = os.path.dirname(fq1)
            fq_name = os.path.basename(fq1)

            # Derive read-group fields from the fq1 name:
            # date_md_flowcell_laneID_lib
            # 100920_I126_FC801V9ABXX_L6_HUMlatXAOIDCBAPEI-8_2.fq
            tmp = fq_name.split("_")
            rg_ID = tmp[4] + "_" + tmp[2] + "-" + tmp[3]
            rg_PU = tmp[0] + "_" + tmp[1] + "_" + tmp[2] + "_" + tmp[
                3] + "_" + tmp[4]
            rg_LB = tmp[4]
            rg = "@RG\\tID:%s\\tPL:illumina\\tPU:%s\\tLB:%s\\tSM:%s\\tCN:BGI" % (
                rg_ID, rg_PU, rg_LB, sampleName)
            fq_lib_name = rg_ID

            # Lanes of the same sample get consecutive dataN tags.
            if sampleName not in sampleInfo:
                sampleInfo[sampleName] = bundle()
                sample_lane_counter = 0
            else:
                sample_lane_counter = len(sampleInfo[sampleName])

            dataTag = 'data' + str(sample_lane_counter)
            if dataTag not in sampleInfo[sampleName]:
                sampleInfo[sampleName][dataTag] = bundle()

            # find adp1 (adapter list next to the fastq, 'null' when absent)
            sampleInfo[sampleName][dataTag]['fq1'] = fq1
            adp1 = glob.glob("%s/*1.adapter.list*" % fq_dir)
            if adp1:
                adp1_file = adp1[0].strip()
                sampleInfo[sampleName][dataTag]['adp1'] = adp1_file
            else:
                sampleInfo[sampleName][dataTag]['adp1'] = 'null'

            # find fq2 and adp2.
            # NOTE(review): an explicitly given 3rd column fq2 is discarded
            # here -- the mate is always derived from fq1's name instead;
            # preserved as-is, confirm this is intended.
            fq2 = fq1
            fq2 = fq2.replace("1.fq.gz", "2.fq.gz")
            if os.path.exists(fq2):
                sampleInfo[sampleName][dataTag]['fq2'] = fq2
                adp2 = glob.glob("%s/*2.adapter.list*" % fq_dir)
                if adp2:
                    adp2_file = adp2[0].strip()
                    sampleInfo[sampleName][dataTag]['adp2'] = adp2_file
                else:
                    sampleInfo[sampleName][dataTag]['adp2'] = 'null'
            else:
                logger.warning("%s of line: %d is SE data!" %
                               (sampleName, total_number))

            sampleInfo[sampleName][dataTag]['rg'] = rg
            sampleInfo[sampleName][dataTag]['libname'] = fq_lib_name

    return sampleInfo
Пример #19
0
    def run(self, impl, sampleInfo):
        mode = self.option.mode
        result = bundle(output=bundle())

        # extend program path
        self.init.multiUploader = self.expath('init.multiUploader')
        self.init.gzUploader = self.expath('init.gzUploader')
        self.init.check_log = self.expath('init.check_log')
        self.init.bgzip = self.expath('init.bgzip', False)
        self.init.samtools = self.expath('init.samtools', False)

        if self.hadoop.input_format == 'hdfs':
            if mode != 3 and mode != 4:
                if self.option.multiSample:
                    sampleName = self.option.multiSampleName
                    scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
                    self.analysisList = self.analysisList[1:]
                    output = bundle()

                    line = ["${ID}\t${RG}\t${FQ1}\t${FQ2}\t${ADP1}\t${ADP2}"]
                    if self.ref.gender_mode == 'both' and mode != 5:
                        output.female = os.path.join(scriptsdir, "femalesampleinfo.list")
                        output.male = os.path.join(scriptsdir, "malesampleinfo.list")
                        MSLF = open(output.female, 'w')
                        MSLM = open(output.male, 'w')
                        for sample_name in sampleInfo.keys():
                            sample = sampleInfo[sample_name]
                            LineParam = []
                            for dataTag in sampleInfo[sample_name].keys():
                                LineParam.append({
                                    "ID": sample[dataTag]['id'],
                                    "RG": sample[dataTag]['rg'],
                                    "FQ1": 'file://' + sample[dataTag]['fq1'],
                                    "FQ2": sample[dataTag].has_key('fq2') and 'file://' + sample[dataTag][
                                        'fq2'] or 'null',
                                    "ADP1": sample[dataTag].has_key('adp1') and 'file://' + sample[dataTag][
                                        'adp1'] or 'null',
                                    "ADP2": sample[dataTag].has_key('adp2') and 'file://' + sample[dataTag][
                                        'adp2'] or 'null'
                                })

                            gender = self.sample[sample_name]["gender"]
                            impl.fileAppend(
                                fh=gender == 'female' and MSLF or MSLM,
                                commands=line,
                                JobParamList=LineParam)
                    else:
                        output.normal = os.path.join(scriptsdir, "sampleinfo.list")
                        MSL = open(output.normal, 'w')
                        for sample_name in sampleInfo.keys():
                            sample = sampleInfo[sample_name]
                            LineParam = []
                            for dataTag in sample.keys():
                                LineParam.append({
                                    "ID": sample[dataTag]['id'],
                                    "RG": sample[dataTag]['rg'],
                                    "FQ1": 'file://' + sample[dataTag]['fq1'],
                                    "FQ2": sample[dataTag].has_key('fq2') and 'file://' + sample[dataTag][
                                        'fq2'] or 'null',
                                    "ADP1": sample[dataTag].has_key('adp1') and 'file://' + sample[dataTag][
                                        'adp1'] or 'null',
                                    "ADP2": sample[dataTag].has_key('adp2') and 'file://' + sample[dataTag][
                                        'adp2'] or 'null'
                                })

                            impl.fileAppend(
                                fh=MSL,
                                commands=line,
                                JobParamList=LineParam)

                    result.output[sampleName] = output
                else:
                    result.script = bundle()
                    for sampleName in sampleInfo.keys():
                        scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
                        sample = sampleInfo[sampleName]
                        output = bundle()
                        DataParam = []
                        cmd = []
                        cmd.append("source %s/bin/activate" % self.GAEA_HOME)
                        cmd.append("check.py -s %s/state.json -n %s %s" % (
                        self.stateDir, sampleName, self.init.check_state_param))
                        for dataTag in sample.keys():
                            laneData = os.path.join(self.option.dirHDFS, sampleName, 'fq', dataTag)
                            cmd.append('{} -p {}'.format(self.fs_cmd.mkdir, laneData))
                            output[dataTag] = bundle()
                            pathTup = impl.splitext(sample[dataTag]['fq1'])
                            if pathTup and pathTup[1] == '.gz':
                                DataParam.append({
                                    "KEY": sample[dataTag]['fq1'],
                                    "VALUE": laneData
                                })
                                output[dataTag]['fq1'] = os.path.join(laneData, pathTup[0])
                            else:
                                output[dataTag]['fq1'] = sample[dataTag]['fq1']

                            if self.init.isSE == False:
                                pathTup = impl.splitext(sample[dataTag]['fq2'])
                                if pathTup and pathTup[1] == '.gz':
                                    DataParam.append({
                                        "KEY": sample[dataTag]['fq2'],
                                        "VALUE": laneData
                                    })
                                    output[dataTag]['fq2'] = os.path.join(laneData, pathTup[0])
                                else:
                                    output[dataTag]['fq2'] = sample[dataTag]['fq2']

                            if sample[dataTag].has_key('adp1'):
                                pathTup = impl.splitext(sample[dataTag]['adp1'])
                                if pathTup and pathTup[1] == '.gz':
                                    DataParam.append({
                                        "KEY": sample[dataTag]['adp1'],
                                        "VALUE": laneData
                                    })
                                    output[dataTag]['adp1'] = os.path.join(laneData, pathTup[0])
                                else:
                                    output[dataTag]['adp1'] = sample[dataTag]['adp1']

                            if sample[dataTag].has_key('adp2'):
                                pathTup = impl.splitext(sample[dataTag]['adp2'])
                                if pathTup and pathTup[1] == '.gz':
                                    DataParam.append({
                                        "KEY": sample[dataTag]['adp2'],
                                        "VALUE": laneData
                                    })
                                    output[dataTag]['adp2'] = os.path.join(laneData, pathTup[0])
                                else:
                                    output[dataTag]['adp2'] = sample[dataTag]['adp2']
                                    #                 print DataParam
                        if DataParam:
                            impl.write_file(
                                fileName='data.list',
                                scriptsdir=scriptsdir,
                                commands=["${KEY}\t${VALUE}"],
                                JobParamList=DataParam)

                            ParamDict = {
                                "PROGRAM": self.init.multiUploader,
                                "HADOOP": self.hadoop.bin,
                                "UPLOAD": self.init.gzUploader,
                                "INPUT": os.path.join(scriptsdir, 'data.list')
                            }

                            cmd.append(
                                '%s ${PROGRAM} -b ${HADOOP} -d ${INPUT} -u ${UPLOAD}' % self.init.perl)

                            # write script
                            scriptPath = \
                                impl.write_shell(
                                    name='init',
                                    scriptsdir=scriptsdir,
                                    commands=cmd,
                                    paramDict=ParamDict)
                            result.script[sampleName] = scriptPath

                        result.output[sampleName] = output

                if self.init.qualitySystem == '':
                    self.check_qs(sampleInfo)
                    print "[INFO   ]  -- qualitySystem is %s (autocheck)--" % self.init.qualitySystem
                else:
                    print "[INFO   ]  -- qualitySystem is %s --" % self.init.qualitySystem

                    # self.init.qualitySystem = 0

            else:
                sampleName = self.option.multiSampleName

                startStep = self.analysisList[0]
                fs_type = ''
                if self.analysisDict[startStep].platform == 'H':
                    fs_type = 'file://'

                if self.option.multiSample:
                    n = 0
                    index = 0

                    inputDir = os.path.join(self.option.workdir, 'raw_data', 'bams_' + str(index))
                    result.output[sampleName + '_' + str(index)] = fs_type + inputDir
                    if os.path.exists(inputDir):
                        shutil.rmtree(inputDir)
                    impl.mkdir(inputDir)
                    print inputDir

                    for sample_name in sampleInfo:
                        if n == int(self.init.multisample_num):
                            n = 0
                            index += 1
                            inputDir = os.path.join(self.option.workdir, 'raw_data', 'bams_' + str(index))
                            result.output[sampleName + '_' + str(index)] = fs_type + inputDir
                            if os.path.exists(inputDir):
                                shutil.rmtree(inputDir)
                            impl.mkdir(inputDir)
                            print inputDir
                        bam = os.path.basename(sampleInfo[sample_name])
                        ln_bam = os.path.join(inputDir, bam)
                        os.symlink(sampleInfo[sample_name], ln_bam)
                        n += 1
                else:
                    for sample_name in sampleInfo:
                        result.output[sample_name] = fs_type + sampleInfo[sample_name]
        else:
            if mode != 3 and mode != 4:
                if self.option.multiSample:
                    sampleName = self.option.multiSampleName
                    scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
                    self.analysisList = self.analysisList[1:]
                    output = bundle()

                    line = ["${ID}\t${RG}\t${FQ1}\t${FQ2}\t${ADP1}\t${ADP2}"]
                    if self.ref.gender_mode == 'both' and mode != 5:
                        output.female = os.path.join(scriptsdir, "femalesampleinfo.list")
                        output.male = os.path.join(scriptsdir, "malesampleinfo.list")
                        MSLF = open(output.female, 'w')
                        MSLM = open(output.male, 'w')
                        for sample_name in sampleInfo.keys():
                            sample = sampleInfo[sample_name]
                            LineParam = []
                            for dataTag in sampleInfo[sample_name].keys():
                                LineParam.append({
                                    "ID": sample[dataTag]['id'],
                                    "RG": sample[dataTag]['rg'],
                                    "FQ1": 'file://' + sample[dataTag]['fq1'],
                                    "FQ2": sample[dataTag].has_key('fq2') and 'file://' + sample[dataTag][
                                        'fq2'] or 'null',
                                    "ADP1": sample[dataTag].has_key('adp1') and 'file://' + sample[dataTag][
                                        'adp1'] or 'null',
                                    "ADP2": sample[dataTag].has_key('adp2') and 'file://' + sample[dataTag][
                                        'adp2'] or 'null'
                                })

                            gender = self.sample[sample_name]["gender"]
                            impl.fileAppend(
                                fh=gender == 'female' and MSLF or MSLM,
                                commands=line,
                                JobParamList=LineParam)
                    else:
                        output.normal = os.path.join(scriptsdir, "sampleinfo.list")
                        MSL = open(output.normal, 'w')
                        for sample_name in sampleInfo.keys():
                            sample = sampleInfo[sample_name]
                            LineParam = []
                            for dataTag in sample.keys():
                                LineParam.append({
                                    "ID": sample[dataTag]['id'],
                                    "RG": sample[dataTag]['rg'],
                                    "FQ1": 'file://' + sample[dataTag]['fq1'],
                                    "FQ2": sample[dataTag].has_key('fq2') and 'file://' + sample[dataTag][
                                        'fq2'] or 'null',
                                    "ADP1": sample[dataTag].has_key('adp1') and 'file://' + sample[dataTag][
                                        'adp1'] or 'null',
                                    "ADP2": sample[dataTag].has_key('adp2') and 'file://' + sample[dataTag][
                                        'adp2'] or 'null'
                                })

                            impl.fileAppend(
                                fh=MSL,
                                commands=line,
                                JobParamList=LineParam)

                    result.output[sampleName] = output
                else:
                    result.script = bundle()
                    for sampleName in sampleInfo.keys():
                        scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
                        hdfs_gz_tmp = os.path.join(self.option.dirHDFS, sampleName, 'data', 'gz_tmp')
                        sample = sampleInfo[sampleName]
                        output = bundle()
                        DataParam = []
                        cmd = []
                        for dataTag in sample.keys():
                            rawData = impl.mkdir(self.option.workdir, 'fq', 'raw_data', sampleName)
                            laneData = os.path.join(rawData, dataTag)
                            cmd.append("mkdir -p -m 777 %s" % laneData)
                            output[dataTag] = bundle()
                            pathTup = impl.splitext(sample[dataTag]['fq1'])
                            if pathTup and pathTup[1] == '.gz':
                                DataParam.append({
                                    "KEY": sample[dataTag]['fq1'],
                                    "VALUE": os.path.join(laneData, pathTup[0])
                                })
                                output[dataTag]['fq1'] = os.path.join(laneData, pathTup[0])
                            else:
                                output[dataTag]['fq1'] = sample[dataTag]['fq1']

                            if self.init.isSE == False:
                                pathTup = impl.splitext(sample[dataTag]['fq2'])
                                if pathTup and pathTup[1] == '.gz':
                                    DataParam.append({
                                        "KEY": sample[dataTag]['fq2'],
                                        "VALUE": os.path.join(laneData, pathTup[0])
                                    })
                                    output[dataTag]['fq2'] = os.path.join(laneData, pathTup[0])
                                else:
                                    output[dataTag]['fq2'] = sample[dataTag]['fq2']

                            if sample[dataTag].has_key('adp1'):
                                pathTup = impl.splitext(sample[dataTag]['adp1'])
                                if pathTup and pathTup[1] == '.gz':
                                    DataParam.append({
                                        "KEY": sample[dataTag]['adp1'],
                                        "VALUE": os.path.join(laneData, pathTup[0])
                                    })
                                    output[dataTag]['adp1'] = os.path.join(laneData, pathTup[0])
                                else:
                                    output[dataTag]['adp1'] = sample[dataTag]['adp1']

                            if sample[dataTag].has_key('adp2'):
                                pathTup = impl.splitext(sample[dataTag]['adp2'])
                                if pathTup and pathTup[1] == '.gz':
                                    DataParam.append({
                                        "KEY": sample[dataTag]['adp2'],
                                        "VALUE": os.path.join(laneData, pathTup[0])
                                    })
                                    output[dataTag]['adp2'] = os.path.join(laneData, pathTup[0])
                                else:
                                    output[dataTag]['adp2'] = sample[dataTag]['adp2']
                                    #                 print DataParam
                        if DataParam:
                            impl.write_file(
                                fileName='data.list',
                                scriptsdir=scriptsdir,
                                commands=["${KEY}\t${VALUE}"],
                                JobParamList=DataParam)

                            mapper = []
                            mapper.append("#!/usr/bin/perl -w")
                            mapper.append("use strict;\n")
                            mapper.append("while(<STDIN>)\n{")
                            mapper.append("\tchomp;\n\tmy @tmp = split(/\\t/);")
                            mapper.append("\tif(!-e $tmp[1])\n\t{")
                            mapper.append("\t\tprint \"$tmp[1] don't exist.\\n\";")
                            mapper.append("\t\texit 1;\n\t}")
                            mapper.append("\tsystem(\"gzip -cd $tmp[1] >$tmp[2]\");\n}")
                            impl.write_file(
                                fileName='upload_mapper.pl',
                                scriptsdir=scriptsdir,
                                commands=mapper)

                            hadoop_parameter = ' -D mapred.job.name="gzip input data" '
                            hadoop_parameter += ' -D mapred.map.tasks=%d ' % len(DataParam)
                            hadoop_parameter += ' -D mapred.reduce.tasks=0 '
                            hadoop_parameter += ' -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat '
                            ParamDict = {
                                "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.hadoop.streamingJar),
                                "MAPPER": os.path.join(scriptsdir, 'upload_mapper.pl'),
                                "INPUT": 'file://' + os.path.join(scriptsdir, 'data.list'),
                                "OUTPUT": hdfs_gz_tmp,
                                "HADOOPPARAM": hadoop_parameter
                            }

                            cmd.append('%s ${OUTPUT}' % self.fs_cmd.delete)
                            cmd.append(
                                '${PROGRAM} ${HADOOPPARAM} -input ${INPUT} -output ${OUTPUT} -mapper "perl ${MAPPER}"')

                            # write script
                            scriptPath = \
                                impl.write_shell(
                                    name='init',
                                    scriptsdir=scriptsdir,
                                    commands=cmd,
                                    paramDict=ParamDict)
                            result.script[sampleName] = scriptPath

                        result.output[sampleName] = output

                if self.init.qualitySystem == '':
                    self.check_qs(sampleInfo)
                    print "[INFO   ]  -- qualitySystem is %s (autocheck)--" % self.init.qualitySystem
                else:
                    print "[INFO   ]  -- qualitySystem is %s --" % self.init.qualitySystem

                    # self.init.qualitySystem = 0

            else:
                sampleName = self.option.multiSampleName

                startStep = self.analysisList[0]
                fs_type = ''
                if self.analysisDict[startStep].platform == 'H':
                    fs_type = 'file://'

                if self.option.multiSample:
                    inputDir = os.path.join(self.option.workdir, 'raw_data', 'bams')
                    result.output[sampleName] = fs_type + inputDir

                    if os.path.exists(inputDir):
                        shutil.rmtree(inputDir)
                    impl.mkdir(inputDir)

                    for sample_name in sampleInfo.keys():
                        bam = os.path.basename(sampleInfo[sample_name])
                        ln_bam = os.path.join(inputDir, sample_name + "_" + bam)
                        os.symlink(sampleInfo[sample_name], ln_bam)
                else:
                    for sample_name in sampleInfo.keys():
                        result.output[sample_name] = fs_type + sampleInfo[sample_name]

        # return
        return result
Пример #20
0
def parse_sample(sampleList):
    """Parse a whitespace-delimited sample list into a nested bundle.

    Each line is expected as: <sampleName> <fq1> [...], where fq1 is a
    gzipped read-1 fastq whose basename follows the convention
    date_machineID_flowcell_laneID_lib (see example below), from which the
    @RG read-group fields are derived.

    Returns:
        bundle: sampleInfo[sampleName][dataTag] with keys 'fq1', optional
        'fq2', 'adp1'/'adp2' (path or 'null'), 'rg' and 'libname'.

    Exits the process with status 3 when fq1 does not exist on disk.
    """
    total_number = 0
    with open(sampleList, 'r') as f:
        sampleInfo = bundle()
        for line in f:
            total_number += 1
            field = line.strip().split()
            sampleName = field[0]
            fq1 = field[1]
            # BUG FIX: original tested `if os.path.exists(fq1)` and aborted,
            # i.e. it exited exactly when the input DID exist. Negate it.
            # (The dead `fq2 = field[2]` read was dropped: fq2 is derived
            # from fq1 unconditionally further down.)
            if not os.path.exists(fq1):
                logger.error("%s under %s don't exists." % (fq1, sampleName))
                exit(3)
            fq_dir = os.path.dirname(fq1)
            fq_name = os.path.basename(fq1)

            # date_md_flowcell_laneID_lib
            # 100920_I126_FC801V9ABXX_L6_HUMlatXAOIDCBAPEI-8_2.fq
            tmp = fq_name.split("_")
            rg_ID = tmp[4] + "_" + tmp[2] + "-" + tmp[3]
            rg_PU = tmp[0] + "_" + tmp[1] + "_" + tmp[2] + "_" + tmp[3] + "_" + tmp[4]
            rg_LB = tmp[4]
            rg = "@RG\\tID:%s\\tPL:illumina\\tPU:%s\\tLB:%s\\tSM:%s\\tCN:BGI" % (rg_ID, rg_PU, rg_LB, sampleName)
            fq_lib_name = rg_ID

            # One dataTag ('data0', 'data1', ...) per lane of the sample.
            if sampleName not in sampleInfo:
                sampleInfo[sampleName] = bundle()
                sample_lane_counter = 0
            else:
                sample_lane_counter = len(sampleInfo[sampleName])

            dataTag = 'data' + str(sample_lane_counter)
            if dataTag not in sampleInfo[sampleName]:
                sampleInfo[sampleName][dataTag] = bundle()

            # find adp1 (adapter list produced alongside read 1, if any)
            sampleInfo[sampleName][dataTag]['fq1'] = fq1
            adp1 = glob.glob("%s/*1.adapter.list*" % fq_dir)
            if adp1:
                sampleInfo[sampleName][dataTag]['adp1'] = adp1[0].strip()
            else:
                sampleInfo[sampleName][dataTag]['adp1'] = 'null'

            # find fq2 and adp2: read 2 path is read 1 with trailing 1 -> 2
            fq2 = fq1.replace("1.fq.gz", "2.fq.gz")
            if os.path.exists(fq2):
                sampleInfo[sampleName][dataTag]['fq2'] = fq2
                adp2 = glob.glob("%s/*2.adapter.list*" % fq_dir)
                if adp2:
                    sampleInfo[sampleName][dataTag]['adp2'] = adp2[0].strip()
                else:
                    sampleInfo[sampleName][dataTag]['adp2'] = 'null'
            else:
                logger.warning("%s of line: %d is SE data!" % (sampleName, total_number))

            sampleInfo[sampleName][dataTag]['rg'] = rg
            sampleInfo[sampleName][dataTag]['libname'] = fq_lib_name

    return sampleInfo
Пример #21
0
    def run(self, impl, sampleInfo):
        mode = self.option.mode
        result = bundle(output=bundle(), script=bundle())

        # extend program path
        self.init.multiUploader = self.expath('init.multiUploader')
        self.init.gzUploader = self.expath('init.gzUploader')
        self.init.check_log = self.expath('init.check_log')
        self.init.bgzip = self.expath('init.bgzip', False)
        self.init.samtools = self.expath('init.samtools', False)

        sampleName = self.option.multiSampleName
        scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
        self.analysisList = self.analysisList[1:]
        hdfs_gz_tmp = os.path.join(self.option.dirHDFS, sampleName, 'data',
                                   'gz_tmp')

        # result.output[sampleName] = output

        DataParam = []
        output = bundle()
        cmd = []
        for sample_name in sampleInfo.keys():
            sample = sampleInfo[sample_name]
            output[sample_name] = bundle()
            for dataTag in sample.keys():
                rawData = impl.mkdir(self.option.workdir, 'fq', 'raw_data',
                                     sample_name)
                laneData = os.path.join(rawData, dataTag)
                cmd.append("mkdir -p -m 777 %s" % laneData)
                output[sample_name][dataTag] = bundle()
                pathTup = impl.splitext(sample[dataTag]['fq1'])
                if pathTup and pathTup[1] == '.gz':
                    DataParam.append({
                        "KEY":
                        sample[dataTag]['fq1'],
                        "VALUE":
                        os.path.join(laneData, pathTup[0])
                    })
                    output[sample_name][dataTag]['fq1'] = os.path.join(
                        laneData, pathTup[0])
                else:
                    output[sample_name][dataTag]['fq1'] = sample[dataTag][
                        'fq1']

                if self.init.isSE == False:
                    pathTup = impl.splitext(sample[dataTag]['fq2'])
                    if pathTup and pathTup[1] == '.gz':
                        DataParam.append({
                            "KEY":
                            sample[dataTag]['fq2'],
                            "VALUE":
                            os.path.join(laneData, pathTup[0])
                        })
                        output[sample_name][dataTag]['fq2'] = os.path.join(
                            laneData, pathTup[0])
                    else:
                        output[sample_name][dataTag]['fq2'] = sample[dataTag][
                            'fq2']

                if sample[dataTag].has_key('adp1'):
                    pathTup = impl.splitext(sample[dataTag]['adp1'])
                    if pathTup and pathTup[1] == '.gz':
                        DataParam.append({
                            "KEY":
                            sample[dataTag]['adp1'],
                            "VALUE":
                            os.path.join(laneData, pathTup[0])
                        })
                        output[sample_name][dataTag]['adp1'] = os.path.join(
                            laneData, pathTup[0])
                    else:
                        output[sample_name][dataTag]['adp1'] = sample[dataTag][
                            'adp1']

                if sample[dataTag].has_key('adp2'):
                    pathTup = impl.splitext(sample[dataTag]['adp2'])
                    if pathTup and pathTup[1] == '.gz':
                        DataParam.append({
                            "KEY":
                            sample[dataTag]['adp2'],
                            "VALUE":
                            os.path.join(laneData, pathTup[0])
                        })
                        output[sample_name][dataTag]['adp2'] = os.path.join(
                            laneData, pathTup[0])
                    else:
                        output[sample_name][dataTag]['adp2'] = sample[dataTag][
                            'adp2']
                        #                 print DataParam
        if DataParam:
            impl.write_file(fileName='data.list',
                            scriptsdir=scriptsdir,
                            commands=["${KEY}\t${VALUE}"],
                            JobParamList=DataParam)

            mapper = []
            mapper.append("#!/usr/bin/perl -w")
            mapper.append("use strict;\n")
            mapper.append("while(<STDIN>)\n{")
            mapper.append("\tchomp;\n\tmy @tmp = split(/\\t/);")
            mapper.append("\tif(!-e $tmp[1])\n\t{")
            mapper.append("\t\tprint \"$tmp[1] don't exist.\\n\";")
            mapper.append("\t\texit 1;\n\t}")
            mapper.append("\tsystem(\"gzip -cd $tmp[1] >$tmp[2]\");\n}")
            impl.write_file(fileName='upload_mapper.pl',
                            scriptsdir=scriptsdir,
                            commands=mapper)

            hadoop_parameter = ' -D mapred.job.name="gzip input data" '
            if self.hadoop.get('queue'):
                hadoop_parameter += '-D mapreduce.job.queuename={} '.format(
                    self.hadoop.queue)
            hadoop_parameter += ' -D mapred.map.tasks=%d ' % len(DataParam)
            hadoop_parameter += ' -D mapred.reduce.tasks=0 '
            hadoop_parameter += ' -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat '
            ParamDict = {
                "PROGRAM":
                "%s jar %s" % (self.hadoop.bin, self.hadoop.streamingJar),
                "MAPPER":
                os.path.join(scriptsdir, 'upload_mapper.pl'),
                "INPUT":
                'file://' + os.path.join(scriptsdir, 'data.list'),
                "OUTPUT":
                hdfs_gz_tmp,
                "HADOOPPARAM":
                hadoop_parameter
            }

            cmd.append('%s ${OUTPUT}' % self.fs_cmd.delete)
            cmd.append(
                '${PROGRAM} ${HADOOPPARAM} -input ${INPUT} -output ${OUTPUT} -mapper "perl ${MAPPER}"'
            )

            # write script
            scriptPath = \
                impl.write_shell(
                    name='init',
                    scriptsdir=scriptsdir,
                    commands=cmd,
                    paramDict=ParamDict)
            result.script[sampleName] = scriptPath

        result.output = output

        if self.init.qualitysystem == '':
            self.check_qs(sampleInfo)
            print "[INFO   ]  -- qualitysystem is %s (autocheck)--" % self.init.qualitysystem
        else:
            print "[INFO   ]  -- qualitysystem is %s --" % self.init.qualitysystem

        return result
Пример #22
0
def parse_sample(sampleList):
    """Parse a sample list whose last column is a ';'-separated fastq set.

    Each line: <sampleName> <rg_LB> <rg_PU> ... <fq1;fq2;...>. Every path
    ending in '1.fq.gz' starts a new lane ('data0', 'data1', ...); the mate
    path is derived by replacing the trailing '1.fq.gz' with '2.fq.gz'.

    Returns:
        bundle: sampleInfo[sampleName][dataTag] with 'fq1', optional 'fq2',
        'rg', 'libname' and 'gender' (hard-coded 'male').

    Exits the process with status 3 when no read-1 fastq is listed for a
    line or a listed one is missing on disk.
    """
    total_number = 0
    sampleInfo = bundle()
    with open(sampleList, 'r') as f:
        for line in f:
            field = line.strip().split()
            if not field:
                # tolerate blank / trailing lines instead of IndexError
                continue
            sampleName = field[0]
            rg_LB = field[1]
            rg_PU = field[2]
            fq_paths = field[-1].strip()

            fq1s = [fq for fq in fq_paths.split(";") if fq.endswith("1.fq.gz")]

            if not fq1s:
                logger.error("fq1 under %s don't exists." % sampleName)
                exit(3)

            for fq1 in fq1s:
                total_number += 1
                # FIX: original only verified fq1s[0]; a missing path later in
                # the ';' list slipped through silently. Validate every fq1.
                if not os.path.exists(fq1):
                    logger.error("fq1 under %s don't exists." % sampleName)
                    exit(3)

                if sampleName not in sampleInfo:
                    sampleInfo[sampleName] = bundle()
                    sample_lane_counter = 0
                else:
                    sample_lane_counter = len(sampleInfo[sampleName])

                # Read-group ID is unique per sample+lane index.
                rg_ID = "{}_{}".format(sampleName, sample_lane_counter)
                rg = "@RG\\tID:%s\\tPL:COMPLETE\\tPU:%s\\tLB:%s\\tSM:%s\\tCN:BGI" % (rg_ID, rg_PU, rg_LB, sampleName)
                fq_lib_name = rg_ID

                dataTag = 'data' + str(sample_lane_counter)
                if dataTag not in sampleInfo[sampleName]:
                    sampleInfo[sampleName][dataTag] = bundle()

                sampleInfo[sampleName][dataTag]['fq1'] = fq1

                # find fq2 (the mate); an absent mate means SE data
                fq2 = fq1.replace("1.fq.gz", "2.fq.gz")
                if os.path.exists(fq2):
                    sampleInfo[sampleName][dataTag]['fq2'] = fq2
                else:
                    logger.warning("%s of line: %d is SE data!" % (sampleName, total_number))

                sampleInfo[sampleName][dataTag]['rg'] = rg
                sampleInfo[sampleName][dataTag]['libname'] = fq_lib_name
                sampleInfo[sampleName][dataTag]['gender'] = 'male'

    return sampleInfo
Пример #23
0
class init(Workflow):
    """Workflow step that stages raw FASTQ files into HDFS.

    Builds a per-file upload list (local path, HDFS target dir, target
    name), writes a Perl streaming-mapper helper and an 'init' shell
    script via ``impl``, and returns a bundle describing the resulting
    HDFS fq paths per sample/lane.  When no quality system is configured
    it is auto-detected from the first readable fq1 (see check_qs).
    Python 2 code (print statements).
    """

    # Default configuration bundle.  Bare tool names here are resolved to
    # absolute paths in run() via self.expath().
    INIT = bundle(hadoop=bundle(), init=bundle())
    INIT.init.multiUploader = 'multi_uploader.pl'
    INIT.init.gzUploader = "GzUpload.jar"
    INIT.init.bgzip = 'bgzip'
    INIT.init.perl = 'perl'
    INIT.init.samtools = 'samtools'
    # Empty string means "auto-detect" — run() calls check_qs() in that case.
    INIT.init.qualitysystem = ''
    # Requires the GAEA_HOME environment variable at class-definition time.
    INIT.init.check_log = '%s' % os.path.join(os.environ['GAEA_HOME'], 'bin', 'check_log.pl')
    INIT.init.check_state_param = ''
    INIT.hadoop.ishadoop2 = False
    INIT.hadoop.is_at_TH = False
    INIT.hadoop.fs_mode = 'hdfs'
    INIT.hadoop.input_format = 'file'
    INIT.hadoop.mapper_num = '112'
    INIT.hadoop.reducer_num = '112'

    def check_qs(self, sampleInfo):
        """Auto-detect the FASTQ quality system from the samples' fq1 files.

        Probes each sample/lane fq1 in turn with
        ``qualitysystem.getqualitysystem`` until one returns a value other
        than '-1'; that value is stored on ``self.init.qualitysystem`` and
        returned.  Raises RuntimeError when every probed file yields '-1'.

        NOTE(review): if ``sampleInfo`` is empty, no probe runs and no
        exception is raised — the method returns None with
        ``self.init.qualitysystem`` left at its previous value.
        """
        for sample_name in sampleInfo:
            for dataTag in sampleInfo[sample_name]:
                fq = sampleInfo[sample_name][dataTag]['fq1']
                self.init.qualitysystem = qualitysystem.getqualitysystem(fq)
                if self.init.qualitysystem != '-1':
                    return self.init.qualitysystem

        if self.init.qualitysystem == '-1':
            raise RuntimeError('qualitysystem is wrong, the value is -1')

    def run(self, impl, sampleInfo):
        """Prepare the HDFS upload for every sample's FASTQ files.

        Writes 'data.list' (one line per fq: local path, HDFS dir, target
        name), an 'upload_mapper.pl' helper, and an 'init' shell script
        under the multi-sample scripts dir.  Returns a bundle with
        ``.script[sampleName]`` (path of the generated shell script, only
        when there is data to upload) and ``.output`` mapping
        sample -> dataTag -> {'fq1', 'fq2'} HDFS paths.
        """
        # NOTE(review): 'mode' is assigned but never used below.
        mode = self.option.mode
        result = bundle(output=bundle(), script=bundle())

        # extend program path
        self.init.multiUploader = self.expath('init.multiUploader')
        self.init.gzUploader = self.expath('init.gzUploader')
        self.init.check_log = self.expath('init.check_log')
        self.init.bgzip = self.expath('init.bgzip', False)
        self.init.samtools = self.expath('init.samtools', False)
        # Debug print of the resolved GzUploader jar path.
        print self.init.gzUploader

        sampleName = self.option.multiSampleName
        scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
        # Drops the first analysis step — presumably this 'init' step
        # itself, so downstream steps see the remaining list.  TODO confirm.
        self.analysisList = self.analysisList[1:]
        hdfs_gz_tmp = os.path.join(self.option.dirHDFS, sampleName, 'data', 'gz_tmp')
        rawData = os.path.join(self.option.dirHDFS, sampleName, 'fq')

        # One entry per fq file to upload; rendered into data.list below.
        DataParam = []
        output = bundle()
        cmd = []
        for sample_name in sampleInfo.keys():
            sample = sampleInfo[sample_name]
            output[sample_name] = bundle()
            for dataTag in sample.keys():
                output[sample_name][dataTag] = bundle()
                # Target filename encodes sample, lane tag and original basename
                # (impl.splitext presumably returns (basename, ext) — confirm).
                pathTup = impl.splitext(sample[dataTag]['fq1'])
                filename = '{}_{}_{}'.format(sample_name, dataTag, pathTup[0])
                DataParam.append({
                    "KEY": sample[dataTag]['fq1'],
                    "VALUE": rawData,
                    "VALUE2": filename
                })
                output[sample_name][dataTag]['fq1'] = os.path.join(rawData, filename)

                # Paired-end data: also stage fq2 (isSE presumably means
                # "single-end"; when False, 'fq2' is assumed present — a
                # sample parsed as SE upstream would raise KeyError here).
                if self.init.isSE == False:
                    pathTup = impl.splitext(sample[dataTag]['fq2'])
                    filename = '{}_{}_{}'.format(sample_name, dataTag, pathTup[0])
                    DataParam.append({
                        "KEY": sample[dataTag]['fq2'],
                        "VALUE": rawData,
                        "VALUE2": filename
                    })
                    output[sample_name][dataTag]['fq2'] = os.path.join(rawData, filename)

        if DataParam:
            # data.list: tab-separated "local_path<TAB>hdfs_dir<TAB>target_name".
            impl.write_file(
                fileName='data.list',
                scriptsdir=scriptsdir,
                commands=["${KEY}\t${VALUE}\t${VALUE2}"],
                JobParamList=DataParam)

            # Tiny Perl mapper for the (currently disabled) streaming upload
            # job: reads data.list lines from stdin and shells out to
            # GzUploader once per file.
            # NOTE(review): it indexes $tmp[1..3], i.e. it assumes one extra
            # leading field before the three data.list columns (the streaming
            # input key) — confirm against the streaming input format.
            mapper = []
            mapper.append("#!/usr/bin/perl -w")
            mapper.append("use strict;\n")
            mapper.append("while(<STDIN>)\n{")
            mapper.append("\tchomp;\n\tmy @tmp = split(/\\t/);")
            mapper.append("\tif(!-e $tmp[1])\n\t{")
            mapper.append("\t\tprint \"$tmp[1] don't exist.\\n\";")
            mapper.append("\t\texit 1;\n\t}")
            mapper.append("\tsystem(\"%s jar %s GzUploader -i $tmp[1] -o $tmp[2] -n $tmp[3]\");\n}" % (self.hadoop.bin, self.init.gzUploader))
            impl.write_file(
                fileName='upload_mapper.pl',
                scriptsdir=scriptsdir,
                commands=mapper)

            # Hadoop streaming parameters: map-only job, one mapper per
            # data.list entry (NLineInputFormat).
            hadoop_parameter = ' -D mapred.job.name="upload data" '
            if self.hadoop.get('queue'):
                hadoop_parameter += '-D mapreduce.job.queuename={} '.format(self.hadoop.queue)
            hadoop_parameter += ' -D mapred.map.tasks=%d ' % len(DataParam)
            hadoop_parameter += ' -D mapred.reduce.tasks=0 '
            hadoop_parameter += ' -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat '
            ParamDict = {
                "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.hadoop.streamingJar),
                "MAPPER": os.path.join(scriptsdir, 'upload_mapper.pl'),
                "INPUT": 'file://' + os.path.join(scriptsdir, 'data.list'),
                "OUTPUT": hdfs_gz_tmp,
                "HADOOPPARAM": hadoop_parameter
            }

            # Clean the temp output dir, then upload.  The streaming-job
            # variant is kept commented out; the active command runs
            # GzUploader directly in list mode (-l) over data.list.
            cmd.append('%s ${OUTPUT}' % self.fs_cmd.delete)
            # cmd.append('${PROGRAM} ${HADOOPPARAM} -input ${INPUT} -output ${OUTPUT} -mapper "perl ${MAPPER}"')
            cmd.append('%s jar %s GzUploader -i %s -l' % (self.hadoop.bin, self.init.gzUploader, os.path.join(scriptsdir, 'data.list')))

            # write script
            scriptPath = \
                impl.write_shell(
                    name='init',
                    scriptsdir=scriptsdir,
                    commands=cmd,
                    paramDict=ParamDict)
            result.script[sampleName] = scriptPath

        result.output = output

        # Quality system: use the configured value, or auto-detect when unset.
        if self.init.qualitysystem == '':
            self.check_qs(sampleInfo)
            print "[INFO   ]  -- qualitysystem is %s (autocheck)--" % self.init.qualitysystem
        else:
            print "[INFO   ]  -- qualitysystem is %s --" % self.init.qualitysystem

        return result