示例#1
0
def get_otu_by_rdp(workdir, input_fa, genus_loc='.'):
    """Pick OTUs for a FASTA file with the QIIME scripts and analyse them.

    ``pick_otus.py`` and ``pick_rep_set.py`` are run from inside ``workdir``,
    so a relative ``input_fa`` is resolved against ``workdir`` and not against
    the caller's working directory.  The OTU map, ``rep.fna`` and the analysis
    table are all written into ``workdir``.

    Args:
        workdir: Directory handed to ``safe_makedir`` and used as the working
            directory while the commands run.
        input_fa: Path of the input FASTA file.
        genus_loc: With the default ``'.'`` the genus table is taken from
            ``<fasta dir>/blast/<name>_blast_result.tsv``; any other value
            selects ``<fasta dir>/<name>_genus_result.tsv``.

    The caller's working directory is restored even when one of the steps
    raises.  A command that exits with a non-zero status is reported through
    the logger but does not abort the run.
    """
    safe_makedir(workdir)

    fasta_dir = os.path.dirname(input_fa)
    fasta_base_name = os.path.basename(input_fa)
    if genus_loc == '.':
        genus_result = os.path.join(
            fasta_dir, 'blast',
            fasta_base_name.replace('.fa', '_blast_result.tsv'))
    else:
        genus_result = os.path.join(
            fasta_dir, fasta_base_name.replace('.fa', '_genus_result.tsv'))

    # Both names are relative on purpose: they live in ``workdir``.
    otu_result = fasta_base_name.replace('.fa', '_otus.txt')
    analysis_result = fasta_base_name.replace('.fa', '_otus_result.tsv')

    # db_16s is a module-level setting passed to pick_otus.py through -b.
    cmd1 = 'pick_otus.py -i %s -m blast -o ./ -b %s' % (input_fa, db_16s)
    cmd2 = 'pick_rep_set.py -i %s -f %s -o rep.fna' % (otu_result, input_fa)

    now_dir = os.getcwd()
    os.chdir(workdir)
    try:
        logger.info(cmd1)
        logger.info(cmd2)
        for cmd in (cmd1, cmd2):
            status = os.system(cmd)
            if status != 0:
                # Keep going (best effort, as before) but leave a trace.
                logger.warn('Command exited with status %s: %s' % (status, cmd))

        analysis_otu_info(genus_result, otu_result, analysis_result)
    finally:
        # Never leave the process stranded inside workdir.
        os.chdir(now_dir)
示例#2
0
def get_fasta_seq(user, excel, input_dir_raw, output_dir, qual_length,
                  seq_start, seq_end):
    """Collect the successful reads of one run into a single FASTA file.

    The run directory is ``<base_dir>/raw_data/<user>/<input_dir_raw>``.  The
    spreadsheet ``excel`` in that directory is read, and a row counts as
    successful when its fragment size is greater than ``qual_length`` and its
    reaction result contains the success marker.  Every row is recorded in
    ``<input_dir_raw>_seq_info.tsv``; the ``*seq`` files whose name prefix
    (text before the first ``_``) belongs to a successful row are trimmed to
    ``[seq_start:seq_end]`` and written to ``<input_dir_raw>.fa``.  Both
    outputs go to ``<base_dir>/data/<user>/<input_dir_raw>``.

    Args:
        user: User sub-directory name.
        excel: File name of the spreadsheet inside the run directory.
        input_dir_raw: Name of the run directory; also the output base name.
        output_dir: Unused; kept for interface compatibility.
        qual_length: Minimum fragment size (exclusive).
        seq_start: Start index of the slice kept from each sequence.
        seq_end: End index of the slice kept from each sequence.

    Returns:
        Tuple ``(fasta_file, fasta_info_file)``.

    Exits the interpreter when the run directory is missing or when no row
    qualifies.
    """
    input_dir = os.path.join(base_dir, 'raw_data', user, input_dir_raw)
    if not os.path.exists(input_dir):
        logger.warn('We can not find sequences dir %s' % input_dir)
        exit()
    excel_file = os.path.join(input_dir, excel)

    # NOTE(review): the sheet name carries a leading space -- confirm that it
    # matches the workbooks actually delivered.
    seqs_info = ReadFiles.read_excel_onesheet(excel_file,
                                              sheet_name=' sheet1',
                                              same_line_debug=False)

    output_dir1 = os.path.join(base_dir, 'data', user)
    output_dir2 = os.path.join(base_dir, 'data', user, input_dir_raw)
    safe_makedir(output_dir1)
    safe_makedir(output_dir2)

    fasta_file = os.path.join(output_dir2, '%s.fa' % input_dir_raw)
    fasta_info_file = os.path.join(output_dir2,
                                   '%s_seq_info.tsv' % input_dir_raw)

    sucessful_seqs = 0
    # A set gives constant-time membership tests in the directory scan below.
    seqs_name = set()
    header = ['#seq_name', 'status', 'result']
    # ``with`` guarantees the table is closed even if a row is malformed.
    with open(fasta_info_file, 'w') as info_fp:
        info_fp.write('%s\n' % '\t'.join(header))
        for k1 in seqs_info:
            seq_name = seqs_info[k1]['样品名称']
            seq_length = seqs_info[k1]['片段大小']
            seq_status = seqs_info[k1]['反应结果']
            result = 0
            if int(seq_length) > qual_length and '成功' in seq_status:
                sucessful_seqs += 1
                seqs_name.add(seq_name)
                result = 1
            info = [seq_name, seq_status, str(result)]
            info_fp.write('%s\n' % '\t'.join(info))

    with open(fasta_file, 'w') as result_fp:
        for one_file in os.listdir(input_dir):
            if not one_file.endswith('seq'):
                continue
            seq_name2 = one_file.split('_')[0]
            if seq_name2 not in seqs_name:
                continue
            with open(os.path.join(input_dir, one_file)) as data1:
                seq2 = data1.read().strip()[seq_start:seq_end]
            result_fp.write('>%s\n' % seq_name2)
            result_fp.write('%s\n' % seq2)

    logger.info("%s has %s successful seqs" % (input_dir, sucessful_seqs))
    if sucessful_seqs == 0:
        logger.info('There is no successful seqs,please check it!')
        exit()

    return fasta_file, fasta_info_file
示例#3
0
def run_main(excel, input_dir, output_dir, qual_length, seq_start, seq_end,
             filter_identity, user, rdp):
    """Run the whole analysis for one delivery of sequences.

    Steps, in order: set up logging, build the FASTA file, BLAST it and
    analyse the XML result, assign taxonomy, pick OTUs for this run, merge
    the BLAST and taxonomy tables, rebuild the user's ``Total`` data set and,
    unless ``rdp`` is the string ``'False'``, pick OTUs for that total set.

    Args:
        excel, input_dir, output_dir, qual_length, seq_start, seq_end:
            Forwarded unchanged to ``get_fasta_seq``.
        filter_identity: Forwarded to ``analysis_blast_result_xml``.
        user: User sub-directory name below ``data_dir``.
        rdp: The total-set OTU step is skipped only when this equals the
            string ``'False'``.
    """
    # The return value is not used further; it is kept bound as before.
    base_logger_handle = create_base_logger()
    setup_local_logging(config)

    fasta_file, fasta_info_file = get_fasta_seq(
        user, excel, input_dir, output_dir, qual_length, seq_start, seq_end)

    # Every output path is derived from the FASTA file's location and name.
    run_dir = os.path.dirname(fasta_file)
    fasta_name = os.path.basename(fasta_file)

    blast_dir = os.path.join(run_dir, 'blast')
    safe_makedir(blast_dir)
    blast_xml = os.path.join(
        blast_dir, fasta_name.replace('.fa', '_blast_result.xml'))
    blast_tsv = os.path.join(
        blast_dir, fasta_name.replace('.fa', '_blast_result.tsv'))
    failed_fasta = fasta_file.replace('.fa', '_fail_blast.fa')
    rdp_table = os.path.join(
        run_dir, 'rdp_assigned_taxonomy',
        fasta_name.replace('.fa', '_tax_assignments.txt'))
    merged_table = fasta_file.replace('.fa', '_blast_rdp_result.tsv')

    blast_input(fasta_file, blast_xml)
    analysis_blast_result_xml(blast_xml, blast_tsv, fasta_file,
                              failed_fasta, filter_identity)

    get_taxonomy_info_by_rdp(fasta_file)

    get_otu_by_rdp(os.path.join(os.path.dirname(fasta_file), 'otu'),
                   fasta_file)

    merge_blast_rdp_file(blast_tsv, rdp_table, merged_table)

    user_data_dir = os.path.join(data_dir, user)
    merge_result(user_data_dir)

    # OTU picking over all of the user's sequences.
    if rdp != 'False':
        total_otu_dir = os.path.join(user_data_dir, 'Total', 'otu')
        # The relative FASTA path is given relative to total_otu_dir.
        get_otu_by_rdp(total_otu_dir, '../total.fa', genus_loc='Y')
    else:
        print('We will skip total seqs otu analysis!')

    logger.warn('Finish analysis! Thanks for using anBank')
示例#4
0
def run_split(fasta, otu_file, outdir):
    """Write one FASTA file per OTU listed in ``otu_file``.

    Each non-blank line of ``otu_file`` is split on whitespace: the first
    field identifies the OTU and the remaining fields are the sequence ids
    belonging to it.  For every line ``extract_seqs_by_sample_id.py`` is
    invoked to pull those ids out of ``fasta`` into
    ``<outdir>/<accession>.fa``.

    Args:
        fasta: FASTA file the sequences are extracted from.
        otu_file: OTU map, one OTU per line.
        outdir: Output directory, handed to ``safe_makedir`` first.
    """
    safe_makedir(outdir)
    with open(otu_file) as otu_fp:
        for raw_line in otu_fp:
            fields = raw_line.strip().split()
            if not fields:
                # Blank (or whitespace-only) line.
                continue
            otu_id, members = fields[0], fields[1:]
            # Accession = 4th '|' field, 2nd '_' part, text before first '.'.
            pipe_field = otu_id.split('|')[3]
            underscore_part = pipe_field.split('_')[1]
            otu_accession = underscore_part.split('.')[0]
            cmd = "extract_seqs_by_sample_id.py -i %s -o %s/%s.fa -s %s " % (
                fasta, outdir, otu_accession, ','.join(members))
            print(cmd)
            logger.info(cmd)
            os.system(cmd)
示例#5
0
def _append_fasta(fasta_path, merged_fp):
    """Copy the non-blank lines of one FASTA file, exiting on a repeated name."""
    seen_names = set()
    with open(fasta_path) as src:
        for each_line in src:
            stripped = each_line.strip()
            if stripped == '':
                continue
            if each_line.startswith('>'):
                # Compare the stripped name: that is what ends up in the
                # merged file, and it keeps the message on a single line.
                seq_name = stripped.replace('>', '')
                if seq_name in seen_names:
                    print('You have same name %s please check it!' % seq_name)
                    exit()
                seen_names.add(seq_name)
            merged_fp.write('%s\n' % stripped)


def _append_seq_info(info_path, merged_fp):
    """Copy the data rows (no blank lines, no '#' lines) of one seq-info table."""
    with open(info_path) as src:
        for each_line in src:
            if each_line.strip() == '' or each_line.startswith('#'):
                continue
            merged_fp.write('%s\n' % each_line.strip())


def _append_genus_table(table_path, merged_fp, header_written):
    """Copy one genus table; return whether a header has been written so far."""
    with open(table_path) as src:
        for each_line in src:
            stripped = each_line.strip()
            if stripped == '':
                continue
            if each_line.startswith('#'):
                if header_written:
                    continue
                header_written = True
            merged_fp.write('%s\n' % stripped)
    return header_written


def merge_result(data_dir):
    """Merge every run below ``data_dir`` into ``<data_dir>/Total``.

    For each entry ``fn`` of ``data_dir`` whose name does not start with
    ``Total`` the files ``fn/fn.fa``, ``fn/fn_seq_info.tsv`` and
    ``fn/fn_blast_rdp_result.tsv`` are appended, when they exist, to
    ``Total/total.fa``, ``Total/total_seq_info.tsv`` and
    ``Total/total_genus_result.tsv``.  The merged files are rewritten from
    scratch on every call.

    Each merged table carries exactly one header line: the seq-info header
    is fixed, and the genus header is the first '#' line met in any input.

    A sequence name that occurs twice inside one input FASTA file ends the
    program; the output files are closed before that happens.

    Args:
        data_dir: Directory containing one sub-directory per run.
    """
    total_result = '%s/Total' % data_dir
    safe_makedir(total_result)
    total_fa = os.path.join(total_result, 'total.fa')
    total_seq_info = os.path.join(total_result, 'total_seq_info.tsv')
    total_genus_result = os.path.join(total_result, 'total_genus_result.tsv')

    # Shared across all inputs so the genus header is emitted only once.
    header_written = False
    with open(total_fa, 'w') as fa_fp, \
            open(total_genus_result, 'w') as genus_fp, \
            open(total_seq_info, 'w') as info_fp:
        info_fp.write('#seq_name\tstatus\tresult\n')
        for fn in os.listdir(data_dir):
            if fn.startswith('Total'):
                continue
            fn_fa = os.path.join(data_dir, fn, '%s.fa' % fn)
            fn_seq = os.path.join(data_dir, fn, '%s_seq_info.tsv' % fn)
            blast_analysis_fp = os.path.join(data_dir, fn,
                                             '%s_blast_rdp_result.tsv' % fn)

            # NOTE(review): names are only checked for repeats within one
            # input file, not across runs -- confirm this is intended.
            if os.path.exists(fn_fa):
                _append_fasta(fn_fa, fa_fp)
            if os.path.exists(fn_seq):
                _append_seq_info(fn_seq, info_fp)
            if os.path.exists(blast_analysis_fp):
                header_written = _append_genus_table(
                    blast_analysis_fp, genus_fp, header_written)
示例#6
0
def _create_log_handler(config, add_hostname=False, direct_hostname=False):
    """Assemble the logbook handler stack described by ``config``.

    The stack is built in this order: a ``NullHandler``; three file handlers
    (main, debug, commands) when ``get_log_dir(config)`` yields a directory;
    a stdout handler for records accepted by ``_is_stdout``; a mail handler
    when an address is configured; and finally a stderr handler.

    Args:
        config: Mapping read for ``include_time`` (default True), ``email``
            and ``resources -> log -> email``; also passed to ``get_log_dir``.
        add_hostname: Prefix each record with ``record.extra[source]``.
        direct_hostname: Prefix each record with this machine's host name.

    Returns:
        A ``CloseableNestedSetup`` wrapping the handlers.
    """
    logbook.set_datetime_format("local")

    # Optional prefixes, in display order, followed by the message itself.
    prefix_parts = []
    if config.get("include_time", True):
        prefix_parts.append("[{record.time:%Y-%m-%dT%H:%M}] ")
    if add_hostname:
        prefix_parts.append("{record.extra[source]}: ")
    if direct_hostname:
        prefix_parts.append("%s: " % socket.gethostname())
    format_str = "".join(prefix_parts) + "{record.message}"

    handlers = [logbook.NullHandler()]

    log_dir = get_log_dir(config)
    if log_dir:
        if not os.path.exists(log_dir):
            filehandler.safe_makedir(log_dir)
            # Wait to propagate, otherwise see logging errors on distributed
            # filesystems.
            time.sleep(5)
        # (file name template, handler options) for each log file.
        file_specs = [
            ("%s.log", dict(level="INFO", filter=_not_cl)),
            ("%s-debug.log", dict(level="DEBUG", bubble=True,
                                  filter=_not_cl)),
            ("%s-commands.log", dict(level="DEBUG", filter=_is_cl)),
        ]
        for name_template, options in file_specs:
            log_path = os.path.join(log_dir, name_template % LOG_NAME)
            handlers.append(
                logbook.FileHandler(log_path,
                                    format_string=format_str,
                                    **options))

    stdout_handler = logbook.StreamHandler(sys.stdout,
                                           format_string="{record.message}",
                                           level="DEBUG",
                                           filter=_is_stdout)
    handlers.append(stdout_handler)

    # A top-level "email" entry wins over resources -> log -> email.
    fallback_email = config.get("resources", {}).get("log", {}).get("email")
    email = config.get("email", fallback_email)
    if email:
        email_str = u'''Subject: [bcbio-nextgen] {record.extra[run]} \n\n {record.message}'''
        mail_handler = logbook.MailHandler(email, [email],
                                           format_string=email_str,
                                           level='INFO',
                                           bubble=True)
        handlers.append(mail_handler)

    stderr_handler = logbook.StreamHandler(sys.stderr,
                                           format_string=format_str,
                                           bubble=True,
                                           filter=_not_cl)
    handlers.append(stderr_handler)
    return CloseableNestedSetup(handlers)