Exemplo n.º 1
0
    def qc(self):
        self.get_proj_base_inf()
        '''
        generate qc analysis script
        '''
        qc_dir = os.path.join(self.proj_dir, 'qc')
        qc_script = os.path.join(self.proj_dir, 'qc.sh')
        qc_error = os.path.join(self.proj_dir, 'qc.error')
        # remove in future
        pathlib.Path(qc_error).touch()
        script_inf = '''
#! /bin/sh
qc_pipe.py qc_collection \\
    --OutDir {0} \\
    --CleanDir {1} \\
    --SampleInf {2} \\
    --species {3} \\
    --database {4} \\
    --database-version {5} \\
    --workers {6} \\

    '''.format(qc_dir, self.clean_dir, self.sample_inf, self.species,
               self.database, self.database_version, self.worker_number)
        write_obj_to_file(script_inf, qc_script)
        return self.run_script_or_cmd(qc_script)
Exemplo n.º 2
0
def get_diff_target(difffile, sRNA_target_dict, out_target):
    '''Collect the target mRNAs of differentially expressed miRNAs.

    Args:
        difffile: text file with one miRNA id per line.
        sRNA_target_dict: maps miRNA id -> list of target mRNA ids.
        out_target: output path; the unique target ids are written there.
    '''
    # close the diff file deterministically (the original leaked the handle)
    with open(difffile) as diff_inf:
        diff_list = [each.strip() for each in diff_inf]
    target_mRNA_list = []
    for each_miRNA in diff_list:
        if each_miRNA in sRNA_target_dict:
            target_mRNA_list.extend(sRNA_target_dict[each_miRNA])
    # set() de-duplicates but does not guarantee output order
    target_mRNA_list = list(set(target_mRNA_list))
    python_tools.write_obj_to_file(target_mRNA_list, out_target)
Exemplo n.º 3
0
def get_diff_gene(difffile, outgene):
    '''Extract the first column (gene ids) of a diff table, skipping the
    header row, and write the ids to *outgene*.

    Exits the process with an error message when *difffile* is missing.
    '''
    if not os.path.isfile(difffile):
        sys.exit('%s not exists!' % difffile)
    diff_gene_list = []
    with open(difffile) as difffile_info:
        for row_num, eachline in enumerate(difffile_info):
            if row_num == 0:
                continue  # header row
            diff_gene_list.append(eachline.split('\t')[0])
    python_tools.write_obj_to_file(diff_gene_list, outgene)
Exemplo n.º 4
0
 def kallisto_quant(self):
     '''Run kallisto quant (paired- or single-end) and log the command.

     Paired-end (more than one fastq) lets kallisto infer the fragment
     length; single-end supplies --single with -l/-s explicitly.

     Returns:
         the command line string that was executed.
     '''
     fq_cmd = ' '.join(self.fq_list)
     if len(self.fq_list) > 1:
         # paired-end run
         cmd = '%s quant -i %s -o %s %s' % (self.kallisto, self.index,
                                            self.out_dir, fq_cmd)
     else:
         # single-end run; removed unused local `single_length`
         # (computed as 2*fq_length but never read)
         cmd = '%s quant -i %s -o %s --single -l %s -s %s --plaintext -t %s %s ' % (
             self.kallisto, self.index, self.out_dir, self.fq_length,
             self.sd, self.thread, fq_cmd)
     python_tools.circ_call_process(cmd)
     quant_log = os.path.join(self.out_dir, 'quant.cmd.log')
     python_tools.write_obj_to_file(cmd, quant_log)
     return cmd
Exemplo n.º 5
0
 def run(self):
     '''Record the mapping-report file list and write this task's ignore list.'''
     report_files = [
         'mapping_stats_plot.png', 'mapping_stats.report',
         'mapping_stats.txt'
     ]
     write_obj_to_file(report_files, path.join(self.OutDir, '.report_files'))
     skip_entries = (
         '.ignore', 'logs', 'mapping_dir', 'bam_dir', 'mapping_stats.plot',
         'Rplots.pdf', 'mapping_stats.report'
     )
     with self.output().open('w') as out_inf:
         for entry in skip_entries:
             out_inf.write('{}\n'.format(entry))
Exemplo n.º 6
0
 def run(self):
     '''Collect fastqc report files for the PDF report and write the ignore list.'''
     patterns = [
         'fastqc_general_stats.txt', 'gc_plot/*gc_distribution.line.png',
         'reads_quality_plot/*reads_quality.bar.png'
     ]
     report_files = rsync_pattern_to_file(self.OutDir, patterns)
     write_obj_to_file(report_files, path.join(self.OutDir, '.report_files'))
     skip_entries = (
         '.ignore', 'logs', 'fastqc_results/*zip', '.report_files'
     )
     with self.output().open('w') as out_inf:
         for entry in skip_entries:
             out_inf.write('{}\n'.format(entry))
Exemplo n.º 7
0
 def run(self):
     '''Assemble GO/KEGG enrichment report files and write the ignore list.'''
     patterns = ['go/*/*go.enrichment.barplot.png',
                 'kegg/*/*kegg.enrichment.barplot.png',
                 'go/*/DAG/ALL*png', 'go/*/*.ALL.go.enrichment.txt',
                 'kegg/*/*ALL.kegg.enrichment.txt']
     report_files = rsync_pattern_to_file(self.OutDir, patterns)
     # KEGG pathway plots are discovered separately and appended
     report_files.extend(get_enrichment_data(self.OutDir))
     write_obj_to_file(report_files, path.join(self.OutDir, '.report_files'))
     skip_entries = ('.ignore', 'logs', 'kegg/blast_out',
                     'kegg/kegg_pathway_logs', '.report_files',
                     'report.go.table.txt', 'report.kegg.table.txt',
                     '*png')
     with self.output().open('w') as out_inf:
         for entry in skip_entries:
             out_inf.write('{}\n'.format(entry))
Exemplo n.º 8
0
 def run(self):
     '''Gather expression-analysis report files and write the ignore list.'''
     patterns = ['expression_summary/*.png',
                 'differential_analysis/*/*png',
                 'expression_summary/*Gene.tpm.txt',
                 'expression_summary/*example.diff.table.txt',
                 'differential_analysis/*/*.edgeR.DE_results.txt']
     report_files = rsync_pattern_to_file(self.OutDir, patterns)
     write_obj_to_file(report_files, path.join(self.OutDir, '.report_files'))
     skip_entries = ('.ignore', 'logs', 'kallisto/*/run_info.json',
                     '.report_files', 'Rplots.pdf',
                     'expression_summary/pdf.*',
                     'expression_summary/html.*',
                     'expression_summary/ALL.Volcano_plot.*')
     with self.output().open('w') as out_inf:
         for entry in skip_entries:
             out_inf.write('{}\n'.format(entry))
Exemplo n.º 9
0
    def run(self):
        '''Build the SNP plotting shell script, execute it, and log its output.'''
        stats_cmd = 'bcftools stats -s - {0}/snp.filter.vcf > {0}/snp.stats'.format(
            OutDir)
        vcfstats_cmd = 'plot-vcfstats -p {0}/snp_plot -s {0}/snp.stats'.format(
            OutDir)
        rscript_cmd = ('Rscript {0} --snp_stats {1}/snp_plot/tstv_by_sample.0.dat '
                       '--sample_inf {2} --out_dir {1}').format(
                           SNP_PLOT, OutDir, SampleInf)
        cmd_file = path.join(OutDir, 'snp.plot.sh')
        write_obj_to_file([stats_cmd, vcfstats_cmd, rscript_cmd], cmd_file)

        # run the generated script through sh and persist its output
        summary_inf = run_cmd(['sh', cmd_file])
        with self.output().open('w') as summary_log:
            summary_log.write(summary_inf)
Exemplo n.º 10
0
 def run(self):
     '''Collect RSeQC plot files for the PDF report and write the ignore list.'''
     patterns = [
         'inner_distance/*inner_distance.bar.png',
         'read_duplication/*reads_duplication.point.png',
         'genebody_coverage/*genebody_coverage.point.png',
         'read_distribution/read_distribution.bar.png',
         'read_distribution/*read_distribution.pie.png'
     ]
     report_files = rsync_pattern_to_file(self.OutDir, patterns)
     write_obj_to_file(report_files, path.join(self.OutDir, '.report_files'))
     skip_entries = (
         '.ignore', 'logs', 'read_duplication/*.DupRate_plot.*',
         'read_distribution/read_distribution.summary.txt',
         'junction_saturation', 'inner_distance/*inner_distance_plot*',
         'inner_distance/*inner_distance.txt', 'infer_experiment',
         'genebody_coverage/*geneBodyCoverage.curves.pdf',
         'genebody_coverage/*geneBodyCoverage.r', 'Rplots.pdf'
     )
     with self.output().open('w') as out_inf:
         for entry in skip_entries:
             out_inf.write('{}\n'.format(entry))
Exemplo n.º 11
0
    def run(self):
        '''Merge per-sample StringTie GTFs and extract transcript sequences.'''
        assembly_dir = path.join(OutDir, 'assembly_dir')
        gtf_list_file = path.join(assembly_dir, 'gtf.list')
        # one assembled GTF per sample, listed for stringtie --merge
        write_obj_to_file(
            [path.join(assembly_dir, '{}.gtf'.format(sample))
             for sample in sample_list],
            gtf_list_file)

        merged_gtf = '{}/stringtie_merge.gtf'.format(OutDir)
        merge_gtf_cmd = ['stringtie', '--merge', '-m', '200', '-T', '0.1',
                         '-f', '0.1', '-o', merged_gtf, gtf_list_file]
        if RefGtf:
            # guide the merge with the reference annotation when available
            merge_gtf_cmd += ['-G', RefGtf]

        get_fa_cmd = ['gffread', merged_gtf, '-g', RefFa,
                      '-w', '{}/stringtie_merge.fa'.format(OutDir)]

        merge_log_inf = run_cmd([merge_gtf_cmd, get_fa_cmd])
        with self.output().open('w') as merge_log:
            merge_log.write(merge_log_inf)
Exemplo n.º 12
0
    def pipe(self):
        self.get_proj_base_inf()
        pipe_error = os.path.join(self.proj_dir, 'pipe.error')
        pipe_script = os.path.join(self.proj_dir, 'pipe.sh')
        pathlib.Path(pipe_error).touch()
        pipe_inf = '''
#! /bin/sh
mRNA_pipe_v2.py run_pipe \\
    --proj-name {0} \\
    --proj-dir {1} \\
    --clean-dir {2} \\
    --sample-inf {3} \\
    --species {4} \\
    --workers {5} \\
    --database {6} \\
    --database-version {7} \\
        '''.format(self.proj_name, self.proj_dir, self.clean_dir,
                   self.sample_inf, self.species, self.worker_number,
                   self.database, self.database_version)
        if self.kegg_bg:
            pipe_inf = '{0}    --kegg-bg {1} \\\n'.format(
                pipe_inf, self.kegg_bg)
        write_obj_to_file(pipe_inf, pipe_script)
        return self.run_script_or_cmd(pipe_script)
Exemplo n.º 13
0
                            pass

# Create the output directory, then gather (or verify) fastq md5 checksums
# for every sample and write the combined list to <out_dir>/fq_md5.txt.
# NOTE(review): `args`, `sample_dict` and `logging` are configured earlier
# in this script (outside this excerpt).
python_tools.circ_mkdir_unix(args.out_dir)
time_info = time.localtime()
# run timestamp, e.g. 2020-1-2-3:4:5 (fields are not zero-padded)
output_time = '%s-%s-%s-%s:%s:%s' % (time_info.tm_year, time_info.tm_mon,
                                     time_info.tm_mday, time_info.tm_hour,
                                     time_info.tm_min, time_info.tm_sec)

md5_file = os.path.join(args.out_dir, 'fq_md5.txt')
md5_list = []
for each in sample_dict:
    if args.nocheck:
        # compute md5 sums only, skip verification
        logging.info('get md5 of %s start' % each)
        log_list = sample_dict[each].get_dna_md5()
        logging.info('get md5 of %s finished' % each)
    else:
        # verify md5 sums; the method depends on the data type
        logging.info('check md5 of %s start' % each)
        if args.type == 'dna':
            log_list = sample_dict[each].check_dna_md5(args.out_dir)
        else:
            log_list = sample_dict[each].check_rna_md5(args.out_dir)
    if log_list:
        # log lines containing 'ok' are successes; anything else is an error
        for each_log in log_list:
            if 'ok' in each_log:
                logging.info(each_log)
            else:
                logging.error(each_log)
        logging.info('check md5 of %s finished' % each)
    md5_list.extend(sample_dict[each].md5)
python_tools.write_obj_to_file(md5_list, md5_file)
Exemplo n.º 14
0
    shell_name = path.basename(sh_script)
    log_file = path.join(log_dir, '{0}.log'.format(shell_name))
    system('nohup sh {0} > {1} 2>&1 &'.format(sh_script, log_file))

if __name__ == '__main__':
    arguments = docopt(__doc__, sys.argv[1:], version='v1')
    cp_info_file = arguments['--cp_info_file']
    log_dir = arguments['--log_dir']
    # both arguments are mandatory; print usage and bail otherwise
    if not cp_info_file or not log_dir:
        print(__doc__)
        sys.exit(1)

    ## read the copy map: each line is "<from_dir>\t<to_dir>"
    cp_info_dict = {}
    with open(cp_info_file) as cp_info_file_inf:
        for eachline in cp_info_file_inf:
            eachline_inf = eachline.strip().split('\t')
            cp_info_dict[eachline_inf[0]] = eachline_inf[1]

    ## create a cp shell script for each directory and launch it in background
    cp_info_file_name = path.basename(cp_info_file)
    shell_script = '{0}.sh'.format(cp_info_file_name)
    cmd_list = []
    for each in cp_info_dict:
        from_dir = each
        to_dir = cp_info_dict[each]
        cmd_list.extend(shell_cmd_for_cp(from_dir, to_dir, log_dir))
    write_obj_to_file(cmd_list, shell_script)
    nohup_sh_job(shell_script, log_dir)

Exemplo n.º 15
0
def accession2gi(accession):
    '''Resolve a nucleotide accession to its linked id via Entrez elink.

    Retries once after a 20s pause on HTTPError (a second failure
    propagates), then sleeps 10s before returning to throttle requests.
    '''
    for attempt in (1, 2):
        try:
            handle = Entrez.elink(dbfrom="nucleotide",
                                  id=accession,
                                  linkname="nucleotide_nuccore")
            break
        except HTTPError:
            if attempt == 2:
                raise
            time.sleep(20)
    record = Entrez.read(handle)
    handle.close()
    linked = record[0]['IdList'][0]
    time.sleep(10)
    return linked


if __name__ == '__main__':
    # Map each accession in the input file to a GI number; failures are
    # reported but do not abort the run.
    accession_list = [each.strip() for each in open(args.accession)]
    gi_list = []
    for each_id in accession_list:
        try:
            each_accession = accession2gi(each_id)
        except Exception:
            # narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit still propagate; print() works on py2 and py3
            print('can not find gi number of accession: %s' % each_id)
        else:
            gi_list.append(each_accession)
    python_tools.write_obj_to_file(gi_list, args.gi)
Exemplo n.º 16
0
        'rawdata_number.json').st_size:
    sample_number_dict = python_tools.load_fn_to_obj('rawdata_number.json')
else:
    sample_number_dict = {}

# Sum raw fastq sizes per sample (GB), write a copy-info table and the
# copy commands; per-sample read-file counts are cached in
# rawdata_number.json.
# NOTE(review): `sample_dict`, `cp_data_info_file`, `cp_cmd`, `args` and
# `cwd` are defined earlier in this script (outside this excerpt).
total_size = []
cp_data_info = open(cp_data_info_file, 'w')
for each in sample_dict:
    if each in sample_number_dict:
        # restore the previously recorded read-file count for this sample
        sample_dict[each].pre_num = sample_number_dict[each]
    else:
        sample_number_dict[each] = len(sample_dict[each].read1)
    cmd_line = sample_dict[each].add_sup_data(args.out_dir)
    for n, each_fq in enumerate(sample_dict[each].read1):
        read1_fq = sample_dict[each].read1[n]
        read2_fq = sample_dict[each].read2[n]
        # file sizes converted to gigabytes (1024**3 bytes)
        read1_fq_size = os.stat(read1_fq).st_size / float(1024**3)
        read2_fq_size = os.stat(read2_fq).st_size / float(1024**3)
        total_size.extend([read1_fq_size, read2_fq_size])
        read1_fq_size_out = round(read1_fq_size, 2)
        read2_fq_size_out = round(read2_fq_size, 2)
        cp_data_info.write('%s\t%s\t%sG\t%s\t%sG\n' %
                           (sample_dict[each].name, read1_fq,
                            read1_fq_size_out, read2_fq, read2_fq_size_out))
    python_tools.write_obj_to_file(cmd_line, cp_cmd, True)
cp_data_info.write('total : %sG' % round(sum(total_size), 2))
cp_data_info.close()

cp_data_info_json = os.path.join(cwd, 'rawdata_number.json')
python_tools.write_obj_to_json(sample_number_dict, cp_data_info_json)
Exemplo n.º 17
0
    def check_data(self):
        '''Run QC threshold checks and write qc_check_out.txt / qc_summary.txt.

        Each check runs only when its QC sub-directory exists: Q30
        (fastqc), mapping rates (mapping summary), and RSeQC metrics
        (duplication, TIN, CDS portion). Failure messages are appended to
        qc_check_out.txt; the summary table is written at the end.
        '''
        qc_summary_check_file = path.join(self.qc_dir, 'qc_check_out.txt')
        write_obj_to_file('QC check logs.', qc_summary_check_file)
        # check q30: flag samples whose Q30(%) falls below Q30_CUT
        if path.exists(self.fastqc_dir):
            self.get_fastqc_summary()
            q30_check_msg = self.get_failed_msg('Q30',
                                                'Q30(%)',
                                                self.Q30_CUT,
                                                gt=False)
            write_obj_to_file(q30_check_msg,
                              qc_summary_check_file,
                              append=True)
        # check mapping rates from the mapping summary
        if path.exists(self.mapping_dir):
            self.get_mapping_summary()
            # strip the '%' suffix so rates can be compared numerically
            self.qc_summary_df.loc[:, 'unique_mapping_rate'] = [
                float(each.rstrip('%'))
                for each in self.qc_summary_df.loc[:,
                                                   'Uniquely mapped reads %']
            ]
            self.qc_summary_df.loc[:, 'multiple_mapping_rate'] = [
                float(each.rstrip('%')) for each in
                self.qc_summary_df.loc[:, '% of reads mapped to multiple loci']
            ]
            self.qc_summary_df.loc[:,
                                   'total_mapping_rate'] = self.qc_summary_df.loc[:,
                                                                                  'unique_mapping_rate'] + self.qc_summary_df.loc[:,
                                                                                                                                  'multiple_mapping_rate']
            mapping_rate_median = self.qc_summary_df.loc[:,
                                                         'total_mapping_rate'].median(
                                                         )
            # cutoff: 10 points below the cohort median, but never lower
            # than the fixed MAPPING_RATE_CUT floor
            mapping_rate_cutoff = max((mapping_rate_median - 10),
                                      self.MAPPING_RATE_CUT)
            mapping_rate_check_msg = self.get_failed_msg('Mapping rate',
                                                         'total_mapping_rate',
                                                         mapping_rate_cutoff,
                                                         gt=False)
            write_obj_to_file(mapping_rate_check_msg,
                              qc_summary_check_file,
                              append=True)
            multi_map_median = self.qc_summary_df.loc[:,
                                                      'multiple_mapping_rate'].median(
                                                      )
            # cutoff: 10 points above the median, capped at MULTI_MAP_CUT
            multi_map_cutoff = min((multi_map_median + 10), self.MULTI_MAP_CUT)
            multi_map_check_msg = self.get_failed_msg('Multiple mapping rate',
                                                      'multiple_mapping_rate',
                                                      multi_map_cutoff)
            write_obj_to_file(multi_map_check_msg,
                              qc_summary_check_file,
                              append=True)
            # drop the helper columns so they do not reach the final table
            self.qc_summary_df = self.qc_summary_df.drop([
                'unique_mapping_rate', 'multiple_mapping_rate',
                'total_mapping_rate'
            ],
                                                         axis=1)
        # check rseqc metrics
        if path.exists(self.rseqc_dir):
            self.get_rseqc_summary()
            # check duplication (fails when above DUP_CUT)
            dup_check_msg = self.get_failed_msg('Duplication',
                                                'Duplication_seq',
                                                self.DUP_CUT)
            write_obj_to_file(dup_check_msg,
                              qc_summary_check_file,
                              append=True)
            # check tin (fails when below TIN_CUT)
            tin_check_msg = self.get_failed_msg('TIN',
                                                'TIN(median)',
                                                self.TIN_CUT,
                                                gt=False)
            write_obj_to_file(tin_check_msg,
                              qc_summary_check_file,
                              append=True)

            # check cds reads portion (fails when below CDS_CUT)
            cds_check_msg = self.get_failed_msg('CDS reads portion',
                                                'CDS_portion',
                                                self.CDS_CUT,
                                                gt=False)
            write_obj_to_file(cds_check_msg,
                              qc_summary_check_file,
                              append=True)

        # write qc summary out
        self.qc_summary_df.index.name = 'Sample'
        qc_summary_out = path.join(self.qc_dir, 'qc_summary.txt')
        self.qc_summary_df.to_csv(qc_summary_out, sep='\t')
Exemplo n.º 18
0
# Compute fastq md5 sums for every sample and write a header line plus
# the combined list to <out_dir>/fq_md5.txt.
# NOTE(review): `args`, `sample_dict` and `logging` are configured earlier
# in this script (outside this excerpt).
python_tools.circ_mkdir_unix(args.out_dir)
time_info = time.localtime()
# run timestamp, e.g. 2020-1-2-3:4:5 (fields are not zero-padded)
output_time = '%s-%s-%s-%s:%s:%s' % (time_info.tm_year, time_info.tm_mon,
                                     time_info.tm_mday, time_info.tm_hour,
                                     time_info.tm_min, time_info.tm_sec)

md5_file = os.path.join(args.out_dir, 'fq_md5.txt')
md5_list = []
for each in sample_dict:
    if args.nocheck:
        logging.info('get md5 of %s start' % each)
        log_list = sample_dict[each].get_dna_md5()
        logging.info('get md5 of %s finished' % each)
    else:
        logging.info('check md5 of %s start' % each)
        # NOTE(review): this branch logs "check md5" but calls the
        # get_*_md5 methods (compute only); a sibling script calls
        # check_dna_md5/check_rna_md5 here -- confirm which is intended.
        if args.type == 'dna':
            log_list = sample_dict[each].get_dna_md5()
        else:
            log_list = sample_dict[each].get_rna_md5()
    if log_list:
        # log lines containing 'ok' are successes; anything else is an error
        for each_log in log_list:
            if 'ok' in each_log:
                logging.info(each_log)
            else:
                logging.error(each_log)
        logging.info('check md5 of %s finished' % each)
    md5_list.extend(sample_dict[each].md5)
python_tools.write_obj_to_file('Sample\tfile\tfile_size\tmd5', md5_file)
python_tools.write_obj_to_file(md5_list, md5_file, True)
Exemplo n.º 19
0
 def kallisto_index(self):
     '''Build the kallisto index for the transcript fasta and log the command.

     Returns:
         the command line string that was executed.
     '''
     cmd = '%s index -i %s %s' % (self.kallisto, self.index,
                                  self.transcript_fa)
     python_tools.circ_call_process(cmd)
     index_log = os.path.join(self.out_dir, 'index.cmd.log')
     python_tools.write_obj_to_file(cmd, index_log)
     return cmd
Exemplo n.º 20
0
import sys
import os
import RNAseq_tools
import python_tools

# Filter a GTF by transcript length and write the filtered annotation.
# Usage: python <script> <gtf> <length_filtered_gtf>
if len(sys.argv) != 3:
    # print() form works under both py2 and py3 (the original py2-only
    # print statement would break the py3 parts of this project)
    print('python ' + sys.argv[0] + ' gtf length_filtered_gtf')
    # NOTE(review): exits 0 on bad usage; kept for backward compatibility
    sys.exit(0)

ori_gtf = sys.argv[1]
out_gtf = sys.argv[2]

tr_dict = RNAseq_tools.get_transcript_info(ori_gtf)
filter_gtf_list = RNAseq_tools.gtf_length_filter(ori_gtf, tr_dict)
python_tools.write_obj_to_file(filter_gtf_list, out_gtf)
Exemplo n.º 21
0
    sys.exit(0)

# Annotate a novel-transcript GTF with gene/transcript biotypes taken
# from a lncRNA feature table; ids missing from the table keep the
# default biotype 'TUCP'.
# NOTE(review): GFF_Reader and python_tools are imported earlier in this
# script (outside this excerpt).
lncRNA_feature = sys.argv[1]
novel_gtf = sys.argv[2]
add_gtf = sys.argv[3]

# transcript id -> biotype and gene id -> biotype, from the feature table
# (columns: tr_id at index 4, gene_id at index 5, biotype in last column)
lncRNA_tr_dict = {}
lncRNA_gene_dict = {}
with open(lncRNA_feature) as lncRNA_feature_inf:
    for n, eachline in enumerate(lncRNA_feature_inf):
        if n != 0:  # skip header row
            eachline_inf = eachline.strip().split('\t')
            tr_id = eachline_inf[4]
            gene_id = eachline_inf[5]
            tr_type = eachline_inf[-1]
            lncRNA_tr_dict[tr_id] = tr_type
            lncRNA_gene_dict[gene_id] = tr_type

out_list = []
for eachline in GFF_Reader(novel_gtf):
    gene_id = eachline.attr['gene_id']
    transcript_id = eachline.attr['transcript_id']
    # default biotype when the id is not listed in the feature table
    gene_type = tr_type = 'TUCP'
    if gene_id in lncRNA_gene_dict:
        gene_type = lncRNA_gene_dict[gene_id]
    if transcript_id in lncRNA_tr_dict:
        tr_type = lncRNA_tr_dict[transcript_id]
    # append the biotype attributes to the original GTF line
    out_list.append('%s; gene_biotype "%s"; transcript_biotype "%s";' %
                    (eachline.get_gff_line().strip(), gene_type, tr_type))
python_tools.write_obj_to_file(out_list, add_gtf)
            if n == 0:
                for m, each_col in enumerate(eachline_info):
                    if '_novel_miRNA' in each_col:
                        sample_id = each_col.split('_novel_miRNA')[0]
                        sample_index_dict[m] = sample_id
            else:
                novel_id = eachline_info[0]
                novel_seq_list = [[], [], []]
                for m, each_col in enumerate(eachline_info):
                    if m in sample_index_dict:
                        sample_id = sample_index_dict[m]
                        if each_col != '-':
                            each_col_list = each_col.split(',')
                            each_col_list = [
                                each.strip() for each in each_col_list
                            ]
                            for each_col_info in each_col_list:
                                for i, each_seq in enumerate(
                                        miRNA_seq_dict[each_col_info]
                                    [sample_id]):
                                    novel_seq_list[i].append(
                                        miRNA_seq_dict[each_col_info]
                                        [sample_id][i])
                for j, each_seq_list in enumerate(novel_seq_list):
                    each_merged_seq = merge_seq_list(each_seq_list)
                    seq_dict_list[j][novel_id] = each_merged_seq

    for n, each_dict in enumerate(seq_dict_list):
        each_file = seq_file_list[n]
        python_tools.write_obj_to_file(each_dict, each_file)
# Build a shell script that down-samples fastq files for samples whose
# data size exceeds `cutoff`.
# NOTE(review): `args`, `cwd`, `cutoff`, `random`, `sample_list` and
# `target_reads_range` are defined earlier in this script (outside this
# excerpt).
with open(args.datasize_file) as datasize_file_info:
    for n,eachline in enumerate(datasize_file_info):
        if n!= 0:
            eachline_info = eachline.strip().split('\t')
            sample_id = eachline_info[0]
            datasize = float(eachline_info[1])
            # only over-sized samples are queued for down-sampling
            if datasize > cutoff:
                sample_list.append(sample_id)

extract_cmd_file = os.path.join(cwd,'extract_cmd.sh')
extract_cmd_list = []
tmp_dir = os.path.join(args.data_dir,'tmp')
if not os.path.exists(tmp_dir):
    extract_cmd_list.append('mkdir -p "%s"' % tmp_dir)

# pick a random target read count for each queued sample
sample_datasize = random.sample(target_reads_range,len(sample_list))
for n,each_sample in enumerate(sample_list):
    extract_cmd_list.append('echo "#### start extract data of sample %s ####"' % each_sample)
    extract_cmd_list.append('date')
    for read_num in (1,2):
        each_fq_file = os.path.join(args.data_dir,'%s_R%s.fastq.gz' % (each_sample,read_num))
        each_fq_bak_file = os.path.join(tmp_dir,'%s_R%s.fastq.gz' % (each_sample,read_num))
        each_extract_read = sample_datasize[n]
        # move the original aside, then keep only the first N lines
        extract_cmd_list.append('mv "%s" "%s"' % (each_fq_file,each_fq_bak_file))        
        extract_cmd_list.append('zcat "%s" | head -%s |gzip > "%s"' % (each_fq_bak_file,each_extract_read,each_fq_file))
    extract_cmd_list.append('date')
    extract_cmd_list.append('echo "#### finish extract data of sample %s ####"\n' % each_sample)
python_tools.write_obj_to_file(extract_cmd_list,extract_cmd_file)


Exemplo n.º 24
0
        if n == 0:
            if args.go:
                go_info = ['GO_term_accession', 'GO_domain', 'GO_term_name']
            else:
                go_info = [
                    'GO_term_accession', 'GO_domain', 'GO_term_name',
                    'InterPro_ID', 'InterPro_description'
                ]
        else:
            if args.go:
                go_info = ['--'] * 3
            else:
                go_info = ['--'] * 5
            if gene_id in go_anno_dict:
                go_info = []
                for each in go_anno_dict[gene_id]:
                    go_info.append(','.join(each))
        eachline_info.extend(go_info)

        if n == 0:
            ko_info = ['KO_ID', 'KO_name']
        else:
            ko_info = ['--'] * 2
            if gene_id in ko_anno_dict:
                ko_info = []
                for each in ko_anno_dict[gene_id]:
                    ko_info.append(','.join(each))
        eachline_info.extend(ko_info)
        output_info_list.append('\t'.join(eachline_info))
python_tools.write_obj_to_file(output_info_list, args.output)
Exemplo n.º 25
0
Entrez.email = '*****@*****.**'


def accession2gi(accession):
    '''Look up the first UID of a protein accession via Entrez esearch.

    Retries once after a 20s pause on HTTPError (a second failure
    propagates), then sleeps 10s before returning to throttle requests.
    '''
    for attempt in (1, 2):
        try:
            handle = Entrez.esearch(db="protein", term=accession)
            break
        except HTTPError:
            if attempt == 2:
                raise
            time.sleep(20)
    record = Entrez.read(handle)
    handle.close()
    linked = record['IdList'][0]
    time.sleep(10)
    return linked


if __name__ == '__main__':
    # Resolve every protein accession to a GI/UID; failures record "NA"
    # for that accession instead of aborting the run.
    accession_list = [each.strip() for each in open(args.protein_acc)]
    gi_dict = {}
    for each_id in accession_list:
        each_accession = "NA"
        try:
            each_accession = accession2gi(each_id)
        except Exception as e:
            # print() form works under both py2 and py3 (the original
            # py2-only print statements would break under py3)
            print(e)
            print('can not find gi number of accession: %s' % each_id)
        gi_dict[each_id] = each_accession
    python_tools.write_obj_to_file(gi_dict, args.protein_id)