def depth_info(self):
    cmd = '{igvtools} count -w {window_size} {infile} {outdir}/depthRaw_{outsuffix}.wig novo37'.format(
        **self.__dict__)
    print 'run cmd:', cmd
    assert not os.system(cmd)

    out_wig = '{outdir}/depthRaw_{outsuffix}.wig'.format(**self.__dict__)
    outfile = '{outdir}/depth_{outsuffix}'.format(**self.__dict__)

    with utils.safe_open(out_wig) as f, utils.safe_open(outfile, 'w') as out:
        for line in f:
            if line.startswith('track'):
                continue
            elif line.startswith('variableStep'):
                chrom = re.findall(r'chrom=(.+?) ', line)[0].strip('chr')
                continue
            linelist = line.strip().split('\t')
            start = int(linelist[0])
            depth = float(linelist[1])
            depth_log10 = math.log10(depth + 1)
            end = start + self.window_size - 1
            if start + self.window_size > CHROM_LENGTH[chrom]:
                end = CHROM_LENGTH[chrom]
            line = 'hs{chrom}\t{start}\t{end}\t{depth_log10}\n'.format(
                **locals())
            out.write(line)

    print 'write file: {}'.format(outfile)
def cnv_info(self):
    outfile = '{outdir}/{vtype}_{outsuffix}'.format(**self.__dict__)
    with utils.safe_open(self.infile) as f, utils.safe_open(outfile, 'w') as out:
        print 'open file: {}'.format(self.infile)
        for line in f:
            linelist = line.strip().split('\t')
            if linelist[0] in ('Chr', '#Chr'):
                headerlist = linelist
                continue
            chrom = linelist[0].strip('Chr').strip('chr')
            start = linelist[headerlist.index('Start')]
            end = linelist[headerlist.index('End')]
            if chrom not in self.normal_chrom:
                continue
            if self.vtype == 'freec':
                copynumber = int(linelist[headerlist.index('CopyNumber')]) - 2
                copynumber = 6 if copynumber > 6 else copynumber
            elif self.vtype == 'cnvnator':
                copynumber = float(linelist[headerlist.index('RD')])
                copynumber = 3 if copynumber > 3 else copynumber
            line = 'hs{chrom} {start} {end} {copynumber}\n'.format(**locals())
            out.write(line)
    print 'write file: {}'.format(outfile)
def create_nginx_redirect_config(env_file, hostname):
    if hostname.startswith("www."):
        redirect_from = hostname[4:]
    else:
        redirect_from = "www.%s" % hostname

    config_dst = Path(os.path.join(VHOSTD_DIR, redirect_from))
    redirect_prompt = (
        "Do you want to set 301 redirect from %s to %s? "
        "(requires domain to be already configured)"
    ) % (redirect_from, hostname)

    if input_bool(redirect_prompt):
        env_file["LETSENCRYPT_HOST"] = env_file["VIRTUAL_HOST"]
        env_file.save()
    else:
        env_file["LETSENCRYPT_HOST"] = hostname
        env_file.save()
        if config_dst.is_file():
            config_dst.unlink()
        return False

    with safe_open(REDIRECT_CONFIG, "r") as f:
        tpl = f.read()
    redirect_config = tpl.replace("domain.com", hostname)
    with safe_open(config_dst, "w") as f:
        f.write(redirect_config)
    return True
def write_ped(self, pedfile, context):
    with utils.safe_open(pedfile, 'w') as ped:
        line = '{familyid}\t{sampleid}\t{pa}\t{ma}\t{sex}\t{phenotype}\n'.format(**context)
        line += '{familyid}\t{sampleid}\t{pa}\t{ma}\t{sex}\t{phenotype}\n'.format(**context['pa_context'])
        line += '{familyid}\t{sampleid}\t{pa}\t{ma}\t{sex}\t{phenotype}\n'.format(**context['ma_context'])
        ped.write(line)
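# A minimal sketch (not from the source) of the `context` dict write_ped expects;
# all IDs are hypothetical, and pa_context/ma_context must carry the same keys
# used by the format string above (standard PED fields: sex 1=male/2=female,
# phenotype 1=unaffected/2=affected).
example_context = {
    'familyid': 'FAM1', 'sampleid': 'child1', 'pa': 'father1', 'ma': 'mother1',
    'sex': '1', 'phenotype': '2',
    'pa_context': {'familyid': 'FAM1', 'sampleid': 'father1', 'pa': '0', 'ma': '0',
                   'sex': '1', 'phenotype': '1'},
    'ma_context': {'familyid': 'FAM1', 'sampleid': 'mother1', 'pa': '0', 'ma': '0',
                   'sex': '2', 'phenotype': '1'},
}
# self.write_ped('FAM1.ped', example_context)  # would emit one PED line per family member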
def main():
    global no_neighbor_count

    with safe_open(outpath, exist_ok='exit') as outfile:
        # write header
        #outfile.write('\t'.join(['head_id', 'gene_id', 'flag', 'distance'])+'\n')
        relations = pd.DataFrame()

        for head_group_name, head_group in tqdm(head_groups, desc='Contigs'):
            if head_group_name not in gene_groups.groups:
                prinf('Não há nenhum gene no cromossomo. As heads abaixo não possuem NG.')
                prinf(head_group)
                no_neighbor_count += head_group.shape[0]
                continue

            gene_group = gene_groups.get_group(head_group_name)
            chunks = np.array_split(head_group, n_cpu)

            with mp.Pool() as pool:
                pool_results = pool.starmap(
                    parse_chunk,
                    ((c, gene_group, cn) for cn, c in enumerate(chunks)))

            for chunk_relations in pool_results:
                relations = relations.append(chunk_relations)
                # print('\nCHUN', chunk_relations, '\nREL', relations)

        relations.columns = ['head_id', 'gene_id', 'flag', 'distance']
        relations.to_csv(outfile, sep='\t', index=False)

    log(f'\nConcluído. Relações salvas em {str(outpath)}.')
    return relations, no_neighbor_count
def render_html(self):
    if self.rep_ty == 'qc':
        self.context['report_type'] = 'QC'
        self.analy_type = '质控'
    elif self.rep_ty == 'mapping':
        self.context['report_type'] = 'Mapping'
        self.context['mapping'] = True
        self.analy_type = '比对'
    elif self.rep_ty == "primary":
        self.context['report_type'] = 'Primary'
        self.context['mapping'] = True
        self.context['primary'] = True
        self.analy_type = '基本分析'
    elif self.rep_ty == "advance":
        self.context['report_type'] = 'Advance'
        self.context['mapping'] = True
        self.context['primary'] = True
        self.context['advance'] = True
        self.analy_type = '高级分析'
    else:
        sys.exit("please select a valid report type ('qc', 'mapping', 'primary', 'advance')")

    self.context['analy_type'] = self.analy_type
    check_html = self.env.get_template('TestDemo.html').render(self.context)
    outfile = os.path.join(self.checkDir, self.rep_ty + '_check.html')
    with utils.safe_open(outfile, 'w') as out:
        out.write(check_html)
    return outfile
def get_indel_info(self):
    '''Process the indel VCF file.

    When processing the indel VCF, pos, ref and alt need to be adjusted so that
    they correspond to the annotated pos, ref and alt.
    {
        'chr_pos_ref/alt': [(case), (control)]
    }
    '''
    with utils.safe_open(self.vcf, 'r') as fr:
        for line in fr:
            if line.startswith('##'):
                continue
            elif line.startswith('#'):
                head = line.strip('\n')
                head_index = utils.get_head_index(head)
                continue
            linelist = line.strip('\n').split('\t')
            _chr = linelist[head_index['#chrom']]
            pos = linelist[head_index['pos']]
            ref = linelist[head_index['ref']]
            alt = linelist[head_index['alt']]
            pos, ref, alt = utils.modify_pos_ref_alt(pos, ref, alt)
            if 'cancer' in head_index:
                case = linelist[head_index['cancer']]
            if 'normal' in head_index:
                control = linelist[head_index['normal']]
def main():
    infile = args['infile']
    filetype = args['type']
    add_header = args['add_header']
    outdir = args['outdir']
    # print args;exit()

    if len(infile) == 1:
        infile_list = re.split(r'\s+|;|:|,', infile[0])
    else:
        infile_list = infile

    for infile in infile_list:
        outfile = infile.replace('.xls', '.brief.xls')
        if outdir:
            outfile = os.path.join(outdir, os.path.basename(outfile))

        with safe_open(infile) as f, safe_open(outfile, 'w') as out:
            if add_header:
                header_file = os.path.join(BASE_DIR, 'header/{}.header'.format(filetype))
                print 'add header:', header_file
                for line in get_added_header(header_file):
                    # print line
                    out.write(line)

            for line in f:
                linelist = line.rstrip('\n').split('\t')
                # if all(h in linelist for h in ['CHROM', 'POS']):
                if linelist[0] in ('Priority', 'Chr', 'CHROM'):
                    indices = list(get_indices(linelist, header_map[filetype]))
                    new_header = get_new_line(linelist, indices)
                    # print new_header
                    out.write(new_header)
                    continue
                new_line = get_new_line(linelist, indices)
                # print new_line
                out.write(new_line)

        print 'write brief file:', outfile
def align():
    out_file = safe_open(out_path)  # Check out_path.

    if out_file is not None:
        print('Alinhando heads contra heads...', end=' ')
        # Remember you are using megablast.
        run(f"blastn -task 'megablast' -query '{heads_path}' -subject '{heads_path}'"
            f" -outfmt '6 {COLUMNS}' -out '{out_path}' -evalue 1e-10"
            f" -num_threads {n_cpu}", shell=True)
        print(f'Alinhamentos salvos em {out_path}.\n')
def get_samples(self):
    with utils.safe_open(self.__dict__['infile']) as f:
        for line in f:
            if line.startswith('#CHROM'):
                linelist = line.strip().split('\t')
                samplelist = linelist[linelist.index('FORMAT') + 1:]
                break
    return ','.join(samplelist)
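# Illustrative only (sample names are made up): for a VCF whose header line is
#   #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO  FORMAT  sampleA  sampleB
# get_samples() returns 'sampleA,sampleB'.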
def save(self):
    lines = []
    lines.append("# %s" % self.header)
    lines.append("# %s\n" % datetime.now())
    for key in sorted(self.variables.keys()):
        value = self.variables[key]
        lines.append("%s=%s" % (key, value))
    with safe_open(self.path, "w") as f:
        f.write("\n".join(lines))
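# Sketch of the file save() produces, assuming a hypothetical instance with
# header='web.env' and variables={'VIRTUAL_HOST': 'example.com'} (timestamp made up):
#
#   # web.env
#   # 2024-01-01 12:00:00.000000
#
#   VIRTUAL_HOST=example.com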
def write_ped_ws(self, pedfile, wsfile, pedlist):
    samples_with_data = []
    with utils.safe_open(pedfile, 'w') as pf, utils.safe_open(wsfile, 'w') as wf:
        ws_count = []
        n = 0
        for ped in pedlist:
            sampleid = ped['sampleid']
            if self.sample_infos_all[sampleid]['data'] != '0':
                samples_with_data.append(sampleid)
                n += 1
                ws_count.append(n)
            else:
                ws_count.append(0)
            ped_text = '{familyid}\t{sampleid}\t{pa}\t{ma}\t{sex}\t{phenotype}\n'.format(
                **ped)
            pf.write(ped_text)
        # print ws_count
        ws_text = ' '.join(map(str, ws_count)) + '\n'
        wf.write(ws_text)
    return samples_with_data
def samtools_call_hapmap(self, familyid, samples_with_data):
    vcf_list = '{analydir}/Advance/{newjob}/Linkage/{familyid}/vcf_{familyid}.list'.format(
        **dict(self.__dict__, **locals()))
    with utils.safe_open(vcf_list, 'w') as out:
        for sampleid in samples_with_data:
            out.write('{}.vcf\n'.format(sampleid))

    for sampleid in samples_with_data:
        print '> samtools call hapmap for', sampleid
        cmd = '''
    set -eo pipefail
    echo samtools call hapmap for {sampleid} start: `date "+%F %T"`

    cd {analydir}/Advance/{newjob}/Linkage/{familyid}

    samtoolsv0.1.19 mpileup \\
        -d 10000 -C 50 -D -S -m 2 -F 0.02 -q 13 -Q 13 \\
        -gf {reffasta} \\
        -l {moduledir}/Linkage/annotHapMap2L.txt \\
        {analydir}/Mapping/{sampleid}.{sampleid}/{sampleid}.final.bam |
    bcftools_lh view \\
        -cg -t 0.5 \\
        - > {sampleid}.vcf

    echo samtools call hapmap for {sampleid} done: `date "+%F %T"`
'''.format(**dict(self.__dict__, **locals()))

        shell_path = '{analydir}/Advance/{newjob}/Linkage/{familyid}/samtools_call_hapmap_{sampleid}.sh'.format(
            **dict(self.__dict__, **locals()))

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'samtools_call_hapmap'
        job_name = 'samtools_call_hapmap_{sampleid}'.format(**locals())
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = ['final_bam_{sampleid}'.format(**locals())]
        after_jobs = ['linkdatagen_{familyid}'.format(**locals())]
        utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def text2excel(outfile, *infiles):
    '''
    infiles can be a string: 'a.xls,b.xls'      ('a.xls,b.xls', )
    or a list: ['a.xls', 'b.xls']               (['a.xls', 'b.xls'], )
    or many positional args: 'a.xls', 'b.xls'   ('a.xls', 'b.xls')
    '''
    # wb = openpyxl.Workbook(encoding='utf8')
    wb = openpyxl.Workbook()

    if len(infiles) == 1:
        if isinstance(infiles[0], str):
            infile_list = infiles[0].split(',')
        elif isinstance(infiles[0], list):
            infile_list = infiles[0]
    elif len(infiles) >= 2:
        infile_list = list(infiles)
    else:
        exit('error infiles: {}'.format(infiles))

    for infile in infile_list:
        sheetname = get_sheetname(infile)
        print 'create sheet:', sheetname
        sheet = wb.create_sheet(title=sheetname)
        # with codecs.open(infile, mode='r', encoding='gbk', errors='ignore') as f:
        with safe_open(infile) as f:
            for n, line in enumerate(f):
                row = n + 1
                linelist = line.strip().split('\t')
                for m, value in enumerate(linelist):
                    column = m + 1
                    sheet.cell(row=row, column=column, value=value)

    # remove default sheet
    try:
        wb.remove(wb['Sheet'])
    except AttributeError:
        wb.remove_sheet(wb.get_sheet_by_name('Sheet'))  # for the old version

    outdir = os.path.dirname(outfile)
    if outdir:
        mkdir_if_not_exists(outdir)

    wb.save(filename=outfile)
    print 'write excel file:', outfile
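# Usage sketch for text2excel (file names are hypothetical); the three call
# forms below mirror the docstring:
#   text2excel('out.xlsx', 'a.xls,b.xls')         # comma-joined string
#   text2excel('out.xlsx', ['a.xls', 'b.xls'])    # list
#   text2excel('out.xlsx', 'a.xls', 'b.xls')      # separate positional args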
def main():
    for query in QUERIES:
        query_path = pardir / f'seqs/{query}.fa'
        out_path = pardir / f'alinhamentos/{query}_vs_genome.bl'
        out_file = safe_open(out_path)

        if out_file is None:
            continue

        print(f'Procurando alinhamentos de {query} contra genoma...')
        run((f"blastn -task blastn -query {str(query_path)} -db {str(genomedb_path)} "
             f"-outfmt '6 {' '.join(BL_COLUMNS)}' -out {str(out_path)} "
             f"-evalue 1e-10 -num_threads {n_cpu}"), shell=True)
        print(f'Alinhamentos salvos em {str(out_path)}.\n')
def main():
    with u.safe_open(outpath, exist_ok=False) as outfile:
        raw_annotations = read_csv(raw_annotations_path, sep='\t', comment='#',
                                   header=None, names=GFF3_COLUMNS)
        print('Leitura encerrada. Removendo anotações não-gênicas...')

        genes_gff = raw_annotations.loc[raw_annotations['type'] == 'gene']
        genes_gff.loc[:, ['start', 'end']] = genes_gff[['start', 'end']].astype(int)
        lengths = genes_gff.end - genes_gff.start

        genes_gff.loc[:, 'attributes'] = genes_gff.attributes.str.replace(
            'ID=gene:', 'gene_id=')
        genes_gff['attributes'] = genes_gff.attributes.str.extract(
            r'(gene_id.*Name[^;]+)')
        # with .loc it does not work (?!):
        # genes_gff.loc[:, 'attributes'] = genes_gff.attributes.str.extract(r'(gene_id.*Name[^;]+)')
        genes_gff.loc[:, 'attributes'] += ';length=' + lengths.astype(str)

        # ###### REMOVE GENES WITH COINCIDING START OR END
        genes_gff = genes_gff.loc[lengths.sort_values().index]  # sort by length
        # Keep the largest gene among those that coincide.
        genes_gff = genes_gff.drop_duplicates(['seqid', 'start'], keep='last')
        genes_gff = genes_gff.drop_duplicates(['seqid', 'end'], keep='last')

        if genes_gff.duplicated(['seqid', 'start']).sum() or genes_gff.duplicated(['seqid', 'end']).sum():
            print('ERRO: HÁ GENES COM INÍCIO/TÉRMINO DUPLICADOS:')
            print(genes_gff[genes_gff.duplicated(['seqid', 'start'], keep=False)])
            print(genes_gff[genes_gff.duplicated(['seqid', 'end'], keep=False)])
            raise ValueError

        genes_gff = genes_gff.sort_values(['seqid', 'start'])
        genes_gff.to_csv(outfile, sep='\t', index=False, header=None)
        print(f"Anotações gênicas mantidas em '{str(outpath)}'.")
def get_tran_relation(self):
    '''
    input:
        self.transcript: gene transcript
    output:
        list: [gene=trans, gene=trans]
    '''
    tran_relation = []
    with utils.safe_open(self.transript_database, 'r') as fr:
        for line in fr:
            if line.startswith('#'):
                head_index = utils.get_head_index(line)
                continue
            linelist = line.strip('\n').split('\t')
            gene = linelist[head_index['#gene']]
            tran = linelist[head_index['transcript']]
            tran_relation.append('{gene}={tran}'.format(**locals()))
    return tran_relation
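# Illustrative layout of the transcript database read above (tab-separated; the
# '#gene' and 'transcript' column names come from the head_index lookups, the
# values are made up):
#   #gene   transcript
#   TP53    NM_000546
# get_tran_relation() would then return ['TP53=NM_000546', ...].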
def main():
    for kind, pattern in (('head', r'head\d+'), ('gene', r'Smp_\d+')):
        out_path = pardir/f'genome_annotation/{kind}_complement_annotations.gff3'
        outfile = safe_open(out_path, exist_ok=False)

        gff = pd.read_table(pardir/f'genome_annotation/{kind}_annotations.gff3',
                            names=GFF3_COLUMNS)

        print(gff.strand.head())
        gff.loc[gff.strand == '+', 'strand'] = 'plus'
        gff.loc[gff.strand == '-', 'strand'] = '+'
        gff.loc[gff.strand == 'plus', 'strand'] = '-'
        print(gff.strand.head())

        gff['attributes'] = gff.attributes.str.replace(
            pattern, lambda match: match.group(0) + '_complement', regex=True)

        gff.to_csv(outfile, sep='\t', header=False, index=False)
        print(f"Wrote to '{str(out_path)}'.")
        outfile.close()
def make_readme(self):
    self.django_configure()

    title = open(self.args['pn']).read().strip()
    encoding = chardet.detect(title)['encoding']
    if encoding != 'utf8':
        title = title.decode(encoding)
    self.context['title'] = title
    # self.context['software'] = self.softwares

    src = os.path.join(RESULT_DIR, 'src')
    dest = '{Readme}'.format(**self.__dict__)
    self.link_data(src, dest)

    max_code = max(map(float, self.analy_list))
    if max_code < 2:
        report_type = 'qc'
    elif 2 <= max_code < 3:
        report_type = 'mapping'
    elif 3 <= max_code < 6.2:
        report_type = 'primary'
    elif max_code >= 6.2:
        report_type = 'advance'
    # print report_type
    self.context['report_type'] = report_type

    print json.dumps(self.context, ensure_ascii=False, indent=2)

    # print os.path.join(RESULT_DIR, 'templates')
    # template = loader.get_template('test.html')
    # template = loader.get_template('readme_template_chs.html')
    template = loader.get_template('index.html')
    if self.django_old:
        html = template.render(Context(self.context))
    else:
        html = template.render(self.context)
    # print html

    dest_html = os.path.join(dest, 'index.html')
    with utils.safe_open(dest_html, 'w') as out:
        out.write(html)
def conifer_call(self, sampleIDs):
    if 'V5' in self.args['TR']:
        probe = 'V5'
    elif 'V6' in self.args['TR']:
        probe = 'V6'
    else:
        print '[Error] Only agilent V5 or V6 can do CoNIFER analysis. '
        exit(1)

    # prepare data for conifer
    outfile = '{analydir}/SV/CoNIFER_{newjob}/sample_for_cnv_call'.format(**self.args)
    with utils.safe_open(outfile, 'w') as out:
        for sampleID in sampleIDs:
            bam = '{analydir}/Mapping/{sampleID}.{sampleID}/{sampleID}.final.bam'.format(
                sampleID=sampleID, analydir=self.analydir)
            out.write('{}\t{}\n'.format(sampleID, bam))

    REF = 'hg19' if self.__dict__['ref'] == 'b37' else self.__dict__['ref']

    cmd = '''
    set -eo pipefail
    echo cnv call with conifer start: `date "+%F %T"`\n
    cd {analydir}/SV/CoNIFER_{newjob}

    python {moduledir}/Varition/CNV/CoNIFER/conifer_v0.2.2/conifer.pipe4.7.py \\
        --svd 10 \\
        --probe {probe} \\
        --ref {ref} \\
        --in sample_for_cnv_call \\
        --suffix {newjob} \\
        --out {analydir}/SV

    while read s b;do
        python {moduledir}/Varition/CNV/CoNIFER/conifer_v0.2.2/cnv_chrom_plot.py \\
            {analydir}/SV/$s/conifer/$s.conifer.{REF}_multianno.xls \\
            {ref} \\
            {samp_info}
    done < sample_for_cnv_call

    rm -f *.hdf5

    echo cnv call with conifer done: `date "+%F %T"`
'''.format(**dict(self.__dict__, **locals()))

    shell_path = '{analydir}/SV/CoNIFER_{newjob}/conifer_call.sh'.format(**self.args)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'conifer_call'
    job_name = 'conifer_call'
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = [
        'final_bam_{sampleID}'.format(sampleID=sampleID)
        for sampleID in sampleIDs
    ]
    after_jobs = ['primary_report']
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def main():
    filtered_outfile = safe_open(filtered_outpath, exist_ok=False)
    discarded_outfile = safe_open(discarded_outpath, exist_ok=False)
    n_cpu = mp.cpu_count()

    #================== READ AND FILTER ALIGNMENTS ==================#
    print('Lendo resultados do Blast...', end=' ')
    perere3_vs_genoma = pd.read_table(perere3_inpath, header=None, names=BL_COLUMNS)
    sr3_vs_genoma = pd.read_table(sr3_inpath, header=None, names=BL_COLUMNS)
    print('Resultados lidos.')

    # Sort positions
    # for data in (perere3_vs_genoma, sr3_vs_genoma):
    #     data.sort_values('sstart', inplace=True)
    #     data.reset_index(drop=True, inplace=True)

    print('Buscando alinhamentos em que o SR3 é melhor...')
    discarded = pd.DataFrame()
    filtered_perere3_vs_genoma = perere3_vs_genoma.copy()
    p_groups = perere3_vs_genoma.groupby('saccver')
    s_groups = sr3_vs_genoma.groupby('saccver')  ## does grouping change the index???????????

    print('Iterando para cada scaffold no genoma e para cada perere3 no scaffold.')
    for p_group_name in tqdm(p_groups.groups, desc='Scaffolds'):
        s_group = s_groups.get_group(p_group_name)
        p_group = p_groups.get_group(p_group_name)

        prinf('Combinando DataFrames...', end='\r')
        product = cartesian_product(
            p_group[['sstart', 'send', 'bitscore']].reset_index(),
            s_group[['sstart', 'send', 'bitscore']].reset_index())

        # discard when perere3 aligns better
        prinf('Filtrando por bitscore do SR3...', end='\r')
        product = product.loc[product.bitscore_x < product.bitscore_y]

        if product.empty:
            continue

        prinf('Subdividindo produto... ', end='\r')
        product_chunks = np.array_split(product, n_cpu)

        prinf('Procurando sobreposições...', end='\r')
        with mp.Pool() as pool:
            chunks_discarded = pool.starmap(parse_product, enumerate(product_chunks))

        group_discarded = pd.concat(chunks_discarded)
        discarded = discarded.append(group_discarded)
        # print(discarded[~discarded['index'].isin(filtered_perere3_vs_genoma.index)])
        # print(filtered_perere3_vs_genoma.loc[discarded['index'].unique()])

    print(f"Escrevendo posições das linhas removidas de '{str(perere3_inpath)}' em "
          f"'{str(discarded_outpath)}'...", end=' ')
    discarded.columns = pd.MultiIndex.from_product([('perere3', 'sr3'),
                                                    ('index', 'sstart', 'ssend', 'bitscore')])
    discarded.to_csv(discarded_outfile, sep='\t', index=False)
    print('Arquivo escrito.')

    print('Filtrando...', end=' ')
    filtered_perere3_vs_genoma.drop(discarded[('perere3', 'index')], inplace=True)
    print(f'\nFiltragem concluída. {len(discarded)} alinhamentos removidos.')

    print(f"Escrevendo alinhamentos filtrados do perere3 em '{str(filtered_outpath)}'...", end=' ')
    filtered_perere3_vs_genoma.to_csv(filtered_outfile, sep='\t', index=False)
    print('Arquivo escrito.')

    return filtered_perere3_vs_genoma, discarded
import pandas as pd
from matplotlib import pyplot as plt
from sys import argv

if '--sem-sentido' in argv:
    nosense_flag = '_unconsidering_sense'
else:
    print('Estamos considerando sentido por default (--sem-sentido para não considerar).')
    nosense_flag = ''

outpath = pardir / f'genome_annotation/head_genes_correlations{nosense_flag}.tsv'
out_aggregated_counts = pardir / f'counted_reads/aggregated{nosense_flag}.tsv'
outfile = safe_open(outpath)

print('Buscando comprimentos de genes e heads...')
gene_attibutes = read_tsv(pardir / 'genome_annotation/gene_annotations.gff3',
                          names=GFF3_COLUMNS, usecols=['attributes'])['attributes']
head_attibutes = read_tsv(pardir / 'genome_annotation/head_annotations.gff3',
                          names=GFF3_COLUMNS, usecols=['attributes'])['attributes']

gene_lengths = parse_gff_attributes(gene_attibutes, gene_id='Name')['length']
head_lengths = parse_gff_attributes(head_attibutes)['length']
lengths = pd.concat([head_lengths, gene_lengths]).astype(int)

print('Concluído. Lendo arquivo de relações...')
relations = read_tsv(pardir / f'genome_annotation/head_genes_relations{nosense_flag}.tsv')
GFF_COLS_SUBSET = ['seqid', 'start', 'end', 'strand', 'attributes']
# seqid is the chromosome (contig) name

if '--com-sentido' in argv:
    COLS_TO_GROUP = ['seqid', 'strand']
    nosense_flag = ''
else:
    print('Não estamos considerando sentido por default. '
          'Use --com-sentido para considerar, ou seja, relacionar '
          'apenas quando cópia e gene estiverem na mesma fita.')
    COLS_TO_GROUP = 'seqid'
    nosense_flag = '_unconsidering_sense'

outpath = pardir / f'genome_annotation/head_genes_relations{nosense_flag}_multiprocessed.tsv'
outfile = safe_open(outpath)
n_cpu = mp.cpu_count()


def parse_head_row(head_row, gene_group, outfile):
    parse_head_row.last_args = head_row, gene_group, outfile

    for _, gene_row in gene_group.iterrows():
        if overlaps((gene_row.start, gene_row.end),
                    (head_row.start, head_row.end)):
            flag = 'olap'
            chosen_gene_id = gene_row.id
            distance = 0
            break

    # if none overlaps
output_root = '../hyperparameter_tuning/{}/{}/{}/perturbs_{}_sigma{}_temp{}_dweight{}_lr{}'.format(
    distance_function, data_name, model_type, opt, sigma_val, temperature_val,
    distance_weight_val, lr)

num_iter = 10
sigma = np.full(n_examples, sigma_val)
temperature = np.full(n_examples, temperature_val)
distance_weight = np.full(n_examples, distance_weight_val)
to_optimize = [perturbed]
indicator = np.ones(n_examples)
best_perturb = np.zeros(perturbed.shape)
best_distance = np.full(n_examples, 1000.)  # all distances should be below 1000
perturb_iteration_found = np.full(n_examples, 1000 * num_iter, dtype=np.int64)
average_distance = np.zeros(num_iter)

with utils.safe_open(output_root + '.txt', 'w') as fout:
    fout.write('{} {} {} --sigma={} --temp={} --distance_weight={} --lr={}\n'.format(
        model_name, opt, distance_function, sigma_val, temperature_val,
        distance_weight_val, lr))

    for i in range(num_iter):
        with tf.GradientTape(persistent=True) as t:
            p_model = utils.filter_hinge_loss(n_class, indicator, perturbed,
                                              sigma, temperature, prob_from_input)
            approx_prob = tf.gather_nd(p_model, example_class_index)

            if distance_function == 'euclidean':
                distance = utils.safe_euclidean(perturbed - feat_input, axis=1)
            elif distance_function == 'cosine':
                distance = utils.safe_cosine(perturbed, feat_input)
def sv_info(self):
    sv_context = defaultdict(list)
    svid_list = []

    with utils.safe_open(self.infile) as f:
        print 'open file: {}'.format(self.infile)
        for line in f:
            linelist = line.strip().split('\t')
            if linelist[0] in ('Chr', ):
                headerlist = linelist
                continue
            chrom = linelist[headerlist.index('Chr')].strip('Chr').strip('chr')
            start = linelist[headerlist.index('Start')]
            end = linelist[headerlist.index('End')]
            func = linelist[headerlist.index('Func')]
            tchr = linelist[headerlist.index('TCHR')].strip('Chr').strip('chr')
            tstart = linelist[headerlist.index('TSTART')]
            svid = linelist[headerlist.index('SVID')]
            svtype = linelist[headerlist.index('SVType')]

            # For the same SVID, only keep the first record
            if svid in svid_list:
                continue

            # Only keep autosomes plus X and Y
            if any(each not in self.normal_chrom + ['na'] for each in [chrom, tchr]):
                # sys.stderr.write('skip a line of unnormal chrom: ' + line)
                continue

            # Only keep variants in exonic or splicing regions
            if not (func.startswith('exonic') or func.startswith('splicing')):
                # sys.stderr.write('skip a line of not exonic or splicing: ' + line)
                continue

            # Skip breakpoint lines
            if svtype == 'breakpoint':
                continue

            # for lumpy
            if svtype == 'DUP':
                svtype = 'INS'

            # for breakdancer
            if svtype not in ('CTX', 'ITX', 'DEL', 'INS', 'INV'):
                svtype = linelist[headerlist.index('TX')][:3].upper()

            end1 = int(start) + 1
            if svtype in ('CTX', 'ITX'):
                chrom2 = tchr
                start2 = tstart
                end2 = int(start2) + 1
            elif svtype in ('DEL', 'INS', 'INV'):
                chrom2 = chrom
                start2 = end
                end2 = int(start2) + 1

            info = 'hs{chrom} {start} {end1} hs{chrom2} {start2} {end2}'.format(**locals())
            svid_list.append(svid)
            sv_context[svtype].append(info)

    for svtype in ('CTX', 'ITX', 'DEL', 'INS', 'INV'):
        outfile = '{outdir}/{vtype}_{svtype}_{outsuffix}'.format(
            **dict(self.__dict__, **locals()))
        with utils.safe_open(outfile, 'w') as out:
            for info in sv_context[svtype]:
                out.write(info + '\n')
        print 'write file: {}'.format(outfile)
def mutation_info(self):
    chrom_region = self._get_chrom_region()
    # print chrom_region['1'].items()[0]

    with utils.safe_open(self.infile) as f:
        print 'open file: {}'.format(self.infile)
        for line in f:
            linelist = line.strip().split('\t')
            if linelist[0] == '#CHROM':
                headerlist = linelist
            if line.startswith('#'):
                continue
            chrom = linelist[headerlist.index('#CHROM')]
            pos = int(linelist[headerlist.index('POS')])
            now_region = self._get_now_region(chrom, pos, chrom_region)
            # print chrom, now_region
            if chrom not in self.normal_chrom:
                continue
            if self.vtype == 'snp':
                genotype = self._get_genotype(headerlist, linelist)
                if genotype == 'hom':
                    chrom_region[chrom][now_region][0] += 1
                else:
                    chrom_region[chrom][now_region][1] += 1
            elif self.vtype == 'indel':
                chrom_region[chrom][now_region][0] += 1

    density_outfile = '{outdir}/{vtype}_density_{outsuffix}'.format(**self.__dict__)
    if self.vtype == 'snp':
        snp_hom_het_ratio_outfile = '{outdir}/{vtype}_ratio_{outsuffix}'.format(**self.__dict__)
        snp_hom_het_ratio_out = utils.safe_open(snp_hom_het_ratio_outfile, 'w')

    chrom_order = map(str, range(1, 23)) + ['X', 'Y']
    with utils.safe_open(density_outfile, 'w') as density_out:
        for chrom, regions in sorted(chrom_region.iteritems(),
                                     key=lambda (k, v): chrom_order.index(k)):
            for start, end in sorted(regions):
                site_number = sum(regions[start, end])
                density = float(site_number) / self.region_length
                line = 'hs{chrom}\t{start}\t{end}\t{density}\n'.format(**locals())
                density_out.write(line)
                if self.vtype == 'snp':
                    hom_ratio = het_ratio = 0
                    if site_number:
                        hom_ratio = regions[start, end][0] / float(site_number)
                        het_ratio = regions[start, end][1] / float(site_number)
                    line = 'hs{chrom}\t{start}\t{end}\t{hom_ratio},{het_ratio}\n'.format(**locals())
                    snp_hom_het_ratio_out.write(line)

    print 'write file: {}'.format(density_outfile)
    if self.vtype == 'snp':
        snp_hom_het_ratio_out.close()
        print 'write file: {}'.format(snp_hom_het_ratio_outfile)
def get_added_header(header_file):
    with safe_open(header_file) as h:
        for line in h:
            yield line
def process_iobj(self, iobj):
    """
    Processing
    :param iobj:
    :return:
    """
    input_name = self.iobj_name(iobj)
    logger.info('Processing: %s' % input_name)

    finish_file = self.get_finish_file(input_name)
    if os.path.exists(finish_file):
        logger.info('Finish indicator file exists, skipping: %s' % finish_file)
        return

    self.cur_decompressor = None
    self.cur_state_file = self.get_state_file(input_name)
    file_leafs = self.get_classification_leafs(input_name)
    file_roots = self.get_classification_roots(input_name)
    self.last_record_resumed = None

    self.processor = newline_reader.NewlineReader(is_json=False)
    handle = iobj
    name = str(iobj)

    if name.endswith('lz4'):
        self.cur_decompressor = lz4framed.Decompressor(handle)
        handle = self.cur_decompressor

    if not self.is_dry() and (not self.args.continue1
                              or not os.path.exists(file_leafs)
                              or not os.path.exists(file_roots)):
        utils.safely_remove(file_leafs)
        utils.safely_remove(file_roots)
        self.file_leafs_fh = utils.safe_open(file_leafs, mode='w', chmod=0o644)
        self.file_roots_fh = utils.safe_open(file_roots, mode='w', chmod=0o644)

    elif self.args.continue1:
        logger.info('Continuing with the started files')
        self.file_leafs_fh = open(file_leafs, mode='r+' if not self.is_dry() else 'r')
        self.file_roots_fh = open(file_roots, mode='r+' if not self.is_dry() else 'r')
        self.restore_checkpoint(iobj)
        self.continue_leafs(file_leafs)

    with iobj:
        resume_token_found = False
        resume_token = None
        resume_idx = 0
        record_ctr = -1
        already_processed = 0
        read_start = self.read_data

        for idx, record in self.processor.process(handle):
            try:
                record_ctr += 1
                self.read_data += len(record)

                # Check the checkpoint distance + boundary - process all newline chunks available
                if self.read_data - self.last_report >= 1024 * 1024 * 1024 \
                        and self.processor.step_cur_last_element:
                    logger.info('...progress: %s GB, idx: %s, pos: %s GB, '
                                'found: %s, mem: %04.8f MB, readpos: %s (%4.6f GB)'
                                % (self.read_data / 1024.0 / 1024.0 / 1024.0, idx, self.read_data,
                                   self.num_found, utils.get_mem_usage() / 1024.0,
                                   iobj.tell(), iobj.tell() / 1024.0 / 1024.0 / 1024.0))
                    self.last_report = self.read_data
                    self.try_store_checkpoint(iobj=iobj, idx=idx, resume_idx=resume_idx,
                                              resume_token=resume_token)

                    # Flush already seen IP database, not needed anymore
                    # we are too far from the resumed checkpoint
                    if read_start + 1024 * 1024 * 1024 * 2 > self.read_data:
                        self.state_loaded_ips = set()

                js = json.loads(record)
                self.process_record(idx, js)

            except Exception as e:
                logger.error('Exception in processing %d: %s' % (self.ctr, e))
                logger.debug(traceback.format_exc())
                logger.debug(record)

            self.ctr += 1

        logger.info('Total: %d' % self.ctr)
        logger.info('Total_chain: %d' % self.chain_ctr)
        logger.info('Not tls: %d' % self.not_tls)
        logger.info('Not cert ok: %d' % self.not_cert_ok)
        logger.info('Not chain ok: %d' % self.not_chain_ok)
        logger.info('Not parsed: %d' % self.not_parsed)
        logger.info('Not rsa: %d' % self.not_rsa)

    logger.info('Processed: %s' % iobj)

    if not self.is_dry():
        self.file_leafs_fh.close()
        self.file_roots_fh.close()
        utils.try_touch(finish_file)
def handle_client(conn, addr):
    utils.write_output_formatted(MODE, f"[NEW CONNECTION] {addr} connected.", SERVER_OUTPUT_DIR_LOG)

    connected = True
    while connected:
        pre_msg_header = conn.recv(HEADER)
        msg_header = pre_msg_header.decode(FORMAT)
        conn.send(CONFIRMATION_MSG.encode(FORMAT))

        if msg_header.strip():
            header_elems = msg_header.split('-')
            msg_cat = header_elems[0].strip()
            msg_type = header_elems[1].strip()

            if msg_cat == "SEND":
                msg_size = int(header_elems[2].strip())
                if msg_type == "TEXT":
                    msg = conn.recv(msg_size).decode(FORMAT)
                    conn.send(CONFIRMATION_MSG.encode(FORMAT))
                    utils.write_output_formatted(MODE, "Received text message: {}".format(msg), SERVER_OUTPUT_DIR_LOG)
                    if msg == DISCONNECT_MSG:
                        connected = False
                elif msg_type == "FILE":
                    data = utils.receive_chunks(conn, msg_size)
                    conn.send(CONFIRMATION_MSG.encode(FORMAT))
                    filename = msg_header.split('-')[3].strip()
                    with utils.safe_open(f"./{filename}", 'wb') as f:
                        f.write(data)
                    utils.write_output_formatted(MODE, "Received file {}".format(filename), SERVER_OUTPUT_DIR_LOG)

            elif msg_cat == "REQUEST":
                if msg_type == "LOG":
                    logpath = utils.get_latest_log()
                    if logpath:
                        msg = "True"
                        msg_send = b' ' * (HEADER - len(msg)) + msg.encode(FORMAT)
                        conn.send(msg_send)
                        with open(logpath, 'rb') as f:
                            logdata = f.read()
                        logdata_size = str(len(logdata)).encode(FORMAT)
                        logdata_size += b' ' * (HEADER - len(logdata_size))
                        conn.send(logdata_size)
                        utils.send_chunks(conn, logdata)
                        utils.write_output_formatted(MODE, "Sent log file {}".format(logpath), SERVER_OUTPUT_DIR_LOG)
                    else:
                        msg = "False"
                        msg_send = b' ' * (HEADER - len(msg)) + msg.encode(FORMAT)
                        conn.send(msg_send)
                elif msg_type == "PLOT":
                    ticker = header_elems[2].strip()
                    plot = utils.get_plot(ticker)
                    if plot:
                        msg = "True"
                        msg_send = b' ' * (HEADER - len(msg)) + msg.encode(FORMAT)
                        conn.send(msg_send)
                        with open(plot, 'rb') as f:
                            plotdata = f.read()
                        plotdata_size = str(len(plotdata)).encode(FORMAT)
                        plotdata_size += b' ' * (HEADER - len(plotdata_size))
                        conn.send(plotdata_size)
                        utils.send_chunks(conn, plotdata)
                        utils.write_output_formatted(MODE, "Sent plot {}".format(utils.get_plot(ticker)), SERVER_OUTPUT_DIR_LOG)
                    else:
                        msg = "False"
                        msg_send = b' ' * (HEADER - len(msg)) + msg.encode(FORMAT)
                        conn.send(msg_send)

    utils.write_output_formatted(MODE, f"Closing connection with {addr}.", SERVER_OUTPUT_DIR_LOG)
    conn.close()
def read(self):
    with safe_open(self.path, "r") as f:
        return parse_env_file(f.read())