def maybe_gzip_open(filename, *args, **kwargs):
    """Open *filename*, transparently handling gzip files and stdin.

    '-'              -> sys.stdin
    '*.gz' / '*.GZ'  -> gzip stream wrapped in closing() so it can be
                        used in a `with` statement
    anything else    -> plain builtin open()

    Extra positional/keyword arguments are forwarded to the underlying
    opener.
    """
    # Check '-' first so a literal dash is never treated as a path.
    if filename == '-':
        return sys.stdin
    # lower() keeps this consistent with the other maybe_gzip_open
    # variants in this codebase, which also accept an uppercase .GZ.
    if filename.lower().endswith('.gz'):
        return closing(_gzip_open(filename, *args, **kwargs))
    return open(filename, *args, **kwargs)
def find_harm_cnvs_in_no_harm_patients(out_pred_w_cnv_file, cnv_dict):
    """Print CNVs predicted HARMFUL whose network-neighbour harm count is 0.

    out_pred_w_cnv_file -- gzipped predictions file; per whitespace-split
        line, field 2 is the predicted class, field 6 the CNV name and
        field 18 the neighbourhood harm count (0-based field indexes).
    cnv_dict -- maps a CNV name to a whitespace-separated annotation
        string (chrom at field 0, start/end at fields 3/4).

    Matching lines are printed as 'chrom:start-end<TAB>line<TAB>annotation';
    CNVs without an annotation get 'None:None-None'.
    """
    f = _gzip_open(out_pred_w_cnv_file, 'r')
    try:
        for line in f:
            line = line.rstrip()
            line_split = line.split()
            nw_numharm = int(line_split[18])
            cnv_name = line_split[6]
            predicted = line_split[2]
            # BUG FIX: the default must be a string -- the old default of []
            # crashed on .split() for any CNV missing from cnv_dict.
            cnv_res = cnv_dict.get(cnv_name, '')
            cnv_res_split = cnv_res.split()
            if len(cnv_res_split) > 0:
                chrom = cnv_res_split[0]
                start = cnv_res_split[3]
                end = cnv_res_split[4]
            else:
                chrom = 'None'
                start = 'None'
                end = 'None'
            if nw_numharm == 0 and predicted == '2:HARMFUL':
                print('\t'.join(['%s:%s-%s' % (chrom, start, end), line, cnv_res]))
    finally:
        # Close the handle even if a malformed line raises above.
        f.close()
def gzip_open(*args, **kwargs):
    """Gzip-aware open() whose result supports the `with` statement.

    All arguments are forwarded verbatim to gzip's open; the returned
    handle is wrapped in contextlib.closing so callers can use it as a
    context manager.
    """
    stream = _gzip_open(*args, **kwargs)
    return closing(stream)
def maybe_gzip_open(filename, *args, **kwargs):
    """Open *filename* with gzip when its name ends in '.gz' (any case),
    otherwise with the builtin open(); extra arguments are passed through.
    Gzip handles are wrapped in closing() for `with`-statement use."""
    is_gzipped = filename.lower().endswith(".gz")
    if is_gzipped:
        return closing(_gzip_open(filename, *args, **kwargs))
    return open(filename, *args, **kwargs)
def maybe_gzip_open(filename, *args, **kwargs):
    """Open plain or gzipped files transparently, keyed on a
    case-insensitive '.gz' suffix; remaining arguments go to the
    underlying opener."""
    if not filename.lower().endswith('.gz'):
        return open(filename, *args, **kwargs)
    # Wrap in closing() so the gzip handle works as a context manager.
    return closing(_gzip_open(filename, *args, **kwargs))
def random_out(iteration, info_dict, num_sets):
    # Emit one randomization: write weighted weka/arff gene lines for every
    # train/test set (CNV-balanced and non-balanced variants) to gzipped
    # files, and report per-set HARMFUL/BENIGN tallies.
    #
    # iteration -- randomization index; used in output file names and logging.
    # info_dict -- nested patient/CNV/gene structure; expanded, then drained
    #              by repeated random_info_dict() calls.
    # num_sets  -- number of train/test sets requested on the command line.
    #
    # Relies on module globals: args, fill_out_set, expand_info_dict,
    # random_info_dict, _gzip_open, OrderedDict, sys.
    if args.add_remaining == 'new':
        # extra set that receives the remaining (unassigned) genes
        num_sets_w_new = num_sets + 1
    else:
        num_sets_w_new = num_sets
    out_dict_encoding = fill_out_set(num_sets_w_new, {'cnvbal':{}, 'cnvnbal':{}})
    # per-set / per-balance tallies of unique phenotypes, patients, CNVs, genes
    iteration_counts = OrderedDict()
    # per-set / per-balance gzipped output file handles
    iteration_file = {}
    for set_id in range(num_sets_w_new):
        iteration_counts[set_id] = OrderedDict()
        iteration_file[set_id] = OrderedDict()
        for bal_nbal in ['cnvbal', 'cnvnbal']:
            iteration_counts[set_id][bal_nbal] = OrderedDict()
            iteration_counts[set_id][bal_nbal]['PHENOTYPE'] = OrderedDict()
            iteration_counts[set_id][bal_nbal]['PHENOTYPE']['HARMFUL'] = set()
            iteration_counts[set_id][bal_nbal]['PHENOTYPE']['BENIGN'] = set()
            iteration_counts[set_id][bal_nbal]['PATIENT'] = OrderedDict()
            iteration_counts[set_id][bal_nbal]['PATIENT']['HARMFUL'] = set()
            iteration_counts[set_id][bal_nbal]['PATIENT']['BENIGN'] = set()
            iteration_counts[set_id][bal_nbal]['CNV'] = OrderedDict()
            iteration_counts[set_id][bal_nbal]['CNV']['HARMFUL'] = set()
            iteration_counts[set_id][bal_nbal]['CNV']['BENIGN'] = set()
            iteration_counts[set_id][bal_nbal]['GENE'] = OrderedDict()
            iteration_counts[set_id][bal_nbal]['GENE']['HARMFUL'] = set()
            iteration_counts[set_id][bal_nbal]['GENE']['BENIGN'] = set()
            iteration_file[set_id][bal_nbal] = _gzip_open('%s_%s_%s_%s.arff.gz' % (set_id, args.out_file, iteration, bal_nbal), 'w')
    clone_dict = expand_info_dict(info_dict)
    # clone_dict = copy.deepcopy(info_dict)
    phenotype, out_genes_cnv_bal, out_genes_cnv_nbal = random_info_dict(clone_dict, args.balance_test, num_sets)
    i = 0
    # Drain random_info_dict until no phenotype is left to assign.
    while phenotype:
        # print query_info_dict(clone_dict)
        for bal_nbal, out_genes in zip(['cnvbal', 'cnvnbal'], [out_genes_cnv_bal, out_genes_cnv_nbal]):
            for label, gene_list in out_genes.iteritems():
                weka_gene_i = 0
                for weka_gene in gene_list:
                    # fields 2..9 of the tab-separated weka line
                    (patient, cnv, case_control, dup, hposim, original_gene, hpo_term, infomax) = weka_gene.split('\t')[2:10]
                    iteration_counts[label][bal_nbal]['PHENOTYPE'][case_control].add(hpo_term)
                    iteration_counts[label][bal_nbal]['PATIENT'][case_control].add(patient)
                    iteration_counts[label][bal_nbal]['CNV'][case_control].add(cnv)
                    iteration_counts[label][bal_nbal]['GENE'][case_control].add('%s_%s_%s' % (cnv, original_gene, weka_gene_i))
                    # print hpo_term, patient, cnv, original_gene
                    # count duplicate lines; the count becomes the arff weight below
                    if not weka_gene in out_dict_encoding[label][bal_nbal]:
                        out_dict_encoding[label][bal_nbal][weka_gene] = 0
                    out_dict_encoding[label][bal_nbal][weka_gene] += 1
                    weka_gene_i += 1
        phenotype, out_genes_cnv_bal, out_genes_cnv_nbal = random_info_dict(clone_dict, args.balance_test, num_sets)
        # print phenotype, query_info_dict(clone_dict)
        i += 1
    for bal_nbal in ['cnvbal', 'cnvnbal']:
        for label in range(num_sets_w_new):
            # num_count -> arff weight
            # TODO: maybe should be at randomization level?
            for gene_line, num_count in out_dict_encoding[label][bal_nbal].iteritems():
                if args.weighted_gene_duplication == 'sim':
                    exponator = 2
                else:
                    exponator = 1
                # substitute the duplicate count (squared for 'sim' weighting)
                # into the '{}' weight placeholder of the arff line
                print >> iteration_file[label][bal_nbal], gene_line.replace('{}', '{%s}' % num_count**exponator)
                # TODO: use this to duplicate lines instead of weighing
                # for i in range(num_count):
                #     print >> iteration_file[label][bal_nbal], gene_line.replace('{}', '{%s}' % 1)
            iteration_file[label][bal_nbal].close()
    print 'iteration: %s, %s' % (iteration, i)
    # Log the size of every tally bucket collected above.
    for train_test_key, train_test_val in iteration_counts.iteritems():
        for subset_key, subset_val in train_test_val.iteritems():
            for case_control_key, case_control_val in subset_val.iteritems():
                for asdf_key, asdf_val in case_control_val.iteritems():
                    print train_test_key, subset_key, case_control_key, asdf_key, len(asdf_val)
    sys.stdout.flush()
import re
from gzip import open as _gzip_open

# CLI: strip selected feature columns from an arff file.
parser = argparse.ArgumentParser(description='Remove certain features from an arff file.')
parser.add_argument('--remove', '-R', help="The feature indexes to remove.")
parser.add_argument('--input', '-i', help="Input arff file.", required=True)
parser.add_argument('--output', '-o', help="Output arff file. If none, use stdout.")
parser.add_argument('--debug', '-d', help="Debug", action='store_true')
args = parser.parse_args()

# Default to stdout when no output path was given
# (identity comparison with None instead of ==).
if args.output is None:
    output_file = sys.stdout
else:
    output_file = _gzip_open(args.output, 'w')

# Parse --remove: comma-separated 1-based indexes and 'M-N' ranges,
# collected as 0-based positions in remove_list.
intervals = args.remove.split(',')
remove_list = []
for interval in intervals:
    if '-' in interval:
        (start, end) = interval.split('-')
        start = int(start)
        end = int(end)
        remove_list += range(start-1, end)
    else:
        remove_list.append(int(interval)-1)
# 1-based twin of remove_list.
remove_list_mone = [remove + 1 for remove in remove_list]
parser.add_argument('similarity_rank_cutoff', type=int)
parser.add_argument('balance_test', choices=['bt_none', 'bt_remaining', 'bt_patient', 'bt_ptrem'], \
                    help="none for not balancing, remaining to balance and keep the remaining, patient to balance by patient, ptrem to balance by patient and keep the remainging.")
parser.add_argument('--balance_genes', '-b', action='store_true')
parser.add_argument('--debug', '-d', help="Debug.", action='store_true')
args = parser.parse_args()

# Fall back to the identity weight when the user passes "0".
if args.neighbour_weight_function == "0":
    anwf = "1"
else:
    anwf = args.neighbour_weight_function
# BUG FIX: build the lambda from the sanitized value (anwf), not the raw
# argument -- otherwise the "0" fallback above had no effect. This matches
# the sibling script that does `"lambda x: %s" % anwf`.
fn_str = 'lambda x: %s' % anwf
print(fn_str)
# SECURITY NOTE: eval() of a command-line-supplied expression; only run
# this script with trusted input.
NEIGHBOUR_WEIGHT_FUNCTION = eval(fn_str)

# Find the first arff-style '{...}' column in the weka header line;
# columns before it are the fixed (non-weighted) fields.
f = _gzip_open(args.weka_file, 'r')
line = f.readline().split('\t')
for i in range(len(line)):
    if '{' in line[i]:
        cutoff_index = i
        break
f.close()

prev = None
current = None
# out_gene_file = open(args.out_gene_file, 'w')
log_line_original_gene = []
log_score_original_gene = []
log_line_cnv = []
def gzip_open(*args, **kwargs):
    """Thin alias for gzip's open(); every argument is passed straight
    through and the raw gzip handle is returned unchanged."""
    handle = _gzip_open(*args, **kwargs)
    return handle
"IEA", }, ), ] ) RESERVED_FIELD_NUM = 7 if args.neighbour_weight_function == "0": anwf = "1" else: anwf = args.neighbour_weight_function fn_str = "lambda x: %s" % anwf print fn_str NEIGHBOUR_WEIGHT_FUNCTION = eval(fn_str) output_file = _gzip_open(args.output_file, "w") def timer(s): global start global totalstart totalelapsed = time.clock() - totalstart elapsed = time.clock() - start start = time.clock() print "%s...%s...%s" % (s, elapsed, totalelapsed) sys.stdout.flush() def load_gene_net(): iin = open(args.gene_network_file, "r")
# Tail of parse_ontology(): record the namespace of the final ontology term,
# close the input and hand back both lookup tables.
# NOTE(review): excerpt begins mid-function; indentation reconstructed.
    namespace_dict[current_ont] = current_namespace
    f.close()
    return ont_dict, namespace_dict

# file to create hp <-> hp layer
hp2parents, _ = parse_ontology(args.hp_file)
# file to create go <-> go layer
go2parents, namespace_dict = parse_ontology(args.go_file)

# '_' is the sentinel for "no graph file": just dump the namespaces.
if args.j48_graph_file == '_':
    print json.dumps(namespace_dict, indent=4)

if not args.j48_graph_file == '_':
    f = _gzip_open(args.j48_graph_file, 'r')
    for line in f:
        line = line.strip()
        # skip blank lines and lines not describing a node ('N...')
        if not len(line) > 0:
            continue
        if not line[0] == 'N':
            continue
        if '->' in line:
            # edge line: 'N1 -> N2 [label="..."]'; prefix node ids with
            # uniq_id to keep them unique across graphs
            n1 = uniq_id + "_" + line.split('->')[0]
            n2 = uniq_id + "_" + line.split('->')[1].split()[0]
            label = line.split('"')[1]
            # sanitize comparison operators in the edge label
            label = label.replace('>', 'gt')
            label = label.replace('<', 'lt')
from gzip import open as _gzip_open
from collections import OrderedDict

# Command-line interface.
parser = argparse.ArgumentParser(description="Create a CNV arff file from a gene arff file.")
parser.add_argument('cnvs_w_dgv_overlap', help="DGV annotation file.")
parser.add_argument('out_pt', help="Results pivot table file. Summary of harmful/benign classification of genes per cnv.")
parser.add_argument('out_pred_w_cnv', help="Weka predictions with cnv annotations.")
parser.add_argument('--incorrect', '-i', help="Only output incorrect.", action='store_true')
parser.add_argument('--debug', '-d', help="Debug", action='store_true')
args = parser.parse_args()

# Load the per-CNV DGV annotation table, keyed by CNV name.
cnv_dict = OrderedDict()
# arff
f = _gzip_open(args.cnvs_w_dgv_overlap, 'r')
for record in f:
    cnv, length, num_overlap, metric_overlap, sample, phenotype = record.rstrip().split()
    cnv_dict[cnv] = OrderedDict([
        ['length', length],
        ['num_overlap', num_overlap],
        ['metric_overlap', metric_overlap],
        ['sample', sample],
        ['phenotype', phenotype],
        ['maxgene', 'NoCnv'],  # placeholder until a max gene is assigned
        ['conf', -1],
        ['simscore', -1],
    ])
f.close()

# assign the max gene to look at for each cnv
f = _gzip_open(args.out_pred_w_cnv)
import sys
import argparse
from pandas import *
import itertools
import copy
from gzip import open as _gzip_open

# CLI: compute the DGV overlap feature for each CNV from a bed overlap file.
parser = argparse.ArgumentParser(description='Calculate the DGV overlap feature for each CNV.')
parser.add_argument('cnvs_w_dgv_bed', help="Input CNVs/DGV overlap bed file.")
parser.add_argument('--debug', '-d', help="Debug.", action='store_true')
args = parser.parse_args()

# arff
f = _gzip_open(args.cnvs_w_dgv_bed, 'r')
prev_cnv = None     # CNV name seen on the previous input line
cnv_name = None     # CNV name on the current input line
overlap_log = []    # overlap values accumulated for the current CNV
line = f.readline()
keep_looping = True
# Manual read-ahead loop: one extra pass after EOF (cnv_name = None) so the
# final CNV group can be flushed.
while keep_looping:
    if not line:
        keep_looping = False
        cnv_name = None
    else:
        line = line.strip().split('\t')
        prev_cnv = cnv_name
        # NOTE(review): loop body continues beyond this excerpt in the
        # original file.