def read_tfgenes(tgfile):
    """Read TF-gene pairs and group the TFs by gene.

    Each row of the file is read without a header line (``cnames=False``);
    the first column is taken as the TF and the second as the gene.

    Args:
        tgfile: Path of the TF-gene pair file handed to ``TsvReader``.

    Returns:
        dict: Maps each gene (column 2) to the ``set`` of TFs (column 1)
        that appear with it in the file.
    """
    logger.info('Reading TF-gene pairs in %s ...', tgfile)
    reader = TsvReader(tgfile, cnames=False)
    ret = {}  # gene => set of tfs
    try:
        for row in reader:
            ret.setdefault(row[1], set()).add(row[0])
    finally:
        # Release the reader even when a malformed row (e.g. fewer than
        # two columns) makes the loop raise.
        reader.close()
    return ret
# get gene, snp pairs """ chr1 12463073 12463074 AADACL4 0 + chr1 12463073 12463074 chr1_12463073_rs7547740_A_G 0 + chr1 12480504 12480505 AADACL4 0 + chr1 12480504 12480505 chr1_12480504_rs6660365_T_C 0 + chr1 12496021 12496022 AADACL4 0 + chr1 12496021 12496022 chr1_12496021_rs6541023_T_C 0 + """ mutgenes = defaultdict(lambda: []) intereader = TsvReader(interfile) genes = set() for r in intereader: if not r[3] in genetfs: continue mutgenes[r[9]].append(r[3]) genes.add(r[3]) intereader.close() # shrink the sets genetfs = {g: genetfs[g] for g in genes} tfs = list({tf for gtfs in genetfs.values() for tf in gtfs}) # nothing, write empty files if not mutgenes or not genes: open(outdata, 'w').close() open(outgroup, 'w').close() open(outcase, 'w').close() exit(0) # save the data file # expfile """
# The {{ ... }} expressions are template placeholders; presumably they are
# rendered into Python literals before this script runs -- verify against
# the pipeline that owns this template.
infile = {{i.infile | quote}}
snpfile = {{o.snpfile | quote}}
genefile = {{o.genefile | quote}}
snppergene = {{args.snppergene | repr}}
nchr = {{args.nchr | repr}}
seed = {{args.seed | repr}}
# distances between genes
dist = {{args.dist | repr}}
random.seed(seed)
# Read the input without a header line: column 0 gives the SNPs and
# column 1 the genes; duplicates are dropped by the sets.
reader = TsvReader(infile, cnames=False)
allsnps = set(reader.dump(0))
# Rewind so the second dump starts from the beginning of the file again.
reader.rewind()
allgenes = set(reader.dump(1))
reader.close()
# assign a probability to each snp
nsnps = len(allsnps)
ngenes = len(allgenes)
# Each SNP gets one integer drawn with replacement from
# range(ngenes * snppergene).
# NOTE(review): allsnps/allgenes are sets; if their elements are strings the
# iteration order can differ between interpreter runs (hash randomization),
# so `seed` alone may not make this pairing or the gene placement below
# reproducible -- confirm.
snp_probs = dict(zip(allsnps, random.choices(range(ngenes * snppergene), k=nsnps)))
genebed = TsvWriter(genefile)
snpbed = TsvWriter(snpfile)
# Number of genes per chromosome, rounded up.
geneperchr = math.ceil(float(ngenes) / float(nchr))
for i, gene in enumerate(allgenes):
    # Genes are dealt round-robin onto chr1 .. chr<nchr>.
    chrname = 'chr' + str(int(i % nchr) + 1)
    # Successive genes on the same chromosome are `dist` apart, with the
    # first one placed at position `dist`.
    start = (int(i / nchr) + 1) * dist
    end = start + 1