# Parse the file with the known fusion genes: collect the upper-cased second
# tab-separated column (the gene symbol) of every row after the header.
# The file is read inside a `with` block so the handle is always closed.
data = set()
with open(tmp_file, "r") as known_genes_file:
    # Skip the header row; the default keeps an empty file from raising.
    next(known_genes_file, None)
    for line in known_genes_file:
        columns = line.upper().rstrip("\r\n").split("\t")
        # Blank or single-column rows have no second column; skip them
        # instead of failing with an IndexError.
        if len(columns) > 1:
            data.add(columns[1])
print("%d known genes found (using gene symbols)" % (len(data),))

# read the gene symbols
file_symbols = os.path.join(options.output_directory, 'synonyms.txt')
loci = symbols.generate_loci(file_symbols)
genes = symbols.read_genes_symbols(file_symbols)

# Look every symbol up and keep all identifiers returned for it; symbols
# for which the lookup returns nothing are dropped.
d = []
for g in data:
    ens = symbols.ensembl(g.upper(), genes, loci)
    if ens:
        d.extend(ens)

# One identifier per output line, de-duplicated and sorted.
data = sorted(set(line + '\n' for line in d))
print("%d known genes found (after conversion to Ensembl ids)" % (len(data),))
# Read the input file: one entry per non-empty line, split into its
# tab-separated columns. The file is iterated inside a `with` block so the
# handle is closed as soon as the rows have been collected.
print("Reading the input file... %s" % (options.input,))
with open(options.input, 'r') as input_file:
    mygenes = [
        line.rstrip('\r\n').split('\t')
        for line in input_file
        if line.rstrip('\r\n')
    ]

data = []
if mygenes:
    # Two symbol tables that live next to the output file: the primary
    # gene symbols and the synonyms.
    file_symbols1 = os.path.join(os.path.dirname(options.output), 'genes_symbols.txt')
    file_symbols2 = os.path.join(os.path.dirname(options.output), 'synonyms.txt')
    loci1 = symbols.generate_loci(file_symbols1)
    loci2 = symbols.generate_loci(file_symbols2)
    genes1 = symbols.read_genes_symbols(file_symbols1)
    genes2 = symbols.read_genes_symbols(file_symbols2)

    d = []
    # Every row must hold exactly two columns; the tuple unpacking raises
    # ValueError otherwise.
    for (g1, g2) in mygenes:
        # Ignore rows with an empty column or the same gene on both sides.
        if g1 and g2 and g1.upper() != g2.upper():
            ens1 = symbols.ensembl(g1.upper(), genes1, loci1)
            ens2 = symbols.ensembl(g2.upper(), genes1, loci1)
            # Retry with the synonyms table when the primary lookup
            # returned nothing.
            if not ens1:
                ens1 = symbols.ensembl(g1.upper(), genes2, loci2)
            if not ens2:
                ens2 = symbols.ensembl(g2.upper(), genes2, loci2)
# Record every combination of an entry of g1 with an entry of g2 as an
# alphabetically ordered pair, skipping empty entries and self-pairs.
for gg1 in g1:
    for gg2 in g2:
        if gg1 and gg2 and gg1 != gg2:
            # Build the ordered pair under its own name. Rebinding gg1/gg2
            # here would overwrite the outer loop variable and make the
            # remaining inner iterations pair against the wrong gene.
            pair = (gg2, gg1) if gg2 < gg1 else (gg1, gg2)
            data.add(pair)
print(" - found %d fusions" % (len(data),))

# save the database version (appended to version.txt); the `with` block
# guarantees the line is flushed and the handle closed
txt = ['Non-cancer tissues and cells (Babiceanu et al. Nucl. Acids Res. 2016) database version: %s\n' % (today.strftime("%Y-%m-%d"),)]
with open(os.path.join(options.output_directory, 'version.txt'), 'a') as version_file:
    version_file.writelines(txt)

# read the gene symbols
file_symbols = os.path.join(options.output_directory, 'synonyms.txt')
loci = symbols.generate_loci(file_symbols)
genes = symbols.read_genes_symbols(file_symbols)

# Look both partners of every pair up and emit each combination of the
# identifiers returned for them, except identical ones.
d = []
for (g1, g2) in data:
    if g1.upper() != g2.upper():
        ens1 = symbols.ensembl(g1.upper(), genes, loci)
        ens2 = symbols.ensembl(g2.upper(), genes, loci)
        # Pairs where either lookup returned nothing are dropped.
        if ens1 and ens2:
            for e1 in ens1:
                for e2 in ens2:
                    if e1 != e2:
                        d.append([e1, e2])

# One tab-separated pair per line, each pair sorted internally.
data = ['\t'.join(sorted(line)) + '\n' for line in d]
# Read the input file: one entry per non-empty line, split into its
# tab-separated columns. The file is iterated inside a `with` block so the
# handle is closed as soon as the rows have been collected.
print("Reading the input file... %s" % (options.input,))
with open(options.input, 'r') as input_file:
    mygenes = [
        line.rstrip('\r\n').split('\t')
        for line in input_file
        if line.rstrip('\r\n')
    ]

data = []
if mygenes:
    # Two symbol tables that live next to the output file: the primary
    # gene symbols and the synonyms.
    file_symbols1 = os.path.join(os.path.dirname(options.output), 'genes_symbols.txt')
    file_symbols2 = os.path.join(os.path.dirname(options.output), 'synonyms.txt')
    loci1 = symbols.generate_loci(file_symbols1)
    loci2 = symbols.generate_loci(file_symbols2)
    genes1 = symbols.read_genes_symbols(file_symbols1)
    genes2 = symbols.read_genes_symbols(file_symbols2)

    d = []
    # Every row must hold exactly two columns; the tuple unpacking raises
    # ValueError otherwise.
    for (g1, g2) in mygenes:
        # Ignore rows with an empty column or the same gene on both sides.
        if g1 and g2 and g1.upper() != g2.upper():
            ens1 = symbols.ensembl(g1.upper(), genes1, loci1)
            ens2 = symbols.ensembl(g2.upper(), genes1, loci1)
            # Retry with the synonyms table when the primary lookup
            # returned nothing.
            if not ens1:
                ens1 = symbols.ensembl(g1.upper(), genes2, loci2)
            if not ens2:
                ens2 = symbols.ensembl(g2.upper(), genes2, loci2)