Пример #1
0
            # Parse the file with the known fusion genes and convert the gene
            # symbols it contains into identifiers via the `symbols` helpers.
            # NOTE(review): this excerpt starts inside an enclosing block that
            # is not visible here; `tmp_file`, `options` and `symbols` are
            # defined outside this span.

            # NOTE(review): this empty set is overwritten by `data = set(d)`
            # below before it is ever read within this span.
            data = set()

            # Take the second tab-separated column (index 1) of every line,
            # upper-cased and with the line ending stripped.
            # NOTE(review): a line without a tab raises IndexError here, and
            # the handle returned by file() is never closed explicitly.
            d = [
                line.upper().rstrip("\r\n").split("\t")[1]
                for line in file(tmp_file, "r")
            ]
            d.pop(0)  # remove the header
            # de-duplicate the symbols
            data = set(d)

            print "%d known genes found (using gene symbols)" % (len(data), )

            # read the gene symbols
            # (both lookup tables below are built from the same 'synonyms.txt'
            # located in options.output_directory)
            file_symbols = os.path.join(options.output_directory,
                                        'synonyms.txt')
            loci = symbols.generate_loci(file_symbols)

            genes = symbols.read_genes_symbols(file_symbols)

            # Look up every symbol; symbols for which the lookup returns a
            # falsy result are silently dropped, and all returned items are
            # collected into one flat list.
            d = []
            for g in data:
                # `g` is already upper-cased above, so .upper() is a no-op here
                ens = symbols.ensembl(g.upper(), genes, loci)
                if ens:
                    d.extend(ens)

            # One item per line, then de-duplicate and sort.
            # presumably the items are Ensembl gene id strings (they are
            # concatenated with '\n') -- verify against symbols.ensembl
            data = [line + '\n' for line in d]
            data = sorted(set(data))

            print "%d known genes found (after conversion to Ensembl ids)" % (
                len(data), )

    #
    #
    #

    # Read the input file of gene pairs and look up each gene of every pair.
    # NOTE(review): this excerpt is cut off after the lookups, so what is
    # finally done with `ens1`/`ens2`, `d` and `data` is not visible here.
    print "Reading the input file...", options.input
    # One list of tab-separated fields per non-empty line of the input file.
    # NOTE(review): the handle returned by file() is never closed explicitly.
    mygenes = [line.rstrip('\r\n').split('\t') for line in file(options.input,'r').readlines() if line.rstrip('\r\n')]

    data = []
    if mygenes:

        # Both symbol tables are expected in the same directory as the
        # output file.
        file_symbols1 = os.path.join(os.path.dirname(options.output),'genes_symbols.txt')
        file_symbols2 = os.path.join(os.path.dirname(options.output),'synonyms.txt')

        loci1 = symbols.generate_loci(file_symbols1)
        loci2 = symbols.generate_loci(file_symbols2)

        genes1 = symbols.read_genes_symbols(file_symbols1)
        genes2 = symbols.read_genes_symbols(file_symbols2)

        d = []
        # NOTE(review): the tuple unpacking requires exactly two fields per
        # line; any other column count raises ValueError.
        for (g1,g2) in mygenes:
            # skip pairs with an empty member and pairs whose two symbols
            # are equal when compared case-insensitively
            if g1 and g2 and g1.upper() != g2.upper():
                # first try the tables built from 'genes_symbols.txt' ...
                ens1 = symbols.ensembl(g1.upper(),genes1,loci1)
                ens2 = symbols.ensembl(g2.upper(),genes1,loci1)
                # ... and fall back to the 'synonyms.txt' tables, separately
                # for each gene, only when the first lookup returned nothing
                if not ens1:
                    ens1 = symbols.ensembl(g1.upper(),genes2,loci2)
                if not ens2:
                    ens2 = symbols.ensembl(g2.upper(),genes2,loci2)
                    
Пример #3
0
                # NOTE(review): this excerpt starts inside a loop whose header
                # is not visible; `g1` and `g2` are iterated here, so they are
                # presumably collections of gene symbols -- confirm upstream.
                for gg1 in g1:
                    for gg2 in g2:
                        # skip empty symbols and identical symbols
                        # (this comparison is case-sensitive)
                        if gg1 and gg2 and gg1 != gg2:
                            # put the smaller symbol first so that (A, B) and
                            # (B, A) become the same tuple
                            (gg1,gg2) = (gg2,gg1) if gg2 < gg1 else (gg1,gg2)
                            # `data` is created outside this span; .add()
                            # suggests it is a set -- TODO confirm
                            data.add((gg1,gg2))

            print " - found",len(data),"fusions"

            # Append a line with the database name and today's date
            # (YYYY-MM-DD) to 'version.txt' in options.output_directory.
            # `today` is defined outside this span.
            # NOTE(review): the handle returned by file() is never closed
            # explicitly.
            txt = ['Non-cancer tissues and cells (Babiceanu et al. Nucl. Acids Res. 2016) database version: %s\n' % (today.strftime("%Y-%m-%d"),)]
            file(os.path.join(options.output_directory,'version.txt'),'a').writelines(txt)

    #
            # read the gene symbols
            # (both lookup tables below are built from the same 'synonyms.txt')
            file_symbols = os.path.join(options.output_directory,'synonyms.txt')
            loci = symbols.generate_loci(file_symbols)

            genes = symbols.read_genes_symbols(file_symbols)

            # Convert every symbol pair into all combinations of the ids
            # returned for its two members.
            d = []
            for (g1,g2) in data:
                # unlike the collection step above, this comparison is
                # case-insensitive, so pairs differing only in case are
                # dropped here
                if g1.upper() != g2.upper():
                    ens1 = symbols.ensembl(g1.upper(),genes,loci)
                    ens2 = symbols.ensembl(g2.upper(),genes,loci)
                    # pairs where either lookup returned nothing are dropped
                    if ens1 and ens2:
                        for e1 in ens1:
                            for e2 in ens2:
                                # never pair an id with itself
                                if e1 != e2:
                                    d.append([e1,e2])

            # One tab-separated line per pair, with the two ids in sorted
            # order; no de-duplication of lines happens within this span.
            data = ['\t'.join(sorted(line)) + '\n' for line in d]
Пример #4
0
    # Read the input file of gene pairs and look up each gene of every pair.
    # NOTE(review): this excerpt ends right after the lookups, so what is
    # finally done with `ens1`/`ens2`, `d` and `data` is not visible here.
    print "Reading the input file...", options.input
    # One list of tab-separated fields per non-empty line of the input file.
    # NOTE(review): the handle returned by file() is never closed explicitly.
    mygenes = [
        line.rstrip('\r\n').split('\t')
        for line in file(options.input, 'r').readlines() if line.rstrip('\r\n')
    ]

    data = []
    if mygenes:

        # Both symbol tables are expected in the same directory as the
        # output file.
        file_symbols1 = os.path.join(os.path.dirname(options.output),
                                     'genes_symbols.txt')
        file_symbols2 = os.path.join(os.path.dirname(options.output),
                                     'synonyms.txt')

        loci1 = symbols.generate_loci(file_symbols1)
        loci2 = symbols.generate_loci(file_symbols2)

        genes1 = symbols.read_genes_symbols(file_symbols1)
        genes2 = symbols.read_genes_symbols(file_symbols2)

        d = []
        # NOTE(review): the tuple unpacking requires exactly two fields per
        # line; any other column count raises ValueError.
        for (g1, g2) in mygenes:
            # skip pairs with an empty member and pairs whose two symbols
            # are equal when compared case-insensitively
            if g1 and g2 and g1.upper() != g2.upper():
                # first try the tables built from 'genes_symbols.txt' ...
                ens1 = symbols.ensembl(g1.upper(), genes1, loci1)
                ens2 = symbols.ensembl(g2.upper(), genes1, loci1)
                # ... and fall back to the 'synonyms.txt' tables, separately
                # for each gene, only when the first lookup returned nothing
                if not ens1:
                    ens1 = symbols.ensembl(g1.upper(), genes2, loci2)
                if not ens2:
                    ens2 = symbols.ensembl(g2.upper(), genes2, loci2)