Exemplo n.º 1
0
def blastp2geneMap(blastFiles,organism1,taxID1,organism2,taxID2,gene2refseq,topNum,IDtype='protein'):
    """
    This function transfers 2-way blastp results into geneID mapping results.

    * blastFiles: a list of 2 way blastp tabular result files. eg: [blast1.txt,blast2.txt]
      The files are paired positionally with the switches 'False' and 'True',
      so the 1st file is processed with switch 'False' and the 2nd with 'True'.
      Files beyond the 2nd are ignored.

    * organism1: string. the 1st organism. eg: 'cho'

    * taxID1: taxonomy id of the 1st organism, forwarded to extract_from_gene2ref.

    * organism2: string. the 2nd organism. eg: 'human'

    * taxID2: taxonomy id of the 2nd organism, forwarded to extract_from_gene2ref.

    * gene2refseq: filename. eg: 'gene2refseq'

    * topNum: forwarded to extract_blast_ID_map (presumably the number of top
      blast hits to keep per query -- confirm against that helper).

    * IDtype: 'protein' extracts columns [2,6,7,16] of gene2refseq,
      any other value extracts columns [2,4,5,16].

    returns a list of four files. For first 2: each with two columns of gene ID mapping.
    For the rest 2: gene2refseq files for each organism.

    raises subprocess.CalledProcessError if one of the awk/sort shell steps
    exits with a non-zero status, instead of silently continuing with an
    empty or partial intermediate file.
    """
    import os

    # extract gene, accession, protein mapping
    columnNum = [2, 6, 7, 16] if IDtype == 'protein' else [2, 4, 5, 16]
    org1ref = extract_from_gene2ref(gene2refseq, taxID1, organism1, columnNum)
    org2ref = extract_from_gene2ref(gene2refseq, taxID2, organism2, columnNum)

    result = []
    for blast, switch in zip(blastFiles, ['False', 'True']):
        # extract protein id map
        pr_id_map = extract_blast_ID_map(blast, topNum, switch)  # pr_id_map: cho2human.top1.txt
        # protein id mapping to gene id mapping
        gene_id_map = mRNA_prIDMap2geneIDMap(pr_id_map, org1ref, org2ref, IDtype)  # cho2human.top1.gene.txt

        # [:-3] drops the trailing 'txt' so the new suffix lands after the last dot
        stem = gene_id_map[:-3]
        uniqFile = stem + 'uniqline.txt'   # cho2human.top1.gene.uniqline.txt
        interFile = stem + 'inter.txt'

        # keep only the first two tab separated columns; this step is the
        # same for both directions
        cmd1 = ("awk -F $'\\t' 'BEGIN {{FS=\"\\t\"; OFS=FS}} {{print $1,$2}}' "
                "{input} > {output}").format(input=gene_id_map, output=interFile)
        # get unique id mappings (each line is unique, but genes are not unique).
        # Only the sort key order depends on the direction: the file processed
        # with switch 'True' is ordered by its 2nd column first.
        sort_keys = '-k2,2 -k1,1' if switch == 'True' else '-k1,1 -k2,2'
        cmd2 = ('sort {keys} {input} | uniq > {output}').format(
            keys=sort_keys, input=interFile, output=uniqFile)

        # check_call makes a failing shell step visible right here
        subprocess.check_call(cmd1, shell=True)
        subprocess.check_call(cmd2, shell=True)

        # the intermediate file is only needed between the two shell steps
        os.remove(interFile)

        # unique gene ID in 1st column
        uniq = uniq1stGene(uniqFile)  # cho2human.top1.gene.uniqline.uniq1stgene.txt
        result.append(uniq)

    result.extend([org1ref, org2ref])
    return result
Exemplo n.º 2
0
def blastp2geneMap(blastFiles,
                   organism1,
                   taxID1,
                   organism2,
                   taxID2,
                   gene2refseq,
                   topNum,
                   IDtype='protein'):
    """
    This function transfers 2-way blastp results into geneID mapping results.

    * blastFiles: a list of 2 way blastp tabular result files. eg: [blast1.txt,blast2.txt]
      The files are paired positionally with the switches 'False' and 'True',
      so the 1st file is processed with switch 'False' and the 2nd with 'True'.
      Files beyond the 2nd are ignored.

    * organism1: string. the 1st organism. eg: 'cho'

    * taxID1: taxonomy id of the 1st organism, forwarded to extract_from_gene2ref.

    * organism2: string. the 2nd organism. eg: 'human'

    * taxID2: taxonomy id of the 2nd organism, forwarded to extract_from_gene2ref.

    * gene2refseq: filename. eg: 'gene2refseq'

    * topNum: forwarded to extract_blast_ID_map (presumably the number of top
      blast hits to keep per query -- confirm against that helper).

    * IDtype: 'protein' extracts columns [2, 6, 7, 16] of gene2refseq,
      any other value extracts columns [2, 4, 5, 16].

    returns a list of four files. For first 2: each with two columns of gene ID mapping.
    For the rest 2: gene2refseq files for each organism.

    raises subprocess.CalledProcessError if one of the awk/sort shell steps
    exits with a non-zero status, instead of silently continuing with an
    empty or partial intermediate file.
    """
    import os

    # extract gene, accession, protein mapping
    if IDtype == 'protein':
        columnNum = [2, 6, 7, 16]
    else:
        columnNum = [2, 4, 5, 16]
    org1ref = extract_from_gene2ref(gene2refseq, taxID1, organism1, columnNum)
    org2ref = extract_from_gene2ref(gene2refseq, taxID2, organism2, columnNum)

    # sort key order per direction; the awk step is identical for both
    sort_keys_by_switch = {
        'False': '-k1,1 -k2,2',
        'True': '-k2,2 -k1,1',
    }

    result = []
    for blast, switch in zip(blastFiles, ['False', 'True']):
        # extract protein id map
        pr_id_map = extract_blast_ID_map(
            blast, topNum, switch)  # pr_id_map: cho2human.top1.txt
        # protein id mapping to gene id mapping
        gene_id_map = mRNA_prIDMap2geneIDMap(
            pr_id_map, org1ref, org2ref, IDtype)  # cho2human.top1.gene.txt

        # [:-3] drops the trailing 'txt' so the new suffix lands after the last dot
        uniqFile = gene_id_map[:-3] + 'uniqline.txt'  # cho2human.top1.gene.uniqline.txt
        interFile = gene_id_map[:-3] + 'inter.txt'

        # keep only the first two tab separated columns
        cmd1 = (
            "awk -F $'\\t' 'BEGIN {{FS=\"\\t\"; OFS=FS}} {{print $1,$2}}' "
            "{input} > {output}"
        ).format(input=gene_id_map, output=interFile)
        # get unique id mappings (each line is unique, but genes are not unique)
        cmd2 = ('sort {keys} {input} | uniq > {output}').format(
            keys=sort_keys_by_switch[switch],
            input=interFile,
            output=uniqFile)

        # check_call makes a failing shell step visible right here
        subprocess.check_call(cmd1, shell=True)
        subprocess.check_call(cmd2, shell=True)

        # the intermediate file is only needed between the two shell steps
        os.remove(interFile)

        # unique gene ID in 1st column
        uniq = uniq1stGene(
            uniqFile)  # cho2human.top1.gene.uniqline.uniq1stgene.txt
        result.append(uniq)

    result.extend([org1ref, org2ref])
    return result
Exemplo n.º 3
0
def DB4unOverlap(unmapGeneIDs, org1ref, org2ref, blastFiles, topPrNum,
                 topGeneNum):
    """
    This function helps to find why the 2-way blastp result doesn't have
    overlapped mapping ids. For each blast direction it builds an indexed
    gene id mapping file, so the mappings from both sides can be compared.

    * unmapGeneIDs: filename. gene ids that don't have overlapped mapping results.
      Currently not used by this function; kept so existing callers keep working.

    * org1ref: gene2refseq file of the 1st organism.

    * org2ref: gene2refseq file of the 2nd organism.

    * blastFiles: a list of 2 way blastp tabular result files. The 1st file is
      processed with switch 'False', the 2nd with switch 'True'; files beyond
      the 2nd are ignored.

    * topPrNum: forwarded to extract_blast_ID_map (presumably the number of
      top protein hits to keep per query -- confirm against that helper).

    * topGeneNum: currently not used by this function; kept so existing
      callers keep working.

    returns a list with the value returned by the final indexUniqline call for
    each blast file (per the naming comments below, index file names).

    raises subprocess.CalledProcessError if one of the shell steps exits with
    a non-zero status.
    """
    indexFile = []
    switches = ['False', 'True']
    for blast, switch in zip(blastFiles, switches):
        forward = (switch == 'False')

        pr_id_map = extract_blast_ID_map(
            blast, topPrNum, switch)  # pr_id_map: cho2human.top5.txt
        # protein id mapping to gene id mapping
        # NOTE(review): blastp2geneMap calls this helper without the positional
        # `switch` argument -- confirm which call matches the helper's signature.
        gene_id_map = mRNA_prIDMap2geneIDMap(
            pr_id_map, org1ref, org2ref, switch,
            IDtype='protein')  # cho2human.top5.gene.txt

        # delete consecutive repeated lines; repeated lines that are separated
        # by other lines are retained. Every line is reversed before
        # `uniq -f 2`, so the two fields skipped in the comparison are the
        # last two fields of the original line.
        uniq_id_map = gene_id_map[:-3] + 'uniq.txt'  # cho2human.top5.gene.uniq.txt
        cmd = ('rev {input} | uniq -f 2 | rev > {output}').format(
            input=gene_id_map, output=uniq_id_map)
        subprocess.check_call(cmd, shell=True)

        # index the id mapping, this is for one id in org1 has multiple ids in org2 mapped to
        index_map = indexUniqline(uniq_id_map,
                                  switch)  # cho2human.top5.gene.uniq.index.txt

        # sort index file numerically: by 1st then 2nd column for the first
        # blast file, by 2nd then 1st column for the second one
        sort_map = index_map[:-3] + 'sort.txt'  # cho2human.top5.gene.uniq.index.sort.txt
        id_keys = '-k1,1n -k2,2n' if forward else '-k2,2n -k1,1n'
        cmd = ('sort {keys} {input} > {output}').format(
            keys=id_keys, input=index_map, output=sort_map)
        subprocess.check_call(cmd, shell=True)

        # get uniq first two ids.
        uniqline_map = uniqFirst2Col(
            sort_map)  # cho2human.top5.gene.uniq.index.sort.uniqline.txt

        # sort numerically by 1st and 5th columns for the first blast file,
        # and by 2nd and 5th columns for the second one
        sortbyindex_map = uniqline_map[:-3] + 'index.txt'  # cho2human.top5.gene.uniq.index.sort.uniqline.index.txt
        index_keys = '-k1,1n -k5,5n' if forward else '-k2,2n -k5,5n'
        cmd = ('sort {keys} {input} > {output}').format(
            keys=index_keys, input=uniqline_map, output=sortbyindex_map)
        subprocess.check_call(cmd, shell=True)

        final_index = indexUniqline(
            sortbyindex_map, switch
        )  # cho2human.top5.gene.uniq.index.sort.uniqline.index.index.txt
        indexFile.append(final_index)
    return indexFile
Exemplo n.º 4
0
def DB4unOverlap(unmapGeneIDs,org1ref,org2ref,blastFiles,topPrNum,topGeneNum):
    """
    This function helps to find why the 2-way blastp result doesn't have
    overlapped mapping ids. For each blast direction it builds an indexed
    gene id mapping file, so the mappings from both sides can be compared.

    * unmapGeneIDs: filename. gene ids that don't have overlapped mapping results.
      Currently not used by this function; kept so existing callers keep working.

    * org1ref: gene2refseq file of the 1st organism.

    * org2ref: gene2refseq file of the 2nd organism.

    * blastFiles: a list of 2 way blastp tabular result files. The 1st file is
      processed with switch 'False', the 2nd with switch 'True'; files beyond
      the 2nd are ignored.

    * topPrNum: forwarded to extract_blast_ID_map (presumably the number of
      top protein hits to keep per query -- confirm against that helper).

    * topGeneNum: currently not used by this function; kept so existing
      callers keep working.

    returns a list with the value returned by the final indexUniqline call for
    each blast file (per the naming comments below, index file names).

    raises subprocess.CalledProcessError if one of the shell steps exits with
    a non-zero status.
    """
    # numeric sort keys per direction: (id columns, id column + index column)
    sort_keys = {
        'False': ('-k1,1n -k2,2n', '-k1,1n -k5,5n'),
        'True': ('-k2,2n -k1,1n', '-k2,2n -k5,5n'),
    }
    sort_template = 'sort {keys} {input} > {output}'

    indexFile = []
    for blast, switch in zip(blastFiles, ['False', 'True']):
        id_keys, index_keys = sort_keys[switch]

        pr_id_map = extract_blast_ID_map(blast, topPrNum, switch)  # pr_id_map: cho2human.top5.txt
        # protein id mapping to gene id mapping
        # NOTE(review): blastp2geneMap calls this helper without the positional
        # `switch` argument -- confirm which call matches the helper's signature.
        gene_id_map = mRNA_prIDMap2geneIDMap(pr_id_map,org1ref,org2ref,switch,IDtype='protein')  # cho2human.top5.gene.txt

        # delete consecutive repeated lines; repeated lines that are separated
        # by other lines are retained. Every line is reversed before
        # `uniq -f 2`, so the two fields skipped in the comparison are the
        # last two fields of the original line.
        uniq_id_map = gene_id_map[:-3] + 'uniq.txt'  # cho2human.top5.gene.uniq.txt
        cmd = ('rev {input} | uniq -f 2 | rev > {output}').format(input=gene_id_map, output=uniq_id_map)
        subprocess.check_call(cmd, shell=True)

        # index the id mapping, this is for one id in org1 has multiple ids in org2 mapped to
        index_map = indexUniqline(uniq_id_map, switch)  # cho2human.top5.gene.uniq.index.txt

        # sort index file: by 1st then 2nd column for the first blast file,
        # by 2nd then 1st column for the second one
        sort_map = index_map[:-3] + 'sort.txt'  # cho2human.top5.gene.uniq.index.sort.txt
        cmd = sort_template.format(keys=id_keys, input=index_map, output=sort_map)
        subprocess.check_call(cmd, shell=True)

        # get uniq first two ids.
        uniqline_map = uniqFirst2Col(sort_map)  # cho2human.top5.gene.uniq.index.sort.uniqline.txt

        # sort by 1st and 5th columns for the first blast file, and by 2nd and
        # 5th columns for the second one
        sortbyindex_map = uniqline_map[:-3] + 'index.txt'  # cho2human.top5.gene.uniq.index.sort.uniqline.index.txt
        cmd = sort_template.format(keys=index_keys, input=uniqline_map, output=sortbyindex_map)
        subprocess.check_call(cmd, shell=True)

        final_index = indexUniqline(sortbyindex_map, switch)  # cho2human.top5.gene.uniq.index.sort.uniqline.index.index.txt
        indexFile.append(final_index)
    return indexFile