예제 #1
0
def getFig3Percent(path):
    """Count transcripts, globally and per class, for three transcript sets.

    Three groups are counted:

    - 'Tot'  : transcripts listed in the unspliced transcript file that also
      have an entry in the biotype dictionary;
    - 'Wt'   : transcripts read from the HS_All_G4InTranscript.txt file;
    - 'Shuf' : transcripts read from the pG4r_shuffle.csv file.

    :param path: root directory of the project. It is concatenated directly
        with the relative file names, so it must end with a path separator.
    :type path: string

    :returns: dictionary {'Wt': {...}, 'Shuf': {...}, 'Tot': {...}} where each
        inner dictionary holds a 'Global' count plus the per-class counts
        returned by the counting helpers.
    :rtype: dictionary
    """
    biotypeFile = path + 'Data/transcriptType/transcriptType_All.txt'
    transcriptFile = path + 'Data/transcriptType/HS_transcript_unspliced_All.txt'
    wtFile = path + 'Results/All/HS_All_G4InTranscript.txt'
    shufFile = path + 'Results/All/pG4r_shuffle.csv'

    # Reference set: transcripts without a known biotype are dropped.
    biotypes = rF.createDictionaryBiotypeByTranscript(biotypeFile)
    knownTr = []
    for tr in importTr(transcriptFile):
        if tr in biotypes:
            knownTr.append(tr)
    trByClass, nbTrByClass = getNumberTrinClass(biotypes, knownTr)
    # The global total is the number of distinct transcripts.
    totCounts = {'Global': len(set(knownTr))}
    totCounts.update(nbTrByClass)

    # Transcripts from the G4InTranscript result file.
    wtTr = readWtpG4r(wtFile)
    wtByClass = countpG4rByClass(trByClass, wtTr)
    wtCounts = {'Global': len(wtTr)}
    wtCounts.update(wtByClass)

    # Transcripts from the shuffled result file.
    shufTr = readShufpG4r(shufFile)
    shufByClass = countpG4rByClass(trByClass, shufTr)
    shufCounts = {'Global': len(shufTr)}
    shufCounts.update(shufByClass)

    return {'Wt': wtCounts, 'Shuf': shufCounts, 'Tot': totCounts}
예제 #2
0
        name = '>' + gene + ':junction:' + chr + ':' + coords + ':' + strand + ':' + '|'.join(
            listTr)
        fastaShuf[name] = seq
    writeFasta(outputDir, fastaShuf, chr, 'junction')


def build_arg_parser():
    """Create the command-line parser of the script.

    Two optional arguments are declared:

    - ``-p`` / ``--path``: defaults to the current working directory
      followed by '/';
    - ``-chr`` / ``--chromosome``: defaults to 'Y'.

    :returns: the configured parser (not yet parsed).
    :rtype: argparse.ArgumentParser
    """
    cli = argparse.ArgumentParser(description='generateRandom')
    cwdWithSlash = os.getcwd() + '/'
    options = (
        (('-p', '--path'), cwdWithSlash),
        (('-chr', '--chromosome'), 'Y'),
    )
    for flags, fallback in options:
        cli.add_argument(*flags, default=fallback)
    return cli


if __name__ == '__main__':
    # Read the data root and the chromosome name from the command line.
    parser = build_arg_parser()
    arg = parser.parse_args()
    path, chr = arg.path, arg.chromosome

    # Every per-chromosome input and output lives under '<path>/chr<chr>'.
    chrDir = path + '/chr' + chr
    outputDir = chrDir + '/'
    fastaFile = path + '/Fasta/'
    fastaJunction = chrDir + '/HS_transcript_unspliced_chr' + chr + '_Sequence.txt'
    print(chr)

    # Biotype of each transcript, then the index information built from it.
    dicoBt = rF.createDictionaryBiotypeByTranscript(
        path + '/transcriptType/transcriptType_chr' + chr + '.txt')
    dicoInfo = getDicoIndex(
        chrDir + '/HS_transcript_unspliced_chr' + chr + '_Index.txt',
        dicoBt)

    # Process the chromosome fasta first, then the junction sequences.
    createFasta(dicoInfo, fastaFile, chr, outputDir)
    fromFasta(fastaJunction, outputDir, chr, dicoInfo)
예제 #3
0
def main(path, chromosome, specie, dicoParam):
    """Main function.

    The main function contains 4 parts:

    - initialization of all lists and dictionaries containing information
    - merge of all windows over thresholds. If two windows over thresholds
      are separated by another window (under thresholds), they are
      two different pG4
    - annotation of pG4: on the genome and on the transcriptome
    - writing of the output files

    :param path: root data directory; it must contain a 'chr<chromosome>'
        sub-directory and a 'transcriptType' sub-directory.
    :type path: string
    :param chromosome: name of the chromosome to process (concatenated into
        directory and file names).
    :type chromosome: string
    :param specie: prefix of the index file name, also forwarded to the
        functions writing the results.
    :type specie: string
    :param dicoParam: parameters forwarded unchanged to returnG4InGene and
        returnG4InJunction (presumably the detection thresholds - confirm
        against those functions).
    :type dicoParam: dictionary

    The results are written under '<current working directory>/results/
    perChromosome'; nothing is returned.
    """
    GITDIR = os.getcwd() + '/'
    resultsDir = GITDIR + 'results/perChromosome'
    ProteinCoding = createListCodingProtein()
    G4DetectedInGene = {}
    G4DetectedInJunction = {}
    G4InTranscript = {}
    G4InGenome = {}
    TranscriptPerG4 = {}
    # directory which contains the data for this chromosome
    directory = path + '/chr' + chromosome
    # file which contains the information by transcript for this chromosome
    index = directory + '/' + specie + '_transcript_unspliced_chr' + \
        chromosome + '_Index.txt'
    # file which contains the biotype of the transcripts of this chromosome
    # NOTE(review): no file extension is appended here - confirm that the
    # file on disk is really named without one.
    indexBiotypeTranscript = path + '/transcriptType/transcriptType_chr' + \
        chromosome
    # Single-argument print call: same output on Python 2, valid on Python 3.
    print("Chromosome " + chromosome)
    BiotypeByTranscript = \
        rF.createDictionaryBiotypeByTranscript(indexBiotypeTranscript)
    StrandByGene = createDictionaryStrandByGene(index)
    AnnotationTranscript = rF.GetAnnotationTranscript(index, ProteinCoding,
                                                      BiotypeByTranscript)
    # get the G4 from the output of G4RNA Screener
    # The walked directory gets its own name so the 'path' parameter is not
    # overwritten, and it is used to build the file path: os.walk also visits
    # sub-directories, whose files do not live directly under 'directory'.
    for dirpath, _dirs, files in os.walk(directory):
        for filename in files:
            inputfile = dirpath + '/' + filename
            if 'gene_unspliced' in filename and '.csv' in filename:
                G4DetectedInGene = returnG4InGene(G4DetectedInGene, inputfile,
                                                  dicoParam, StrandByGene)
            elif 'Junction' in filename and '.fas' in filename:
                G4DetectedInJunction = returnG4InJunction(
                    G4DetectedInJunction, inputfile, dicoParam, StrandByGene)
    listeG4InGeneEntire = getlisteG4InGene(G4DetectedInGene)
    listeG4InGeneJunction = getlisteG4InGene(G4DetectedInJunction)
    # The same three dictionaries are filled in two passes: first with the
    # pG4 found in genes, then with those found in junctions.
    G4InTranscript, G4InGenome, TranscriptPerG4 = getInfoAboutpG4(
        index, BiotypeByTranscript, listeG4InGeneEntire, G4InTranscript,
        G4InGenome, TranscriptPerG4, AnnotationTranscript, G4DetectedInGene,
        ProteinCoding, 'Transcript')
    G4InTranscript, G4InGenome, TranscriptPerG4 = getInfoAboutpG4(
        index, BiotypeByTranscript, listeG4InGeneJunction, G4InTranscript,
        G4InGenome, TranscriptPerG4, AnnotationTranscript,
        G4DetectedInJunction, ProteinCoding, 'Junction')
    extractionG4InTranscript(resultsDir, specie, chromosome, G4InTranscript)
    extractionG4InGenome(resultsDir, specie, chromosome, G4InGenome)
    extractionTranscriptPerG4(resultsDir, specie, chromosome, TranscriptPerG4)
    print("\t Done.")