def getFig3Percent(path):
    """Gather transcript counts for all, Wt pG4r and shuffled pG4r sets.

    Three groups of counts are computed:
        - 'Tot': transcripts read from the unspliced transcript file that
          are also found in the biotype dictionary;
        - 'Wt': transcripts returned by readWtpG4r;
        - 'Shuf': transcripts returned by readShufpG4r.
    Each group holds a 'Global' entry plus the entries returned by
    getNumberTrinClass ('Tot') or countpG4rByClass ('Wt' and 'Shuf').

    :param path: prefix put directly in front of the 'Data/...' and
        'Results/...' file names; no separator is inserted, so it has to
        end with one already.
    :type path: string

    :returns: counts as {'Wt': {...}, 'Shuf': {...}, 'Tot': {...}}.
    :rtype: dictionary
    """
    transcriptFile = (path +
                      'Data/transcriptType/HS_transcript_unspliced_All.txt')
    biotypeFile = path + 'Data/transcriptType/transcriptType_All.txt'
    wtFile = path + 'Results/All/HS_All_G4InTranscript.txt'
    shufFile = path + 'Results/All/pG4r_shuffle.csv'

    biotypes = rF.createDictionaryBiotypeByTranscript(biotypeFile)
    # Only transcripts with a known biotype are kept.
    knownTr = [tr for tr in importTr(transcriptFile) if tr in biotypes]
    trByClass, nbTrByClass = getNumberTrinClass(biotypes, knownTr)
    # 'Global' is the number of distinct transcripts for the total set.
    totCounts = {'Global': len(set(knownTr))}
    totCounts.update(nbTrByClass)

    wtTr = readWtpG4r(wtFile)
    wtByClass = countpG4rByClass(trByClass, wtTr)
    wtCounts = {'Global': len(wtTr)}
    wtCounts.update(wtByClass)

    shufTr = readShufpG4r(shufFile)
    shufByClass = countpG4rByClass(trByClass, shufTr)
    shufCounts = {'Global': len(shufTr)}
    shufCounts.update(shufByClass)

    return {'Wt': wtCounts, 'Shuf': shufCounts, 'Tot': totCounts}
# NOTE(review): the statements on the next line are the end of a definition
# whose beginning is not visible here; they are kept verbatim and their
# original line breaks / indentation still have to be restored.
name = '>' + gene + ':junction:' + chr + ':' + coords + ':' + strand + ':' + '|'.join( listTr) fastaShuf[name] = seq writeFasta(outputDir, fastaShuf, chr, 'junction')


def build_arg_parser():
    """Build the command line parser of the script.

    Two options are declared:
        - -p / --path, whose default is the current working directory
          followed by '/';
        - -chr / --chromosome, whose default is 'Y'.

    :returns: parser, the configured parser.
    :rtype: argparse.ArgumentParser
    """
    parser = argparse.ArgumentParser(description='generateRandom')
    GITDIR = os.getcwd() + '/'
    parser.add_argument('-p', '--path', default=GITDIR)
    parser.add_argument('-chr', '--chromosome', default='Y')
    return parser


if __name__ == '__main__':
    parser = build_arg_parser()
    arg = parser.parse_args()
    # NOTE(review): this module-level name shadows the builtin chr().
    chr = arg.chromosome
    path = arg.path
    # Every path below adds its own leading '/', so with the default --path
    # (which already ends with '/') the resulting names hold a double slash.
    fastaFile = path + '/Fasta/'
    outputDir = path + '/chr' + chr + '/'
    fastaJunction = path + '/chr' + chr + \
        '/HS_transcript_unspliced_chr' + chr + '_Sequence.txt'
    print(chr)
    dicoBt = rF.createDictionaryBiotypeByTranscript(
        path + '/transcriptType/transcriptType_chr' + chr + '.txt')
    dicoInfo = getDicoIndex(
        path + '/chr' + chr + '/HS_transcript_unspliced_chr' + chr +
        '_Index.txt', dicoBt)
    # The two calls below receive the same output directory and index
    # information; the first one is given the fasta directory, the second
    # one the junction sequence file.
    createFasta(dicoInfo, fastaFile, chr, outputDir)
    fromFasta(fastaJunction, outputDir, chr, dicoInfo)
def main(path, chromosome, specie, dicoParam):
    """Annotate the pG4 of one chromosome and write the result files.

    The function contains 4 parts:
        - initialization of all lists and dictionaries containing
          information (protein coding list, biotype by transcript, strand
          by gene, transcript annotation);
        - merge of all windows over thresholds, while reading the G4RNA
          Screener output files. If two windows over thresholds are
          separated by another window (under thresholds), they give two
          different pG4 (presumably done by returnG4InGene and
          returnG4InJunction -- to confirm against those functions);
        - annotation of pG4, on genome and transcriptome;
        - writing of the files.

    :param path: directory that holds the 'chr<chromosome>' folder and the
        'transcriptType' folder.
    :type path: string
    :param chromosome: name of the chromosome, as used in the file names.
    :type chromosome: string
    :param specie: prefix of the index file name.
    :type specie: string
    :param dicoParam: parameters handed over unchanged to returnG4InGene
        and returnG4InJunction.
    :type dicoParam: dictionary

    The results are written by the extraction* functions, which are given
    '<current working directory>/results/perChromosome'. Nothing is
    returned.
    """
    GITDIR = os.getcwd() + '/'
    ProteinCoding = createListCodingProtein()
    G4DetectedInGene = {}
    G4DetectedInJunction = {}
    G4InTranscript = {}
    G4InGenome = {}
    TranscriptPerG4 = {}
    # Directory which contains the data for this chromosome.
    directory = path + '/chr' + chromosome
    # File which contains the information by transcript for this chromosome.
    index = directory + '/' + specie + '_transcript_unspliced_chr' + \
        chromosome + '_Index.txt'
    # File which contains the biotype of transcripts for this chromosome.
    indexBiotypeTranscript = path + '/transcriptType/transcriptType_chr' + \
        chromosome
    # Parenthesised single-argument print gives the same output with
    # Python 2 and Python 3.
    print("Chromosome " + chromosome)
    BiotypeByTranscript = \
        rF.createDictionaryBiotypeByTranscript(indexBiotypeTranscript)
    StrandByGene = createDictionaryStrandByGene(index)
    AnnotationTranscript = rF.GetAnnotationTranscript(index, ProteinCoding,
                                                      BiotypeByTranscript)
    # Get the G4 from the output of G4RNA Screener.
    for root, _dirs, files in os.walk(directory):
        for filename in files:
            # os.walk also goes through sub-directories: the file has to be
            # located from the directory being walked, not from the top
            # one, otherwise nested files get a wrong path.
            inputfile = os.path.join(root, filename)
            if 'gene_unspliced' in filename and '.csv' in filename:
                G4DetectedInGene = returnG4InGene(G4DetectedInGene,
                                                  inputfile, dicoParam,
                                                  StrandByGene)
            elif 'Junction' in filename and '.fas' in filename:
                G4DetectedInJunction = returnG4InJunction(
                    G4DetectedInJunction, inputfile, dicoParam,
                    StrandByGene)
    listeG4InGeneEntire = getlisteG4InGene(G4DetectedInGene)
    listeG4InGeneJunction = getlisteG4InGene(G4DetectedInJunction)
    # The same three dictionaries are filled twice: first with the pG4
    # detected in genes, then with those detected in junctions.
    G4InTranscript, G4InGenome, TranscriptPerG4 = getInfoAboutpG4(
        index, BiotypeByTranscript, listeG4InGeneEntire,
        G4InTranscript, G4InGenome, TranscriptPerG4,
        AnnotationTranscript, G4DetectedInGene, ProteinCoding,
        'Transcript')
    G4InTranscript, G4InGenome, TranscriptPerG4 = getInfoAboutpG4(
        index, BiotypeByTranscript, listeG4InGeneJunction,
        G4InTranscript, G4InGenome, TranscriptPerG4,
        AnnotationTranscript, G4DetectedInJunction, ProteinCoding,
        'Junction')
    resultsDirectory = GITDIR + 'results/perChromosome'
    extractionG4InTranscript(resultsDirectory, specie, chromosome,
                             G4InTranscript)
    extractionG4InGenome(resultsDirectory, specie, chromosome, G4InGenome)
    extractionTranscriptPerG4(resultsDirectory, specie, chromosome,
                              TranscriptPerG4)
    print("\t Done.")