def getG4(G4DetectedInGene, inputfile, dicoParam):
    """Merge consecutive above-threshold windows into predicted G4 (pG4).

    The G4RNA Screener output is read line by line; only lines starting
    with a digit are parsed (the others, e.g. the header, are skipped).
    A window "passes" when its cGcC, g4H and g4NN scores are all greater
    than or equal to the thresholds in ``dicoParam``. Consecutive passing
    windows are merged into one pG4; the pG4 is closed, and stored, by the
    first following window that is under a threshold or that carries a
    different description. Two passing runs separated by a single
    non-passing window therefore give two distinct pG4.

    :param G4DetectedInGene: pG4 already predicted in genes; updated in
        place. Keys are the identifiers returned by
        ``rF.createIdpG4rShuffle``; values are the list
        ``[mean cGcC, mean G4Hunter, sequence, mean G4NN]`` where the
        means are converted to strings.
    :type G4DetectedInGene: dictionary
    :param inputfile: name of the output file of G4RNA Screener.
    :type inputfile: string
    :param dicoParam: parameters given to G4RNA Screener; the keys read
        here are 'cGcC', 'g4H', 'g4NN', 'Step' and 'Window'.
    :type dicoParam: dictionary

    :returns: G4DetectedInGene, the same dictionary that was passed in,
        completed with the pG4 found in ``inputfile``.
    :rtype: dictionary
    """
    oldPassed = False  # state of the previous window
    passed = False  # True while the current window is over the thresholds
    # Defined up front so the "description changed" test below can never
    # hit an unbound name, whatever the score comparisons evaluate to.
    descriptionOverThreshold = ''
    # The context manager closes the file even if a line fails to parse.
    with open(inputfile, 'r') as screenerFile:
        for line in screenerFile:
            if not re.search('^[0-9]', line):
                # not a data line (e.g. the header of the file)
                continue
            dicoLine = readLineG4ScreenerLoc(line)
            if (dicoLine['cGcC'] >= dicoParam['cGcC'] and
                    dicoLine['g4H'] >= dicoParam['g4H'] and
                    dicoLine['g4NN'] >= dicoParam['g4NN']):
                # window over the thresholds
                passed = True
                if oldPassed != passed:
                    # first window of a new run: start a new pG4
                    descriptionOverThreshold = dicoLine['Description']
                    sequenceG4 = dicoLine['WindowSeq']
                    oldPassed = passed
                    if dicoLine['Strand'] == '1':
                        startG4 = dicoLine['WindowStart']
                        endG4 = dicoLine['WindowEnd']
                    else:
                        # any other strand value: coordinates are counted
                        # down from the end of the location
                        # (presumably the reverse strand - TODO confirm)
                        startG4 = dicoLine['locationEnd'] - \
                            (dicoLine['WindowStart'] -
                             dicoLine['locationStart'])
                        endG4 = startG4 - len(dicoLine['WindowSeq']) + 1
                    listeCGcC = [dicoLine['cGcC']]
                    listeG4Hunter = [dicoLine['g4H']]
                    listeG4NN = [dicoLine['g4NN']]
                else:
                    # run already open: extend the current pG4
                    # NOTE(review): the description is not checked here,
                    # so a passing window from another gene is merged
                    # into the current pG4 just before it is closed
                    # below - confirm this is intended.
                    sequenceG4 = rF.addWindowToG4Seq(sequenceG4,
                                                     dicoLine['WindowSeq'],
                                                     dicoParam['Step'],
                                                     dicoParam['Window'])
                    if dicoLine['Strand'] == '1':
                        endG4 = dicoLine['WindowEnd']
                    else:
                        endG4 = startG4 - len(sequenceG4) + 1
                    listeCGcC.append(dicoLine['cGcC'])
                    listeG4Hunter.append(dicoLine['g4H'])
                    listeG4NN.append(dicoLine['g4NN'])
            if (dicoLine['cGcC'] < dicoParam['cGcC'] or
                    dicoLine['g4H'] < dicoParam['g4H'] or
                    dicoLine['g4NN'] < dicoParam['g4NN'] or
                    descriptionOverThreshold != dicoLine['Description']):
                # one of the scores is under its threshold,
                # or this window comes from another gene
                passed = False
                if oldPassed != passed:
                    # the previous window was the last one of the run:
                    # close the pG4 and store it
                    meanCGcC = rF.mean(listeCGcC)
                    meanG4Hunter = rF.mean(listeG4Hunter)
                    meanG4NN = rF.mean(listeG4NN)
                    oldPassed = passed
                    # The description of the run is always the right
                    # one: it is equal to the current description
                    # whenever the two do not differ.
                    headerG4 = rF.createIdpG4rShuffle(
                        descriptionOverThreshold, startG4, endG4,
                        dicoLine['Strand'])
                    if (headerG4 not in G4DetectedInGene and
                            dicoLine['Strand']):
                        G4DetectedInGene[headerG4] = [
                            str(meanCGcC),
                            str(meanG4Hunter),
                            sequenceG4,
                            str(meanG4NN)
                        ]
    # NOTE(review): a run still open when the file ends is never stored,
    # because only a following window can close it - confirm intended.
    return G4DetectedInGene
def ReturnG4InCircu(G4Detected, inputfile, dicoParam):
    """Merge consecutive above-threshold windows of circular RNA into pG4.

    The input file is read line by line; only lines that start with a
    digit and hold exactly 8 tab-separated fields are parsed. A window
    "passes" when its cGcC, g4H and g4NN scores are all greater than or
    equal to the thresholds in ``dicoParam``. Consecutive passing windows
    are merged into one pG4; the pG4 is closed, and stored, by the first
    following window that is under a threshold or that carries a
    different description. Two passing runs separated by a single
    non-passing window therefore give two distinct pG4.

    When a closed pG4 starts inside the circular RNA but ends beyond its
    boundary, its end coordinate is wrapped around to the other boundary
    and it is stored with the location 'junction'; otherwise it is stored
    with the location 'ExonNC'.

    :param G4Detected: pG4 already predicted; updated in place. Keys are
        the identifiers returned by ``rF.createIdG4``; values are the
        list ``[mean cGcC, mean G4Hunter, sequence, mean G4NN, location,
        'circu']`` where the means are converted to strings.
    :type G4Detected: dictionary
    :param inputfile: name of the file containing the list of
        informations for each circular RNA.
    :type inputfile: string
    :param dicoParam: parameters given to G4RNA Screener; the keys read
        here are 'cGcC', 'g4H', 'g4NN', 'Step' and 'Window'.
    :type dicoParam: dictionary

    :returns: G4Detected, the same dictionary that was passed in,
        completed with the pG4 found in ``inputfile``.
    :rtype: dictionary
    """
    rnaType = 'circu'
    oldPassed = False  # state of the previous window
    passed = False  # True while the current window is over the thresholds
    descriptionOverThreshold = ''
    # The context manager closes the file even if a line fails to parse.
    with open(inputfile, "r") as circuFile:
        for line in circuFile:
            if not re.search('^[0-9]', line):
                # not a data line
                continue
            words = line.split('\t')
            if len(words) != 8:
                # unexpected number of fields: the line is ignored
                continue
            dL = readLineCircu(words)
            if (dL['cGcC'] >= dicoParam['cGcC'] and
                    dL['g4H'] >= dicoParam['g4H'] and
                    dL['g4NN'] >= dicoParam['g4NN']):
                # window over the thresholds
                passed = True
                if oldPassed != passed:
                    # first window of a new run: start a new pG4
                    descriptionOverThreshold = dL['Description']
                    sequenceG4 = dL['Sequence']
                    oldPassed = passed
                    if dL['Strand'] == '+':
                        startG4 = dL['wStart']
                        endG4 = dL['wEnd']
                    else:
                        # coordinates are counted down from the end of
                        # the circular RNA
                        startG4 = dL['circuEnd'] - dL['wStart'] + 1
                        endG4 = startG4 - len(dL['Sequence']) + 1
                    listeCGcC = [dL['cGcC']]
                    listeG4Hunter = [dL['g4H']]
                    listeG4NN = [dL['g4NN']]
                else:
                    # run already open: extend the current pG4
                    # NOTE(review): the description is not checked here,
                    # so a passing window from another circular RNA is
                    # merged into the current pG4 just before it is
                    # closed below - confirm this is intended.
                    sequenceG4 = rF.addWindowToG4Seq(
                        sequenceG4, dL['Sequence'],
                        dicoParam['Step'], dicoParam['Window'])
                    if dL['Strand'] == '+':
                        endG4 = dL['wEnd']
                    else:
                        endG4 = startG4 - len(sequenceG4) + 1
                    listeCGcC.append(dL['cGcC'])
                    listeG4Hunter.append(dL['g4H'])
                    listeG4NN.append(dL['g4NN'])
            if (dL['cGcC'] < dicoParam['cGcC'] or
                    dL['g4H'] < dicoParam['g4H'] or
                    dL['g4NN'] < dicoParam['g4NN'] or
                    descriptionOverThreshold != dL['Description']):
                # one of the scores is under its threshold,
                # or this window comes from another circular RNA
                passed = False
                if oldPassed != passed:
                    # the previous window was the last one of the run:
                    # close the pG4 and store it
                    meanCGcC = rF.mean(listeCGcC)
                    meanG4Hunter = rF.mean(listeG4Hunter)
                    meanG4NN = rF.mean(listeG4NN)
                    oldPassed = passed
                    headerG4 = rF.createIdG4(dL['circuId'], startG4,
                                             endG4, dL['Strand'])
                    # The overlap flag is computed afresh for every
                    # closed pG4, so it never leaks from one pG4 to the
                    # next one.
                    startsInside = (dL['circuStart'] < startG4 <
                                    dL['circuEnd'])
                    if dL['Strand'] == '+':
                        overlap = startsInside and endG4 > dL['circuEnd']
                    else:
                        overlap = startsInside and endG4 < dL['circuStart']
                    if overlap:
                        # the pG4 runs past the boundary: report the
                        # overhang from the opposite boundary
                        if dL['Strand'] == '+':
                            endG4 = dL['circuStart'] + \
                                (endG4 - dL['circuEnd'])
                        else:
                            endG4 = dL['circuEnd'] - \
                                (dL['circuStart'] - endG4)
                        headerG4 = rF.createIdG4(dL['circuId'], startG4,
                                                 endG4, dL['Strand'])
                        # junction pG4 overwrite any previous entry
                        G4Detected[headerG4] = [
                            str(meanCGcC),
                            str(meanG4Hunter),
                            sequenceG4,
                            str(meanG4NN),
                            'junction',
                            rnaType
                        ]
                    elif (headerG4 not in G4Detected and
                          dL['Strand'] is not None):
                        # "not in" replaces dict.has_key(), which does
                        # not exist any more in Python 3
                        G4Detected[headerG4] = [
                            str(meanCGcC),
                            str(meanG4Hunter),
                            sequenceG4,
                            str(meanG4NN),
                            'ExonNC',
                            rnaType
                        ]
    # NOTE(review): a run still open when the file ends is never stored,
    # because only a following window can close it - confirm intended.
    return G4Detected
def returnG4InJunction(G4DetectedInJunction, inputfile, dicoParam):
    """Merge consecutive above-threshold windows of junctions into pG4.

    The G4RNA Screener output is read line by line; only lines starting
    with a digit are parsed (the others, e.g. the header, are skipped).
    A window "passes" when its cGcC, g4H and g4NN scores are all greater
    than or equal to the thresholds in ``dicoParam`` and its
    'WindowStart' is truthy. Consecutive passing windows are merged into
    one pG4; the pG4 is closed by the first following window that is
    under a threshold or that carries a different description. Two
    passing runs separated by a single non-passing window therefore give
    two distinct pG4.

    A closed pG4 is stored only if ``startG4 > 40 or endG4 < 160``. Its
    key is the description of the run in which the second ':'-separated
    field is replaced by 'junction'.

    :param G4DetectedInJunction: pG4 already predicted in junctions;
        updated in place. Values are the list
        ``[mean cGcC, mean G4Hunter, sequence, mean G4NN]`` where the
        means are converted to strings.
    :type G4DetectedInJunction: dictionary
    :param inputfile: name of the output file of G4RNA Screener.
    :type inputfile: string
    :param dicoParam: parameters given to G4RNA Screener; the keys read
        here are 'cGcC', 'g4H', 'g4NN', 'Step' and 'Window'.
    :type dicoParam: dictionary

    :returns: G4DetectedInJunction, the same dictionary that was passed
        in, completed with the pG4 found in ``inputfile``.
    :rtype: dictionary
    """
    oldPassed = False  # state of the previous window
    passed = False  # True while the current window is over the thresholds
    descriptionOverThreshold = ''
    # The context manager closes the file even if a line fails to parse.
    with open(inputfile, 'r') as screenerFile:
        for line in screenerFile:
            if not re.search('^[0-9]', line):
                # not a data line (e.g. the header of the file)
                continue
            dicoLine = readLineG4ScreenerJun(line, 'Junction')
            if (dicoLine['cGcC'] >= dicoParam['cGcC'] and
                    dicoLine['g4H'] >= dicoParam['g4H'] and
                    dicoLine['g4NN'] >= dicoParam['g4NN'] and
                    dicoLine['WindowStart']):
                # window over the thresholds
                passed = True
                if oldPassed != passed:
                    # first window of a new run: start a new pG4
                    descriptionOverThreshold = dicoLine['Description']
                    sequenceG4 = dicoLine['WindowSeq']
                    oldPassed = passed
                    startG4 = dicoLine['WindowStart']
                    endG4 = dicoLine['WindowEnd']
                    listeCGcC = [dicoLine['cGcC']]
                    listeG4Hunter = [dicoLine['g4H']]
                    listeG4NN = [dicoLine['g4NN']]
                else:
                    # run already open: extend the current pG4
                    # NOTE(review): the description is not checked here,
                    # so a passing window from another junction is
                    # merged into the current pG4 just before it is
                    # closed below - confirm this is intended.
                    sequenceG4 = rF.addWindowToG4Seq(sequenceG4,
                                                     dicoLine['WindowSeq'],
                                                     dicoParam['Step'],
                                                     dicoParam['Window'])
                    endG4 = dicoLine['WindowEnd']
                    listeCGcC.append(dicoLine['cGcC'])
                    listeG4Hunter.append(dicoLine['g4H'])
                    listeG4NN.append(dicoLine['g4NN'])
            if (dicoLine['cGcC'] < dicoParam['cGcC'] or
                    dicoLine['g4H'] < dicoParam['g4H'] or
                    dicoLine['g4NN'] < dicoParam['g4NN'] or
                    descriptionOverThreshold != dicoLine['Description']):
                # one of the scores is under its threshold,
                # or this window comes from another junction
                passed = False
                if oldPassed != passed:
                    # the previous window was the last one of the run:
                    # close the pG4 and store it
                    meanCGcC = rF.mean(listeCGcC)
                    meanG4Hunter = rF.mean(listeG4Hunter)
                    meanG4NN = rF.mean(listeG4NN)
                    oldPassed = passed
                    # NOTE(review): 40 and 160 presumably bound the
                    # region around the junction inside the screened
                    # sequence, and "or" makes this test true for almost
                    # any pG4 - confirm the bounds and the operator.
                    if startG4 > 40 or endG4 < 160:
                        # The description of the run is always the right
                        # one: it is equal to the current description
                        # whenever the two do not differ.
                        fields = descriptionOverThreshold.split(':')
                        description = fields[0] + ':junction:' + \
                            ':'.join(fields[2:])
                        if description not in G4DetectedInJunction:
                            G4DetectedInJunction[description] = [
                                str(meanCGcC),
                                str(meanG4Hunter),
                                sequenceG4,
                                str(meanG4NN)
                            ]
    # NOTE(review): a run still open when the file ends is never stored,
    # because only a following window can close it - confirm intended.
    return G4DetectedInJunction