예제 #1
0
def getG4(G4DetectedInGene, inputfile, dicoParam):
    """Merge consecutive over-threshold windows of a gene into predicted G4s.

    Reads the G4RNA Screener output line by line. A window is "over the
    thresholds" when its cGcC, g4H and g4NN scores all reach the values
    given in ``dicoParam``. Consecutive such windows are merged into one
    pG4 (sequence, coordinates and per-score lists are accumulated). The
    run is closed by the first following window that is under a threshold
    or that carries a different description; the pG4 is then stored with
    the mean of each score. Two runs separated by a single under-threshold
    window are therefore reported as two pG4.

    NOTE(review): a run that is still open when the file ends is never
    stored, because only a later window closes it -- confirm this is
    intended.

    :param G4DetectedInGene: pG4 with its scores and sequence, predicted in
        genes. Updated in place.
    :type G4DetectedInGene: dictionary
    :param inputfile: name of the output file of G4RNA Screener.
    :type inputfile: string
    :param dicoParam: parameters given to G4RNA Screener; the keys 'cGcC',
        'g4H', 'g4NN', 'Step' and 'Window' are read here.
    :type dicoParam: dictionary

    :returns: G4DetectedInGene, mapping each pG4 id to
        [mean cGcC, mean g4H, sequence, mean g4NN] (scores as strings).
    :rtype: dictionary
    """
    oldPassed = False  # state of the previous window
    passed = False  # True if the current window is over the thresholds
    # Initialised up front (as the sibling functions do) so the comparison
    # in the closing test can never hit an unbound name, e.g. when a window
    # has a score that is neither >= nor < its threshold.
    descriptionOverThreshold = ''
    # The context manager closes the file even if a helper raises.
    with open(inputfile, 'r') as screenerFile:
        for line in screenerFile:
            if not re.search('^[0-9]', line):
                # header line (does not start with a digit)
                continue
            dicoLine = readLineG4ScreenerLoc(line)
            if (dicoLine['cGcC'] >= dicoParam['cGcC']
                    and dicoLine['g4H'] >= dicoParam['g4H']
                    and dicoLine['g4NN'] >= dicoParam['g4NN']):
                # window over the thresholds
                passed = True
                if oldPassed != passed:
                    # first window of a new run
                    descriptionOverThreshold = dicoLine['Description']
                    sequenceG4 = dicoLine['WindowSeq']
                    oldPassed = passed
                    if dicoLine['Strand'] == '1':
                        startG4 = dicoLine['WindowStart']
                        endG4 = dicoLine['WindowEnd']
                    else:
                        # other strand: coordinates are mirrored inside
                        # the location, so start > end
                        startG4 = dicoLine['locationEnd'] - (
                            dicoLine['WindowStart']
                            - dicoLine['locationStart'])
                        endG4 = startG4 - len(dicoLine['WindowSeq']) + 1
                    listeCGcC = [dicoLine['cGcC']]
                    listeG4Hunter = [dicoLine['g4H']]
                    listeG4NN = [dicoLine['g4NN']]
                else:
                    # run already open: extend it with this window
                    sequenceG4 = rF.addWindowToG4Seq(sequenceG4,
                                                     dicoLine['WindowSeq'],
                                                     dicoParam['Step'],
                                                     dicoParam['Window'])
                    if dicoLine['Strand'] == '1':
                        endG4 = dicoLine['WindowEnd']
                    else:
                        endG4 = startG4 - len(sequenceG4) + 1
                    listeCGcC.append(dicoLine['cGcC'])
                    listeG4Hunter.append(dicoLine['g4H'])
                    listeG4NN.append(dicoLine['g4NN'])
            if (dicoLine['cGcC'] < dicoParam['cGcC']
                    or dicoLine['g4H'] < dicoParam['g4H']
                    or dicoLine['g4NN'] < dicoParam['g4NN']
                    or descriptionOverThreshold != dicoLine['Description']):
                # one score is under its threshold,
                # or this window belongs to another gene
                passed = False
                if oldPassed != passed:
                    # the open run ends here: store it
                    oldPassed = passed
                    meanCGcC = rF.mean(listeCGcC)
                    meanG4Hunter = rF.mean(listeG4Hunter)
                    meanG4NN = rF.mean(listeG4NN)
                    # The id always uses the description of the run. When
                    # the current line has the same description the two
                    # values are equal, so no branching is needed.
                    headerG4 = rF.createIdpG4rShuffle(
                        descriptionOverThreshold, startG4, endG4,
                        dicoLine['Strand'])
                    if (headerG4 not in G4DetectedInGene
                            and dicoLine['Strand']):
                        G4DetectedInGene[headerG4] = [
                            str(meanCGcC),
                            str(meanG4Hunter), sequenceG4,
                            str(meanG4NN)
                        ]
    return G4DetectedInGene
def ReturnG4InCircu(G4Detected, inputfile, dicoParam):
    """Merge consecutive over-threshold windows of circular RNAs into pG4s.

    Reads the input file line by line and keeps only the lines that start
    with a digit and have exactly 8 tab-separated fields. A window is
    "over the thresholds" when its cGcC, g4H and g4NN scores all reach the
    values given in ``dicoParam``. Consecutive such windows are merged into
    one pG4; the run is closed by the first following window that is under
    a threshold or that carries a different description. Two runs separated
    by a single under-threshold window are therefore reported as two pG4.

    When a closed pG4 starts inside the circular RNA but ends beyond its
    boundary, its end coordinate is wrapped around to the other extremity,
    it is labelled 'junction' and it always (over)writes its entry.
    Otherwise it is labelled 'ExonNC' and stored only if its id is new.

    NOTE(review): a run that is still open when the file ends is never
    stored, because only a later window closes it -- confirm this is
    intended.

    :param G4Detected: pG4r with its scores and sequence, predicted in
        genes. Updated in place.
    :type G4Detected: dictionary
    :param inputfile: name of the file containing the list of
        information for each circular RNA.
    :type inputfile: string
    :param dicoParam: parameters given to G4RNA Screener; the keys 'cGcC',
        'g4H', 'g4NN', 'Step' and 'Window' are read here.
    :type dicoParam: dictionary

    :returns: G4Detected, mapping each pG4 id to [mean cGcC, mean g4H,
        sequence, mean g4NN, location, rnaType] (scores as strings).
    :rtype: dictionary
    """
    rnaType = 'circu'
    oldPassed = False  # state of the previous window
    passed = False  # True if the current window is over the thresholds
    descriptionOverThreshold = ''
    # The context manager closes the file even if a helper raises.
    with open(inputfile, "r") as circuFile:
        for line in circuFile:
            if not re.search('^[0-9]', line):
                continue
            words = line.split('\t')
            if len(words) != 8:
                # lines without exactly 8 fields are ignored
                continue
            dL = readLineCircu(words)
            if (dL['cGcC'] >= dicoParam['cGcC']
                    and dL['g4H'] >= dicoParam['g4H']
                    and dL['g4NN'] >= dicoParam['g4NN']):
                passed = True
                if oldPassed != passed:
                    # first window of a new run
                    descriptionOverThreshold = dL['Description']
                    sequenceG4 = dL['Sequence']
                    oldPassed = passed
                    if dL['Strand'] == '+':
                        startG4 = dL['wStart']
                        endG4 = dL['wEnd']
                    else:
                        # other strand: coordinates are mirrored, so
                        # start > end
                        startG4 = dL['circuEnd'] - dL['wStart'] + 1
                        endG4 = startG4 - len(dL['Sequence']) + 1
                    listeCGcC = [dL['cGcC']]
                    listeG4Hunter = [dL['g4H']]
                    listeG4NN = [dL['g4NN']]
                else:
                    # run already open: extend it with this window
                    sequenceG4 = rF.addWindowToG4Seq(
                        sequenceG4, dL['Sequence'], dicoParam['Step'],
                        dicoParam['Window'])
                    if dL['Strand'] == '+':
                        endG4 = dL['wEnd']
                    else:
                        endG4 = startG4 - len(sequenceG4) + 1
                    listeCGcC.append(dL['cGcC'])
                    listeG4Hunter.append(dL['g4H'])
                    listeG4NN.append(dL['g4NN'])
            if (dL['cGcC'] < dicoParam['cGcC']
                    or dL['g4H'] < dicoParam['g4H']
                    or dL['g4NN'] < dicoParam['g4NN']
                    or descriptionOverThreshold != dL['Description']):
                # one score is under its threshold,
                # or this window belongs to another description
                passed = False
                if oldPassed != passed:
                    # the open run ends here: store it
                    oldPassed = passed
                    meanCGcC = rF.mean(listeCGcC)
                    meanG4Hunter = rF.mean(listeG4Hunter)
                    meanG4NN = rF.mean(listeG4NN)
                    headerG4 = rF.createIdG4(dL['circuId'], startG4, endG4,
                                             dL['Strand'])
                    # The pG4 spans the junction when it starts inside the
                    # circular RNA and ends past the boundary lying in its
                    # reading direction.
                    startsInside = (dL['circuStart'] < startG4
                                    and startG4 < dL['circuEnd'])
                    if dL['Strand'] == '+':
                        overJunction = (startsInside
                                        and endG4 > dL['circuEnd'])
                    else:
                        overJunction = (startsInside
                                        and endG4 < dL['circuStart'])
                    if overJunction:
                        # wrap the overhanging end around the circle
                        if dL['Strand'] == '+':
                            endG4 = dL['circuStart'] + \
                                (endG4 - dL['circuEnd'])
                        else:
                            endG4 = dL['circuEnd'] - \
                                (dL['circuStart'] - endG4)
                        headerG4 = rF.createIdG4(dL['circuId'], startG4,
                                                 endG4, dL['Strand'])
                        G4Detected[headerG4] = [
                            str(meanCGcC),
                            str(meanG4Hunter), sequenceG4,
                            str(meanG4NN), 'junction', rnaType
                        ]
                    elif (headerG4 not in G4Detected
                          and dL['Strand'] is not None):
                        # `in` replaces dict.has_key(), which no longer
                        # exists in Python 3
                        G4Detected[headerG4] = [
                            str(meanCGcC),
                            str(meanG4Hunter), sequenceG4,
                            str(meanG4NN), 'ExonNC', rnaType
                        ]
    return G4Detected
예제 #3
0
def returnG4InJunction(G4DetectedInJunction, inputfile, dicoParam):
    """Merge consecutive over-threshold junction windows into pG4s.

    Reads the G4RNA Screener output line by line. A window is "over the
    thresholds" when its cGcC, g4H and g4NN scores all reach the values
    given in ``dicoParam`` and its 'WindowStart' is truthy. Consecutive
    such windows are merged into one pG4; the run is closed by the first
    following window that is under a threshold or that carries a different
    description. Two runs separated by a single under-threshold window are
    therefore reported as two pG4.

    A closed pG4 is stored under the description of its run with the second
    ':'-separated field replaced by 'junction', unless that key is already
    present.

    NOTE(review): a run that is still open when the file ends is never
    stored, because only a later window closes it -- confirm this is
    intended.

    :param G4DetectedInJunction: pG4 with its scores and sequence, those
        pG4 are predicted in junctions. Updated in place.
    :type G4DetectedInJunction: dictionary
    :param inputfile: name of the output file of G4RNA Screener.
    :type inputfile: string
    :param dicoParam: parameters given to G4RNA Screener; the keys 'cGcC',
        'g4H', 'g4NN', 'Step' and 'Window' are read here.
    :type dicoParam: dictionary

    :returns: G4DetectedInJunction, mapping each description to
        [mean cGcC, mean g4H, sequence, mean g4NN] (scores as strings).
    :rtype: dictionary
    """
    oldPassed = False  # state of the previous window
    passed = False  # True if the current window is over the thresholds
    descriptionOverThreshold = ''
    # The context manager closes the file even if a helper raises.
    with open(inputfile, 'r') as screenerFile:
        for line in screenerFile:
            if not re.search('^[0-9]', line):
                # header line (does not start with a digit)
                continue
            dicoLine = readLineG4ScreenerJun(line, 'Junction')
            if (dicoLine['cGcC'] >= dicoParam['cGcC']
                    and dicoLine['g4H'] >= dicoParam['g4H']
                    and dicoLine['g4NN'] >= dicoParam['g4NN']
                    and dicoLine['WindowStart']):
                # window over the thresholds
                passed = True
                if oldPassed != passed:
                    # first window of a new run
                    descriptionOverThreshold = dicoLine['Description']
                    sequenceG4 = dicoLine['WindowSeq']
                    oldPassed = passed
                    startG4 = dicoLine['WindowStart']
                    endG4 = dicoLine['WindowEnd']
                    listeCGcC = [dicoLine['cGcC']]
                    listeG4Hunter = [dicoLine['g4H']]
                    listeG4NN = [dicoLine['g4NN']]
                else:
                    # run already open: extend it with this window
                    sequenceG4 = rF.addWindowToG4Seq(sequenceG4,
                                                     dicoLine['WindowSeq'],
                                                     dicoParam['Step'],
                                                     dicoParam['Window'])
                    endG4 = dicoLine['WindowEnd']
                    listeCGcC.append(dicoLine['cGcC'])
                    listeG4Hunter.append(dicoLine['g4H'])
                    listeG4NN.append(dicoLine['g4NN'])
            if (dicoLine['cGcC'] < dicoParam['cGcC']
                    or dicoLine['g4H'] < dicoParam['g4H']
                    or dicoLine['g4NN'] < dicoParam['g4NN']
                    or descriptionOverThreshold != dicoLine['Description']):
                # one score is under its threshold,
                # or this window belongs to another description
                passed = False
                if oldPassed != passed:
                    # the open run ends here: store it
                    oldPassed = passed
                    meanCGcC = rF.mean(listeCGcC)
                    meanG4Hunter = rF.mean(listeG4Hunter)
                    meanG4NN = rF.mean(listeG4NN)
                    # NOTE(review): this test is only false when
                    # startG4 <= 40 and endG4 >= 160; presumably it is
                    # meant to relate the pG4 to the junction position --
                    # confirm that `or` is the intended operator.
                    if startG4 > 40 or endG4 < 160:
                        # The key always derives from the description of
                        # the run. When the current line has the same
                        # description the two values are equal, so no
                        # branching is needed.
                        fields = descriptionOverThreshold.split(':')
                        description = (fields[0] + ':junction:'
                                       + ':'.join(fields[2:]))
                        if description not in G4DetectedInJunction:
                            G4DetectedInJunction[description] = [
                                str(meanCGcC),
                                str(meanG4Hunter), sequenceG4,
                                str(meanG4NN)
                            ]
    return G4DetectedInJunction