# Example #1
0
def checkActorActeeNotReal(dataLines):
    '''
    Finds ad-lib lines whose actor or actee does not look like a real name.

    An ad-lib line is reported when the value in field 5 or field 7 (the
    actor/actee fields) is a known placeholder code, or when either value is
    not exactly 3 characters long.  The length test means that unexpected
    placeholder-like values are caught even if nobody listed them in advance.

    The placeholder codes used here are the keys of constants.unknSnames
    together with constants.unnamedCodes.  This is a larger set than the one
    used by checkNeighborNotReal, because some of these values are acceptable
    as neighbors.  See Babase documentation.

    dataLines is a list of lists of strings, presumed to be all the data from
    a file, stripped and split.

    Returns a list of lists of strings: the offending ad-lib lines, in the
    order they appear in dataLines.
    '''
    from constants import unknSnames, unnamedCodes, adlibAbbrev

    # Every code that stands in for "no real individual"
    placeholders = set(unknSnames.keys())
    placeholders.update(unnamedCodes)

    def looksFake(row):
        # Tests run in this order on purpose: a placeholder in field 5 is
        # reported without ever looking at field 7.
        if row[5] in placeholders:
            return True
        if row[7] in placeholders:
            return True
        return not (len(row[5]) == 3 and len(row[7]) == 3)

    # First pass: keep only the ad-lib lines
    adlibRows = []
    for row in dataLines:
        if isType(row, adlibAbbrev):
            adlibRows.append(row)

    # Second pass: keep only the ad-lib lines with a suspicious name
    flagged = []
    for row in adlibRows:
        if looksFake(row):
            flagged.append(row)
    return flagged
def checkActorActeeNotReal(dataLines):
    '''
    Finds ad-lib lines whose actor or actee is a placeholder value.

    An ad-lib line is reported when the value in field 5 or field 7 (the
    actor/actee fields) is one of the known placeholder codes: the keys of
    constants.unknSnames together with constants.unnamedCodes.

    This is a larger set of placeholders than the one used by
    checkNeighborNotReal, because some of these values are acceptable as
    neighbors.  See Babase documentation.

    dataLines is a list of lists of strings, presumed to be all the data from
    a file, stripped and split.

    Returns a list of lists of strings: the offending ad-lib lines, in the
    order they appear in dataLines.
    '''
    from constants import unknSnames, unnamedCodes, adlibAbbrev

    # Every code that stands in for "no real individual"
    placeholders = set(unknSnames.keys())
    placeholders.update(unnamedCodes)

    # First pass: keep only the ad-lib lines
    adlibRows = []
    for row in dataLines:
        if isType(row, adlibAbbrev):
            adlibRows.append(row)

    # Second pass: keep the ones naming a placeholder.  Field 5 is tested
    # first, so field 7 is only read when field 5 is not a placeholder.
    flagged = []
    for row in adlibRows:
        if row[5] in placeholders:
            flagged.append(row)
        elif row[7] in placeholders:
            flagged.append(row)
    return flagged
def checkUniqueNeighbors(dataLines, sampleProtocols):
    '''
    Checks all the recorded neighbors for each point collected during
    sampleProtocols samples to make sure that the list of neighbors is unique.
    Ideally, only check juvenile samples because the adult female protocol
    allows for some redundancy.

    When considering uniqueness of neighbors, placeholder names (any names in
    constants.unknSnames) are ignored.

    Neighbor lines that show up inside a focal of interest before any point
    line has been read are grouped under a '(MISSING POINT LINE)' entry and
    checked for uniqueness like any other point.

    dataLines is a list of lists of strings, presumed to be all the data from a
    file, stripped and split. sampleProtocols is a list of strings.

    Returns a list of lists of strings: for each point with non-unique
    neighbors, the point line followed by all of its neighbor lines.  Groups
    are separated by an empty list (written out as a blank line) and are
    ordered by the text of the point line, so the report is the same from one
    run to the next.
    '''
    from constants import unknSnames, focalAbbrev, pntAbbrev, neighborAbbrev
    
    # Make placeholders for iteration.
    # Holds the last-read "header" line, but only for focals of the type(s) allowed by sampleProtocols
    currentHeader = []
    # Holds the last-read "point" line, but only if it was in an allowed focal
    currentPoint = []
    
    # Dictionary of points and neighbors.
    myPnts = {}
    #   Key: the point line--joined as a string
    #   Value: list of the neighbor lines (as lists of strings) for the point
    
    # Key for dictionary when neighbors are missing a point line
    missingPntKey = '(MISSING POINT LINE)'
    myPnts[missingPntKey] = []
    
    for line in dataLines:
        if line[0] not in [focalAbbrev, pntAbbrev, neighborAbbrev]:
            # Then we don't care about it for this question
            continue
        elif isType(line, focalAbbrev):
            # A new focal always discards the previous point
            if line[6] in sampleProtocols: # This is a focal sample of interest
                currentHeader = line[:]
                currentPoint = []
            else: # We don't care about any of the data in this focal
                currentHeader = []
                currentPoint = []
                continue
        # All that's left are points and neighbors. If currentHeader is empty, then we don't care about any of these.
        elif currentHeader == []:
            continue
        # Only points and neighbors that actually happened during focals of interest are left
        elif isType(line, pntAbbrev):
            if sameDate(line, currentHeader): # This should always be true, but added here just in case
                currentPoint = line[:]
                myPnts['\t'.join(currentPoint)] = []
        elif isType(line, neighborAbbrev):
            if currentPoint == []: # This should only happen if the observer messed up somewhere else
                myPnts[missingPntKey].append(line)
            elif sameDate(line, currentPoint):
                myPnts['\t'.join(currentPoint)].append(line)
    
    # Names that are allowed to be nonunique.  Built once as a set so each
    # membership test below is a hash lookup.
    fakeNames = set(unknSnames.keys())
    
    # Make list to hold the point and neighbor lines with nonunique neighbors
    nonUniqueNeighbors = []
    
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    # Sorting by the point-line text gives a reproducible report order; the
    # keys are unique, so the neighbor lists are never compared.
    for (point, neighbors) in sorted(myPnts.items()):
        # Collect the real (non-placeholder) neighbor names for this point
        nghNames = [neighbor[7] for neighbor in neighbors
                    if neighbor[7] not in fakeNames]
        if len(nghNames) > len(set(nghNames)): # Then 1 or more neighbors is redundant
            if len(nonUniqueNeighbors) > 0: # For every instance after the first, add a newline first
                nonUniqueNeighbors.append([]) # When this is output to file, a newline will be added
            nonUniqueNeighbors.append(point.split('\t'))
            nonUniqueNeighbors.extend(neighbors)

    return nonUniqueNeighbors
# Example #4
0
def checkUniqueNeighbors(dataLines, sampleProtocols):
    '''
    Reports points, taken during focal samples whose protocol is listed in
    sampleProtocols, that have the same neighbor recorded more than once.
    Ideally this is only run on juvenile samples, because the adult female
    protocol allows for some redundancy.

    Placeholder names (any names in constants.unknSnames) are left out when
    judging uniqueness.

    A point with more than three neighbors probably has repeated neighbors
    too, but uniqueness is not the real problem there, so such points are
    skipped and never returned by this function.

    Neighbor lines that appear inside a focal of interest before any point
    line has been read are gathered under a '(MISSING POINT LINE)' entry and
    judged like any other point.

    dataLines is a list of lists of strings, presumed to be all the data from
    a file, stripped and split. sampleProtocols is a list of strings.

    Returns a list of lists of strings: for each offending point, the point
    line followed by all of its neighbor lines.  Groups are separated by an
    empty list and appear in sorted order of the point-line text.
    '''
    from constants import unknSnames, focalAbbrev, pntAbbrev, neighborAbbrev

    relevantTypes = (focalAbbrev, pntAbbrev, neighborAbbrev)

    # State carried from one line to the next:
    #   activeFocal -- copy of the latest header line, or [] when the latest
    #                  focal is not one of the protocols of interest
    #   activePoint -- copy of the latest accepted point line, or [] if none
    #   pointKey    -- activePoint joined with tabs (its key in byPoint);
    #                  only read while activePoint is non-empty
    activeFocal = []
    activePoint = []
    pointKey = None

    # Maps each point line (joined as one string) to the list of neighbor
    # lines (lists of strings) recorded for it.
    missingPntKey = '(MISSING POINT LINE)'
    byPoint = {missingPntKey: []}

    for row in dataLines:
        if row[0] not in relevantTypes:
            # Not a header, point or neighbor: irrelevant here
            continue

        if isType(row, focalAbbrev):
            # A new focal always forgets the previous point
            activePoint = []
            if row[6] in sampleProtocols:
                activeFocal = row[:]
            else:
                activeFocal = []
            continue

        if not activeFocal:
            # Point or neighbor belonging to a focal we are ignoring
            continue

        if isType(row, pntAbbrev):
            # The date should always match; checked just in case
            if sameDate(row, activeFocal):
                activePoint = row[:]
                pointKey = '\t'.join(activePoint)
                byPoint[pointKey] = []
            continue

        if isType(row, neighborAbbrev):
            if not activePoint:
                # Only expected when the observer slipped up elsewhere
                byPoint[missingPntKey].append(row)
            elif sameDate(row, activePoint):
                byPoint[pointKey].append(row)

    # Names that may legitimately repeat
    allowedRepeats = unknSnames.keys()

    report = []
    for (pointText, nghLines) in sorted(byPoint.items()):
        if len(nghLines) > 3:
            # Too many neighbors: a bigger problem than uniqueness, so this
            # point is left for some other check.
            continue

        realNames = [ngh[7] for ngh in nghLines if ngh[7] not in allowedRepeats]
        if len(set(realNames)) >= len(realNames):
            # Every real name occurs once; nothing to report
            continue

        if report:
            # Empty list between groups becomes a blank line on output
            report.append([])
        report.append(pointText.split('\t'))
        report.extend(nghLines)

    return report