def __init__(self, match, variant_number=None, position=None, open_brackets=0):
    self.match = match
    self.variant_number = variant_number
    self.position = position
    # Accent-stripping approach credit: http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    self.normalized_word = import_helpers.normalize_token(match['word'])
    self.open_brackets = open_brackets
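
The snippet assumes an import_helpers.normalize_token helper that is not shown here. Judging by the credited Stack Overflow question, a minimal sketch might strip accents via Unicode decomposition; the project's actual helper may differ (for example, it may also lowercase or strip punctuation).

# Hypothetical sketch of import_helpers.normalize_token, following the
# accent-stripping recipe from the credited Stack Overflow question.
import unicodedata

def normalize_token(word):
    # Decompose accented characters (NFKD), then drop the combining marks,
    # leaving an accent-free form suitable for comparison.
    decomposed = unicodedata.normalize('NFKD', word)
    return ''.join(c for c in decomposed if not unicodedata.combining(c))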
Example #2
def insertToken(i, placeholderToken):
    """
    Look behind at the previously inserted tokens to see if any of them 
    have the exact same tokenPosition as this token.position; if so, then
    do not insert a new instance of this token, but rather associate a new
    TokenVariantGroup instance to the previously-matched token
    """
    global finalTokenPosition, tokens, bookRefs, chapterRefs, verseRefs, pageRefs
    normalizedMatchedPreviousToken = None

    # If placeholderToken.position is different than previous token, then increment
    if len(previousPlaceholderTokens) and previousPlaceholderTokens[-1].position != placeholderToken.position:
        finalTokenPosition += 1
    previousPlaceholderTokens.append(placeholderToken)

    for previousToken in reversed(tokens):
        # Tokens that have the exact same data or same normalized data, all have same position
        if previousToken.position != finalTokenPosition:
            break
        # If the previous token is exactly the same as this token, then simply add the variantGroup to the existing token
        if placeholderToken.data == previousToken.data:
            vgt = VariantGroupToken(
                token=previousToken, variant_group=placeholderToken.variant_group, certainty=placeholderToken.certainty
            )
            vgt.save()
            updateRefs(i)
            return
        # If a previous token has the same normalized data as this token,
        # remember it as the normalized match
        elif import_helpers.normalize_token(placeholderToken.data) == import_helpers.normalize_token(
            previousToken.data
        ):
            normalizedMatchedPreviousToken = previousToken
        else:
            message = 'Position is same (%s) but data is different "%s" != "%s"!' % (
                finalTokenPosition,
                import_helpers.normalize_token(placeholderToken.data),
                import_helpers.normalize_token(previousToken.data),
            )
            print(message)
            raise Exception(message)

    # Now take the placeholderToken and convert it into a real token and insert it into the database
    # placeholderToken.certainty = None #Handled by variantGroupToken
    placeholderToken.position = finalTokenPosition
    token = placeholderToken.establish()
    tokens.append(token)
    updateRefs(i)
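
insertToken leans on module-level state (tokens, finalTokenPosition, previousPlaceholderTokens) and Django models that are not shown here. As a rough, framework-free sketch of the same look-behind deduplication idea, with plain dicts standing in for model instances (all names below are illustrative):

# Minimal sketch of the look-behind dedup pattern, not the project's actual API.
tokens = []          # previously inserted tokens, in insertion order
final_position = 0   # position assigned to the most recently inserted token

def insert_token(data, variant_group):
    # Walk backwards over tokens sharing the current position; if one has
    # identical data, record another variant group instead of re-inserting.
    for prev in reversed(tokens):
        if prev['position'] != final_position:
            break
        if prev['data'] == data:
            prev['variant_groups'].append(variant_group)
            return prev
    token = {'data': data, 'position': final_position, 'variant_groups': [variant_group]}
    tokens.append(token)
    return token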
Example #3

def __unicode__(self):
    # Alternative rendering that marks uncertainty with brackets:
    # return ("[" * self['certainty']) + self['data'] + ("]" * self['certainty'])
    return import_helpers.normalize_token(self['data'])
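
The commented-out alternative would wrap the data in one pair of brackets per certainty level, an editorial convention for flagging uncertain readings; the normalized form is returned instead, presumably so that unified tokens compare cleanly.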
Example #4
# Normalize all tokens in the first manuscript and insert them as the tokens for the UMS
umsTokens = []
msWork = msWorksUnunified[0]
msWork.unified = umsWork
msWork.save()
msWorksToMerge = msWorksUnunified[1:]
if msWork.base:
    msWorkBase = msWork.base
else:
    msWorkBase = msWork

print(" - " + msWork.title)
tokens = Ref.objects.filter(work=msWorkBase, osis_id=book_code)[0].get_tokens(variant_number=msWork.variant_number)
for token in tokens:
    umsToken = Token(
        data=normalize_token(token.data),
        type=token.type,
        work=umsWork,
        position=0,  # Temporary until the merge completes; the umsToken's index in umsTokens is its true position
    )
    umsToken.save()
    umsTokens.append(umsToken)
    token.unified_token = umsToken
    token.save()

# For each of the remaining MSS, compare its tokens with those inserted into the UMS
for msWork in msWorksToMerge:
    print(" - " + msWork.title)

    msWork.unified = umsWork
    msWork.save()
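
The position=0 placeholder above implies a finishing pass that falls outside this fragment: per the inline comment, each umsToken's index in umsTokens becomes its true position once the merge completes. A hedged sketch of what that pass might look like:

# Hypothetical finishing pass (not shown in the original): persist each
# UMS token's index in umsTokens as its final position after the merge.
for position, umsToken in enumerate(umsTokens):
    umsToken.position = position
    umsToken.save()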
Example #5
    if opcode[0] == "equal":
        token1indices = list(range(opcode[1], opcode[2]))
        token2indices = list(range(opcode[3], opcode[4]))

        while len(token1indices):
            token1index = token1indices.pop(0)
            token2index = token2indices.pop(0)

            # Set the position in both var1 and var2 to be the same
            if isinstance(placeholderTokens[0][token1index], Ref):
                insertRef(0, placeholderTokens[0][token1index])
                insertRef(1, placeholderTokens[1][token2index])
            # Otherwise, insert both placeholderTokens at the same position to indicate that they are alternates of the same token
            else:
                assert import_helpers.normalize_token(
                    placeholderTokens[0][token1index].data
                ) == import_helpers.normalize_token(placeholderTokens[1][token2index].data)

                # Only increment the token position if these equal tokens are at a different
                # position than the previously seen token in either variant
                if (
                    len(previousMergedPlaceholderPositions[0])
                    and previousMergedPlaceholderPositions[0][-1] != placeholderTokens[0][token1index].position
                ) or (
                    len(previousMergedPlaceholderPositions[1])
                    and previousMergedPlaceholderPositions[1][-1] != placeholderTokens[1][token2index].position
                ):
                    tokenPosition += 1

                previousMergedPlaceholderPositions[0].append(placeholderTokens[0][token1index].position)
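
The opcode tuples consumed above match the format produced by difflib.SequenceMatcher.get_opcodes(); the call that generates them falls outside this fragment, so the inputs below are purely illustrative:

# Sketch of where opcodes like ("equal", i1, i2, j1, j2) come from.
import difflib

tokens1 = ["in", "the", "beginning", "was"]
tokens2 = ["in", "that", "beginning", "was"]

for opcode in difflib.SequenceMatcher(None, tokens1, tokens2).get_opcodes():
    # Each opcode is (tag, i1, i2, j1, j2): tag is one of "equal",
    # "replace", "insert", or "delete"; the index pairs slice each list.
    print(opcode)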