def __init__(self, match, variant_number=None, position=None, open_brackets=0):
    """Wrap a single regex word match as a placeholder token.

    Stores the raw match object and its bookkeeping fields, and
    precomputes an accent-stripped copy of the matched word so later
    comparisons don't have to re-normalize.
    """
    self.match = match
    self.variant_number = variant_number
    self.position = position
    self.open_brackets = open_brackets
    # Accent-stripping approach credited to:
    # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    self.normalized_word = import_helpers.normalize_token(match['word'])
def insertToken(i, placeholderToken):
    """Insert *placeholderToken* into the unified token stream, merging duplicates.

    Look behind at the previously inserted tokens to see if any of them have
    the exact same tokenPosition as this token.position; if so, then do not
    insert a new instance of this token, but rather associate a new
    TokenVariantGroup instance to the previously-matched token.

    i -- index passed through to updateRefs (presumably identifies which
         variant stream this token came from; confirm against updateRefs)
    placeholderToken -- a placeholder with .data, .position, .variant_group,
         .certainty and an .establish() method that persists it as a real token
    """
    # Module-level state: current unified position counter, accumulated real
    # tokens, and ref bookkeeping used by updateRefs.
    global finalTokenPosition, tokens, bookRefs, chapterRefs, verseRefs, pageRefs
    normalizedMatchedPreviousToken = None
    # If placeholderToken.position is different than previous token, then increment
    if len(previousPlaceholderTokens) and previousPlaceholderTokens[-1].position != placeholderToken.position:
        finalTokenPosition += 1
    previousPlaceholderTokens.append(placeholderToken)
    # Scan backwards over already-inserted tokens; everything sharing
    # finalTokenPosition is a candidate duplicate of this token.
    for previousToken in reversed(tokens):
        # Tokens that have the exact same data or same normalized data, all have same position
        if previousToken.position != finalTokenPosition:
            break
        # If the previous token is exactly the same as this token, then simply
        # add the variantGroup to the existing token (no new token row).
        if (placeholderToken.data) == (previousToken.data):
            vgt = VariantGroupToken(
                token=previousToken,
                variant_group=placeholderToken.variant_group,
                certainty=placeholderToken.certainty
            )
            vgt.save()  # persists the association (Django-style .save())
            updateRefs(i)
            return  # merged into an existing token; nothing more to insert
        # If a previous token has the same normalized (accent-stripped) data
        # as this token, remember it.
        # NOTE(review): normalizedMatchedPreviousToken is never read later in
        # this function as visible here — confirm whether it is dead code.
        elif (import_helpers.normalize_token(placeholderToken.data)) == (
            import_helpers.normalize_token(previousToken.data)
        ):
            normalizedMatchedPreviousToken = previousToken
        else:
            # Same position but neither raw nor normalized data matches:
            # treated as a fatal inconsistency in the merge input.
            print 'Position is same (%s) but data is different "%s" != "%s"!' % (
                finalTokenPosition,
                import_helpers.normalize_token(placeholderToken.data),
                import_helpers.normalize_token(previousToken.data),
            )
            raise Exception(
                'Position is same (%s) but data is different "%s" != "%s"!'
                % (
                    finalTokenPosition,
                    import_helpers.normalize_token(placeholderToken.data),
                    import_helpers.normalize_token(previousToken.data),
                )
            )
    # Now take the placeholderToken and convert it into a real token and insert it into the database
    # placeholderToken.certainty = None #Handled by variantGroupToken
    placeholderToken.position = finalTokenPosition
    token = placeholderToken.establish()
    tokens.append(token)
    updateRefs(i)
def __unicode__(self):
    """Unicode representation: this token's data with accents stripped."""
    normalized = import_helpers.normalize_token(self['data'])
    return normalized
# Normalize all tokens in the first manuscript and insert them as the tokens for the UMS umsTokens = [] msWork = msWorksUnunified[0] msWork.unified = umsWork msWork.save() msWorksToMerge = msWorksUnunified[1:] if(msWork.base): msWorkBase = msWork.base else: msWorkBase = msWork print " - " + msWork.title tokens = Ref.objects.filter(work=msWorkBase, osis_id=book_code)[0].get_tokens(variant_number = msWork.variant_number) for token in tokens: umsToken = Token( data = normalize_token(token.data), type = token.type, work = umsWork, position = 0 #temporary until merge has been completed; the umsToken's index in umsTokens is its true position ) umsToken.save() umsTokens.append(umsToken) token.unified_token = umsToken token.save() # Foreach of the MSS then compare with the tokens inserted into the UMS for msWork in msWorksToMerge: print " - " + msWork.title msWork.unified = umsWork msWork.save()
# Handle an "equal" opcode from a diff between two placeholder-token streams:
# both sides matched, so emit each pair at the same merged position.
# NOTE(review): this fragment is truncated in this excerpt — the symmetric
# previousMergedPlaceholderPositions[1].append(...) presumably follows.
if opcode[0] == "equal":
    # Index ranges of the matched region in each stream (difflib-style opcode:
    # (tag, i1, i2, j1, j2) — presumably; confirm against the diff producer).
    token1indicies = range(opcode[1], opcode[2])
    token2indicies = range(opcode[3], opcode[4])
    # Walk the two equal-length ranges in lockstep.
    while len(token1indicies):
        token1index = token1indicies.pop(0)
        token2index = token2indicies.pop(0)
        # Set the position in both var1 and var2 to be the same
        if isinstance(placeholderTokens[0][token1index], Ref):
            insertRef(0, placeholderTokens[0][token1index])
            insertRef(1, placeholderTokens[1][token2index])
        # Otherwise, insert both placeholderTokens but at the same position to
        # indicate that they are alternates of the same token.
        else:
            # Equal opcodes should only pair tokens whose normalized data agrees.
            assert import_helpers.normalize_token(
                placeholderTokens[0][token1index].data
            ) == import_helpers.normalize_token(placeholderTokens[1][token2index].data)
            # Only increment the token position if these equal tokens are a
            # different position than the previously seen token (checked on
            # either side, but incremented at most once).
            if (
                len(previousMergedPlaceholderPositions[0])
                and previousMergedPlaceholderPositions[0][-1]
                != placeholderTokens[0][token1index].position
            ):
                tokenPosition += 1
            elif (
                len(previousMergedPlaceholderPositions[1])
                and previousMergedPlaceholderPositions[1][-1]
                != placeholderTokens[1][token2index].position
            ):
                tokenPosition += 1
            previousMergedPlaceholderPositions[0].append(placeholderTokens[0][token1index].position)
def __init__(self, match, variant_number=None, position=None, open_brackets=0):
    """Initialize from a word-level regex match.

    Keeps the match object and its positional bookkeeping, along with a
    precomputed accent-stripped copy of the matched word.
    """
    self.match = match
    self.variant_number = variant_number
    self.position = position
    self.open_brackets = open_brackets
    self.normalized_word = import_helpers.normalize_token(match["word"])