def GetScore(Text, Lexicon, Rules=None, verbose=False, hyphen=False):
    """Score the tokens of Text against Lexicon (and optional substitution Rules).

    Tokens are split into capitalized vs. lower-case groups; each group is
    scored for exact lexicon matches and, when rules are supplied, for valid
    substitution matches.

    Parameters:
        Text    -- raw text, in whatever form TokenGen expects.
        Lexicon -- container supporting `in` membership tests for known words.
        Rules   -- optional container of substitution rules; omitted/None
                   means no rules (fixes the original mutable-default pitfall
                   of `Rules=set()` while staying call-compatible).
        verbose -- when True, print per-category counts.
        hyphen  -- when True, use the hyphen-aware tokenizer to catch
                   fragmented matches; otherwise use the basic tokenizer.

    Returns:
        (CapsCount, CapsMatch, CapsSub, LowCount, LowMatch, LowSub)
    """
    # Replace the mutable default argument with an explicit sentinel.
    if Rules is None:
        Rules = set()

    CapsMatch = CapsSub = CapsCount = 0
    LowMatch = LowSub = LowCount = 0

    if verbose:
        print("Attempting token matching")
        if Rules == set():
            print("No substitution rules loaded")

    ## If not asked to check for possible fragmented matches, use basic checker.
    ## Function will default to basic checker.
    if not hyphen:
        Tokens = TokenGen.Basic(Text, verbose)
    else:
        Tokens = TokenGen.Hyphen(Text, Lexicon, Rules, verbose)

    ## Maintains separate scores for substitution if rules were passed in, as well
    ## as separate scores for capitals, lowercase.
    for word in Tokens:
        if word.islower():
            LowCount += 1
            if word in Lexicon:
                LowMatch += 1
            elif Rules and word in Rules:
                LowSub += 1
        else:
            CapsCount += 1
            if word in Lexicon:
                CapsMatch += 1
            elif Rules and word in Rules:
                CapsSub += 1

    if verbose:
        print("\t" + str(CapsCount) + " total capitalized tokens")
        print("\t" + str(CapsMatch) + " total capitalized dictionary matches")
        print("\t" + str(CapsSub) + " total capitalized valid substitutions")
        print("\t" + str(LowCount) + " total lower-case tokens")
        print("\t" + str(LowMatch) + " total lower-case dictionary matches")
        print("\t" + str(LowSub) + " total lower-case valid substitutions\n")

    ## Return the six scores as a tuple.
    return (CapsCount, CapsMatch, CapsSub, LowCount, LowMatch, LowSub)
def GetScore(Text, Lexicon, Rules=None, verbose=False, hyphen=False):
    """Score the tokens of Text against Lexicon (and optional substitution Rules).

    Tokens are produced by the hyphen-breaking tokenizer and looked up in
    lower-cased form; counting as "capitalized" is decided by the FIRST
    character only, so tokens like "wiU" (OCR for "will") are not counted
    as capitalized.

    Parameters:
        Text    -- raw text, in whatever form TokenGen expects.
        Lexicon -- container supporting `in` membership tests for known
                   lower-case words.
        Rules   -- optional container of substitution rules; omitted/None
                   means no rules (fixes the original mutable-default pitfall
                   of `Rules=set()` while staying call-compatible).
        verbose -- when True, print per-category counts.
        hyphen  -- accepted for interface compatibility; this variant always
                   uses TokenGen.break_hyphens.

    Returns:
        (CapsCount, CapsMatch, CapsSub, LowCount, LowMatch, LowSub)
    """
    # Replace the mutable default argument with an explicit sentinel.
    if Rules is None:
        Rules = set()

    CapsMatch = CapsSub = CapsCount = 0
    LowMatch = LowSub = LowCount = 0

    if verbose:
        print("Attempting token matching")
        if Rules == set():
            print("No substitution rules loaded")

    ## This variant always breaks hyphenated tokens before matching.
    Tokens = TokenGen.break_hyphens(Text, Lexicon, Rules, verbose)

    ## Maintains separate scores for substitution if rules were passed in, as well
    ## as separate scores for capitals, lowercase. Note that tokens like
    ## "wiU" (will) are not counted as capitalized.
    for word in Tokens:
        # NOTE(review): word[0] assumes tokens are non-empty strings —
        # presumably guaranteed by TokenGen; confirm.
        LowerWord = word.lower()
        if word[0].islower():
            LowCount += 1
            if LowerWord in Lexicon:
                LowMatch += 1
            elif Rules and LowerWord in Rules:
                LowSub += 1
        else:
            CapsCount += 1
            if LowerWord in Lexicon:
                CapsMatch += 1
            elif Rules and LowerWord in Rules:
                CapsSub += 1

    if verbose:
        print("\t" + str(CapsCount) + " total capitalized tokens")
        print("\t" + str(CapsMatch) + " total capitalized dictionary matches")
        print("\t" + str(CapsSub) + " total capitalized valid substitutions")
        print("\t" + str(LowCount) + " total lower-case tokens")
        print("\t" + str(LowMatch) + " total lower-case dictionary matches")
        print("\t" + str(LowSub) + " total lower-case valid substitutions\n")

    ## Return the six scores as a tuple.
    return (CapsCount, CapsMatch, CapsSub, LowCount, LowMatch, LowSub)
# NOTE(review): this span is a fragment — the `continue` below targets an
# enclosing loop (presumably over HTIDlist) whose header is outside this
# view, and the SortIndex/WriteIndex tail is presumably post-loop code.
# Confirm loop boundaries against the full file before restructuring.

# Resolve one volume ID to its pairtree path and text-file name.
IDtoprocess = HTIDlist[index].strip()
filepath, postfix = FileCabinet.pairtreepath(IDtoprocess, datapath)
filename = filepath + postfix + '/' + postfix + ".txt"

# Read the volume; a missing file is reported and skipped, not fatal.
# (`e` is bound but unused — candidate for cleanup.)
try:
    with open(filename, encoding='utf-8') as file:
        lines = file.readlines()
        successflag = True
except IOError as e:
    successflag = False

if not successflag:
    print(IDtoprocess + " is missing.")
    continue

# Tokenize (keeping hyphenated forms), estimate the volume's accuracy,
# extract its types, and fold them into the running BigIndex.
tokens = TokenGen.keep_hyphens(lines, Lexicon, verbose=debug)
volacc = TypeIndex.GetAcc(tokens, Lexicon, debug)
types = TypeIndex.GetTypes(tokens, verbose=debug)
TypeIndex.UpdateIndex(BigIndex, types, volacc, debug)

### Deletes BigIndex after copying to list in order to save memory
SortedIndex = TypeIndex.SortIndex(BigIndex, debug)
del BigIndex
# Persist the sorted index to the output file using the configured delimiter.
TypeIndex.WriteIndex(SortedIndex, outputpath + writename, delim, debug)
# NOTE(review): this span is a fragment — the `continue` below targets an
# enclosing loop whose header (and the source of IDtoprocess) is outside
# this view; the SortIndex tail is presumably post-loop code. Confirm loop
# boundaries against the full file before restructuring.

# Resolve one volume ID to its pairtree path and text-file name.
IDtoprocess = IDtoprocess.strip()
filepath, postfix = FileCabinet.pairtreepath(IDtoprocess, datapath)
filename = filepath + postfix + '/' + postfix + ".txt"

# Read the volume; a missing file is reported and skipped, not fatal.
# (`e` is bound but unused — candidate for cleanup.)
try:
    with open(filename, encoding='utf-8') as file:
        lines = file.readlines()
        successflag = True
except IOError as e:
    successflag = False

if not successflag:
    print(IDtoprocess + " is missing.")
    continue

# Tokenize (keeping hyphenated forms); warn on suspiciously short volumes.
tokens = TokenGen.keep_hyphens(lines,Lexicon,verbose=debug)
if len(tokens) < 10:
    print(IDtoprocess, "has only tokencount", len(tokens))

# Estimate the volume's accuracy, extract its types, and fold them into
# the running BigIndex.
volacc = TypeIndex.GetAcc(tokens,Lexicon,debug)
types = TypeIndex.GetTypes(tokens,verbose=debug)
TypeIndex.UpdateIndex(BigIndex, types, volacc, debug)

### Deletes BigIndex after copying to list in order to save memory
SortedIndex = TypeIndex.SortIndex(BigIndex, debug)
del BigIndex