def checkTxt(pHashed, pProcessed, txt, lineNum, m, rank):
    """Find m-word windows of the pattern that also occur in a corpus chunk.

    Every run of ``m`` consecutive hashes in ``txt`` is compared with every
    run of ``m`` consecutive hashes in ``pHashed``.  Each hit is recorded as
    ``(k, text)``, where ``k`` is the index in ``txt`` of the first word of
    the corpus window and ``text`` is the space-joined slice of
    ``pProcessed`` for the matching pattern window.  If there is at least one
    hit, the list is passed to ``processMatches(matches, m)``.

    Args:
        pHashed: sequence of hashed pattern words.
        pProcessed: sequence of cleaned pattern words, parallel to
            ``pHashed``.
        txt: sequence of hashed corpus words (one chunk of the corpus).
        lineNum: not used by this function; kept for call compatibility.
        m: window length in words.  Values <= 0 yield no matches.
        rank: not used by this function; kept for call compatibility.

    Returns:
        None.  Matches are only reported through ``processMatches``.
    """
    if m <= 0:
        # No windows exist for a non-positive width.
        return

    # The pattern windows are the same for every corpus position, so build
    # them once instead of re-slicing pHashed inside the corpus loop.
    pattern_windows = [
        tuple(pHashed[i:i + m]) for i in range(len(pHashed) - m + 1)
    ]

    matches = []
    # Slide over the corpus by index; this avoids holding m sliced copies of
    # txt at once and does not depend on the Python-2-only izip/xrange.
    for k in range(len(txt) - m + 1):
        corpus_window = tuple(txt[k:k + m])
        for i, pattern_window in enumerate(pattern_windows):
            if pattern_window == corpus_window:
                matches.append((k, ' '.join(pProcessed[i:i + m])))

    if matches:
        # processMatches is defined elsewhere; per the original comment it
        # prints the matches.
        processMatches(matches, m)
def full_search(hashedData, pat, m=20):
    """Search a pre-hashed corpus for m-word runs taken from a pattern text.

    The pattern is split on whitespace; each word has its punctuation
    removed, is upper-cased, and is hashed with ``letsHash``.  Every run of
    ``m`` consecutive pattern hashes is then compared with every run of ``m``
    consecutive hashes in ``hashedData``.  Each hit is recorded as
    ``(k, text)``, where ``k`` is the index in ``hashedData`` of the first
    word of the corpus window and ``text`` is the space-joined cleaned
    pattern words of the matching window.  If there is at least one hit, the
    list is passed to ``processMatches(matches, m)``.

    Args:
        hashedData: sequence of hashed corpus words.
        pat: pattern text as a single string.
        m: window length in words (default 20).  Values <= 0 yield no
            matches.

    Returns:
        None.  Matches are only reported through ``processMatches``.
    """
    # Build the punctuation lookup once instead of a translate table per
    # word; filtering characters also works for unicode input, where the
    # two-argument str.translate form is not available.
    punctuation = frozenset(string.punctuation)

    pProcessed = []
    pHashed = []
    for word in pat.split():
        cleaned = "".join(ch for ch in word if ch not in punctuation).upper()
        pProcessed.append(cleaned)
        pHashed.append(letsHash(cleaned))

    if m <= 0:
        # No windows exist for a non-positive width.
        return

    # The pattern windows are the same for every corpus position, so build
    # them once instead of re-slicing pHashed inside the corpus loop.
    pattern_windows = [
        tuple(pHashed[i : i + m]) for i in range(len(pHashed) - m + 1)
    ]

    matches = []
    # Slide over the corpus by index; this avoids holding m sliced copies of
    # hashedData at once.
    for k in range(len(hashedData) - m + 1):
        corpus_window = tuple(hashedData[k : k + m])
        for i, pattern_window in enumerate(pattern_windows):
            if pattern_window == corpus_window:
                matches.append((k, " ".join(pProcessed[i : i + m])))

    if matches:
        # processMatches is defined elsewhere; per the original comment it
        # prints the matches.
        processMatches(matches, m)
def full_search(hashedData, pat, m=20):
    '''Search a pre-hashed corpus for m-word runs taken from a pattern text.

    The pattern is split on whitespace; each word has its punctuation
    removed, is upper-cased, and is hashed with ``letsHash``.  Every run of
    ``m`` consecutive pattern hashes is then compared with every run of ``m``
    consecutive hashes in ``hashedData``.  Each hit is recorded as
    ``(k, text)``, where ``k`` is the index in ``hashedData`` of the first
    word of the corpus window and ``text`` is the space-joined cleaned
    pattern words of the matching window.  If there is at least one hit, the
    list is passed to ``processMatches(matches, m)``.

    Args:
        hashedData: sequence of hashed corpus words.
        pat: pattern text as a single string.
        m: window length in words (default 20).  Values <= 0 yield no
            matches.

    Returns:
        None.  Matches are only reported through ``processMatches``.
    '''
    # Build the punctuation lookup once instead of a translate table per
    # word; filtering characters also works for unicode input, where the
    # two-argument str.translate form is not available.
    punctuation = frozenset(string.punctuation)

    pProcessed = []
    pHashed = []
    for word in pat.split():
        cleaned = ''.join(ch for ch in word if ch not in punctuation).upper()
        pProcessed.append(cleaned)
        pHashed.append(letsHash(cleaned))

    if m <= 0:
        # No windows exist for a non-positive width.
        return

    # The pattern windows are the same for every corpus position, so build
    # them once instead of re-slicing pHashed inside the corpus loop.
    pattern_windows = [
        tuple(pHashed[i:i + m]) for i in range(len(pHashed) - m + 1)
    ]

    matches = []
    # Slide over the corpus by index; this avoids holding m sliced copies of
    # hashedData at once.
    for k in range(len(hashedData) - m + 1):
        corpus_window = tuple(hashedData[k:k + m])
        for i, pattern_window in enumerate(pattern_windows):
            if pattern_window == corpus_window:
                matches.append((k, ' '.join(pProcessed[i:i + m])))

    if matches:
        # processMatches is defined elsewhere; per the original comment it
        # prints the matches.
        processMatches(matches, m)