def full_search(hashedData, pat, m=20):
    """Find every m-word run of the pattern that also occurs in the corpus.

    Each whitespace-separated word of ``pat`` is stripped of the characters
    in ``string.punctuation``, upper-cased and hashed with ``letsHash``
    (called with its default parameters).  Every window of ``m`` consecutive
    pattern hashes is then compared with every window of ``m`` consecutive
    entries of ``hashedData``.  Only hashes are compared.

    Args:
        hashedData: sequence of word hashes for the corpus.  Presumably
            produced with the same ``letsHash`` settings as used here --
            verify against the caller.
        pat: pattern text (a Python 2 byte ``str``; the two-argument
            ``str.translate`` used below does not exist on ``unicode``).
        m: window length in words.

    Returns:
        None.  If at least one window matches, ``processMatches(matches, m)``
        is called with a list of ``(k, text)`` tuples, where ``k`` is the
        index in ``hashedData`` of the first entry of the matching corpus
        window and ``text`` is the matching cleaned pattern words joined by
        single spaces.  Matches are ordered by ``k``, then by position in
        the pattern.
    """
    # Local import so this block does not depend on the file's import list.
    from itertools import islice

    # Normalise and hash the pattern words.  The identity translation table
    # is loop-invariant, so build it once.
    identity = string.maketrans("", "")
    pProcessed = [word.translate(identity, string.punctuation).upper()
                  for word in pat.split()]
    pHashed = [letsHash(word) for word in pProcessed]

    # The pattern windows do not depend on the corpus position, so slice
    # them once here instead of once per corpus window.  Stored as tuples
    # so they compare directly against the tuples izip yields.
    patWindows = [(start, tuple(pHashed[start:start + m]))
                  for start in range(len(pHashed) - m + 1)]

    # Sliding window over the corpus: m iterators, the i-th one starting at
    # offset i.  islice avoids building m sliced copies of the corpus.
    corpusWindows = izip(*[islice(hashedData, offset, None)
                           for offset in xrange(m)])

    matches = []
    for k, txtMtuple in enumerate(corpusWindows):
        for start, window in patWindows:
            if txtMtuple == window:
                matches.append((k, ' '.join(pProcessed[start:start + m])))

    if matches:
        processMatches(matches, m)  # report the matches
def full_search(hashedData, pat, m=20):
    """Find every m-word run of the pattern that also occurs in the corpus.

    Each whitespace-separated word of ``pat`` is stripped of the characters
    in ``string.punctuation``, upper-cased and hashed with ``letsHash``
    (called with its default parameters).  Every window of ``m`` consecutive
    pattern hashes is then compared with every window of ``m`` consecutive
    entries of ``hashedData``.  Only hashes are compared.

    Args:
        hashedData: sequence of word hashes for the corpus.  Presumably
            produced with the same ``letsHash`` settings as used here --
            verify against the caller.
        pat: pattern text (a Python 2 byte ``str``; the two-argument
            ``str.translate`` used below does not exist on ``unicode``).
        m: window length in words.

    Returns:
        None.  If at least one window matches, ``processMatches(matches, m)``
        is called with a list of ``(k, text)`` tuples, where ``k`` is the
        index in ``hashedData`` of the first entry of the matching corpus
        window and ``text`` is the matching cleaned pattern words joined by
        single spaces.  Matches are ordered by ``k``, then by position in
        the pattern.
    """
    # Local import so this block does not depend on the file's import list.
    from itertools import islice

    # Normalise and hash the pattern words.  The identity translation table
    # is loop-invariant, so build it once.
    identity = string.maketrans("", "")
    pProcessed = [word.translate(identity, string.punctuation).upper()
                  for word in pat.split()]
    pHashed = [letsHash(word) for word in pProcessed]

    # The pattern windows do not depend on the corpus position, so slice
    # them once here instead of once per corpus window.  Stored as tuples
    # so they compare directly against the tuples izip yields.
    patWindows = [(start, tuple(pHashed[start:start + m]))
                  for start in range(len(pHashed) - m + 1)]

    # Sliding window over the corpus: m iterators, the i-th one starting at
    # offset i.  islice avoids building m sliced copies of the corpus.
    corpusWindows = izip(*[islice(hashedData, offset, None)
                           for offset in xrange(m)])

    matches = []
    for k, txtMtuple in enumerate(corpusWindows):
        for start, window in patWindows:
            if txtMtuple == window:
                matches.append((k, ' '.join(pProcessed[start:start + m])))

    if matches:
        processMatches(matches, m)  # report the matches
def hashPat(pat):
    """Clean and hash every word of the pattern text.

    ``pat`` is split on whitespace; each word has the characters in
    ``string.punctuation`` removed and is upper-cased, then hashed with
    ``letsHash(word, q=1009, d=26)``.

    Returns:
        A ``(hashes, words)`` pair of equal-length lists: the hash of each
        cleaned word, and the cleaned words themselves, in pattern order.
    """
    identityTable = string.maketrans("", "")
    cleanedWords = [
        rawWord.translate(identityTable, string.punctuation).upper()
        for rawWord in pat.split()
    ]
    wordHashes = [letsHash(cleaned, q=1009, d=26) for cleaned in cleanedWords]
    return wordHashes, cleanedWords
def processData(hashedData, pat, m, rank, comm):
    """Search one slice of the hashed corpus for m-word runs of the pattern.

    The pattern is cleaned and hashed via ``hashPat`` (punctuation removed,
    upper-cased, ``letsHash`` with ``q=1009, d=26``).  Every window of ``m``
    consecutive pattern hashes is compared with every window of ``m``
    consecutive entries of ``hashedData``.  Only hashes are compared.

    Args:
        hashedData: sequence of word hashes for this process's part of the
            corpus.  Presumably hashed with the same ``q``/``d`` settings --
            verify against the caller.
        pat: pattern text (a Python 2 byte ``str``).
        m: window length in words.
        rank: not used by this function.  Presumably the calling process's
            rank -- TODO confirm.
        comm: not used by this function.  Presumably the communicator
            object -- TODO confirm.

    Returns:
        ``time.time()`` taken after the search (and any call to
        ``processMatches``) has completed.  If at least one window matches,
        ``processMatches(matches, m)`` is called with a list of
        ``(k, text)`` tuples, where ``k`` is the index in ``hashedData`` of
        the first entry of the matching corpus window and ``text`` is the
        matching cleaned pattern words joined by single spaces.
    """
    # Local import so this block does not depend on the file's import list.
    from itertools import islice

    # Reuse the shared helper rather than repeating its hashing loop here.
    pHashed, pProcessed = hashPat(pat)

    # The pattern windows do not depend on the corpus position, so slice
    # them once here instead of once per corpus window.  Stored as tuples
    # so they compare directly against the tuples izip yields.
    patWindows = [(start, tuple(pHashed[start:start + m]))
                  for start in range(len(pHashed) - m + 1)]

    # Sliding window over the corpus: m iterators, the i-th one starting at
    # offset i.  islice avoids building m sliced copies of the corpus.
    corpusWindows = izip(*[islice(hashedData, offset, None)
                           for offset in xrange(m)])

    matches = []
    for k, txtMtuple in enumerate(corpusWindows):
        for start, window in patWindows:
            if txtMtuple == window:
                matches.append((k, ' '.join(pProcessed[start:start + m])))

    if matches:
        processMatches(matches, m)  # report the matches

    return time.time()
def processData(hashedData, pat, m, rank, comm):
    """Search one slice of the hashed corpus for m-word runs of the pattern.

    The pattern is cleaned and hashed via ``hashPat`` (punctuation removed,
    upper-cased, ``letsHash`` with ``q=1009, d=26``).  Every window of ``m``
    consecutive pattern hashes is compared with every window of ``m``
    consecutive entries of ``hashedData``.  Only hashes are compared.

    Args:
        hashedData: sequence of word hashes for this process's part of the
            corpus.  Presumably hashed with the same ``q``/``d`` settings --
            verify against the caller.
        pat: pattern text (a Python 2 byte ``str``).
        m: window length in words.
        rank: not used by this function.  Presumably the calling process's
            rank -- TODO confirm.
        comm: not used by this function.  Presumably the communicator
            object -- TODO confirm.

    Returns:
        ``time.time()`` taken after the search (and any call to
        ``processMatches``) has completed.  If at least one window matches,
        ``processMatches(matches, m)`` is called with a list of
        ``(k, text)`` tuples, where ``k`` is the index in ``hashedData`` of
        the first entry of the matching corpus window and ``text`` is the
        matching cleaned pattern words joined by single spaces.
    """
    # Local import so this block does not depend on the file's import list.
    from itertools import islice

    # Reuse the shared helper rather than repeating its hashing loop here.
    pHashed, pProcessed = hashPat(pat)

    # The pattern windows do not depend on the corpus position, so slice
    # them once here instead of once per corpus window.  Stored as tuples
    # so they compare directly against the tuples izip yields.
    patWindows = [(start, tuple(pHashed[start:start + m]))
                  for start in range(len(pHashed) - m + 1)]

    # Sliding window over the corpus: m iterators, the i-th one starting at
    # offset i.  islice avoids building m sliced copies of the corpus.
    corpusWindows = izip(*[islice(hashedData, offset, None)
                           for offset in xrange(m)])

    matches = []
    for k, txtMtuple in enumerate(corpusWindows):
        for start, window in patWindows:
            if txtMtuple == window:
                matches.append((k, ' '.join(pProcessed[start:start + m])))

    if matches:
        processMatches(matches, m)  # report the matches

    return time.time()