def __init__(self, name, f):
    self.id = None  # pylint: disable=invalid-name
    self.name = name
    self.enc = trie()
    self.dec = trie()
    self.eos = []
    # Skip blank lines when reading.
    lines = [line for line in f.read().split("\n") if line]
    for line in lines:
        prefix = line[0]
        if prefix in "@/$!":
            line = line[1:]
        if prefix == "@":
            # "@" lines carry the table id only.
            self.id = line
            continue
        if prefix == "!":
            msg = "Table switching not yet implemented."
            raise NotImplementedError(msg)
        # Regular entries are "HEX=text"; build both encode and decode tries.
        code, text = line.split("=", 1)
        codeseq = bytes.fromhex(code)
        self.enc[text] = codeseq
        self.dec[codeseq] = text
        if prefix == "/":
            # "/"-prefixed codes terminate strings.
            self.eos.append(codeseq)
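# The constructor above parses text-table lines of the form "HEX=text", where a
# leading "@" names the table and "/" marks an end-of-string code. A minimal
# driving sketch, assuming the method belongs to a class called Table here (the
# real class name is not shown in this excerpt) and that trie is patricia.trie:
import io

sample = io.StringIO(
    "@menu\n"        # "@" line sets the table id
    "41=A\n"         # byte 0x41 encodes the text "A"
    "/00=<end>\n"    # "/" marks an end-of-string code
)
table = Table("menu", sample)      # Table is a hypothetical name for the class above
assert table.id == "menu"
assert table.enc["A"] == b"\x41"
assert table.dec[b"\x00"] == "<end>"
assert b"\x00" in table.eos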
def testInitContains(self):
    T = trie(key='value')
    T = trie(**T)
    self.assertTrue('key' in T)
    self.assertFalse('keys' in T)
    self.assertFalse('ke' in T)
    self.assertFalse('kex' in T)
def add(self, update):
    dt = update.get_time()
    dt = datetime.datetime.strptime(dt, '%m/%d/%y %H:%M:%S')  # Format to obj
    # Set granularity
    dt = dt.replace(second=0, microsecond=0)
    mi = (dt.minute // self.granu) * self.granu  # floor to the granularity bucket
    dt = dt.replace(minute=mi)
    time = time_lib.mktime(dt.timetuple())  # Change datetime into seconds
    from_ip = update.get_from_ip()
    if from_ip not in self.from_ip_list:
        self.from_ip_list.append(from_ip)
    prefix = update.get_announce() + update.get_withdrawn()
    if time != self.lasttime:
        if self.lasttime != 0:  # Not the first run
            self.get_index()  # TODO: should consider the last run
        self.trie = patricia.trie(None)
        self.lasttime = time
    for p in prefix:
        try:  # Test whether the trie has the node
            test = self.trie[p]
        except KeyError:  # Node does not exist
            self.trie[p] = []
        if from_ip not in self.trie[p]:
            self.trie[p].append(from_ip)
def testBorderlineValues(self):
    T = trie(foo=1, bar=2)
    self.assertEqual('foo', T.key('foo', -3))
    self.assertEqual('foo', T.key('foo', -4))
    self.assertEqual('foo', T.key('foo', -4, 3))
    self.assertEqual(None, T.key('foo', -3, -4, None))
    self.assertEqual(None, T.key('foo', -4, -4, None))
def __init__(self, granu):
    self.granu = granu  # Time granularity
    self.trie = patricia.trie(None)  # prefix: AS list
    self.from_ip_list = []  # temporarily store monitors
    self.lasttime = 0
    self.dvi1 = dict()  # {time: index value}
    self.dvi2 = dict()
    self.dvi4 = dict()
    self.dvi5 = dict()
    self.monitor_dict = dict()  # {time: number of monitors}
    self.dvi1_avg = []  # {day: average value}
    self.dvi1_med = []
    self.dvi2_avg = []  # {day: average value}
    self.dvi2_med = []
    self.dvi4_avg = []  # {day: average value}
    self.dvi4_med = []
    self.dvi5_avg = []  # {day: average value}
    self.dvi5_med = []
    self.monitor = []  # {day: number of monitors}
    self.days = []  # strings of days
    #for i in range(2003, 2005):
    for i in range(2003, 2014):
        for j in ['01', '04', '07', '10']:
            if i == 2012 and j == '07':  # TODO: temp: to avoid bug
                continue
            self.days.append(str(i) + j)
            self.dvi1_avg.append(0)  # initialization
            self.dvi2_avg.append(0)
            self.dvi4_avg.append(0)
            self.dvi5_avg.append(0)
def testIterPrefix(self):
    T = trie()
    T['b'] = 1
    T['baar'] = 2
    T['baahus'] = 3
    self.assertListEqual(sorted(['baar', 'baahus']), sorted(list(T.iter('ba'))))
    self.assertListEqual(sorted(['baar', 'baahus']), sorted(list(T.iter('baa'))))
    self.assertListEqual(sorted(['b', 'baar', 'baahus']), sorted(list(T.iter('b'))))
    self.assertListEqual(sorted([]), sorted(list(T.iter('others'))))
def main():
    for i in range(0, index_count):
        t = trie()
        with open(pickleFolder + '/' + sys.argv[1] + '/' + str(i) + '.pik', 'rb') as f:
            t = pickle.load(f)
        #a = sorted(t.iter(''))
        for j in t:
            if j != "":
                print j, t[j]
def testWindowMatching(self):
    T = trie(foo=1, foobar=2)
    self.assertListEqual(['foo'], list(T.keys("foobar", 0, 3)))
    self.assertListEqual([1], list(T.values("a foobar!", 2, 7)))
    self.assertListEqual([('foo', 1), ('foobar', 2)], list(T.items("a foobar!", 2, 8)))
    self.assertEqual('foo', T.key("foobar", 0, 3))
    self.assertEqual(1, T.value("a foobar!", 2, 7))
    self.assertEqual(('foobar', 2), T.item("a foobar!", 2, 8))
def testKeyPresenceOnly(self):
    T = trie(foo=True, baar=True, baarhus=True, bazar=True)
    txt = 'The fool baal baarhus in the bazar!'
    presence = [4, 14, 29]
    for i in range(len(txt)):
        if T.value(txt, i, default=False):
            self.assertTrue(i in presence, '{} {} "{}"'.format(str(presence), i, txt[i:]))
            presence.remove(i)
    self.assertEqual(0, len(presence), str(presence))
def testStrRepr(self):
    T = trie()
    T['ba'] = 2
    T['baz'] = "hey's"
    T['fool'] = 1.5
    result = repr(T)
    self.assertTrue(result.startswith("trie({"), result)
    self.assertTrue(result.endswith('})'), result)
    self.assertTrue("'ba': 2" in result, result)
    self.assertTrue("'baz': \"hey's\"" in result, result)
    self.assertTrue("'fool': 1.5" in result, result)
def testGetItems(self):
    T = trie()
    T['ba'] = 2
    T['baz'] = 3
    T['fool'] = 1
    self.assertEqual(('ba', 2), T.item('bar'))
    self.assertEqual(1, T.value('fool'))
    self.assertRaises(KeyError, T.key, 'foo')
    T[''] = 0
    self.assertEqual(('', 0), T.item(''))
    self.assertEqual('', T.key('foo'))
def change_query(self):
    self.worker.display_query.emit(-1, set(), set())
    cleared = [False]

    def clear():
        if not cleared[0]:
            cleared[0] = True
            self.results.clear()

    # `elements` (the parsed query tokens) is defined in the surrounding code,
    # not shown in this excerpt.
    includes = {element.value for element in elements if element.type == 'inc'}
    excludes = {element.value for element in elements if element.type == 'exc'}
    columns = []
    sort = []
    for element in elements:
        if (
            element.type in ('col', 'sort_asc', 'sort_desc', 'sort_rand')
            and element.value
            and element.value not in columns
        ):
            columns.append(element.value)
        if element.type == 'sort_asc':
            sort.append(('asc', element.value))
        elif element.type == 'sort_desc':
            sort.append(('desc', element.value))
        elif element.type == 'sort_rand':
            sort.append(('rand', element.value))
    if includes != self.includes or excludes != self.excludes:
        self.query_unique += 1
        known_columns = patricia.trie()
        self.raw = []
        self.includes = includes
        self.excludes = excludes
        clear()
        if self.includes or self.excludes:
            self._reset_query(self.query_unique)
    if columns != self.columns or sort != self.sort:
        self.columns = columns
        self.sort = sort
        if self.columns:
            self.results.setColumnCount(len(self.columns))
            self.results.setHeaderLabels(self.columns)
            self.results.header().show()
        else:
            self.results.header().hide()
            self.results.setColumnCount(1)
            self.columns = ['filename']
            self.sort.append(('asc', 'filename'))
        clear()
    self._redisplay()
def get_all_length(sdate):
    print 'Getting all prefix lengths from RIB...'
    len_count = dict()  # length: count
    trie = patricia.trie(None)
    mydate = sdate[0:4] + '.' + sdate[4:6]
    dir_list = os.listdir(datadir+'routeviews.org/bgpdata/'+mydate+'/RIBS/')
    rib_location = datadir+'routeviews.org/bgpdata/'+mydate+'/RIBS/'
    for f in dir_list:
        if not f.startswith('.'):
            rib_location = rib_location + f  # a RIB from the same month is OK
            break
    if rib_location.endswith('txt.gz'):
        subprocess.call('gunzip '+rib_location, shell=True)  # unpack
        rib_location = rib_location.replace('.txt.gz', '.txt')
    elif not rib_location.endswith('txt'):  # .bz2/.gz file exists
        parse_mrt(rib_location, rib_location+'.txt')
        os.remove(rib_location)  # then remove .bz2/.gz
        rib_location = rib_location + '.txt'
    # now the RIB file definitely ends with .txt
    with open(rib_location, 'r') as f:  # get monitors from RIB
        for line in f:
            try:
                pfx = line.split('|')[5]
                pfx = ip_to_binary(pfx, '0.0.0.0')
            except:  # an incomplete entry may exist
                continue
            try:
                test = trie[pfx]  # whether it already exists
            except KeyError:
                trie[pfx] = True
    f.close()
    # compress the RIB back into .gz
    if not os.path.exists(rib_location+'.gz'):
        pack_gz(rib_location)
    pfx_count = 0
    for pfx in trie.iter(''):
        if pfx != '':
            pfx_count += 1
            try:
                len_count[len(pfx)] += 1
            except KeyError:
                len_count[len(pfx)] = 1
    del trie
    return [len_count, pfx_count]
def __init__(self, maxsize):
    # Window parameters
    self.maxsize = maxsize
    self.size = 0
    self.start = 0  # Window start
    self.end = 0  # Window end
    self.trie = patricia.trie(None)  # Store still-active updates.
    # BGP dynamics count variables
    self.wadi = 0
    self.aadi = 0
    self.wwdu = 0
    self.aadut1 = 0
    self.aadut2 = 0
    self.wadu = 0
    self.aw = 0
def testSetGetDel(self):
    T = trie()
    T['foo'] = 1
    T['bar'] = 2
    T['baz'] = 3
    self.assertTrue('foo' in T)
    self.assertTrue('bar' in T)
    self.assertTrue('baz' in T)
    self.assertEqual(T['foo'], 1)
    self.assertEqual(T['bar'], 2)
    self.assertEqual(T['baz'], 3)
    self.assertRaises(KeyError, T.__getitem__, 'ba')
    self.assertRaises(KeyError, T.__getitem__, 'fool')
    del T['bar']
    self.assertRaises(KeyError, T.__getitem__, 'bar')
    self.assertEqual(T['baz'], 3)
def testOffsetMatching(self):
    T = trie()
    T['foo'] = 1
    T['baar'] = 2
    T['baarhus'] = 3
    T['bazar'] = 4
    txt = 'The fool baal baarhus in the bazar!'
    keys = []
    values = []
    items = []
    for i in range(len(txt)):
        values.extend(T.values(txt, i))
    for i in range(len(txt)):
        keys.extend(T.keys(txt, i))
    for i in range(len(txt)):
        items.extend(T.items(txt, i))
    self.assertListEqual([1, 2, 3, 4], values)
    self.assertListEqual(['foo', 'baar', 'baarhus', 'bazar'], keys)
    self.assertListEqual([('foo', 1), ('baar', 2), ('baarhus', 3), ('bazar', 4)], items)
def test():
    # 1. build a trie
    t = trie(zero=0, one=1, two=2, three=3, four=4, five=5, six=6, seven=7,
             eight=8, nine=9, ten=10, eleven=11, twelve=12, thirteen=13,
             fourteen=10, fifteen=15, sixteen=16, seventeen=17, eighteen=18,
             nineteen=19, twenty=20, thirty=30, fourty=40, fifty=50, sixty=60,
             seventy=70, eighty=80, ninety=90, hundred=100)
    # 2. scan 2000 "sentences" with it
    for _ in range(1000):
        # scanning for the longest matches only in sentence 1
        i = S1[0]
        #print(TEXT[i:S1[1]])
        while i < S1[1]:
            k, v = t.item(TEXT, i, S1[1], None)
            if k is not None:
                #print(v)
                i += len(k)
            else:
                i += 1
        # scanning for all matches in sentence 2
        i = S2[0]
        #print(TEXT[i:S2[1]])
        s = 0
        while i < S2[1]:
            for k, v in t.items(TEXT, i, S2[1]):
                #print(v)
                s += v
            i += 1
        if s != 142:
            raise RuntimeError(str(s))
    # 3. make a real dictionary of all keys in the trie
    if 'nine' not in dict(t.items()):
        raise RuntimeError(str(dict(t.items())))
def testIterItems(self):
    T = trie(ba=2, baz=3, fool=1)
    self.assertListEqual(['ba', 'baz'], list(T.keys('bazar')))
    self.assertListEqual([('fool', 1)], list(T.items('fools')))
    self.assertListEqual([], list(T.values('others')))
def testIterator(self):
    T = trie(ba=2, baz=3, fool=1)
    self.assertListEqual(sorted(['fool', 'ba', 'baz']), sorted(list(T)))
    T[''] = 0
    self.assertEqual(sorted(['', 'fool', 'ba', 'baz']), sorted(list(T)))
def testFakeDefault(self):
    T = trie()
    fake = _NonTerminal()
    self.assertEqual(fake, T.value('foo', default=fake))
def testGetExactMatch(self):
    T = trie(exact=5)
    self.assertListEqual(['exact'], list(T.keys('exact')))
    self.assertListEqual([5], list(T.values('exact')))
    self.assertListEqual([('exact', 5)], list(T.items('exact')))
def clear_index(self):
    self.trie = patricia.trie()
    self.search_trie = patricia.trie()
    self.comments = collections.defaultdict(set)
    self.fns = {}
def testSingleEntry(self):
    T = trie(foo=5)
    self.assertListEqual(['foo'], list(T.keys()))
    self.assertListEqual([5], list(T.values()))
    self.assertListEqual([('foo', 5)], list(T.items()))
def testEmptyStringKey(self):
    T = trie(2, foo=1)
    self.assertTrue('foo' in T)
    self.assertTrue('' in T)
    del T['']
    self.assertRaises(KeyError, T.__getitem__, '')
class PersianStemmer(object):
    lexicon = trie()
    mokassarDic = trie()
    cache = trie()
    verbDic = trie()
    _ruleList = []
    verbAffix = [
        "*ش", "*نده", "*ا", "*ار", "وا*", "اثر*", "فرو*", "پیش*", "گرو*", "*ه",
        "*گار", "*ن"
    ]
    suffix = [
        "كار", "ناك", "وار", "آسا", "آگین", "بار", "بان", "دان", "زار", "سار",
        "سان", "لاخ", "مند", "دار", "مرد", "کننده", "گرا", "نما", "متر"
    ]
    prefix = ["بی", "با", "پیش", "غیر", "فرو", "هم", "نا", "یک"]
    prefixException = ["غیر"]
    suffixZamir = ["م", "ت", "ش"]
    suffixException = ["ها", "تر", "ترین", "ام", "ات", "اش"]
    PATTERN_FILE_NAME = os.path.dirname(__file__) + "/data/Patterns.fa"
    VERB_FILE_NAME = os.path.dirname(__file__) + "/data/VerbList.fa"
    DIC_FILE_NAME = os.path.dirname(__file__) + "/data/Dictionary.fa"
    MOKASSAR_FILE_NAME = os.path.dirname(__file__) + "/data/Mokassar.fa"
    patternCount = 1
    enableCache = True
    enableVerb = False

    def __init__(self):
        try:
            self.loadRule()
            self.loadLexicon()
            self.loadMokassarDic()
            if self.enableVerb:
                self.loadVerbDic()
        except Exception as ex:
            print(ex)

    def loadData(self, resourceName):
        result = []
        with open(resourceName, 'r', encoding="utf-8") as reader:
            result = [line.strip("\r\n ") for line in reader if line.strip("\r\n ")]
        return result

    def loadVerbDic(self):
        if len(PersianStemmer.verbDic) > 0:
            return
        lines = self.loadData(PersianStemmer.VERB_FILE_NAME)
        for line in lines:
            arr = line.split("\t")
            PersianStemmer.verbDic[arr[0].strip()] = VerbStem(arr[1].strip(), arr[2].strip())

    def str2bool(self, v):
        return v.lower() in ("yes", "true", "t", "1")

    def loadRule(self):
        if len(PersianStemmer._ruleList) > 0:
            return
        lines = self.loadData(PersianStemmer.PATTERN_FILE_NAME)
        for line in lines:
            arr = line.split(",")
            PersianStemmer._ruleList.append(
                StemmingRule(arr[0], arr[1], arr[2], int(arr[3]), self.str2bool(arr[4])))
        #PersianStemmer._ruleList = [StemmingRule(arr[0], arr[1], arr[2], int(arr[3]), self.str2bool(arr[4])) for line in lines for arr in line.split(",")]

    def loadLexicon(self):
        if len(PersianStemmer.lexicon) > 0:
            return
        lines = self.loadData(PersianStemmer.DIC_FILE_NAME)
        for line in lines:
            PersianStemmer.lexicon[line.strip("\r\n ")] = True

    def loadMokassarDic(self):
        if len(PersianStemmer.mokassarDic) > 0:
            return
        lines = self.loadData(PersianStemmer.MOKASSAR_FILE_NAME)
        for line in lines:
            arr = line.split("\t")
            PersianStemmer.mokassarDic[arr[0].strip()] = arr[1].strip()

    def strip_accents(self, s):
        return ''.join(c for c in unicodedata.normalize('NFD', s)
                       if unicodedata.category(c) != 'Mn')

    def normalization(self, s):
        newString = []
        for ch in s:
            if ch == 'ي':
                newString.append('ی')
            elif ch in ['ة', 'ۀ']:
                newString.append('ه')
            elif ch in ['', '']:  # original characters lost in extraction (likely half-space/zero-width variants)
                newString.append(' ')
            elif ch == 'ك':
                newString.append('ک')
            elif ch == 'ؤ':
                newString.append('و')
            elif ch in ['إ', 'أ']:
                newString.append('ا')
            elif ch in [
                '\u064B',  # FATHATAN
                '\u064C',  # DAMMATAN
                '\u064D',  # KASRATAN
                '\u064E',  # FATHA
                '\u064F',  # DAMMA
                '\u0650',  # KASRA
                '\u0651',  # SHADDA
                '\u0652',  # SUKUN
            ]:
                pass
            else:
                newString.append(ch)
        return ''.join(newString)

    def validation(self, sWord):
        return (sWord in PersianStemmer.lexicon)

    def removeZamir(self, sInput, bState):
        sRule = "^(?P<stem>.+?)((?<=(ا|و))ی)?(ها)?(ی)?((ات)?( تان|تان| مان|مان| شان|شان)|ی|م|ت|ش|ء)$"
        if bState:
            sRule = "^(?P<stem>.+?)((?<=(ا|و))ی)?(ها)?(ی)?(ات|ی|م|ت|ش| تان|تان| مان|مان| شان|شان|ء)$"
        return self.extractStem(sInput, sRule)

    def getMokassarStem(self, sWord):
        if sWord in PersianStemmer.mokassarDic:
            return PersianStemmer.mokassarDic[sWord]
        else:
            sNewWord = self.removeZamir(sWord, True)
            if sNewWord in PersianStemmer.mokassarDic:
                return PersianStemmer.mokassarDic[sNewWord]
            else:
                sNewWord = self.removeZamir(sWord, False)
                if sNewWord in PersianStemmer.mokassarDic:
                    return PersianStemmer.mokassarDic[sNewWord]
        return ""

    def verbValidation(self, sWord):
        if sWord.find(' ') > -1:
            return ""
        j = 0
        for affix in PersianStemmer.verbAffix:
            if (j == 0 and (sWord[-1] == 'ا' or sWord[-1] == 'و')):
                sTemp = affix.replace("*", sWord + "ی")
            else:
                sTemp = affix.replace("*", sWord)
            if self.normalizeValidation(sTemp, True):
                return affix
            j = j + 1
        return ""

    def getPrefix(self, sWord):
        result = [sPrefix for sPrefix in PersianStemmer.prefix
                  if sWord.startswith(sPrefix)]
        if len(result) > 0:
            return result[0]
        return ""

    def getPrefixException(self, sWord):
        result = [sPrefix for sPrefix in PersianStemmer.prefixException
                  if sWord.startswith(sPrefix)]
        if len(result) > 0:
            return result[0]
        return ""

    def getSuffix(self, sWord):
        # Note: the original scanned prefixException here, which can never yield
        # the suffixes tested by the callers; this looks like a copy/paste slip.
        result = [sSuffix for sSuffix in PersianStemmer.suffix
                  if sWord.endswith(sSuffix)]
        if len(result) > 0:
            return result[0]
        return ""

    def inRange(self, d, f, t):
        return d >= f and d <= t

    def normalizeValidation(self, sWord, bRemoveSpace):
        sWord = sWord.strip()
        l = len(sWord) - 2
        result = self.validation(sWord)
        if not result and sWord.find('ا') == 0:
            result = self.validation(sWord.replace("ا", "آ", 1))
        if (not result and self.inRange(sWord.find('ا'), 1, l)):
            result = self.validation(sWord.replace('ا', 'أ'))
        if (not result and self.inRange(sWord.find('ا'), 1, l)):
            result = self.validation(sWord.replace('ا', 'إ'))
        if (not result and self.inRange(sWord.find("ئو"), 1, l)):
            result = self.validation(sWord.replace("ئو", "ؤ"))
        if (not result and sWord.endswith("ء")):
            result = self.validation(sWord.replace("ء", ""))
        if (not result and self.inRange(sWord.find("ئ"), 1, l)):
            result = self.validation(sWord.replace("ئ", "ی"))
        if (bRemoveSpace):
            if (not result and self.inRange(sWord.find(' '), 1, l)):
                result = self.validation(sWord.replace(" ", ""))
        # دیندار
        # دین دار
        if (not result):
            sSuffix = self.getSuffix(sWord)
            if (sSuffix):
                sTemp = sWord.replace(sSuffix, " " + sSuffix)
                if sSuffix == "مند":
                    sTemp = sWord.replace(sSuffix, "ه " + sSuffix)
                result = self.validation(sTemp)
        if (not result):
            sPrefix = self.getPrefix(sWord)
            if sPrefix:
                if (sWord.startswith(sPrefix + " ")):
                    result = self.validation(sWord.replace(sPrefix + " ", sPrefix))
                else:
                    result = self.validation(sWord.replace(sPrefix, sPrefix + " "))
        if (not result):
            sPrefix = self.getPrefixException(sWord)
            if (sPrefix):
                if (sWord.startswith(sPrefix + " ")):
                    result = self.validation(sWord.replace(sPrefix + " ", "", 1))
                else:
                    result = self.validation(sWord.replace(sPrefix, "", 1))
        return result

    def isMatch(self, sInput, sRule):
        match = re.compile(sRule).search(sInput)
        if match:
            return True
        return False

    def extractStem(self, sInput, sRule, sReplacement=r"\g<stem>"):
        return re.sub(sRule, sReplacement, sInput).strip()

    def getVerb(self, input):
        if input in PersianStemmer.verbDic:
            vs = PersianStemmer.verbDic[input]
            if self.validation(vs.getPresent()):
                return vs.getPresent()
            return vs.getPast()
        return ""

    def PatternMatching(self, input, stemList=[]):
        terminate = False
        s = ""
        sTemp = ""
        for rule in PersianStemmer._ruleList:
            if terminate:
                return terminate
            sReplace = rule.getSubstitution().split(";")
            pattern = rule.getBody()
            if not self.isMatch(input, pattern):
                continue
            k = 0
            for t in sReplace:
                if k > 0:
                    break
                s = self.extractStem(input, pattern, t)
                if len(s) < rule.getMinLength():
                    continue
                if rule.getPoS() == 'K':  # Kasre Ezafe
                    if len(stemList) == 0:
                        sTemp = self.getMokassarStem(s)
                        if sTemp:
                            stemList.append(sTemp)  #, pattern + " [جمع مکسر]")
                            k = k + 1
                        elif self.normalizeValidation(s, True):
                            stemList.append(s)  #, pattern)
                            k = k + 1
                        else:
                            pass  #addToLog("", pattern + " ::" + s + "}")
                elif rule.getPoS() == 'V':  # Verb
                    sTemp = self.verbValidation(s)
                    if len(sTemp) == 0:
                        stemList.append(s)  # pattern + " : [" + sTemp + "]"
                        k = k + 1
                    else:
                        pass  #addToLog("", pattern + " ::تمام وندها}")
                else:
                    if self.normalizeValidation(s, True):
                        stemList.append(s)
                        if rule.getState():
                            terminate = True
                        k = k + 1
                    else:
                        pass  #addToLog("", pattern + " ::" + s + "}")
        return terminate

    def run(self, input):
        input = self.normalization(input).strip()
        if not input:
            return ""
        # Integer or English
        if Utils.isEnglish(input) or Utils.isNumber(input) or len(input) <= 2:
            return input
        if self.enableCache and input in self.cache:
            return self.cache[input]
        s = self.getMokassarStem(input)
        if self.normalizeValidation(input, False):
            #stemList.add(input/*, "[فرهنگ لغت]"*/)
            if self.enableCache:
                self.cache[input] = input
            return input
        elif s:
            #addToLog(s/*, "[جمع مکسر]"*/)
            #stemList.add(s)
            if self.enableCache:
                self.cache[input] = s
            return s
        stemList = []
        terminate = self.PatternMatching(input, stemList)
        if self.enableVerb:
            s = self.getVerb(input)
            if s:
                stemList = [s]
        if len(stemList) == 0:
            if self.normalizeValidation(input, True):
                #stemList.add(input, "[فرهنگ لغت]")
                if self.enableCache:
                    self.cache[input] = input  #stemList.get(0))
                return input  #stemList.get(0)
            stemList.append(input)  #, "")
        if terminate and len(stemList) > 1:
            return self.nounValidation(stemList)
        if self.patternCount != 0:
            stemList.sort(reverse=self.patternCount >= 0)
            stemList = stemList[abs(self.patternCount) - 1:]
        if self.enableCache:
            self.cache[input] = stemList[0]
        return stemList[0]

    def nounValidation(self, stemList):
        stemList.sort()
        lastStem = stemList[-1]
        if lastStem.endswith("ان"):
            return lastStem
        else:
            firstStem = stemList[0]
            secondStem = stemList[1].replace(" ", "")
            for sSuffix in PersianStemmer.suffixZamir:
                if secondStem == firstStem + sSuffix:
                    return firstStem
            return lastStem
print ('\n' * 100)

########################################################################################################################
########################################################################################################################
''' Heapsort used to sort the data '''


def heapsort(iterable):
    h = []
    for value in iterable:
        heapq.heappush(h, value)
    return [heapq.heappop(h) for i in range(len(h))]


contadorArtistas = 1  # Artist counter.
contadorMusicas = 0  # Song counter.
contadorIndex = 0
arvoreMusica = trie()  # Trie with the song names.
arvoreArtistas = trie()  # Trie with the artist names.
auxArtista = trie()  # Auxiliary trie with the artist names.
auxMusica = trie()  # Auxiliary trie with the song names.
Index = list()

########################################################################################################################
########################################################################################################################
''' Reading the JSON file '''

arquivo_dados = raw_input("Enter the data file name (without .json): ")
arquivo_dados = arquivo_dados + ".json"
with open(arquivo_dados) as f:
    data = f.read()
    jsondata = json.loads(data)
for row in jsondata['items']:
from random import choice
from string import ascii_uppercase


def random_string(length):
    """Produce a random string made of *length* uppercase ASCII characters."""
    return ''.join(choice(ascii_uppercase) for i in range(length))


strings = [random_string(32) for i in range(10000)]
matches = [s for s in strings if s.startswith('AA')]
print(matches)

from patricia import trie

strings_dict = {s: 0 for s in strings}  # A dictionary where all values are 0
strings_trie = trie(**strings_dict)
matches = list(strings_trie.iter('AA'))
print(matches)

# If you look closely, the timing for this input size is 60.1 μs, which is about
# 30 times faster (1.76 ms = 1760 μs) than linear search!
# Note that if we want to return all the prefixes that match, the running time
# will be proportional to the number of results that match the prefix. Therefore,
# when designing timing benchmarks, care must be taken to ensure that we are
# always returning the same number of results.
# The scaling properties of a trie versus a linear scan for datasets of different
# sizes that contain ten prefix matches are shown in the following table:
#
#   Algorithm     N=10000 (μs)   N=20000 (μs)   N=30000 (μs)   Complexity
#   Trie                 17.12          17.27          17.47   O(S)
#   Linear scan        1978.44        4075.72        6398.06   O(N)
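# A repeatable version of the comparison described above: a minimal sketch using
# timeit over the strings/strings_trie built earlier. Both callables scan the
# same data for the same prefix, so they return the same number of results;
# absolute numbers will differ from the interactive-session figures quoted above.
import timeit

linear_t = timeit.timeit(lambda: [s for s in strings if s.startswith('AA')], number=100)
trie_t = timeit.timeit(lambda: list(strings_trie.iter('AA')), number=100)
print('linear scan: %.3f ms per run' % (linear_t * 10))  # total seconds * 1000 / 100 runs
print('trie lookup: %.3f ms per run' % (trie_t * 10))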
def createIndex(start):
    #create stop word list
    stopWords = create_stopword_list(stopwordsFile)
    counter = 0
    #list of patricia tries for the various alphabets
    index = {}
    for i in range(0, index_count):
        index[i] = trie()
    #create index
    for folder in range(int(start), int(start) + 1):
        print "working on folder " + str(folder)
        #folder = os.walk(corpusFolder).next()[1]:
        for file in os.walk(corpusFolder + '/' + str(folder)).next()[2]:
            try:
                #print file
                docID = file
                #ignore files other than the html files (not of the form *.*)
                pattern = re.compile('\.')
                if pattern.search(file) != None:
                    continue
                #read html file
                html_doc = ""
                try:
                    html_doc = open(corpusFolder + '/' + str(folder) + '/' + file).read()
                except:
                    print "could not open file " + file
                    continue
                #create soup
                soup = BeautifulSoup(html_doc)
                #remove absurd tags like javascript, css, iframe
                [s.extract() for s in soup(['script', 'iframe', 'style'])]
                textList = []
                try:
                    text = soup.get_text().encode("utf-8").lower()
                    pattern = re.compile("[^a-z0-9]")
                    textList = re.split(pattern, text)
                except Exception as e:
                    print e
                    continue
                #remove stop words
                textList = remove_stopwords(textList, stopWords)
                #sort the text list to reduce the number of IOs
                textList = sorted(textList)
                #print textList
                #add words to the appropriate patricia-trie
                for word in textList:
                    #perform stemming
                    word = stem(word)
                    #remove small words
                    if len(word) <= min_word_length:
                        #print word#"word length insufficient"
                        continue
                    #print word
                    try:
                        if not word in index[sec(word)]:
                            #the word has occurred for the first time
                            index[sec(word)][word] = PostingStruct(0, {})  #create posting list for the word
                            #print index[sec(word)][word].df
                        #the word has occurred earlier
                        if index[sec(word)][word].posting.has_key(file) == True:
                            index[sec(word)][word].posting[file] += 1
                        else:
                            index[sec(word)][word].posting[file] = 1
                        index[sec(word)][word].df += 1  #index[sec(word)][word].df+1
                    except Exception as e:
                        print word
                        print e, word, sec(word)
                        #return
                        #break
                """
                if counter == 3:
                    break
                counter += 1
                """
            except:
                continue
    #once all the tries have been created, pickle them
    for i in range(0, index_count):
        with open(pickleFolder + '/' + start + '/' + str(i) + '.pik', 'wb') as f:
            pickle.dump(index[i], f, -1)
    print "successfully completed !"
########################################################################################################################
''' Heapsort used to sort the data '''


def heapsort(iterable):
    h = []
    for value in iterable:
        heapq.heappush(h, value)
    return [heapq.heappop(h) for i in range(len(h))]


contadorArtistas = 1  # Artist counter.
contadorMusicas = 0  # Song counter.
contadorIndex = 0
arvoreMusica = trie()  # Trie with the song names.
arvoreArtistas = trie()  # Trie with the artist names.
auxArtista = trie()  # Auxiliary trie with the artist names.
auxMusica = trie()  # Auxiliary trie with the song names.
Index = list()

########################################################################################################################
########################################################################################################################
''' Reading the JSON file '''

arquivo_dados = raw_input("Enter the data file name (without .json): ")
arquivo_dados = arquivo_dados + ".json"
with open(arquivo_dados) as f:
    data = f.read()
    jsondata = json.loads(data)
for row in jsondata['items']:
def create_corpus(words):
    maxFreq = {}
    unwanted = ['.b', '.x', '.n', '.c']
    wanted = ['.i', '.t', '.a', '.w', '.k']
    i = 0
    t = trie()
    docID = 0
    global docCount
    while i < len(words):
        freq = 1
        if words[i] == '.i':
            i = i + 1
            docID = words[i]
            docCount += 1
        elif words[i] in unwanted:
            i = 1 + i
            while (words[i] not in wanted + unwanted):
                i = i + 1
                if i >= len(words):
                    break
                #print i, words[i], len(words)
        else:
            try:
                #stem the word before insertion
                words[i] = stem(words[i])
                #if new word
                if t[str(words[i])] == False:
                    t[str(words[i])] = {}
                if t[str(words[i])].has_key(docID) == True:
                    t[str(words[i])][docID] += 1
                else:
                    t[str(words[i])][docID] = 1
                """
                t[]
                f = getFreq(t[str(words[i])], docID)
                #print str(words[i])
                #print f
                if f == -1:
                    t[str(words[i])][docID] = freq  #.append((docID,freq))
                else:
                    #t[str(words[i])].remove((docID,f))
                    if len(t[str(words[i])]) == 0:
                        t[str(words[i])] = {}  #[(docID,f+1)]
                    else:
                        t[str(words[i])][docID] = f + 1  #.append((docID,f+1))
                """
            except KeyError:
                pass  #no key is present
                #print type(str(words[i]))
                #stem the word before insertion
                #words[i] = words[i].strip('0123456789., ')
                #words[i] = PorterStemmer().stem_word(words[i])
                t[str(words[i])] = {}
                t[str(words[i])][docID] = 1
            contFreq = t[str(words[i])][docID]
            if maxFreq.has_key(docID):
                if maxFreq[docID] < contFreq:
                    maxFreq[docID] = contFreq
            else:
                maxFreq[docID] = contFreq
        i = i + 1
    #print "docCount is ", docCount
    with open('maxFreq.pik', 'wb') as f:
        pickle.dump([docCount, maxFreq], f, -1)
    with open('trie.pik', 'wb') as f:
        pickle.dump(t, f, -1)
    #print t.keys()
    """for word in t.keys():
def testIsPrefix(self):
    T = trie(bar=2, baz=3, fool=1)
    self.assertTrue(T.isPrefix('ba'))
    self.assertFalse(T.isPrefix('fools'))
    self.assertTrue(T.isPrefix(''))
def testValues(self):
    T = trie()
    T['ba'] = 2
    T['baz'] = "hey's"
    T['fool'] = 1.5
    self.assertListEqual(sorted(["2", "hey's", "1.5"]),
                         sorted([str(v) for v in T.values()]))
def __init__(self, obj, config=str()):
    #print(config)
    #_config = """input = '@text1', mapping = 'low = 1, medium = 2, high = 3'"""
    super(Enum, self).__init__(obj, config=config)
    #print(config)
    #print(self.config)
    #print(config == _config)
    if self.config is None or len(self.config) < 1 or not isinstance(self.config, str):
        raise ValueError('Enum plugin function requires a config string')
    inputkeyword = 'input'
    mappingkeyword = 'mapping'
    if not inputkeyword in self.config:
        raise ValueError(
            'An input keyword argument must be specified for the Enum plugin function')
    if not mappingkeyword in self.config:
        raise ValueError(
            'A mapping keyword argument must be specified for the Enum plugin function')
    attrexpr = p.Combine(
        p.Literal("'").suppress() +
        (p.Literal('@') | p.Literal('!')).suppress() +
        p.Word(p.alphanums) +
        p.Literal("'").suppress())
    inputexpr = p.CaselessKeyword(inputkeyword).suppress() + p.Literal('=').suppress() + attrexpr
    mappingexpr = p.CaselessKeyword(mappingkeyword).suppress() + p.Literal('=').suppress() + p.sglQuotedString()
    expr = inputexpr + p.Literal(',').suppress() + mappingexpr
    self.input = None
    self.mapping = None
    _matches = []
    for x in expr.scanString(self.config):
        _matches.append(x)
    if len(_matches) > 1:
        raise IndexError(
            'There should only be one input and mapping keyword set in the Enum '
            'plugin function\'s config but %s was received' % _matches)
    #print(_matches)
    rawconfig = _matches[0]
    mappingdict = {}
    for mapitem in rawconfig[0][1][1:-1].split(','):
        k, v = mapitem.split('=')
        mappingdict[k.strip()] = v.strip()
    self.input = rawconfig[0][0]
    self.mapping = pt.trie()
    for k, v in mappingdict.items():
        self.mapping[k.lower()] = v
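# Standalone sketch of the config parse performed above (not the plugin framework
# itself): the example string mirrors the commented-out _config value, and the
# grammar is copied from the constructor. It shows how the input attribute and
# the raw mapping string are pulled out before the mapping trie is filled.
import pyparsing as p

config = "input = '@text1', mapping = 'low = 1, medium = 2, high = 3'"
attrexpr = p.Combine(p.Literal("'").suppress()
                     + (p.Literal('@') | p.Literal('!')).suppress()
                     + p.Word(p.alphanums)
                     + p.Literal("'").suppress())
inputexpr = p.CaselessKeyword('input').suppress() + p.Literal('=').suppress() + attrexpr
mappingexpr = p.CaselessKeyword('mapping').suppress() + p.Literal('=').suppress() + p.sglQuotedString
expr = inputexpr + p.Literal(',').suppress() + mappingexpr

tokens = next(expr.scanString(config))[0]
print(tokens[0])        # text1 -> the attribute the plugin reads its input from
print(tokens[1][1:-1])  # low = 1, medium = 2, high = 3 -> raw mapping, split on ',' and '='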
def testLongRootValue(self):
    T = trie(1, 2)
    self.assertEqual((1, 2), T[''])
import string

# Add common-lib code to system path
sources = os.getenv('BISCUIT_DIR')
if sources not in sys.path:
    sys.path.append(sources)
from common_lib.read_config import enabled_modules

# Add trie module to path
trie_module = enabled_modules['hashtag']
if trie_module:
    print 'trie module: ', trie_module
    if trie_module not in sys.path:
        sys.path.append(trie_module)
    import patricia
    _dictionary = patricia.trie()
else:
    _dictionary = {}

# Hand-annotated hashtags
annotations = {}
with open(os.path.join(enabled_modules['hashtag'], 'ht-expansions.txt'), 'r') as f:
    for line in f.readlines():
        ht, expansion = line.strip('\n').split(' || ')
        annotations[ht] = expansion


# For development: allow module to be run-able
def main():
def del_tabletran_updates(peer, sdate, cl_name):
    f_results = open(hdname+'tmp/'+peer+'_result.txt', 'r')
    for line in f_results:  # get all affection info of this peer
        line = line.replace('\n', '')
        attr = line.split(',')
        if attr[0] == '#START':
            continue
        print line
        print 'get session reset time...'
        stime_unix = int(attr[0])
        endtime_unix = int(attr[1])
        start_datetime = datetime.datetime.fromtimestamp(stime_unix) +\
            datetime.timedelta(hours=-8)
        end_datetime = datetime.datetime.fromtimestamp(endtime_unix) +\
            datetime.timedelta(hours=-8)
        print 'from ', start_datetime, ' to ', end_datetime
        updatefile_list = open(hdname+'metadata/'+sdate+'/updt_filelist_'+cl_name, 'r')
        for updatefile in updatefile_list:
            updatefile = updatefile.replace('\n', '')
            file_attr = updatefile.split('.')
            fattr_date = file_attr[5]
            fattr_time = file_attr[6]
            dt = datetime.datetime(int(fattr_date[0:4]), int(fattr_date[4:6]),
                                   int(fattr_date[6:8]), int(fattr_time[0:2]),
                                   int(fattr_time[2:4]))
            if not start_datetime + datetime.timedelta(minutes=-15) <= dt <= end_datetime:
                continue  # filename not OK
            print 'session reset exists in: ', updatefile
            size_before = os.path.getsize(updatefile)
            # unpack
            myfilename = updatefile.replace('txt.gz', 'txt')  # .txt file
            subprocess.call('gunzip -c ' + updatefile + ' > ' + myfilename, shell=True)
            # only .txt from now on!
            oldfile = open(myfilename, 'r')
            newfile = open(hdname+'tmp/'+myfilename.split('/')[-1], 'w')
            counted_pfx = patricia.trie(None)
            for updt in oldfile:  # loop over each update
                updt = updt.replace('\n', '')
                update_attr = updt.split('|')
                try:
                    if (cmp(update_attr[3], peer) == 0)\
                            & (stime_unix < int(update_attr[1]) < endtime_unix):
                        # culprit update confirmed
                        pfx = update_attr[5]
                        try:  # Test whether the trie has the pfx
                            test = counted_pfx[pfx]
                            newfile.write(updt+'\n')  # pfx exists
                        except KeyError:  # Node does not exist
                            counted_pfx[pfx] = True
                    else:  # not a culprit update
                        newfile.write(updt+'\n')
                except:
                    continue
            oldfile.close()
            newfile.close()
            os.remove(updatefile)  # remove old .gz file
            # compress .txt into txt.gz to replace the old file
            subprocess.call('gzip -c '+hdname+'tmp/'+myfilename.split('/')[-1] +
                            ' > '+updatefile, shell=True)
            size_after = os.path.getsize(updatefile)
            os.remove(updatefile.replace('txt.gz', 'txt'))
            print 'size(b):', size_before, ',size(a):', size_after
        updatefile_list.close()
    f_results.close()
def constructTrieIndex(self):
    self.trieIndex = trie('root')
    for termPage, postingList in self.index.iteritems():
        # idf = log10(total documents / document frequency of the term)
        idfData = math.log(float(self.docCount) / float(self.df[termPage]), 10)
        self.trieIndex[termPage] = (postingList, self.tf[termPage], idfData)