Example #1
    def __init__(self, name, f):
        self.id = None  # pylint: disable=invalid-name
        self.name = name
        self.enc = trie()
        self.dec = trie()
        self.eos = []

        # Skip blank lines when reading.
        lines = [line for line
                 in f.read().split("\n")
                 if line]

        for line in lines:
            prefix = line[0]
            if prefix in "@/$!":
                line = line[1:]
            if prefix == "@":
                self.id = line
                continue
            if prefix == "!":
                msg = "Table switching not yet implemented."
                raise NotImplementedError(msg)

            code, text = line.split("=", 1)
            codeseq = bytes.fromhex(code)
            self.enc[text] = codeseq
            self.dec[codeseq] = text
            if prefix == "/":
                self.eos.append(codeseq)
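
A minimal usage sketch for the loader above; the enclosing class name (Table) is an assumption, and the input mimics the common "XX=text" table-file format the parser expects:

import io

# "@" sets the table id, plain lines map hex byte codes to text,
# and "/" additionally marks the code as an end-of-string marker.
src = io.StringIO("@example\n41=A\n42=B\n/FF=<end>\n")
table = Table("demo", src)
assert table.id == "example"
assert table.enc["A"] == b"\x41"
assert table.eos == [bytes.fromhex("FF")]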
Example #2
 def testInitContains(self):
     T = trie(key='value')
     T = trie(**T)
     self.assertTrue('key' in T)
     self.assertFalse('keys' in T)
     self.assertFalse('ke' in T)
     self.assertFalse('kex' in T)
Example #3
    def add(self, update):
        dt = update.get_time()
        dt = datetime.datetime.strptime(dt, '%m/%d/%y %H:%M:%S')  # Parse the string into a datetime object

        # Set granularity: floor the minute to the nearest bucket boundary.
        dt = dt.replace(second=0, microsecond=0)
        mi = (dt.minute // self.granu) * self.granu
        dt = dt.replace(minute=mi)
        time = time_lib.mktime(dt.timetuple())  # Convert datetime to epoch seconds

        from_ip = update.get_from_ip()
        if from_ip not in self.from_ip_list:
            self.from_ip_list.append(from_ip)

        prefix = update.get_announce() + update.get_withdrawn()

        if time != self.lasttime:
            if self.lasttime != 0:  # Not the first run
                self.get_index()  # TODO: should consider the last run
                self.trie = patricia.trie(None)
            
            self.lasttime = time

        for p in prefix:
            try:  # Test whether the trie has the node
                test = self.trie[p]
            except KeyError:  # Node does not exist
                self.trie[p] = []

            if from_ip not in self.trie[p]:
                self.trie[p].append(from_ip)
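
A standalone sketch of the minute-bucketing used in add() above (the 5-minute granularity is an arbitrary choice for illustration):

import datetime
import time as time_lib

granu = 5
dt = datetime.datetime(2013, 7, 1, 12, 38, 41)
dt = dt.replace(second=0, microsecond=0)
dt = dt.replace(minute=(dt.minute // granu) * granu)  # floors 12:38 to 12:35
print(time_lib.mktime(dt.timetuple()))  # bucket timestamp in epoch seconds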
Example #4
 def testBorderlineValues(self):
     T = trie(foo=1, bar=2)
     self.assertEqual('foo', T.key('foo', -3))
     self.assertEqual('foo', T.key('foo', -4))
     self.assertEqual('foo', T.key('foo', -4, 3))
     self.assertEqual(None, T.key('foo', -3, -4, None))
     self.assertEqual(None, T.key('foo', -4, -4, None))
Example #5
    def __init__(self, granu):
        self.granu = granu  # Time granularity
        self.trie = patricia.trie(None)  # prefix: AS list
        self.from_ip_list = []  # temporarily store monitors
        self.lasttime = 0
        self.dvi1 = dict()  # {time: index value}
        self.dvi2 = dict()
        self.dvi4 = dict()
        self.dvi5 = dict()
        self.monitor_dict = dict()  # {time: number of monitors}

        self.dvi1_avg = []  # per-day average values
        self.dvi1_med = []  # per-day median values
        self.dvi2_avg = []
        self.dvi2_med = []
        self.dvi4_avg = []
        self.dvi4_med = []
        self.dvi5_avg = []
        self.dvi5_med = []

        self.monitor = []  # per-day number of monitors

        self.days = []  # strings of days
        #for i in range(2003, 2005):
        for i in range(2003, 2014):
            for j in ['01', '04', '07', '10']:
                if i == 2012 and j == '07':  # TODO: temp: to avoid bug
                    continue
                self.days.append(str(i)+j)
                self.dvi1_avg.append(0)  # initialization 
                self.dvi2_avg.append(0)
                self.dvi4_avg.append(0)
                self.dvi5_avg.append(0)
Example #6
 def testIterPrefix(self):
     T = trie()
     T['b'] = 1
     T['baar'] = 2
     T['baahus'] = 3
     self.assertListEqual(sorted(['baar', 'baahus']), sorted(list(T.iter('ba'))))
     self.assertListEqual(sorted(['baar', 'baahus']), sorted(list(T.iter('baa'))))
     self.assertListEqual(sorted(['b', 'baar', 'baahus']), sorted(list(T.iter('b'))))
     self.assertListEqual(sorted([]), sorted(list(T.iter('others'))))
Example #7
def main():
    for i in range(0, index_count):
        t = trie()
        with open(pickleFolder + '/' + sys.argv[1] + '/' + str(i) + '.pik', 'rb') as f:
            t = pickle.load(f)
        # a = sorted(t.iter(''))
        for j in t:
            if j != "":
                print j, t[j]
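
Both this loader and the createIndex writer in Example #29 below rely on patricia tries being picklable like any other Python object; a minimal round-trip sketch:

import pickle
from patricia import trie

t = trie(foo=1, foobar=2)
blob = pickle.dumps(t)           # serialize the whole trie
restored = pickle.loads(blob)    # and bring it back
assert restored['foo'] == 1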
Example #8
 def testWindowMatching(self):
     T = trie(foo=1, foobar=2)
     self.assertListEqual(['foo'], list(T.keys("foobar", 0, 3)))
     self.assertListEqual([1], list(T.values("a foobar!", 2, 7)))
     self.assertListEqual([('foo', 1), ('foobar', 2)],
                          list(T.items("a foobar!", 2, 8)))
     self.assertEqual('foo', T.key("foobar", 0, 3))
     self.assertEqual(1, T.value("a foobar!", 2, 7))
     self.assertEqual(('foobar', 2), T.item("a foobar!", 2, 8))
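
These window arguments bound where a match may start and end; a short sketch of the semantics, using the same trie as in the test above:

T = trie(foo=1, foobar=2)
# keys(s, start, end) yields every key matching s at offset start that also
# ends by end; 'foobar' needs 6 characters, so only 'foo' fits in 3.
assert list(T.keys("foobar", 0, 3)) == ['foo']
# the singular forms return the longest match that fits the window.
assert T.key("foobar", 0, 6) == 'foobar'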
Example #9
 def testKeyPresenceOnly(self):
     T = trie(foo=True, baar=True, baarhus=True, bazar=True)
     txt = 'The fool baal baarhus in the bazar!'
     presence = [4, 14, 29]
     for i in range(len(txt)):
         if T.value(txt, i, default=False):
             self.assertTrue(i in presence,
                             '{} {} "{}"'.format(str(presence), i, txt[i:]))
             presence.remove(i)
     self.assertEqual(0, len(presence), str(presence))
Example #10
 def testStrRepr(self):
     T = trie()
     T['ba'] = 2
     T['baz'] = "hey's"
     T['fool'] = 1.5
     result = repr(T)
     self.assertTrue(result.startswith("trie({"), result)
     self.assertTrue(result.endswith('})'), result)
     self.assertTrue("'ba': 2" in result, result)
     self.assertTrue("'baz': \"hey's\"" in result, result)
     self.assertTrue("'fool': 1.5" in result, result)
Example #11
 def testGetItems(self):
     T = trie()
     T['ba'] = 2
     T['baz'] = 3
     T['fool'] = 1
     self.assertEqual(('ba', 2), T.item('bar'))
     self.assertEqual(1, T.value('fool'))
     self.assertRaises(KeyError, T.key, 'foo')
     T[''] = 0
     self.assertEqual(('', 0), T.item(''))
     self.assertEqual('', T.key('foo'))
Example #12
    def change_query(self):
        self.worker.display_query.emit(-1, set(), set())
        cleared = [False]
        def clear():
            # Clear the result view only once per query change.
            if not cleared[0]:
                cleared[0] = True
                self.results.clear()

        includes = {
            element.value for element in elements if element.type == 'inc'}
        excludes = {
            element.value for element in elements if element.type == 'exc'}
        columns = []
        sort = []
        for element in elements:
            if (
                    element.type in ('col', 'sort_asc', 'sort_desc', 'sort_rand') 
                    and element.value 
                    and element.value not in columns
                    ):
                columns.append(element.value)
            if element.type == 'sort_asc':
                sort.append(('asc', element.value))
            elif element.type == 'sort_desc':
                sort.append(('desc', element.value))
            elif element.type == 'sort_rand':
                sort.append(('rand', element.value))

        if includes != self.includes or excludes != self.excludes:
            self.query_unique += 1
            known_columns = patricia.trie()
            self.raw = []
            self.includes = includes
            self.excludes = excludes
            clear()
            if self.includes or self.excludes:
                self._reset_query(self.query_unique)

        if columns != self.columns or sort != self.sort:
            self.columns = columns
            self.sort = sort
            if self.columns:
                self.results.setColumnCount(len(self.columns))
                self.results.setHeaderLabels(self.columns)
                self.results.header().show()
            else:
                self.results.header().hide()
                self.results.setColumnCount(1)
                self.columns = ['filename']
                self.sort.append(('asc', 'filename'))
            clear()
            self._redisplay()
Example #13
def get_all_length(sdate):
    print 'Getting all prefix lengths from RIB...'

    len_count = dict() # length:count
    trie = patricia.trie(None)

    mydate = sdate[0:4] + '.' + sdate[4:6]
    dir_list = os.listdir(datadir+'routeviews.org/bgpdata/'+mydate+'/RIBS/')
    rib_location = datadir+'routeviews.org/bgpdata/'+mydate+'/RIBS/'
    for f in dir_list:
        if not f.startswith('.'):
            rib_location = rib_location + f  # any RIB from the same month is fine
            break

    if rib_location.endswith('txt.gz'):
        subprocess.call('gunzip '+rib_location, shell=True)  # unpack                        
        rib_location = rib_location.replace('.txt.gz', '.txt')
    elif not rib_location.endswith('txt'):  # .bz2/.gz file exists
        parse_mrt(rib_location, rib_location+'.txt')
        os.remove(rib_location)  # then remove .bz2/.gz
        rib_location = rib_location + '.txt'
    # now rib file definitely ends with .txt  
    with open(rib_location, 'r') as f:  # get prefixes from RIB
        for line in f:
            try:
                pfx = line.split('|')[5]
                pfx = ip_to_binary(pfx, '0.0.0.0')
            except (IndexError, ValueError):  # incomplete entries may exist
                continue
            try:
                test = trie[pfx]  # prefix already present
            except KeyError:
                trie[pfx] = True
    # compress the RIB back into .gz
    if not os.path.exists(rib_location+'.gz'):
        pack_gz(rib_location)

    pfx_count = 0
    for pfx in trie.iter(''):
        if pfx != '':
            pfx_count += 1
            try:
                len_count[len(pfx)] += 1
            except KeyError:
                len_count[len(pfx)] = 1
    del trie 

    return [len_count, pfx_count]
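
The ip_to_binary helper used above is not shown in this snippet; a plausible sketch, assuming it turns a dotted prefix such as '192.0.2.0/24' into its binary prefix string (the second parameter is kept only to mirror the call site's fallback IP):

def ip_to_binary(pfx, default_ip):
    ip, _, plen = pfx.partition('/')
    if not ip:
        ip = default_ip
    bits = ''.join(format(int(octet), '08b') for octet in ip.split('.'))
    return bits[:int(plen)] if plen else bits

print(ip_to_binary('192.0.2.0/24', '0.0.0.0'))  # '110000000000000000000010'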
Example #14
 def __init__(self, maxsize):
     # Window parameters
     self.maxsize = maxsize
     self.size = 0
     self.start = 0  # Window start
     self.end = 0  # Window end
     self.trie = patricia.trie(None)  # Store still active updates.
     # BGP dynamics count variables
     self.wadi = 0
     self.aadi = 0
     self.wwdu = 0
     self.aadut1 = 0
     self.aadut2 = 0
     self.wadu = 0
     self.aw = 0
Example #15
 def testSetGetDel(self):
     T = trie()
     T['foo'] = 1
     T['bar'] = 2
     T['baz'] = 3
     self.assertTrue('foo' in T)
     self.assertTrue('bar' in T)
     self.assertTrue('baz' in T)
     self.assertEqual(T['foo'], 1)
     self.assertEqual(T['bar'], 2)
     self.assertEqual(T['baz'], 3)
     self.assertRaises(KeyError, T.__getitem__, 'ba')
     self.assertRaises(KeyError, T.__getitem__, 'fool')
     del T['bar']
     self.assertRaises(KeyError, T.__getitem__, 'bar')
     self.assertEqual(T['baz'], 3)
Example #16
 def testOffsetMatching(self):
     T = trie()
     T['foo'] = 1
     T['baar'] = 2
     T['baarhus'] = 3
     T['bazar'] = 4
     txt = 'The fool baal baarhus in the bazar!'
     keys = []
     values = []
     items = []
     for i in range(len(txt)):
         values.extend(T.values(txt, i))
     for i in range(len(txt)):
         keys.extend(T.keys(txt, i))
     for i in range(len(txt)):
         items.extend(T.items(txt, i))
     self.assertListEqual([1, 2, 3, 4], values)
     self.assertListEqual(['foo', 'baar', 'baarhus', 'bazar'], keys)
     self.assertListEqual([('foo', 1), ('baar', 2), ('baarhus', 3), ('bazar', 4)], items)
Example #17
def test():
    # 1. build a trie
    t = trie(zero=0, one=1, two=2, three=3, four=4, five=5, six=6, seven=7,
             eight=8, nine=9, ten=10, eleven=11, twelve=12, thirteen=13,
             fourteen=10, fifteen=15, sixteen=16, seventeen=17, eighteen=18,
             nineteen=19, twenty=20, thirty=30, fourty=40, fifty=50, sixty=60,
             seventy=70, eighty=80, ninety=90, hundred=100)

    # 2. scan 2000 "sentences" with it
    for _ in range(1000):
        # scanning for the longest matches only in sentence 1
        i = S1[0]
        #print(TEXT[i:S1[1]])
        while i < S1[1]:
            k, v = t.item(TEXT, i, S1[1], None)
            if k is not None:
                #print(v)
                i += len(k)
            else:
                i += 1

        # scanning for all matches in sentence 2
        i = S2[0]
        #print(TEXT[i:S2[1]])
        s = 0
        while i < S2[1]:
            for k, v in t.items(TEXT, i, S2[1]):
                #print(v)
                s += v
            i += 1
        if s != 142:
            raise RuntimeError(str(s))

    # 3. make a real dictionary of all keys in the trie
    if 'nine' not in dict(t.items()):
        raise RuntimeError(str(dict(t.items())))
Example #18
 def testIterItems(self):
     T = trie(ba=2, baz=3, fool=1)
     self.assertListEqual(['ba', 'baz'], list(T.keys('bazar')))
     self.assertListEqual([('fool', 1)], list(T.items('fools')))
     self.assertListEqual([], list(T.values('others')))
Example #19
 def testIterator(self):
     T = trie(ba=2, baz=3, fool=1)
     self.assertListEqual(sorted(['fool', 'ba', 'baz']), sorted(list(T)))
     T[''] = 0
     self.assertEqual(sorted(['', 'fool', 'ba', 'baz']), sorted(list(T)))
Example #20
 def testFakeDefault(self):
     T = trie()
     fake = _NonTerminal()
     self.assertEqual(fake, T.value('foo', default=fake))
Example #21
 def testGetExactMatch(self):
     T = trie(exact=5)
     self.assertListEqual(['exact'], list(T.keys('exact')))
     self.assertListEqual([5], list(T.values('exact')))
     self.assertListEqual([('exact', 5)], list(T.items('exact')))
Example #22
 def clear_index(self):
   self.trie = patricia.trie()
   self.search_trie = patricia.trie()
   self.comments = collections.defaultdict(set)
   self.fns = {}
Example #23
 def testSingleEntry(self):
     T = trie(foo=5)
     self.assertListEqual(['foo'], list(T.keys()))
     self.assertListEqual([5], list(T.values()))
     self.assertListEqual([('foo', 5)], list(T.items()))
Example #24
 def testEmptyStringKey(self):
     T = trie(2, foo=1)
     self.assertTrue('foo' in T)
     self.assertTrue('' in T)
     del T['']
     self.assertRaises(KeyError, T.__getitem__, '')
Example #25
class PersianStemmer(object):
    lexicon = trie()
    mokassarDic = trie()
    cache = trie()
    verbDic = trie()
    _ruleList = []

    verbAffix = [
        "*ش", "*نده", "*ا", "*ار", "وا*", "اثر*", "فرو*", "پیش*", "گرو*", "*ه",
        "*گار", "*ن"
    ]
    suffix = [
        "كار", "ناك", "وار", "آسا", "آگین", "بار", "بان", "دان", "زار", "سار",
        "سان", "لاخ", "مند", "دار", "مرد", "کننده", "گرا", "نما", "متر"
    ]
    prefix = ["بی", "با", "پیش", "غیر", "فرو", "هم", "نا", "یک"]
    prefixException = ["غیر"]
    suffixZamir = ["م", "ت", "ش"]
    suffixException = ["ها", "تر", "ترین", "ام", "ات", "اش"]

    PATTERN_FILE_NAME = os.path.dirname(__file__) + "/data/Patterns.fa"
    VERB_FILE_NAME = os.path.dirname(__file__) + "/data/VerbList.fa"
    DIC_FILE_NAME = os.path.dirname(__file__) + "/data/Dictionary.fa"
    MOKASSAR_FILE_NAME = os.path.dirname(__file__) + "/data/Mokassar.fa"
    patternCount = 1
    enableCache = True
    enableVerb = False

    def __init__(self):
        try:
            self.loadRule()
            self.loadLexicon()
            self.loadMokassarDic()
            if self.enableVerb:
                self.loadVerbDic()
        except Exception as ex:
            print(ex)

    def loadData(self, resourceName):
        result = []
        with open(resourceName, 'r', encoding="utf-8") as reader:
            result = [
                line.strip("\r\n ") for line in reader if line.strip("\r\n ")
            ]
        return result

    def loadVerbDic(self):
        if len(PersianStemmer.verbDic) > 0:
            return

        lines = self.loadData(PersianStemmer.VERB_FILE_NAME)
        for line in lines:
            arr = line.split("\t")
            PersianStemmer.verbDic[arr[0].strip()] = VerbStem(
                arr[1].strip(), arr[2].strip())

    def str2bool(self, v):
        return v.lower() in ("yes", "true", "t", "1")

    def loadRule(self):
        if len(PersianStemmer._ruleList) > 0:
            return

        lines = self.loadData(PersianStemmer.PATTERN_FILE_NAME)
        for line in lines:
            arr = line.split(",")
            PersianStemmer._ruleList.append(
                StemmingRule(arr[0], arr[1], arr[2], int(arr[3]),
                             self.str2bool(arr[4])))
        #PersianStemmer._ruleList = [StemmingRule(arr[0], arr[1], arr[2], int(arr[3]), self.str2bool(arr[4])) for line in lines for arr in line.split(",")]

    def loadLexicon(self):
        if len(PersianStemmer.lexicon) > 0:
            return

        lines = self.loadData(PersianStemmer.DIC_FILE_NAME)
        for line in lines:
            PersianStemmer.lexicon[line.strip("\r\n ")] = True

    def loadMokassarDic(self):
        if len(PersianStemmer.mokassarDic) > 0:
            return

        lines = self.loadData(PersianStemmer.MOKASSAR_FILE_NAME)
        for line in lines:
            arr = line.split("\t")
            PersianStemmer.mokassarDic[arr[0].strip()] = arr[1].strip()

    def strip_accents(self, s):
        return ''.join(c for c in unicodedata.normalize('NFD', s)
                       if unicodedata.category(c) != 'Mn')

    def normalization(self, s):
        newString = []
        for ch in s:
            if ch == 'ي':
                newString.append('ی')
            elif ch in ['ة', 'ۀ']:
                newString.append('ه')
            elif ch in ['‌', '‏']:
                newString.append(' ')
            elif ch == 'ك':
                newString.append('ک')
            elif ch == 'ؤ':
                newString.append('و')
            elif ch in ['إ', 'أ']:
                newString.append('ا')
            elif ch in [
                    '\u064B',  #FATHATAN
                    '\u064C',  #DAMMATAN
                    '\u064D',  #KASRATAN
                    '\u064E',  #FATHA
                    '\u064F',  #DAMMA
                    '\u0650',  #KASRA
                    '\u0651',  #SHADDA
                    '\u0652'
            ]:  #SUKUN
                pass
            else:
                newString.append(ch)

        return ''.join(newString)

    def validation(self, sWord):
        return (sWord in PersianStemmer.lexicon)

    def removeZamir(self, sInput, bState):
        sRule = "^(?P<stem>.+?)((?<=(ا|و))ی)?(ها)?(ی)?((ات)?( تان|تان| مان|مان| شان|شان)|ی|م|ت|ش|ء)$"
        if bState:
            sRule = "^(?P<stem>.+?)((?<=(ا|و))ی)?(ها)?(ی)?(ات|ی|م|ت|ش| تان|تان| مان|مان| شان|شان|ء)$"

        return self.extractStem(sInput, sRule)

    def getMokassarStem(self, sWord):

        if sWord in PersianStemmer.mokassarDic:
            return PersianStemmer.mokassarDic[sWord]
        else:
            sNewWord = self.removeZamir(sWord, True)
            if sNewWord in PersianStemmer.mokassarDic:
                return PersianStemmer.mokassarDic[sNewWord]
            else:
                sNewWord = self.removeZamir(sWord, False)
                if sNewWord in PersianStemmer.mokassarDic:
                    return PersianStemmer.mokassarDic[sNewWord]
        return ""

    def verbValidation(self, sWord):
        if sWord.find(' ') > -1:
            return ""

        j = 0
        for affix in PersianStemmer.verbAffix:
            if (j == 0 and (sWord[-1] == 'ا' or sWord[-1] == 'و')):
                sTemp = affix.replace("*", sWord + "ی")
            else:
                sTemp = affix.replace("*", sWord)

            if self.normalizeValidation(sTemp, True):
                return affix
            j = j + 1
        return ""

    def getPrefix(self, sWord):
        result = [
            sPrefix for sPrefix in PersianStemmer.prefix
            if sWord.startswith(sPrefix)
        ]
        if len(result) > 0:
            return result[0]
        return ""

    def getPrefixException(self, sWord):
        result = [
            sPrefix for sPrefix in PersianStemmer.prefixException
            if sWord.startswith(sPrefix)
        ]
        if len(result) > 0:
            return result[0]
        return ""

    def getSuffix(self, sWord):
        result = [
            sSuffix for sSuffix in PersianStemmer.suffix
            if sWord.endswith(sSuffix)
        ]
        if len(result) > 0:
            return result[0]
        return ""

    def inRange(self, d, f, t):
        return f <= d <= t

    def normalizeValidation(self, sWord, bRemoveSpace):
        sWord = sWord.strip()
        l = len(sWord) - 2
        result = self.validation(sWord)

        if not result and sWord.find('ا') == 0:
            result = self.validation(sWord.replace("ا", "آ", 1))

        if (not result and self.inRange(sWord.find('ا'), 1, l)):
            result = self.validation(sWord.replace('ا', 'أ'))

        if (not result and self.inRange(sWord.find('ا'), 1, l)):
            result = self.validation(sWord.replace('ا', 'إ'))

        if (not result and self.inRange(sWord.find("ئو"), 1, l)):
            result = self.validation(sWord.replace("ئو", "ؤ"))

        if (not result and sWord.endswith("ء")):
            result = self.validation(sWord.replace("ء", ""))

        if (not result and self.inRange(sWord.find("ئ"), 1, l)):
            result = self.validation(sWord.replace("ئ", "ی"))

        if (bRemoveSpace):
            if (not result and self.inRange(sWord.find(' '), 1, l)):
                result = self.validation(sWord.replace(" ", ""))

        # e.g. دیندار vs. دین دار
        if (not result):
            sSuffix = self.getSuffix(sWord)
            if (sSuffix):
                sTemp = sWord.replace(sSuffix, " " + sSuffix)
                if sSuffix == "مند":
                    sTemp = sWord.replace(sSuffix, "ه " + sSuffix)
                result = self.validation(sTemp)

        if (not result):
            sPrefix = self.getPrefix(sWord)
            if sPrefix:
                if (sWord.startswith(sPrefix + " ")):
                    result = self.validation(
                        sWord.replace(sPrefix + " ", sPrefix))
                else:
                    result = self.validation(
                        sWord.replace(sPrefix, sPrefix + " "))

        if (not result):
            sPrefix = self.getPrefixException(sWord)
            if (sPrefix):
                if (sWord.startswith(sPrefix + " ")):
                    result = self.validation(
                        sWord.replace(sPrefix + " ", "", 1))
                else:
                    result = self.validation(sWord.replace(sPrefix, "", 1))

        return result

    def isMatch(self, sInput, sRule):
        return re.search(sRule, sInput) is not None

    def extractStem(self, sInput, sRule, sReplacement=r"\g<stem>"):
        return re.sub(sRule, sReplacement, sInput).strip()

    def getVerb(self, input):
        if input in PersianStemmer.verbDic:
            vs = PersianStemmer.verbDic[input]
            if self.validation(vs.getPresent()):
                return vs.getPresent()
            return vs.getPast()
        return ""

    def PatternMatching(self, input, stemList=None):
        if stemList is None:  # avoid the mutable default argument pitfall
            stemList = []
        terminate = False
        s = ""
        sTemp = ""
        for rule in PersianStemmer._ruleList:
            if terminate:
                return terminate

            sReplace = rule.getSubstitution().split(";")
            pattern = rule.getBody()

            if not self.isMatch(input, pattern):
                continue

            k = 0
            for t in sReplace:
                if k > 0:
                    break

                s = self.extractStem(input, pattern, t)
                if len(s) < rule.getMinLength():
                    continue

                if rule.getPoS() == 'K':  # Kasre Ezafe
                    if len(stemList) == 0:
                        sTemp = self.getMokassarStem(s)
                        if sTemp:
                            stemList.append(sTemp)  #, pattern + " [جمع مکسر]")
                            k = k + 1
                        elif self.normalizeValidation(s, True):
                            stemList.append(s)  #, pattern)
                            k = k + 1
                        else:
                            pass
                            #addToLog("", pattern + " ::" + s + "}")
                elif rule.getPoS() == 'V':  # Verb
                    sTemp = self.verbValidation(s)
                    if len(sTemp) == 0:
                        stemList.append(s)  # pattern + " : [" + sTemp + "]"
                        k = k + 1
                    else:
                        pass
                        #addToLog("", pattern + " ::تمام وندها}")
                else:
                    if self.normalizeValidation(s, True):
                        stemList.append(s)
                        if rule.getState():
                            terminate = True
                            k = k + 1
                    else:
                        pass
                        #addToLog("", pattern + " ::" + s + "}")
        return terminate

    def run(self, input):
        input = self.normalization(input).strip()

        if not input:
            return ""

        # Integers and English text pass through unchanged
        if Utils.isEnglish(input) or Utils.isNumber(input) or len(input) <= 2:
            return input

        if self.enableCache and input in self.cache:
            return self.cache[input]

        s = self.getMokassarStem(input)
        if self.normalizeValidation(input, False):
            #stemList.add(input/*, "[فرهنگ لغت]"*/)
            if self.enableCache:
                self.cache[input] = input
            return input
        elif s:
            #addToLog(s/*, "[جمع مکسر]"*/)
            #stemList.add(s)
            if self.enableCache:
                self.cache[input] = s
            return s

        stemList = []
        terminate = self.PatternMatching(input, stemList)

        if self.enableVerb:
            s = self.getVerb(input)
            if s:
                stemList = [s]

        if len(stemList) == 0:
            if self.normalizeValidation(input, True):
                #stemList.add(input, "[فرهنگ لغت]")
                if self.enableCache:
                    self.cache[input] = input  #stemList.get(0))
                return input  #stemList.get(0)
            stemList.append(input)  #, "")

        if terminate and len(stemList) > 1:
            return self.nounValidation(stemList)

        if self.patternCount != 0:
            stemList.sort(reverse=self.patternCount >= 0)
            stemList = stemList[abs(self.patternCount) - 1:]

        if self.enableCache:
            self.cache[input] = stemList[0]

        return stemList[0]

    def nounValidation(self, stemList):
        stemList.sort()
        lastStem = stemList[-1]

        if lastStem.endswith("ان"):
            return lastStem
        else:
            firstStem = stemList[0]
            secondStem = stemList[1].replace(" ", "")

            for sSuffix in PersianStemmer.suffixZamir:
                if secondStem == firstStem + sSuffix:
                    return firstStem

        return lastStem
Example #28
from random import choice
from string import ascii_uppercase


def random_string(length):
    """Produce a random string made of *length* uppercase ascii
    characters"""
    return ''.join(choice(ascii_uppercase) for i in range(length))


strings = [random_string(32) for i in range(10000)]
matches = [s for s in strings if s.startswith('AA')]
print(matches)



from patricia import trie
strings_dict = {s:0 for s in strings}
# A dictionary where all values are 0
strings_trie = trie(**strings_dict)
matches = list(strings_trie.iter('AA'))
print(matches)
# If you look closely, the timing for this input size is 60.1 μs, which is about 30 times
# faster (1.76 ms = 1760 μs) than linear search!
# Note that if we want to return all the prefixes that match, the running time will be
# proportional to the number of results that match the prefix. Therefore, when designing
# timing benchmarks, care must be taken to ensure that we are always returning the same
# number of results.
# The scaling properties of a trie versus a linear scan for datasets of different sizes
# that contain ten prefix matches are shown in the following table:

# Algorithm     N=10000 (μs)    N=20000 (μs)    N=30000 (μs)    Time
# Trie          17.12           17.27           17.47           O(S)
# Linear scan   1978.44         4075.72         6398.06         O(N)
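
To reproduce the comparison, one quick sketch (reusing the strings and strings_trie objects built above; absolute numbers vary by machine, only the ratio matters):

import timeit

linear = timeit.timeit(lambda: [s for s in strings if s.startswith('AA')],
                       number=100)
prefix = timeit.timeit(lambda: list(strings_trie.iter('AA')), number=100)
print(linear / prefix)  # the text above measured roughly a 30x speedup at N=10000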
Example #29
def createIndex(start):
    #create stop word file
    stopWords = create_stopword_list(stopwordsFile)
    counter = 0
    #list of create patricia tries for various alphabets
    index = {}
    for i in range(0, index_count):
        index[i] = trie()

    #create index
    for folder in range(int(start), int(start) + 1):
        print "working on folder" + str(folder)
        #folder = os.walk(corpusFolder).next()[1]:
        for file in os.walk(corpusFolder + '/' + str(folder)).next()[2]:
            try:
                #print file
                docID = file
                #ignore files other than the html files (not of the form *.*)
                pattern = re.compile(r'\.')
                if pattern.search(file) is not None:
                    continue

                #read html file

                html_doc = ""
                try:
                    html_doc = open(corpusFolder + '/' + str(folder) + '/' +
                                    file).read()
                except:
                    print "could not open file " + file
                    continue
                #create soup
                soup = BeautifulSoup(html_doc)
                #remove non-content tags like javascript, css, iframe
                for s in soup(['script', 'iframe', 'style']):
                    s.extract()
                textList = []
                try:
                    text = soup.get_text().encode("utf-8").lower()
                    pattern = re.compile("[^a-z0-9]")
                    textList = re.split(pattern, text)
                except Exception as e:
                    print e
                    continue

                #remove stop words
                textList = remove_stopwords(textList, stopWords)
                #sort the text list to reduce the number of IOs
                textList = sorted(textList)
                #print textList
                #add words to appropriate patricia-trie
                for word in textList:
                    #perform stemming
                    word = stem(word)
                    #remove small words
                    if len(word) <= min_word_length:
                        #print word#"word length insufficient"
                        continue
                    #print word
                    try:
                        if word not in index[sec(word)]:
                            # first occurrence: create the word's posting list
                            index[sec(word)][word] = PostingStruct(0, {})
                            #print index[sec(word)][word].df

                        # update this document's term count and the df counter
                        if file in index[sec(word)][word].posting:
                            index[sec(word)][word].posting[file] += 1
                        else:
                            index[sec(word)][word].posting[file] = 1
                            index[sec(word)][word].df += 1
                    except Exception as e:
                        print word
                        print e, word, sec(word)
                        #return
                #break
                """
				if counter == 3:
					break
				counter+=1
				"""
            except Exception:  # skip any folder entry that fails to parse
                continue
    #once all the tries have been created pickle them
    for i in range(0, index_count):
        with open(pickleFolder + '/' + start + '/' + str(i) + '.pik',
                  'wb') as f:
            pickle.dump(index[i], f, -1)
    print "successfully completed !"
Example #30
########################################################################################################################
''' Heapsort used to sort the data '''


def heapsort(iterable):
    h = []
    for value in iterable:
        heapq.heappush(h, value)
    return [heapq.heappop(h) for i in range(len(h))]


contadorArtistas = 1  # Artist counter.
contadorMusicas = 0  # Song counter.
contadorIndex = 0

arvoreMusica = trie()  # Trie of song names.
arvoreArtistas = trie()  # Trie of artist names.
auxArtista = trie()  # Auxiliary trie of artist names.
auxMusica = trie()  # Auxiliary trie of song names.
Index = list()
########################################################################################################################

########################################################################################################################
''' Reading the JSON file '''
arquivo_dados = raw_input("Enter the data file name (without the .json): ")
arquivo_dados = arquivo_dados + ".json"
with open(arquivo_dados) as f:
    data = f.read()
    jsondata = json.loads(data)

    for row in jsondata['items']:
Example #31
def create_corpus(words):
	maxFreq = {}
	unwanted = ['.b', '.x', '.n', '.c']
	wanted = ['.i', '.t', '.a', '.w', '.k']
	i = 0
	t = trie()
	docID = 0
	global docCount 
	while i < len(words):
		freq = 1
		if words[i] == '.i':
			i=i+1
			docID = words[i]
			docCount +=1
			
		elif words[i] in unwanted:
			i = i + 1
			# guard the index before testing words[i]
			while i < len(words) and words[i] not in wanted + unwanted:
				i = i + 1
#				print i, words[i], len(words)
			
		else:
			try:	
				# stem the word before insertion
				words[i] = stem(words[i])
				# if new word
				if t[str(words[i])] == False:
					t[str(words[i])] = {}
				if docID in t[str(words[i])]:
					t[str(words[i])][docID] += 1
				else:
					t[str(words[i])][docID] = 1
				"""
				t[]
				f =  getFreq(t[str(words[i])],docID)
				#print str(words[i])
				#print f
				if f == -1:
					t[str(words[i])][docID] = freq#.append((docID,freq))
				else:
					#t[str(words[i])].remove((docID,f))
					if len(t[str(words[i])]) == 0:
						t[str(words[i])] = {}#[(docID,f+1)]
					else:
						t[str(words[i])][docID] = f+1#.append((docID,f+1))
				"""
			except KeyError:
				# word not present in the trie yet
				t[str(words[i])] = {}
				t[str(words[i])][docID] = 1
			contFreq = t[str(words[i])][docID]
			if docID in maxFreq:
				if maxFreq[docID] < contFreq:
					maxFreq[docID] = contFreq
			else:
				maxFreq[docID] = contFreq
			i = i + 1
	#print "docCount is ", docCount
	with open('maxFreq.pik', 'wb') as f:
		pickle.dump([docCount, maxFreq], f, -1)
	with open('trie.pik', 'wb') as f:
		pickle.dump( t, f, -1)
	#print t.keys()
	"""for word in t.keys():
Example #33
 def testIsPrefix(self):
     T = trie(bar=2, baz=3, fool=1)
     self.assertTrue(T.isPrefix('ba'))
     self.assertFalse(T.isPrefix('fools'))
     self.assertTrue(T.isPrefix(''))
Example #34
 def testValues(self):
     T = trie()
     T['ba'] = 2
     T['baz'] = "hey's"
     T['fool'] = 1.5
     self.assertListEqual(sorted(["2", "hey's", "1.5"]), sorted([str(v) for v in T.values()]))
Example #35
    def __init__(self, obj, config=''):

        #print(config)

        #_config = """input = '@text1', mapping = 'low = 1, medium = 2, high = 3'"""

        super(Enum, self).__init__(obj, config=config)

        #print(config)
        #print(self.config)
        #print(config == _config)

        if not isinstance(self.config, str) or len(self.config) < 1:
            raise ValueError('Enum plugin function requires a config string')

        inputkeyword = 'input'
        mappingkeyword = 'mapping'

        if inputkeyword not in self.config:
            raise ValueError(
                'An input keyword argument must be specified for the Enum plugin function'
            )

        if mappingkeyword not in self.config:
            raise ValueError(
                'A mapping keyword argument must be specified for the Enum plugin function'
            )

        attrexpr = p.Combine(
            p.Literal("'").suppress() +
            (p.Literal('@') | p.Literal('!')).suppress() +
            p.Word(p.alphanums) + p.Literal("'").suppress())

        inputexpr = p.CaselessKeyword(inputkeyword).suppress() + p.Literal(
            '=').suppress() + attrexpr

        mappingexpr = p.CaselessKeyword(mappingkeyword).suppress() + p.Literal(
            '=').suppress() + p.sglQuotedString()

        expr = inputexpr + p.Literal(',').suppress() + mappingexpr

        self.input = None
        self.mapping = None

        _matches = list(expr.scanString(self.config))

        if len(_matches) > 1:
            raise IndexError(
                'There should only be one input and mapping keyword set in the Enum plugin function\'s config but %s was received'
                % _matches)

        #print(_matches)

        rawconfig = _matches[0]

        mappingdict = {}

        for mapitem in rawconfig[0][1][1:-1].split(','):
            k, v = mapitem.split('=')
            mappingdict[k.strip()] = v.strip()

        self.input = rawconfig[0][0]
        self.mapping = pt.trie()
        for k, v in mappingdict.items():
            self.mapping[k.lower()] = v
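
As a worked example, given the config string from the commented-out line near the top, input = '@text1', mapping = 'low = 1, medium = 2, high = 3', the scan should leave self.input == 'text1' (attrexpr suppresses the quotes and the '@' sigil) and self.mapping as a trie mapping 'low' -> '1', 'medium' -> '2', 'high' -> '3'.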
Example #39
 def testLongRootValue(self):
     T = trie(1, 2)
     self.assertEqual((1, 2), T[''])
Example #40
import os
import string
import sys


# Add common-lib code to system path
sources = os.getenv('BISCUIT_DIR')
if sources not in sys.path: sys.path.append(sources)
from common_lib.read_config import enabled_modules


# Add trie module to path
trie_module = enabled_modules['hashtag']
if trie_module:
    print 'trie module: ', trie_module
    if trie_module not in sys.path: sys.path.append(trie_module)
    import patricia
    _dictionary = patricia.trie()
else:
    _dictionary = {}


# Hand annotated hashtags
annotations = {}
with open(os.path.join(enabled_modules['hashtag'], 'ht-expansions.txt'), 'r') as f:
    for line in f.readlines():
        ht,expansion = line.strip('\n').split(' || ')
        annotations[ht] = expansion



# For development: allow module to be run-able
def main():
Example #41
def del_tabletran_updates(peer, sdate, cl_name):
    f_results = open(hdname+'tmp/'+peer+'_result.txt', 'r')
    for line in f_results:  # get all affection info of this peer
        line = line.replace('\n', '')
        attr = line.split(',')
        if attr[0] == '#START':
            continue
        print line
        print 'get session reset time...'
        stime_unix = int(attr[0])
        endtime_unix = int(attr[1])
        start_datetime = datetime.datetime.fromtimestamp(stime_unix) +\
                datetime.timedelta(hours=-8)
        end_datetime = datetime.datetime.fromtimestamp(endtime_unix) +\
                datetime.timedelta(hours=-8)
        print 'from ', start_datetime, ' to ', end_datetime

        updatefile_list = open(hdname+'metadata/'+sdate+'/updt_filelist_'+cl_name, 'r')
        for updatefile in updatefile_list:  
            updatefile = updatefile.replace('\n', '')
            file_attr = updatefile.split('.')
            fattr_date = file_attr[5]
            fattr_time = file_attr[6]
            dt = datetime.datetime(int(fattr_date[0:4]),int(fattr_date[4:6]), int(fattr_date[6:8]),int(fattr_time[0:2]), int(fattr_time[2:4]))
            if not start_datetime + datetime.timedelta(minutes =\
                    -15) <= dt <= end_datetime:  # filename not OK
                continue
            print 'session reset exists in: ', updatefile
            size_before = os.path.getsize(updatefile)
            # unpack
            myfilename = updatefile.replace('txt.gz', 'txt')  # .txt file
            subprocess.call('gunzip -c ' + updatefile + ' > ' +\
                    myfilename, shell=True)
            # only .txt from now on!
            oldfile = open(myfilename, 'r')
            newfile = open(hdname+'tmp/'+myfilename.split('/')[-1], 'w')

            counted_pfx = patricia.trie(None)
            for updt in oldfile:  # loop over each update
                updt = updt.replace('\n', '')
                update_attr = updt.split('|')
                try:
                    if update_attr[3] == peer and \
                            stime_unix < int(update_attr[1]) < endtime_unix:
                        # culprit update confirmed
                        pfx = update_attr[5]
                        try:  # Test whether the trie has the pfx
                            test = counted_pfx[pfx]
                            newfile.write(updt+'\n')  # pfx exists
                        except KeyError:  # Node does not exist
                            counted_pfx[pfx] = True
                    else:  # not a culprit update
                        newfile.write(updt+'\n')
                except (IndexError, ValueError):  # malformed update line
                    continue

            oldfile.close()
            newfile.close()

            os.remove(updatefile)  # remove old .gz file
            # compress .txt into txt.gz to replace the old file
            subprocess.call('gzip -c '+hdname+'tmp/'+myfilename.split('/')[-1]+\
                    ' > '+updatefile, shell=True)
            size_after = os.path.getsize(updatefile)
            os.remove(updatefile.replace('txt.gz','txt'))
            print 'size(b):', size_before, ',size(a):', size_after
                   
        updatefile_list.close()
    f_results.close()
Example #42
    def constructTrieIndex(self):
        self.trieIndex = trie('root')
        for termPage, postingList in self.index.iteritems():
            idfData = math.log(float(self.docCount)/float(self.df[termPage]), 10)
            self.trieIndex[termPage] = (postingList, self.tf[termPage], idfData)
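
For reference, idfData above is the standard log-scaled inverse document frequency, idf = log10(docCount / df): with docCount = 1000 documents and a term occurring in df = 10 of them, idf = log10(100) = 2, so rarer terms receive larger weights in the (postingList, tf, idf) tuples stored under each term.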