def get_matrix_rstr(mode, dic_json, desc_arg=None):
    """Build a sparse count matrix of repeated substrings found by ``Rstr_max``.

    Args:
        mode: Underscore-separated configuration string. Its last field must
            look like ``"<lenmax>,<supportmax>"``; its second field selects
            ``relative_transformation`` when equal to ``'rel'``.
        dic_json: Mapping whose values are mappings with a ``"text_line"``
            entry (one text per entry, in iteration order).
        desc_arg: ``None`` when processing the training corpus (descriptors
            are discovered and returned). Otherwise the list of descriptors
            returned by a previous training call; only those are counted.

    Returns:
        A ``(desc, X)`` pair. ``desc`` is the list of descriptors discovered
        (empty when ``desc_arg`` is given). ``X`` holds one ``{column: count}``
        dict per text, followed by one extra "virtual" row that has a 1 in
        every descriptor column; if the mode is ``'rel'`` it is whatever
        ``relative_transformation`` returns for that list.

    Raises:
        IndexError, ValueError: If ``mode`` does not carry a parsable
            ``"<lenmax>,<supportmax>"`` last field.
    """
    texts = [infos["text_line"] for infos in dic_json.values()]

    # Parse the thresholds once instead of re-splitting / re-converting them
    # for every repeat.
    mode_fields = mode.split('_')
    thresholds = mode_fields[-1].split(',')
    len_limit = int(thresholds[0])
    support_limit = float(thresholds[1]) * len(texts)

    rstr = Rstr_max()
    X = []
    for text in texts:
        rstr.add_str(text)
        X.append({})
    r = rstr.go()

    training = desc_arg is None
    column_of = None
    if not training:
        # Map each known descriptor to its column (first occurrence wins, as
        # list.index would). This keeps test columns aligned with the columns
        # used at training time instead of numbering them in the order the
        # repeats happen to be found in the test corpus.
        column_of = {}
        for column, descriptor in enumerate(desc_arg):
            column_of.setdefault(descriptor, column)

    desc = []
    for (offset_end, nb), (length, start_plage) in r.items():
        ss = rstr.global_suffix[offset_end - length:offset_end]

        # Not every training descriptor shows up in a test corpus, and test
        # repeats unknown at training time carry no column: skip them.
        if not training and ss not in column_of:
            continue

        # One text id per occurrence of the repeat (ids may repeat).
        list_occur = [
            rstr.idxString[rstr.res[o]]
            for o in range(start_plage, start_plage + nb)
        ]
        support = len(set(list_occur))
        if support <= 1:
            continue
        if len(ss) >= len_limit or support >= support_limit:
            continue

        column = len(desc) if training else column_of[ss]
        for id_text in list_occur:
            row = X[id_text]
            row[column] = row.get(column, 0) + 1
        if training:
            # Training corpus: register the new descriptor.
            desc.append(ss)

    # Virtual instance touching every descriptor column, so that train and
    # test matrices end up with the same number of columns.
    descriptors = desc if training else desc_arg
    X.append(dict.fromkeys(range(len(descriptors)), 1))

    if mode_fields[1] == 'rel':
        X = relative_transformation(X)
    return desc, X
def setUp(self):
    """Index every string returned by ``self.getString()`` in a new ``Rstr_max``."""
    self.list_s = self.getString()
    index = Rstr_max()
    self.rstr = index
    for text in self.list_s:
        index.add_str(text)
class Test_rstrmax:
    """Sanity checks on the repeats reported by ``Rstr_max.go()``.

    NOTE(review): this class defines neither ``getString`` nor the
    ``assert*`` helpers it calls; presumably it is combined with a
    ``unittest.TestCase`` subclass that supplies them -- confirm.

    The code is written to run unchanged on Python 2 and Python 3
    (``dict.items``, ``range`` and single-argument ``print(...)`` calls).
    """

    def setUp(self):
        """Create a fresh ``Rstr_max`` and feed it every string to analyse."""
        self.list_s = self.getString()
        self.rstr = Rstr_max()
        for s in self.list_s:
            self.rstr.add_str(s)

    def _print_contexts(self, contexts, ss):
        """Print the single context found around repeat ``ss`` before failing."""
        print('')
        print('*' * 10)
        print(contexts)
        # Encoded explicitly, as the original Python 2 code did.
        print(ss.encode('utf-8'))
        print('*' * 10)
        print('')

    def test_rstr_max(self):
        """Each repeat must be findable at least ``nb`` times in the global text."""
        r = self.rstr.go()
        s = self.rstr.global_suffix
        for (offset_end, nb), (l, start_plage) in r.items():
            ss = s[offset_end - l:offset_end]
            idx = 0
            for _ in range(nb):
                # str.index raises ValueError when an occurrence is missing;
                # restarting one past the last hit allows overlapping matches.
                idx = s.index(ss, idx) + 1

    def test_maximality(self):
        """A repeat must have more than one distinct left and right context."""
        rstr = self.rstr
        for (offset_end, nb), (l, start_plage) in rstr.go().items():
            set_left, set_right = set(), set()
            for o in range(start_plage, start_plage + nb):
                offset_global = rstr.res[o]
                pos = rstr.idxPos[offset_global]
                id_str = rstr.idxString[offset_global]
                text = rstr.array_str[id_str]
                # String boundaries are tagged with the string id, so the
                # start/end of two different strings count as distinct.
                if pos == 0:
                    char_left = "START_STR%i" % id_str
                else:
                    char_left = text[pos - 1]
                set_left.add(char_left)
                if pos + l == len(text):
                    char_right = "END_STR%i" % id_str
                else:
                    char_right = text[pos + l]
                set_right.add(char_right)
            self.assertNotEqual(len(set_left), 1)
            self.assertNotEqual(len(set_right), 1)

    # NOTE(review): the ``utest_`` prefix presumably keeps the two methods
    # below out of automatic test discovery -- confirm before renaming.
    def utest_left_maximality(self):
        """Check that occurrences of a repeat differ in (left char, string id)."""
        rstr = self.rstr
        for (offset_end, nb), (l, start_plage) in rstr.go().items():
            ss = rstr.global_suffix[offset_end - l:offset_end]
            set_left_char = set()
            for o in range(start_plage, start_plage + nb):
                offset_global = rstr.res[o]
                pos = rstr.idxPos[offset_global]
                id_str = rstr.idxString[offset_global]
                if pos == 0:
                    char_left = "START_STR"
                else:
                    char_left = rstr.array_str[id_str][pos - 1]
                set_left_char.add((char_left, id_str))
            if len(set_left_char) == 1:
                self._print_contexts(set_left_char, ss)
            self.assertNotEqual(len(set_left_char), 1)

    def utest_right_maximality(self):
        """Check that occurrences of a repeat differ in (right char, string id)."""
        rstr = self.rstr
        for (offset_end, nb), (l, start_plage) in rstr.go().items():
            ss = rstr.global_suffix[offset_end - l:offset_end]
            set_right_char = set()
            for o in range(start_plage, start_plage + nb):
                offset_global = rstr.res[o]
                pos = rstr.idxPos[offset_global]
                id_str = rstr.idxString[offset_global]
                text = rstr.array_str[id_str]
                if pos + l == len(text):
                    char_right = "END_STR"
                else:
                    char_right = text[pos + l]
                set_right_char.add((char_right, id_str))
            if len(set_right_char) == 1:
                self._print_contexts(set_right_char, ss)
            self.assertNotEqual(len(set_right_char), 1)
def setUp(self):
    """Load the input strings and register each one in a new ``Rstr_max``."""
    strings = self.list_s = self.getString()
    self.rstr = Rstr_max()
    register = self.rstr.add_str
    for item in strings:
        register(item)
class Test_rstrmax:
    """Consistency tests for the repeats returned by ``Rstr_max.go()``.

    NOTE(review): ``getString`` and the ``assert*`` methods are not defined
    here; presumably a ``unittest.TestCase`` subclass mixes them in --
    confirm.

    Only constructs valid on both Python 2 and Python 3 are used
    (``dict.items``, ``range``, single-argument ``print(...)``).
    """

    def setUp(self):
        """Build a new ``Rstr_max`` holding every string from ``getString()``."""
        self.list_s = self.getString()
        self.rstr = Rstr_max()
        for s in self.list_s:
            self.rstr.add_str(s)

    def test_rstr_max(self):
        """Every reported repeat occurs at least ``nb`` times in the global text."""
        r = self.rstr.go()
        s = self.rstr.global_suffix
        for (offset_end, nb), (l, start_plage) in r.items():
            ss = s[offset_end - l:offset_end]
            idx = 0
            for _ in range(nb):
                # A missing occurrence surfaces as ValueError from str.index.
                idx = s.index(ss, idx) + 1

    def test_maximality(self):
        """Left and right contexts of a repeat must not all be identical."""
        r = self.rstr.go()
        for (offset_end, nb), (l, start_plage) in r.items():
            set_left, set_right = set(), set()
            for o in range(start_plage, start_plage + nb):
                offset_global = self.rstr.res[o]
                pos = self.rstr.idxPos[offset_global]
                id_str = self.rstr.idxString[offset_global]
                text = self.rstr.array_str[id_str]
                # Boundary markers carry the string id so that the edges of
                # different strings are treated as different contexts.
                set_left.add(
                    "START_STR%i" % id_str if pos == 0 else text[pos - 1])
                set_right.add(
                    "END_STR%i" % id_str if pos + l == len(text)
                    else text[pos + l])
            self.assertNotEqual(len(set_left), 1)
            self.assertNotEqual(len(set_right), 1)

    # NOTE(review): the ``utest_`` prefix presumably disables discovery of
    # the two methods below -- confirm.
    def utest_left_maximality(self):
        """Occurrences of a repeat must differ in (left char, string id)."""
        r = self.rstr.go()
        for (offset_end, nb), (l, start_plage) in r.items():
            ss = self.rstr.global_suffix[offset_end - l:offset_end]
            set_left_char = set()
            for o in range(start_plage, start_plage + nb):
                offset_global = self.rstr.res[o]
                pos = self.rstr.idxPos[offset_global]
                id_str = self.rstr.idxString[offset_global]
                if pos == 0:
                    char_left = "START_STR"
                else:
                    char_left = self.rstr.array_str[id_str][pos - 1]
                set_left_char.add((char_left, id_str))
            if len(set_left_char) == 1:
                # Dump the offending repeat before the assertion fails.
                print('')
                print('*' * 10)
                print(set_left_char)
                print(ss.encode('utf-8'))
                print('*' * 10)
                print('')
            self.assertNotEqual(len(set_left_char), 1)

    def utest_right_maximality(self):
        """Occurrences of a repeat must differ in (right char, string id)."""
        r = self.rstr.go()
        for (offset_end, nb), (l, start_plage) in r.items():
            ss = self.rstr.global_suffix[offset_end - l:offset_end]
            set_right_char = set()
            for o in range(start_plage, start_plage + nb):
                offset_global = self.rstr.res[o]
                pos = self.rstr.idxPos[offset_global]
                id_str = self.rstr.idxString[offset_global]
                text = self.rstr.array_str[id_str]
                if pos + l == len(text):
                    char_right = "END_STR"
                else:
                    char_right = text[pos + l]
                set_right_char.add((char_right, id_str))
            if len(set_right_char) == 1:
                # Dump the offending repeat before the assertion fails.
                print('')
                print('*' * 10)
                print(set_right_char)
                print(ss.encode('utf-8'))
                print('*' * 10)
                print('')
            self.assertNotEqual(len(set_right_char), 1)
def setUp(self):
    """Register the single string from ``self.getString()`` in a new ``Rstr_max``."""
    text = self.getString()
    self.s = text
    index = Rstr_max()
    self.rstr = index
    index.add_str(text)
class Test_rstrmax:
    """Checks on the repeats reported by ``Rstr_max.go()`` for one string.

    This variant reads results keyed by ``((idStr, end), nb)`` and suffixes
    from ``array_suffix``.

    NOTE(review): ``getString`` and the ``assert*`` helpers are not defined
    here; presumably a ``unittest.TestCase`` subclass provides them --
    confirm.

    Written with constructs valid on both Python 2 and Python 3.
    """

    def setUp(self):
        """Create a fresh ``Rstr_max`` containing the string under test."""
        self.s = self.getString()
        self.rstr = Rstr_max()
        self.rstr.add_str(self.s)

    def _print_contexts(self, contexts, ss):
        """Print the single context found around repeat ``ss`` before failing."""
        print('')
        print('*' * 10)
        print(contexts)
        # Encoded explicitly, as the original Python 2 code did.
        print(ss.encode('utf-8'))
        print('*' * 10)
        print('')

    def test_rstr_max(self):
        """Each repeat occurs exactly ``nb`` times in its source string."""
        r = self.rstr.go()
        for ((idStr, end), nb), (l, start_plage) in r.items():
            s = self.rstr.array_str[idStr]
            ss = s[end - l:end]
            idx = 0
            for _ in range(nb):
                # Restarting one past the last hit allows overlapping matches.
                idx = s.index(ss, idx) + 1
            # After nb hits there must be no further occurrence.
            self.assertRaises(ValueError, s.index, ss, idx)

    def test_left_maximality(self):
        """Occurrences of a repeat must not all share the same left character."""
        r = self.rstr.go()
        for ((idStr, end), nb), (l, start_plage) in r.items():
            ss = self.rstr.array_str[idStr][end - l:end]
            set_left_char = set()
            for o in range(start_plage, start_plage + nb):
                su = self.rstr.array_suffix[o]
                if su[0] == 0:
                    char_left = "START_STR"
                else:
                    char_left = self.rstr.array_str[su[1]][su[0] - 1]
                set_left_char.add(char_left)
            if len(set_left_char) == 1:
                self._print_contexts(set_left_char, ss)
            self.assertNotEqual(len(set_left_char), 1)

    def test_right_maximality(self):
        """Occurrences of a repeat must not all share the same right character."""
        r = self.rstr.go()
        for ((idStr, end), nb), (l, start_plage) in r.items():
            ss = self.rstr.array_str[idStr][end - l:end]
            set_right_char = set()
            for o in range(start_plage, start_plage + nb):
                su = self.rstr.array_suffix[o]
                text = self.rstr.array_str[su[1]]
                if su[0] + l == len(text):
                    char_right = "END_STR"
                else:
                    char_right = text[su[0] + l]
                set_right_char.add(char_right)
            if len(set_right_char) == 1:
                self._print_contexts(set_right_char, ss)
            self.assertNotEqual(len(set_right_char), 1)
class Test_rstrmax:
    """Consistency tests for ``Rstr_max.go()`` run on a single string.

    Results are read as ``((idStr, end), nb) -> (l, start_plage)`` and
    suffixes are looked up in ``array_suffix``.

    NOTE(review): ``getString`` and the ``assert*`` methods are not defined
    in this class; presumably it is mixed into a ``unittest.TestCase``
    subclass -- confirm.

    Only constructs valid on both Python 2 and Python 3 are used.
    """

    def setUp(self):
        """Build a new ``Rstr_max`` holding the string from ``getString()``."""
        self.s = self.getString()
        self.rstr = Rstr_max()
        self.rstr.add_str(self.s)

    def test_rstr_max(self):
        """A repeat is found ``nb`` times in its string, and not once more."""
        r = self.rstr.go()
        for ((idStr, end), nb), (l, start_plage) in r.items():
            s = self.rstr.array_str[idStr]
            ss = s[end - l:end]
            idx = 0
            for _ in range(nb):
                # A missing occurrence surfaces as ValueError from str.index.
                idx = s.index(ss, idx) + 1
            # Searching past the nb-th hit must fail.
            self.assertRaises(ValueError, s.index, ss, idx)

    def test_left_maximality(self):
        """The characters preceding a repeat must not all be the same."""
        r = self.rstr.go()
        for ((idStr, end), nb), (l, start_plage) in r.items():
            ss = self.rstr.array_str[idStr][end - l:end]
            set_left_char = set()
            for o in range(start_plage, start_plage + nb):
                su = self.rstr.array_suffix[o]
                if su[0] == 0:
                    char_left = "START_STR"
                else:
                    char_left = self.rstr.array_str[su[1]][su[0] - 1]
                set_left_char.add(char_left)
            if len(set_left_char) == 1:
                # Dump the offending repeat before the assertion fails.
                print('')
                print('*' * 10)
                print(set_left_char)
                print(ss.encode('utf-8'))
                print('*' * 10)
                print('')
            self.assertNotEqual(len(set_left_char), 1)

    def test_right_maximality(self):
        """The characters following a repeat must not all be the same."""
        r = self.rstr.go()
        for ((idStr, end), nb), (l, start_plage) in r.items():
            ss = self.rstr.array_str[idStr][end - l:end]
            set_right_char = set()
            for o in range(start_plage, start_plage + nb):
                su = self.rstr.array_suffix[o]
                text = self.rstr.array_str[su[1]]
                if su[0] + l == len(text):
                    char_right = "END_STR"
                else:
                    char_right = text[su[0] + l]
                set_right_char.add(char_right)
            if len(set_right_char) == 1:
                # Dump the offending repeat before the assertion fails.
                print('')
                print('*' * 10)
                print(set_right_char)
                print(ss.encode('utf-8'))
                print('*' * 10)
                print('')
            self.assertNotEqual(len(set_right_char), 1)