def setter(a):
    """Return True when most characters of *a* are neither CJK nor letters.

    A character counts when its ``uc.charname`` contains neither "CJK" nor
    "LETTER". The result is True when the share of such characters
    exceeds 0.8.

    The counter is initialised once, before the loop. Previously it was
    re-assigned on every iteration, so only one character influenced the
    result and an empty *a* left the names unbound.

    Args:
        a: A sized iterable of single characters (typically a ``str``).

    Returns:
        bool: True when the smoothed ratio is above 0.8; False for empty
        input.
    """
    # Small smoothing term: keeps the denominator non-zero for empty input.
    epsilon = 0.001
    hits = epsilon
    total = len(a) + epsilon * 2
    for ch in a:
        name = uc.charname(ch)
        if "CJK" not in name and "LETTER" not in name:
            hits += 1
    return (hits / total) > 0.8
def alt(a):
    """Tell whether *a* holds both a "LEFT"-named and a "RIGHT"-named character.

    Each character's name is looked up with ``uc.charname``. A name that
    contains "LEFT" is counted as left only, even if it also contains
    "RIGHT"; "RIGHT" is checked only for the remaining names.

    Returns:
        bool: True when at least one character of each kind was seen.
    """
    saw_left = False
    saw_right = False
    for ch in a:
        name = uc.charname(ch)
        if "LEFT" in name:
            saw_left = True
            continue
        if "RIGHT" in name:
            saw_right = True
    return saw_left and saw_right
def revEnv(a):
    """Strip trailing characters whose name does not contain "LATIN".

    Scans *a* from its end and drops characters until one whose
    ``uc.charname`` contains "LATIN" is found; everything up to and
    including that character is returned.

    The previous implementation popped from the list while advancing the
    index, which skipped every other character (e.g. two trailing
    non-Latin characters left one behind) and ran off the end of the list
    when no Latin character existed.

    Args:
        a: A string or other iterable of single characters.

    Returns:
        str | None: The trimmed text. An empty input yields "". ``None``
        is returned when a non-empty input contains no "LATIN" character
        or when the characters cannot be processed (best effort, as
        before).
    """
    chars = list(a)
    try:
        keep = len(chars)
        # Walk backwards without mutating the list, so no element is skipped.
        while keep and "LATIN" not in uc.charname(chars[keep - 1]):
            keep -= 1
        if chars and not keep:
            # Nothing Latin at all: keep the historical "no result" signal.
            return None
        return "".join(chars[:keep])
    except Exception:
        # Best effort: unusable input produces no result instead of an error.
        return None
def pairFinder(a, b):
    """Tell whether the names of *a* and *b* differ in exactly one component.

    Each character name (``uc.charname``) is passed to ``grouper``; from
    its items only those with a truthy first element are kept, and their
    second element is used as the component.
    NOTE(review): ``grouper`` is defined elsewhere; it presumably yields
    (flag, value) pairs -- confirm against its definition.

    Returns:
        bool: True when both component lists have the same length and
        exactly one position differs; False otherwise.
    """

    def components(ch):
        return [item[1] for item in grouper(uc.charname(ch)) if item[0]]

    left = components(a)
    right = components(b)
    if len(left) != len(right):
        return False
    mismatches = sum(1 for p, q in zip(left, right) if p != q)
    return mismatches == 1
def getBatch(a):
    """Return the ``uc.charname`` of every item in *a*, in order, as a list."""
    names = []
    for ch in a:
        names.append(uc.charname(ch))
    return names
def probe(a):
    """Return True when every name in *a* contains "CJK" (True for empty *a*)."""
    return all("CJK" in name for name in a)


def latin(a):
    """Tell whether *a* is dominated by non-fullwidth Latin character names.

    A name counts when it contains "LATIN" and does not contain
    "FULLWIDTH". The result is True when the share of such names plus 0.1
    is greater than 0.7.

    Args:
        a: A sized collection of character-name strings.

    Returns:
        bool: The comparison result; False for an empty collection
        (previously this raised ZeroDivisionError).
    """
    total = len(a)
    if total == 0:
        return False
    hits = sum(1 for name in a if "LATIN" in name and "FULLWIDTH" not in name)
    return 0.7 < (hits / total + 0.1)


def u0(y):
    """Return the ``unicode_charnames.charname`` of every item in *y*."""
    return [unicode_charnames.charname(x) for x in y]


# Sort every item read from the inputs into one of two buckets:
# "stopword" for items whose names are all CJK or mostly Latin,
# "control" for the individual elements of everything else.
drx = {"control": [], "stopword": []}
# `fg` is defined elsewhere; index-based access is kept because its type
# is not visible from here.
for gf in range(len(fg)):
    u = getShit(fg[gf])
    for u1 in u:
        ux = u0(u1)
        if probe(ux) or latin(ux):
            drx["stopword"].append(u1)
        else:
            drx["control"].extend(u1)

# NOTE(review): `hash` receives lists here. The built-in hash() rejects
# lists, so this presumably refers to a project helper -- confirm.
drx["stopword"] = hash(drx["stopword"])
drx["control"] = hash(drx["control"] + drm)
storeAList(drx)
print(drx)
import jieba
# TODO: consider a stack-based approach here.
from getFromPickleR import returnAList
import unicode_charnames as uc

# NOTE: a disabled `wrapper(a, b)` helper used to live at the end of this
# section. It returned list(jieba.cut(a)) when b was true and a wordninja
# split of a otherwise; the wordninja import is disabled as well.

# Non-empty stop-word entries, split by whether the name of their first
# character contains "CJK".
xfz = [entry for entry in returnAList()['stopword'] if len(entry) > 0]
xf0 = [entry for entry in xfz if "CJK" in uc.charname(entry[0])]
xf1 = [entry for entry in xfz if entry not in xf0]


def getBatch(a):
    """Return the ``uc.charname`` of every item in *a*, in order, as a list."""
    names = []
    for ch in a:
        names.append(uc.charname(ch))
    return names


def checkMe(a):
    """Return True when *a* has more than 6 items and all of them are equal."""
    return len(a) > 6 and len(set(a)) == 1


def checker(a):
    """Return 1 when "DIGIT" is found in *a*, otherwise 0."""
    return int("DIGIT" in a)
def test_charname(self):
    """Check ``charname`` for one sample of each name form covered here.

    Covers U+00C9 written three ways, names that embed the code point
    (ideographs and script characters), and the angle-bracket labels for
    control, private-use, surrogate and noncharacter code points.
    """
    expected = "LATIN CAPITAL LETTER E WITH ACUTE"
    cases = [
        ("É", expected),
        ("\u00C9", expected),
        (chr(0xC9), expected),
        ("\u3400", "CJK UNIFIED IDEOGRAPH-3400"),
        ("\U0003134A", "CJK UNIFIED IDEOGRAPH-3134A"),
        ("\uF900", "CJK COMPATIBILITY IDEOGRAPH-F900"),
        ("\U00017000", "TANGUT IDEOGRAPH-17000"),
        ("\U0001B170", "NUSHU CHARACTER-1B170"),
        ("\U00018CD5", "KHITAN SMALL SCRIPT CHARACTER-18CD5"),
        ("\u0000", "<control-0000>"),
        ("\uF8FF", "<private-use-F8FF>"),
        ("\uD800", "<surrogate-D800>"),
        ("\U0010FFFF", "<noncharacter-10FFFF>"),
    ]
    # Checked in order; the first mismatch fails the test.
    for char, name in cases:
        self.assertEqual(charname(char), name)
import unicode_charnames

# Open questions:
# - Should the predefined character groups be taken into account?
# - Are any alphabets or components missing?
# - Hidden candidates may turn up anywhere.


def getShit(a):
    """Read the text file at path *a* and return its whole content."""
    with open(a, "r") as stream:
        content = stream.read()
    return content


# Print the Unicode name of every character in the sample log, one per line.
u = getShit("example\\0.log")
u0 = []
for x in u:
    u0.append(unicode_charnames.charname(x))
for u1 in u0:
    print(u1)
# TODO: it is still unclear how to approach this.