def test_tag(self):
    """Check the (word, tag) pairs that it.tag() returns for "il gatto nero"."""
    expected = [("il", "DT"), ("gatto", "NN"), ("nero", "JJ")]
    tagged = it.tag("il gatto nero")
    self.assertEqual(tagged, expected)
    # Progress marker printed once the assertion has passed.
    print("pattern.it.tag()")
def nonWordSxDx(tokenEstratti):
    """Correct unknown tokens that are supported by BOTH neighbouring words.

    The list is modified in place.  A token is "unknown" when its lower-cased
    UTF-8 form is not a key of ``NWORDS``.  For each unknown token the
    following steps are tried in order, and the first one that applies ends
    the processing of that token:

    1. Pronominal-suffix check: if the trailing two-position slice is tagged
       ``PRP`` or ``DT``, the token is lower-case, and the remaining stem (or
       stem + ``'e'``) has a positive ``NWORDS`` count, the token is printed
       and left unchanged.
    2. Split: every cut point of the token is looked up in ``BIG`` as a
       (left, right) pair; a count above 20 replaces the token with the two
       halves joined by a space.  If several cuts qualify, the last one wins.
    3. Substitution: candidates returned by ``filtraCandidati`` are kept when
       ``LD(candidate, token) < 2`` and ``BIG`` has a positive count for both
       the (previous token, candidate) and the (candidate, next token) pair.
       The token is replaced by the candidate with the smallest ``LD`` value
       and, among those, the largest product of the two bigram counts.

    The first and the last token have only one neighbour, so they can never
    be substituted by step 3.

    Relies on names defined elsewhere in the module: ``re``, ``tag``,
    ``NWORDS``, ``BIG``, ``LD`` and ``filtraCandidati``.
    NOTE(review): ``NWORDS`` and ``BIG`` are indexed without a membership
    test, so they presumably tolerate missing keys (Counter/defaultdict-like)
    -- confirm.
    NOTE(review): the code matches a text pattern against encoded byte
    strings and concatenates them with ``'e'``, so it assumes Python 2 string
    semantics.

    Args:
        tokenEstratti: list of tokens; each token must support
            ``.encode('utf-8')`` and ``.lower()``.

    Returns:
        The same list object, with corrected tokens written back in place.
    """
    # match() only looks at the start of the string, so this accepts tokens
    # that begin with one of the characters in the class.
    parola = re.compile(r'[\wàèìòùé+]')
    last = len(tokenEstratti) - 1

    for c, tok in enumerate(tokenEstratti):
        raw = tok.encode('utf-8')
        lowered = raw.lower()
        if lowered in NWORDS:
            continue  # known word: nothing to correct

        # --- Step 1: pronominal suffix ---------------------------------
        # NOTE(review): `cut` is derived from len(tok) but applied to the
        # encoded string; the two lengths differ for non-ASCII tokens.
        cut = len(tok) - 2
        suffix_tag = tag(lowered[cut:])[0][1]
        if (suffix_tag in ('PRP', 'DT') and raw.islower()
                and (NWORDS[lowered[:cut]] > 0
                     or NWORDS[lowered[:cut] + 'e'] > 0)):
            print(tok)
            continue

        # --- Step 2: split a run-together pair -------------------------
        split = False
        for x, _ in enumerate(tok, 1):
            if x == len(raw):
                break
            big = (raw[:x], raw[x:])
            if BIG[big] > 20:
                result = big[0].decode('utf-8') + ' ' + big[1].decode('utf-8')
                print("SPLITZERO: %s %s %s"
                      % (result, BIG[big], tokenEstratti[c]))
                tokenEstratti[c] = result
                split = True
        if split:
            continue

        # --- Step 3: substitution supported by both neighbours ---------
        # The POS tag is only needed here, so it is computed lazily.
        if not (parola.match(raw) and tag(raw)[0][1] != 'NNP'
                and tok.islower() and raw.capitalize() not in NWORDS):
            continue
        scelte = filtraCandidati([lowered])
        if not scelte:
            continue

        # Index 0 has no left neighbour (a bare c - 1 would wrap around to
        # the last token) and the last index has no right neighbour (a bare
        # c + 1 would raise IndexError).
        left = tokenEstratti[c - 1] if c > 0 else None
        right = tokenEstratti[c + 1] if c < last else None
        left_ok = bool(left is not None and left.lower() in NWORDS
                       and parola.match(left))
        right_ok = bool(right is not None and right.lower() in NWORDS
                        and parola.match(right))

        appoggio = []
        for i in scelte:
            probSx = BIG[(left.lower(), i)] if left_ok else 0
            probDx = BIG[(i, right.lower())] if right_ok else 0
            if probSx > 0 and probDx > 0:
                dist = LD(i, lowered)
                if dist < 2:
                    appoggio.append((i.decode('utf-8').encode('utf-8'),
                                     dist, probSx * probDx))

        if appoggio:
            # Descending on (-LD, score): smallest LD first, then the
            # highest bigram score.
            appoggio.sort(key=lambda cand: (-cand[1], cand[2]), reverse=True)
            print("NON WORD SX DX: %s %s" % (appoggio, tok))
            tokenEstratti[c] = appoggio[0][0].decode('utf-8')

    return tokenEstratti
def test_tag(self):
    """Check that it.tag() tags "il gatto nero" as DT / NN / JJ."""
    # Assert [("il", "DT"), ("gatto", "NN"), ("nero", "JJ")].
    v = it.tag("il gatto nero")
    self.assertEqual(v, [("il", "DT"), ("gatto", "NN"), ("nero", "JJ")])
    # Call form of print: same output as the print statement on Python 2,
    # and still valid syntax on Python 3.
    print("pattern.it.tag()")
def nonWordZero(tokenEstratti):
    """Correct unknown tokens using whatever neighbour context is available.

    The list is modified in place.  A token is "unknown" when its lower-cased
    UTF-8 form is not a key of ``NWORDS``.  Unknown tokens go through these
    steps in order; the first one that applies ends the processing of that
    token:

    1. Pronominal-suffix check: if the trailing two-position slice is tagged
       ``PRP`` or ``DT``, the token is lower-case, and the remaining stem (or
       stem + ``'e'``) has a positive ``NWORDS`` count, the token is left
       unchanged.
    2. Tokens that contain a space (such as the output of a split step) are
       left unchanged.
    3. Split with one character dropped: for each position ``x`` the pair
       ``(tok[:x], tok[x + 1:])`` is looked up in ``BIG``; a count above 20
       replaces the token with the two parts joined by a space.  If several
       positions qualify, the last one wins.
    4. Substitution: each candidate from ``filtraCandidati`` with
       ``LD(candidate, token) <= 2`` is scored and labelled by the context
       that supports it:

       * ``"sxdx"`` -- both bigrams known, score = left count + right count
       * ``"sx"``   -- only the left bigram known, score = left count
       * ``"dx"``   -- only the right bigram known, score = right count
       * ``"zero"`` -- no bigram known; requires ``NWORDS[candidate] > 0``
         and a ``difflib`` similarity ratio above 60%, score = that count

       The token is replaced by the candidate with the smallest ``LD`` value
       and, among those, the largest score.

    Relies on names defined elsewhere in the module: ``re``, ``difflib``,
    ``tag``, ``NWORDS``, ``BIG``, ``LD`` and ``filtraCandidati``.
    NOTE(review): ``NWORDS`` and ``BIG`` are indexed without a membership
    test, so they presumably tolerate missing keys (Counter/defaultdict-like)
    -- confirm.
    NOTE(review): the code matches a text pattern against encoded byte
    strings and concatenates them with ``'e'``, so it assumes Python 2 string
    semantics.

    Args:
        tokenEstratti: list of tokens; each token must support
            ``.encode('utf-8')`` and ``.lower()``.

    Returns:
        The same list object, with corrected tokens written back in place.
    """
    # match() only looks at the start of the string, so this accepts tokens
    # that begin with one of the characters in the class.
    parola = re.compile(r'[\wàèìòùé+]')
    last = len(tokenEstratti) - 1

    for c, tok in enumerate(tokenEstratti):
        raw = tok.encode('utf-8')
        lowered = raw.lower()
        if lowered in NWORDS:
            continue  # known word: nothing to correct

        # --- Step 1: pronominal suffix ---------------------------------
        # NOTE(review): `cut` is derived from len(tok) but applied to the
        # encoded string; the two lengths differ for non-ASCII tokens.
        cut = len(tok) - 2
        suffix_tag = tag(lowered[cut:])[0][1]
        if (suffix_tag in ('PRP', 'DT') and raw.islower()
                and (NWORDS[lowered[:cut]] > 0
                     or NWORDS[lowered[:cut] + 'e'] > 0)):
            continue

        # --- Step 2: multi-word tokens are not touched -----------------
        if ' ' in tok:
            continue

        # --- Step 3: split, dropping the character at position x -------
        split = False
        for x, _ in enumerate(tok, 1):
            y = x + 1
            if y == len(tok):
                break
            big = (tok[:x].encode('utf-8'), tok[y:].encode('utf-8'))
            if BIG[big] > 20:
                result = big[0].decode('utf-8') + ' ' + big[1].decode('utf-8')
                print("SPLIT UNO: %s %s %s"
                      % (result, BIG[big], tokenEstratti[c]))
                tokenEstratti[c] = result
                split = True
        if split:
            continue

        # --- Step 4: substitution --------------------------------------
        # The POS tag is only needed here, so it is computed lazily.
        if not (parola.match(raw) and tag(raw)[0][1] != 'NNP'
                and tok.islower() and raw.capitalize() not in NWORDS):
            continue
        scelte = filtraCandidati([lowered])
        if not scelte:
            continue

        # Index 0 has no left neighbour (a bare c - 1 would wrap around to
        # the last token) and the last index has no right neighbour (a bare
        # c + 1 would raise IndexError).
        left = tokenEstratti[c - 1] if c > 0 else None
        right = tokenEstratti[c + 1] if c < last else None
        left_ok = bool(left is not None and left.lower() in NWORDS
                       and parola.match(left))
        right_ok = bool(right is not None and right.lower() in NWORDS
                        and parola.match(right))

        appoggio = []
        for i in scelte:
            probSx = BIG[(left.lower(), i)] if left_ok else 0
            probDx = BIG[(i, right.lower())] if right_ok else 0

            # Choose the score and its provenance label from the context
            # that is available; the four cases are mutually exclusive.
            if probSx > 0 and probDx > 0:
                score, source = probSx + probDx, "sxdx"
            elif probSx > 0 and probDx == 0:
                score, source = probSx, "sx"
            elif probDx > 0 and probSx == 0:
                score, source = probDx, "dx"
            elif probDx == 0 and probSx == 0 and NWORDS[i] > 0:
                # Without bigram evidence, additionally require surface
                # similarity before falling back to the unigram count.
                sim = int(difflib.SequenceMatcher(None, raw, i).ratio() * 100)
                if sim <= 60:
                    continue
                score, source = NWORDS[i], "zero"
            else:
                continue

            dist = LD(i, lowered)
            if dist <= 2:
                appoggio.append((i.decode('utf-8').encode('utf-8'),
                                 dist, score, source))

        if appoggio:
            # Descending on (-LD, score): smallest LD first, then the
            # highest score.
            appoggio.sort(key=lambda cand: (-cand[1], cand[2]), reverse=True)
            print("NON WORD ZERO: %s %s" % (appoggio, tok))
            tokenEstratti[c] = appoggio[0][0].decode('utf-8')

    return tokenEstratti