Exemplo n.º 1
0
 def test_tag(self):
     """Check the (word, tag) pairs returned by it.tag() for a short phrase."""
     expected = [("il", "DT"), ("gatto", "NN"), ("nero", "JJ")]
     tagged = it.tag("il gatto nero")
     self.assertEqual(tagged, expected)
     # Progress marker printed once the assertion has passed.
     print("pattern.it.tag()")
Exemplo n.º 2
0
def nonWordSxDx(tokenEstratti):


	parola = re.compile('[\wàèìòùé+]')
	c = 0
	assente = []
	scelte = []
	appoggio = []
	pronominale = False
	split = False

	for tok in tokenEstratti:

		posTok = tag(tok.encode('utf-8'))[0][1]
			
		if tok.encode('utf-8').lower() in NWORDS:

			c = c

		if tok.encode('utf-8').lower() not in NWORDS and (tag(tok.encode('utf-8').lower()[len(tok)-2:])[0][1] == 'PRP' or tag(tok.encode('utf-8').lower()[len(tok)-2:])[0][1] == 'DT') and tok.encode('utf-8').islower() and (NWORDS[tok.encode('utf-8').lower()[:len(tok)-2]] > 0 or NWORDS[str(tok.encode('utf-8').lower()[:len(tok)-2])+'e'] > 0):

			c = c
			pronominale = True
			print tok


		if tok.encode('utf-8').lower() not in NWORDS and pronominale == False:

			x = 1
			y = 1
			result = ''

			for canc in tok:

				if y == len(tok.encode('utf-8')):break
				b = tok.encode('utf-8')[:x]
				e = tok.encode('utf-8')[y:]
				big = (b,e)
				if BIG[big]>20:
					result = big[0].decode('utf-8')+' '+big[1].decode('utf-8')
					print "SPLITZERO:", result, BIG[big], tokenEstratti[c]
					tokenEstratti[c]= result
					split = True

				x = x + 1
				y = y + 1


		if tok.encode('utf-8').lower() not in NWORDS and parola.match(tok.encode('utf-8')) and posTok != 'NNP' and tok.islower() and pronominale == False and split == False and tok.encode('utf-8').capitalize() not in NWORDS:

			assente.append(tok.encode('utf-8').lower())
			scelte = filtraCandidati(assente)
			
		if scelte != [] and pronominale == False and split == False:

			for i in scelte:

				probSx = 0
				probDx = 0

				if tokenEstratti[c-1].lower() in NWORDS and parola.match(tokenEstratti[c-1]):

					bigra = (tokenEstratti[c-1].lower(), i)
					probSx = BIG[bigra]
						

				if tokenEstratti[c+1].lower() in NWORDS and parola.match(tokenEstratti[c+1]):

					bigra = (i, tokenEstratti[c+1].lower())
					probDx = BIG[bigra]
						
				if probSx > 0 and probDx > 0:

					if LD(i, tok.encode('utf-8').lower()) < 2:
							
						appoggio.append((i.decode('utf-8').encode('utf-8'), LD(i, tok.encode('utf-8').lower()),probSx*probDx))
								
		if appoggio != []:
			appoggio = sorted(appoggio, key=lambda x:(-x[1], x[2]), reverse=True)
			print "NON WORD SX DX:", appoggio, tok
			tokenEstratti[c] = appoggio[0][0].decode('utf-8')
		assente = []
		scelte = []
		appoggio = []
		c = c + 1
		pronominale = False
		split = False

	return tokenEstratti
Exemplo n.º 3
0
 def test_tag(self):
     """Check the (word, tag) pairs returned by it.tag() for a short phrase."""
     # Assert [("il", "DT"), ("gatto", "NN"), ("nero", "JJ")].
     v = it.tag("il gatto nero")
     self.assertEqual(v, [("il", "DT"), ("gatto", "NN"), ("nero", "JJ")])
     # Parenthesised single-argument form: prints the same text under
     # Python 2 and is valid syntax under Python 3.
     print("pattern.it.tag()")
Exemplo n.º 4
0
def nonWordZero(tokenEstratti):


	parola = re.compile('[\wàèìòùé+]')
	c = 0
	assente = []
	scelte = []
	appoggio = []
	spazio = ' '
	split = False
	pronominale = False

	for tok in tokenEstratti:

		posTok = tag(tok.encode('utf-8'))[0][1]
			
		if tok.encode('utf-8').lower() in NWORDS:

			c = c

		if tok.encode('utf-8').lower() not in NWORDS and (tag(tok.encode('utf-8').lower()[len(tok)-2:])[0][1] == 'PRP' or tag(tok.encode('utf-8').lower()[len(tok)-2:])[0][1] == 'DT') and tok.encode('utf-8').islower() and (NWORDS[tok.encode('utf-8').lower()[:len(tok)-2]] > 0 or NWORDS[str(tok.encode('utf-8').lower()[:len(tok)-2])+'e'] > 0):
			
			c = c
			pronominale = True


		if tok.encode('utf-8').lower() not in NWORDS and pronominale == False and spazio not in tok:

			x = 1
			y = x + 1
			result = ''

			for canc in tok:

				if y == len(tok):break
				b = tok[:x]
				e = tok[y:]
				big = (b.encode('utf-8'),e.encode('utf-8'))
				if BIG[big]>20:
					result = big[0].decode('utf-8')+' '+big[1].decode('utf-8')
					print "SPLIT UNO:", result, BIG[big], tokenEstratti[c]
					tokenEstratti[c]= result
					split = True

				x = x + 1
				y = y + 1

		if tok.encode('utf-8').lower() not in NWORDS and parola.match(tok.encode('utf-8')) and posTok != 'NNP' and tok.islower() and pronominale == False and spazio not in tok and tok.encode('utf-8').capitalize() not in NWORDS and split == False:

			assente.append(tok.encode('utf-8').lower())
			scelte = filtraCandidati(assente)
			
		if scelte != [] and pronominale == False and split == False:

			for i in scelte:

				probSx = 0
				probDx = 0
				probTot = 0
				sim = int(difflib.SequenceMatcher(None, tok.encode('utf-8'), i).ratio()*100)

				if tokenEstratti[c-1].lower() in NWORDS and parola.match(tokenEstratti[c-1]):

					bigra = (tokenEstratti[c-1].lower(), i)
					probSx = BIG[bigra]
						

				if tokenEstratti[c+1].lower() in NWORDS and parola.match(tokenEstratti[c+1]):

					bigra = (i, tokenEstratti[c+1].lower())
					probDx = BIG[bigra]

				if probSx > 0 and probDx > 0:

					if LD(i, tok.encode('utf-8').lower()) <= 2:
							
						appoggio.append((i.decode('utf-8').encode('utf-8'), LD(i, tok.encode('utf-8').lower()),probSx+probDx, "sxdx"))
						
				if probSx > 0 and probDx == 0:

					if LD(i, tok.encode('utf-8').lower()) <= 2:
							
						appoggio.append((i.decode('utf-8').encode('utf-8'), LD(i, tok.encode('utf-8').lower()),probSx, "sx"))
						
				if probDx > 0 and probSx == 0:

					if LD(i, tok.encode('utf-8').lower()) <= 2:
							
						appoggio.append((i.decode('utf-8').encode('utf-8'), LD(i, tok.encode('utf-8').lower()),probDx, "dx"))
				
				if probDx == 0 and probSx == 0 and NWORDS[i] > 0:

					if LD(i, tok.encode('utf-8').lower()) <= 2 and sim > 60:
							
						appoggio.append((i.decode('utf-8').encode('utf-8'), LD(i, tok.encode('utf-8').lower()),NWORDS[i], "zero"))

		if appoggio != []:
			appoggio = sorted(appoggio, key=lambda x:(-x[1], x[2]), reverse=True)
			print "NON WORD ZERO:", appoggio, tok
			tokenEstratti[c] = appoggio[0][0].decode('utf-8')
		assente = []
		scelte = []
		appoggio = []
		c = c + 1
		pronominale = False
		split = False

	
	return tokenEstratti