예제 #1
0
    def do_stem(self, word):
        """Return ``word`` with the suffixes accepted by the state machine cut off.

        The word is scanned from right to left.  ``word[pos:cut]`` is the
        current candidate suffix; whenever the machine (built from
        ``self.events``) can fire an event with that name, the event is
        triggered and the cut point moves left to ``pos``.  The first
        character of the word is never consumed because the scan stops at
        ``pos == 0``.

        :param word: the string to stem.
        :return: ``word[:cut]`` for the final cut point.
        """
        machine = Fysom(initial='start', events=self.events)

        cut = len(word)
        pos = cut - 1

        while pos > 0:
            suffix = word[pos:cut]

            # Not an event the machine accepts right now: grow the candidate.
            if not machine.can(suffix):
                pos -= 1
                continue

            # A lone 'i' gives way to the two-character candidate ending in
            # it when that longer candidate is accepted as well.
            if suffix == 'i' and machine.can(word[pos - 1:cut]):
                pos -= 1
                continue

            machine.trigger(suffix)

            if machine.current == 'h':
                if word[pos - 1:pos] == 'i':
                    pos -= 1  # step over the 'i'
                    if word[pos - 1:pos] == 'n':
                        # the "-ning" suffix: restart the machine and keep
                        # growing the candidate without moving the cut point
                        machine.current = 'start'
                        continue
            elif machine.current == 'b':
                machine.current = 'start'

            cut = pos
            pos -= 1

        return word[:cut]
예제 #2
0
파일: devon.py 프로젝트: MrBrownWins/devon
    def do_stem(self, word):
        """Strip trailing suffixes from ``word`` and return what is left.

        A Fysom machine configured with ``self.events`` decides which
        substrings count as suffixes: the slice ``word[start:end]`` is
        offered to the machine as an event name while ``start`` walks towards
        the beginning of the word, and every accepted slice moves ``end`` to
        ``start``.  Index 0 is never examined, so the result always keeps at
        least the first character of a non-empty word.

        :param word: the string to stem.
        :return: the prefix ``word[:end]`` left after the scan.
        """
        # FIXME: uncomment below and make sanitize functions support
        # both Python 2 and 3 versions
        # word = WordProcessor.sanitize(word)
        stemmer = Fysom(initial='start', events=self.events)
        end = len(word)
        start = end - 1

        while start > 0:
            piece = word[start:end]
            if stemmer.can(piece):
                one_longer = word[start - 1:end]
                if piece == 'i' and stemmer.can(one_longer):
                    # Prefer the longer candidate over a bare 'i'.
                    start -= 1
                    continue

                stemmer.trigger(piece)
                state = stemmer.current
                if state == 'h':
                    if word[start - 1:start] == 'i':
                        start -= 1  # skip the 'i'
                        if word[start - 1:start] == 'n':
                            # "-ning" suffix: reset and retry with the wider
                            # slice, leaving ``end`` where it was.
                            stemmer.current = 'start'
                            continue
                elif state == 'b':
                    stemmer.current = 'start'
                end = start

            start -= 1

        return word[:end]
예제 #3
0
파일: devon.py 프로젝트: anvarulugov/devon
    def do_stem(self, word):
        """Remove the suffixes recognised by the suffix state machine.

        Works backwards through ``word``: ``word[idx:boundary]`` is tried as
        an event name on a fresh Fysom machine (events taken from
        ``self.events``).  When the machine accepts it, the event fires and
        ``boundary`` drops to ``idx``; otherwise ``idx`` moves one character
        left and the wider slice is tried.  The loop ends once ``idx``
        reaches 0.

        :param word: the string to stem.
        :return: ``word[:boundary]``.
        """
        fsm = Fysom(initial='start', events=self.events)
        # FIXME: uncomment below and make sanitize functions support
        # both Python 2 and 3 versions
        # word = WordProcessor.sanitize(word)
        boundary = len(word)
        idx = boundary - 1

        while idx > 0:
            chunk = word[idx:boundary]

            if not fsm.can(chunk):
                idx -= 1
                continue

            if chunk == 'i' and fsm.can(word[idx - 1:boundary]):
                # The wider slice is accepted too; try that one instead of
                # firing on a single 'i'.
                idx -= 1
                continue

            fsm.trigger(chunk)

            if fsm.current == 'b':
                fsm.current = 'start'
            elif fsm.current == 'h' and word[idx - 1:idx] == 'i':
                idx -= 1  # skip the 'i'
                if word[idx - 1:idx] == 'n':
                    # "-ning" suffix: start over from 'start' and keep the
                    # boundary unchanged so the slice keeps widening.
                    fsm.current = 'start'
                    continue

            boundary = idx
            idx -= 1

        return word[:boundary]
예제 #4
0
파일: eng_prep.py 프로젝트: roseanil/SA
def SenToPhrase (tagged_sentence):
	"""Split a POS-tagged sentence into phrases and bucket them by rule.

	A Fysom state machine whose events are POS-tag names is run over
	``tagged_sentence`` from successive start offsets ``k``.  Whenever the
	machine is in one of ``high_final_states`` the tokens collected so far
	are appended to one of the ``phras_rules`` buckets ``R1``..``R11``
	(selected by the state number) and the machine is reset to ``'0'``.
	Tokens gathered by the RB look-ahead that reaches state ``'11'`` are
	appended to ``R12``.

	Args:
		tagged_sentence: indexable sequence of tokens.  ``token[1]`` is used
			as the event name and ``token[0][-1]`` is compared with ``","``
			-- presumably ``(word, pos_tag)`` pairs; confirm with the caller.

	Returns:
		None.  The phrase count and the ``phras_rules`` dict are appended to
		``english_sentence_structure[sentence]``.

	NOTE(review): ``english_sentence_structure`` and ``sentence`` are not
	defined inside this function; they have to exist in an enclosing scope
	when it is called.
	NOTE(review): state ``'16'`` is listed in ``high_final_states`` but has no
	``R*`` bucket below, so such a phrase is counted and then discarded.
	"""
	# Transition table: event name = POS tag, states are numbered strings.
	fsm = Fysom({'initial': '0',
	                'events': [
	                {'name': 'IN', 'src': '0', 'dst': '1'},{'name': 'NN', 'src': '1', 'dst': '3'},{'name': 'NNS', 'src': '1', 'dst': '3'},
	                {'name': 'NNP', 'src': '1', 'dst': '3'},{'name': 'NNPS', 'src': '1', 'dst': '3'},{'name': 'DT', 'src': '1', 'dst': '2'},
	                {'name': 'NN', 'src': '2', 'dst': '3'},{'name': 'NNS', 'src': '2', 'dst': '3'},{'name': 'NNP', 'src': '2', 'dst': '3'},
	                {'name': 'NNPS', 'src': '2', 'dst': '3'},{'name': 'PRP$', 'src': '1', 'dst': '4'},{'name': 'PRP$', 'src': '2', 'dst': '4'},
	                {'name': 'JJ', 'src': '1', 'dst': '5'},{'name': 'JJ', 'src': '2', 'dst': '5'},{'name': 'JJR', 'src': '1', 'dst': '6'},
	                {'name': 'JJR', 'src': '2', 'dst': '6'},{'name': 'JJS', 'src': '1', 'dst': '7'},{'name': 'JJS', 'src': '2', 'dst': '7'},
	                {'name': 'NN', 'src': '5', 'dst': '3'},{'name': 'NN', 'src': '6', 'dst': '3'},{'name': 'NN', 'src': '7', 'dst': '3'},
	                {'name': 'NNS', 'src': '5', 'dst': '3'},{'name': 'NNS', 'src': '6', 'dst': '3'},{'name': 'NNS', 'src': '7', 'dst': '3'},
	                {'name': 'NNP', 'src': '5', 'dst': '3'},{'name': 'NNP', 'src': '6', 'dst': '3'},{'name': 'NNP', 'src': '7', 'dst': '3'},
	                {'name': 'NNPS', 'src': '5', 'dst': '3'},{'name': 'NNPS', 'src': '6', 'dst': '3'},{'name': 'NNPS', 'src': '7', 'dst': '3'},
	                {'name': 'PRP', 'src': '1', 'dst': '4'},{'name': 'PRP', 'src': '2', 'dst': '4'},{'name': 'NN', 'src': '4', 'dst': '3'},
	                {'name': 'NNS', 'src': '4', 'dst': '3'},{'name': 'NNP', 'src': '4', 'dst': '3'},{'name': 'NNPS', 'src': '4', 'dst': '3'},
	                {'name': 'TO', 'src': '0', 'dst': '1'},{'name': 'NN', 'src': '3', 'dst': '4'},{'name': 'NNS', 'src': '3', 'dst': '4'},
					{'name': 'NNP', 'src': '3', 'dst': '4'},{'name': 'NNPS', 'src': '3', 'dst': '4'},
	                #######VERB################
	                {'name': 'MD', 'src': '0', 'dst': '8'},{'name': 'VB', 'src': '8', 'dst': '9'},{'name': 'VBN', 'src': '9', 'dst': '21'},
					{'name': 'VBG', 'src': '9', 'dst': '10'},{'name': 'JJ', 'src': '9', 'dst': '10'},{'name': 'RB', 'src': '9', 'dst': '11'},
					{'name': 'VBD', 'src': '0', 'dst': '12'},{'name': 'VBG', 'src': '12', 'dst': '13'},{'name': 'RB', 'src': '13', 'dst': '11'},
					{'name': 'RB', 'src': '12', 'dst': '11'},{'name': 'VBN', 'src': '12', 'dst': '14'},{'name': 'VBG', 'src': '14', 'dst': '15'},
					{'name': 'JJ', 'src': '14', 'dst': '16'},{'name': 'VBZ', 'src': '0', 'dst': '17'},{'name': 'VBP', 'src': '0', 'dst': '17'},
					{'name': 'RB', 'src': '17', 'dst': '11'},{'name': 'VBG', 'src': '17', 'dst': '18'},{'name': 'VBN', 'src': '17', 'dst': '19'},
					{'name': 'RB', 'src': '18', 'dst': '11'},{'name': 'VBG', 'src': '19', 'dst': '20'},{'name': 'RB', 'src': '20', 'dst': '11'},
					{'name': 'VBG', 'src': '21', 'dst': '22'},{'name': 'RB', 'src': '14', 'dst': '11'}
	                ]})
	# States in which the tokens collected so far are recorded as a phrase.
	high_final_states = ['3','4','9','10','13','14','15','16','18','19','20','21','22']
	# One list of phrases per rule; R1..R11 are filled by final state, R12 by the RB probe.
	phras_rules={'R1':[],'R2':[],'R3':[],'R4':[],'R5':[],'R6':[],'R7':[],'R8':[],'R9':[],'R10':[],'R11':[],'R12':[]}
	# States from which a following RB transition (into state '11') is probed.
	to_rb = ['9','12','13','14','17','18','20']
	fsm.current = '0'
	# new_temp = ""
	t=[]
	k = 0
	phrase_count=0
	# k is the start offset of the current scan; it is only advanced inside the
	# finally block of the inner loop.
	while(k<len(tagged_sentence)):
		# flag: a phrase was recorded in this pass; rbflag: the RB probe ran to completion.
		flag = 0
		rbflag = 0
		fsm.current = '0'
		# Last state reached by a successful trigger in the try block (or by the state-'19' look-ahead).
		temp_current='0'
		# Number of tokens consumed in this pass.
		count = 0
		# new_temp = ""
		# t: tokens of the phrase being built; s: tokens gathered by the RB probe.
		t=[]
		j = k
		s=[]
		for j in range(k,len(tagged_sentence)):
			# print("-----For loop j----")
			# print(tagged_sentence[j])
			try:
				fsm.trigger(tagged_sentence[j][1])
				# print("\n",fsm.current)
				temp_current=fsm.current
				# new_temp += tagged_sentence[j][0] + " "
				t.append(tagged_sentence[j])
				count += 1
			except:
				# The break only takes effect after the finally block below has run.
				break
			finally:
				# Everything below runs on every iteration, whether or not the
				# trigger above succeeded.
				# State '3': try to consume one more token, unless the current
				# word ends with a comma.
				if(fsm.current=='3' and j!=len(tagged_sentence)-1 and tagged_sentence[j][0][-1]!=","):
					try:
						fsm.trigger(tagged_sentence[j+1][1])
						# print(tagged_sentence[j+1])
						# print("\n",fsm.current)
						# new_temp += tagged_sentence[j+1][0] + " "
						t.append(tagged_sentence[j+1])
						count += 1
					except:
						# Look-ahead failed; `oops` is only ever assigned in this function.
						oops = 2
				# State '9': look ahead one token; a move into '11' (RB) is undone
				# so the machine stays in '9'.
				if(fsm.current=='9' and j!=len(tagged_sentence)-1):
					try:
						fsm.trigger(tagged_sentence[j+1][1])
						# print("\n",fsm.current)
						# print(tagged_sentence[j+1])
						if(fsm.current=='11'):
							fsm.current = '9'
						else:
						# new_temp += i[j+1][0] + " "
							t.append(tagged_sentence[j+1])
						count += 1
					except:
						oops = 3

				# State '14': same look-ahead as for '9', rewinding to '14' on RB.
				if(fsm.current=='14' and j!=len(tagged_sentence)-1):
					try:
						fsm.trigger(tagged_sentence[j+1][1])
						# print("\n",fsm.current)
						# print(tagged_sentence[j+1])
						if(fsm.current=='11'):
							fsm.current = '14'
						else:
						# new_temp += i[j+1][0] + " "
							t.append(tagged_sentence[j+1])
						count += 1
					except:
						oops = 3
				# State '19': look ahead one token and remember the resulting state.
				if(fsm.current=='19' and j!=len(tagged_sentence)-1):
					try:
						fsm.trigger(tagged_sentence[j+1][1])
						# print("\n",fsm.current)
						# print(tagged_sentence[j+1])
						# new_temp += i[j+1][0] + " "
						temp_current=fsm.current
						t.append(tagged_sentence[j+1])
						count += 1
					except:
						oops = 1
				# State '21': look ahead one token.
				if(fsm.current=='21' and j!=len(tagged_sentence)-1):
					try:
						fsm.trigger(tagged_sentence[j+1][1])
						# print("\n",fsm.current)
						# print(tagged_sentence[j+1])
						# new_temp += i[j+1][0] + " "
						t.append(tagged_sentence[j+1])
						count += 1
					except:
						oops = 1

				# Numeric form of the current state, used to pick the rule bucket.
				c=int(fsm.current)
				if(fsm.current in high_final_states):
					phrase_count+=1
					# new_temp = new_temp[:-1]
					# if(new_temp[-1]==','):
					# 	new_temp = new_temp[:-1]
					if(c==3 or c==4):
						phras_rules['R1'].append(t)
						#PREPOSITION
					elif(c==18):
						phras_rules['R2'].append(t)
						#PRESENT CONTINUOUS
					elif(c==19):
						phras_rules['R3'].append(t)
						#PRESENT PERFECT
					elif(c==20):
						phras_rules['R4'].append(t)
						#PRESENT PERFECT CONTINUOUS
					elif(c==13):
						phras_rules['R5'].append(t)
						#PAST CONTINUOUS
					elif(c==14):
						phras_rules['R6'].append(t)
						#PAST PERFECT
					elif(c==15):
						phras_rules['R7'].append(t)
						#PAST PERFECT CONTINUOUS
					elif(c==9):
						phras_rules['R8'].append(t)
						#SIMPLE FUTURE
					elif(c==10):
						phras_rules['R9'].append(t)
						#FUTURE CONTINUOUS
					elif(c==21):
						phras_rules['R10'].append(t)
						#FUTURE PERFECT
					elif(c==22):
						phras_rules['R11'].append(t)
						#FUTURE PERFECT CONTINUOUS
					# Start a fresh phrase and move the scan start past the
					# tokens consumed so far in this pass.
					t = []
					fsm.current = '0'
					# backup_k = k
					k = count + k
					flag = 1
					# break
				# RB probe: restore the remembered state and test whether the
				# next tag leads into state '11'.
				if(temp_current in to_rb and j!=len(tagged_sentence)-1):
					# print("----temp-----")
					# print("\n",temp_current)
					fsm.current = temp_current
					try:
						# From state '20' the probe is shifted one token further
						# (tokens j+1 / j+2 instead of j / j+1).
						if(fsm.current=='20'):
							check = fsm.current
							s.append(tagged_sentence[j+1])
							fsm.trigger(tagged_sentence[j+2][1])
						else:
							check = fsm.current
							s.append(tagged_sentence[j])
							fsm.trigger(tagged_sentence[j+1][1])
						# print("-------ENTERED TRYYYYY-------"
						# print(tagged_sentence[j+1])
						# print("\n",fsm.current)
						if(fsm.current=='11'):
							# print("----ENTERED IF----")
							if(check=='20'):
								s.append(tagged_sentence[j+2])
							else:
								s.append(tagged_sentence[j+1])
							# count += 1
						else:
							# print("-----OOPS ELSEEE---")
							# Probe did not end in '11': drop the probe tokens and
							# restore the remembered state.
							s=[]
							fsm.current =temp_current
						rbflag = 1
					except:
						# print("----Uhohhhh------")
						oops = 1
						t = []
						fsm.current = '0'
				# State '11' here means the RB probe succeeded: record it under R12.
				if(fsm.current=='11'):
					phrase_count+=1
					phras_rules['R12'].append(s)
					fsm.current = '0'
					s = []
				# No phrase recorded in this pass yet: move the scan start one token on.
				if(flag==0):
					k += 1
				# Leave the inner loop once a phrase was recorded and the RB probe
				# ran without raising.
				if(rbflag==1 and flag==1):
					break

	# NOTE(review): both names below come from outside this function.
	english_sentence_structure[sentence].append(phrase_count)
	english_sentence_structure[sentence].append(phras_rules)
예제 #5
0
def _phrase_events():
    """Build the Fysom ``events`` list (transition table) used by SenToPhrase.

    Event names are POS-tag strings and states are numbered strings.  The
    entries are generated in the same order, and with the same
    ``name``/``src``/``dst`` values, as the hand-written 104-entry table this
    helper replaces.
    """
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    # States '9'..'14' are reached by the six verb tags, in verb_tags order.
    verb_states = [str(state) for state in range(9, 15)]

    def arc(name, src, dst):
        return {'name': name, 'src': src, 'dst': dst}

    # IN/TO open a phrase ('0' -> '1'); DT, possessives/pronouns and
    # adjectives lead through '2', '4'..'7'; every noun tag ends in '3'.
    events = [arc('IN', '0', '1')]
    events += [arc(tag, '1', '3') for tag in noun_tags]
    events.append(arc('DT', '1', '2'))
    events += [arc(tag, '2', '3') for tag in noun_tags]
    for tag, dst in (('PRP$', '4'), ('JJ', '5'), ('JJR', '6'), ('JJS', '7')):
        events += [arc(tag, src, dst) for src in ('1', '2')]
    events += [arc(tag, src, '3')
               for tag in noun_tags for src in ('5', '6', '7')]
    events += [arc('PRP', src, '4') for src in ('1', '2')]
    events += [arc(tag, '4', '3') for tag in noun_tags]
    events.append(arc('TO', '0', '1'))

    # MD leads to '8'; from '8' or directly from '0' each verb tag leads to
    # its own state '9'..'14'.
    events.append(arc('MD', '0', '8'))
    for src in ('8', '0'):
        events += [arc(tag, src, str(9 + offset))
                   for offset, tag in enumerate(verb_tags)]

    # After a verb state: RB -> '15', JJ -> '16'.
    events += [arc('RB', src, '15') for src in verb_states]
    events += [arc('JJ', src, '16') for src in verb_states]

    # A second verb tag, from '15' or from any verb state, leads to '17'..'22'.
    for src in ['15'] + verb_states:
        events += [arc(tag, src, str(17 + offset))
                   for offset, tag in enumerate(verb_tags)]
    return events


def SenToPhrase(tagged_sentence):
    """Extract phrases from a POS-tagged sentence and re-tag each of them.

    A Fysom state machine (see ``_phrase_events``) is fed the tag of every
    token in order.  The words of the tokens it accepts are accumulated;
    each time the machine reaches a final state the accumulated words are
    passed to ``english_postagger.tag`` and the result is appended to the
    returned list, after which the machine restarts from state ``'0'``.
    When a tag cannot be processed the accumulated words are discarded and
    the machine restarts from ``'0'``; the offending token is dropped.

    Args:
        tagged_sentence: iterable of tokens where ``token[0]`` is the word
            (a string) and ``token[1]`` is the tag used as the event name.

    Returns:
        list: one ``english_postagger.tag(...)`` result per phrase found, in
        sentence order.

    NOTE(review): ``english_postagger`` is not defined in this function and
    must exist in the enclosing module.
    NOTE(review): the machine is reset on every final state, so the
    transitions that leave the final states '4' and '15' can never fire.
    """
    fsm = Fysom({'initial': '0', 'events': _phrase_events()})

    # The original kept "high" (3, 15..22) and "low" (4) final states in two
    # lists but handled both with identical code, so they are merged here.
    final_states = {'3', '4', '15', '16', '17', '18', '19', '20', '21', '22'}

    final = []
    fsm.current = '0'
    phrase = ""
    for token in tagged_sentence:
        try:
            fsm.trigger(token[1])
            phrase += token[0] + " "
        except Exception:
            # trigger() (or the token access) failed -- presumably the tag
            # has no transition from the current state.  Start over.
            # ``except Exception`` instead of a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            fsm.current = '0'
            phrase = ""
            continue

        if fsm.current in final_states:
            fsm.current = '0'
            # split() ignores the trailing separator, so no slicing is needed.
            final.append(english_postagger.tag(phrase.split()))
            phrase = ""
    return final