Exemplo n.º 1
0
	def __init__(self, goldPath, predictedPath=None):
		"""Build the corpus from tab-separated "token<TAB>tag" files.

		goldPath -- path to the gold-standard file: one "token<TAB>tag"
			line per token, sentences separated by blank lines.
		predictedPath -- optional predictions file aligned line-by-line
			with the gold file.  When given, each Token carries both the
			gold and the predicted tag; when omitted, tokens are linked
			to their neighbours and neighbour features are computed.

		Raises Exception when gold and predicted files disagree on a
		token (files out of sync).
		"""
		self.goldPath = goldPath
		self.predictedPath = predictedPath
		self.sents = []        # all sentences in the corpus
		self.sent_stats = {}
		self.numTokens = 0     # total number of tokens in the corpus
		self.tags = set()
		self.tokens = []
		sent = Sentence()

		if predictedPath:
			# Walk both files in lockstep; a blank line ends a sentence.
			tokens_in_sent = 0  # tokens added since the last sentence flush
			with open(goldPath) as gf, open(predictedPath) as pf:
				for gline, pline in zip(gf, pf):  # BUG FIX: izip is Python-2 only
					if gline.strip() and pline.strip():
						gtoken_tag = re.split(r'\t', gline)
						ptoken_tag = re.split(r'\t', pline)
						if gtoken_tag[0] != ptoken_tag[0]:
							raise Exception("Files not in sync")
						sent.addToken(Token(gtoken_tag[0], gtoken_tag[1].strip(), ptoken_tag[1].strip()))
						self.numTokens += 1
						tokens_in_sent += 1
					else:
						self.sents.append(sent)
						sent = Sentence()
						tokens_in_sent = 0
			# BUG FIX: flush the final sentence when the file does not end
			# with a blank line (it was silently dropped before).
			if tokens_in_sent:
				self.sents.append(sent)
		else:
			sentences = []  # all sentences, each a plain list of Tokens
			sentence = []   # tokens of the sentence currently being read
			with open(goldPath) as gf:
				for line in gf:
					if line.strip():
						# split line into token and tag
						token_tag = re.split(r'\t', line)
						sentence.append(Token(token_tag[0].strip(), token_tag[1].strip()))
						self.numTokens += 1
					else:
						# blank line: end of the current sentence
						sentences.append(sentence)
						sentence = []
			# BUG FIX: keep the last sentence when the file does not end
			# with a blank line.
			if sentence:
				sentences.append(sentence)

			# Link every token to its neighbours.  Context deliberately
			# crosses sentence boundaries: "prev" carries over from the
			# previous sentence and "following" peeks into the next one.
			prev = "prevnotekzist"
			for j, sentence in enumerate(sentences):
				for i, token in enumerate(sentence):
					# BUG FIX: reset the sentinel each iteration; the very
					# last token of the corpus used to get a stale value.
					following = "folnotekzist"
					if i + 1 < len(sentence):
						following = sentence[i + 1]
					elif j + 1 < len(sentences):
						# last token of a sentence: first token of the next
						following = sentences[j + 1][0]
					token.setPrev(prev)
					token.setFollowing(following)
					token.getNeighborFeatures()
					prev = token
					sent.addToken(token)
				self.sents.append(sent)
				sent = Sentence()  # BUG FIX: mixed tab/space indent raised TabError here
Exemplo n.º 2
0
	def __init__(self, goldPath, predictedPath=None):
		"""Build the corpus from tab-separated "token<TAB>tag" files.

		goldPath -- path to the gold-standard file: one "token<TAB>tag"
			line per token, sentences separated by blank lines.
		predictedPath -- optional predictions file aligned line-by-line
			with the gold file.  When given, each Token carries both the
			gold and the predicted tag; when omitted, tokens are linked
			to their neighbours and neighbour features are computed.

		Raises Exception when gold and predicted files disagree on a
		token (files out of sync).
		"""
		self.goldPath = goldPath
		self.predictedPath = predictedPath
		self.sents = []        # all sentences in the corpus
		self.sent_stats = {}
		self.numTokens = 0     # total number of tokens in the corpus
		self.tags = set()
		self.tokens = []
		sent = Sentence()

		if predictedPath:
			# Walk both files in lockstep; a blank line ends a sentence.
			tokens_in_sent = 0  # tokens added since the last sentence flush
			with open(goldPath) as gf, open(predictedPath) as pf:
				for gline, pline in zip(gf, pf):  # BUG FIX: izip is Python-2 only
					if gline.strip() and pline.strip():
						gtoken_tag = re.split(r'\t', gline)
						ptoken_tag = re.split(r'\t', pline)
						if gtoken_tag[0] != ptoken_tag[0]:
							raise Exception("Files not in sync")
						sent.addToken(Token(gtoken_tag[0], gtoken_tag[1].strip(), ptoken_tag[1].strip()))
						self.numTokens += 1
						tokens_in_sent += 1
					else:
						self.sents.append(sent)
						sent = Sentence()
						tokens_in_sent = 0
			# BUG FIX: flush the final sentence when the file does not end
			# with a blank line (it was silently dropped before).
			if tokens_in_sent:
				self.sents.append(sent)
		else:
			sentences = []  # all sentences, each a plain list of Tokens
			sentence = []   # tokens of the sentence currently being read
			with open(goldPath) as gf:
				for line in gf:
					if line.strip():
						# split line into token and tag
						token_tag = re.split(r'\t', line)
						sentence.append(Token(token_tag[0].strip(), token_tag[1].strip()))
						self.numTokens += 1
					else:
						# blank line: end of the current sentence
						sentences.append(sentence)
						sentence = []
			# BUG FIX: keep the last sentence when the file does not end
			# with a blank line.
			if sentence:
				sentences.append(sentence)

			# Link every token to its neighbours.  Context deliberately
			# crosses sentence boundaries: "prev" carries over from the
			# previous sentence and "following" peeks into the next one.
			prev = "prevnotekzist"
			for j, sentence in enumerate(sentences):
				for i, token in enumerate(sentence):
					# BUG FIX: reset the sentinel each iteration; the very
					# last token of the corpus used to get a stale value.
					following = "folnotekzist"
					if i + 1 < len(sentence):
						following = sentence[i + 1]
					elif j + 1 < len(sentences):
						# last token of a sentence: first token of the next
						following = sentences[j + 1][0]
					token.setPrev(prev)
					token.setFollowing(following)
					token.getNeighborFeatures()
					prev = token
					sent.addToken(token)
				self.sents.append(sent)
				sent = Sentence()  # BUG FIX: mixed tab/space indent raised TabError here