Example #1
import codecs
import os
import re
import sys

# Accents and ParseXml are project-specific helpers assumed to be defined elsewhere in the project.

class Seeds:
	def __init__(self):
		self.dic_seeds = {}
		self.accents = Accents()
		self.__buildDic__()

	def __buildDic__(self):
		# Load the seed terms from seeds.txt (UTF-8), one term per line.
		try:
			file_seeds = codecs.open('seeds.txt', 'r', 'utf-8')
		except IOError:
			print 'ERROR: System cannot open the seeds.txt file'
			sys.exit()

		for line in file_seeds:
			line = line.replace('\n', '')
			if line != '':
				line = self.accents.buildCodes(line)
				self.dic_seeds[line] = line

		file_seeds.close()

	def getQtySeeds(self):
		return len(self.dic_seeds)

	def getSeeds(self):
		return sorted(self.dic_seeds.keys())

	def printSeeds(self):
		print self.dic_seeds

	def printQtySeeds(self):
		print len(self.dic_seeds)
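
	# NOTE: the method below uses self.corpus_folder, self.parameters and
	# self.__writeCorpusFile__(), which are not set up in this class; in the
	# original project they are presumably provided elsewhere.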
	def __buildStatisticalCorpus__(self):
		try:
			# os.walk() yields nothing for a missing folder, so next() raises StopIteration.
			root, dirs, files = os.walk(self.corpus_folder).next()
		except StopIteration:
			print 'ERROR: It was not possible to open the ../Data/Corpus/Raw/ folder'
			sys.exit()

		accents = Accents()
		for corpus_file in files:
			if re.match('.*xml$', corpus_file):
				corpus_filename = corpus_file.split('.')[0]
				xmlfile = ParseXml(os.path.join(root, corpus_file))
				dic_terms = xmlfile.getDicTerms()
				dic_nouns = xmlfile.getNouns()
				dic_verbs = xmlfile.getVerbs()

				id_sentence = 1
				id_word = 1
				id_t = 's'+str(id_sentence)+'_'+str(id_word)

				string_full = ''
				string_nouns = ''
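				# Term keys follow the pattern 's<sentence>_<word>' (e.g. 's1_2' is the
				# second word of the first sentence): the outer loop advances over
				# sentences, the inner loop over the words of the current sentence.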
				while dic_terms.has_key(id_t):
					while dic_terms.has_key(id_t):
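						# Skip tokens whose POS tag starts with pu/num/conj/art/prp/spec,
						# lemmas containing '$', and lemmas shorter than the configured
						# minimum word size.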
						if not re.match('^(pu|num|conj|art|prp|spec)', dic_terms[id_t]['pos']) and (re.search('[$]', dic_terms[id_t]['lemma']) is None) and (len(dic_terms[id_t]['lemma']) >= self.parameters.getMinWordSize()):
							lemma = accents.buildCodes(dic_terms[id_t]['lemma'])
							if dic_nouns.has_key(id_t):
								string_nouns += lemma+'__N '
								string_full += lemma+'__N '
							elif dic_verbs.has_key(id_t):
								string_nouns += lemma+'__V '
								string_full += lemma+'__V '
							else:
								string_full += lemma+'__O '
							string_nouns = string_nouns.replace('-', '_')
							string_full = string_full.replace('-', '_')
						id_word += 1
						id_t = 's'+str(id_sentence)+'_'+str(id_word)
					id_word = 1
					id_sentence += 1
					id_t = 's'+str(id_sentence)+'_'+str(id_word)
				self.__writeCorpusFile__(corpus_filename, string_full, string_nouns)
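
# A minimal usage sketch (an assumption, not part of the original example): it
# presumes seeds.txt exists in the working directory with one seed term per line
# and that the project's Accents class is importable as used above.
seeds = Seeds()                   # reads seeds.txt and normalizes each term
seeds.printQtySeeds()             # number of distinct seed terms loaded
seeds.printSeeds()                # the normalized seed dictionary
seed_terms = seeds.getSeeds()     # sorted list of seed terms for later processing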