示例#1
0
def main():
    """Load the Egunean Behin results CSV, select the score columns and show a 3D plot.

    Relies on the project-local CsvReader and Plotter classes and on NumPy (np).
    """
    reader = CsvReader()
    reader.load_csv(
        "/media/jetxeberria/linux_storage/data/documents/besteak/egunean_behin_results.csv"
    )
    reader.select_columns_by_name(["Aciertos", "time", "Puntuacion"])

    plotter = Plotter()

    # Skip the header row (index 0) of each selected column.
    # NOTE: np.float was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin float yields the same float64 dtype.
    x = np.array(reader.selection[1:, 0], dtype=float)
    y = np.array(reader.selection[1:, 1], dtype=float)
    z = np.array(reader.selection[1:, 2], dtype=float)
    print(type(reader.selection))
    print(x)
    print(y)
    print(z)
    title = "Egunean Behin lehiaketa puntuatioiak"
    plot = plotter.plot_3d(x,
                           y,
                           z,
                           suptitle=title,
                           xlabel="Aciertos",
                           ylabel="Tiempo [min]",
                           zlabel="Puntuacion")
    # TODO: persist the figure once save_plot is available, e.g. to
    # /media/jetxeberria/linux_storage/data/documents/besteak/egunean_behin_graph.png

    print(reader.selection)
class Wordanalyzer(object):
	"""Main class: CSV-based vocabulary analysis, enhancement and export helpers."""
	CSV_SEPARATOR = ','
	TEXT_DELIMITER = '"'
	WORD_SPLIT_REGEX = r'[ \t-\.,:;!\?\(\)"\'“”]'
	# Lazily created shared DictCc instance (see get_translation_en).
	DICT_CC_MODULE = None
	# Ordered replacement pairs applied sequentially; digraphs must precede their
	# component letters. Any occurrence of the letter x will not be replaced due
	# to the unpredictability of its pronunciation (h and y need special treatment).
	SPANISH_TO_GERMAN = [
		('ch', 'tsch'), ('cc', 'ks'), ('j', 'ch'), ('ñ', 'nj'), ('ll', 'j'), ('v', 'b'), ('z', 's'),
		('ca', 'ka'), ('cá', 'ká'), ('co', 'ko'), ('có', 'kó'), ('c', 'k'), ('cú', 'kú'),
		('ce', 'se'), ('cé', 'sé'), ('ci', 'si'), ('cí', 'sí'),
		('que', 'ke'), ('qué', 'ké'), ('qui', 'ki'), ('quí', 'kí'),
		('ge', 'che'), ('gé', 'ché'), ('gi', 'chi'), ('gí', 'chí'),
		('gue', 'ge'), ('gué', 'gé'), ('gui', 'gi'), ('guí', 'gí'),
		('güe', 'gue'), ('güé', 'gué'), ('güi', 'gui'), ('güí', 'guí'),
		('e', 'e·'), ('é', 'é·'), ('eú', 'e·ú'),
		('ei', 'e·i'), ('éi', 'é·i'), ('eí', 'e·í'),
		('ie', 'i·e'), ('íe', 'í·e'), ('ié', 'i·é')
	]
	# Maps each plain vowel to its accented form.
	# BUGFIX: the 'u' key had degraded to an empty string (mojibake), which made
	# it impossible to accentuate words stressed on 'u'.
	ACCENT_CONVERSION = {
		'a' : 'á', 'e' : 'é', 'i' : 'í', 'o' : 'ó', 'u' : 'ú'
	}
	
	@staticmethod
	def get_translation_es(word):
		"""Fetch the translation of the Spanish [word] from spanishdict.com."""
		return SpanishdictCom().get_translation(word)
	#end def
	
	@staticmethod
	def get_translation_es2(word):
		"""Fetch the translation of the Spanish [word] from dix.osola.com (German)."""
		return DixOsolaComDe().get_translation(word)
	#end def
	
	@staticmethod
	def get_translation_en(word):
		"""Fetch the translation of the English [word] from dict.cc.

		The DictCc backend is created lazily once and shared across calls.
		"""
		# Identity check ('is None') instead of '== None'.
		if Wordanalyzer.DICT_CC_MODULE is None:
			Wordanalyzer.DICT_CC_MODULE = DictCc()
		#end if
		
		return Wordanalyzer.DICT_CC_MODULE.get_translation(word)
	#end def
	
	@staticmethod
	def get_translation_de(word):
		"""Fetch the translation of the German [word] from de.wiktionary.org."""
		return DeWiktionaryOrg().get_translation(word)
	#end def
	
	@staticmethod
	def get_ipa_en(word):
		"""Fetch the IPA transcription of the English [word] from the Oxford dictionary."""
		info = OxfordDictionary().get_info(word)
		return info['ipa']
	#end def
	
	@staticmethod
	def get_ipa_de(word):
		"""Fetch the IPA transcription of the German [word] from de.wiktionary.org."""
		return DeWiktionaryOrg().get_ipa(word)
	#end def
	
	@staticmethod
	def has_accent(word):
		for char in word:
			if char == 'á' or char == 'é' or char == 'í' or char == 'ó' or char == 'ú':
				return True
			#end if
		#end for
		
		return False
	#end def
	
	@staticmethod
	def is_vowel(char):
		return char == 'a' or char == 'e' or char == 'i' or char == 'o' or char == ''
	#end def
	
	@staticmethod
	def add_accent_es(word):
		"""Add a written accent to the presumed stressed vowel of the Spanish [word].

		Scans the word from its end and accents the first vowel found after
		"enough" consonants (a heuristic for Spanish stress placement). Words
		that already carry an accent are returned unchanged.

		NOTE(review): depends on Wordanalyzer.is_vowel and ACCENT_CONVERSION;
		as long as 'u' is missing there, words stressed on 'u' pass through
		unaccented — confirm after those tables are fixed.
		"""
		if Wordanalyzer.has_accent(word):
			return word
		#end if
		
		last_char = word[-1]
		consonants = 0
		chars = list(word)
		# Walk right-to-left so the stressed vowel can be patched in place by index.
		for i, char in reversed(list(enumerate(chars))):
			if Wordanalyzer.is_vowel(char):
				# Words ending in n/s require more trailing consonants before a
				# vowel is considered stressed than words ending otherwise.
				if last_char == 'n' or last_char == 's':
					if consonants > 1:
						chars[i] = Wordanalyzer.ACCENT_CONVERSION[char]
						break
					#end if
				elif consonants > 0:
					chars[i] = Wordanalyzer.ACCENT_CONVERSION[char]
					break
				#end if
			else:
				consonants += 1
			#end if
		#end for
		
		return ''.join(chars)
	#end def
	
	@staticmethod
	def spanish_to_phonetic_de(word):
		#phonetics = ' '.join([Wordanalyzer.add_accent_es(word) for word in re.split(r'[ \/\(\)\[\]¡!¿?;\.:_-]', re.sub(r'(?:[^c])h', '', word.lower())) if len(word) > 0])
		phonetics = re.sub(r'(?:[^c])h', '', word.lower())
		words_with_accents = [(word, Wordanalyzer.add_accent_es(word)) for word in re.findall(r'(?u)\w+', phonetics)]
		
		for word_pair in words_with_accents:
			if word_pair[0] != word_pair[1]:
				phonetics = phonetics.replace(word_pair[0], word_pair[1])
			#end if
		#end for
		
		for conversion in Wordanalyzer.SPANISH_TO_GERMAN:
			phonetics = phonetics.replace(conversion[0], conversion[1])
		#end for
		
		return re.sub(r'y ', 'i ', re.sub(r'y$', 'i', phonetics)).replace('y', 'j')
	#end def
	
	@staticmethod
	def get_conjugation_es(verb, tense):
		"""Fetch the conjugation of the Spanish [verb] in [tense] from dix.osola.com."""
		return DixOsolaComConjugator().get_conjugation(verb, tense)
	#end def
	
	@staticmethod
	def get_normalized_words(rows):
		return set([Translator.normalize_word(row.original_word) for row in rows])
	#end def
	
	def __init__(self, column_format, src, out = None):
		"""Create an analyzer bound to one input file and one output stream.
	
		Keyword arguments:
		column_format -- pipe-separated column identifier string (e.g. 'O|M|T')
		src -- path to the input file
		out -- optional output path; when None, results go to stdout
		"""
		self._src = src
		# Identity check ('is None') instead of '== None'.
		self._ostream = sys.stdout if out is None else codecs.open(out, 'w', 'utf-8')
		self._column_format = column_format
		
		self._reader = CsvReader(Wordanalyzer.CSV_SEPARATOR, Wordanalyzer.TEXT_DELIMITER, column_format)
	#end def
	
	def close(self):
		if self._ostream != sys.stdout:
			self._ostream.close()
		#end if
	#end def
	
	def _parse_src(self):
		"""TODO decribe what this method does."""
		return self._reader.parse(self._src)
	#end def
	
	def print_csv_row(self, cols):
		"""TODO decribe what this method does.
	
		Keyword arguments:
		cols -- TODO describe what this argument is about
		"""	
		first_col = True
	
		for col in cols:
			if not first_col:
				self._ostream.write(Wordanalyzer.CSV_SEPARATOR)
			else:
				first_col = False
			#end if
		
			self._ostream.write('"')
			self._ostream.write(col)
			self._ostream.write('"')
		#end for
	
		self._ostream.write('\n')
	#end def
	
	def print_header_row(self, column_format):
		"""TODO decribe what this method does.
		
		Keyword arguments:
		column_format -- column format as array which is used for the current table
		"""
		header_columns = []
		
		for col in column_format:
			header_columns.append('[%s]' % {
				'O': 'Original word',
				'P': 'IPA',
				'S': 'Synonyms',
				'A': 'Antonyms',
				'E': 'Example sentence',
				'D': 'Definition',
				'M': 'Translation',
				'W': 'Word type',
				'L': 'Level',
				'T': 'Tags',
				# special column identifiers
				'N': 'Normalized word',
				'NM': 'New translation',
				'NW': 'New word type',
				'NP': 'New IPA'
			}[col])
		#end for
	
		self.print_csv_row(header_columns)
	#end def

	def print_word_list(self, lang = None):
		"""Parses text file containing standard text and generates vocabulary list without translations from it.
	
		Keyword arguments:
		src -- path to input file
		out -- output stream
		"""
		with codecs.open(self._src, 'r', 'utf-8') as f:	
			text = f.read().replace('\n', ' ') # replace newlines with spaces to avoid cut-off words
			wordset = set(word for word in re.findall(r"(?u)[\w'-]{3,}", text) if re.match(r'(?u)^[^0-9]+$', word))
			words = None
			translation_lambda = None
			
			if lang == 'es':
				words = sorted(wordset - wordlist_es.WORD_COLLECTION_ES)
			elif lang == 'de':
				words = sorted(wordset)
			elif lang == 'en':
				words = sorted(wordset)
			else:
				words = sorted(wordset)
			#end if
			
			self.print_csv_row(['[Original word]', '[Translation]']) # print header
			for word in words:
				if lang == 'en':
					# skip words that accidentially have two hyphens at the end
					# skip plurals and 3rd-person-s verbs if they exist in base form
					# skip past tense verbs if they exist in base form
					if word.endswith('--') or \
						word[-1] == 's' and word[:-1] in words or \
						word[-2:] == 'ed' and word[:-2] in words or word[:-1] in words:
						continue
					#end if
				#end if
				
				self.print_csv_row([word, ''])
			#end for
		#end with
	#end def
	
	def print_enhanced_table(self):
		"""Enhance a Spanish/English CSV table with normalized word, translation and word-type columns.
		
		Reads the configured source table; each sufficiently complete row is looked
		up on spanishdict.com in a worker thread and printed via __print_enhanced_row.
		"""
		thread_pool = ThreadPool(self.__print_enhanced_row)
		rows = self._parse_src()
		
		if len(rows) > 0:
			self.print_header_row(rows[0].get_column_format())
		else:
			self.print_header_row(self._column_format.split('|'))
		#end if
		
		for row in rows:
			if len(row) < 2:
				print('Skip incomplete row')
				continue
			#end if
			
			# The staticmethod itself is the worker callable; no lambda wrapper needed.
			task = Processable(Wordanalyzer.get_translation_es, row.original_word, row)
			task.start()
			thread_pool.add(task)
		#end for
		
		thread_pool.finish()
	#end def
	
	def print_enhanced_table_en(self):
		"""Enhance an English CSV table with translation data fetched from dict.cc."""
		thread_pool = ThreadPool(self.__print_enhanced_row_en)
		rows = self._parse_src()
		
		if len(rows) > 0:
			self.print_header_row(rows[0].get_column_format())
		else:
			self.print_header_row(self._column_format.split('|'))
		#end if
		
		for row in rows:
			if len(row) < 2:
				print('Skip incomplete row')
				continue
			#end if
			
			# The staticmethod itself is the worker callable; no lambda wrapper needed.
			task = Processable(Wordanalyzer.get_translation_en, row.original_word, row)
			task.start()
			thread_pool.add(task)
		#end for
		
		thread_pool.finish()
	#end def
	
	def print_enhanced_table_de(self):
		"""Enhance a German CSV table with IPA data fetched from de.wiktionary.org."""
		thread_pool = ThreadPool(self.__print_enhanced_row_de)
		rows = self._parse_src()

		if len(rows) > 0:
			self.print_header_row(rows[0].get_column_format())
		else:
			self.print_header_row(self._column_format.split('|'))
		#end if
		
		for row in rows:
			if len(row) < 2:
				print('Skip incomplete row')
				continue
			#end if
			
			# The staticmethod itself is the worker callable; no lambda wrapper needed.
			task = Processable(Wordanalyzer.get_ipa_de, row[0], row)
			task.start()
			thread_pool.add(task)
		#end for
		
		thread_pool.finish()
	#end def
	
	def print_table_with_ipa_en(self):
		"""Re-emit the source table with English IPA transcriptions added via worker threads."""
		thread_pool = ThreadPool(self.__print_row_with_ipa)
		rows = self._parse_src()

		if len(rows) > 0:
			self.print_header_row(rows[0].get_column_format())
		else:
			self.print_header_row(self._column_format.split('|'))
		#end if
		
		for row in rows:
			if len(row) < 1:
				print('Skip incomplete row')
				continue
			#end if
			
			# The staticmethod itself is the worker callable; no lambda wrapper needed.
			task = Processable(Wordanalyzer.get_ipa_en, row[0], row)
			task.start()
			thread_pool.add(task)
		#end for
		
		thread_pool.finish()
	#end def
	
	def print_table_with_ipa_de(self):
		"""Re-emit the source table with German IPA transcriptions added via worker threads."""
		thread_pool = ThreadPool(self.__print_row_with_ipa)
		rows = self._parse_src()

		if len(rows) > 0:
			self.print_header_row(rows[0].get_column_format())
		else:
			self.print_header_row(self._column_format.split('|'))
		#end if
		
		for row in rows:
			if len(row) < 1:
				print('Skip incomplete row')
				continue
			#end if
			
			# The staticmethod itself is the worker callable; no lambda wrapper needed.
			task = Processable(Wordanalyzer.get_ipa_de, row[0], row)
			task.start()
			thread_pool.add(task)
		#end for
		
		thread_pool.finish()
	#end def
	
	def print_table_with_phonetics_es(self):
		"""TODO decribe what this method does."""
		rows = self._parse_src()
		
		self.print_header_row(rows[0].get_column_format() if len(rows) > 0 else self._column_format.split('|'))
		for row in rows:
			if len(row) < 1:
				print('Skip incomplete row')
			else:
				row.insert(1, Wordanalyzer.spanish_to_phonetic_de(row[0]))
				self.print_csv_row(row)
			#end if
		#end for
	#end def
	
	
	def print_conjugation_table(self, tenses):
		"""Print one conjugation row per verb and tense from the source table.
	
		Keyword arguments:
		tenses -- iterable of tense identifiers understood by the conjugation backend
		"""	
		thread_pool = ThreadPool(self.__print_conjugation_table)
		rows = self._parse_src()
		
		self.print_csv_row(['[Infinitive]', '[yo]', '[tú]', '[el/ella/usted]', '[nosotros, -as]', '[vosotros, -as]', '[ellos/ellas/ustedes]', '[Tense]']) # print header
		for row in rows:
			if len(row) < 1:
				print('Skip incomplete row')
			else:
				for tense in tenses:
					# BUGFIX: bind the current tense as a default argument. The
					# original lambda closed over the loop variable, so a worker
					# running after the loop advanced could conjugate with the
					# wrong (late-bound) tense.
					p = Processable(lambda word, tense=tense: Wordanalyzer.get_conjugation_es(word, tense), row.original_word, row)
					p.start()
				
					thread_pool.add(p)
				#end for
			#end if
		#end for
		
		thread_pool.finish()
	#end def
	
	def print_word_array(self, lang_code):
		"""Emit the source's words as a generated Python module defining WORD_COLLECTION_<lang_code>.
	
		Keyword arguments:
		lang_code -- language code used in the generated constant name (e.g. 'es', 'en')
		"""
		rows = self._parse_src()

		self._ostream.write('#!/usr/bin/env python3\n\nWORD_COLLECTION_%s = set(sorted([\n' % lang_code) # print header
		first_row = True
		if lang_code == 'es':
			for row in rows:
				# Spanish entries may expand to several word variants.
				for word in Translator.resolve_word_list(row.original_word):
					first_row = self.__write_word_array_entry(word.lower(), first_row)
				#end for
			#end for
		else:
			for row in rows:
				word = Translator.strip_annotations(row.original_word).lower() # normalize entry
				first_row = self.__write_word_array_entry(word, first_row)
			#end for
		#end if
		
		self._ostream.write(']))\n')
	#end def
	
	def __write_word_array_entry(self, word, first_row):
		"""Write one quoted entry of the generated word array; returns the updated first_row flag."""
		# Leading separator style: first entry gets spaces, later ones a comma.
		self._ostream.write('\t  u\'' if first_row else '\t, u\'')
		self._ostream.write(word.replace('\'', '\\\''))
		self._ostream.write('\'\n')
		return False
	#end def
	
	def print_new_words_es(self):
		"""Print only rows whose normalized word variants are not all in the known Spanish collection."""
		for row in self._parse_src():
			normalized = {Translator.normalize_word(word) for word in Translator.resolve_word_list(row.original_word)}
			
			if normalized <= wordlist_es.WORD_COLLECTION_ES: # every variant is already known
				print('Removed entry %s (Normalized: %s)' % (row.original_word, ', '.join(normalized)))
			else:
				self.print_csv_row(row)
			#end if
		#end for
	#end def
	
	def print_new_words_en(self):
		"""Print only rows whose normalized word is not yet in the known English collection."""
		for row in self._parse_src():
			normalized = Translator.strip_annotations(row.original_word).lower() # normalize entry
			
			if normalized in wordlist_en.WORD_COLLECTION_EN: # already known
				print('Removed entry %s (Normalized: %s)' % (row.original_word, normalized))
			else:
				self.print_csv_row(row)
			#end if
		#end for
	#end def
	
	def print_difference(self, newfile):
		"""Prints rows from [newfile] that are not part of [self._src].
	
		Keyword arguments:
		newfile -- path string to the file which should be compared to the other one
		"""
		total_count = 0
		new_count   = 0
		ignore = [row.original_word.strip().lower() for row in self._parse_src()] # [(row.original_word[3:] if row.original_word.startswith('to ') else row.original_word).strip().lower() for row in self._parse_src()]
		
		for row in self._reader.parse(newfile):
			word = row.original_word.strip().lower()
			normalized = word # word[3:] if word.startswith('to ') else word
			
			if normalized not in ignore:
				self.print_csv_row(row)
				new_count += 1
			else:
				print('Omit "' + word + '"')
			#end if
			total_count += 1
		#end
	
		print("%d new rows" % new_count)
		print("%d rows deleted" % (total_count - new_count))
	#end def
	
	def print_commons_marked(self, diff_file):
		"""TODO decribe what this method does.
	
		Keyword arguments:
		diff_file -- TODO describe what this argument is about
		"""	
		rows = self._parse_src()
		marked = Wordanalyzer.get_normalized_words(self._reader.parse(diff_file))
		
		self.print_header_row(rows[0].get_column_format() if len(rows) > 0 else self._column_format.split('|'))
		for row in rows:
			normalized = Translator.normalize_word(row.original_word)
			
			if normalized in marked:
				if len(row.tags) > 0:
					row.tags = 'marked %s' % row.tags
				else:
					row.tags = 'marked'
				#end if
				
				marked.remove(normalized)
				print('Debug: tags = %s' % row.tags)
			#end if
			
			self.print_csv_row(row)
		#end for
		
		for word in marked:
			self.print_csv_row([word, '', '', '', '', '', '', 'new'])
		#end for
	#end def
	
	def print_without_duplicates(self):
		"""TODO decribe what this method does."""	
		total_count = 0
		new_count   = 0
		rows = self._parse_src()
		checked_words = []
		
		for row in rows:
			word = row[0].strip()
			normalized = word.lower() # word[3:].lower() if word.startswith('to ') else word.lower()
			
			if normalized not in checked_words:
				checked_words.append(normalized)
				
				if word != row[0]: # remove trailing whitespaces
					row[0] = word
				#end if
				self.print_csv_row(row)
			else:
				new_count += 1
				print('Drop duplicate "' + word + '"')
			#end if
			
			total_count += 1
		#end for
		
		print('Removed %d duplicates from %d lines.' % (new_count, total_count))
	#end def
	
	def __print_word_row(self, result, row):
		"""TODO decribe what this method does.
	
		Keyword arguments:
		result -- TODO describe what this argument is about
		row -- TODO describe what this argument is about
		"""
		row.append('unknown' if result['normalized'] == None else result['normalized'])
		row.append('unknown' if result['translation'] == None else result['translation'])
		row.append('' if result['wordtype'] == None else result['wordtype'])
		
		self.print_csv_row(row)
	#end def
	
	def __print_enhanced_row(self, result, row):
		"""TODO decribe what this method does.
	
		Keyword arguments:
		result -- TODO describe what this argument is about
		row -- TODO describe what this argument is about
		"""
		normalized  = '{unknown}' if result['normalized'] == None else result['normalized']
		translation = result['translation']
		wordtype    = '' if result['wordtype'] == None else result['wordtype']
		
		if translation == None:
			translation = '{unknown}'
		elif len(translation) > 0 and translation in row.translation:
			translation = '{duplicate} ' + translation
		#endif

		row.normalized_word = normalized
		row.new_translation = translation
		row.new_wordtype    = wordtype

		self.print_csv_row(row)
	#end def
	
	def __print_enhanced_row_en(self, result, row):
		"""TODO decribe what this method does.
	
		Keyword arguments:
		result -- TODO describe what this argument is about
		row -- TODO describe what this argument is about
		"""
		normalized  = '{unknown}' if result['normalized'] == None else result['normalized']
		translation = result['translation']
		wordtype    = '' if result['wordtype'] == None else result['wordtype']
		
		if translation == None:
			translation = '{unknown}'
		elif len(translation) > 0 and translation in row.translation:
			translation = '{duplicate} ' + translation
		#endif

		row.normalized_word = normalized
		row.new_translation = translation
		row.new_wordtype    = wordtype

		self.print_csv_row(row)
	#end def
	
	def __print_enhanced_row_de(self, result, row):
		"""TODO decribe what this method does.
	
		Keyword arguments:
		result -- TODO describe what this argument is about
		row -- TODO describe what this argument is about
		"""
		normalized  = '{unknown}' if result['normalized'] == None else result['normalized']
		ipa         = '{unknown}' if result['ipa'] == None else result['ipa']
		wordtype    = '' if result['wordtype'] == None else result['wordtype']

		row.normalized_word = normalized
		row.new_ipa         = ipa
		row.new_wordtype    = wordtype

		self.print_csv_row(row)
	#end def
	
	def __print_row_with_ipa(self, result, row):
		"""TODO decribe what this method does.
	
		Keyword arguments:
		result -- TODO describe what this argument is about
		row -- TODO describe what this argument is about
		"""
		if result != None:
			row.new_ipa = result
		#end if
		
		self.print_csv_row(row)
	#end def
	
	def __print_conjugation_table(self, result, row):
		"""Append the conjugated forms in [result] (when available) to [row] and print the row.
	
		Keyword arguments:
		result -- presumably the list of conjugated forms for one tense, or None
		          when the lookup failed — confirm against get_conjugation_es
		row -- row the conjugation columns are appended to
		"""
		# A failed lookup (None) leaves the row unchanged but still prints it.
		if result != None:
			row.extend(result)
		#end if
		
		self.print_csv_row(row)