Exemplo n.º 1
0
def loadAuxiliaryData():
	"""Load the n-gram data and the "nowords" filter into module globals.

	Side effects:
	- Queries the TargetLanguages table for every LangCode3Ltr value and,
	  for each one, stores the decompressed content of
	  ``<ngramFilePath>.<CODE>.bz2`` in ``ngrams`` under the language code
	  exactly as returned by the database (the file name uses the
	  upper-cased code).
	- Rebinds ``nowords`` to the union of the word lists that ``preplists``
	  returns for the files under ``adskUnwordsRoot``.

	Errors from the database or from reading the files are not caught here;
	they propagate to the caller.
	"""
	global ngrams, nowords

	conn = Service.connectToDB()
	try:
		cursor = conn.cursor()
		cursor.execute("select LangCode3Ltr from TargetLanguages")
		langs = cursor.fetchall()
	finally:
		# Release the connection even when the query fails.
		conn.close()

	for lang in langs:
		langCode = lang[0]
		archivePath = ngramFilePath + "." + langCode.upper() + ".bz2"
		if __debug_on__:
			Service.logger.debug("\t\tReading nGram file " + archivePath + "...")
		# The context manager closes the archive as soon as it has been read,
		# instead of leaving one open handle per language to the garbage collector.
		with bz2.BZ2File(archivePath, "r") as archive:
			ngrams[langCode] = archive.read()

	# Load Autodesk-related lists:
	# - ngram-list (from Ventzi, including only the ngrams without counts)
	# - NeXLT product names (------ there is an N/A in it???)
	# - NeXLT language list
	# - city names from http://www.geodatasource.com/ and http://www.maxmind.com/en/worldcities
	# - words which should not be harvested (unwords and general words)
	# - Autodesk trademarks
	# - company names
	# Define nowords as filter
	wordListFiles = (
		"/general_words.txt",
		"/un_words.txt",
		"/autodesk_trademarks.txt",
		"/company_names.txt",
		"/cities_regions.txt",
	)
	# The lists are read in the same order as before; the first one is the
	# receiver of union(), so the result keeps that list's container type.
	wordLists = [preplists(adskUnwordsRoot + fileName) for fileName in wordListFiles]
	nowords = wordLists[0].union(*wordLists[1:])