def loadAuxiliaryData():
    """Load the per-language n-gram data and build the ``nowords`` filter.

    Side effects on module-level state:

    * ``ngrams`` gains one entry per row returned by the
      ``TargetLanguages`` query. The key is the language code exactly as
      returned by the database; the value is the full decompressed content
      of ``<ngramFilePath>.<CODE>.bz2``, where ``CODE`` is the upper-cased
      language code.
    * ``nowords`` is rebound to the union of the collections returned by
      ``preplists`` for the five word-list files under ``adskUnwordsRoot``.

    Exceptions raised by the database layer, by reading the archives or by
    ``preplists`` are not caught here and propagate to the caller.
    """
    global ngramFilePath, adskUnwordsRoot
    global ngrams, nowords

    conn = Service.connectToDB()
    try:
        cursor = conn.cursor()
        cursor.execute("select LangCode3Ltr from TargetLanguages")
        langs = cursor.fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    for lang in langs:
        code = lang[0]
        archivePath = ngramFilePath + "." + code.upper() + ".bz2"
        if __debug_on__:
            Service.logger.debug("\t\tReading nGram file " + archivePath + "...")
        # The context manager closes the archive as soon as it has been read,
        # instead of leaving the handle to the garbage collector.
        with bz2.BZ2File(archivePath, "r") as archive:
            ngrams[code] = archive.read()

    # Load Autodesk-related lists (note kept from the original author):
    # - ngram-list (from Ventzi, including only the ngrams without counts)
    # - NeXLT product names (------ there is an N/A in it???)
    # - NeXLT language list
    # - city names from http://www.geodatasource.com/ and
    #   http://www.maxmind.com/en/worldcities
    # - words which should not be harvested (unwords and general words)
    # - Autodesk trademarks
    # - company names
    # Only the five files below are actually read by this function.
    wordListFiles = (
        "/general_words.txt",
        "/un_words.txt",
        "/autodesk_trademarks.txt",
        "/company_names.txt",
        "/cities_regions.txt",
    )

    # Define nowords as filter: fold every list into one collection.
    # NOTE(review): assumes preplists returns a set-like object whose
    # union() is associative (true for set/frozenset) -- confirm.
    combined = preplists(adskUnwordsRoot + wordListFiles[0])
    for fileName in wordListFiles[1:]:
        combined = combined.union(preplists(adskUnwordsRoot + fileName))
    nowords = combined