def __init__(self):
    freeling.util_init_locale("default")

    # Create options set for maco analyzer (DATA and LANG are assumed to
    # be module-level constants, as in the other snippets below)
    op = freeling.maco_options(LANG)
    op.PunctuationFile = DATA + "common/punct.dat"
    op.DictionaryFile = DATA + LANG + "/es-ar/dicc.src"
    op.AffixFile = DATA + LANG + "/afixos.dat"
    op.LocutionsFile = DATA + LANG + "/locucions.dat"
    op.NPdataFile = DATA + LANG + "/np.dat"
    op.QuantitiesFile = DATA + LANG + "/quantities.dat"
    op.ProbabilityFile = DATA + LANG + "/probabilitats.dat"

    # Create analyzers
    self.tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat")
    self.sp = freeling.splitter(DATA + LANG + "/splitter.dat")
    self.mf = freeling.maco(op)

    # Create tagger and orthographic-alternatives module
    self.tg = freeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
    self.alts_ort = freeling.alternatives(DATA + LANG + "/alternatives-ort.dat")

    # Known words
    self.wknown = []
    self.sid = self.sp.open_session()
def inicia(self):
    FREELINGDIR = "/usr/local"
    DATA = FREELINGDIR + "/share/freeling/"
    LANG = "es"
    freeling.util_init_locale("default")

    # Create options set for maco analyzer. Default values are OK,
    # except for data files.
    op = freeling.maco_options("es")
    op.set_active_modules(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)
    op.set_data_files("",
                      DATA + LANG + "/locucions.dat",
                      DATA + LANG + "/quantities.dat",
                      DATA + LANG + "/afixos.dat",
                      DATA + LANG + "/probabilitats.dat",
                      DATA + LANG + "/dicc.src",
                      DATA + LANG + "/np.dat",
                      DATA + "common/punct.dat",
                      DATA + LANG + "/corrector/corrector.dat")

    # Create analyzers
    self.tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat")
    self.sp = freeling.splitter(DATA + LANG + "/splitter.dat")
    self.mf = freeling.maco(op)
    self.tg = freeling.hmm_tagger("es", DATA + LANG + "/tagger.dat", 1, 2)
    self.sen = freeling.senses(DATA + LANG + "/senses.dat")
    ner = freeling.ner(DATA + LANG + "/ner/ner-ab.dat")
    self.parser = freeling.chart_parser(DATA + LANG + "/chunker/grammar-chunk.dat")
    self.dep = freeling.dep_txala(DATA + LANG + "/dep/dependences.dat",
                                  self.parser.get_start_symbol())
def tagger(file_url, out_name):
    freeling.util_init_locale("default")
    ipath = "/usr/local"
    lpath = ipath + "/share/freeling/" + "es" + "/"

    tk = freeling.tokenizer(lpath + "tokenizer.dat")
    sp = freeling.splitter(lpath + "splitter.dat")

    morfo = freeling.maco(my_maco_options("es", lpath))
    morfo.set_active_options(
        False,  # UserMap
        True,   # NumbersDetection
        True,   # PunctuationDetection
        True,   # DatesDetection
        True,   # DictionarySearch
        True,   # AffixAnalysis
        False,  # CompoundAnalysis
        True,   # RetokContractions
        True,   # MultiwordsDetection
        True,   # NERecognition
        False,  # QuantitiesDetection
        True)   # ProbabilityAssignment

    tagger = freeling.hmm_tagger(lpath + "tagger.dat", True, 2)

    with open(file_url, "r") as file:
        text = file.read()

    lw = tk.tokenize(text)
    ls = sp.split(lw)
    ls = morfo.analyze(ls)
    ls = tagger.analyze(ls)
    ProcessSentences(ls, out_name)
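# my_maco_options() is called in the snippet above (and again in a later
# one) but never defined in this collection. A minimal sketch, assuming
# the attribute-style maco_options setup used elsewhere here and the
# standard FreeLing data layout:
def my_maco_options(lang, lpath):
    opt = freeling.maco_options(lang)
    # Provide data files for the morphological submodules; modules that
    # will not be used need no file.
    opt.UserMapFile = ""
    opt.LocutionsFile = lpath + "locucions.dat"
    opt.AffixFile = lpath + "afixos.dat"
    opt.ProbabilityFile = lpath + "probabilitats.dat"
    opt.DictionaryFile = lpath + "dicc.src"
    opt.NPdataFile = lpath + "np.dat"
    opt.PunctuationFile = lpath + "../common/punct.dat"
    return opt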
def __init__(self, language, freeling_dir, options):
    self.logger = logging.getLogger(__name__)
    self.logger.info('FREELING_DIR: %s', freeling_dir)
    self.logger.info('LANGUAGE: %s', language)
    self.logger.info('MACO_OPTIONS: %s', options)

    freeling.util_init_locale("default")

    # Create options set for maco analyzer. Default values are OK,
    # except for data files.
    data_dir = freeling_dir + "/share/freeling/"
    maco_opts = freeling.maco_options(language)
    maco_opts.set_data_files("",
                             data_dir + "common/punct.dat",
                             data_dir + language + "/dicc.src",
                             data_dir + language + "/afixos.dat",
                             "",
                             data_dir + language + "/locucions.dat",
                             data_dir + language + "/np.dat",
                             data_dir + language + "/quantities.dat",
                             data_dir + language + "/probabilitats.dat")

    # Create analyzers
    self.tokenizer = freeling.tokenizer(data_dir + language + "/tokenizer.dat")
    self.splitter = freeling.splitter(data_dir + language + "/splitter.dat")
    self.maco = freeling.maco(maco_opts)
    self.maco.set_active_options(*options)
def prepare_freeling():
    # FreeLing: https://github.com/TALP-UPC/FreeLing
    # (you may download a binary at the releases page there)
    # (GPP: I'm using 4.0)
    # Make sure that the directory containing libfreeling.so
    # (FREELINGDIR/lib) is in your LD_LIBRARY_PATH.
    # Make sure that freeling.py and _freeling.so are in the same
    # directory as this one.
    # An example of FreeLing's Python API is at:
    # https://github.com/TALP-UPC/FreeLing/tree/master/APIs/python

    # Change directories for your location
    FREELINGDIR = "/usr/local"
    DATA = FREELINGDIR + "/share/freeling/"
    LANG = "pt"
    freeling.util_init_locale("default")

    # Create options set for maco analyzer. Default values are OK,
    # except for data files.
    op = freeling.maco_options("pt")
    op.set_data_files(
        "",
        DATA + "common/punct.dat",
        DATA + LANG + "/dicc.src",
        DATA + LANG + "/afixos.dat",
        "",
        DATA + LANG + "/locucions.dat",
        DATA + LANG + "/np.dat",
        "",  # there is no "quantities.dat" for pt
        DATA + LANG + "/probabilitats.dat")

    # Create analyzers
    tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat")
    sp = freeling.splitter(DATA + LANG + "/splitter.dat")
    sid = sp.open_session()
    mf = freeling.maco(op)

    # Activate morpho modules to be used in the next call
    mf.set_active_options(False, True, True, True,   # select which among created
                          True, True, False, True,   # submodules are to be used.
                          True, True, True, True)    # default: all created submodules are used

    # Create tagger, sense annotator, parser, and UKB disambiguator
    tg = freeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
    sen = freeling.senses(DATA + LANG + "/senses.dat")
    parser = freeling.chart_parser(DATA + LANG + "/chunker/grammar-chunk.dat")
    ukb = freeling.ukb(DATA + LANG + "/ukb.dat")
    outputter = freeling.output_conll('./output_conll.dat')

    return tk, sp, sid, mf, tg, sen, parser, ukb, outputter
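# Hypothetical usage of prepare_freeling() above: a minimal sketch of the
# standard FreeLing pipeline order (tokenize, split, morphology, tagging,
# senses, WSD) as used by the other snippets in this collection. The
# sample sentence and variable names are illustrative, not from the source.
tk, sp, sid, mf, tg, sen, parser, ukb, outputter = prepare_freeling()
tokens = tk.tokenize("A Maria chegou ontem a Lisboa.")
sentences = sp.split(sid, tokens, True)
sentences = mf.analyze(sentences)
sentences = tg.analyze(sentences)
sentences = sen.analyze(sentences)
sentences = ukb.analyze(sentences)
print(outputter.PrintResults(sentences))
sp.close_session(sid)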
def setup_tools(config_fn):
    """Set up FreeLing tools according to a config file.

    The tools are returned as a dictionary with the following keys:
      tk  : the tokenizer
      sp  : the sentence splitter
      pos : the part-of-speech tagger
      mf  : the morphological analysis tools (freeling.maco)
      wsd : the word sense tagger
    """
    config = configparser.ConfigParser()
    config.read(config_fn)
    language = config['wsd']['language']
    data = config['freeling']['datadir']
    data_l = data + '/' + language

    tools = {}
    freeling.util_init_locale("default")
    tools['tk'] = freeling.tokenizer(data_l + "/tokenizer.dat")
    tools['sp'] = freeling.splitter(data_l + "/splitter.dat")
    tools['pos'] = freeling.hmm_tagger(language,
                                       data_l + "/tagger.dat",
                                       True,  # Retokenize
                                       2)     # Force selecting one PoS tag after retokenization

    op = freeling.maco_options(language)
    op.set_active_modules(
        0,  # UserMap (for analysis of domain-specific tokens)
        1,  # AffixAnalysis
        1,  # MultiwordsDetection
        1,  # NumbersDetection
        1,  # PunctuationDetection
        0,  # DatesDetection, gives problems with words like "Monday"
        1,  # QuantitiesDetection
        1,  # DictionarySearch
        1,  # ProbabilityAssignment (essential for PoS)
        1,  # OrthographicCorrection (misspellings etc.)
        0)  # NERecognition (Named Entity Recognition)
    op.set_data_files("",
                      data_l + "/locucions.dat",
                      data_l + "/quantities.dat",
                      data_l + "/afixos.dat",
                      data_l + "/probabilitats.dat",
                      data_l + "/dicc.src",
                      data_l + "/np.dat",
                      data + "/common/punct.dat",
                      data_l + "/corrector/corrector.dat")
    tools['mf'] = freeling.maco(op)

    tools['wsd'] = freeling.ukb_wrap(data_l + '/ukb.dat')
    return tools
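# A hypothetical call of setup_tools() above. The config file sketched in
# the comment uses the two sections and keys the code actually reads; the
# file name, path, and language values are placeholders:
#
#   [freeling]
#   datadir = /usr/local/share/freeling
#
#   [wsd]
#   language = en
#
tools = setup_tools("wsd.cfg")
tokens = tools['tk'].tokenize("He sat by the bank of the river.")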
def __init__(self, text):
    super().__init__(text)
    freeling.util_init_locale("default")
    self.la = freeling.lang_ident(DATA + "common/lang_ident/ident.dat")

    op = freeling.maco_options("es")
    op.set_data_files(
        "",
        DATA + "common/punct.dat",
        DATA + LANG + "/dicc.src",
        DATA + LANG + "/afixos.dat",
        "",
        DATA + LANG + "/locucions.dat",
        DATA + LANG + "/np.dat",
        DATA + LANG + "/quantities.dat",
        DATA + LANG + "/probabilitats.dat")

    # Create analyzers
    self.tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat")
    self.sp = freeling.splitter(DATA + LANG + "/splitter.dat")
    self.sid = self.sp.open_session()
    self.mf = freeling.maco(op)

    # Activate morpho modules to be used in the next call
    self.mf.set_active_options(
        False,  # umap: user map module
        True,   # num:  number detection
        True,   # pun:  punctuation detection
        True,   # dat:  date detection
        True,   # dic:  dictionary search
        True,   # aff:  affix analysis
        False,  # com:  compound analysis
        True,   # rtk:  retokenize contractions
        True,   # mw:   multiword recognition
        True,   # ner:  named entity recognition
        True,   # qt:   quantity recognition
        True)   # prb:  probability assignment and guesser
    # default: all created submodules are used

    # Create tagger, sense annotator, and parsers
    self.tg = freeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
    self.sen = freeling.senses(DATA + LANG + "/senses.dat")
    self.parser = freeling.chart_parser(DATA + LANG + "/chunker/grammar-chunk.dat")
    self.dep = freeling.dep_txala(DATA + LANG + "/dep_txala/dependences.dat",
                                  self.parser.get_start_symbol())
def inicializa(self):
    FREELINGDIR = "/usr/local"
    DATA = FREELINGDIR + "/share/freeling/"
    LANG = self.lang
    freeling.util_init_locale("default")

    # Create language identifier
    self.la = freeling.lang_ident(DATA + "common/lang_ident/ident.dat")

    # Options for the maco analyzer
    op = freeling.maco_options("es")
    op.set_active_modules(0, 1, 1, 1, 1, 1, 1, 1, 1, 1)
    op.set_data_files("",
                      DATA + LANG + "/locucions.dat",
                      DATA + LANG + "/quantities.dat",
                      DATA + LANG + "/afixos.dat",
                      DATA + LANG + "/probabilitats.dat",
                      DATA + LANG + "/dicc.src",
                      DATA + LANG + "/np.dat",
                      DATA + "common/punct.dat",
                      DATA + LANG + "/corrector/corrector.dat")

    # Create analyzers
    self.tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat")
    self.sp = freeling.splitter(DATA + LANG + "/splitter.dat")
    self.mf = freeling.maco(op)
    self.tg = freeling.hmm_tagger(DATA + LANG + "/tagger.dat", 1, 2)
    self.sen = freeling.senses(DATA + LANG + "/senses.dat")
    self.nec = freeling.nec(DATA + LANG + "/nerc/nec/nec-ab-rich.dat")
    # self.ner = freeling.nec(DATA + LANG + "/ner/ner-ab.dat")
    self.parser = freeling.chart_parser(DATA + LANG + "/chunker/grammar-chunk.dat")
    self.dep = freeling.dep_txala(DATA + LANG + "/dep/dependences.dat",
                                  self.parser.get_start_symbol())

    con_data = {'user': '******', 'password': '******', 'host': '127.0.0.1',
                'database': 'agiria', 'raise_on_warnings': True,
                'autocommit': True, 'buffered': True}
    self.con = my.connect(**con_data)
def config_files(self, lang, data_dir, data_dir_common):
    data_dir += lang + "/"
    data_conf = data_dir + "nerc/nec/nec.cfg"

    opt = freeling.maco_options(lang)
    # (usr, pun, dic, aff, comp, loc, nps, qty, prb)
    opt.set_data_files("",
                       data_dir_common + "punct.dat",
                       data_dir + "dicc.src",
                       data_dir + "afixos.dat",
                       data_dir + "compounds.dat",
                       data_dir + "locucions.dat",
                       data_dir + "np.dat",
                       data_dir + "quantities.dat",
                       data_dir + "probabilitats.dat")

    self.mf = freeling.maco(opt)
    # (umap, num, pun, dat, dic, aff, comp, rtk, mw, ner, qt, prb)
    # ( 0,    1,   1,   1,   0,   1,   1,    1,   1,  1,   1,  1)
    self.mf.set_active_options(False, True, True, True,
                               False, True, True, True,
                               True, True, True, True)

    self.tk = freeling.tokenizer(data_dir + "tokenizer.dat")
    self.sp = freeling.splitter(data_dir + "splitter.dat")
    self.tg = freeling.hmm_tagger(data_dir + "tagger.dat", True, 2)
    self.sen = freeling.senses(data_dir + "senses.dat")
    self.parser = freeling.chart_parser(data_dir + "chunker/grammar-chunk.dat")
    self.dep = freeling.dep_txala(data_dir + "dep_txala/dependences.dat",
                                  self.parser.get_start_symbol())
    self.nec = freeling.nec(data_conf)
def __init__(self): print("Inicializando Nombres") print(str(datetime.time(datetime.now()))) FREELINGDIR = "/usr/local" DATA = FREELINGDIR + "/share/freeling/" LANG = "es" freeling.util_init_locale("default") op = freeling.maco_options("es") op.set_active_modules(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) op.set_data_files("", DATA + LANG + "/locucions.dat", DATA + LANG + "/quantities.dat", DATA + LANG + "/afixos.dat", DATA + LANG + "/probabilitats.dat", DATA + LANG + "/dicc.src", DATA + LANG + "/np.dat", DATA + "common/punct.dat", DATA + LANG + "/corrector/corrector.dat") # create analyzers self.tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat") self.sp = freeling.splitter(DATA + LANG + "/splitter.dat") self.mf = freeling.maco(op) self.tg = freeling.hmm_tagger("es", DATA + LANG + "/tagger.dat", 1, 2) # self.sen=freeling.senses(DATA+LANG+"/senses.dat"); # self.ner=freeling.ner(DATA+LANG+"/ner/ner-ab.dat"); self.nec = freeling.nec(DATA + LANG + "/nec/nec-ab.dat") # self.parser= freeling.chart_parser(DATA+LANG+"/chunker/grammar-chunk.dat"); self.pondera = { 'titulo': 3, 'intro': 2, 'texto': 1 } #ponderacion dada a cada ner según tipo origen. self.indice = 0 #indice que representa la proporción de ners comunes sobre todos de una noticia self.con = my.connect(**con_data) self.ultimo = 0 self.minimo_comun = 0.45 # porcentaje que tienen que compartir dos noticias para ser relacionadas self.minimo_palabras = 14 #mínimo numero de palabras (pnderadas) para poder entrar en relación # pasamos a list (no_incluir) las palabras que no deben ser consideradas NERs self.cur1 = self.con.cursor() texto = "Select nombre from no_nombres order by nombre" try: self.cur1.execute(texto) except my.Error as err: print("Error seleccionando nombres de tabla no_nombres", format(err)) self.no_incluir = [ ] # lista de palabras a omitir en ners identificados for nombre in self.cur1: try: nombre = str(nombre[0]).upper() nombre = nombre[2:-1] # quitar simbolo de byte b' nombre = nombre.replace('\\XC3\\XA1', 'Á') nombre = nombre.replace('\\XC3\\X81', 'Á') nombre = nombre.replace('\\XC3\\XA9', 'É') nombre = nombre.replace('\\XC3\\XAD', 'Í') nombre = nombre.replace('\\XC3\\X8D', 'Í') nombre = nombre.replace('\\XC3\\XB3', 'Ó') nombre = nombre.replace('\\XC3\\X93', 'Ó') nombre = nombre.replace('\\XC3\\XBA', 'Ú') nombre = nombre.replace('\\XC3\\XBC', 'Ü') nombre = nombre.replace('\\XC3\\XB1', 'Ñ') nombre = nombre.replace('\\XC3\\X91', 'Ñ') nombre = nombre.replace('\\XC2\\XBA', 'º') nombre = nombre.replace('\\XC4\\X82\\XC4\\X84', 'ĂĄ') self.no_incluir.append(nombre) except: print("Error incluyendo no_nombres en lista") self.no_incluir = sorted(set(self.no_incluir)) # corefs de tabla self.cur1 = self.con.cursor() texto = "Select original, coref from coref order by original" try: self.cur1.execute(texto) except my.Error as err: print("Error seleccionando corefs", format(err)) self.corefs = {} # dict de palabras corefs para cambiar en ners for original, coref in self.cur1: self.corefs[original] = coref print("Inicialización terminada", str(datetime.time(datetime.now())))
freeling.util_init_locale("default") # Create options set for maco analyzer. Default values are Ok, except for data files. op = freeling.maco_options(LANG) op.set_active_modules(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) op.set_data_files("", DATA + LANG + "/locucions.dat", DATA + LANG + "/quantities.dat", DATA + LANG + "/afixos.dat", DATA + LANG + "/probabilitats.dat", DATA + LANG + "/dicc.src", DATA + LANG + "/np.dat", DATA + "common/punct.dat", DATA + LANG + "/corrector/corrector.dat") # create analyzers tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat") sp = freeling.splitter(DATA + LANG + "/splitter.dat") mf = freeling.maco(op) def decode_tag(tag): """ Función para decodificar y extender las etiquetas generadas por Freeling en codificación EAGLES """ categoria = tag[0] decoded = "Esta palabra pertenece a la categoría {} ".format( EAGLES_DICT[categoria]['Categoria']) atributos = tag[1:] if len(tag) > 1 else [] aux = '' decoded += "y presenta los siguientes atributos: " for idx, atributo in enumerate(atributos):
# Get requested language from arg1, or Spanish if not provided
lang = "es"

# Get installation path to use from arg2, or use /usr/local if not provided
# ipath = "/usr/local/Cellar/freeling/4.0_4"
ipath = "/usr/local/Cellar/freeling/4.1_3"

# Path to language data
lpath = ipath + "/share/freeling/" + lang + "/"

# Create analyzers
tk = freeling.tokenizer(lpath + "tokenizer.dat")
sp = freeling.splitter(lpath + "splitter.dat")

# Create the analyzer with the required set of maco_options
morfo = freeling.maco(my_maco_options(lang, lpath))
# Then (de)activate the required modules
morfo.set_active_options(
    False,  # UserMap
    True,   # NumbersDetection
    True,   # PunctuationDetection
    True,   # DatesDetection
    True,   # DictionarySearch
    True,   # AffixAnalysis
    False,  # CompoundAnalysis
    True,   # RetokContractions
    True,   # MultiwordsDetection
    True,   # NERecognition
    False,  # QuantitiesDetection
    True)   # ProbabilityAssignment
def __init__(self):
    lang = 'fr'
    ComplexityLanguage.__init__(self, lang)

    ## Modify this line to be your FreeLing installation directory
    FREELINGDIR = "/home/garciacumbreras18/dist/freeling"
    DATA = FREELINGDIR + "/data/"
    CLASSDIR = ""
    self.lang = lang
    freeling.util_init_locale("default")

    # Create language identifier
    self.la = freeling.lang_ident(DATA + "common/lang_ident/ident.dat")

    # Create options set for maco analyzer. Default values are OK,
    # except for data files.
    op = freeling.maco_options(lang)
    op.set_data_files(
        "",
        DATA + "common/punct.dat",
        DATA + lang + "/dicc.src",
        DATA + lang + "/afixos.dat",
        "",
        DATA + lang + "/locucions.dat",
        DATA + lang + "/np.dat",
        DATA + lang + "/quantities.dat",
        DATA + lang + "/probabilitats.dat")

    # Create analyzers
    self.tk = freeling.tokenizer(DATA + lang + "/tokenizer.dat")
    self.sp = freeling.splitter(DATA + lang + "/splitter.dat")
    self.mf = freeling.maco(op)

    # Activate morpho modules to be used in the next call
    self.mf.set_active_options(
        False, True, True, True,   # select which among created
        True, True, False, True,   # submodules are to be used.
        True, True, True, True)    # default: all created submodules are used

    # Create tagger and sense annotator
    self.tg = freeling.hmm_tagger(DATA + lang + "/tagger.dat", True, 2)
    self.sen = freeling.senses(DATA + lang + "/senses.dat")

    f = open(CLASSDIR + '/home/garciacumbreras18/DaleChall.txt')
    lines = f.readlines()
    f.close()
    listDaleChall = []
    for l in lines:
        data = l.strip().split()
        listDaleChall += data
    self.listDaleChall = listDaleChall

    # config is a list of booleans that enable or disable the computation
    # of each measure:
    # config = [
    #     True|False,  # KANDEL MODELS
    #     True|False,  # DALE CHALL
    #     True|False,  # SOL
    # ]
    self.config += [True, True, True]
    self.metricsStr.extend(['KANDEL-MODELS', 'DALE CHALL', 'SOL'])

    self.configExtend += [True, True]
    self.metricsStrExtend.extend(['MEAN RARE WORDS', 'STD RARE WORDS'])
import freeling

# Code extracted from
# https://gist.github.com/arademaker/dffb8de093502b153e85#file-processing-py-L50

FREELINGDIR = '/usr/local'
DATA = FREELINGDIR + '/share/freeling/'
LANGUAGE = 'en'

freeling.util_init_locale('default')

option = freeling.maco_options(LANGUAGE)
option.set_data_files(
    "",
    DATA + "common/punct.dat",
    DATA + LANGUAGE + "/dicc.src",
    DATA + LANGUAGE + "/afixos.dat",
    "",
    DATA + LANGUAGE + "/locucions.dat",
    DATA + LANGUAGE + "/np.dat",
    DATA + LANGUAGE + "/quantities.dat",
    DATA + LANGUAGE + "/probabilitats.dat")

morfo = freeling.maco(option)
tokenizer = freeling.tokenizer(DATA + LANGUAGE + '/tokenizer.dat')
splitter = freeling.splitter(DATA + LANGUAGE + '/splitter.dat')
sid = splitter.open_session()
tagger = freeling.hmm_tagger(DATA + LANGUAGE + '/tagger.dat', True, 2)
parser = freeling.chart_parser(DATA + LANGUAGE + '/chunker/grammar-chunk.dat')

morfo.set_active_options(False, True, True, True,
                         True, True, False, True,
                         True, True, True, True)
freeling.util_init_locale("default") # Se crean opciones para analizador maco op= freeling.maco_options("es") op.set_data_files( "", DATA + "common/punct.dat", DATA + LANG + "/dicc.src", DATA + LANG + "/afixos.dat", "", DATA + LANG + "/locucions.dat", DATA + LANG + "/np.dat", DATA + LANG + "/quantities.dat", DATA + LANG + "/probabilitats.dat") # Se crea el analizador maco con las opciones precreadas morfo = freeling.maco(op) # Se setean los analisis requeridos. Solamente se usa deteccion de numeros y de fechas morfo.set_active_options (False, # UserMap False, # NumbersDetection, False, # PunctuationDetection, False, # DatesDetection, --> Setear a True para considerar fechas False, # DictionarySearch, False, # AffixAnalysis, False, # CompoundAnalysis, False, # RetokContractions, False, # MultiwordsDetection, False, # NERecognition, False, # QuantitiesDetection, False) # ProbabilityAssignment # Se crean tokenizador y splitter
def __init__(self, lang='it'):
    ## Modify this line to be your FreeLing installation directory
    FREELINGDIR = "/home/garciacumbreras18/dist/freeling"
    DATA = FREELINGDIR + "/data/"
    self.DATA = DATA
    self.lang = lang
    freeling.util_init_locale("default")

    # Create language identifier
    self.la = freeling.lang_ident(DATA + "common/lang_ident/ident.dat")

    # Create options set for maco analyzer. Default values are OK,
    # except for data files.
    op = freeling.maco_options(lang)
    op.set_data_files("",
                      self.DATA + "common/punct.dat",
                      self.DATA + self.lang + "/dicc.src",
                      self.DATA + self.lang + "/afixos.dat",
                      "",
                      self.DATA + self.lang + "/locucions.dat",
                      self.DATA + self.lang + "/np.dat",
                      "",
                      self.DATA + self.lang + "/probabilitats.dat")

    # Create analyzers
    self.tk = freeling.tokenizer(self.DATA + self.lang + "/tokenizer.dat")
    self.sp = freeling.splitter(self.DATA + self.lang + "/splitter.dat")
    self.mf = freeling.maco(op)

    # Activate morpho modules to be used in the next call
    self.mf.set_active_options(
        False, True, True, True,   # select which among created
        True, True, False, True,   # submodules are to be used.
        True, True, True, True)    # default: all created submodules are used

    # Create tagger
    self.tg = freeling.hmm_tagger(self.DATA + self.lang + "/tagger.dat", True, 2)
    self.sen = freeling.senses(DATA + lang + "/senses.dat")

    # config is a list of booleans that enable or disable the computation
    # of each measure:
    # config = [
    #     True|False,  # PUNCTUATION MARKS
    #     True|False,  # SCI
    #     True|False,  # ARI
    #     True|False,  # MU
    #     True|False,  # FLESCH-VACA
    #     True|False,  # GULPEASE
    # ]
    # If config == None, all supported complexity metrics are computed.
    self.config = [True, True, True, True, True, True]
    self.metricsIt = [
        'AVERAGE PUNCTUATION MARKS', 'SCI', 'ARI', 'MU',
        'FLESCH-VACA', 'GULPEASE'
    ]
    self.configExtend = [True, True, True, True, True]
    self.metricsItExtend = [
        'MEAN WORDS', 'STD WORDS', 'COMPLEX SENTENCES',
        'MEAN SYLLABLES', 'STD SYLLABLES'
    ]
def __init__(self, text):
    super().__init__(text)
    self.stop_words = set(stopwords.words('spanish') + list(punctuation))
    self._cleaned_text = list()

    freeling.util_init_locale("default")

    # Create language identifier
    la = freeling.lang_ident(DATA + "common/lang_ident/ident.dat")

    # Create options set for maco analyzer. Default values are OK,
    # except for data files.
    op = freeling.maco_options("es")
    op.set_data_files(
        "",
        DATA + "common/punct.dat",
        DATA + LANG + "/dicc.src",
        DATA + LANG + "/afixos.dat",
        "",
        DATA + LANG + "/locucions.dat",
        DATA + LANG + "/np.dat",
        DATA + LANG + "/quantities.dat",
        DATA + LANG + "/probabilitats.dat")

    # Create analyzers
    tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat")
    sp = freeling.splitter(DATA + LANG + "/splitter.dat")
    sid = sp.open_session()
    mf = freeling.maco(op)

    # Activate morpho modules to be used in the next call
    mf.set_active_options(
        True, True, True, True,   # select which among created
        True, True, True, True,   # submodules are to be used.
        True, True, True, True)   # default: all created submodules are used

    # Create tagger, sense annotator, and parsers
    tg = freeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
    sen = freeling.senses(DATA + LANG + "/senses.dat")
    parser = freeling.chart_parser(DATA + LANG + "/chunker/grammar-chunk.dat")

    l = tk.tokenize(self.text)
    ls = sp.split(sid, l, False)
    ls = mf.analyze(ls)
    ls = tg.analyze(ls)
    ls = sen.analyze(ls)
    ls = parser.analyze(ls)

    for s in ls:
        ws = s.get_words()
        for w in ws:
            # Drop stopped words: adpositions (S), interjections (I),
            # conjunctions (C), punctuation (F), determiners (D), and
            # pronouns (P)
            tag = w.get_tag()
            word = w.get_form()
            if not tag.startswith(("S", "I", "C", "F", "D", "P")):
                self._cleaned_text.append("{}-{}".format(word, tag))
def build_freeling(lang):
    ##### Build resources
    FREELINGDIR = "/usr/local"
    DATA = FREELINGDIR + "/share/freeling/"
    LANG_ES = "es"
    LANG_EN = "en"
    freeling.util_init_locale("default")

    if lang == 'es':
        ##### Build Spanish analyzers
        op = freeling.maco_options("es")
        op.set_data_files(
            "",
            DATA + "common/punct.dat",
            DATA + "es" + "/dicc.src",
            DATA + "es" + "/afixos.dat",
            "",
            # "data/locutions_es_processed.dat",
            "",
            DATA + "es" + "/np.dat",
            DATA + "es" + "/quantities.dat",
            DATA + "es" + "/probabilitats.dat")
        op.MultiwordsDetection = True

        # Create analyzers
        tk = freeling.tokenizer(DATA + "es" + "/tokenizer.dat")
        sp = freeling.splitter(DATA + "es" + "/splitter.dat")
        sid = sp.open_session()
        mf = freeling.maco(op)

        # Activate morpho modules to be used in the next call
        mf.set_active_options(
            False, True, True, True,   # select which among created
            True, True, False, True,   # submodules are to be used.
            True, True, True, True)    # default: all created submodules are used

        # Create tagger
        tg = freeling.hmm_tagger(DATA + "es" + "/tagger.dat", True, 2)

    elif lang == 'en':
        ##### Build English analyzers
        op = freeling.maco_options("en")
        op.set_data_files(
            "",
            DATA + "common/punct.dat",
            DATA + "en" + "/dicc.src",
            DATA + "en" + "/afixos.dat",
            "",
            # "data/locutions_en_processed.dat",
            "",
            DATA + "en" + "/np.dat",
            DATA + "en" + "/quantities.dat",
            DATA + "en" + "/probabilitats.dat")

        # Create analyzers
        tk = freeling.tokenizer(DATA + "en" + "/tokenizer.dat")
        sp = freeling.splitter(DATA + "en" + "/splitter.dat")
        sid = sp.open_session()
        mf = freeling.maco(op)

        # Activate morpho modules to be used in the next call
        mf.set_active_options(
            False, True, True, True,   # select which among created
            True, True, False, True,   # submodules are to be used.
            True, True, True, True)    # default: all created submodules are used

        # Create tagger
        tg = freeling.hmm_tagger(DATA + "en" + "/tagger.dat", True, 2)

    else:
        # Guard added: the original fell through and returned unbound names
        raise ValueError("unsupported language: " + lang)

    return tk, sp, sid, mf, tg
def fullParsing(self, text, sentimentText):
    ## Modify this line to be your FreeLing installation directory
    FREELINGDIR = "/usr/local"
    DATA = FREELINGDIR + "/share/freeling/"
    LANG = "es"
    freeling.util_init_locale("default")

    # Create language identifier
    la = freeling.lang_ident(DATA + "common/lang_ident/ident.dat")

    # Create options set for maco analyzer. Default values are OK,
    # except for data files.
    op = freeling.maco_options("es")
    op.set_data_files(
        "",
        DATA + "common/punct.dat",
        DATA + LANG + "/dicc.src",
        DATA + LANG + "/afixos.dat",
        "",
        DATA + LANG + "/locucions.dat",
        DATA + LANG + "/np.dat",
        DATA + LANG + "/quantities.dat",
        DATA + LANG + "/probabilitats.dat")

    # Create analyzers
    tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat")
    sp = freeling.splitter(DATA + LANG + "/splitter.dat")
    sid = sp.open_session()
    mf = freeling.maco(op)

    # Activate morpho modules to be used in the next call
    mf.set_active_options(
        False, True, True, True,   # select which among created
        True, True, False, True,   # submodules are to be used.
        True, True, True, True)    # default: all created submodules are used

    # Create tagger, sense annotator, and parsers
    tg = freeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
    sen = freeling.senses(DATA + LANG + "/senses.dat")
    parser = freeling.chart_parser(DATA + LANG + "/chunker/grammar-chunk.dat")
    dep = freeling.dep_txala(DATA + LANG + "/dep_txala/dependences.dat",
                             parser.get_start_symbol())

    # Split the target into a list
    # print(sentimentText)
    sentimentText += '.'
    if sentimentText[0] == '@':
        sentimentText = sentimentText[1:]
    target = tk.tokenize(sentimentText)
    targets = sp.split(sid, target, True)
    targets = mf.analyze(targets)
    targets = parser.analyze(targets)
    targets = dep.analyze(targets)
    for s in targets:
        targetr = s.get_parse_tree()
        targetList = self.getTreeAsList(targetr, 0)
        del targetList[-1]
        # print(targetList)

    # Process input text
    lin = text
    if lin[0] == '@':
        lin = lin[1:]
    # while (lin):
    l = tk.tokenize(lin)
    ls = sp.split(sid, l, True)
    ls = mf.analyze(ls)
    ls = parser.analyze(ls)
    ls = dep.analyze(ls)

    finalType = None
    finalList = None

    ## Output results
    for s in ls:
        tr = s.get_parse_tree()
        # self.printTree(tr, 0)
        wordType, wordList = self.getTypeNode(tr, 0, targetList)
        if finalType is None and wordType is not None:
            finalType = wordType
            finalList = wordList

    # Clean up
    sp.close_session(sid)
    return finalType, finalList
op= freeling.maco_options("es"); op.set_data_files( "", DATA + "common/punct.dat", DATA + LANG + "/dicc.src", DATA + LANG + "/afixos.dat", "", DATA + LANG + "/locucions.dat", DATA + LANG + "/np.dat", DATA + LANG + "/quantities.dat", DATA + LANG + "/probabilitats.dat"); tk=freeling.tokenizer(DATA+LANG+"/tokenizer.dat"); sp=freeling.splitter(DATA+LANG+"/splitter.dat"); sid=sp.open_session(); mf=freeling.maco(op); mf.set_active_options(False, False, True, False, True, True, False, True, False, True, False, True ) tg=freeling.hmm_tagger(DATA+LANG+"/tagger.dat",True,2) sen=freeling.senses(DATA+LANG+"/senses.dat") parser= freeling.chart_parser(DATA+LANG+"/chunker/grammar-chunk.dat") dep=freeling.dep_txala(DATA+LANG+"/dep_txala/dependences.dat", parser.get_start_symbol()) process_file(input_training_file, output_training_file, [sid, tk, sp, mf, tg, sen, parser, dep]) process_file(input_testing_file, output_testing_file, [sid, tk, sp, mf, tg, sen, parser, dep]) process_file(input_pruebas_file, output_pruebas_file, [sid, tk, sp, mf, tg, sen, parser, dep])
def tag(self):
    try:
        styles = self._styles.get()
        ppf = self._ppf.get()
        if self._only_completes.get() == 1:
            only_completes = True
        else:
            only_completes = False
        if self._webanno.get() == 1:
            webanno = True
        else:
            webanno = False
    except:
        messagebox.showerror(
            title="Ungültige Eingabe",
            message="""Bitte überprüfe, dass es sich bei deiner Eingabe in "Anzahl Sätze pro Datei" um eine ganzzahlige Zahl handelt.""")
        return None

    self._info.set("Starting...")
    self.root.update()

    # Headers for the tsv
    if webanno:
        metadata_header = "webanno.custom.Metadata | Metadatavalue"
        lemma_header = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma | value"
        pos_header = "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS | PosValue"
        new_pos_header = "webanno.custom.NewPOS | SavePOSValue"
        morpho_header = "webanno.custom.Morpho | MorphoValue"
        comment_header = "webanno.custom.Comments | Commentvalue"
        dep_header = ("de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency | "
                      "DependencyType | AttachTo=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS")
        hashtag = " # "

    # This needs to point to the FreeLing install directory
    FREELINGDIR = "/usr/local"
    DATA = FREELINGDIR + "/share/freeling/"
    LANG = "es"
    PATH = DATA + LANG + "/"
    freeling.util_init_locale("default")

    # Create tokenizer and splitter. A splitter is necessary for the
    # process, but our data is already split; no_splitter.dat tells the
    # splitter to never split.
    tk = freeling.tokenizer(PATH + "tokenizer.dat")
    sp = freeling.splitter("RoSeData/no_splitter.dat")
    sid = sp.open_session()

    # Create options set for maco analyzer. Default values are OK,
    # except for data files.
    op = freeling.maco_options("es")
    op.UserMapFile = ""
    op.LocutionsFile = PATH + "locucions.dat"
    op.AffixFile = PATH + "afixos.dat"
    op.ProbabilityFile = PATH + "probabilitats.dat"
    op.DictionaryFile = PATH + "dicc.src"
    op.NPdataFile = PATH + "np.dat"
    op.PunctuationFile = PATH + "../common/punct.dat"
    mf = freeling.maco(op)

    # Activate morpho modules to be used in the next call
    mf.set_active_options(
        False, True, True, True,   # select which among created
        True, True, False, True,   # submodules are to be used.
        True, True, False, True)   # default: all created submodules are used

    # Create tagger
    self._info.set("Generiere Tagger...")
    self.root.update()
    tg = freeling.hmm_tagger(PATH + "tagger.dat", True, 2)

    # Create sense annotator and disambiguator
    self._info.set("Generiere sense disambiguator...")
    self.root.update()
    sen = freeling.senses(PATH + "senses.dat")
    wsd = freeling.ukb(PATH + "ukb.dat")

    # Create parser
    self._info.set("Generiere dependency parser...")
    self.root.update()
    parser = freeling.dep_treeler(PATH + "dep_treeler/dependences.dat")

    # Keep track of how many sentences were counted
    sent_counter = 0
    # Keep track of documents created
    doc_counter = 0
    webanno_sent_counter = 0
    outputter = freeling.output_conll()

    # Write headers
    outf = open("output/" + self._outF.get() + ".xml", encoding='utf-8', mode='w')
    outf.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
    outf.write("<corpus>\n")

    # Start tagging process
    try:
        iterate_docs = ET.iterparse(self._indir.get(), events=("end",), tag="document")
    except:
        messagebox.showerror(
            title="Ungültiger Dateipfad",
            message="Unter dem angegebenen Dateipfad konnte keine XML-Datei gefunden werden.")
        self._info.set("Process stopped.")
        self.root.update()
        return None

    for action, doc in iterate_docs:  # iterate all fileElems
        if True:  # filter in case you only want certain docs
            self._info.set("Dokument {} wird bearbeitet...".format(doc.attrib["file"]))
            self.root.update()

            # Filter out all unwanted phrases. (The second branch
            # originally repeated the first condition; it is meant to
            # catch styles == 'all' without the completeness filter.)
            if styles == 'all' and only_completes:
                phrases = doc.xpath('phrase[contains(@complete,"yes")]')
            elif styles == 'all':
                phrases = doc.xpath('phrase')
            elif only_completes:
                phrases = doc.xpath('phrase[contains(@complete,"yes") and contains(@style,"'
                                    + styles + '")]')
            else:
                phrases = doc.xpath('phrase[contains(@style,"' + styles + '")]')

            for phrase in phrases:
                phrasetext = phrase.text
                lw = tk.tokenize(phrasetext)
                ls = sp.split(sid, lw, True)
                ls = mf.analyze(ls)
                ls = tg.analyze(ls)
                ls = sen.analyze(ls)
                wsdis = wsd.analyze(ls)
                dep = parser.analyze(wsdis)

                if webanno:
                    # Open a new tsv file if the number of phrases per file is reached
                    if sent_counter % ppf == 0:
                        if doc_counter != 0:
                            conllout.close()
                        doc_counter += 1
                        conllout = open(self._outF.get() + '-' + str(doc_counter) + '.tsv',
                                        encoding='utf-8', mode='w')
                        tsvwriter = csv.writer(conllout, delimiter='\t')
                        # Write headers
                        tsvwriter.writerow([
                            hashtag + metadata_header + hashtag + lemma_header
                            + hashtag + pos_header + hashtag + new_pos_header
                            + hashtag + morpho_header + hashtag + comment_header
                            + hashtag + dep_header
                        ])
                        webanno_sent_counter = 0
                    if webanno_sent_counter != 0:
                        tsvwriter.writerow([])
                    tsvwriter.writerow(["#id=" + str(webanno_sent_counter)])

                word_counter = 1
                sent_counter += 1
                self._info2.set(str(sent_counter) + " Sätze wurden analysiert!")
                self.root.update()

                conllstr = outputter.PrintResults(dep)
                tokens_in_sent = conllstr.splitlines()

                # A clunky way to get the tree data
                depdict = {}
                for token in tokens_in_sent:
                    if len(token) > 1:
                        elements = token.split()
                        depdict[elements[0]] = [elements[1], elements[9], elements[10]]

                for sentence in ls:
                    sent_all_info = []  # only needed for the AfterFilter
                    for word in sentence.get_words():
                        dictentry = depdict[str(word_counter)]
                        if dictentry[0] != word.get_form():
                            print("An error occurred! Please check this phrase:", phrasetext)
                        if dictentry[1] == "0":
                            dictentry[1] = str(word_counter)

                        # We give the metadata to the phrase by storing it
                        # as a layer in the first token
                        if word_counter == 1:
                            doc = phrase.getparent()
                            docname = doc.attrib["file"]
                            webanno_metadata = (os.path.basename(self._indir.get())
                                                + ", " + docname + ", " + phrase.attrib["id"])
                        else:
                            webanno_metadata = "_"

                        tokenElem = ET.SubElement(phrase, 'token',
                                                  id=str(word_counter),
                                                  lemma=word.get_lemma(),
                                                  pos=word.get_tag(),
                                                  dep_tag=dictentry[2],
                                                  dep_parent=dictentry[1])
                        tokenElem.text = word.get_form()

                        if webanno:
                            # Save all info as a tuple similar to the webanno/conll format
                            all_info = (word.get_form(), webanno_metadata,
                                        word.get_lemma(), word.get_tag(),
                                        dictentry[2], dictentry[1])
                            sent_all_info.append(all_info)
                        word_counter += 1

                if webanno:
                    allowed = self._AfterFilter(sent_all_info)  # filter the phrases
                    if allowed:
                        webanno_sent_counter += 1
                        this_word_counter = 1
                        # Finally write the phrases to the tsv
                        for element in sent_all_info:
                            tsvwriter.writerow([
                                str(webanno_sent_counter) + "-" + str(this_word_counter),
                                element[0], element[1], element[2], element[3],
                                "_", "_", "O", element[4],
                                str(webanno_sent_counter) + "-" + element[5]
                            ])
                            this_word_counter += 1

            # Write docElem
            docString = ET.tostring(doc, encoding='unicode', pretty_print=True)
            outf.write(docString)
            doc.clear()
            # Also eliminate now-empty references from the root node to elem
            for ancestor in doc.xpath('ancestor-or-self::*'):
                while ancestor.getprevious() is not None:
                    del ancestor.getparent()[0]
            doc.getparent().remove(doc)

    outf.write("</corpus>")
    outf.close()
    del iterate_docs
    if webanno:
        conllout.close()
    sp.close_session(sid)
    self._info.set("Tagging erfolgreich beendet.")
    self.root.update()