def __init__(self): ComplexityLanguage.__init__(self, 'en') # create parsers self.parser = freeling.chart_parser(self.DATA + self.lang + "/chunker/grammar-chunk.dat") self.dep = freeling.dep_txala( self.DATA + self.lang + "/dep_txala/dependences.dat", self.parser.get_start_symbol()) """ config es una lista de valores booleanos que activa o desactivan el cálculo de una medida config = [ True|False, # MAXIMUN EMBEDDING DEPTH OF SENTENCE (MaxDEPTH) True|False, # MINIMUN EMBEDDING DEPTH OF SENTENCE (MinDEPTH) True|False, # AVERAGE EMBEDDING DEPTH OF SENTENCE (MeanDEPTH) True|False, # FOG True|False, # FLESCH True|False, # FLESCH-KINCAID True|False, # SMOG ] """ self.config += [True, True, True, True, True, True, True, True] self.metricsStr.extend([ 'MaxDEPTH', 'MinDEPTH', 'MeanDEPTH', 'StdDEPTH', 'FOG', 'FLESCH', 'FLESCH-KINCAID', 'SMOG' ])
def inicia(self):
    """Build the FreeLing Spanish analysis chain and store the analyzers on self."""
    install_root = "/usr/local"
    data_root = install_root + "/share/freeling/"
    lang = "es"
    lang_dir = data_root + lang
    freeling.util_init_locale("default")
    # morphological-analyzer options; defaults are fine apart from the data files
    options = freeling.maco_options("es")
    options.set_active_modules(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)
    options.set_data_files("",
                           lang_dir + "/locucions.dat",
                           lang_dir + "/quantities.dat",
                           lang_dir + "/afixos.dat",
                           lang_dir + "/probabilitats.dat",
                           lang_dir + "/dicc.src",
                           lang_dir + "/np.dat",
                           data_root + "common/punct.dat",
                           lang_dir + "/corrector/corrector.dat")
    # analyzers used by the rest of the pipeline
    self.tk = freeling.tokenizer(lang_dir + "/tokenizer.dat")
    self.sp = freeling.splitter(lang_dir + "/splitter.dat")
    self.mf = freeling.maco(options)
    self.tg = freeling.hmm_tagger("es", lang_dir + "/tagger.dat", 1, 2)
    self.sen = freeling.senses(lang_dir + "/senses.dat")
    # NOTE(review): this NER instance is bound to a local name only and is
    # discarded when the method returns — confirm whether it should be self.ner
    ner = freeling.ner(lang_dir + "/ner/ner-ab.dat")
    self.parser = freeling.chart_parser(lang_dir + "/chunker/grammar-chunk.dat")
    self.dep = freeling.dep_txala(lang_dir + "/dep/dependences.dat",
                                  self.parser.get_start_symbol())
def __init__(self, text):
    """Set up the FreeLing pipeline for this text (module-level DATA/LANG give the resource root)."""
    super().__init__(text)
    freeling.util_init_locale("default")
    # language identifier
    self.la = freeling.lang_ident(DATA + "common/lang_ident/ident.dat")
    base = DATA + LANG
    opts = freeling.maco_options("es")
    # (usr, pun, dic, aff, comp, loc, nps, qty, prb) — usr and comp files unused
    opts.set_data_files("",
                        DATA + "common/punct.dat",
                        base + "/dicc.src",
                        base + "/afixos.dat",
                        "",
                        base + "/locucions.dat",
                        base + "/np.dat",
                        base + "/quantities.dat",
                        base + "/probabilitats.dat")
    # tokenizer / splitter (with an open session) / morphological analyzer
    self.tk = freeling.tokenizer(base + "/tokenizer.dat")
    self.sp = freeling.splitter(base + "/splitter.dat")
    self.sid = self.sp.open_session()
    self.mf = freeling.maco(opts)
    # choose which morphological submodules run in the next analyze() call
    self.mf.set_active_options(False,  # umap  User map module
                               True,   # num   Number Detection
                               True,   # pun   Punctuation Detection
                               True,   # dat   Date Detection
                               True,   # dic   Dictionary Search
                               True,   # aff
                               False,  # com
                               True,   # rtk
                               True,   # mw    Multiword Recognition
                               True,   # ner   Name Entity Recognition
                               True,   # qt    Quantity Recognition
                               True)   # prb   Probability Assignment And Guesser
    # tagger, sense annotator and parsers
    self.tg = freeling.hmm_tagger(base + "/tagger.dat", True, 2)
    self.sen = freeling.senses(base + "/senses.dat")
    self.parser = freeling.chart_parser(base + "/chunker/grammar-chunk.dat")
    self.dep = freeling.dep_txala(base + "/dep_txala/dependences.dat",
                                  self.parser.get_start_symbol())
def process_list(student_list, prompt):
    """Parse every (level, text) essay in `student_list` with FreeLing and
    append (level, parsed_text) to the module-level list selected by `prompt`
    ("V" -> essays_vacation_parsed, "F" -> essays_famous_parsed).

    Fixed: the analyzers (tagger, senses, UKB, chunker, dependency parser)
    do not depend on the essay, so they are now created once instead of
    being rebuilt on every loop iteration as in the original.
    """
    # create tagger
    tagger = freeling.hmm_tagger(lpath + "tagger.dat", True, 2)
    # create sense annotator
    sen = freeling.senses(lpath + "senses.dat")
    # create sense disambiguator
    wsd = freeling.ukb(lpath + "ukb.dat")
    # create dependency parser
    parser = freeling.chart_parser(lpath + "/chunker/grammar-chunk.dat")
    dep = freeling.dep_txala(lpath + "/dep_txala/dependences.dat",
                             parser.get_start_symbol())
    for essay in student_list:
        level = essay[0]
        text = essay[1]
        # tokenize input line into a list of words, then split into sentences
        lw = tk.tokenize(text)
        ls = sp.split(lw)
        # morphosyntactic analysis and disambiguation
        ls = morfo.analyze(ls)
        ls = tagger.analyze(ls)
        # annotate and disambiguate senses
        ls = sen.analyze(ls)
        ls = wsd.analyze(ls)
        # parse sentences
        ls = parser.analyze(ls)
        ls = dep.analyze(ls)
        # get the parsed essay text
        essay_parse = ProcessSentences(ls)
        # append (level, parsed text) to the appropriate essay list
        if prompt == "V":
            essays_vacation_parsed.append((level, essay_parse))
        elif prompt == "F":
            essays_famous_parsed.append((level, essay_parse))
def __init__(self):
    """Spanish complexity analyser.

    Extends ComplexityLanguage with FreeLing parsers, the CREA word
    frequency list, and the Spanish readability metrics.
    """
    ComplexityLanguage.__init__(self, 'es')
    # create parsers (chunker grammar + txala dependency rules)
    self.parser = freeling.chart_parser(self.DATA + self.lang + "/chunker/grammar-chunk.dat")
    self.dep = freeling.dep_txala(self.DATA + self.lang + "/dep_txala/dependences.dat",
                                  self.parser.get_start_symbol())
    # Load the CREA frequency list; words not among its most frequent entries
    # are treated as low-frequency words later on.
    # Fixed: use a context manager so the file is closed even if a line fails
    # to parse (the original open()/readlines()/close() leaked the handle on error).
    CLASSDIR = "/home/garciacumbreras18/"
    with open(CLASSDIR + 'CREA_total.txt') as f:
        lines = f.readlines()
    crea = {}
    # NOTE(review): lines[1:1000] skips the header line and keeps 999 entries,
    # although the original comment says "1000 most frequent words" — confirm
    # the intended cut-off.
    for l in lines[1:1000]:
        data = l.strip().split()
        # column 1 = word, column 2 = frequency with thousands separators
        crea[data[1]] = float(data[2].replace(',', ''))
    self.crea = crea
    # config holds one enable/disable flag per measure, in this order
    # (the original note omitted StdDEPTH):
    #   MaxDEPTH, MinDEPTH, MeanDEPTH, StdDEPTH, LC, SSR, HUERTA, IFSZ,
    #   POLINI, MINIMUN AGE, SOL, CRAWFORD
    self.config += [True, True, True, True, True, True,
                    True, True, True, True, True, True]
    self.metricsStr.extend(['MaxDEPTH', 'MinDEPTH', 'MeanDEPTH', 'StdDEPTH',
                            'LC', 'SSR', 'HUERTA', 'IFSZ', 'POLINI',
                            'MINIMUN AGE', 'SOL', 'CRAWFORD'])
    # extra metrics derived from the CREA list
    self.configExtend += [True, True]
    self.metricsStrExtend.extend(['MEAN RARE WORDS', 'STD RARE WORDS'])
def inicializa(self):
    """Create the FreeLing analyzers for self.lang and open the database connection."""
    install_dir = "/usr/local"
    data_dir = install_dir + "/share/freeling/"
    lang = self.lang
    resources = data_dir + lang
    freeling.util_init_locale("default")
    # language identifier
    self.la = freeling.lang_ident(data_dir + "common/lang_ident/ident.dat")
    # options for the maco morphological analyzer
    op = freeling.maco_options("es")
    op.set_active_modules(0, 1, 1, 1, 1, 1, 1, 1, 1, 1)
    op.set_data_files("",
                      resources + "/locucions.dat",
                      resources + "/quantities.dat",
                      resources + "/afixos.dat",
                      resources + "/probabilitats.dat",
                      resources + "/dicc.src",
                      resources + "/np.dat",
                      data_dir + "common/punct.dat",
                      resources + "/corrector/corrector.dat")
    # analyzers
    self.tk = freeling.tokenizer(resources + "/tokenizer.dat")
    self.sp = freeling.splitter(resources + "/splitter.dat")
    self.mf = freeling.maco(op)
    self.tg = freeling.hmm_tagger(resources + "/tagger.dat", 1, 2)
    self.sen = freeling.senses(resources + "/senses.dat")
    self.nec = freeling.nec(resources + "/nerc/nec/nec-ab-rich.dat")
    # self.ner=freeling.nec(DATA+LANG+"/ner/ner-ab.dat");
    self.parser = freeling.chart_parser(resources + "/chunker/grammar-chunk.dat")
    self.dep = freeling.dep_txala(resources + "/dep/dependences.dat",
                                  self.parser.get_start_symbol())
    # MySQL connection (credentials redacted in source)
    con_data = {'user': '******', 'password': '******', 'host': '127.0.0.1',
                'database': 'agiria', 'raise_on_warnings': True,
                'autocommit': True, 'buffered': True}
    self.con = my.connect(**con_data)
def process_file(essay_lst, x):
    """Parse every (id, essay) pair in `essay_lst` with FreeLing and append
    (id, parsed) to the module-level list selected by `x`
    (2 -> essays_special_tagged, 3 -> essays_terrible_tagged).

    Fixed: the analyzers are essay-independent, so they are now created once
    instead of being rebuilt on every loop iteration as in the original.
    Also renamed the local `id`, which shadowed the builtin.
    """
    # create tagger
    tagger = freeling.hmm_tagger(lpath + "tagger.dat", True, 2)
    # create sense annotator
    sen = freeling.senses(lpath + "senses.dat")
    # create sense disambiguator
    wsd = freeling.ukb(lpath + "ukb.dat")
    # create dependency parser
    parser = freeling.chart_parser(lpath + "/chunker/grammar-chunk.dat")
    dep = freeling.dep_txala(lpath + "/dep_txala/dependences.dat",
                             parser.get_start_symbol())
    for entry in essay_lst:
        essay_id = entry[0]
        essay = entry[1]
        # tokenize input into words, then split into sentences
        lw = tk.tokenize(essay)
        ls = sp.split(lw)
        # morphosyntactic analysis and disambiguation
        ls = morfo.analyze(ls)
        ls = tagger.analyze(ls)
        # annotate and disambiguate senses
        ls = sen.analyze(ls)
        ls = wsd.analyze(ls)
        # parse sentences
        ls = parser.analyze(ls)
        ls = dep.analyze(ls)
        # store the processed sentences in the appropriate list
        if x == 2:
            essays_special_tagged.append((essay_id, ProcessSentences(ls)))
        elif x == 3:
            essays_terrible_tagged.append((essay_id, ProcessSentences(ls)))
def inicializa(self):
    """Build the FreeLing analysis chain and open the MySQL connection."""
    freeling_home = "/usr/local"
    share = freeling_home + "/share/freeling/"
    lang_code = self.lang
    lang_data = share + lang_code
    freeling.util_init_locale("default")
    # language identifier
    self.la = freeling.lang_ident(share + "common/lang_ident/ident.dat")
    # options for the maco morphological analyzer
    op = freeling.maco_options("es")
    op.set_active_modules(0, 1, 1, 1, 1, 1, 1, 1, 1, 1)
    op.set_data_files(
        "",
        lang_data + "/locucions.dat",
        lang_data + "/quantities.dat",
        lang_data + "/afixos.dat",
        lang_data + "/probabilitats.dat",
        lang_data + "/dicc.src",
        lang_data + "/np.dat",
        share + "common/punct.dat",
        lang_data + "/corrector/corrector.dat",
    )
    # analyzers
    self.tk = freeling.tokenizer(lang_data + "/tokenizer.dat")
    self.sp = freeling.splitter(lang_data + "/splitter.dat")
    self.mf = freeling.maco(op)
    self.tg = freeling.hmm_tagger(lang_data + "/tagger.dat", 1, 2)
    self.sen = freeling.senses(lang_data + "/senses.dat")
    self.nec = freeling.nec(lang_data + "/nerc/nec/nec-ab-rich.dat")
    # self.ner=freeling.nec(DATA+LANG+"/ner/ner-ab.dat");
    self.parser = freeling.chart_parser(lang_data + "/chunker/grammar-chunk.dat")
    self.dep = freeling.dep_txala(lang_data + "/dep/dependences.dat",
                                  self.parser.get_start_symbol())
    # MySQL connection (credentials redacted in source)
    self.con = my.connect(
        user='******', password='******', host='127.0.0.1',
        database='agiria', raise_on_warnings=True,
        autocommit=True, buffered=True,
    )
def config_files(self, lang, data_dir, data_dir_common):
    """Configure the FreeLing analyzers for `lang` from the given data dirs.

    `data_dir` is the per-language resource root (a trailing "<lang>/" is
    appended here); `data_dir_common` holds language-independent resources.
    Stores tokenizer, splitter, maco, tagger, senses, parsers, and NEC on self.
    """
    data_dir += lang + "/"
    data_conf = data_dir + "nerc/nec/nec.cfg"
    opt = freeling.maco_options(lang)
    # (usr, pun, dic, aff, comp, loc, nps, qty, prb)
    opt.set_data_files("", data_dir_common + "punct.dat", data_dir + "dicc.src",
                       data_dir + "afixos.dat", data_dir + "compounds.dat",
                       data_dir + "locucions.dat", data_dir + "np.dat",
                       data_dir + "quantities.dat", data_dir + "probabilitats.dat")
    self.mf = freeling.maco(opt)
    # (umap, num, pun, dat, dic, aff, comp, rtk, mw, ner, qt, prb)
    # Fixed comment: the original claimed prb=0, but the call passes True for prb.
    self.mf.set_active_options(False, True, True, True, False, True,
                               True, True, True, True, True, True)
    self.tk = freeling.tokenizer(data_dir + "tokenizer.dat")
    self.sp = freeling.splitter(data_dir + "splitter.dat")
    self.tg = freeling.hmm_tagger(data_dir + "tagger.dat", True, 2)
    self.sen = freeling.senses(data_dir + "senses.dat")
    self.parser = freeling.chart_parser(data_dir + "chunker/grammar-chunk.dat")
    # Fixed: data_dir already ends with "/", so the original leading slash
    # produced a double slash in the dependency-rules path.
    self.dep = freeling.dep_txala(data_dir + "dep_txala/dependences.dat",
                                  self.parser.get_start_symbol())
    self.nec = freeling.nec(data_conf)
# --- tail of an op.set_data_files(...) call whose opening arguments are
#     above this chunk (not visible here) ---
                  DATA + LANG + "/quantities.dat", DATA + LANG + "/probabilitats.dat");
# create the FreeLing analyzers
tk=freeling.tokenizer(DATA+LANG+"/tokenizer.dat");
sp=freeling.splitter(DATA+LANG+"/splitter.dat");
sid=sp.open_session();
mf=freeling.maco(op);
# submodule flags, in signature order:
# (umap, num, pun, dat, dic, aff, comp, rtk, mw, ner, qt, prb)
mf.set_active_options(False, False, True, False, True, True, False, True, False, True, False, True )
# tagger, sense annotator, and parsers
tg=freeling.hmm_tagger(DATA+LANG+"/tagger.dat",True,2)
sen=freeling.senses(DATA+LANG+"/senses.dat")
parser= freeling.chart_parser(DATA+LANG+"/chunker/grammar-chunk.dat")
dep=freeling.dep_txala(DATA+LANG+"/dep_txala/dependences.dat", parser.get_start_symbol())
# run the full pipeline over the training, testing and "pruebas" data sets
process_file(input_training_file, output_training_file, [sid, tk, sp, mf, tg, sen, parser, dep])
process_file(input_testing_file, output_testing_file, [sid, tk, sp, mf, tg, sen, parser, dep])
process_file(input_pruebas_file, output_pruebas_file, [sid, tk, sp, mf, tg, sen, parser, dep])
# close all file handles and release the splitter session
input_training_file.close()
input_pruebas_file.close()
input_testing_file.close()
output_pruebas_file.close()
output_testing_file.close()
output_training_file.close()
sp.close_session(sid);
True, # select which among created True, True, False, True, # submodules are to be used. True, True, True, True) # default: all created submodules are used # create tagger, sense anotator, and parsers tg = freeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2) sen = freeling.senses(DATA + LANG + "/senses.dat") parser = freeling.chart_parser(DATA + LANG + "/chunker/grammar-chunk.dat") dep = freeling.dep_txala(DATA + LANG + "/dep_txala/dependences.dat", parser.get_start_symbol()) # process input text lin = sys.stdin.readline() print("Text language is: " + la.identify_language(lin, ["es", "ca", "en", "it"]) + "\n") while (lin): l = tk.tokenize(lin) ls = sp.split(sid, l, False) ls = mf.analyze(ls) ls = tg.analyze(ls) ls = sen.analyze(ls)
def fullParsing(self, text, sentimentText):
    """Dependency-parse `text` and locate the node covering `sentimentText`.

    Builds a full FreeLing pipeline, parses the sentiment target into a flat
    node list (via self.getTreeAsList), then walks the parse trees of `text`
    looking for a matching node (via self.getTypeNode).

    Returns:
        (finalType, finalList): type and node list of the first sentence that
        yields a non-None type, or (None, None) when no sentence matches.
    """
    ## Modify this line to be your FreeLing installation directory
    FREELINGDIR = "/usr/local"
    DATA = FREELINGDIR + "/share/freeling/"
    LANG = "es"
    freeling.util_init_locale("default")
    # create language analyzer
    la = freeling.lang_ident(DATA + "common/lang_ident/ident.dat")
    # create options set for maco analyzer. Default values are Ok, except for data files.
    op = freeling.maco_options("es")
    op.set_data_files(
        "", DATA + "common/punct.dat", DATA + LANG + "/dicc.src",
        DATA + LANG + "/afixos.dat", "", DATA + LANG + "/locucions.dat",
        DATA + LANG + "/np.dat", DATA + LANG + "/quantities.dat",
        DATA + LANG + "/probabilitats.dat")
    # create analyzers
    tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat")
    sp = freeling.splitter(DATA + LANG + "/splitter.dat")
    sid = sp.open_session()
    # Fixed: close the splitter session even when an analyzer call raises —
    # the original leaked the session on any exception below.
    try:
        mf = freeling.maco(op)
        # activate morpho modules to be used in next call
        mf.set_active_options(
            False, True, True, True,   # select which among created
            True, True, False, True,   # submodules are to be used.
            True, True, True, True)    # default: all created submodules are used
        # create tagger, sense anotator, and parsers
        tg = freeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
        sen = freeling.senses(DATA + LANG + "/senses.dat")
        parser = freeling.chart_parser(DATA + LANG + "/chunker/grammar-chunk.dat")
        dep = freeling.dep_txala(DATA + LANG + "/dep_txala/dependences.dat",
                                 parser.get_start_symbol())
        # terminate the target with a sentence delimiter; strip a leading '@'
        sentimentText += '.'
        if sentimentText[0] == '@':
            sentimentText = sentimentText[1:]
        target = tk.tokenize(sentimentText)
        targets = sp.split(sid, target, True)
        targets = mf.analyze(targets)
        targets = parser.analyze(targets)
        targets = dep.analyze(targets)
        for s in targets:
            targetr = s.get_parse_tree()
            targetList = self.getTreeAsList(targetr, 0)
            # drop the trailing node produced by the appended '.'
            del targetList[-1]
        # NOTE(review): only the last sentence's targetList survives the loop —
        # assumes the target fits in one sentence; confirm with callers.
        # process input text
        lin = text
        if lin[0] == '@':
            lin = lin[1:]
        l = tk.tokenize(lin)
        ls = sp.split(sid, l, True)
        ls = mf.analyze(ls)
        ls = parser.analyze(ls)
        ls = dep.analyze(ls)
        finalType = None
        finalList = None
        # keep the result of the first sentence whose tree yields a non-None type
        for s in ls:
            tr = s.get_parse_tree()
            wordType, wordList = self.getTypeNode(tr, 0, targetList)
            if finalType is None and wordType is not None:
                finalType = wordType
                finalList = wordList
        return finalType, finalList
    finally:
        # clean up
        sp.close_session(sid)