def write_core_format_into_conllup_file(sentences, filepath):
    print(
        "Converting {} sentences into CONLLUP format. This requires a text "
        "preprocessor for Romanian. If the following function fails, please "
        "install NLP-Cube (pip3 install nlpcube).".format(len(sentences))
    )
    from cube.api import Cube

    # Full Romanian pipeline (tokenization, tagging, lemmatization, parsing).
    cube = Cube(verbose=True)
    cube.load("ro", tokenization=True, compound_word_expanding=False,
              tagging=True, lemmatization=True, parsing=True)

    # Second instance without tokenization, for input that is already tokenized.
    cube_no_tok = Cube(verbose=True)
    cube_no_tok.load("ro", tokenization=False, compound_word_expanding=False,
                     tagging=True, lemmatization=True, parsing=True)

    conllupdataset = []
    for sentence in sentences:
        sentence = process_split_exceptions(sentence)
        conllupsentence = _conllup_to_core_sentence(sentence, cube, cube_no_tok)
        conllupdataset.append(conllupsentence)

    write_file(filepath, conllupdataset)
def main(filename):
    cube = Cube(verbose=True)
    cube.load('en')

    # Known-word list and the user's personal word list.
    with open('words.txt') as f:
        word_list = [line.rstrip() for line in f.readlines()]
        word_set = set(word_list)
    with open('my_words.txt') as f:
        my_word_list = [line.rstrip() for line in f.readlines()]
        my_word_set = set(my_word_list)

    text = srt_to_text(filename)
    sentences = cube(text)

    new_words = []
    for sentence in sentences:
        for entry in sentence:
            if entry.lemma in word_set and entry.lemma not in my_word_set:
                if entry.lemma not in new_words:
                    new_words.append(entry.lemma)

    print('-' * 100)
    print(f'{len(new_words)} new words found.')
    print('-' * 100)
    for i, word in enumerate(new_words):
        print(i, word)
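# The function above relies on an srt_to_text() helper that is not shown in this
# collection. A minimal sketch of what such a helper might look like, assuming a
# plain .srt file and using only the standard library (the implementation below
# is hypothetical, not the original one):
import re

def srt_to_text(filename):
    """Strip SRT cue numbers, timestamps and simple markup, returning plain text."""
    lines = []
    with open(filename, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines, cue indices ("12") and timestamp lines ("00:01 --> 00:03").
            if not line or line.isdigit() or '-->' in line:
                continue
            # Drop simple HTML-style tags such as <i>...</i>.
            lines.append(re.sub(r'<[^>]+>', '', line))
    return ' '.join(lines)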
def test_4_3_run_model_with_default_external_embeddings(self):
    print("\n\33[33m{}\33[0m".format("4.3. Run a local model with default external embeddings ..."))
    from cube.api import Cube
    cube = Cube(verbose=True)
    cube.load('my_model', tokenization=True, compound_word_expanding=False,
              tagging=True, lemmatization=True, parsing=True)
    text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
    sentences = cube(text)
    self.assertTrue(len(sentences) > 0)
    self.assertTrue(len(sentences[0]) > 0)
def test_2_run_a_local_model(self):
    print("\n\33[33m{}\33[0m".format("2. Run a local model that does not have embeddings or metadata (running with dummy.vec embeddings) ..."))
    embeddings = os.path.join(self.root_path, "examples", "wiki.dummy.vec")
    from cube.api import Cube
    cube = Cube(verbose=True)
    cube.load('my_model', tokenization=True, compound_word_expanding=False,
              tagging=True, lemmatization=True, parsing=True,
              local_models_repository=self.local_model_repo,
              local_embeddings_file=embeddings)
    text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
    sentences = cube(text)
    self.assertTrue(len(sentences) > 0)
    self.assertTrue(len(sentences[0]) > 0)
def test_1_2_download_and_run_an_online_model_specific_version(self):
    print("\n\33[33m{}\33[0m".format("1.2. Loading an online model (sme, 1.0) ..."))
    from cube.api import Cube
    cube = Cube(verbose=True)
    cube.load('sme', version='1.0', tokenization=True, compound_word_expanding=False,
              tagging=False, lemmatization=False, parsing=False)
    cube.metadata.info()
    text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
    sentences = cube(text)
    self.assertTrue(len(sentences) > 0)
    self.assertTrue(len(sentences[0]) > 0)
def prepare_dialogs_sorted_by_lang(dialog_ids, dialog_path, prepared_path,
                                   start_date, end_date, additional_options=""):
    dialog_ids_sorted_by_lang = {"ua": [], "ru": [], "en": []}

    if dialog_ids[0] == -1:
        # -1 means "process every dialog found in dialog_path".
        for filename in os.listdir(dialog_path):
            data = pd.read_csv(f"{dialog_path}/{filename}")
            lang = detect_data_language(data)
            dialog_ids_sorted_by_lang[lang].append(filename[:-4])
    else:
        for dialog in dialog_ids:
            data = pd.read_csv(f"{dialog_path}/{dialog}.csv")
            lang = detect_data_language(data)
            dialog_ids_sorted_by_lang[lang].append(dialog)

    print("dialog_ids_sorted_by_lang")
    pprint(dialog_ids_sorted_by_lang)

    n_all_dialogs = sum(len(dialog_ids_sorted_by_lang[lang])
                        for lang in dialog_ids_sorted_by_lang)
    n_dialog = 0

    for lang in dialog_ids_sorted_by_lang:
        if not dialog_ids_sorted_by_lang[lang]:
            continue

        cube = ""  # placeholder; only "ua" and "en" get an NLP-Cube model below
        if lang == "ua":
            cube = Cube(verbose=True)
            cube.load("uk")
        elif lang == "en":
            cube = Cube(verbose=True)
            cube.load("en")

        for dialog_id in dialog_ids_sorted_by_lang[lang]:
            if f"{dialog_id}.csv" in os.listdir(prepared_path):
                print(f"=========WARNING: {dialog_id}.csv already in {prepared_path}")
                n_dialog += 1
                continue
            n_dialog += 1
            print(f"\n=======Language {lang}, dialog_id {dialog_id}"
                  f"-- {n_dialog} from {n_all_dialogs}=======")
            prepare_dialogs(lang, cube, dialog_id, prepared_path, dialog_path,
                            start_date, end_date, "words_frequency", additional_options)
def test_3_3_run_model_with_manual_embeddings(self):
    print("\n\33[33m{}\33[0m".format("3.3. Run a local model with manual embeddings ..."))
    embeddings = os.path.join(self.root_path, "examples", "wiki.dummy.vec")
    print("\t\tPath to local manual embeddings file: " + embeddings)
    from cube.api import Cube
    cube = Cube(verbose=True)
    cube.load('my_model', tokenization=True, compound_word_expanding=False,
              tagging=True, lemmatization=True, parsing=True,
              local_embeddings_file=embeddings)
    text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
    sentences = cube(text)
    self.assertTrue(len(sentences) > 0)
    self.assertTrue(len(sentences[0]) > 0)
def create_conll_sentences(file_path):
    print("*" * 25 + " Working on transforming the input file '{}' to CoNLL format ".format(file_path) + "*" * 25 + "\n")

    print("Reading the input file...")
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    print("Loading the 'ro' nlp-cube model...")
    cube = Cube(verbose=False)
    cube.load("ro")

    print("Creating the CoNLL sentences...")
    sentences = cube(text)
    print("\n" + "*" * 124 + "\n")

    return sentences
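# The sentences returned above can be serialized to a CoNLL-U style file. This is
# only a minimal sketch (the write_conllu name is made up here); it assumes the
# token fields used elsewhere in this collection: index, word, lemma, upos, xpos,
# attrs, head, label and space_after.
def write_conllu(sentences, out_path):
    with open(out_path, "w", encoding="utf-8") as out:
        for sentence in sentences:
            for entry in sentence:
                # One token per line, tab-separated, CoNLL-U column order;
                # "_" stands in for the DEPS column.
                out.write("\t".join([
                    str(entry.index), entry.word, entry.lemma,
                    entry.upos, entry.xpos, entry.attrs,
                    str(entry.head), str(entry.label),
                    "_", entry.space_after,
                ]) + "\n")
            out.write("\n")  # blank line between sentences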
def get_lemmatized_vocabulary(unlemmatized_voc, epi, lang):
    if os.path.isfile('lemmatized_' + lang + '.p'):
        voc = p.load(open('lemmatized_' + lang + '.p', 'rb'))
        return voc

    # Lemmatizer
    lang_acron = {
        'gothic': 'got',
        'latin': 'la',
        'italian': 'it',
        'german': 'de',
        'greek': 'grc',
        'english': 'eng'
    }
    cube = Cube(verbose=False)
    cube.load(lang_acron[lang])

    voc = {}
    keys = list(unlemmatized_voc.keys())
    for i in tqdm(range(len(keys))):
        w = keys[i]
        sents = cube(w)
        if type(sents[0]) == list:
            for sent in sents:
                for token in sent:
                    if token.lemma != '_':
                        voc[w] = [token.lemma, epi.transliterate(token.lemma), 'L']
                    else:
                        voc[w] = [token.word, epi.transliterate(token.word), 'T']
        else:
            for token in sents:
                if token.lemma != '_':
                    voc[w] = [token.lemma, epi.transliterate(token.lemma), 'L']
                else:
                    voc[w] = [token.word, epi.transliterate(token.word), 'T']

    p.dump(voc, open('lemmatized_' + lang + '.p', 'wb'))
    return voc
def download_model(self):
    if self.lib.lower() == "stanford":
        print("-----------You are going to use Stanford library-----------")
        if self.lang.lower() == "basque":
            print("-------------You are going to use Basque model-------------")
            # MODELS_DIR = '/home/edercarbajo/eu'
            MODELS_DIR = r'J:\TextSimilarity\eu'
            stanfordnlp.download('eu', MODELS_DIR)  # Download the Basque models
            # config = {'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
            #           'lang': 'eu',  # Language code for the language to build the Pipeline in
            #           'tokenize_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_tokenizer.pt',
            #           # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
            #           'pos_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_tagger.pt',
            #           'pos_pretrain_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt.pretrain.pt',
            #           'lemma_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_lemmatizer.pt',
            #           'depparse_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_parser.pt',
            #           'depparse_pretrain_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt.pretrain.pt'
            #           }
            config = {
                'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
                'lang': 'eu',  # Language code for the language to build the Pipeline in
                'tokenize_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tokenizer.pt',
                # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                'pos_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tagger.pt',
                'pos_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt',
                'lemma_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_lemmatizer.pt',
                'depparse_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_parser.pt',
                'depparse_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt'
            }
            self.parser = stanfordnlp.Pipeline(**config)
        else:
            print("............Working...........")
    elif self.lib.lower() == "cube":
        print("-----------You are going to use Cube Library-----------")
        if self.lang.lower() == "basque":
            cube = Cube(verbose=True)
            cube.load("eu", "latest")
            self.parser = cube
        else:
            print("............Working...........")
    else:
        print("You cannot use this library. Please choose a valid library (Cube or Stanford).")
def start():
    from analyzer import Analyzer

    p = ArgumentParser(description="python3 ./main.py -f \"laginak/*.doc.txt\" ")
    optional = p._action_groups.pop()  # Edited this line
    required = p.add_argument_group('Required arguments')
    required.add_argument("-f", "--files", nargs='+',
                          help="Files to analyze (in .txt, .odt, .doc or .docx format)")
    optional.add_argument('-a', '--all', action='store_true',
                          help="Generate a CSV file with all the results")
    optional.add_argument('-s', '--similarity', action='store_true',
                          help="Calculate similarity (max. 5 files)")
    p._action_groups.append(optional)
    opts = p.parse_args()

    FileLoader.load_files(opts.files)
    FileLoader.load_irregular_verbs_list()
    FileLoader.load_dale_chall_list()
    FileLoader.load_connectives_list()
    FileLoader.load_oxford_word_list()

    cube = Cube(verbose=True)  # Load the Cube model
    cube.load("en", "latest")

    df_row = None
    ### Files will be created in this folder
    path = Printer.create_directory(FileLoader.files[0])
    file_num = 0
    total = len(FileLoader.files)

    for input in FileLoader.files:
        texto = Analyzer.process_text(input=input)  # Analyze
        a = Analyzer(texto, input, cube)
        i = a.analyze(opts.similarity)
        df = a.create_dataframe()
        prediction = a.predict_dificulty(df)
        file_num += 1
        p = Printer(input, i)
        p.print_info(opts.similarity, prediction, file_num, total)
        if opts.all:
            df_row = p.write_in_full_csv(df_row, opts.similarity)
        p.generate_csv(path, prediction, opts.similarity)

    if opts.all:
        df_row.to_csv(os.path.join(path, "full_results_aztertest.csv"),
                      encoding='utf-8', index=False)
def download_model(self):
    if self.lib.lower() == "stanford":
        print("-----------You are going to use Stanford library-----------")
        if self.lang.lower() == "basque":
            print("-------------You are going to use Basque model-------------")
            # MODELS_DIR = '/home/kepa/eu'
            MODELS_DIR = r'J:\TextSimilarity\eu'
            stanfordnlp.download('eu', MODELS_DIR)  # Download the Basque models
        elif self.lang.lower() == "english":
            print("-------------You are going to use English model-------------")
            MODELS_DIR = '/home/kepa/en'
            print("-------------Downloading Stanford English model-------------")
            stanfordnlp.download('en', MODELS_DIR)  # Download the English models
        elif self.lang.lower() == "spanish":
            print("-------------You are going to use Spanish model-------------")
            MODELS_DIR = '/home/kepa/es'
            stanfordnlp.download('es', MODELS_DIR)  # Download the Spanish models
        else:
            print("........You cannot use this language...........")
    elif self.lib.lower() == "cube":
        print("-----------You are going to use Cube Library-----------")
        if self.lang.lower() == "basque":
            cube = Cube(verbose=True)
            cube.load("eu", "latest")
        elif self.lang.lower() == "english":
            cube = Cube(verbose=True)
            cube.load("en", "latest")
        elif self.lang.lower() == "spanish":
            cube = Cube(verbose=True)
            cube.load("es", "latest")
        else:
            print("........You cannot use this language...........")
    else:
        print("You cannot use this library. Please choose a valid library (Cube or Stanford).")
class CubeNLP(TeproApi):
    """By Tibi Boroș & co., does sentence splitting, tokenization, POS tagging,
    lemmatization and dependency parsing for Romanian."""

    def __init__(self):
        super().__init__()
        self._algoName = TeproAlgo.algoCube

    @staticmethod
    def sgml2unicode(word: str) -> str:
        word = word.replace("ă", "ă")
        word = word.replace("Ă", "Ă")
        word = word.replace("â", "â")
        word = word.replace("Â", "Â")
        word = word.replace("î", "î")
        word = word.replace("Î", "Î")
        word = word.replace("ş", "ș")
        word = word.replace("Ş", "Ș")
        word = word.replace("ţ", "ț")
        word = word.replace("Ţ", "Ț")
        return word

    @staticmethod
    def _readMSDMappings():
        m2c = {}
        with open(CTAG2MSDMAPFILE, mode="r") as f:
            for line in f:
                line = line.strip()
                parts = line.split()
                if len(parts) == 2:
                    msd = parts[0]
                    ctg = parts[1]
                    m2c[msd] = ctg
                # end if
            # end for line
        # end open file
        return m2c

    @staticmethod
    def _readTblWordForm():
        tbl = {}
        counter = 0
        with open(TBLWORDFORMFILE, mode="r", encoding="utf-8") as f:
            for line in f:
                counter += 1
                if counter > 0 and counter % 100000 == 0:
                    print("{0}.{1}[{2}]: loading tbl.wordform.ro, at line {3}".format(
                        Path(inspect.stack()[0].filename).stem,
                        inspect.stack()[0].function,
                        inspect.stack()[0].lineno,
                        counter),
                        file=sys.stderr, flush=True)

                line = line.strip()
                if line.startswith("#"):
                    continue

                parts = line.split()
                if len(parts) == 3:
                    word = CubeNLP.sgml2unicode(parts[0])
                    lemma = CubeNLP.sgml2unicode(parts[1])
                    if lemma == '=':
                        lemma = word
                    msd = parts[2]
                    if word not in tbl:
                        tbl[word] = {}
                    if msd not in tbl[word]:
                        tbl[word][msd] = []
                    tbl[word][msd].append(lemma)
                # end if parts has 3 elems
            # end for line in f
        # end while open file
        return tbl

    def createApp(self):
        self._cubeInst = Cube(verbose=True)

    def loadResources(self):
        self._cubeInst.load('ro', tokenization=True, compound_word_expanding=False,
                            tagging=True, lemmatization=True, parsing=True)
        self._tblwordform = CubeNLP._readTblWordForm()
        self._msd2ctag = CubeNLP._readMSDMappings()

    def _runApp(self, dto, opNotDone):
        text = dto.getText()
        sentences = self._cubeInst(text)
        sid = 0

        for sent in sentences:
            # Teprolin tokenized sentence
            ttsent = []
            # Teprolin string sentence
            tssent = ""

            for tok in sent:
                tt = TeproTok()
                tt.setId(tok.index)
                tt.setWordForm(tok.word)
                lowerWord = tok.word.lower()
                tt.setMSD(tok.xpos)

                # Assigning the mapped CTAG to the disambiguated MSD
                if tok.xpos in self._msd2ctag:
                    tt.setCTAG(self._msd2ctag[tok.xpos])
                else:
                    tt.setCTAG(tok.xpos)

                lemmaIsSet = False

                # Doing lexicon lemmatization, if possible.
                if tok.word in self._tblwordform:
                    if tok.xpos in self._tblwordform[tok.word] and \
                            len(self._tblwordform[tok.word][tok.xpos]) == 1:
                        # TODO: if lemma is ambiguous, e.g. 'copii' can be 'copil' or 'copie'
                        tt.setLemma(self._tblwordform[tok.word][tok.xpos][0])
                        lemmaIsSet = True
                elif lowerWord in self._tblwordform and \
                        tok.xpos in self._tblwordform[lowerWord] and \
                        len(self._tblwordform[lowerWord][tok.xpos]) == 1:
                    tt.setLemma(self._tblwordform[lowerWord][tok.xpos][0])
                    lemmaIsSet = True

                if not lemmaIsSet:
                    tt.setLemma(tok.lemma)

                tt.setHead(tok.head)
                tt.setDepRel(tok.label)

                tssent += tok.word
                if tok.space_after != "SpaceAfter=No":
                    tssent += " "

                ttsent.append(tt)
            # end ttsent/tssent formation

            if not dto.isOpPerformed(TeproAlgo.getSentenceSplittingOperName()):
                dto.addSentenceString(tssent)
                dto.addSentenceTokens(ttsent)
            else:
                # Check and update annotations that only NLPCube
                # can produce or that are requested specifically from it.
                alignment = dto.alignSentences(ttsent, sid)

                for op in opNotDone:
                    dto.copyTokenAnnotation(ttsent, sid, alignment, op)

            sid += 1

        return dto
# For parsing files from the DB / books
import os
import json
import fnmatch
from input_parser import input_parser
from nltk import tokenize
from textwrap import wrap
from cube.api import Cube

cube = Cube(verbose=True)
cube.load('ro')

booksDir = os.path.abspath(
    os.path.realpath(os.path.join(os.path.dirname(__file__), '../../DB')))
dataDir = os.path.abspath(
    os.path.realpath(os.path.join(os.path.dirname(__file__), '../data')))

books = []
for filename in os.listdir(booksDir):
    if filename.endswith('.txt') and fnmatch.fnmatch(filename, "2???_a_*"):
        books.append(filename)
        if len(books) > 100:
            break

for filename in books:
    book_content = open(os.path.join(booksDir, filename), encoding="utf-8").read()
    if os.path.exists(
print("Found {} local models".format(len(local_models))) model_count = len(online_models) # step 1. download all models for online_model in online_models: model, version = online_model[0], online_model[1] if not online_model in local_models: print("Downloading {}-{}".format(model,version)) else: print("Model {}-{} is already downloaded.".format(model,version)) continue cube = Cube() #cube.load(model, version)??? cube.load(model) print("\n\n") for online_model in online_models: model, version = online_model[0], online_model[1] print("\n\nTesting model {}-{}, @{}".format(model,version, datetime.today())) if model == "pl": continue # go run Cube print("\t Reading metadata ...") metadata = ModelMetadata() metadata.read(os.path.join(local_model_path,model+"-"+str(version),"metadata.json")) mlanguage = metadata.language
    's', 'x', 's', 's', 's', 's', 's', 's', 'm', 's', 's', 's', 's', 'm', 's', 's', 's', 'm', 'm', 's', 's', 's', 's', 's', 's', 's', 's', 'm', 's', 'm', 'm', 's', 's', 'm', 's', 's', 's', 'm', 'm', 's', 's', 'm', 's', 's', 'm', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 'm', 's', 's', 's', 's', 's', 's', 's', 'm', 's', 's', 'm', 's', 's', 's', 's', 's', 's', 'm', 's', 'm', 's', 's', 's', 's', 'm', 's', 's', 's', 'm', 'm', 's', 's', 's', 'm'
]

from cube.api import Cube

cube = Cube(verbose=True)
cube.load("ro", tokenization=True, compound_word_expanding=False,
          tagging=True, lemmatization=True, parsing=True)
cube_no_tok = Cube(verbose=True)
cube_no_tok.load("ro", tokenization=False, compound_word_expanding=False,
                 tagging=True, lemmatization=True, parsing=True)

multi_sentence_count = 0
errors = 0
multi_sentences = []
conllupdataset = []

for sentence in sentences:
    if "Alege: [" in sentence.sentence or "Decide tipul/clasa corecta: [" in sentence.sentence or len(
def cli(ctx, language):
    cube = Cube(verbose=True)
    cube.load(language)
    ctx.ensure_object(dict)
    ctx.obj['CUBE'] = cube
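# A sketch of how a subcommand might consume the Cube instance stored in ctx.obj
# above. It assumes `cli` is a click group callback (its decorators are not shown
# in the original snippet); the `lemmas` command below is hypothetical and would
# still need to be registered on the group (e.g. cli.add_command(lemmas)).
import click

@click.command()
@click.argument('text')
@click.pass_context
def lemmas(ctx, text):
    cube = ctx.obj['CUBE']  # pipeline loaded by the group callback
    for sentence in cube(text):
        print(' '.join(entry.lemma for entry in sentence))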
def map_bibles(f1, f2s, voc, l1='gothic', cube1=True, cube2=False,
               lemmatizer={}, expand_voc=False):
    f1_dict = load_bible(open(f1, 'r'))
    lemma_l1 = False if cube1 == False else True
    count = 0
    mapped = {}
    all_words_count = 0
    found_words = {}
    unfound_words = {}

    for book in f1_dict:
        if book not in mapped:
            mapped[book] = {}
        for chapter in f1_dict[book]:
            if chapter not in mapped[book]:
                mapped[book][chapter] = {}
            for verse in f1_dict[book][chapter]:
                if verse not in mapped[book][chapter]:
                    mapped[book][chapter][verse] = {}
                count += 1
                if count % 500 == 0:
                    print(count)

                mapped[book][chapter][verse][l1] = f1_dict[book][chapter][verse]
                if lemma_l1:
                    mapped[book][chapter][verse][l1 + '_analyzed'] = analyze(
                        f1_dict[book][chapter][verse], cube1, lang='got')
                else:
                    mapped[book][chapter][verse][l1 + '_analyzed'] = fake_analyze(
                        f1_dict[book][chapter][verse], lang='got', lemmatizer=lemmatizer)

                if l1 == 'got':
                    mapped[book][chapter][verse][l1 + '_translation'] = {}
                    lemmas = list(list(zip(*mapped[book][chapter][verse][l1 + '_analyzed']))[2])

                    for lang in all_languages:
                        if lang not in found_words:
                            found_words[lang] = {}
                        if lang not in unfound_words:
                            unfound_words[lang] = {}

                        if lang == 'Got':
                            mapped[book][chapter][verse][l1 + '_translation'][lang] = lemmas
                            mapped[book][chapter][verse][l1 + '_translation'][lang + '_script'] = [
                                gothic_script_transformer(t) for t in lemmas]
                            mapped[book][chapter][verse][l1 + '_translation'][lang + '_ipa'] = [
                                ipa_transformer(t, 'gothic') for t in lemmas]
                        else:
                            if lang not in mapped[book][chapter][verse][l1 + '_translation']:
                                mapped[book][chapter][verse][l1 + '_translation'][lang] = []
                            for word in lemmas:
                                all_words_count += 1
                                if word in voc and lang in voc[word]:
                                    # if lang == 'Lat':
                                    #     print(word, voc[word][lang])
                                    if word not in found_words[lang]:
                                        found_words[lang][word] = 0
                                    found_words[lang][word] += 1
                                    mapped[book][chapter][verse][l1 + '_translation'][lang].append(
                                        voc[word][lang])
                                else:
                                    if word not in unfound_words[lang]:
                                        unfound_words[lang][word] = 0
                                    unfound_words[lang][word] += 1
                                    mapped[book][chapter][verse][l1 + '_translation'][lang].append([])

    for lang in found_words:
        found = sum([found_words[lang][w] for w in found_words[lang]])
        unfound = sum([unfound_words[lang][w] for w in unfound_words[lang]])
        print('token', lang, found, unfound,
              '%.4f' % (float(found) / (found + unfound + 1)))
        found = len(found_words[lang])
        unfound = len(unfound_words[lang])
        print('\ttype', lang, found, unfound,
              '%.4f' % (float(found) / (found + unfound + 1)))
    print('All words: {}'.format(all_words_count))

    for f in f2s:
        l2 = f.split('/')[0]
        langs_epitran = {
            'german': 'deu-Latn',
            'italian': 'ita-Latn',
            'latin': 'ita-Latn',
            'spanish': 'spa-Latn',
            'english': 'eng-Latn'
        }
        if l2 in langs_epitran:
            epi = epitran.Epitran(langs_epitran[l2])
        if cube2:
            lemma_l2 = lemma_l1
            cube2 = Cube(verbose=False)
            cube2.load(lang_acron[l2])
        # if f == 'greek/greek_byzantine_2000_utf8.txt':
        #     pdb.set_trace()
        f2_dict = load_bible(open(f, 'r'))
        lemma_l2 = False if cube1 == False else True

        for book in mapped:
            for chapter in mapped[book]:
                for verse in mapped[book][chapter]:
                    if book not in f2_dict or chapter not in f2_dict[book] \
                            or verse not in f2_dict[book][chapter]:
                        # pdb.set_trace()
                        continue
                    if f2_dict[book][chapter][verse] in ['', '[]', []]:
                        continue
                    # print(f2_dict[book][chapter][verse])
                    # mapped_words = {'german': 'deu-Latn', 'italian': 'ita-Latn', 'latin': 'ita-Latn', 'spanish': 'Es', 'greek': 'Gre'}

                    if lemma_l2:
                        mapped[book][chapter][verse][l2 + '_analyzed'] = analyze(
                            f2_dict[book][chapter][verse], cube2, lang=l2)
                    else:
                        mapped[book][chapter][verse][l2 + '_analyzed'] = fake_analyze(
                            f2_dict[book][chapter][verse], lang=l2)

                    # IPA
                    # try:
                    lemmas = [x for x in (list(
                        zip(*mapped[book][chapter][verse][l2 + '_analyzed']))[2])]
                    # except:
                    #     pdb.set_trace()
                    mapped[book][chapter][verse][l2] = lemmas

                    if l2 == 'greek':
                        mapped[book][chapter][verse][l2 + '_ipa'] = [
                            ipa_transformer(l, 'greek') for l in lemmas]  # .split()
                    elif l2 == 'old_english':
                        try:
                            mapped[book][chapter][verse][l2 + '_ipa'] = [
                                oe(l) for l in lemmas]  # .split()
                        except:
                            pass
                    elif l2 == 'english':
                        try:
                            mapped[book][chapter][verse][l2 + '_ipa'] = [
                                oe(l) for l in lemmas]  # .split()
                            # gen_mods.get_final(gen_mods.getIPA_CMU(f2_dict[book][chapter][verse]))
                        except:
                            pass
                    else:
                        mapped[book][chapter][verse][l2 + '_ipa'] = [
                            epi.transliterate(l) for l in lemmas]  # .split()

                    expand_voc = False
                    if l2 != 'Got' and expand_voc:
                        expand_voc_by_distance(voc, mapped, book, chapter, verse, l2)
                    # pdb.set_trace()

    return mapped
    for entry in LSentence:
        print(entry)
    print("")


if __name__ == '__main__':
    if True:
        found_nl = False
        for iso in CubeNLPPOS.get_L_supported_isos(None):
            if iso not in ('nno', 'nnb'):
                continue

            from cube.api import Cube  # import the Cube object
            cube = Cube(verbose=True)  # initialize it
            cube.load(DSupportedISOs[iso])

        print_pos(
            'id',
            'Tahap pertama konflik ini dapat disebut "Perang Kemerdekaan Belanda".'
        )
        print_pos(
            'en',
            'The first phase of the conflict can be considered the Dutch War of Independence.'
        )
        print_pos('id', 'Saya tidak dapat memakan ini.')
        print_pos('en', 'I can\'t eat this.')
        print_pos('zh', '猴子高兴,实验人员也高兴。')
        print_pos('en', 'The monkeys were happy and the experimenters were happy.')
# filter
import os
import sys
import csv
import json
import datetime
from filter.filter import filter_spam

# lemmatizer
from lemmatizer.lemmatizer import lemmatize
from cube.api import Cube

# detection
sys.path.insert(0, './detector/')
from detect_events import main as detect_events

import luigi

# Spanish pipeline used only for tagging/lemmatization (tokenization and parsing disabled).
lemmatizer = Cube(verbose=True)
lemmatizer.load("es", tokenization=False, parsing=False)


class Streamer(luigi.ExternalTask):
    time_slice = luigi.parameter.DateMinuteParameter(interval=30)

    def output(self):
        fname = '../data/streaming/{}.csv'.format(self.time_slice)
        # print('Requires: {}'.format(fname))
        return luigi.LocalTarget(fname)


class Preprocess(luigi.Task):
    time_slice = luigi.parameter.DateMinuteParameter(
        interval=30, default=datetime.datetime.today())
from cube.api import Cube cube = Cube(verbose=True) cube.load("en", local_models_repository="/mnt/d/nlpcube/") text = "One potential microRNA that regulates Bcan is miR-9 and overexpression of miR-9 can partly rescue the effects of Dicer1 deletion on the MG phenotype." sentences = cube(text) for sentence in sentences: for entry in sentence: print( str(entry.index) + "\t" + entry.word + "\t" + entry.lemma + "\t" + entry.upos + "\t" + entry.xpos + "\t" + entry.attrs + "\t" + str(entry.head) + "\t" + str(entry.label) + "\t" + entry.space_after) print("")
import tnkeeh as tn
from farasa.segmenter import FarasaSegmenter

# for bulgarian and turkish
from cube.api import Cube

en_tok = MosesTokenizer(lang="en")
en_normalizer = MosesPunctNormalizer()

# TODO: change hardcoding of jar file to an arg from cli
rdrsegmenter = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar",
                         annotators="wseg", max_heap_size="-Xmx500m")
ar_segmenter = FarasaSegmenter()

bg_cube = Cube(verbose=False)
bg_cube.load("bg")
tr_cube = Cube(verbose=False)
tr_cube.load("tr")


def clean_ar_text(
    text,
    segment=False,
    remove_special_chars=False,
    remove_english=False,
    normalize=False,
    remove_diacritics=False,
    excluded_chars=[],
    remove_tatweel=False,
    remove_html_elements=False,
local_models = model_store_object.list_local_models()
print("Found {} local models".format(len(local_models)))
model_count = len(online_models)

# step 1. download all models
for online_model in online_models:
    model, version = online_model[0], online_model[1]
    if online_model not in local_models:
        print("Downloading {}-{}".format(model, version))
    else:
        print("Model {}-{} is already downloaded.".format(model, version))
        continue
    cube = Cube()
    cube.load(model, version, local_models_repository=local_model_path)
    # cube.load(model)

print("\n\n")

# for online_model in local_models:  # local_models+online_models:
for online_model in local_models + online_models:
    model, version = online_model[0], online_model[1]
    print("\n\nTesting model {}-{}, @{}".format(model, version, datetime.today()))
    if model == "pl":
        continue

    # go run Cube
    print("\t Reading metadata ...")
    metadata = ModelMetadata()
    metadata.read(
from cube.api import Cube

cube = Cube(verbose=True)
cube.load(
    'ro',
    tokenization=True,
    compound_word_expanding=False,
    tagging=True,
    lemmatization=True,
    parsing=True
)
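# A minimal sketch of running the pipeline loaded above on some Romanian text.
# The sample sentence is made up; the token fields (word, lemma, upos, head, label)
# are the same ones used by the English example earlier in this collection.
text = "Maria merge la școală."  # made-up sample sentence
sentences = cube(text)
for sentence in sentences:
    for entry in sentence:
        # word form, lemma, universal POS, syntactic head and dependency relation
        print(entry.word, entry.lemma, entry.upos, entry.head, entry.label)
    print("")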
import pandas as pd
import os
import sys
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import xml.etree.ElementTree as ET
import numpy as np
import nltk
import string
from nltk.stem import PorterStemmer, SnowballStemmer
from cube.api import Cube

cube = Cube(verbose=True)
cube.load("ro")

# Load the dataset
df = pd.read_csv('pcgarage.csv', delimiter='\t', encoding='utf-16', header=0)

# Preprocessing
# Remove unwanted prefixes from the review fields
df['pro'] = df['pro'].str[4:]
df['contra'] = df['contra'].str[8:]
df['altele'] = df['altele'].str[8:]

# Concatenate the parts
df["corpus"] = df["pro"].astype(str) + " " + df["contra"].astype(str) + " " + df["altele"].astype(str)

data = df[['product', 'rating', 'corpus']].copy()
data['corpus'] = [it.lower().replace('\n\n', ' ') for it in data['corpus']]

# Lowercasing
data['corpus'] = data.corpus.map(lambda x: x.lower())

# Tokenization
# Use NLP-Cube (https://github.com/adobe/NLP-Cube) for sentence and word
# tokenization, and for lemmatization.
from cube.api import Cube  # import the Cube object

cube = Cube(verbose=False)  # initialize it
cube.load("en")

from rusenttokenize import ru_sent_tokenize
import re


class mytokenizer:

    def bracket_mask(self, text):
        # Temporarily replace brackets and '+' so they survive re-tokenization.
        text_temp = re.sub(r"\[", "<<", text)
        text_temp = re.sub(r"\]", ">>", text_temp)
        text_temp = re.sub(r"\+", "===", text_temp)
        parenthesis = re.findall(r"\([^()]*\)", text_temp)
        bracket = re.findall(r"\[[^\[\]]*\]", text)
        if not parenthesis and not bracket:
            return text
        for p in parenthesis:
            p = re.sub(r"[\(\)]", "", p)
            parts = [re.sub(r"\.\s*$", "", s) for s in ru_sent_tokenize(p)]
            new_p = " ; ".join(parts)
            # print("P:", p)
            # print("new_p:", new_p, "\n")
            # print("text_temp:", text_temp)
            text_temp = re.sub(p, new_p, text_temp)
        text = re.sub("<<", "[", text_temp)
        text = re.sub(">>", "]", text)
####################################################################################
if recompute_histograms or not os.path.isfile(histogram_picklefile_gf % (category, FREQUENCY_THRESH)):
    if category == "All":
        D_coords_fixated, D_histogram, D_entropy, D_entropy_df = \
            merge_histograms(histogram_picklefile_gf)
        imnames = list(set(D_coords_fixated.keys()))
        pickle.dump([D_coords_fixated, D_histogram, D_entropy, D_entropy_df],
                    open(histogram_picklefile_gf % (category, FREQUENCY_THRESH), "wb"))
    else:
        D_coords_fixated = get_raw_data()
        imnames = list(set(D_coords_fixated.keys()))

        cube = Cube(verbose=True)
        cube.load('en')

        start = time.time()
        D_histogram, D_entropy, D_entropy_df = compute_histograms(
            D_coords_fixated, imnames, category,
            is_grouping=True, fre_threshold=FREQUENCY_THRESH)
        print(time.time() - start)

        pickle.dump([D_coords_fixated, D_histogram, D_entropy, D_entropy_df],
                    open(histogram_picklefile_gf % (category, FREQUENCY_THRESH), "wb"))
        print(histogram_picklefile_gf % (category, FREQUENCY_THRESH))
else:
    with open(histogram_picklefile_gf % (category, FREQUENCY_THRESH),
def load_model(self):
    if self.lib.lower() == "stanford":
        print("-----------You are going to use Stanford library-----------")
        if self.lang.lower() == "basque":
            print("-------------You are going to use Basque model-------------")
            # config = {'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
            #           'lang': 'eu',  # Language code for the language to build the Pipeline in
            #           'tokenize_model_path': '/home/kepa/eu/eu_bdt_models/eu_bdt_tokenizer.pt',
            #           # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
            #           'pos_model_path': '/home/kepa/eu/eu_bdt_models/eu_bdt_tagger.pt',
            #           'pos_pretrain_path': '/home/kepa/eu/eu_bdt_models/eu_bdt.pretrain.pt',
            #           'lemma_model_path': '/home/kepa/eu/eu_bdt_models/eu_bdt_lemmatizer.pt',
            #           'depparse_model_path': '/home/kepa/eu/eu_bdt_models/eu_bdt_parser.pt',
            #           'depparse_pretrain_path': '/home/kepa/eu/eu_bdt_models/eu_bdt.pretrain.pt'
            #           }
            config = {
                'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
                'lang': 'eu',  # Language code for the language to build the Pipeline in
                'tokenize_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tokenizer.pt',
                # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                'pos_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tagger.pt',
                'pos_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt',
                'lemma_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_lemmatizer.pt',
                'depparse_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_parser.pt',
                'depparse_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt'
            }
            self.parser = stanfordnlp.Pipeline(**config)
        elif self.lang.lower() == "english":
            print("-------------You are going to use English model-------------")
            config = {
                'processors': 'tokenize,mwt,pos,lemma,depparse',  # Comma-separated list of processors to use
                'lang': 'en',  # Language code for the language to build the Pipeline in
                'tokenize_model_path': '/home/kepa/en/en_ewt_models/en_ewt_tokenizer.pt',
                # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                # 'mwt_model_path': './fr_gsd_models/fr_gsd_mwt_expander.pt',
                'pos_model_path': '/home/kepa/en/en_ewt_models/en_ewt_tagger.pt',
                'pos_pretrain_path': '/home/kepa/en/en_ewt_models/en_ewt.pretrain.pt',
                'lemma_model_path': '/home/kepa/en/en_ewt_models/en_ewt_lemmatizer.pt',
                'depparse_model_path': '/home/kepa/en/en_ewt_models/en_ewt_parser.pt',
                'depparse_pretrain_path': '/home/kepa/en/en_ewt_models/en_ewt.pretrain.pt'
            }
            self.parser = stanfordnlp.Pipeline(**config)
        elif self.lang.lower() == "spanish":
            print("-------------You are going to use Spanish model-------------")
            config = {
                'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
                'lang': 'es',  # Language code for the language to build the Pipeline in
                'tokenize_model_path': '/home/kepa/es/es_ancora_models/es_ancora_tokenizer.pt',
                # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                'pos_model_path': '/home/kepa/es/es_ancora_models/es_ancora_tagger.pt',
                'pos_pretrain_path': '/home/kepa/es/es_ancora_models/es_ancora.pretrain.pt',
                'lemma_model_path': '/home/kepa/es/es_ancora_models/es_ancora_lemmatizer.pt',
                'depparse_model_path': '/home/kepa/es/es_ancora_models/es_ancora_parser.pt',
                'depparse_pretrain_path': '/home/kepa/es/es_ancora_models/es_ancora.pretrain.pt'
            }
            self.parser = stanfordnlp.Pipeline(**config)
        else:
            print("........You cannot use this language...........")
    elif self.lib.lower() == "cube":
        print("-----------You are going to use Cube Library-----------")
        if self.lang.lower() == "basque":
            # load(self, language_code, version="latest", local_models_repository=None,
            #      local_embeddings_file=None, tokenization=True, compound_word_expanding=False,
            #      tagging=True, lemmatization=True, parsing=True)
            # Example: load("es", tokenization=False, parsing=False)
            cube = Cube(verbose=True)
            cube.load("eu", "latest")
        elif self.lang.lower() == "english":
            cube = Cube(verbose=True)
            cube.load("en", "latest")
        elif self.lang.lower() == "spanish":
            cube = Cube(verbose=True)
            cube.load("es", "latest")
        else:
            print("........You cannot use this language...........")
    else:
        print("You cannot use this library. Please choose a valid library (Cube or Stanford).")
from typing import List, Dict
from urllib.parse import urlparse, parse_qs
import http.server
import json

from abbrev import full_to_abbrev
from conllu_msd_to_monomial import MSD_dict
from msd_convert import UPOS_to_MSD, MSD_to_attribs

hostName = "localhost"
serverPort = 8080
QUERY = 'q'

from cube.api import Cube
ro_cube = Cube(verbose=True)  # initialize it
ro_cube.load("ro")  # select the desired language (it will auto-download the model on first run)


class POSTagRequestHandler(http.server.BaseHTTPRequestHandler):

    def do_GET(self):
        parse = urlparse(self.path)
        query = parse_qs(parse.query)
        query = {k: ' '.join(query[k]) for k in query.keys()}
        self.send_response(200)
        self.send_header("Content-type", "text/html")
        self.end_headers()
        # self.wfile.write(bytes("<html><head><title>https://pythonbasics.org</title></head>", "utf-8"))
        self.wfile.write(bytes(self._process_input(query[QUERY]), "utf-8"))
        # self.wfile.write(bytes("</body></html>", "utf-8"))

    def end_headers(self):
def create_pickle():
    cube = Cube(verbose=True)
    cube.load("ja")
    with open('cube.pickle', mode='wb') as wh:
        pickle.dump(cube, wh)
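# A minimal sketch of how the pickled pipeline created above might be restored
# and reused in a later run. The load_pickle name is made up for illustration,
# and this assumes the Cube object pickles cleanly in the target environment.
import pickle

def load_pickle(path='cube.pickle'):
    # Hypothetical counterpart to create_pickle(): restore the pickled pipeline.
    with open(path, mode='rb') as rh:
        return pickle.load(rh)

# Usage:
# cube = load_pickle()
# sentences = cube("日本語のテキストを解析します。")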