def main(filename):
    cube = Cube(verbose=True)
    cube.load('en')
    with open('words.txt') as f:
        word_list = [line.rstrip() for line in f]
    word_set = set(word_list)
    with open('my_words.txt') as f:
        my_word_list = [line.rstrip() for line in f]
    my_word_set = set(my_word_list)
    text = srt_to_text(filename)
    sentences = cube(text)
    new_words = []
    for sentence in sentences:
        for entry in sentence:
            # Test membership against the set (the original tested against
            # the list, which is O(n) per lookup and left my_word_set unused).
            if entry.lemma in word_set and entry.lemma not in my_word_set:
                if entry.lemma not in new_words:
                    new_words.append(entry.lemma)
    print('-' * 100)
    print(f'{len(new_words)} new words found.')
    print('-' * 100)
    for i, word in enumerate(new_words):
        print(i, word)
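# main() above relies on srt_to_text(), which is not shown. A minimal,
# dependency-free sketch of what such a helper might look like (the parsing
# details are an assumption, not from the source): drop cue numbers and
# timestamp lines from the .srt file and keep only the subtitle text.
import re

def srt_to_text(filename):
    lines = []
    with open(filename, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or line.isdigit() or '-->' in line:
                continue  # skip blank lines, cue numbers, and timestamps
            lines.append(re.sub(r'<[^>]+>', '', line))  # strip formatting tags
    return ' '.join(lines)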
def test_4_3_run_model_with_default_external_embeddings(self):
    print("\n\33[33m{}\33[0m".format("4.3. Run a local model with default external embeddings ..."))
    from cube.api import Cube
    cube = Cube(verbose=True)
    cube.load('my_model', tokenization=True, compound_word_expanding=False,
              tagging=True, lemmatization=True, parsing=True)
    text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
    sentences = cube(text)
    self.assertTrue(len(sentences) > 0)
    self.assertTrue(len(sentences[0]) > 0)
def test_2_run_a_local_model(self):
    print("\n\33[33m{}\33[0m".format("2. Run a local model that does not have embeddings or metadata (running with dummy.vec embeddings) ..."))
    embeddings = os.path.join(self.root_path, "examples", "wiki.dummy.vec")
    from cube.api import Cube
    cube = Cube(verbose=True)
    cube.load('my_model', tokenization=True, compound_word_expanding=False,
              tagging=True, lemmatization=True, parsing=True,
              local_models_repository=self.local_model_repo,
              local_embeddings_file=embeddings)
    text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
    sentences = cube(text)
    self.assertTrue(len(sentences) > 0)
    self.assertTrue(len(sentences[0]) > 0)
def test_1_2_download_and_run_an_online_model_specific_version(self):
    print("\n\33[33m{}\33[0m".format("1.2. Loading an online model (sme, 1.0) ..."))
    from cube.api import Cube
    cube = Cube(verbose=True)
    cube.load('sme', version='1.0', tokenization=True,
              compound_word_expanding=False, tagging=False,
              lemmatization=False, parsing=False)
    cube.metadata.info()
    text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
    sentences = cube(text)
    self.assertTrue(len(sentences) > 0)
    self.assertTrue(len(sentences[0]) > 0)
def test_3_3_run_model_with_manual_embeddings(self):
    print("\n\33[33m{}\33[0m".format("3.3. Run a local model with manual embeddings ..."))
    embeddings = os.path.join(self.root_path, "examples", "wiki.dummy.vec")
    print("\t\tPath to local manual embeddings file: " + embeddings)
    from cube.api import Cube
    cube = Cube(verbose=True)
    cube.load('my_model', tokenization=True, compound_word_expanding=False,
              tagging=True, lemmatization=True, parsing=True,
              local_embeddings_file=embeddings)
    text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
    sentences = cube(text)
    self.assertTrue(len(sentences) > 0)
    self.assertTrue(len(sentences[0]) > 0)
def write_core_format_into_conllup_file(sentences, filepath):
    # The original message left its "{}" placeholder unfilled; format it
    # with the sentence count.
    print("Converting {} sentences into CONLLUP format. This requires a text "
          "preprocessor for Romanian. If the following function fails, please "
          "install NLP-Cube (pip3 install nlpcube).".format(len(sentences)))
    from cube.api import Cube
    cube = Cube(verbose=True)
    cube.load("ro", tokenization=True, compound_word_expanding=False,
              tagging=True, lemmatization=True, parsing=True)
    cube_no_tok = Cube(verbose=True)
    cube_no_tok.load("ro", tokenization=False, compound_word_expanding=False,
                     tagging=True, lemmatization=True, parsing=True)
    conllupdataset = []
    for sentence in sentences:
        sentence = process_split_exceptions(sentence)
        conllupsentence = _conllup_to_core_sentence(sentence, cube, cube_no_tok)
        conllupdataset.append(conllupsentence)
    write_file(filepath, conllupdataset)
def prepare_dialogs_sorted_by_lang(dialog_ids, dialog_path, prepared_path,
                                   start_date, end_date, additional_options=""):
    dialog_ids_sorted_by_lang = {"ua": [], "ru": [], "en": []}
    if dialog_ids[0] == -1:
        for filename in os.listdir(dialog_path):
            data = pd.read_csv(f"{dialog_path}/{filename}")
            lang = detect_data_language(data)
            dialog_ids_sorted_by_lang[lang].append(filename[:-4])
    else:
        for dialog in dialog_ids:
            data = pd.read_csv(f"{dialog_path}/{dialog}.csv")
            lang = detect_data_language(data)
            dialog_ids_sorted_by_lang[lang].append(dialog)
    print("dialog_ids_sorted_by_lang")
    pprint(dialog_ids_sorted_by_lang)
    n_all_dialogs = sum(len(ids) for ids in dialog_ids_sorted_by_lang.values())
    n_dialog = 0
    for lang in dialog_ids_sorted_by_lang:
        if not dialog_ids_sorted_by_lang[lang]:
            continue
        # Only Ukrainian and English have NLP-Cube models here; for Russian
        # cube stays an empty string.
        cube = ""
        if lang == "ua":
            cube = Cube(verbose=True)
            cube.load("uk")
        elif lang == "en":
            cube = Cube(verbose=True)
            cube.load("en")
        for dialog_id in dialog_ids_sorted_by_lang[lang]:
            if f"{dialog_id}.csv" in os.listdir(prepared_path):
                print(f"=========WARNING: {dialog_id}.csv already in {prepared_path}")
                n_dialog += 1
                continue
            n_dialog += 1
            print(f"\n=======Language {lang}, dialog_id {dialog_id} -- {n_dialog} of {n_all_dialogs}=======")
            prepare_dialogs(lang, cube, dialog_id, prepared_path, dialog_path,
                            start_date, end_date, "words_frequency",
                            additional_options)
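# prepare_dialogs_sorted_by_lang() above relies on detect_data_language(),
# which is not shown. A minimal sketch of what it might look like, assuming
# the langdetect package and a "message" text column (both assumptions, not
# from the source). Note langdetect returns "uk" for Ukrainian, which is
# mapped here to the "ua" key used above.
from langdetect import detect

def detect_data_language(data):
    # Concatenate a sample of messages and detect the dominant language.
    sample = " ".join(data["message"].astype(str).head(50))
    code = detect(sample)
    return {"uk": "ua", "ru": "ru", "en": "en"}.get(code, "en")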
def create_conll_sentences(file_path):
    print("*" * 25
          + " Working on transforming the input file '{}' to CoNLL format ".format(file_path)
          + "*" * 25 + "\n")
    print("Reading the input file...")
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
    print("Loading the 'ro' nlp-cube model...")
    cube = Cube(verbose=False)
    cube.load("ro")
    print("Creating the CoNLL sentences...")
    sentences = cube(text)
    print("\n" + "*" * 124 + "\n")
    return sentences
def get_lemmatized_vocabulary(unlemmatized_voc, epi, lang):
    # Return the cached result if it exists.
    if os.path.isfile('lemmatized_' + lang + '.p'):
        with open('lemmatized_' + lang + '.p', 'rb') as f:
            return p.load(f)
    # Lemmatizer: map language names to NLP-Cube model codes.
    lang_acron = {
        'gothic': 'got', 'latin': 'la', 'italian': 'it',
        'german': 'de', 'greek': 'grc', 'english': 'eng'
    }
    cube = Cube(verbose=False)
    cube.load(lang_acron[lang])
    voc = {}
    keys = list(unlemmatized_voc.keys())
    for i in tqdm(range(len(keys))):
        w = keys[i]
        sents = cube(w)
        # cube() may return a list of sentences or a single sentence.
        if type(sents[0]) == list:
            for sent in sents:
                for token in sent:
                    if token.lemma != '_':
                        voc[w] = [token.lemma, epi.transliterate(token.lemma), 'L']
                    else:
                        voc[w] = [token.word, epi.transliterate(token.word), 'T']
        else:
            for token in sents:
                if token.lemma != '_':
                    voc[w] = [token.lemma, epi.transliterate(token.lemma), 'L']
                else:
                    voc[w] = [token.word, epi.transliterate(token.word), 'T']
    with open('lemmatized_' + lang + '.p', 'wb') as f:
        p.dump(voc, f)
    return voc
def download_model(self):
    if self.lib.lower() == "stanford":
        print("-----------You are going to use Stanford library-----------")
        if self.lang.lower() == "basque":
            print("-------------You are going to use Basque model-------------")
            # MODELS_DIR = '/home/edercarbajo/eu'
            MODELS_DIR = r'J:\TextSimilarity\eu'  # raw string: avoids backslash escapes
            stanfordnlp.download('eu', MODELS_DIR)  # download the Basque models
            config = {
                # Comma-separated list of processors to use
                'processors': 'tokenize,pos,lemma,depparse',
                # Language code for the language to build the Pipeline in
                'lang': 'eu',
                # Processor-specific arguments are set with keys
                # "{processor_name}_{argument_name}"
                'tokenize_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tokenizer.pt',
                'pos_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tagger.pt',
                'pos_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt',
                'lemma_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_lemmatizer.pt',
                'depparse_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_parser.pt',
                'depparse_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt'
            }
            self.parser = stanfordnlp.Pipeline(**config)
        else:
            print("............Working...........")
    elif self.lib.lower() == "cube":
        print("-----------You are going to use Cube Library-----------")
        if self.lang.lower() == "basque":
            cube = Cube(verbose=True)
            cube.load("eu", "latest")
            self.parser = cube
        else:
            print("............Working...........")
    else:
        print("You cannot use this library. Please enter a valid library (Cube or Stanford).")
def start():
    from analyzer import Analyzer
    p = ArgumentParser(description="python3 ./main.py -f \"laginak/*.doc.txt\"")
    optional = p._action_groups.pop()
    required = p.add_argument_group('Required arguments')
    required.add_argument("-f", "--files", nargs='+',
                          help="Files to analyze (in .txt, .odt, .doc or .docx format)")
    optional.add_argument('-a', '--all', action='store_true',
                          help="Generate a CSV file with all the results")
    optional.add_argument('-s', '--similarity', action='store_true',
                          help="Calculate similarity (max. 5 files)")
    p._action_groups.append(optional)
    opts = p.parse_args()
    FileLoader.load_files(opts.files)
    FileLoader.load_irregular_verbs_list()
    FileLoader.load_dale_chall_list()
    FileLoader.load_connectives_list()
    FileLoader.load_oxford_word_list()
    cube = Cube(verbose=True)  # load the Cube model
    cube.load("en", "latest")
    df_row = None
    # Files will be created in this folder.
    path = Printer.create_directory(FileLoader.files[0])
    file_num = 0
    total = len(FileLoader.files)
    for input in FileLoader.files:
        texto = Analyzer.process_text(input=input)  # analyze the text
        a = Analyzer(texto, input, cube)
        i = a.analyze(opts.similarity)
        df = a.create_dataframe()
        prediction = a.predict_dificulty(df)
        file_num += 1
        printer = Printer(input, i)  # renamed from "p" to avoid shadowing the parser
        printer.print_info(opts.similarity, prediction, file_num, total)
        if opts.all:
            df_row = printer.write_in_full_csv(df_row, opts.similarity)
        printer.generate_csv(path, prediction, opts.similarity)
    if opts.all:
        df_row.to_csv(os.path.join(path, "full_results_aztertest.csv"),
                      encoding='utf-8', index=False)
def __init__(self, iso, use_gpu=False):
    from pos_tagger.engines.cubenlp_pos.CubeNLPPOS import DSupportedISOs
    assert iso in DSupportedISOs
    # Note the GPU argument - it may be worth putting this
    # on the PC with a more powerful one.
    from cube.api import Cube
    cube_inst = self.cube_inst = Cube(verbose=True, use_gpu=use_gpu)
    # Chinese doesn't seem to work well with version 1.1, despite the
    # scores not showing much difference, so pin it to 1.0.
    cube_inst.load(DSupportedISOs[iso],
                   version='1.0' if iso == 'zh' else '1.1')
    EngineInstance.__init__(self, iso, use_gpu)
class CubeNLP(TeproApi):
    """By Tibi Boroș & co.; does sentence splitting, tokenization,
    POS tagging, lemmatization and dependency parsing for Romanian."""

    def __init__(self):
        super().__init__()
        self._algoName = TeproAlgo.algoCube

    @staticmethod
    def sgml2unicode(word: str) -> str:
        # Replace the legacy SGML entity notation used in tbl.wordform.ro
        # with proper Unicode Romanian diacritics (comma-below forms).
        word = word.replace("&abreve;", "ă")
        word = word.replace("&Abreve;", "Ă")
        word = word.replace("&acirc;", "â")
        word = word.replace("&Acirc;", "Â")
        word = word.replace("&icirc;", "î")
        word = word.replace("&Icirc;", "Î")
        word = word.replace("&scedil;", "ș")
        word = word.replace("&Scedil;", "Ș")
        word = word.replace("&tcedil;", "ț")
        word = word.replace("&Tcedil;", "Ț")
        return word

    @staticmethod
    def _readMSDMappings():
        m2c = {}
        with open(CTAG2MSDMAPFILE, mode="r") as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) == 2:
                    msd, ctg = parts[0], parts[1]
                    m2c[msd] = ctg
        return m2c

    @staticmethod
    def _readTblWordForm():
        tbl = {}
        counter = 0
        with open(TBLWORDFORMFILE, mode="r", encoding="utf-8") as f:
            for line in f:
                counter += 1
                if counter % 100000 == 0:
                    print("{0}.{1}[{2}]: loading tbl.wordform.ro, at line {3}".format(
                              Path(inspect.stack()[0].filename).stem,
                              inspect.stack()[0].function,
                              inspect.stack()[0].lineno,
                              counter),
                          file=sys.stderr, flush=True)
                line = line.strip()
                if line.startswith("#"):
                    continue
                parts = line.split()
                if len(parts) == 3:
                    word = CubeNLP.sgml2unicode(parts[0])
                    lemma = CubeNLP.sgml2unicode(parts[1])
                    if lemma == '=':
                        lemma = word
                    msd = parts[2]
                    if word not in tbl:
                        tbl[word] = {}
                    if msd not in tbl[word]:
                        tbl[word][msd] = []
                    tbl[word][msd].append(lemma)
        return tbl

    def createApp(self):
        self._cubeInst = Cube(verbose=True)

    def loadResources(self):
        self._cubeInst.load('ro', tokenization=True,
                            compound_word_expanding=False,
                            tagging=True, lemmatization=True, parsing=True)
        self._tblwordform = CubeNLP._readTblWordForm()
        self._msd2ctag = CubeNLP._readMSDMappings()

    def _runApp(self, dto, opNotDone):
        text = dto.getText()
        sentences = self._cubeInst(text)
        sid = 0
        for sent in sentences:
            # Teprolin tokenized sentence
            ttsent = []
            # Teprolin string sentence
            tssent = ""
            for tok in sent:
                tt = TeproTok()
                tt.setId(tok.index)
                tt.setWordForm(tok.word)
                lowerWord = tok.word.lower()
                tt.setMSD(tok.xpos)
                # Assign the mapped CTAG to the disambiguated MSD.
                if tok.xpos in self._msd2ctag:
                    tt.setCTAG(self._msd2ctag[tok.xpos])
                else:
                    tt.setCTAG(tok.xpos)
                lemmaIsSet = False
                # Do lexicon lemmatization, if possible.
                if tok.word in self._tblwordform:
                    if tok.xpos in self._tblwordform[tok.word] and \
                            len(self._tblwordform[tok.word][tok.xpos]) == 1:
                        # TODO: if lemma is ambiguous, e.g. 'copii' can be 'copil' or 'copie'
                        tt.setLemma(self._tblwordform[tok.word][tok.xpos][0])
                        lemmaIsSet = True
                elif lowerWord in self._tblwordform and \
                        tok.xpos in self._tblwordform[lowerWord] and \
                        len(self._tblwordform[lowerWord][tok.xpos]) == 1:
                    tt.setLemma(self._tblwordform[lowerWord][tok.xpos][0])
                    lemmaIsSet = True
                if not lemmaIsSet:
                    tt.setLemma(tok.lemma)
                tt.setHead(tok.head)
                tt.setDepRel(tok.label)
                tssent += tok.word
                if tok.space_after != "SpaceAfter=No":
                    tssent += " "
                ttsent.append(tt)
            # End of ttsent/tssent formation.
            if not dto.isOpPerformed(TeproAlgo.getSentenceSplittingOperName()):
                dto.addSentenceString(tssent)
                dto.addSentenceTokens(ttsent)
            else:
                # Check and update annotations that only NLP-Cube can produce
                # or that are requested specifically from it.
                alignment = dto.alignSentences(ttsent, sid)
                for op in opNotDone:
                    dto.copyTokenAnnotation(ttsent, sid, alignment, op)
            sid += 1
        return dto
def map_bibles(f1, f2s, voc, l1='gothic', cube1=True, cube2=False,
               lemmatizer={}, expand_voc=False):
    f1_dict = load_bible(open(f1, 'r'))
    lemma_l1 = False if cube1 == False else True
    count = 0
    mapped = {}
    all_words_count = 0
    found_words = {}
    unfound_words = {}
    for book in f1_dict:
        if book not in mapped:
            mapped[book] = {}
        for chapter in f1_dict[book]:
            if chapter not in mapped[book]:
                mapped[book][chapter] = {}
            for verse in f1_dict[book][chapter]:
                if verse not in mapped[book][chapter]:
                    mapped[book][chapter][verse] = {}
                count += 1
                if count % 500 == 0:
                    print(count)
                mapped[book][chapter][verse][l1] = f1_dict[book][chapter][verse]
                if lemma_l1:
                    mapped[book][chapter][verse][l1 + '_analyzed'] = analyze(
                        f1_dict[book][chapter][verse], cube1, lang='got')
                else:
                    mapped[book][chapter][verse][l1 + '_analyzed'] = fake_analyze(
                        f1_dict[book][chapter][verse], lang='got',
                        lemmatizer=lemmatizer)
                if l1 == 'got':
                    mapped[book][chapter][verse][l1 + '_translation'] = {}
                    lemmas = list(list(zip(
                        *mapped[book][chapter][verse][l1 + '_analyzed']))[2])
                    for lang in all_languages:
                        if lang not in found_words:
                            found_words[lang] = {}
                        if lang not in unfound_words:
                            unfound_words[lang] = {}
                        if lang == 'Got':
                            mapped[book][chapter][verse][l1 + '_translation'][lang] = lemmas
                            mapped[book][chapter][verse][l1 + '_translation'][lang + '_script'] = [
                                gothic_script_transformer(t) for t in lemmas]
                            mapped[book][chapter][verse][l1 + '_translation'][lang + '_ipa'] = [
                                ipa_transformer(t, 'gothic') for t in lemmas]
                        else:
                            if lang not in mapped[book][chapter][verse][l1 + '_translation']:
                                mapped[book][chapter][verse][l1 + '_translation'][lang] = []
                            for word in lemmas:
                                all_words_count += 1
                                if word in voc and lang in voc[word]:
                                    if word not in found_words[lang]:
                                        found_words[lang][word] = 0
                                    found_words[lang][word] += 1
                                    mapped[book][chapter][verse][l1 + '_translation'][lang].append(
                                        voc[word][lang])
                                else:
                                    if word not in unfound_words[lang]:
                                        unfound_words[lang][word] = 0
                                    unfound_words[lang][word] += 1
                                    mapped[book][chapter][verse][l1 + '_translation'][lang].append([])
    for lang in found_words:
        found = sum(found_words[lang][w] for w in found_words[lang])
        unfound = sum(unfound_words[lang][w] for w in unfound_words[lang])
        print('token', lang, found, unfound,
              '%.4f' % (float(found) / (found + unfound + 1)))
        found = len(found_words[lang])
        unfound = len(unfound_words[lang])
        print('\ttype', lang, found, unfound,
              '%.4f' % (float(found) / (found + unfound + 1)))
    print('All words: {}'.format(all_words_count))
    for f in f2s:
        l2 = f.split('/')[0]
        langs_epitran = {
            'german': 'deu-Latn', 'italian': 'ita-Latn', 'latin': 'ita-Latn',
            'spanish': 'spa-Latn', 'english': 'eng-Latn'
        }
        if l2 in langs_epitran:
            epi = epitran.Epitran(langs_epitran[l2])
        if cube2:
            lemma_l2 = lemma_l1
            cube2 = Cube(verbose=False)
            cube2.load(lang_acron[l2])
        f2_dict = load_bible(open(f, 'r'))
        lemma_l2 = False if cube1 == False else True
        for book in mapped:
            for chapter in mapped[book]:
                for verse in mapped[book][chapter]:
                    if book not in f2_dict or chapter not in f2_dict[book] \
                            or verse not in f2_dict[book][chapter]:
                        continue
                    if f2_dict[book][chapter][verse] in ['', '[]', []]:
                        continue
                    if lemma_l2:
                        mapped[book][chapter][verse][l2 + '_analyzed'] = analyze(
                            f2_dict[book][chapter][verse], cube2, lang=l2)
                    else:
                        mapped[book][chapter][verse][l2 + '_analyzed'] = fake_analyze(
                            f2_dict[book][chapter][verse], lang=l2)
                    # IPA
                    lemmas = [x for x in list(zip(
                        *mapped[book][chapter][verse][l2 + '_analyzed']))[2]]
                    mapped[book][chapter][verse][l2] = lemmas
                    if l2 == 'greek':
                        mapped[book][chapter][verse][l2 + '_ipa'] = [
                            ipa_transformer(l, 'greek') for l in lemmas]
                    elif l2 == 'old_english':
                        try:
                            mapped[book][chapter][verse][l2 + '_ipa'] = [
                                oe(l) for l in lemmas]
                        except:
                            pass
                    elif l2 == 'english':
                        try:
                            mapped[book][chapter][verse][l2 + '_ipa'] = [
                                oe(l) for l in lemmas]
                        except:
                            pass
                    else:
                        mapped[book][chapter][verse][l2 + '_ipa'] = [
                            epi.transliterate(l) for l in lemmas]
                    # Note: expand_voc is force-disabled here, so the
                    # expand_voc parameter currently has no effect.
                    expand_voc = False
                    if l2 != 'Got' and expand_voc:
                        expand_voc_by_distance(voc, mapped, book, chapter, verse, l2)
    return mapped
# For parsing files from the DB / books.
import os
import json
import fnmatch
from input_parser import input_parser
from nltk import tokenize
from textwrap import wrap
from cube.api import Cube

cube = Cube(verbose=True)
cube.load('ro')

booksDir = os.path.abspath(
    os.path.realpath(os.path.join(os.path.dirname(__file__), '../../DB')))
dataDir = os.path.abspath(
    os.path.realpath(os.path.join(os.path.dirname(__file__), '../data')))

books = []
for filename in os.listdir(booksDir):
    if filename.endswith('.txt') and fnmatch.fnmatch(filename, "2???_a_*"):
        books.append(filename)
        if len(books) > 100:
            break

for filename in books:
    book_content = open(os.path.join(booksDir, filename), encoding="utf-8").read()
    if os.path.exists(
def cli(ctx, language):
    cube = Cube(verbose=True)
    cube.load(language)
    ctx.ensure_object(dict)
    ctx.obj['CUBE'] = cube
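# cli() above stores the loaded pipeline on the click context. A minimal
# sketch of a subcommand consuming it, assuming cli is registered as a click
# group with @click.group() and @click.pass_context (the 'lemmas' command is
# hypothetical, not from the source):
import click

@cli.command()
@click.argument('text')
@click.pass_context
def lemmas(ctx, text):
    cube = ctx.obj['CUBE']  # the pipeline loaded by cli()
    for sentence in cube(text):
        for entry in sentence:
            click.echo(f"{entry.word}\t{entry.lemma}")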
from cube.api import Cube

cube = Cube(verbose=True)
cube.load('ro',
          tokenization=True,
          compound_word_expanding=False,
          tagging=True,
          lemmatization=True,
          parsing=True)
def download_model(self):
    if self.lib.lower() == "stanford":
        print("-----------You are going to use Stanford library-----------")
        if self.lang.lower() == "basque":
            print("-------------You are going to use Basque model-------------")
            # MODELS_DIR = '/home/kepa/eu'
            MODELS_DIR = r'J:\TextSimilarity\eu'
            stanfordnlp.download('eu', MODELS_DIR)  # download the Basque models
        elif self.lang.lower() == "english":
            print("-------------You are going to use English model-------------")
            MODELS_DIR = '/home/kepa/en'
            print("-------------Downloading Stanford English model-------------")
            stanfordnlp.download('en', MODELS_DIR)  # download the English models
        elif self.lang.lower() == "spanish":
            print("-------------You are going to use Spanish model-------------")
            MODELS_DIR = '/home/kepa/es'
            stanfordnlp.download('es', MODELS_DIR)  # download the Spanish models
        else:
            print("........You cannot use this language...........")
    elif self.lib.lower() == "cube":
        print("-----------You are going to use Cube Library-----------")
        if self.lang.lower() == "basque":
            cube = Cube(verbose=True)
            cube.load("eu", "latest")
        elif self.lang.lower() == "english":
            cube = Cube(verbose=True)
            cube.load("en", "latest")
        elif self.lang.lower() == "spanish":
            cube = Cube(verbose=True)
            cube.load("es", "latest")
        else:
            print("........You cannot use this language...........")
    else:
        print("You cannot use this library. Please enter a valid library (Cube or Stanford).")
import os
import csv
import sys
import json
import datetime

import luigi

# filter
from filter.filter import filter_spam

# lemmatizer
from lemmatizer.lemmatizer import lemmatize
from cube.api import Cube

# detection
sys.path.insert(0, './detector/')
from detect_events import main as detect_events

lemmatizer = Cube(verbose=True)
lemmatizer.load("es", tokenization=False, parsing=False)


class Streamer(luigi.ExternalTask):
    time_slice = luigi.parameter.DateMinuteParameter(interval=30)

    def output(self):
        fname = '../data/streaming/{}.csv'.format(self.time_slice)
        return luigi.LocalTarget(fname)


class Preprocess(luigi.Task):
    time_slice = luigi.parameter.DateMinuteParameter(
        interval=30, default=datetime.datetime.today())
parser.add_option('--port', action='store', dest='port', type='int',
                  default=8080, help='Binding port for web service (default: 8080)')
parser.add_option('--host', action='store', dest='host', default='0.0.0.0',
                  help='Binding IP for server (default: 0.0.0.0)')
parser.add_option('--lang', action='append', dest='languages', default=[],
                  help='Preload language. You can use this param multiple times: '
                       '--lang en --lang fr ... (default is just ["en"])')

(params, _) = parser.parse_args(sys.argv)

if len(params.languages) == 0:
    params.languages = ['en']

for lang in params.languages:
    lang2cube[lang] = Cube(verbose=True)
    lang2cube[lang].load(lang)

app.run(port=params.port, host=params.host)
from cube.api import Cube cube = Cube(verbose=True) cube.load("en", local_models_repository="/mnt/d/nlpcube/") text = "One potential microRNA that regulates Bcan is miR-9 and overexpression of miR-9 can partly rescue the effects of Dicer1 deletion on the MG phenotype." sentences = cube(text) for sentence in sentences: for entry in sentence: print( str(entry.index) + "\t" + entry.word + "\t" + entry.lemma + "\t" + entry.upos + "\t" + entry.xpos + "\t" + entry.attrs + "\t" + str(entry.head) + "\t" + str(entry.label) + "\t" + entry.space_after) print("")
import http.server
import json
from typing import List, Dict
from urllib.parse import urlparse, parse_qs

from abbrev import full_to_abbrev
from conllu_msd_to_monomial import MSD_dict
from msd_convert import UPOS_to_MSD, MSD_to_attribs

hostName = "localhost"
serverPort = 8080
QUERY = 'q'

from cube.api import Cube

ro_cube = Cube(verbose=True)  # initialize it
ro_cube.load("ro")  # select the desired language (it will auto-download the model on first run)


class POSTagRequestHandler(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        parse = urlparse(self.path)
        query = parse_qs(parse.query)
        query = {k: ' '.join(query[k]) for k in query.keys()}
        self.send_response(200)
        self.send_header("Content-type", "text/html")
        self.end_headers()
        self.wfile.write(bytes(self._process_input(query[QUERY]), "utf-8"))
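    # do_GET() above calls self._process_input(), which is not shown in this
    # snippet. A minimal sketch of such a method, assuming it simply runs the
    # query text through ro_cube and returns one "word<TAB>lemma<TAB>UPOS"
    # line per token (the output format is an assumption, not from the source):
    def _process_input(self, text: str) -> str:
        lines = []
        for sentence in ro_cube(text):
            for entry in sentence:
                lines.append(f"{entry.word}\t{entry.lemma}\t{entry.upos}")
            lines.append("")  # blank line between sentences
        return "\n".join(lines)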
def load_model(self):
    if self.lib.lower() == "stanford":
        print("-----------You are going to use Stanford library-----------")
        if self.lang.lower() == "basque":
            print("-------------You are going to use Basque model-------------")
            config = {
                # Comma-separated list of processors to use
                'processors': 'tokenize,pos,lemma,depparse',
                # Language code for the language to build the Pipeline in
                'lang': 'eu',
                # Processor-specific arguments are set with keys
                # "{processor_name}_{argument_name}"
                'tokenize_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tokenizer.pt',
                'pos_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tagger.pt',
                'pos_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt',
                'lemma_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_lemmatizer.pt',
                'depparse_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_parser.pt',
                'depparse_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt'
            }
            self.parser = stanfordnlp.Pipeline(**config)
        elif self.lang.lower() == "english":
            print("-------------You are going to use English model-------------")
            config = {
                'processors': 'tokenize,mwt,pos,lemma,depparse',
                'lang': 'en',
                'tokenize_model_path': '/home/kepa/en/en_ewt_models/en_ewt_tokenizer.pt',
                # 'mwt_model_path': './fr_gsd_models/fr_gsd_mwt_expander.pt',
                'pos_model_path': '/home/kepa/en/en_ewt_models/en_ewt_tagger.pt',
                'pos_pretrain_path': '/home/kepa/en/en_ewt_models/en_ewt.pretrain.pt',
                'lemma_model_path': '/home/kepa/en/en_ewt_models/en_ewt_lemmatizer.pt',
                'depparse_model_path': '/home/kepa/en/en_ewt_models/en_ewt_parser.pt',
                'depparse_pretrain_path': '/home/kepa/en/en_ewt_models/en_ewt.pretrain.pt'
            }
            self.parser = stanfordnlp.Pipeline(**config)
        elif self.lang.lower() == "spanish":
            print("-------------You are going to use Spanish model-------------")
            config = {
                'processors': 'tokenize,pos,lemma,depparse',
                'lang': 'es',
                'tokenize_model_path': '/home/kepa/es/es_ancora_models/es_ancora_tokenizer.pt',
                'pos_model_path': '/home/kepa/es/es_ancora_models/es_ancora_tagger.pt',
                'pos_pretrain_path': '/home/kepa/es/es_ancora_models/es_ancora.pretrain.pt',
                'lemma_model_path': '/home/kepa/es/es_ancora_models/es_ancora_lemmatizer.pt',
                'depparse_model_path': '/home/kepa/es/es_ancora_models/es_ancora_parser.pt',
                'depparse_pretrain_path': '/home/kepa/es/es_ancora_models/es_ancora.pretrain.pt'
            }
            self.parser = stanfordnlp.Pipeline(**config)
        else:
            print("........You cannot use this language...........")
    elif self.lib.lower() == "cube":
        print("-----------You are going to use Cube Library-----------")
        # Cube.load signature:
        # load(self, language_code, version="latest", local_models_repository=None,
        #      local_embeddings_file=None, tokenization=True,
        #      compound_word_expanding=False, tagging=True,
        #      lemmatization=True, parsing=True)
        # Example: load("es", tokenization=False, parsing=False)
        if self.lang.lower() == "basque":
            cube = Cube(verbose=True)
            cube.load("eu", "latest")
        elif self.lang.lower() == "english":
            cube = Cube(verbose=True)
            cube.load("en", "latest")
        elif self.lang.lower() == "spanish":
            cube = Cube(verbose=True)
            cube.load("es", "latest")
        else:
            print("........You cannot use this language...........")
    else:
        print("You cannot use this library. Please enter a valid library (Cube or Stanford).")
import pandas as pd
import os
import sys
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import xml.etree.ElementTree as ET
import numpy as np
import nltk
import string
from nltk.stem import PorterStemmer, SnowballStemmer
from cube.api import Cube

cube = Cube(verbose=True)
cube.load("ro")

# Load the dataset.
df = pd.read_csv('pcgarage.csv', delimiter='\t', encoding='utf-16', header=0)

# Preprocessing.
# Remove unwanted leading characters.
df['pro'] = df['pro'].str[4:]
df['contra'] = df['contra'].str[8:]
df['altele'] = df['altele'].str[8:]

# Concatenate the parts.
df["corpus"] = (df["pro"].astype(str) + " " + df["contra"].astype(str)
                + " " + df["altele"].astype(str))
data = df[['product', 'rating', 'corpus']].copy()
data['corpus'] = [it.lower().replace('\n\n', ' ') for it in data['corpus']]

# Convert to lowercase.
data['corpus'] = data.corpus.map(lambda x: x.lower())

# Tokenization.
print("Found {} online models".format(len(online_models))) local_models = model_store_object.list_local_models() print("Found {} local models".format(len(local_models))) model_count = len(online_models) # step 1. download all models for online_model in online_models: model, version = online_model[0], online_model[1] if not online_model in local_models: print("Downloading {}-{}".format(model, version)) else: print("Model {}-{} is already downloaded.".format(model, version)) continue cube = Cube() cube.load(model, version, local_models_repository=local_model_path) #cube.load(model) print("\n\n") #for online_model in local_models: #local_models+online_models: for online_model in local_models + online_models: model, version = online_model[0], online_model[1] print("\n\nTesting model {}-{}, @{}".format(model, version, datetime.today())) if model == "pl": continue # go run Cube print("\t Reading metadata ...") metadata = ModelMetadata()
####################################################################################
if recompute_histograms or not os.path.isfile(
        histogram_picklefile_gf % (category, FREQUENCY_THRESH)):
    if category == "All":
        D_coords_fixated, D_histogram, D_entropy, D_entropy_df = \
            merge_histograms(histogram_picklefile_gf)
        imnames = list(set(D_coords_fixated.keys()))
        pickle.dump([D_coords_fixated, D_histogram, D_entropy, D_entropy_df],
                    open(histogram_picklefile_gf % (category, FREQUENCY_THRESH), "wb"))
    else:
        D_coords_fixated = get_raw_data()
        imnames = list(set(D_coords_fixated.keys()))
        cube = Cube(verbose=True)
        cube.load('en')
        start = time.time()
        D_histogram, D_entropy, D_entropy_df = compute_histograms(
            D_coords_fixated, imnames, category,
            is_grouping=True, fre_threshold=FREQUENCY_THRESH)
        print(time.time() - start)
        pickle.dump([D_coords_fixated, D_histogram, D_entropy, D_entropy_df],
                    open(histogram_picklefile_gf % (category, FREQUENCY_THRESH), "wb"))
    print(histogram_picklefile_gf % (category, FREQUENCY_THRESH))
else:
# Use NLP-Cube (https://github.com/adobe/NLP-Cube) for sentence and word
# tokenization, and for lemmas.
from cube.api import Cube  # import the Cube object

cube = Cube(verbose=False)  # initialize it
cube.load("en")

from rusenttokenize import ru_sent_tokenize
import re


class mytokenizer:
    def bracket_mask(self, text):
        # Temporarily mask brackets and '+' so they survive the
        # substitutions below (raw strings avoid escape warnings).
        text_temp = re.sub(r"\[", "<<", text)
        text_temp = re.sub(r"\]", ">>", text_temp)
        text_temp = re.sub(r"\+", "===", text_temp)
        parenthesis = re.findall(r"\([^()]*\)", text_temp)
        bracket = re.findall(r"\[[^\[\]]*\]", text)
        if not parenthesis and not bracket:
            return text
        for p in parenthesis:
            p = re.sub(r"[\(\)]", "", p)
            # Split the parenthesized span into sentences and rejoin with ";".
            parts = [re.sub(r"\.\s*$", "", s) for s in ru_sent_tokenize(p)]
            new_p = " ; ".join(parts)
            text_temp = re.sub(p, new_p, text_temp)
        text = re.sub("<<", "[", text_temp)
        text = re.sub(">>", "]", text)
    for LSentence in LSentences:
        for entry in LSentence:
            print(entry)
        print("")


if __name__ == '__main__':
    if True:
        found_nl = False
        for iso in CubeNLPPOS.get_L_supported_isos(None):
            if iso not in ('nno', 'nnb'):
                continue
            from cube.api import Cube  # import the Cube object
            cube = Cube(verbose=True)  # initialize it
            cube.load(DSupportedISOs[iso])
            print_pos('id', 'Tahap pertama konflik ini dapat disebut "Perang Kemerdekaan Belanda".')
            print_pos('en', 'The first phase of the conflict can be considered the Dutch War of Independence.')
            print_pos('id', 'Saya tidak dapat memakan ini.')
            print_pos('en', 'I can\'t eat this.')
            print_pos('zh', '猴子高兴,实验人员也高兴。')
import tnkeeh as tn
from farasa.segmenter import FarasaSegmenter

# The snippet uses MosesTokenizer/MosesPunctNormalizer and VnCoreNLP without
# showing their imports; presumably from sacremoses and vncorenlp.
from sacremoses import MosesTokenizer, MosesPunctNormalizer
from vncorenlp import VnCoreNLP

# For Bulgarian and Turkish.
from cube.api import Cube

en_tok = MosesTokenizer(lang="en")
en_normalizer = MosesPunctNormalizer()

# TODO: change hardcoding of the jar file to an arg from the CLI.
rdrsegmenter = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar",
                         annotators="wseg", max_heap_size="-Xmx500m")
ar_segmenter = FarasaSegmenter()

bg_cube = Cube(verbose=False)
bg_cube.load("bg")
tr_cube = Cube(verbose=False)
tr_cube.load("tr")


def clean_ar_text(
    text,
    segment=False,
    remove_special_chars=False,
    remove_english=False,
    normalize=False,
    remove_diacritics=False,
    excluded_chars=[],
    remove_tatweel=False,