Example #1
def write_core_format_into_conllup_file(sentences, filepath):
    print(
        "Converting {} sentences into CONLLUP format. This requires a text preprocessor for Romanian. If the following function fails, please install NLP-Cube (pip3 install nlpcube)."
        .format(len(sentences)))

    from cube.api import Cube
    cube = Cube(verbose=True)
    cube.load("ro",
              tokenization=True,
              compound_word_expanding=False,
              tagging=True,
              lemmatization=True,
              parsing=True)
    cube_no_tok = Cube(verbose=True)
    cube_no_tok.load("ro",
                     tokenization=False,
                     compound_word_expanding=False,
                     tagging=True,
                     lemmatization=True,
                     parsing=True)

    conllupdataset = []
    for sentence in sentences:
        sentence = process_split_exceptions(sentence)
        conllupsentence = _conllup_to_core_sentence(sentence, cube,
                                                    cube_no_tok)
        conllupdataset.append(conllupsentence)

    write_file(filepath, conllupdataset)
Example #2
def main(filename):
    cube = Cube(verbose=True)
    cube.load('en')

    with open('words.txt') as f:
        word_list = [line.rstrip() for line in f.readlines()]
        word_set = set(word_list)

    with open('my_words.txt') as f:
        my_word_list = [line.rstrip() for line in f.readlines()]
        my_word_set = set(my_word_list)

    text = srt_to_text(filename)
    sentences = cube(text)
    new_words = []
    for sentence in sentences:
        for entry in sentence:
            if entry.lemma in word_set and entry.lemma not in my_word_set:
                if entry.lemma not in new_words:
                    new_words.append(entry.lemma)
    print('-' * 100)
    print(f'{len(new_words)} new words were found.')
    print('-' * 100)
    for i, word in enumerate(new_words):
        print(i, word)
Example #3
 def test_4_3_run_model_with_default_external_embeddings(self):  
     print("\n\33[33m{}\33[0m".format("4.3. Run a local model with default external embeddings ..."))                        
     from cube.api import Cube
     cube = Cube(verbose=True)
     cube.load('my_model', tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True)        
     text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
     sentences = cube(text)
     self.assertTrue(len(sentences)>0)
     self.assertTrue(len(sentences[0])>0)       
Example #4
 def test_2_run_a_local_model(self):  
     print("\n\33[33m{}\33[0m".format("2. Run a local model that does not have embeddings or metadata (running with dummy.vec embeddings) ..."))
     embeddings = os.path.join(self.root_path, "examples","wiki.dummy.vec")
     from cube.api import Cube
     cube = Cube(verbose=True)
     cube.load('my_model', tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True, local_models_repository=self.local_model_repo, local_embeddings_file=embeddings)        
     text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
     sentences = cube(text)
     self.assertTrue(len(sentences)>0)
     self.assertTrue(len(sentences[0])>0)   
Example #5
 def test_1_2_download_and_run_an_online_model_specific_version(self):                                    
     print("\n\33[33m{}\33[0m".format("1.2. Loading an online model (sme, 1.0) ..."))
     from cube.api import Cube
     cube = Cube(verbose=True)
     cube.load('sme', version='1.0', tokenization=True, compound_word_expanding=False, tagging=False, lemmatization=False, parsing=False)
     cube.metadata.info()
     text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
     sentences = cube(text)
     self.assertTrue(len(sentences)>0)
     self.assertTrue(len(sentences[0])>0)        
Example #6
def prepare_dialogs_sorted_by_lang(dialog_ids,
                                   dialog_path,
                                   prepared_path,
                                   start_date,
                                   end_date,
                                   additional_options=""):
    dialog_ids_sorted_by_lang = {"ua": [], "ru": [], "en": []}
    if dialog_ids[0] == -1:
        for filename in os.listdir(dialog_path):
            data = pd.read_csv(f"{dialog_path}/{filename}")
            lang = detect_data_language(data)
            dialog_ids_sorted_by_lang[lang].append(filename[:-4])

    else:
        for dialog in dialog_ids:
            data = pd.read_csv(f"{dialog_path}/{dialog}.csv")
            lang = detect_data_language(data)
            dialog_ids_sorted_by_lang[lang].append(dialog)

    print("dialog_ids_sorted_by_lang")
    pprint(dialog_ids_sorted_by_lang)

    n_all_dialogs = sum([
        len(dialog_ids_sorted_by_lang[lang])
        for lang in dialog_ids_sorted_by_lang.keys()
    ])
    n_dialog = 0
    for lang in dialog_ids_sorted_by_lang.keys():
        if not dialog_ids_sorted_by_lang[lang]:
            continue

        cube = ""
        if lang == "ua":
            cube = Cube(verbose=True)
            cube.load("uk")

        elif lang == "en":
            cube = Cube(verbose=True)
            cube.load("en")

        for dialog_id in dialog_ids_sorted_by_lang[lang]:
            if f"{dialog_id}.csv" in os.listdir(prepared_path):
                print(
                    f"=========WARNING: {dialog_id}.csv already in {prepared_path}"
                )
                n_dialog += 1
                continue

            n_dialog += 1
            print(
                f"\n=======Language {lang}, dialog_id {dialog_id}-- {n_dialog} from {n_all_dialogs}======="
            )
            prepare_dialogs(lang, cube, dialog_id, prepared_path, dialog_path,
                            start_date, end_date, "words_frequency",
                            additional_options)
Example #7
 def test_3_3_run_model_with_manual_embeddings(self):  
     print("\n\33[33m{}\33[0m".format("3.3. Run a local model with manual embeddings ..."))                
     embeddings = os.path.join(self.root_path, "examples","wiki.dummy.vec")
     print("\t\tPath to local manual embeddings file: "+embeddings)
     from cube.api import Cube
     cube = Cube(verbose=True)
     cube.load('my_model', tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True, local_embeddings_file=embeddings)        
     text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
     sentences = cube(text)
     self.assertTrue(len(sentences)>0)
     self.assertTrue(len(sentences[0])>0)   
Example #8
def create_conll_sentences(file_path):
    print("*"*25 + "  Working on transforming the input file '{}' to CoNLL format  ".format(file_path) + "*"*25 + "\n")

    print("Reading the input file...")
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    print("Loading the 'ro' nlp-cube model...")

    cube = Cube(verbose=False)
    cube.load("ro")

    print("Creating the CoNLL sentences...")
    sentences = cube(text)

    print("\n" + "*"*124 + "\n")

    return sentences
Example #9
def get_lemmatized_vocabulary(unlemmatized_voc, epi, lang):
    if os.path.isfile('lemmatized_' + lang + '.p'):
        voc = p.load(open('lemmatized_' + lang + '.p', 'rb'))
        return voc

    # Lemmatizer
    lang_acron = {
        'gothic': 'got',
        'latin': 'la',
        'italian': 'it',
        'german': 'de',
        'greek': 'grc',
        'english': 'en'
    }
    cube = Cube(verbose=False)
    cube.load(lang_acron[lang])

    voc = {}
    keys = list(unlemmatized_voc.keys())
    for i in tqdm(range(len(keys))):
        w = keys[i]
        sents = cube(w)
        if type(sents[0]) == list:
            for sent in sents:
                for token in sent:
                    if token.lemma != '_':
                        voc[w] = [
                            token.lemma,
                            epi.transliterate(token.lemma), 'L'
                        ]
                    else:
                        voc[w] = [
                            token.word,
                            epi.transliterate(token.word), 'T'
                        ]
        else:
            for token in sents:
                if token.lemma != '_':
                    voc[w] = [token.lemma, epi.transliterate(token.lemma), 'L']
                else:
                    voc[w] = [token.word, epi.transliterate(token.word), 'T']
    p.dump(voc, open('lemmatized_' + lang + '.p', 'wb'))
    return voc
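A hedged usage sketch (not from the original project; the vocabulary below is hypothetical): the function expects a dict keyed by surface forms, an epitran transliterator, and a language name that appears in lang_acron. Example #18 later in this collection pairs Latin with epitran's 'ita-Latn' code, so that pairing is reused here.

import epitran

# Hypothetical inputs, for illustration only.
unlemmatized_voc = {'rosa': 3, 'amare': 1}
epi = epitran.Epitran('ita-Latn')  # the code this collection uses for Latin (see Example #18)
voc = get_lemmatized_vocabulary(unlemmatized_voc, epi, 'latin')
print(voc)  # maps each surface form to [lemma, IPA transliteration, 'L' or 'T']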
Example #10
 def download_model(self):
     if self.lib.lower() == "stanford":
         print("-----------You are going to use Stanford library-----------")
         if self.lang.lower() == "basque":
             print("-------------You are going to use Basque model-------------")
             # MODELS_DIR = '/home/edercarbajo/eu'
             MODELS_DIR = r'J:\TextSimilarity\eu'
             stanfordnlp.download('eu', MODELS_DIR)  # Download the Basque models
             # config = {'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
             #           'lang': 'eu',  # Language code for the language to build the Pipeline in
             #           'tokenize_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_tokenizer.pt',
             #           # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
             #           'pos_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_tagger.pt',
             #           'pos_pretrain_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt.pretrain.pt',
             #           'lemma_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_lemmatizer.pt',
             #           'depparse_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_parser.pt',
             #           'depparse_pretrain_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt.pretrain.pt'
             #           }
             config = {'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
                       'lang': 'eu',  # Language code for the language to build the Pipeline in
                       'tokenize_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tokenizer.pt',
                       # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                       'pos_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tagger.pt',
                       'pos_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt',
                       'lemma_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_lemmatizer.pt',
                       'depparse_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_parser.pt',
                       'depparse_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt'
                       }
             self.parser = stanfordnlp.Pipeline(**config)
         else:
             print("............Working...........")
     elif self.lib.lower() == "cube":
         print("-----------You are going to use Cube Library-----------")
         if self.lang.lower() == "basque":
             cube = Cube(verbose=True)
             cube.load("eu", "latest")
             self.parser = cube
         else:
             print("............Working...........")
     else:
         print("You cannot use this library. Introduce a valid library (Cube or Stanford)")
Example #11
def start():
    from analyzer import Analyzer
    p = ArgumentParser(description="python3 ./main.py -f \"laginak/*.doc.txt\" ")
    optional = p._action_groups.pop()  # Edited this line
    required = p.add_argument_group('Required arguments')
    required.add_argument("-f", "--files", nargs='+', help="Files to analyze (in .txt, .odt, .doc or .docx format)")
    optional.add_argument('-a', '--all', action='store_true', help="Generate a CSV file with all the results")
    optional.add_argument('-s', '--similarity', action='store_true', help="Calculate similarity (max. 5 files)")
    p._action_groups.append(optional)
    opts = p.parse_args()
    FileLoader.load_files(opts.files)
    FileLoader.load_irregular_verbs_list()
    FileLoader.load_dale_chall_list()
    FileLoader.load_connectives_list()
    FileLoader.load_oxford_word_list()
    cube = Cube(verbose=True)
    # Load the Cube model
    cube.load("en", "latest")
    df_row = None
    ### Files will be created in this folder
    path = Printer.create_directory(FileLoader.files[0])
    file_num = 0
    total = len(FileLoader.files)
    for input in FileLoader.files:
        texto = Analyzer.process_text(input=input)
        # Analyze
        a = Analyzer(texto, input, cube)
        i = a.analyze(opts.similarity)
        df = a.create_dataframe()
        prediction = a.predict_dificulty(df)
        file_num += 1
        p = Printer(input, i)
        p.print_info(opts.similarity, prediction, file_num, total)
        if opts.all:
            df_row = p.write_in_full_csv(df_row, opts.similarity)
        p.generate_csv(path, prediction, opts.similarity)
    if opts.all:
        df_row.to_csv(os.path.join(path, "full_results_aztertest.csv"), encoding='utf-8', index=False)
Example #12
 def download_model(self):
     if self.lib.lower() == "stanford":
         print(
             "-----------You are going to use Stanford library-----------")
         if self.lang.lower() == "basque":
             print(
                 "-------------You are going to use Basque model-------------"
             )
             # MODELS_DIR = '/home/kepa/eu'
             MODELS_DIR = r'J:\TextSimilarity\eu'
             stanfordnlp.download('eu',
                                  MODELS_DIR)  # Download the Basque models
         elif self.lang.lower() == "english":
             print(
                 "-------------You are going to use English model-------------"
             )
             MODELS_DIR = '/home/kepa/en'
             print(
                 "-------------Downloading Stanford English model-------------"
             )
             stanfordnlp.download('en',
                                  MODELS_DIR)  # Download the English models
         elif self.lang.lower() == "spanish":
             print(
                 "-------------You are going to use Spanish model-------------"
             )
             MODELS_DIR = '/home/kepa/es'
             stanfordnlp.download('es',
                                  MODELS_DIR)  # Download the Spanish models
         else:
             print("........You cannot use this language...........")
     elif self.lib.lower() == "cube":
         print("-----------You are going to use Cube Library-----------")
         if self.lang.lower() == "basque":
             cube = Cube(verbose=True)
             cube.load("eu", "latest")
         elif self.lang.lower() == "english":
             cube = Cube(verbose=True)
             cube.load("en", "latest")
         elif self.lang.lower() == "spanish":
             cube = Cube(verbose=True)
             cube.load("es", "latest")
         else:
             print("........You cannot use this language...........")
     else:
         print(
             "You cannot use this library. Introduce a valid library (Cube or Stanford)"
         )
Example #13
class CubeNLP(TeproApi):
    """By Tibi Boroș & co., does sentence splitting, tokenization,
    POS tagging, lemmatization and dependency parsing for Romanian."""
    def __init__(self):
        super().__init__()
        self._algoName = TeproAlgo.algoCube

    @staticmethod
    def sgml2unicode(word: str) -> str:
        word = word.replace("ă", "ă")
        word = word.replace("Ă", "Ă")
        word = word.replace("â", "â")
        word = word.replace("Â", "Â")
        word = word.replace("î", "î")
        word = word.replace("Î", "Î")
        word = word.replace("ş", "ș")
        word = word.replace("Ş", "Ș")
        word = word.replace("ţ", "ț")
        word = word.replace("Ţ", "Ț")

        return word

    @staticmethod
    def _readMSDMappings():
        m2c = {}

        with open(CTAG2MSDMAPFILE, mode="r") as f:
            for line in f:
                line = line.strip()
                parts = line.split()

                if len(parts) == 2:
                    msd = parts[0]
                    ctg = parts[1]
                    m2c[msd] = ctg
                # end if
            # end for line
        # end open file
        return m2c

    @staticmethod
    def _readTblWordForm():
        tbl = {}
        counter = 0

        with open(TBLWORDFORMFILE, mode="r", encoding="utf-8") as f:
            for line in f:
                counter += 1

                if counter > 0 and counter % 100000 == 0:
                    print("{0}.{1}[{2}]: loading tbl.wordform.ro, at line {3}".
                          format(
                              Path(inspect.stack()[0].filename).stem,
                              inspect.stack()[0].function,
                              inspect.stack()[0].lineno, counter),
                          file=sys.stderr,
                          flush=True)

                line = line.strip()

                if line.startswith("#"):
                    continue

                parts = line.split()

                if len(parts) == 3:
                    word = CubeNLP.sgml2unicode(parts[0])
                    lemma = CubeNLP.sgml2unicode(parts[1])

                    if lemma == '=':
                        lemma = word

                    msd = parts[2]

                    if word not in tbl:
                        tbl[word] = {}

                    if msd not in tbl[word]:
                        tbl[word][msd] = []

                    tbl[word][msd].append(lemma)
                # end if parts has 3 elems
            # end for line in f
        # end while open file
        return tbl

    def createApp(self):
        self._cubeInst = Cube(verbose=True)

    def loadResources(self):
        self._cubeInst.load('ro',
                            tokenization=True,
                            compound_word_expanding=False,
                            tagging=True,
                            lemmatization=True,
                            parsing=True)
        self._tblwordform = CubeNLP._readTblWordForm()
        self._msd2ctag = CubeNLP._readMSDMappings()

    def _runApp(self, dto, opNotDone):
        text = dto.getText()
        sentences = self._cubeInst(text)
        sid = 0

        for sent in sentences:
            # Teprolin tokenized sentence
            ttsent = []
            # Teprolin string sentence
            tssent = ""

            for tok in sent:
                tt = TeproTok()
                tt.setId(tok.index)
                tt.setWordForm(tok.word)
                lowerWord = tok.word.lower()
                tt.setMSD(tok.xpos)

                # Assigning the mapped CTAG to the disambiguated MSD
                if tok.xpos in self._msd2ctag:
                    tt.setCTAG(self._msd2ctag[tok.xpos])
                else:
                    tt.setCTAG(tok.xpos)

                lemmaIsSet = False

                # Doing lexicon lemmatization, if possible.
                if tok.word in self._tblwordform:
                    if tok.xpos in self._tblwordform[tok.word] and \
                            len(self._tblwordform[tok.word][tok.xpos]) == 1:
                        # TODO: if lemma is ambiguous, e.g. 'copii' can be 'copil' or 'copie'
                        tt.setLemma(self._tblwordform[tok.word][tok.xpos][0])
                        lemmaIsSet = True
                elif lowerWord in self._tblwordform and \
                        tok.xpos in self._tblwordform[lowerWord] and \
                        len(self._tblwordform[lowerWord][tok.xpos]) == 1:
                    tt.setLemma(self._tblwordform[lowerWord][tok.xpos][0])
                    lemmaIsSet = True

                if not lemmaIsSet:
                    tt.setLemma(tok.lemma)

                tt.setHead(tok.head)
                tt.setDepRel(tok.label)

                tssent += tok.word

                if tok.space_after != "SpaceAfter=No":
                    tssent += " "

                ttsent.append(tt)
            # end ttsent/tssent formation

            if not dto.isOpPerformed(TeproAlgo.getSentenceSplittingOperName()):
                dto.addSentenceString(tssent)
                dto.addSentenceTokens(ttsent)
            else:
                # Check and update annotations that only NLPCube
                # can produce or that are requested specifically from it.
                alignment = dto.alignSentences(ttsent, sid)

                for op in opNotDone:
                    dto.copyTokenAnnotation(ttsent, sid, alignment, op)

            sid += 1

        return dto
Example #14
# for parsing files from the DB / books

import os
import json
import fnmatch
from input_parser import input_parser
from nltk import tokenize
from textwrap import wrap

from cube.api import Cube
cube = Cube(verbose=True)
cube.load('ro')

booksDir = os.path.abspath(
    os.path.realpath(os.path.join(os.path.dirname(__file__), '../../DB')))
dataDir = os.path.abspath(
    os.path.realpath(os.path.join(os.path.dirname(__file__), '../data')))

books = []

for filename in os.listdir(booksDir):
    if filename.endswith('.txt') and fnmatch.fnmatch(filename, "2???_a_*"):
        books.append(filename)
    if len(books) > 100:
        break

for filename in books:
    book_content = open(os.path.join(booksDir, filename),
                        encoding="utf-8").read()

    if os.path.exists(
Example #15
 print("Found {} local models".format(len(local_models)))
 
 
 model_count = len(online_models)
 
 # step 1. download all models
 for online_model in online_models:        
     model, version = online_model[0], online_model[1]
     if online_model not in local_models:
         print("Downloading {}-{}".format(model,version))
     else:
         print("Model {}-{} is already downloaded.".format(model,version))
         continue
     cube = Cube()
     #cube.load(model, version)???
     cube.load(model)
  
 print("\n\n")
 for online_model in online_models:
     model, version = online_model[0], online_model[1]
     print("\n\nTesting model {}-{}, @{}".format(model,version, datetime.today()))
     if model == "pl":
         continue
     
     # go run Cube
     print("\t Reading metadata ...")        
     metadata = ModelMetadata()        
     metadata.read(os.path.join(local_model_path,model+"-"+str(version),"metadata.json"))
     
     
     mlanguage = metadata.language
Example #16
    's', 'x', 's', 's', 's', 's', 's', 's', 'm', 's', 's', 's', 's', 'm', 's',
    's', 's', 'm', 'm', 's', 's', 's', 's', 's', 's', 's', 's', 'm', 's', 'm',
    'm', 's', 's', 'm', 's', 's', 's', 'm', 'm', 's', 's', 'm', 's', 's', 'm',
    's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's',
    's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 'm', 's', 's',
    's', 's', 's', 's', 's', 'm', 's', 's', 'm', 's', 's', 's', 's', 's', 's',
    'm', 's', 'm', 's', 's', 's', 's', 'm', 's', 's', 's', 'm', 'm', 's', 's',
    's', 'm'
]

from cube.api import Cube

cube = Cube(verbose=True)
cube.load("ro",
          tokenization=True,
          compound_word_expanding=False,
          tagging=True,
          lemmatization=True,
          parsing=True)
cube_no_tok = Cube(verbose=True)
cube_no_tok.load("ro",
                 tokenization=False,
                 compound_word_expanding=False,
                 tagging=True,
                 lemmatization=True,
                 parsing=True)
multi_sentence_count = 0
errors = 0
multi_sentences = []
conllupdataset = []
for sentence in sentences:
    if "Alege: [" in sentence.sentence or "Decide tipul/clasa corecta: [" in sentence.sentence or len(
Example #17
def cli(ctx, language):
    cube = Cube(verbose=True)
    cube.load(language)

    ctx.ensure_object(dict)
    ctx.obj['CUBE'] = cube
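The function above reads like the group callback of a click-style CLI (its decorators are not shown in the snippet). Below is a minimal sketch of how such a command group might be wired up; the decorators, the --language option, and the tag subcommand are assumptions added for illustration, not code from the original project.

import click
from cube.api import Cube


@click.group()
@click.option('--language', default='en', help='NLP-Cube model code to load')
@click.pass_context
def cli(ctx, language):
    cube = Cube(verbose=True)
    cube.load(language)

    ctx.ensure_object(dict)
    ctx.obj['CUBE'] = cube


@cli.command()  # hypothetical subcommand, for illustration only
@click.argument('text')
@click.pass_context
def tag(ctx, text):
    cube = ctx.obj['CUBE']  # reuse the model loaded by the group callback
    for sentence in cube(text):
        for entry in sentence:
            print(entry.word, entry.lemma, entry.upos)


if __name__ == '__main__':
    cli(obj={})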
Example #18
def map_bibles(f1,
               f2s,
               voc,
               l1='gothic',
               cube1=True,
               cube2=False,
               lemmatizer={},
               expand_voc=False):
    f1_dict = load_bible(open(f1, 'r'))

    lemma_l1 = False if cube1 == False else True

    count = 0
    mapped = {}

    all_words_count = 0
    found_words = {}
    unfound_words = {}

    for book in f1_dict:
        if book not in mapped:
            mapped[book] = {}
        for chapter in f1_dict[book]:
            if chapter not in mapped[book]:
                mapped[book][chapter] = {}
            for verse in f1_dict[book][chapter]:
                if verse not in mapped[book][chapter]:
                    mapped[book][chapter][verse] = {}

                count += 1
                if count % 500 == 0:
                    print(count)

                mapped[book][chapter][verse][l1] = f1_dict[book][chapter][
                    verse]
                if lemma_l1:
                    mapped[book][chapter][verse][l1 + '_analyzed'] = analyze(
                        f1_dict[book][chapter][verse], cube1, lang='got')
                else:
                    mapped[book][chapter][verse][
                        l1 + '_analyzed'] = fake_analyze(
                            f1_dict[book][chapter][verse],
                            lang='got',
                            lemmatizer=lemmatizer)

                if l1 == 'got':
                    mapped[book][chapter][verse][l1 + '_translation'] = {}
                    lemmas = list(
                        list(
                            zip(*mapped[book][chapter][verse][
                                l1 + '_analyzed']))[2])

                    for lang in all_languages:
                        if lang not in found_words:
                            found_words[lang] = {}
                        if lang not in unfound_words:
                            unfound_words[lang] = {}

                        if lang == 'Got':
                            mapped[book][chapter][verse][
                                l1 + '_translation'][lang] = lemmas
                            mapped[book][chapter][verse][l1 + '_translation'][
                                lang + '_script'] = [
                                    gothic_script_transformer(t)
                                    for t in lemmas
                                ]
                            mapped[book][chapter][verse][l1 + '_translation'][
                                lang + '_ipa'] = [
                                    ipa_transformer(t, 'gothic')
                                    for t in lemmas
                                ]
                        else:
                            if lang not in mapped[book][chapter][verse][
                                    l1 + '_translation']:
                                mapped[book][chapter][verse][
                                    l1 + '_translation'][lang] = []

                            for word in lemmas:
                                all_words_count += 1
                                if word in voc and lang in voc[word]:
                                    #if lang == 'Lat':
                                    #	print(word, voc[word][lang])

                                    if word not in found_words[lang]:
                                        found_words[lang][word] = 0
                                    found_words[lang][word] += 1
                                    mapped[book][chapter][verse][
                                        l1 + '_translation'][lang].append(
                                            voc[word][lang])
                                else:
                                    if word not in unfound_words[lang]:
                                        unfound_words[lang][word] = 0
                                    unfound_words[lang][word] += 1
                                    mapped[book][chapter][verse][
                                        l1 + '_translation'][lang].append([])

    for lang in found_words:
        found = sum([found_words[lang][w] for w in found_words[lang]])
        unfound = sum([unfound_words[lang][w] for w in unfound_words[lang]])
        print('token', lang, found, unfound,
              '%.4f' % (float(found) / (found + unfound + 1)))
        found = len(found_words[lang])
        unfound = len(unfound_words[lang])
        print('\ttype', lang, found, unfound,
              '%.4f' % (float(found) / (found + unfound + 1)))
    print('All words: {}'.format(all_words_count))

    for f in f2s:
        l2 = f.split('/')[0]

        langs_epitran = {
            'german': 'deu-Latn',
            'italian': 'ita-Latn',
            'latin': 'ita-Latn',
            'spanish': 'spa-Latn',
            'english': 'eng-Latn'
        }
        if l2 in langs_epitran:
            epi = epitran.Epitran(langs_epitran[l2])

        if cube2:
            lemma_l2 = lemma_l1
            cube2 = Cube(verbose=False)
            cube2.load(lang_acron[l2])

        #if f == 'greek/greek_byzantine_2000_utf8.txt':
        #	pdb.set_trace()
        f2_dict = load_bible(open(f, 'r'))
        lemma_l2 = False if cube1 == False else True

        for book in mapped:
            for chapter in mapped[book]:
                for verse in mapped[book][chapter]:
                    if book not in f2_dict or chapter not in f2_dict[
                            book] or verse not in f2_dict[book][chapter]:
                        #pdb.set_trace()
                        continue

                    if f2_dict[book][chapter][verse] in ['', '[]', []]:
                        continue
                    #print(f2_dict[book][chapter][verse])

                    #mapped_words = {'german':'deu-Latn', 'italian':'ita-Latn', 'latin':'ita-Latn', 'spanish':'Es', 'greek':'Gre'}
                    if lemma_l2:
                        mapped[book][chapter][verse][
                            l2 + '_analyzed'] = analyze(
                                f2_dict[book][chapter][verse], cube2, lang=l2)
                    else:
                        mapped[book][chapter][verse][
                            l2 + '_analyzed'] = fake_analyze(
                                f2_dict[book][chapter][verse], lang=l2)

                    # IPA
                    #try:
                    lemmas = [
                        x for x in (list(
                            zip(*mapped[book][chapter][verse][l2 +
                                                              '_analyzed']))[2]
                                    )
                    ]
                    #except:
                    #pdb.set_trace()
                    mapped[book][chapter][verse][l2] = lemmas
                    if l2 == 'greek':
                        mapped[book][chapter][verse][l2 + '_ipa'] = [
                            ipa_transformer(l, 'greek') for l in lemmas
                        ]  #.split()
                    elif l2 == 'old_english':
                        try:
                            mapped[book][chapter][verse][l2 + '_ipa'] = [
                                oe(l) for l in lemmas
                            ]  #.split()
                        except:
                            pass
                    elif l2 == 'english':
                        try:
                            mapped[book][chapter][verse][l2 + '_ipa'] = [
                                oe(l) for l in lemmas
                            ]  #.split() #gen_mods.get_final(gen_mods.getIPA_CMU(f2_dict[book][chapter][verse]))
                        except:
                            pass
                    else:
                        mapped[book][chapter][verse][l2 + '_ipa'] = [
                            epi.transliterate(l) for l in lemmas
                        ]  #.split()

                    expand_voc = False
                    if l2 != 'Got' and expand_voc:
                        expand_voc_by_distance(voc, mapped, book, chapter,
                                               verse, l2)
                        #pdb.set_trace()

    return mapped
Example #19
            for entry in LSentence:
                print(entry)
            print("")


if __name__ == '__main__':
    if True:
        found_nl = False
        for iso in CubeNLPPOS.get_L_supported_isos(None):
            if iso not in ('nno', 'nnb'):
                continue

            from cube.api import Cube  # import the Cube object

            cube = Cube(verbose=True)  # initialize it
            cube.load(DSupportedISOs[iso])

    print_pos(
        'id',
        'Tahap pertama konflik ini dapat disebut "Perang Kemerdekaan Belanda".'
    )
    print_pos(
        'en',
        'The first phase of the conflict can be considered the Dutch War of Independence.'
    )

    print_pos('id', 'Saya tidak dapat memakan ini.')
    print_pos('en', 'I can\'t eat this.')

    print_pos('zh', '猴子高兴,实验人员也高兴。')
    print_pos('en', 'The monkeys were happy and the experimenters were happy.')
Example #20
File: tasks.py  Project: Luiscri/MABSED
# filter
import os
import sys
import csv
import json
import datetime

import luigi
from filter.filter import filter_spam

# lemmatizer
from lemmatizer.lemmatizer import lemmatize
from cube.api import Cube

# detection
sys.path.insert(0, './detector/')
from detect_events import main as detect_events

lemmatizer = Cube(verbose=True)
lemmatizer.load("es", tokenization=False, parsing=False)


class Streamer(luigi.ExternalTask):
    time_slice = luigi.parameter.DateMinuteParameter(interval=30)

    def output(self):
        fname = '../data/streaming/{}.csv'.format(self.time_slice)
        # print('Requires: {}'.format(fname))
        return luigi.LocalTarget(fname)


class Preprocess(luigi.Task):
    time_slice = luigi.parameter.DateMinuteParameter(
        interval=30, default=datetime.datetime.today())
Example #21
from cube.api import Cube

cube = Cube(verbose=True)

cube.load("en", local_models_repository="/mnt/d/nlpcube/")

text = "One potential microRNA that regulates Bcan is miR-9 and overexpression of miR-9 can partly rescue the effects of Dicer1 deletion on the MG phenotype."

sentences = cube(text)

for sentence in sentences:
    for entry in sentence:
        print(
            str(entry.index) + "\t" + entry.word + "\t" + entry.lemma + "\t" +
            entry.upos + "\t" + entry.xpos + "\t" + entry.attrs + "\t" +
            str(entry.head) + "\t" + str(entry.label) + "\t" +
            entry.space_after)
    print("")
Example #22
import tnkeeh as tn
from farasa.segmenter import FarasaSegmenter
from sacremoses import MosesTokenizer, MosesPunctNormalizer
from vncorenlp import VnCoreNLP

# for bulgarian and turkish
from cube.api import Cube

en_tok = MosesTokenizer(lang="en")
en_normalizer = MosesPunctNormalizer()
# TODO: change hardcoding of jar file to a arg from cli
rdrsegmenter = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar",
                         annotators="wseg",
                         max_heap_size="-Xmx500m")
ar_segmenter = FarasaSegmenter()

bg_cube = Cube(verbose=False)
bg_cube.load("bg")

tr_cube = Cube(verbose=False)
tr_cube.load("tr")


def clean_ar_text(
    text,
    segment=False,
    remove_special_chars=False,
    remove_english=False,
    normalize=False,
    remove_diacritics=False,
    excluded_chars=[],
    remove_tatweel=False,
    remove_html_elements=False,
Example #23
    local_models = model_store_object.list_local_models()
    print("Found {} local models".format(len(local_models)))

    model_count = len(online_models)

    # step 1. download all models
    for online_model in online_models:
        model, version = online_model[0], online_model[1]
        if online_model not in local_models:
            print("Downloading {}-{}".format(model, version))
        else:
            print("Model {}-{} is already downloaded.".format(model, version))
            continue
        cube = Cube()
        cube.load(model, version, local_models_repository=local_model_path)
        #cube.load(model)

    print("\n\n")
    #for online_model in local_models: #local_models+online_models:
    for online_model in local_models + online_models:
        model, version = online_model[0], online_model[1]
        print("\n\nTesting model {}-{}, @{}".format(model, version,
                                                    datetime.today()))
        if model == "pl":
            continue

        # go run Cube
        print("\t Reading metadata ...")
        metadata = ModelMetadata()
        metadata.read(
Example #24
from cube.api import Cube

cube = Cube(verbose=True)
cube.load(
    'ro', tokenization=True,
    compound_word_expanding=False,
    tagging=True,
    lemmatization=True,
    parsing=True
)
Example #25
import pandas as pd
import os
import sys
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import xml.etree.ElementTree as ET
import numpy as np
import nltk
import string
from nltk.stem import PorterStemmer, SnowballStemmer

from cube.api import Cube
cube = Cube(verbose=True)
cube.load("ro")

# load the database
df = pd.read_csv('pcgarage.csv', delimiter='\t', encoding='utf-16', header=0)

# preprocessing
# remove unwanted elements
df['pro'] = df['pro'].str[4:]
df['contra'] = df['contra'].str[8:]
df['altele'] = df['altele'].str[8:]
# concatenate the parts
df["corpus"] = df["pro"].astype(str) + " " + df["contra"].astype(
    str) + " " + df["altele"].astype(str)
data = df[['product', 'rating', 'corpus']].copy()
data['corpus'] = [it.lower().replace('\n\n', ' ') for it in data['corpus']]
# convert to lowercase
data['corpus'] = data.corpus.map(lambda x: x.lower())
# tokenization
Example #26
# implement NLP-Cube https://github.com/adobe/NLP-Cube for sent and word tokenization, and lemma
from cube.api import Cube  # import the Cube object
cube = Cube(verbose=False)  # initialize it
cube.load("en")
from rusenttokenize import ru_sent_tokenize

import re


class mytokenizer:
    def bracket_mask(self, text):

        text_temp = re.sub("\[", "<<", text)
        text_temp = re.sub("\]", ">>", text_temp)
        text_temp = re.sub("\+", "===", text_temp)
        parenthesis = re.findall("\([^()]*\)", text_temp)
        bracket = re.findall("\[[^\[\]]*\]", text)
        if not parenthesis and not bracket:
            return text

        for p in parenthesis:
            p = re.sub(r"[\(\)]", "", p)
            parts = [re.sub(r"\.\s*$", "", s) for s in ru_sent_tokenize(p)]
            new_p = " ; ".join(parts)
            #print ("P:", p)
            #print("new_p:",new_p,"\n")
            #print("text_temp:",text_temp)
            text_temp = re.sub(p, new_p, text_temp)

        text = re.sub("<<", "[", text_temp)
        text = re.sub(">>", "]", text)
Example #27
####################################################################################
if recompute_histograms or not os.path.isfile(histogram_picklefile_gf %
                                              (category, FREQUENCY_THRESH)):
    if category == "All":
        D_coords_fixated, D_histogram, D_entropy, D_entropy_df = \
         merge_histograms(histogram_picklefile_gf)
        imnames = list(set(D_coords_fixated.keys()))
        pickle.dump([D_coords_fixated, D_histogram, D_entropy, D_entropy_df],
                    open(
                        histogram_picklefile_gf % (category, FREQUENCY_THRESH),
                        "wb"))
    else:
        D_coords_fixated = get_raw_data()
        imnames = list(set(D_coords_fixated.keys()))
        cube = Cube(verbose=True)
        cube.load('en')
        start = time.time()
        D_histogram, D_entropy, D_entropy_df = compute_histograms(
            D_coords_fixated,
            imnames,
            category,
            is_grouping=True,
            fre_threshold=FREQUENCY_THRESH)
        print(time.time() - start)
        pickle.dump([D_coords_fixated, D_histogram, D_entropy, D_entropy_df],
                    open(
                        histogram_picklefile_gf % (category, FREQUENCY_THRESH),
                        "wb"))
        print(histogram_picklefile_gf % (category, FREQUENCY_THRESH))
else:
    with open(histogram_picklefile_gf % (category, FREQUENCY_THRESH),
Example #28
    def load_model(self):
        if self.lib.lower() == "stanford":
            print(
                "-----------You are going to use Stanford library-----------")
            if self.lang.lower() == "basque":
                print(
                    "-------------You are going to use Basque model-------------"
                )
                # config = {'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
                #            'lang': 'eu',  # Language code for the language to build the Pipeline in
                #            'tokenize_model_path': '/home/kepa/eu/eu_bdt_models/eu_bdt_tokenizer.pt',
                #            # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                #            'pos_model_path': '/home/kepa/eu/eu_bdt_models/eu_bdt_tagger.pt',
                #            'pos_pretrain_path': '/home/kepa/eu/eu_bdt_models/eu_bdt.pretrain.pt',
                #            'lemma_model_path': '/home/kepa/eu/eu_bdt_models/eu_bdt_lemmatizer.pt',
                #            'depparse_model_path': '/home/kepa/eu/eu_bdt_models/eu_bdt_parser.pt',
                #            'depparse_pretrain_path': '/home/kepa/eu/eu_bdt_models/eu_bdt.pretrain.pt'
                #            }
                config = {
                    'processors':
                    'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
                    'lang':
                    'eu',  # Language code for the language to build the Pipeline in
                    'tokenize_model_path':
                    r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tokenizer.pt',
                    # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                    'pos_model_path':
                    r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tagger.pt',
                    'pos_pretrain_path':
                    r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt',
                    'lemma_model_path':
                    r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_lemmatizer.pt',
                    'depparse_model_path':
                    r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_parser.pt',
                    'depparse_pretrain_path':
                    r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt'
                }
                self.parser = stanfordnlp.Pipeline(**config)

            elif self.lang.lower() == "english":
                print(
                    "-------------You are going to use English model-------------"
                )
                config = {
                    'processors':
                    'tokenize,mwt,pos,lemma,depparse',  # Comma-separated list of processors to use
                    'lang':
                    'en',  # Language code for the language to build the Pipeline in
                    'tokenize_model_path':
                    '/home/kepa/en/en_ewt_models/en_ewt_tokenizer.pt',  # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                    #'mwt_model_path': './fr_gsd_models/fr_gsd_mwt_expander.pt',
                    'pos_model_path':
                    '/home/kepa/en/en_ewt_models/en_ewt_tagger.pt',
                    'pos_pretrain_path':
                    '/home/kepa/en/en_ewt_models/en_ewt.pretrain.pt',
                    'lemma_model_path':
                    '/home/kepa/en/en_ewt_models/en_ewt_lemmatizer.pt',
                    'depparse_model_path':
                    '/home/kepa/en/en_ewt_models/en_ewt_parser.pt',
                    'depparse_pretrain_path':
                    '/home/kepa/en/en_ewt_models/en_ewt.pretrain.pt'
                }
                self.parser = stanfordnlp.Pipeline(**config)
            elif self.lang.lower() == "spanish":
                print(
                    "-------------You are going to use Spanish model-------------"
                )
                config = {
                    'processors':
                    'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
                    'lang':
                    'es',  # Language code for the language to build the Pipeline in
                    'tokenize_model_path':
                    '/home/kepa/es/es_ancora_models/es_ancora_tokenizer.pt',  # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                    'pos_model_path':
                    '/home/kepa/es/es_ancora_models/es_ancora_tagger.pt',
                    'pos_pretrain_path':
                    '/home/kepa/es/es_ancora_models/es_ancora.pretrain.pt',
                    'lemma_model_path':
                    '/home/kepa/es/es_ancora_models/es_ancora_lemmatizer.pt',
                    'depparse_model_path':
                    '/home/kepa/es/es_ancora_models/es_ancora_parser.pt',
                    'depparse_pretrain_path':
                    '/home/kepa/es/es_ancora_models/es_ancora.pretrain.pt'
                }
                self.parser = stanfordnlp.Pipeline(**config)
            else:
                print("........You cannot use this language...........")
        elif self.lib.lower() == "cube":
            print("-----------You are going to use Cube Library-----------")
            if self.lang.lower() == "basque":
                #load(self, language_code, version="latest", local_models_repository=None,
                #local_embeddings_file=None, tokenization=True, compound_word_expanding=False,
                #tagging=True, lemmatization=True, parsing=True).
                #Example: load("es", tokenization=False, parsing=False)
                cube = Cube(verbose=True)
                cube.load("eu", "latest")
            elif self.lang.lower() == "english":
                cube = Cube(verbose=True)
                cube.load("en", "latest")
            elif self.lang.lower() == "spanish":
                cube = Cube(verbose=True)
                cube.load("es", "latest")
            else:
                print("........You cannot use this language...........")
        else:
            print(
                "You cannot use this library. Introduce a valid library (Cube or Stanford)"
            )
Example #29
import http.server
import json

from typing import List, Dict
from urllib.parse import urlparse, parse_qs

from abbrev import full_to_abbrev
from conllu_msd_to_monomial import MSD_dict
from msd_convert import UPOS_to_MSD, MSD_to_attribs

hostName = "localhost"
serverPort = 8080
QUERY = 'q'

from cube.api import Cube

ro_cube = Cube(verbose=True)  # initialize it
ro_cube.load("ro")  # select the desired language (it will auto-download the model on first run)


class POSTagRequestHandler(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        parse = urlparse(self.path)
        query = parse_qs(parse.query)
        query = {k:' '.join(query[k]) for k in query.keys()}
        self.send_response(200)
        self.send_header("Content-type", "text/html")
        self.end_headers()
        # self.wfile.write(bytes("<html><head><title>https://pythonbasics.org</title></head>", "utf-8"))
        self.wfile.write(bytes(self._process_input(query[QUERY]), "utf-8"))
        # self.wfile.write(bytes("</body></html>", "utf-8"))

    def end_headers(self):
Example #30
def create_pickle():
    cube = Cube(verbose=True)
    cube.load("ja")
    with open('cube.pickle', mode='wb') as wh:
        pickle.dump(cube, wh)
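A companion sketch (assumed usage, not part of the original file): reloading the pickled Japanese model and tagging a sentence with it, mirroring the entry attributes used elsewhere in this collection. Whether a loaded Cube instance survives pickling depends on the NLP-Cube version, so treat this strictly as a sketch.

import pickle

# Reload the model written by create_pickle() and run it on a hypothetical sentence.
with open('cube.pickle', mode='rb') as rh:
    cube = pickle.load(rh)

sentences = cube('猫が好きです。')
for sentence in sentences:
    for entry in sentence:
        print(entry.index, entry.word, entry.lemma, entry.upos)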