Example #1
def write_core_format_into_conllup_file(sentences, filepath):
    print(
        "Converting {} sentences into CONLLUP format. This requires a text preprocessor for Romanian. If the following function fails, please install NLP-Cube (pip3 install nlpcube)."
        .format(len(sentences))
    )

    from cube.api import Cube
    cube = Cube(verbose=True)
    cube.load("ro",
              tokenization=True,
              compound_word_expanding=False,
              tagging=True,
              lemmatization=True,
              parsing=True)
    cube_no_tok = Cube(verbose=True)
    cube_no_tok.load("ro",
                     tokenization=False,
                     compound_word_expanding=False,
                     tagging=True,
                     lemmatization=True,
                     parsing=True)

    conllupdataset = []
    for sentence in sentences:
        sentence = process_split_exceptions(sentence)
        conllupsentence = _conllup_to_core_sentence(sentence, cube,
                                                    cube_no_tok)
        conllupdataset.append(conllupsentence)

    write_file(filepath, conllupdataset)
Example #2
def prepare_dialogs_sorted_by_lang(dialog_ids,
                                   dialog_path,
                                   prepared_path,
                                   start_date,
                                   end_date,
                                   additional_options=""):
    dialog_ids_sorted_by_lang = {"ua": [], "ru": [], "en": []}
    if dialog_ids[0] == -1:
        for filename in os.listdir(dialog_path):
            data = pd.read_csv(f"{dialog_path}/{filename}")
            lang = detect_data_language(data)
            dialog_ids_sorted_by_lang[lang].append(filename[:-4])

    else:
        for dialog in dialog_ids:
            data = pd.read_csv(f"{dialog_path}/{dialog}.csv")
            lang = detect_data_language(data)
            dialog_ids_sorted_by_lang[lang].append(dialog)

    print("dialog_ids_sorted_by_lang")
    pprint(dialog_ids_sorted_by_lang)

    n_all_dialogs = sum(
        len(ids) for ids in dialog_ids_sorted_by_lang.values())
    n_dialog = 0
    for lang in dialog_ids_sorted_by_lang.keys():
        if not dialog_ids_sorted_by_lang[lang]:
            continue

        cube = ""
        if lang == "ua":
            cube = Cube(verbose=True)
            cube.load("uk")

        elif lang == "en":
            cube = Cube(verbose=True)
            cube.load("en")

        for dialog_id in dialog_ids_sorted_by_lang[lang]:
            if f"{dialog_id}.csv" in os.listdir(prepared_path):
                print(
                    f"=========WARNING: {dialog_id}.csv already in {prepared_path}"
                )
                n_dialog += 1
                continue

            n_dialog += 1
            print(
                f"\n=======Language {lang}, dialog_id {dialog_id}-- {n_dialog} from {n_all_dialogs}======="
            )
            prepare_dialogs(lang, cube, dialog_id, prepared_path, dialog_path,
                            start_date, end_date, "words_frequency",
                            additional_options)
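The helper detect_data_language is defined elsewhere in the project; a hypothetical stand-in built on the langdetect package (assuming the dialog CSVs carry a "text" column, and mapping langdetect's "uk" code onto the "ua" key used above) could look like this:

from langdetect import detect


def detect_data_language(data):
    # Sample up to 100 messages and let langdetect guess the language code.
    sample = " ".join(data["text"].dropna().astype(str).head(100))
    code = detect(sample)
    return {"uk": "ua", "ru": "ru", "en": "en"}.get(code, "en")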
Example #3
 def download_model(self):
     if self.lib.lower() == "stanford":
         print(
             "-----------You are going to use Stanford library-----------")
         if self.lang.lower() == "basque":
             print(
                 "-------------You are going to use Basque model-------------"
             )
             # MODELS_DIR = '/home/kepa/eu'
             MODELS_DIR = r'J:\TextSimilarity\eu'
             stanfordnlp.download('eu',
                                  MODELS_DIR)  # Download the Basque models
         elif self.lang.lower() == "english":
             print(
                 "-------------You are going to use English model-------------"
             )
             MODELS_DIR = '/home/kepa/en'
             print(
                 "-------------Downloading Stanford English model-------------"
             )
             stanfordnlp.download('en',
                                  MODELS_DIR)  # Download the English models
         elif self.lang.lower() == "spanish":
             print(
                 "-------------You are going to use Spanish model-------------"
             )
             MODELS_DIR = '/home/kepa/es'
             stanfordnlp.download('es',
                                  MODELS_DIR)  # Download the Spanish models
         else:
             print("........You cannot use this language...........")
     elif self.lib.lower() == "cube":
         print("-----------You are going to use Cube Library-----------")
         if self.lang.lower() == "basque":
             cube = Cube(verbose=True)
             cube.load("eu", "latest")
         elif self.lang.lower() == "english":
             cube = Cube(verbose=True)
             cube.load("en", "latest")
         elif self.lang.lower() == "spanish":
             cube = Cube(verbose=True)
             cube.load("es", "latest")
         else:
             print("........You cannot use this language...........")
     else:
         print(
             "You cannot use this library. Introduce a valid library (Cube or Stanford)"
         )
Example #4
def main(filename):
    cube = Cube(verbose=True)
    cube.load('en')

    with open('words.txt') as f:
        word_list = [line.rstrip() for line in f.readlines()]
        word_set = set(word_list)

    with open('my_words.txt') as f:
        my_word_list = [line.rstrip() for line in f.readlines()]
        my_word_set = set(my_word_list)

    text = srt_to_text(filename)
    sentences = cube(text)
    new_words = []
    for sentence in sentences:
        for entry in sentence:
            if entry.lemma in word_set and entry.lemma not in my_word_set:
                if entry.lemma not in new_words:
                    new_words.append(entry.lemma)
    print('-' * 100)
    print(f'{len(new_words)} new words are found.')
    print('-' * 100)
    for i, word in enumerate(new_words):
        print(i, word)
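Each sentence returned by a loaded Cube instance is a list of token entries; the two fields these examples rely on are the surface form (entry.word) and the lemma (entry.lemma). A minimal, self-contained sketch:

from cube.api import Cube

cube = Cube(verbose=False)
cube.load('en')
for sentence in cube("Cats are sleeping on the mat."):
    for entry in sentence:
        # entry.word is the surface token, entry.lemma its lemmatized form
        print(entry.word, entry.lemma)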
Example #5
 def test_4_3_run_model_with_default_external_embeddings(self):  
     print("\n\33[33m{}\33[0m".format("4.3. Run a local model with default external embeddings ..."))                        
     from cube.api import Cube
     cube = Cube(verbose=True)
     cube.load('my_model', tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True)        
     text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
     sentences = cube(text)
     self.assertTrue(len(sentences)>0)
     self.assertTrue(len(sentences[0])>0)       
Example #6
 def test_2_run_a_local_model(self):  
     print("\n\33[33m{}\33[0m".format("2. Run a local model that does not have embeddings or metadata (running with dummy.vec embeddings) ..."))
     embeddings = os.path.join(self.root_path, "examples","wiki.dummy.vec")
     from cube.api import Cube
     cube = Cube(verbose=True)
     cube.load('my_model', tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True, local_models_repository=self.local_model_repo, local_embeddings_file=embeddings)        
     text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
     sentences = cube(text)
     self.assertTrue(len(sentences)>0)
     self.assertTrue(len(sentences[0])>0)   
Example #7
 def test_1_2_download_and_run_an_online_model_specific_version(self):                                    
     print("\n\33[33m{}\33[0m".format("1.2. Loading an online model (sme, 1.0) ..."))
     from cube.api import Cube
     cube = Cube(verbose=True)
     cube.load('sme', version='1.0', tokenization=True, compound_word_expanding=False, tagging=False, lemmatization=False, parsing=False)
     cube.metadata.info()
     text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
     sentences = cube(text)
     self.assertTrue(len(sentences)>0)
     self.assertTrue(len(sentences[0])>0)        
Example #8
 def test_3_3_run_model_with_manual_embeddings(self):  
     print("\n\33[33m{}\33[0m".format("3.3. Run a local model with manual embeddings ..."))                
     embeddings = os.path.join(self.root_path, "examples","wiki.dummy.vec")
     print("\t\tPath to local manual embeddings file: "+embeddings)
     from cube.api import Cube
     cube = Cube(verbose=True)
     cube.load('my_model', tokenization=True, compound_word_expanding=False, tagging=True, lemmatization=True, parsing=True, local_embeddings_file=embeddings)        
     text = "I'm a success today because I had a friend who believed in me and I didn't have the heart to let him down. This is a quote by Abraham Lincoln."
     sentences = cube(text)
     self.assertTrue(len(sentences)>0)
     self.assertTrue(len(sentences[0])>0)   
Example #9
    def __init__(self, iso, use_gpu=False):
        from pos_tagger.engines.cubenlp_pos.CubeNLPPOS import \
            DSupportedISOs
        assert iso in DSupportedISOs

        # Note the GPU argument - it may be worth putting
        # this on the PC with a more powerful one
        from cube.api import Cube
        cube_inst = self.cube_inst = Cube(verbose=True, use_gpu=use_gpu)
        # Chinese doesn't seem to work well with the newer model version
        # (hence zh is pinned to version 1.0 below),
        # despite the scores not showing much difference
        cube_inst.load(DSupportedISOs[iso],
                       version='1.0' if iso == 'zh' else '1.1')
        EngineInstance.__init__(self, iso, use_gpu)
Example #10
def create_conll_sentences(file_path):
    print("*"*25 + "  Working on transforming the input file '{}' to CoNLL format  ".format(file_path) + "*"*25 + "\n")

    print("Reading the input file...")
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    print("Loading the 'ro' nlp-cube model...")

    cube = Cube(verbose=False)
    cube.load("ro")

    print("Creating the CoNLL sentences...")
    sentences = cube(text)

    print("\n" + "*"*124 + "\n")

    return sentences
Example #11
def get_lemmatized_vocabulary(unlemmatized_voc, epi, lang):
    if os.path.isfile('lemmatized_' + lang + '.p'):
        voc = p.load(open('lemmatized_' + lang + '.p', 'rb'))
        return voc

    # Lemmatizer
    lang_acron = {
        'gothic': 'got',
        'latin': 'la',
        'italian': 'it',
        'german': 'de',
        'greek': 'grc',
        'english': 'eng'
    }
    cube = Cube(verbose=False)
    cube.load(lang_acron[lang])

    voc = {}
    keys = list(unlemmatized_voc.keys())
    for i in tqdm(range(len(keys))):
        w = keys[i]
        sents = cube(w)
        if isinstance(sents[0], list):
            for sent in sents:
                for token in sent:
                    if token.lemma != '_':
                        voc[w] = [
                            token.lemma,
                            epi.transliterate(token.lemma), 'L'
                        ]
                    else:
                        voc[w] = [
                            token.word,
                            epi.transliterate(token.word), 'T'
                        ]
        else:
            for token in sents:
                if token.lemma != '_':
                    voc[w] = [token.lemma, epi.transliterate(token.lemma), 'L']
                else:
                    voc[w] = [token.word, epi.transliterate(token.word), 'T']
    p.dump(voc, open('lemmatized_' + lang + '.p', 'wb'))
    return voc
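A hypothetical call, assuming an epitran transliterator matching the chosen language (the codes used elsewhere in this codebase) and a small {word: count} vocabulary:

import epitran

epi = epitran.Epitran('ita-Latn')
voc = get_lemmatized_vocabulary({'gatti': 3, 'mangiano': 1}, epi, 'italian')
print(voc.get('gatti'))  # [lemma, IPA transliteration, 'L' or 'T' flag]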
Example #12
 def download_model(self):
     if self.lib.lower() == "stanford":
         print("-----------You are going to use Stanford library-----------")
         if self.lang.lower() == "basque":
             print("-------------You are going to use Basque model-------------")
             # MODELS_DIR = '/home/edercarbajo/eu'
             MODELS_DIR = r'J:\TextSimilarity\eu'
             stanfordnlp.download('eu', MODELS_DIR)  # Download the Basque models
             # config = {'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
             #           'lang': 'eu',  # Language code for the language to build the Pipeline in
             #           'tokenize_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_tokenizer.pt',
             #           # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
             #           'pos_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_tagger.pt',
             #           'pos_pretrain_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt.pretrain.pt',
             #           'lemma_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_lemmatizer.pt',
             #           'depparse_model_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt_parser.pt',
             #           'depparse_pretrain_path': '/home/edercarbajo/eu/eu_bdt_models/eu_bdt.pretrain.pt'
             #           }
             config = {'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
                       'lang': 'eu',  # Language code for the language to build the Pipeline in
                       'tokenize_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tokenizer.pt',
                       # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                       'pos_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tagger.pt',
                       'pos_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt',
                       'lemma_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_lemmatizer.pt',
                       'depparse_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_parser.pt',
                       'depparse_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt'
                       }
             self.parser = stanfordnlp.Pipeline(**config)
         else:
             print("............Working...........")
     elif self.lib.lower() == "cube":
         print("-----------You are going to use Cube Library-----------")
         if self.lang.lower() == "basque":
             cube = Cube(verbose=True)
             cube.load("eu", "latest")
             self.parser = cube
         else:
             print("............Working...........")
     else:
         print("You cannot use this library. Introduce a valid library (Cube or Stanford)")
Example #13
def start():
    from analyzer import Analyzer
    p = ArgumentParser(description="python3 ./main.py -f \"laginak/*.doc.txt\" ")
    optional = p._action_groups.pop()  # Edited this line
    required = p.add_argument_group('Required arguments')
    required.add_argument("-f", "--files", nargs='+', help="Files to analyze (in .txt, .odt, .doc or .docx format)")
    optional.add_argument('-a', '--all', action='store_true', help="Generate a CSV file with all the results")
    optional.add_argument('-s', '--similarity', action='store_true', help="Calculate similarity (max. 5 files)")
    p._action_groups.append(optional)
    opts = p.parse_args()
    FileLoader.load_files(opts.files)
    FileLoader.load_irregular_verbs_list()
    FileLoader.load_dale_chall_list()
    FileLoader.load_connectives_list()
    FileLoader.load_oxford_word_list()
    cube = Cube(verbose=True)
    # Load the Cube model
    cube.load("en", "latest")
    df_row = None
    ### Files will be created in this folder
    path = Printer.create_directory(FileLoader.files[0])
    file_num = 0
    total = len(FileLoader.files)
    for input_file in FileLoader.files:
        texto = Analyzer.process_text(input=input_file)
        # Analyze
        a = Analyzer(texto, input_file, cube)
        i = a.analyze(opts.similarity)
        df = a.create_dataframe()
        prediction = a.predict_dificulty(df)
        file_num += 1
        p = Printer(input_file, i)
        p.print_info(opts.similarity, prediction, file_num, total)
        if opts.all:
            df_row = p.write_in_full_csv(df_row, opts.similarity)
        p.generate_csv(path, prediction, opts.similarity)
    if opts.all:
        df_row.to_csv(os.path.join(path, "full_results_aztertest.csv"), encoding='utf-8', index=False)
Example #14
 def createApp(self):
     self._cubeInst = Cube(verbose=True)
Example #15
def cli(ctx, language):
    cube = Cube(verbose=True)
    cube.load(language)

    ctx.ensure_object(dict)
    ctx.obj['CUBE'] = cube
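The decorators for this group callback sit outside the excerpt; a self-contained sketch (command and option names here are illustrative) of how a subcommand could reuse the Cube instance stored on the Click context:

import click
from cube.api import Cube


@click.group()
@click.option("--language", default="en")
@click.pass_context
def cli(ctx, language):
    cube = Cube(verbose=True)
    cube.load(language)
    ctx.ensure_object(dict)
    ctx.obj['CUBE'] = cube


@cli.command()
@click.argument("text")
@click.pass_context
def lemmatize(ctx, text):
    # Reuse the Cube instance the group callback stored on the context.
    for sentence in ctx.obj['CUBE'](text):
        click.echo(" ".join(entry.lemma for entry in sentence))


if __name__ == "__main__":
    cli()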
Example #16
def map_bibles(f1,
               f2s,
               voc,
               l1='gothic',
               cube1=True,
               cube2=False,
               lemmatizer={},
               expand_voc=False):
    f1_dict = load_bible(open(f1, 'r'))

    lemma_l1 = False if cube1 == False else True

    count = 0
    mapped = {}

    all_words_count = 0
    found_words = {}
    unfound_words = {}

    for book in f1_dict:
        if book not in mapped:
            mapped[book] = {}
        for chapter in f1_dict[book]:
            if chapter not in mapped[book]:
                mapped[book][chapter] = {}
            for verse in f1_dict[book][chapter]:
                if verse not in mapped[book][chapter]:
                    mapped[book][chapter][verse] = {}

                count += 1
                if count % 500 == 0:
                    print(count)

                mapped[book][chapter][verse][l1] = f1_dict[book][chapter][
                    verse]
                if lemma_l1:
                    mapped[book][chapter][verse][l1 + '_analyzed'] = analyze(
                        f1_dict[book][chapter][verse], cube1, lang='got')
                else:
                    mapped[book][chapter][verse][
                        l1 + '_analyzed'] = fake_analyze(
                            f1_dict[book][chapter][verse],
                            lang='got',
                            lemmatizer=lemmatizer)

                if l1 == 'got':
                    mapped[book][chapter][verse][l1 + '_translation'] = {}
                    lemmas = list(
                        list(
                            zip(*mapped[book][chapter][verse][
                                l1 + '_analyzed']))[2])

                    for lang in all_languages:
                        if lang not in found_words:
                            found_words[lang] = {}
                        if lang not in unfound_words:
                            unfound_words[lang] = {}

                        if lang == 'Got':
                            mapped[book][chapter][verse][
                                l1 + '_translation'][lang] = lemmas
                            mapped[book][chapter][verse][l1 + '_translation'][
                                lang + '_script'] = [
                                    gothic_script_transformer(t)
                                    for t in lemmas
                                ]
                            mapped[book][chapter][verse][l1 + '_translation'][
                                lang + '_ipa'] = [
                                    ipa_transformer(t, 'gothic')
                                    for t in lemmas
                                ]
                        else:
                            if lang not in mapped[book][chapter][verse][
                                    l1 + '_translation']:
                                mapped[book][chapter][verse][
                                    l1 + '_translation'][lang] = []

                            for word in lemmas:
                                all_words_count += 1
                                if word in voc and lang in voc[word]:
                                    #if lang == 'Lat':
                                    #	print(word, voc[word][lang])

                                    if word not in found_words[lang]:
                                        found_words[lang][word] = 0
                                    found_words[lang][word] += 1
                                    mapped[book][chapter][verse][
                                        l1 + '_translation'][lang].append(
                                            voc[word][lang])
                                else:
                                    if word not in unfound_words[lang]:
                                        unfound_words[lang][word] = 0
                                    unfound_words[lang][word] += 1
                                    mapped[book][chapter][verse][
                                        l1 + '_translation'][lang].append([])

    for lang in found_words:
        found = sum([found_words[lang][w] for w in found_words[lang]])
        unfound = sum([unfound_words[lang][w] for w in unfound_words[lang]])
        print('token', lang, found, unfound,
              '%.4f' % (float(found) / (found + unfound + 1)))
        found = len(found_words[lang])
        unfound = len(unfound_words[lang])
        print('\ttype', lang, found, unfound,
              '%.4f' % (float(found) / (found + unfound + 1)))
    print('All words: {}'.format(all_words_count))

    for f in f2s:
        l2 = f.split('/')[0]

        langs_epitran = {
            'german': 'deu-Latn',
            'italian': 'ita-Latn',
            'latin': 'ita-Latn',
            'spanish': 'spa-Latn',
            'english': 'eng-Latn'
        }
        if l2 in langs_epitran:
            epi = epitran.Epitran(langs_epitran[l2])

        if cube2:
            lemma_l2 = lemma_l1
            cube2 = Cube(verbose=False)
            cube2.load(lang_acron[l2])

        #if f == 'greek/greek_byzantine_2000_utf8.txt':
        #	pdb.set_trace()
        f2_dict = load_bible(open(f, 'r'))
        lemma_l2 = False if cube2 == False else True

        for book in mapped:
            for chapter in mapped[book]:
                for verse in mapped[book][chapter]:
                    if book not in f2_dict or chapter not in f2_dict[
                            book] or verse not in f2_dict[book][chapter]:
                        #pdb.set_trace()
                        continue

                    if f2_dict[book][chapter][verse] in ['', '[]', []]:
                        continue
                    #print(f2_dict[book][chapter][verse])

                    #mapped_words = {'german':'deu-Latn', 'italian':'ita-Latn', 'latin':'ita-Latn', 'spanish':'Es', 'greek':'Gre'}
                    if lemma_l2:
                        mapped[book][chapter][verse][
                            l2 + '_analyzed'] = analyze(
                                f2_dict[book][chapter][verse], cube2, lang=l2)
                    else:
                        mapped[book][chapter][verse][
                            l2 + '_analyzed'] = fake_analyze(
                                f2_dict[book][chapter][verse], lang=l2)

                    # IPA
                    #try:
                    lemmas = [
                        x for x in (list(
                            zip(*mapped[book][chapter][verse][l2 +
                                                              '_analyzed']))[2]
                                    )
                    ]
                    #except:
                    #pdb.set_trace()
                    mapped[book][chapter][verse][l2] = lemmas
                    if l2 == 'greek':
                        mapped[book][chapter][verse][l2 + '_ipa'] = [
                            ipa_transformer(l, 'greek') for l in lemmas
                        ]  #.split()
                    elif l2 == 'old_english':
                        try:
                            mapped[book][chapter][verse][l2 + '_ipa'] = [
                                oe(l) for l in lemmas
                            ]  #.split()
                        except:
                            pass
                    elif l2 == 'english':
                        try:
                            mapped[book][chapter][verse][l2 + '_ipa'] = [
                                oe(l) for l in lemmas
                            ]  #.split() #gen_mods.get_final(gen_mods.getIPA_CMU(f2_dict[book][chapter][verse]))
                        except:
                            pass
                    else:
                        mapped[book][chapter][verse][l2 + '_ipa'] = [
                            epi.transliterate(l) for l in lemmas
                        ]  #.split()

                    expand_voc = False
                    if l2 != 'Got' and expand_voc:
                        expand_voc_by_distance(voc, mapped, book, chapter,
                                               verse, l2)
                        #pdb.set_trace()

    return mapped
Example #17
    parser.add_option('--port',
                      action='store',
                      dest='port',
                      type='int',
                      default=8080,
                      help='Binding port for web service (default: 8080)')
    parser.add_option('--host',
                      action='store',
                      dest='host',
                      default='0.0.0.0',
                      help='Binding IP for server (default: 0.0.0.0)')
    parser.add_option(
        '--lang',
        action='append',
        dest='languages',
        default=[],
        help=
        'Preload language. You can use this param multiple times: --lang en --lang fr ... (default is just ["en"])'
    )

    (params, _) = parser.parse_args(sys.argv)

    if len(params.languages) == 0:
        params.languages = ['en']

    for lang in params.languages:
        lang2cube[lang] = Cube(verbose=True)
        lang2cube[lang].load(lang)

    app.run(port=params.port, host=params.host)
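The excerpt assumes an option parser, a web application object and a lang2cube cache defined earlier in the file; a hypothetical Flask-based setup consistent with it (endpoint and parameter names are illustrative) might be:

import sys
from optparse import OptionParser

from flask import Flask, jsonify, request
from cube.api import Cube

app = Flask(__name__)
lang2cube = {}
parser = OptionParser()


@app.route('/process')
def process():
    # Run the preloaded Cube model for the requested language over the text.
    lang = request.args.get('lang', 'en')
    text = request.args.get('text', '')
    sentences = lang2cube[lang](text)
    return jsonify({'tokens': [[entry.word for entry in sentence]
                               for sentence in sentences]})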
Example #18
    's', 's', 's', 's', 's', 's', 'm', 'm', 's', 's', 'm', 's', 'm', 's', 'm',
    's', 's', 'm', 'm', 'm', 's', 's', 's', 's', 's', 'm', 's', 's', 's', 's',
    's', 's', 's', 'm', 's', 's', 's', 's', 's', 's', 'm', 's', 'm', 's', 'm',
    's', 'x', 's', 's', 's', 's', 's', 's', 'm', 's', 's', 's', 's', 'm', 's',
    's', 's', 'm', 'm', 's', 's', 's', 's', 's', 's', 's', 's', 'm', 's', 'm',
    'm', 's', 's', 'm', 's', 's', 's', 'm', 'm', 's', 's', 'm', 's', 's', 'm',
    's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's',
    's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 's', 'm', 's', 's',
    's', 's', 's', 's', 's', 'm', 's', 's', 'm', 's', 's', 's', 's', 's', 's',
    'm', 's', 'm', 's', 's', 's', 's', 'm', 's', 's', 's', 'm', 'm', 's', 's',
    's', 'm'
]

from cube.api import Cube

cube = Cube(verbose=True)
cube.load("ro",
          tokenization=True,
          compound_word_expanding=False,
          tagging=True,
          lemmatization=True,
          parsing=True)
cube_no_tok = Cube(verbose=True)
cube_no_tok.load("ro",
                 tokenization=False,
                 compound_word_expanding=False,
                 tagging=True,
                 lemmatization=True,
                 parsing=True)
multi_sentence_count = 0
errors = 0
Example #19
        for LSentence in LSentences:
            for entry in LSentence:
                print(entry)
            print("")


if __name__ == '__main__':
    if True:
        found_nl = False
        for iso in CubeNLPPOS.get_L_supported_isos(None):
            if iso not in ('nno', 'nnb'):
                continue

            from cube.api import Cube  # import the Cube object

            cube = Cube(verbose=True)  # initialize it
            cube.load(DSupportedISOs[iso])

    print_pos(
        'id',
        'Tahap pertama konflik ini dapat disebut "Perang Kemerdekaan Belanda".'
    )
    print_pos(
        'en',
        'The first phase of the conflict can be considered the Dutch War of Independence.'
    )

    print_pos('id', 'Saya tidak dapat memakan ini.')
    print_pos('en', 'I can\'t eat this.')

    print_pos('zh', '猴子高兴,实验人员也高兴。')
Example #20
import tnkeeh as tn
from farasa.segmenter import FarasaSegmenter
from sacremoses import MosesTokenizer, MosesPunctNormalizer
from vncorenlp import VnCoreNLP

# for bulgarian and turkish
from cube.api import Cube

en_tok = MosesTokenizer(lang="en")
en_normalizer = MosesPunctNormalizer()
# TODO: change hardcoding of the jar file to an arg from the cli
rdrsegmenter = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar",
                         annotators="wseg",
                         max_heap_size="-Xmx500m")
ar_segmenter = FarasaSegmenter()

bg_cube = Cube(verbose=False)
bg_cube.load("bg")

tr_cube = Cube(verbose=False)
tr_cube.load("tr")


def clean_ar_text(
    text,
    segment=False,
    remove_special_chars=False,
    remove_english=False,
    normalize=False,
    remove_diacritics=False,
    excluded_chars=[],
    remove_tatweel=False,
Example #21
# implement NLP-Cube https://github.com/adobe/NLP-Cube for sent and word tokenization, and lemma
from cube.api import Cube  # import the Cube object
cube = Cube(verbose=False)  # initialize it
cube.load("en")
from rusenttokenize import ru_sent_tokenize

import re


class mytokenizer:
    def bracket_mask(self, text):

        text_temp = re.sub(r"\[", "<<", text)
        text_temp = re.sub(r"\]", ">>", text_temp)
        text_temp = re.sub(r"\+", "===", text_temp)
        parenthesis = re.findall(r"\([^()]*\)", text_temp)
        bracket = re.findall(r"\[[^\[\]]*\]", text)
        if not parenthesis and not bracket:
            return text

        for p in parenthesis:
            p = re.sub(r"[\(\)]", "", p)
            parts = [re.sub(r"\.\s*$", "", s) for s in ru_sent_tokenize(p)]
            new_p = " ; ".join(parts)
            #print ("P:", p)
            #print("new_p:",new_p,"\n")
            #print("text_temp:",text_temp)
            text_temp = re.sub(re.escape(p), new_p, text_temp)

        text = re.sub("<<", "[", text_temp)
        text = re.sub(">>", "]", text)
Example #22
# filter
import os
import sys
import csv
import json
import datetime

import luigi

from filter.filter import filter_spam

# lemmatizer
from lemmatizer.lemmatizer import lemmatize
from cube.api import Cube

# detection
sys.path.insert(0, './detector/')
from detect_events import main as detect_events

lemmatizer = Cube(verbose=True)
lemmatizer.load("es", tokenization=False, parsing=False)


class Streamer(luigi.ExternalTask):
    time_slice = luigi.parameter.DateMinuteParameter(interval=30)

    def output(self):
        fname = '../data/streaming/{}.csv'.format(self.time_slice)
        # print('Requires: {}'.format(fname))
        return luigi.LocalTarget(fname)


class Preprocess(luigi.Task):
    time_slice = luigi.parameter.DateMinuteParameter(
        interval=30, default=datetime.datetime.today())
Example #23
from cube.api import Cube

cube = Cube(verbose=True)
cube.load(
    'ro', tokenization=True,
    compound_word_expanding=False,
    tagging=True,
    lemmatization=True,
    parsing=True
)
Example #24
    def load_model(self):
        if self.lib.lower() == "stanford":
            print(
                "-----------You are going to use Stanford library-----------")
            if self.lang.lower() == "basque":
                print(
                    "-------------You are going to use Basque model-------------"
                )
                # config = {'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
                #            'lang': 'eu',  # Language code for the language to build the Pipeline in
                #            'tokenize_model_path': '/home/kepa/eu/eu_bdt_models/eu_bdt_tokenizer.pt',
                #            # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                #            'pos_model_path': '/home/kepa/eu/eu_bdt_models/eu_bdt_tagger.pt',
                #            'pos_pretrain_path': '/home/kepa/eu/eu_bdt_models/eu_bdt.pretrain.pt',
                #            'lemma_model_path': '/home/kepa/eu/eu_bdt_models/eu_bdt_lemmatizer.pt',
                #            'depparse_model_path': '/home/kepa/eu/eu_bdt_models/eu_bdt_parser.pt',
                #            'depparse_pretrain_path': '/home/kepa/eu/eu_bdt_models/eu_bdt.pretrain.pt'
                #            }
                config = {
                    'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
                    'lang': 'eu',  # Language code for the language to build the Pipeline in
                    # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                    'tokenize_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tokenizer.pt',
                    'pos_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_tagger.pt',
                    'pos_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt',
                    'lemma_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_lemmatizer.pt',
                    'depparse_model_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt_parser.pt',
                    'depparse_pretrain_path': r'J:\TextSimilarity\eu\eu_bdt_models\eu_bdt.pretrain.pt'
                }
                self.parser = stanfordnlp.Pipeline(**config)

            elif self.lang.lower() == "english":
                print(
                    "-------------You are going to use English model-------------"
                )
                config = {
                    'processors': 'tokenize,mwt,pos,lemma,depparse',  # Comma-separated list of processors to use
                    'lang': 'en',  # Language code for the language to build the Pipeline in
                    # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                    'tokenize_model_path': '/home/kepa/en/en_ewt_models/en_ewt_tokenizer.pt',
                    #'mwt_model_path': './fr_gsd_models/fr_gsd_mwt_expander.pt',
                    'pos_model_path': '/home/kepa/en/en_ewt_models/en_ewt_tagger.pt',
                    'pos_pretrain_path': '/home/kepa/en/en_ewt_models/en_ewt.pretrain.pt',
                    'lemma_model_path': '/home/kepa/en/en_ewt_models/en_ewt_lemmatizer.pt',
                    'depparse_model_path': '/home/kepa/en/en_ewt_models/en_ewt_parser.pt',
                    'depparse_pretrain_path': '/home/kepa/en/en_ewt_models/en_ewt.pretrain.pt'
                }
                self.parser = stanfordnlp.Pipeline(**config)
            elif self.lang.lower() == "spanish":
                print(
                    "-------------You are going to use Spanish model-------------"
                )
                config = {
                    'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
                    'lang': 'es',  # Language code for the language to build the Pipeline in
                    # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
                    'tokenize_model_path': '/home/kepa/es/es_ancora_models/es_ancora_tokenizer.pt',
                    'pos_model_path': '/home/kepa/es/es_ancora_models/es_ancora_tagger.pt',
                    'pos_pretrain_path': '/home/kepa/es/es_ancora_models/es_ancora.pretrain.pt',
                    'lemma_model_path': '/home/kepa/es/es_ancora_models/es_ancora_lemmatizer.pt',
                    'depparse_model_path': '/home/kepa/es/es_ancora_models/es_ancora_parser.pt',
                    'depparse_pretrain_path': '/home/kepa/es/es_ancora_models/es_ancora.pretrain.pt'
                }
                self.parser = stanfordnlp.Pipeline(**config)
            else:
                print("........You cannot use this language...........")
        elif self.lib.lower() == "cube":
            print("-----------You are going to use Cube Library-----------")
            if self.lang.lower() == "basque":
                # load(self, language_code, version="latest", local_models_repository=None,
                #      local_embeddings_file=None, tokenization=True, compound_word_expanding=False,
                #      tagging=True, lemmatization=True, parsing=True)
                # Example: load("es", tokenization=False, parsing=False)
                cube = Cube(verbose=True)
                cube.load("eu", "latest")
                self.parser = cube
            elif self.lang.lower() == "english":
                cube = Cube(verbose=True)
                cube.load("en", "latest")
                self.parser = cube
            elif self.lang.lower() == "spanish":
                cube = Cube(verbose=True)
                cube.load("es", "latest")
                self.parser = cube
            else:
                print("........You cannot use this language...........")
        else:
            print(
                "You cannot use this library. Introduce a valid library (Cube or Stanford)"
            )
Example #25
def create_pickle():
    cube = Cube(verbose=True)
    cube.load("ja")
    with open('cube.pickle', mode='wb') as wh:
        pickle.dump(cube, wh)
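Assuming the pickled instance deserializes cleanly in the same environment, reading it back is the mirror image:

import pickle

with open('cube.pickle', mode='rb') as rh:
    cube = pickle.load(rh)

sentences = cube("猫が寝ている。")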