Example #1
import spacy_udpipe
from nltk import Tree
from nltk.draw.tree import TreeView


def nltk_spacy_tree(text):
    """
    Visualize the spaCy dependency tree with nltk.tree
    """
    nlp = spacy_udpipe.load_from_path(
        lang="ru",
        path="../resource/trainModel/russian-syntagrus-ud-2.5-191206.udpipe",
        meta={"description": "Custom 'ru' model"})
    doc = nlp(text)

    def token_format(token):
        # return "_".join([token.orth_, token.tag_, token.dep_])
        return "_".join([token.orth_, token.pos_])

    def to_nltk_tree(node):
        # Tokens with children become Tree nodes; childless tokens are leaf strings
        if node.n_lefts + node.n_rights > 0:
            return Tree(token_format(node),
                        [to_nltk_tree(child) for child in node.children])
            # return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
        else:
            return token_format(node)

    tree = [to_nltk_tree(sent.root) for sent in doc.sents]
    # The first element in the list is the full tree
    # tree[0].draw()
    # ttt = TreeView(tree[0])._cframe.print_to_file('output.ps')
    ttt = TreeView(tree[0])._cframe
    # return the path
    print()
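A minimal call sketch for the helper above (not part of the original snippet); it assumes the relative .udpipe path hard-coded in the function exists on disk:

if __name__ == "__main__":
    # Builds the NLTK tree for the first sentence of a sample Russian text
    nltk_spacy_tree("Мама мыла раму.")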
Example #2
def main():
    textOriginalList = [textOriginal3, textOriginal2]
    # Convert to Text objects
    textsList = []
    for textOriginal in textOriginalList:
        textsList.append(Text(textOriginal))
    text_standart = Text(textOriginal1)
    # Model for syntactic analysis
    nlp = spacy_udpipe.load_from_path(
        lang="ru",
        path=trainTextUdpipe,
        meta={"description": "Custom 'ru' model"})

    for text in textsList:
        # Get the syntactic model (parsed Doc)
        text.doc = nlp(' '.join(text.tokenz))
        # Get the lemmas
        text.lemma_text = text.get_lemma_list(text.doc)

    text_standart.doc = nlp(' '.join(text_standart.tokenz))
    text_standart.lemma_text = text_standart.get_lemma_list(text_standart.doc)

    distance_metrics_Jaccard(text_standart, textsList)
    cosine_similarity(text_standart, textsList)
    print("!!!!")
Example #3
def setup_udpipe_tokenizer(lang, path_model):
    assert path_model is not None
    if verbose:
        logger.info(f'Load UDPipe model: {path_model}')
    nlp = spacy_udpipe.load_from_path(lang, path_model)
    tokenize = lambda w: [token.text for token in nlp(w)]
    return tokenize
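A hedged usage sketch for the tokenizer factory above; the model filename is a placeholder, and the verbose and logger globals are assumed to be defined elsewhere in the original script:

tokenize = setup_udpipe_tokenizer("en", "english-ewt-ud-2.5-191206.udpipe")
print(tokenize("UDPipe tokenizes raw text."))  # e.g. ['UDPipe', 'tokenizes', 'raw', 'text', '.']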
Example #4
import spacy
import spacy_udpipe
from spacy.cli import download


def load_spacy(model_str):
    # Try an installed spaCy model first, then try downloading it,
    # and finally fall back to treating the string as a UDPipe model path.
    try:
        return spacy.load(model_str)
    except Exception:
        try:
            download(model_str)
            return spacy.load(model_str)
        except Exception:
            return spacy_udpipe.load_from_path(lang='cs', path=model_str)
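A usage sketch for the fallback loader above; both argument values are placeholders, not from the original code:

nlp = load_spacy("en_core_web_sm")  # an installed (or downloadable) spaCy package
# nlp = load_spacy("models/czech-pdt-ud-2.5-191206.udpipe")  # falls back to spacy_udpipe with lang='cs'
doc = nlp("A short test sentence.")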
Example #5
    def get_ud_parser(lang: str):
        if lang == "en":
            nlp = spacy_udpipe.load_from_path(lang="en",
                                              path="C:/Users/t-ofarvi/Desktop/HUJI/relation_extraction/udpipe_models/english-ewt-ud-2.5-191206.udpipe",
                                              meta={"description": "Custom 'en' model"})
        elif lang == "ru":
            nlp = spacy_udpipe.load_from_path(lang="ru",
                                              path="C:/Users/t-ofarvi/Desktop/HUJI/relation_extraction/udpipe_models/russian-syntagrus-ud-2.5-191206.udpipe",
                                              meta={"description": "Custom 'ru' model"})
        elif lang == "ko":
            nlp = spacy_udpipe.load_from_path(lang="ko",
                                              path="C:/Users/t-ofarvi/Desktop/HUJI/relation_extraction/udpipe_models/korean-gsd-ud-2.5-191206.udpipe",
                                              meta={"description": "Custom 'ko' model"})
        else:
            raise ValueError(f"Unsupported language: {lang}")

        conll_formatter = ConllFormatter(nlp)
        nlp.add_pipe(conll_formatter)

        return nlp
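A hedged usage sketch, calling the helper above as a plain function; it requires the machine-specific Windows model paths hard-coded in the function, and relies on spacy_conll's ConllFormatter exposing the parse as a CoNLL-U string via the conll_str extension:

nlp = get_ud_parser("en")
doc = nlp("UDPipe parses this sentence.")
print(doc._.conll_str)  # CoNLL-U lines produced by the ConllFormatter component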
Example #6
    def __init__(self, remaining_arguments):
        super().__init__(remaining_arguments)
        parser = argparse.ArgumentParser(
            description=
            'Subprogram for writing data files that will be processed '
            'by the HIT-SCIR UCCA parser.')
        parser.add_argument(
            '--model',
            type=str,
            required=True,
            help=
            'Path to the UDPipe model to use for processing the given text.')
        args, _ = parser.parse_known_args(remaining_arguments)

        self._ud_model: UDPipeLanguage = spacy_udpipe.load_from_path(
            'en', args.model)
Example #7
def main(args):
    global verbose
    verbose = args.verbose

    df = pd.read_table(args.path_input,
                       names=['synset', 'type', 'surface'],
                       comment='#')
    df.dropna(inplace=True)
    if verbose:
        logger.info('Read {} lines from {}'.format(len(df), args.path_input))
    words = sorted(df['surface'].unique())

    if verbose:
        logger.info('# of words: ' + str(len(words)))

    # Set up a UDPipe model
    if verbose:
        logger.info(f'Load UDPipe model: {args.path_model}')
    nlp = spacy_udpipe.load_from_path(args.lang, args.path_model)

    counter = defaultdict(int)
    for word in tqdm(words):
        buff = []
        for token in nlp(word):
            form, lemma, pos, tag = token.text, token.lemma_, token.pos_, token.tag_
            if not isinstance(lemma, str) or len(lemma) == 0:
                lemma = form
            if form in ['~', '…']:  # skip symbols
                continue
            buff.append('@@@'.join([form, lemma.lower(), f'{pos}-{tag}']))

        if len(buff) == 1:
            continue
        counter['\t'.join(buff)] += 1

    if verbose:
        logger.info('Write {} entries to {}'.format(len(counter),
                                                    args.path_output))
    write_mwe_json(args.path_output, counter, source='EOMW')

    return 0
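The snippet only shows main(); a plausible driver matching the attributes it reads (path_input, path_model, lang, path_output, verbose) could look like the following, with all flag names assumed rather than taken from the original file:

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--path_input', required=True)
    parser.add_argument('--path_output', required=True)
    parser.add_argument('--path_model', required=True)
    parser.add_argument('--lang', default='en')
    parser.add_argument('-v', '--verbose', action='store_true')
    main(parser.parse_args())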
Example #8
    def __init__(self, path_file, trainTextUdpipe):
        # Model for syntactic analysis
        self.nlp = spacy_udpipe.load_from_path(
            lang="ru",
            path=trainTextUdpipe,
            meta={"description": "Custom 'ru' model"})
        self.text_path: str = path_file
        self.tokenz: list = self.graphematic()
        # Get the syntactic model (parsed Doc)
        self.doc: Doc = self.nlp(' '.join(self.tokenz))
        # Get the lemmas
        self.lemma_text: list = self.get_lemma_list()
        # Get the frequency distribution
        self.freq_dist: dict = dict()
        # Get the probability matrix
        self.matrix = None
        self.entropy: float = 0.0
        self.entropy2: float = 0.0
        self.entropy3: float = 0.0
        self.CT = None
        self.p: float = 1.0
        self.jaccard_coeff: float = 0.0
        self.cos_sim: float = 1.0
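A hedged instantiation sketch; the enclosing class is not shown in the snippet (Text is assumed here), and both constructor arguments are placeholder paths:

text = Text("corpus/sample.txt", "models/russian-syntagrus-ud-2.5-191206.udpipe")
print(text.lemma_text[:10])  # first ten lemmas of the parsed document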
Example #9
import re
import os
import stanza
import spacy_udpipe

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

EXTERNAL_DIR = ROOT_DIR+'/external_data'
STANZA_MODEL_HY = '/tmp/intrinsic_analysis/essential_models/'
UDPIPE_MODEL_HY = '/tmp/intrinsic_analysis/essential_models/armenian-armtdp-ud-2.5-191206.udpipe'

nlp_udpipe = spacy_udpipe.load_from_path(lang='hy', path=UDPIPE_MODEL_HY,
                                         meta={"description": "Custom hy model"})

nlp_stanza = stanza.Pipeline(use_gpu=False, lang='hy', dir=STANZA_MODEL_HY,
                             processors='tokenize, mwt, pos, lemma, depparse')


def lemmatizer(text: str):
    doc = nlp_stanza(text)
    return [word.lemma for sentence in doc.sentences for word in sentence.words]


def pos_tagger(text: str):
    doc = nlp_stanza(text)
    return [word.pos for sentence in doc.sentences for word in sentence.words]


def word_tokenize(text: str, remove_punctuation=False):
    text = remove_punct(text) if remove_punctuation else text
    doc = nlp_udpipe(text)
    return [token.text for token in doc]
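# A hedged usage sketch of word_tokenize (added for illustration); it assumes the
# Armenian UDPipe model is actually present at UDPIPE_MODEL_HY.
# print(word_tokenize("Բարեւ, աշխարհ"))  # e.g. ['Բարեւ', ',', 'աշխարհ']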


def remove_stop_words_cs(text: str) -> str:
    stop_words = set(get_stop_words('czech'))
    word_tokens = word_tokenize(text)

    filtered_sentence = []

    for w in word_tokens:
        if w not in stop_words and len(w) > 3:
            filtered_sentence.append(w)

    return " ".join(filtered_sentence)


ex = Extract(PATH)
# pprint(ex.extract())
print("=" * 100)
text = remove_all_except_letter_dot(ex.extract())
text = remove_stop_words_cs(text=text)

nlp = spacy_udpipe.load_from_path(lang="cs",
                                  path=PATH_MODELS,
                                  meta={"description": "Custom 'hr' model"})

nlp.add_pipe(TermExtractionPipeline())

doc = nlp(text)
pprint(list(dict(doc._.combo_basic.sort_values(ascending=False)).keys()))