from nltk import Tree
from nltk.draw.tree import TreeView
import spacy_udpipe


def nltk_spacy_tree(text):
    """Visualize the spaCy dependency tree with nltk.tree."""
    nlp = spacy_udpipe.load_from_path(
        lang="ru",
        path="../resource/trainModel/russian-syntagrus-ud-2.5-191206.udpipe",
        meta={"description": "Custom 'ru' model"})
    doc = nlp(text)

    def token_format(token):
        # return "_".join([token.orth_, token.tag_, token.dep_])
        return "_".join([token.orth_, token.pos_])

    def to_nltk_tree(node):
        if node.n_lefts + node.n_rights > 0:
            return Tree(token_format(node),
                        [to_nltk_tree(child) for child in node.children])
            # return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
        else:
            return token_format(node)

    tree = [to_nltk_tree(sent.root) for sent in doc.sents]
    # The first element of the list is the full tree
    # tree[0].draw()
    # ttt = TreeView(tree[0])._cframe.print_to_file('output.ps')
    ttt = TreeView(tree[0])._cframe  # return the path
    print()
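# Usage sketch (assumption, not from the original source): requires the Russian UDPipe
# model at the relative path above and a display for nltk's Tk-based TreeView;
# the sample sentence is arbitrary.
nltk_spacy_tree("Мама мыла раму, а папа читал газету.")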
def main():
    textOriginalList = [textOriginal3, textOriginal2]
    # Convert to Text objects
    textsList = []
    for textOriginal in textOriginalList:
        textsList.append(Text(textOriginal))
    text_standart = Text(textOriginal1)

    # Model for syntactic analysis
    nlp = spacy_udpipe.load_from_path(
        lang="ru",
        path=trainTextUdpipe,
        meta={"description": "Custom 'ru' model"})

    for text in textsList:
        # Build the syntactic parse
        text.doc = nlp(' '.join(text.tokenz))
        # Extract the lemmas
        text.lemma_text = text.get_lemma_list(text.doc)

    text_standart.doc = nlp(' '.join(text_standart.tokenz))
    text_standart.lemma_text = text_standart.get_lemma_list(text_standart.doc)

    distance_metrics_Jaccard(text_standart, textsList)
    cosine_similarity(text_standart, textsList)
    print("!!!!")
def setup_udpipe_tokenizer(lang, path_model):
    assert path_model is not None
    if verbose:
        logger.info(f'Load UDPipe model: {path_model}')
    nlp = spacy_udpipe.load_from_path(lang, path_model)
    tokenize = lambda w: [token.text for token in nlp(w)]
    return tokenize
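# Usage sketch (assumption, not from the original source): the model path is a
# placeholder, and `verbose`/`logger` are module-level globals in the original file.
verbose = False
tokenize = setup_udpipe_tokenizer("cs", "models/czech-pdt-ud-2.5-191206.udpipe")
print(tokenize("Příliš žluťoučký kůň úpěl ďábelské ódy."))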
import spacy
import spacy_udpipe
from spacy.cli import download


def load_spacy(model_str):
    # Try an installed spaCy model first, then attempt to download it,
    # and finally fall back to loading a UDPipe model from disk.
    try:
        return spacy.load(model_str)
    except Exception:
        try:
            download(model_str)
            return spacy.load(model_str)
        except Exception:
            return spacy_udpipe.load_from_path(lang='cs', path=model_str)
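# Usage sketch (assumption, not from the original source): the model string may be an
# installable spaCy package name or, as here, a hypothetical path to a local Czech
# .udpipe file that triggers the UDPipe fallback.
nlp = load_spacy("models/czech-pdt-ud-2.5-191206.udpipe")
doc = nlp("Toto je krátká testovací věta.")
print([token.text for token in doc])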
def get_ud_parser(lang: str):
    if lang == "en":
        nlp = spacy_udpipe.load_from_path(
            lang="en",
            path="C:/Users/t-ofarvi/Desktop/HUJI/relation_extraction/udpipe_models/english-ewt-ud-2.5-191206.udpipe",
            meta={"description": "Custom 'en' model"})
    elif lang == "ru":
        nlp = spacy_udpipe.load_from_path(
            lang="ru",
            path="C:/Users/t-ofarvi/Desktop/HUJI/relation_extraction/udpipe_models/russian-syntagrus-ud-2.5-191206.udpipe",
            meta={"description": "Custom 'ru' model"})
    elif lang == "ko":
        nlp = spacy_udpipe.load_from_path(
            lang="ko",
            path="C:/Users/t-ofarvi/Desktop/HUJI/relation_extraction/udpipe_models/korean-gsd-ud-2.5-191206.udpipe",
            meta={"description": "Custom 'ko' model"})
    else:
        raise ValueError(f"Unsupported language: {lang}")

    conll_formatter = ConllFormatter(nlp)
    nlp.add_pipe(conll_formatter)
    return nlp
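# Usage sketch (assumption, not from the original source): requires the hard-coded
# English model path above to exist; spacy_conll's ConllFormatter exposes the parse
# as a CoNLL-U string through the doc._.conll_str extension.
nlp = get_ud_parser("en")
doc = nlp("The quick brown fox jumps over the lazy dog.")
print(doc._.conll_str)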
def __init__(self, remaining_arguments):
    super().__init__(remaining_arguments)
    parser = argparse.ArgumentParser(
        description='Subprogram for writing data files that will be processed '
                    'by the HIT-SCIR UCCA parser.')
    parser.add_argument(
        '--model',
        type=str,
        required=True,
        help='Path to the UDPipe model to use for processing the given text.')
    args, _ = parser.parse_known_args(remaining_arguments)
    self._ud_model: UDPipeLanguage = spacy_udpipe.load_from_path('en', args.model)
def main(args):
    global verbose
    verbose = args.verbose

    df = pd.read_table(args.path_input, names=['synset', 'type', 'surface'], comment='#')
    df.dropna(inplace=True)
    if verbose:
        logger.info('Read {} lines from {}'.format(len(df), args.path_input))

    words = sorted(df['surface'].unique())
    if verbose:
        logger.info('# of words: ' + str(len(words)))

    # Set up a UDPipe model
    if verbose:
        logger.info(f'Load UDPipe model: {args.path_model}')
    nlp = spacy_udpipe.load_from_path(args.lang, args.path_model)

    counter = defaultdict(int)
    for word in tqdm(words):
        buff = []
        for token in nlp(word):
            form, lemma, pos, tag = token.text, token.lemma_, token.pos_, token.tag_
            if not isinstance(lemma, str) or len(lemma) == 0:
                lemma = form
            if form in ['~', '…']:  # skip symbols
                continue
            buff.append('@@@'.join([form, lemma.lower(), f'{pos}-{tag}']))
        if len(buff) == 1:
            continue
        counter['\t'.join(buff)] += 1

    if verbose:
        logger.info('Write {} entries to {}'.format(len(counter), args.path_output))
    write_mwe_json(args.path_output, counter, source='EOMW')
    return 0
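# Usage sketch (assumption, not from the original source): the argument names mirror
# the attributes main() reads; the paths and language are hypothetical placeholders.
import argparse
args = argparse.Namespace(
    verbose=True,
    lang='en',
    path_input='data/eomw.tsv',
    path_model='models/english-ewt-ud-2.5-191206.udpipe',
    path_output='data/eomw_mwe.json')
main(args)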
def __init__(self, path_file, trainTextUdpipe):
    # Model for syntactic analysis
    self.nlp = spacy_udpipe.load_from_path(
        lang="ru",
        path=trainTextUdpipe,
        meta={"description": "Custom 'ru' model"})
    self.text_path: str = path_file
    self.tokenz: list = self.graphematic()
    # Build the syntactic parse
    self.doc: Doc = self.nlp(' '.join(self.tokenz))
    # Extract the lemmas
    self.lemma_text: list = self.get_lemma_list()
    # Frequency distribution
    self.freq_dist: dict = dict()
    # Probability matrix
    self.matrix = None
    self.entropy: float = 0.0
    self.entropy2: float = 0.0
    self.entropy3: float = 0.0
    self.CT = None
    self.p: float = 1.0
    self.jaccard_coeff: float = 0.0
    self.cos_sim: float = 1.0
import re
import os

import stanza
import spacy_udpipe

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
EXTERNAL_DIR = ROOT_DIR + '/external_data'
STANZA_MODEL_HY = '/tmp/intrinsic_analysis/essential_models/'
UDPIPE_MODEL_HY = '/tmp/intrinsic_analysis/essential_models/armenian-armtdp-ud-2.5-191206.udpipe'

nlp_udpipe = spacy_udpipe.load_from_path(lang='hy',
                                         path=UDPIPE_MODEL_HY,
                                         meta={"description": "Custom hy model"})
nlp_stanza = stanza.Pipeline(use_gpu=False,
                             lang='hy',
                             dir=STANZA_MODEL_HY,
                             processors='tokenize,mwt,pos,lemma,depparse')


def lemmatizer(text: str):
    doc = nlp_stanza(text)
    return [word.lemma for sentence in doc.sentences for word in sentence.words]


def pos_tagger(text: str):
    doc = nlp_stanza(text)
    return [word.pos for sentence in doc.sentences for word in sentence.words]


def word_tokenize(text: str, remove_punctuation=False):
    text = remove_punct(text) if remove_punctuation else text
    doc = nlp_udpipe(text)
def remove_stop_words_cs(text: str) -> str:
    stop_words = set(get_stop_words('czech'))
    word_tokens = word_tokenize(text)
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words and len(w) > 3:
            filtered_sentence.append(w)
    return " ".join(filtered_sentence)


ex = Extract(PATH)
# pprint(ex.extract())
print("=" * 100)
text = remove_all_except_letter_dot(ex.extract())
text = remove_stop_words_cs(text=text)

nlp = spacy_udpipe.load_from_path(lang="cs",
                                  path=PATH_MODELS,
                                  meta={"description": "Custom 'cs' model"})
nlp.add_pipe(TermExtractionPipeline())
doc = nlp(text)
pprint(list(dict(doc._.combo_basic.sort_values(ascending=False)).keys()))