Example No. 1
 def __init__(self, address_morph, address_syntax, address_rst):
     self._ppl = PipelineCommon([
         (ProcessorRemote(address_morph[0], address_morph[1],
                          'default'), ['text'], {
                              'sentences': 'sentences',
                              'tokens': 'tokens',
                              'postag': 'postag',
                              'lemma': 'lemma'
                          }),
         (ConverterMystemToUd(), ['postag'], {
             'morph': 'morph',
             'postag': 'postag'
         }),
         (ProcessorRemote(address_syntax[0], address_syntax[1],
                          '0'), ['tokens', 'sentences'], {
                              'syntax_dep_tree': 'syntax_dep_tree',
                              'postag': 'ud_postag'
                          }),
         (ProcessorRemote(address_rst[0], address_rst[1], 'default'), [
             'text', 'tokens', 'sentences', 'lemma', 'morph', 'postag',
             'syntax_dep_tree'
         ], {
             'rst': 'rst'
         })
     ])
     self._name = 'default'
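
All of these examples follow the same PipelineCommon convention: each step is a (processor, input_keys, output_map) tuple, where input_keys names the annotations fed to the processor and output_map renames the processor's results inside the shared annotation dict (integer keys such as 0 map positional results). A minimal sketch of how such a pipeline is built and called; the host, port, and text are placeholders, not values from the example above:

from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote

ppl = PipelineCommon([
    (ProcessorRemote('localhost', 3333, 'default'),    # processor
     ['text'],                                          # annotations fed in
     {'tokens': 'tokens', 'sentences': 'sentences'})    # processor output -> result key
])
annotations = ppl('Пример текста.')  # dict with 'tokens' and 'sentences' keys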
Example No. 2
class PipelineDefault:
    def __init__(self, address_morph, address_syntax, address_srl):
        self._ppl = PipelineCommon([
            (ProcessorRemote(address_morph[0], address_morph[1],
                             'default'), ['text'], {
                                 'tokens': 'tokens',
                                 'sentences': 'sentences',
                                 'postag': 'mystem_postag',
                                 'lemma': 'lemma'
                             }),
            (ProcessorSyntaxNetRemote(address_syntax[0], address_syntax[1]),
             ['tokens', 'sentences'], {
                 'syntax_dep_tree': 'syntax_dep_tree'
             }),
            (ConverterMystemToUd(), ['mystem_postag'], {
                'morph': 'morph',
                'postag': 'postag'
            }),
            (ProcessorRemote(address_srl[0], address_srl[1], 'default'),
             ['tokens', 'postag', 'morph', 'lemma', 'syntax_dep_tree'], {
                 'srl': 'srl'
             })
        ])
        self._name = 'default'

    def __call__(self, *args, **kwargs):
        return self._ppl(*args, **kwargs)

    def get_processors(self):
        return self._ppl.get_processors()
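
A hypothetical way to wire this class up; the (host, port) pairs are placeholders and assume running morphology, syntax, and SRL services:

ppl = PipelineDefault(('localhost', 3333), ('localhost', 3334), ('localhost', 3335))
annotations = ppl('Мама мыла раму.')  # __call__ delegates to the inner PipelineCommon
print(annotations['srl'])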
Example No. 3
 def __init__(self, basic_processor=('vmh1.isa.ru', 3344), udpipe_processor=('vmh1.isa.ru', 3355)):
     self.ppl = WrapperMultiProcessDocument([
                 PipelineCommon([
                     (
                         ProcessorRemote(basic_processor[0], basic_processor[1], 'default'),
                         ['text'],
                         {
                             'sentences' : 'sentences', 
                             'tokens' : 'tokens',
                             'postag' : 'mystem_postags',
                             'lemma' : 'lemma'
                         }
                     ),
                     (
                         ProcessorRemote(udpipe_processor[0], udpipe_processor[1], '0'), 
                         ['tokens', 'sentences'], 
                         {
                             'syntax_dep_tree' : 'syntax_dep_tree'
                         }
                     ),
                     (
                         ConverterMystemToUd(),
                         ['mystem_postags'],
                         {
                             'morph' : 'postag',
                         }
                     )
                 ])
             ])
Example No. 4
def get_tree(text):
	from isanlp import PipelineCommon
	from isanlp.processor_remote import ProcessorRemote
	from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd
	from Parser.some_reparser import extract_semantic_relations
	HOST = 'localhost'
	proc_morph = ProcessorRemote(HOST, 3333, 'default')
	proc_syntax = ProcessorRemote(HOST, 3334, '0')

	syntax_ppl = PipelineCommon([
		(proc_morph,
			['text'],
			{'tokens' : 'tokens', 'sentences' : 'sentences', 'postag' : 'postag', 'lemma' : 'lemma'}),
		(proc_syntax,
			['tokens','sentences'],
			{'syntax_dep_tree' : 'syntax_dep_tree'}),
		(ConverterMystemToUd(),
			['postag'],
			{'postag' : 'postag', 'morph' : 'morph'})
		])
	try:
		analysis_res = syntax_ppl(text)
	except Exception:
		return None
	sentences = []
	for i in analysis_res['sentences']:
		sentence = []
		for j in range(i.begin, i.end):
			sentence.append(analysis_res['tokens'][j].text)
		sentences.append(sentence)
	vertices_list_list = []
	relations = extract_semantic_relations(text)
	for j in range(len(analysis_res['lemma'])):
		vertices_list = []
		# 'tokens' is a flat, document-level list, while 'lemma' is per-sentence,
		# so token lookups must be offset by the sentence start.
		sent_begin = analysis_res['sentences'][j].begin
		for i in range(len(analysis_res['lemma'][j])):
			token = analysis_res['tokens'][sent_begin + i]
			start, end = token.begin, token.end
			role_vert = []
			for rel in relations:
				if rel['child']['start'] == start and rel['child']['end'] == end:
					role_vert.append(rel['tp'])
			vert = tree(word(analysis_res['lemma'][j][i],
					analysis_res['postag'][j][i],
					analysis_res['morph'][j][i],
					start, end,
					i,
					role=role_vert))
			vertices_list.append(vert)
		vertices_list_list.append(vertices_list)
	root_list = []
	for i in range(len(vertices_list_list)):
		list_ = vertices_list_list[i]
		for j in range(len(analysis_res['syntax_dep_tree'][i])):
			node = analysis_res['syntax_dep_tree'][i][j]
			if node.parent != -1:
				list_[node.parent].add_child(list_[j], node.link_name)
			else:
				list_[j].sentence = sentences[i]
				root_list.append(list_[j])
	return root_list
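
A hypothetical call; it assumes the morphology (port 3333) and syntax (port 3334) services are running locally, since get_tree returns None on any pipeline failure:

roots = get_tree('Мама мыла раму.')
if roots is not None:
    for root in roots:
        print(root.sentence)  # each root carries its sentence's token list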
Example No. 5
def create_pipeline(delay_init=False):
    return PipelineCommon(
        [(ProcessorGramEval2020('models/ru_bert_final_model'),
          ['tokens', 'sentences'], {
              'lemma': 'lemma',
              'postag': 'postag',
              'morph': 'morph',
              'syntax_dep_tree': 'syntax_dep_tree'
          })],
        name='default')
Example No. 6
def create_pipeline(delay_init=False):
    return PipelineCommon([(ProcessorUDPipe('/src/parser_UDPIPE/russian-ud-2.0-170801.udpipe'),
                            ['text'],  # UDPipe consumes raw text and produces the full analysis
                            {'tokens': 'tokens',
                             'sentences': 'sentences',
                             'lemma': 'lemma',
                             'postag': 'postag',
                             'morph': 'morph',
                             'syntax_dep_tree': 'syntax_dep_tree'})],
                           name='default')
Example No. 7
 def __init__(
     self,
     udpipe=("tsa05.isa.ru", 3334),
     rst=("papertext.ru", 5555),
     cache_path="./rst-cache.pkl",
 ):
     udpipe_host, udpipe_port = udpipe
     rst_host, rst_port = rst
     self.cache_path = cache_path
     self.ppl = PipelineCommon(
         [
             (
                 ProcessorRemote(udpipe_host, udpipe_port, "0"),
                 ["text"],
                 {
                     "sentences": "sentences",
                     "tokens": "tokens",
                     "lemma": "lemma",
                     "syntax_dep_tree": "syntax_dep_tree",
                     "postag": "ud_postag",
                 },
             ),
             (
                 ProcessorMystem(delay_init=False),
                 ["tokens", "sentences"],
                 {"postag": "postag"},
             ),
             (
                 ConverterMystemToUd(),
                 ["postag"],
                 {"morph": "morph", "postag": "postag"},
             ),
             (
                 ProcessorRemote(rst_host, rst_port, "default"),
                 [
                     "text",
                     "tokens",
                     "sentences",
                     "postag",
                     "morph",
                     "lemma",
                     "syntax_dep_tree",
                 ],
                 {"clauses": "clauses"},
             ),
         ]
     )
     self.__cache = {}
     self.__hasher = city_32()  # presumably pyhash.city_32(), used to hash texts into cache keys
     if os.path.exists(self.cache_path):
         self.__cache = jb.load(self.cache_path)  # jb: presumably joblib
Example No. 8
def create_pipeline(delay_init):
    pipeline_default = PipelineCommon(
        [(ProcessorRST(model_dir_path='/models',
                       segmenter_type='lstm',
                       span_predictor_type='ensemble',
                       label_predictor_type='ensemble'), [
                           'text', 'tokens', 'sentences', 'lemma', 'morph',
                           'postag', 'syntax_dep_tree'
                       ], {
                           0: 'rst'
                       })],
        name='default')

    return pipeline_default
Example No. 9
 def __init__(self, ud_model_path: str, lru_cache_size: int = 10000):
     # ud_model_path points at a UDPipe model,
     # e.g. './data/models/russian-syntagrus-ud-2.5-191206.udpipe'
     self.pipeline = PipelineCommon([(ProcessorUDPipe(ud_model_path),
                                      ['text'],
                                      {'tokens': 'tokens',
                                       'lemma': 'lemma',
                                       'postag': 'postag',
                                       'morph': 'morph',
                                       'syntax_dep_tree': 'syntax_dep_tree'})])
     self.predicate_extractor = PredicateExtractor()
     self.argument_extractor = ArgumentExtractor()
     self.call_pipeline = lru_cache(lru_cache_size)(self.pipeline.__call__)
Example No. 10
def create_pipeline(delay_init):
    model_path = '/src/bert-base-srl-2019.06.17.tar.gz'

    tokenizer = ProcessorTokenizerNltkEn()
    splitter = ProcessorSentenceSplitter()
    srl_proc = ProcessorSrlAllennlp(model_path)

    pipeline_default = PipelineCommon([(tokenizer, ['text'], {
        0: 'tokens'
    }), (splitter, ['tokens'], {
        0: 'sentences'
    }), (srl_proc, ['tokens', 'sentences'], {
        0: 'srl'
    })],
                                      name='default')

    return pipeline_default
Example No. 11
def get_tree(text):
    HOST = 'localhost'
    proc_morph = ProcessorRemote(HOST, 3333, 'default')
    proc_syntax = ProcessorRemote(HOST, 3334, '0')

    syntax_ppl = PipelineCommon([(proc_morph, ['text'], {
        'tokens': 'tokens',
        'sentences': 'sentences',
        'postag': 'postag',
        'lemma': 'lemma'
    }),
                                 (proc_syntax, ['tokens', 'sentences'], {
                                     'syntax_dep_tree': 'syntax_dep_tree'
                                 }),
                                 (ConverterMystemToUd(), ['postag'], {
                                     'postag': 'postag',
                                     'morph': 'morph'
                                 })])
    analysis_res = syntax_ppl(text)
    sentences = []
    for i in analysis_res['sentences']:
        sentence = []
        for j in range(i.begin, i.end):
            sentence.append(analysis_res['tokens'][j].text)
        sentences.append(sentence)
    vertices_list_list = []
    for j in range(len(analysis_res['lemma'])):
        vertices_list = []
        for i in range(len(analysis_res['lemma'][j])):
            vert = tree(
                word(analysis_res['lemma'][j][i], analysis_res['postag'][j][i],
                     analysis_res['morph'][j][i], i))
            vertices_list.append(vert)
        vertices_list_list.append(vertices_list)
    root_list = []
    for i in range(len(vertices_list_list)):
        list_ = vertices_list_list[i]
        for j in range(len(analysis_res['syntax_dep_tree'][i])):
            node = analysis_res['syntax_dep_tree'][i][j]
            if node.parent != -1:
                list_[node.parent].add_child(list_[j], node.link_name)
            else:
                list_[j].sentence = sentences[i]
                root_list.append(list_[j])
    return root_list
Example No. 12
def prepare_compounds(compounds_path):
    ppl = PipelineCommon([(ProcessorTokenizerRu(), ['text'], {
        0: 'tokens'
    }), (ProcessorSentenceSplitter(), ['tokens'], {
        0: 'sentences'
    }), (ProcessorMystem(), ['tokens', 'sentences'], {
        'lemma': 'lemma'
    })])

    df_compounds = pd.read_csv(compounds_path)

    compound_set = set()
    for i in df_compounds.index:
        compound = '{} {}'.format(df_compounds.loc[i, 'Часть 1'],
                                  df_compounds.loc[i, 'Часть 2'])
        lemmas = ppl(compound)['lemma'][0]
        compound_set.add('{}_{}'.format(lemmas[0], lemmas[1]))

    return compound_set
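
Hypothetical usage; the CSV is expected to contain the columns 'Часть 1' and 'Часть 2' read above:

compounds = prepare_compounds('compounds.csv')
print(len(compounds), 'lemmatized compounds')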
Example No. 13
def acquiring(comp, model, true_label, model_words=None, skip_invalid_labels=True):
    ppl = PipelineCommon([
        (ProcessorTokenizerRu(), ['text'], {0 : 'tokens'}),
        (ProcessorSentenceSplitter(), ['tokens'], {0 : 'sentences'}),
        (ProcessorMystem(), ['tokens', 'sentences'], {'lemma' : 'lemma'})
    ])
    
    v_w1 = []
    v_w2 = []
    v_comp = []
    true_class = []
    
    if model_words is None:
        model_words = model
    
    indexes = []
    for i in comp.index:
        label = comp.loc[i, true_label]
        if skip_invalid_labels and label not in {0., 1.}:
            continue
            
        anns = ppl('{} {}'.format(comp.loc[i, 'Часть 1'], comp.loc[i, 'Часть 2']))['lemma'][0]
        
        try:
            #print('{}_{}'.format(anns[0], anns[1]))
            vec_w1 = model_words[anns[0]]
            vec_w2 = model_words[anns[1]]
            vec_comp = model['{}_{}'.format(anns[0], anns[1])]
            indexes.append(i)
        except KeyError:
            continue
        
        v_w1.append(vec_w1)
        v_w2.append(vec_w2)
        v_comp.append(vec_comp)
        true_class.append(label)
    
    print('Number of examples: ', len(v_w1))
    
    return np.array(v_w1), np.array(v_w2), np.array(v_comp), np.array(true_class), comp.loc[indexes]
Example No. 14
import os

from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote

host = 'localhost'

port_morph = int(os.environ['TEST_MORPH_PORT'])
port_srl = int(os.environ['TEST_SRL_PORT'])
text_path = os.environ['TEST_EN_PATH']

with open(text_path, encoding='utf8') as f:
    text = f.read()

ppl = PipelineCommon([(ProcessorRemote(host=host,
                                       port=port_morph,
                                       pipeline_name='default'),
                       ['text'],
                       {'tokens': 'tokens',
                        'sentences': 'sentences',
                        'lemma': 'lemma',
                        'postag': 'postag'}),
                      (ProcessorRemote(host=host,
                                       port=port_srl,
                                       pipeline_name='default'),
                       ['tokens', 'sentences'],
                       {'srl': 'srl'})
                      ])

annotations = ppl(text)
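
The resulting annotations dict is keyed by the output names declared above, so the SRL results can be inspected per sentence, for instance:

for sent_srl in annotations['srl']:
    print(sent_srl)  # SRL events produced by the remote processor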
Example No. 15
import os

from isanlp import PipelineCommon
from isanlp.processor_remote import ProcessorRemote
from isanlp.processor_syntaxnet_remote import ProcessorSyntaxNetRemote
from isanlp.ru.converter_mystem_to_ud import ConverterMystemToUd

# Ports follow the environment-variable pattern of the previous example;
# the exact variable names here are assumptions for this excerpt:
port_morph = int(os.environ['TEST_MORPH_PORT'])
port_syntax = int(os.environ['TEST_SYNTAX_PORT'])
port_srl = int(os.environ['TEST_SRL_PORT'])
text_path = os.environ['TEST_PATH']

with open(text_path, encoding='utf8') as f:
    text = f.read()

ppl = PipelineCommon([(ProcessorRemote(host='localhost',
                                       port=port_morph,
                                       pipeline_name='default'), ['text'], {
                                           'tokens': 'tokens',
                                           'sentences': 'sentences',
                                           'lemma': 'lemma',
                                           'postag': 'mystem_postag'
                                       }),
                      (ConverterMystemToUd(), ['mystem_postag'], {
                          'morph': 'morph',
                          'postag': 'postag'
                      }),
                      (ProcessorSyntaxNetRemote(host='localhost',
                                                port=port_syntax),
                       ['tokens', 'sentences'], {
                           'syntax_dep_tree': 'syntax_dep_tree'
                       }),
                      (ProcessorRemote(host='localhost',
                                       port=port_srl,
                                       pipeline_name='default'),
                       ['postag', 'morph', 'lemma', 'syntax_dep_tree'], {
                           'srl': 'srl'
                       })])

annotations = ppl(text)
Example No. 16
from isanlp_srl_framebank.processor_srl_framebank import ProcessorSrlFramebank
from isanlp import PipelineCommon

PPL_SRL_FRAMEBANK = PipelineCommon(
    [(ProcessorSrlFramebank('/models',
                            enable_model_for_unknown_predicates=True,
                            known_preds_embeddings_type='elmo',
                            unknown_preds_embeddings_type='elmo',
                            threshold=0.6),
      ['tokens', 'postag', 'morph', 'lemma', 'syntax_dep_tree'], {
          0: 'srl'
      })],
    name='default')
Example No. 17
    def __call__(self, tokens, sentences):
        sys.stderr.write('Processing input...\n')
        sys.stderr.flush()

        input_data = [[word.text for word in CSentence(tokens, sent)]
                      for sent in sentences]
        result_str = self._process_json(json.dumps(input_data))
        result_json = json.loads(result_str)
        result = []
        for sent in result_json:
            result_sent = []
            for pred_arg in sent:
                result_sent.append(
                    Event(pred=(pred_arg[0], pred_arg[0]),
                          args=[
                              TaggedSpan(arg[0], arg[1], arg[2])
                              for arg in pred_arg[1] if arg[0] != 'V'
                          ]))

            result.append(result_sent)

        return result


DEEP_SRL = PipelineCommon([(ProcessorDeepSrlWrapper(
    "/src/deep_srl/resources/conll05_propid_model",
    "/src/deep_srl/resources/conll05_model"), ['tokens', 'sentences'], {
        0: 'srl'
    })],
                          name='default')
Example No. 18
from isanlp_srl_framebank.processor_srl_framebank import ProcessorSrlFramebank
from isanlp import PipelineCommon

PPL_SRL_FRAMEBANK = PipelineCommon([(ProcessorSrlFramebank('/models'),
                                     ['postag', 'morph', 'lemma', 'syntax_dep_tree'],
                                     {0 : 'srl'})
                                   ],
                                   name='default')
Example No. 19
from processor_rst import ProcessorRST
from isanlp import PipelineCommon

PPL_RST = PipelineCommon([(ProcessorRST('/models'), [
    'text', 'tokens', 'sentences', 'postag', 'morph', 'lemma',
    'syntax_dep_tree'
], {
    0: 'rst'
})],
                         name='default')