Example #1
    def fit_model(self):
        self.model_config = read_json(
            configs.doc_retrieval.ru_ranker_tfidf_wiki)
        self.model_config["dataset_reader"]["data_path"] = os.path.abspath(
            os.getcwd()) + "/Resourses"
        self.model_config["dataset_reader"]["dataset_format"] = "txt"
        self.model_config["train"]["batch_size"] = 100
        print("work!")
        self.doc_retrieval = train_model(self.model_config)
        self.squad = build_model(configs.squad.squad_ru_rubert_infer,
                                 download=True)
        self.odqa = build_model(configs.odqa.ru_odqa_infer_wiki_rubert,
                                download=False)
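A hedged usage sketch of the ODQA pipeline configured above (not part of the original example; the question is illustrative, and the first run downloads large model files):

from deeppavlov import build_model, configs

odqa = build_model(configs.odqa.ru_odqa_infer_wiki_rubert, download=True)
# DeepPavlov pipelines take a batch (list) of inputs and return a batch of outputs.
answers = odqa(["Когда была основана Москва?"])
print(answers[0])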
Example #2
    def __init__(self, config_dict):
        #         tf.compat.v1.random.set_random_seed(1234)
        # self.elmo_lm = build_model(config_dict, download=True)
        try:
            # Prefer locally cached model files...
            self.elmo_lm = build_model(config_dict, download=False)
        except Exception:
            # ...and fall back to downloading them if the local build fails.
            self.elmo_lm = build_model(config_dict, download=True)
        self.words = self.elmo_lm.pipe[-1][-1].get_vocab()
        self.word_index = {word: i for i, word in enumerate(self.words)}
        self.INIT_STATE_OF_ELMO = self.elmo_lm.pipe[-1][-1].init_states

        # index of unknown token:
        self.IDX_UNK_TOKEN = self.word_index.get("<UNK>")
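A minimal sketch of how the vocabulary built above might be queried, assuming an instance `lm` of the class (the helper name is hypothetical):

def token_to_index(lm, token):
    # Fall back to the <UNK> index for out-of-vocabulary tokens.
    return lm.word_index.get(token, lm.IDX_UNK_TOKEN)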
Example #3
def deeppavlov_ner_cell(x, *args):
    from deeppavlov import configs, build_model

    which = args[0]

    ner_model = None
    if which == 'onto_bert_mult':
        ner_model = build_model(configs.ner.ner_ontonotes_bert_mult, download=True)
    elif which == 'onto_bert':
        ner_model = build_model(configs.ner.ner_ontonotes_bert, download=True)
    elif which == 'onto':
        ner_model = build_model(configs.ner.ner_ontonotes, download=True)
    elif which == 'conl_bert':
        ner_model = build_model(configs.ner.ner_conll2003_bert, download=True)
    elif which == 'conl':
        ner_model = build_model(configs.ner.ner_conll2003, download=True)
    # elif which == 'dstc2':  # deprecated
    #     ner_model = build_model(configs.ner.ner_dstc2, download=True)

    if ner_model is None:
        raise ValueError(f"Unknown NER model name: {which!r}")

    y = ner_model([x])

    # y[0][0] holds the tokens, y[1][0] the matching BIO tags
    # ('O', 'B-<LABEL>' or 'I-<LABEL>').
    enha = {}
    current_token_l = ''
    for token, code in zip(y[0][0], y[1][0]):
        if code == 'O':
            continue

        code_mark = code[0]
        code_label = code[2:]

        if code_mark == 'B':
            # Start of a new entity span.
            current_token_l = token
        elif code_mark == 'I':
            # Continuation: extend the current span, replacing its old key
            # (pop() instead of del avoids a KeyError on a stray 'I' tag).
            enha.pop(current_token_l, None)
            current_token_l = current_token_l + ' ' + token

        if current_token_l in enha:
            if code_label not in enha[current_token_l]:
                enha[current_token_l].append(code_label)
        else:
            enha[current_token_l] = [code_label]

    return enha
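A hedged usage sketch (the sentence and the resulting dictionary are illustrative; the first call downloads the model files):

entities = deeppavlov_ner_cell("Elon Musk founded SpaceX in California.", 'onto_bert')
print(entities)  # e.g. {'Elon Musk': ['PERSON'], 'SpaceX': ['ORG'], 'California': ['GPE']}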
Example #4
def main(args):
    os.makedirs(args.outdir, exist_ok=True)

    # pos_model = build_model(configs.morpho_tagger.UD2_0.morpho_ru_syntagrus_pymorphy, download=True)
    pos_model = build_model(
        configs.morpho_tagger.BERT.morpho_ru_syntagrus_bert, download=True)
    syntax_model = build_model(configs.syntax.syntax_ru_syntagrus_bert,
                               download=True)

    for in_path in glob.glob(args.inglob, recursive=True):
        try:
            print(in_path)

            docname = os.path.splitext(os.path.basename(in_path))[0]
            out_path = os.path.join(args.outdir, docname + '.pickle')

            if os.path.exists(out_path) and not args.f:
                print('Already processed')
                continue

            with open(in_path, 'r') as f:
                full_text = clean_text(f.read())

            sentences_spans = list(sentenize(full_text))
            sentences_spans = [
                split_sent for sent in sentences_spans for split_sent in
                split_long_sentence(sent, max_len=args.max_sent_len)
            ]
            sentences_texts = [s.text for s in sentences_spans]
            sentences_pos = pos_model.batched_call(sentences_texts,
                                                   batch_size=args.batch_size)
            sentences_syntax = syntax_model.batched_call(
                sentences_texts, batch_size=args.batch_size)
            assert len(sentences_spans) == len(sentences_pos) == len(
                sentences_syntax)

            doc_sentences = [
                dict(span=(span.start, span.stop),
                     text=span.text,
                     pos=pos,
                     syntax=synt) for span, pos, synt in zip(
                         sentences_spans, sentences_pos, sentences_syntax)
            ]
            with open(out_path, 'wb') as f:
                pickle.dump(doc_sentences, f)
        except Exception as ex:
            print(
                f'Failed to process {in_path} due to {ex}\n{traceback.format_exc()}'
            )
Example #5
    def __init__(self, use_noans=False, download=False):
        if use_noans:
            config = configs.squad.multi_squad_noans
        else:
            config = configs.squad.squad

        self.model = build_model(config, download=download)
Example #6
    def __init__(self, data_path: Optional[str] = None, config_type: Optional[str] = 'tfidf_autofaq',
                 x_col_name: Optional[str] = 'Question', y_col_name: Optional[str] = 'Answer',
                 save_load_path: Optional[str] = './similarity_matching',
                 edit_dict: Optional[dict] = None, train: Optional[bool] = True):

        if config_type not in configs.faq:
            raise ValueError("There is no config named '{0}'. Possible options are: {1}"
                             .format(config_type, ", ".join(configs.faq.keys())))
        model_config = read_json(configs.faq[config_type])

        if x_col_name is not None:
            model_config['dataset_reader']['x_col_name'] = x_col_name
        if y_col_name is not None:
            model_config['dataset_reader']['y_col_name'] = y_col_name

        model_config['metadata']['variables']['MODELS_PATH'] = save_load_path

        if data_path is not None:
            if expand_path(data_path).exists():
                if 'data_url' in model_config['dataset_reader']:
                    del model_config['dataset_reader']['data_url']
                model_config['dataset_reader']['data_path'] = data_path
            else:
                if 'data_path' in model_config['dataset_reader']:
                    del model_config['dataset_reader']['data_path']
                model_config['dataset_reader']['data_url'] = data_path

        if edit_dict is not None:
            update_dict_recursive(model_config, edit_dict)

        if train:
            self.model = train_model(model_config, download=True)
            log.info(f"Your model was saved at: '{save_load_path}'")
        else:
            self.model = build_model(model_config, download=False)
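A hedged usage sketch, assuming the constructor above belongs to a class named `SimilarityMatchingSkill` (the class name and CSV path are hypothetical):

faq = SimilarityMatchingSkill(data_path='faq_pairs.csv',
                              x_col_name='Question', y_col_name='Answer')
# The trained FAQ model maps a batch of questions to matching answers.
print(faq.model(["How do I reset my password?"]))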
Example #7
    def __init__(self):
        self.ner_model = build_model(configs.ner.ner_ontonotes_bert_mult,
                                     download=False)
        self.segmenter = Segmenter()
        self.morph_vocab = MorphVocab()
        self.emb = NewsEmbedding()
        self.morph_tagger = NewsMorphTagger(self.emb)
Example #8
def start_alice_server(model_config, https=False, ssl_key=None, ssl_cert=None, port=None):
    server_config_path = get_settings_path() / SERVER_CONFIG_FILENAME
    server_params = get_server_params(server_config_path, model_config)

    https = https or server_params['https']

    if not https:
        ssl_key = ssl_cert = None
    else:
        ssl_key = Path(ssl_key or server_params['https_key_path']).resolve()
        if not ssl_key.is_file():
            e = FileNotFoundError('SSL key file not found: please provide correct path in --key param or '
                                  'https_key_path param in server configuration file')
            log.error(e)
            raise e

        ssl_cert = Path(ssl_cert or server_params['https_cert_path']).resolve()
        if not ssl_cert.is_file():
            e = FileNotFoundError('SSL certificate file not found: please provide correct path in --cert param or '
                                  'https_cert_path param in server configuration file')
            log.error(e)
            raise e

    host = server_params['host']
    port = port or server_params['port']
    model_endpoint = server_params['model_endpoint']

    model = build_model(model_config)
    skill = DefaultStatelessSkill(model, lang='ru')
    agent = DefaultAgent([skill], skills_processor=DefaultRichContentWrapper())

    start_agent_server(agent, host, port, model_endpoint, ssl_key, ssl_cert)
Example #9
    def ask_model(self, model_name, question: str):
        if self.__model_is_exist(model_name):
            model = build_model(self.file_util.get_config_model_path(model_name))
            result = model([question])
            return result
        else:
            raise ModelNotFoundException("model {} not found".format(model_name))
Example #10
    def __init__(self,
                 data_path: Optional[str] = None,
                 x_col_name: Optional[str] = None,
                 y_col_name: Optional[str] = None,
                 save_load_path: Optional[str] = './similarity_matching',
                 edit_dict: Optional[dict] = None,
                 train: bool = True):

        model_config = read_json(configs.faq.tfidf_autofaq)
        if x_col_name is not None:
            model_config['dataset_reader']['x_col_name'] = x_col_name
        if y_col_name is not None:
            model_config['dataset_reader']['y_col_name'] = y_col_name

        model_config['metadata']['variables']['ROOT_PATH'] = save_load_path

        if data_path is not None:
            if expand_path(data_path).exists():
                if 'data_url' in model_config['dataset_reader']:
                    del model_config['dataset_reader']['data_url']
                model_config['dataset_reader']['data_path'] = data_path
            else:
                if 'data_path' in model_config['dataset_reader']:
                    del model_config['dataset_reader']['data_path']
                model_config['dataset_reader']['data_url'] = data_path

        if edit_dict is not None:
            update_dict_recursive(model_config, edit_dict)

        if train:
            self.model = train_model(model_config)
            log.info(f"Your model was saved at: '{save_load_path}'")
        else:
            self.model = build_model(model_config)
Example #11
def ner_rec(dataframe):
    """

    :param dataframe:
    :return:
    """
    # build model
    ner_model = build_model(configs.ner.ner_ontonotes_bert_mult)
    # make empty list to hold all results
    res_all = []
    # iterate over each news text
    for text in tqdm(dataframe['text'], desc='entity recognition'):
        # make empty list to hold results for  each text
        res_text = []
        # iterate over each sentence in text
        for sentence in text:
            # find entities
            res = ner_model([sentence])
            # concat results with text into list of tuples
            tokenized_text = res[0][0]
            tokenized_entity = res[1][0]
            res_list = list(zip(tokenized_text, tokenized_entity))
            # add to text
            res_text += res_list
        # add processed txt to overall results
        res_all.append(res_text)

    return res_all
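A hedged usage sketch with pandas (assumes, per the loop above, that each 'text' value is an iterable of sentences):

import pandas as pd

df = pd.DataFrame({'text': [["Илон Маск основал SpaceX.",
                             "Компания находится в Калифорнии."]]})
tagged = ner_rec(df)
print(tagged[0][:3])  # first few (token, tag) pairs of the first text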
Example #12
def get_dp_model():
  global model
  if model is None:
    #model = build_model(configs.squad.squad, download=True)
    model = build_model(configs.squad.multi_squad_ru_retr_noans_rubert_infer, download=False)
    #model = build_model(configs.squad.squad_bert_infer, download=True)
  return model
Example #13
    def __init__(self):
        """
        A model from the deeppavlov library that detects the sentiment
        of a Russian sentence. See docs.deeppavlov for details.
        """
        self.model = build_model(
            configs.classifiers.rusentiment_elmo_twitter_cnn, download=True)
Example #14
    def __init__(self,
                 squad_model_config: str,
                 vocab_file: str,
                 do_lower_case: bool,
                 max_seq_length: int = 512,
                 batch_size: int = 10,
                 lang: str = 'en',
                 **kwargs) -> None:
        with open(squad_model_config) as f:
            config = json.load(f)
        config['chainer']['pipe'][0]['max_seq_length'] = max_seq_length
        self.model = build_model(config)
        self.max_seq_length = max_seq_length

        if Path(vocab_file).is_file():
            vocab_file = str(expand_path(vocab_file))
            self.tokenizer = AutoTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(
                vocab_file, do_lower_case=do_lower_case)

        self.batch_size = batch_size

        if lang == 'en':
            from nltk import sent_tokenize
            self.sent_tokenizer = sent_tokenize
        elif lang == 'ru':
            from ru_sent_tokenize import ru_sent_tokenize
            self.sent_tokenizer = ru_sent_tokenize
        else:
            raise RuntimeError('en and ru languages are supported only')
Example #15
    def correct(self, word: str) -> str:
        if self.model is None:
            self.model = build_model(
                configs.spelling_correction.levenshtein_corrector_ru,
                download=False)

        return self.model([word])[0]
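A standalone hedged sketch of the same corrector (built eagerly rather than lazily; download=True fetches the dictionaries on first use):

from deeppavlov import build_model, configs

corrector = build_model(configs.spelling_correction.levenshtein_corrector_ru,
                        download=True)
print(corrector(["превет"])[0])  # expected correction: "привет"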
Example #16
    def useLevenstein(self):
        self.originalText, self.errorText = FP().prepareFiles()
        originalSentencesList, errorSentencesList = EC().textToSentences(self.originalText, self.errorText)
        print(len(originalSentencesList), len(errorSentencesList))
        correctorModel = build_model(configs.spelling_correction.levenshtein_corrector_ru, download=True)
        processedSentencesList = correctorModel(errorSentencesList)
        Metrics().estimateWords(self.originalText, processedSentencesList)
Example #17
class NER:
    config = "./models/ner_config.json"
    ner_model = build_model(config, download=True)

    @staticmethod
    def train():
        train_model(NER.config, download=True)
        NER.ner_model = build_model(NER.config, download=True)

    @staticmethod
    def NamedEntityRecognition(message):
        ner = NER.ner_model([message])
        sentence, labels = ner[0][0], ner[1][0]
        print("###NER: ", sentence)
        print("###NER: ", labels)
        entities, slots = DstcSlotFillingNetwork._chunk_finder(
            sentence, labels)
        s = {}
        for i, slot in enumerate(slots):
            if slot not in s:
                s[slot] = set()
            s[slot].add(entities[i])
        if 'GENRE' in s:
            for genre in s['GENRE']:
                s['GENRE'] = set.union(
                    set(word for word in genre.split()
                        if word not in (stopwords.words('english'))),
                    s['GENRE'])
        return s
Example #18
    def __init__(self, gobot_config_path):
        gobot_config = read_json(f"{gobot_config_path}/gobot_config.json")
        domain_yml_path = "dp_minimal_demo_dir/domain.yml"

        self.response_templates = read_yaml(domain_yml_path)["responses"]
        self.gobot = build_model(gobot_config)

        self.DATABASE, self.PREV_UPDATE_TIME = self._update_database()
Example #19
    def __init__(self, toml_file=None):
        super().__init__(toml_file)
        # Do your init work here
        with open('./insults_kaggle_conv_bert.json') as f:
            self.configs = json.load(f)

        self.model = build_model(self.configs)
        self.ready()
Example #20
    def __init__(self, texts=[], embeddings=[]):
        self.m = build_model(DUMB_MODEL_CONFIG_PATH)
        if len(texts) and not len(embeddings):
            self.sent_max_embs, _, _ = self.m(texts)
        elif not len(texts) and len(embeddings):
            self.sent_max_embs = embeddings
        else:
            raise RuntimeError('no texts or embeddings were provided')
Example #21
def _parse_syntax(sents: Iterable[str]):
    """Parse syntax with deeppavlov model"""
    model = build_model("ru_syntagrus_joint_parsing")
    model['main'].to_output_string = False
    model['main'].output_format = 'json'
    output = model(sents)
    model.destroy()
    return output
Example #22
    def __init__(self, nlp):
        """Initialization method of :class:`dragonfire.odqa.ODQA` class.

        Args:
            nlp:  :mod:`spacy` model instance.
        """

        self.nlp = nlp  # Load en_core_web_sm, English, 50 MB, default model
        self.model = build_model(configs.squad.squad, download=True)
Example #23
def handle_messages():
    print("Handling Messages")
    model = build_model('faq.json')
    payload = request.get_data()
    print(payload)
    for sender, message in messaging_events(payload):
        print("Incoming from %s: %s" % (sender, message))
        send_message(PAT, sender, message, model)
    return "ok"
Example #24
def _infer(config, inputs, download=False):
    chainer = build_model(config, download=download)
    if inputs:
        prediction = chainer(*inputs)
        if len(chainer.out_params) == 1:
            prediction = [prediction]
    else:
        prediction = []
    return prediction
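A hedged sketch of calling the helper above with a SQuAD-style config (context and question are illustrative; each positional input is a batch):

from deeppavlov import configs

prediction = _infer(configs.squad.squad,
                    (["DeepPavlov is a conversational AI library."],
                     ["What is DeepPavlov?"]),
                    download=True)
print(prediction)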
Example #25
    def __init__(self):
        self.model = build_model(configs.squad.squad, download=True)
        # self.model = ""
        self.stopwords = set(
            ["organizations", "sectors", "entities", "organization", "sector", "entity", "actor", "actors", "target",
             "targets", "compromises", "compromise", "threat", "threats", "computer", "computers", "network",
             "networks", "institute",
             "institutes", "republic", "middle", "purpose", "purposes", "firms", "firm", "application", "applications"])
        self.lemmatizer = WordNetLemmatizer()
Example #26
    def __init__(self, model=None, download_model=False, empty=False):
        if empty:
            return

        if model is None:
            self.model = build_model(configs.syntax.syntax_ru_syntagrus_bert,
                                     download=download_model)
        else:
            self.model = model
Example #27
    def __init__(self):
        self.model_deeppavlov = build_model(
            configs.syntax.syntax_ru_syntagrus_bert, download=True)
        self.coordinative_conjunction = [
            'и', 'да', 'ни-ни', 'тоже', 'также', 'а', 'но', 'да', 'зато',
            'однако', 'же', 'или', 'либо', 'то-то'
        ]
        self.morph = pymorphy2.MorphAnalyzer()
        self.like_root = ['acl:relcl', 'advcl', 'root', 'parataxis', 'ccomp']
        self.can_be_root = ['nsubj', 'conj']
Example #28
def _deserialize(config, raw_bytes, examples):
    chainer = build_model(config, serialized=raw_bytes)
    for *query, expected_response in examples:
        query = [[q] for q in query]
        actual_response = chainer(*query)
        if expected_response is not None:
            if actual_response is not None and len(actual_response) > 0:
                actual_response = actual_response[0]
            assert expected_response == str(actual_response), \
                f"Error in interacting with {config}: {query}"
Example #29
    def __init__(self, delay_init=False):
        if not delay_init:
            self.init()

        self.MODELNAME = 'ru_syntagrus_joint_parsing'
        self.model = build_model(self.MODELNAME, download=True)

        self._enable_tagger = True
        self._enable_parser = True
        self.converter_conll = ConverterConllUDV1()
Example #30
    def __init__(self,
                 model_settings: dict,
                 doc2vec: Doc2Vec,
                 dist_class: Type[LinearizedDist] = Dist,
                 linearization_settings: dict = {}):
        self.model_settings = model_settings
        model = build_model(model_settings, download=True)
        self.doc2vec = doc2vec
        self.dist = dist_class(model, self.doc2vec, linearization_settings)
        VPTreeSearchEngine.__init__(self, self.dist)
Example #31
def _deserialize(config, raw_bytes, examples):
    chainer = build_model(config, serialized=raw_bytes)
    for *query, expected_response in examples:
        query = [[q] for q in query]
        actual_response = chainer(*query)
        if expected_response is not None:
            if actual_response is not None and len(actual_response) > 0:
                actual_response = actual_response[0]
            assert expected_response == str(actual_response), \
                f"Error in interacting with {config}: {query}"
Example #32
def _serialize(config):
    chainer = build_model(config, download=True)
    return chainer.serialize()
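Read together with Examples #28 and #31, a hedged round-trip sketch (`config` stands for any DeepPavlov config path or dict, `examples` for the query/response pairs described above):

raw_bytes = _serialize(config)             # build the chainer once and snapshot it
_deserialize(config, raw_bytes, examples)  # rebuild from the snapshot and check responses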