Example #1
 def __init__(self, seed=42):
     super(Solver, self).__init__()
     self.morph = pymorphy2.MorphAnalyzer()
     self.seed = seed
     self.init_seed()
     self.toktok = ToktokTokenizer()
     self.paronyms = self.get_paronyms()
Example #2
 def __init__(self, seed=42, data_path='data/'):
     self.is_train_task = False
     self.morph = pymorphy2.MorphAnalyzer()
     self.toktok = ToktokTokenizer()
     self.seed = seed
     self.init_seed()
     with open(os.path.join(data_path, 'synonyms.txt'), 'r', encoding='utf8') as f:
         self.synonyms = f.readlines()
     self.synonyms = [
         re.sub(r'\.', '', t.lower().strip('\n')).split(' ')
         for t in self.synonyms
     ]
     self.synonyms = [[t for t in l if t] for l in self.synonyms]
     with open(os.path.join(data_path, 'antonyms.txt'), 'r', encoding='utf8') as f:
         self.antonyms = f.readlines()
     self.antonyms = [t.strip(' \n').split(' - ') for t in self.antonyms]
     with open(os.path.join(data_path, 'phraseologs.txt'), 'r', encoding='utf8') as f:
         self.phraseology = f.readlines()
     self.phraseology = [[
         lemma for lemma in self.lemmatize(line) if lemma not in
         ['\n', ' ', '...', '', ',', '-', '.', '?', ' (', '/']
     ] for line in self.phraseology]
Example #3
 def __init__(self, seed=42):
     super(Solver, self).__init__()
     self.is_train_task = False
     self.morph = pymorphy2.MorphAnalyzer()
     self.toktok = ToktokTokenizer()
     self.seed = seed
     self.init_seed()
Example #4
 def __init__(self, seed=42):
     super(Solver, self).__init__()
     self.seed = seed
     self.init_seed()
     self.morph = pymorphy2.MorphAnalyzer()
     self.has_model = True
     self.toktok = ToktokTokenizer()
     self.mode = 1 # 1 - find wrong word, 2 - replace word
Example #5
 def __init__(self, seed=42):
     self.seed = seed
     self.init_seed()
     self.tokenizer = ToktokTokenizer()
     self.morph = pymorphy2.MorphAnalyzer()
     self.count_vectorizer = CountVectorizer(ngram_range=(1, 4),
                                             tokenizer=str.split)
     self.classifier = CatBoostClassifier(verbose=0, use_best_model=True)
     super().__init__()
Example #6
 def __init__(self, t='text', seed=42, ngram_range=(1, 3)):
     self.seed = seed
     self.ngram_range = ngram_range
     self.vectorizer = TfidfVectorizer(ngram_range=ngram_range)
     self.vectorizer2 = TfidfVectorizer(ngram_range=ngram_range)
     self.clf = LinearSVC(multi_class="ovr")
     self.init_seed()
     self.word_tokenizer = ToktokTokenizer()
     self.type = t
Example #7
class Solver(AbstractSolver):
    def __init__(self, seed=42):
        self.seed = seed
        self.init_seed()
        self.tokenizer = ToktokTokenizer()
        self.morph = pymorphy2.MorphAnalyzer()
        self.count_vectorizer = CountVectorizer(ngram_range=(1, 4),
                                                tokenizer=str.split)
        self.classifier = CatBoostClassifier(verbose=0, use_best_model=True)
        super().__init__()

    def init_seed(self):
        random.seed(self.seed)

    def strs_to_pos_tags(self, texts):
        result = []
        for text in texts:
            result.append(' '.join([
                "PNCT" if self.morph.parse(word)[0].tag.POS is None else
                self.morph.parse(word)[0].tag.POS
                for word in self.tokenizer.tokenize(text)
            ]))
        return result

    def save(self, path="data/models/solver16.pkl"):
        model = {
            "count_vectorizer": self.count_vectorizer,
            "classifier": self.classifier
        }
        joblib.dump(model, path)

    def load(self, path="data/models/solver16.pkl"):
        model = joblib.load(path)
        self.count_vectorizer = model["count_vectorizer"]
        self.classifier = model["classifier"]

    def fit(self, tasks):
        X, y = [], []
        for task in tasks:
            task = standardize_task(task)
            solution = task["solution"]
            correct = (solution["correct_variants"][0]
                       if "correct_variants" in solution
                       else [solution["correct"]])
            sentences = [
                re.sub(r"^\d\) ?", "", sentence['text'])
                for sentence in task["question"]["choices"]
            ]
            sentences = self.strs_to_pos_tags(sentences)
            X.extend(sentences)
            y.extend([
                1 if str(i + 1) in correct else 0
                for i in range(len(sentences))
            ])
        X = self.count_vectorizer.fit_transform(X).toarray()
        X_train, X_dev, y_train, y_dev = train_test_split(X, y, train_size=0.9)
        self.classifier.fit(X_train, y_train, eval_set=(X_dev, y_dev))

    def predict_from_model(self, task):
        task = standardize_task(task)
        sentences = [
            re.sub(r"^\d\) ?", "", sentence['text'])
            for sentence in task["question"]["choices"]
        ]
        sentences = self.strs_to_pos_tags(sentences)
        vector = self.count_vectorizer.transform(sentences).toarray()
        proba = self.classifier.predict_proba(vector)[:, 1]
        two_highest = sorted([str(i + 1) for i in np.argsort(proba)[-2:]])
        return two_highest
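
A minimal usage sketch for the classifier solver above. The file name is hypothetical; data/tasks16.json is assumed to hold a list of task dicts in the format fit() expects (question.choices with numbered sentences, solution.correct or solution.correct_variants).

import json

solver = Solver()
with open("data/tasks16.json", encoding="utf-8") as f:  # hypothetical path
    tasks = json.load(f)
solver.fit(tasks)    # trains CatBoost on POS n-gram count vectors
solver.save()        # persists the fitted vectorizer and classifier
print(solver.predict_from_model(tasks[0]))  # two sentence numbers, e.g. ['2', '5']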
Example #8
class Solver(BertEmbedder):
    def __init__(self, seed=42):
        super(Solver, self).__init__()
        self.morph = pymorphy2.MorphAnalyzer()
        self.seed = seed
        self.init_seed()
        self.toktok = ToktokTokenizer()
        self.paronyms = self.get_paronyms()

    def init_seed(self):
        random.seed(self.seed)

    def get_paronyms(self):
        paronyms = []
        with open('data/paronyms.csv', 'r', encoding='utf-8') as in_file:
            for line in in_file.readlines():
                pair = line.strip(punctuation).strip().split('\t')
                paronyms.append(pair)
        return paronyms

    def lemmatize(self, token):
        token_all = self.morph.parse(token.lower().rstrip('.,/;!:?'))[0]
        lemma = token_all.normal_form
        return lemma

    def find_closest_paronym(self, par):
        paronyms = set()
        for par1, par2 in self.paronyms:
            paronyms.add(par1)
            paronyms.add(par2)
        try:
            closest = get_close_matches(par, list(paronyms))[0]
        except IndexError:
            closest = None
        return closest

    def check_pair(self, token_norm):
        paronym = None
        for p1, p2 in self.paronyms:
            if token_norm == p1:
                paronym = p2
                break
            if token_norm == p2:
                paronym = p1
                break
        return paronym

    def find_paronyms(self, token):
        token_all = self.morph.parse(token.lower().rstrip('.,/;!:?'))[0]
        token_norm = token_all.normal_form
        paronym = self.check_pair(token_norm)

        if paronym is None:
            paronym_close = self.find_closest_paronym(token_norm)
            paronym = self.check_pair(paronym_close)

        if paronym is not None:
            paronym_parse = self.morph.parse(paronym)[0]
            try:
                str_grammar = str(token_all.tag).split()[1]
            except IndexError:
                str_grammar = str(token_all.tag)

            gr = set(
                str_grammar.replace("Qual ", "").replace(' ', ',').split(','))
            try:
                final_paronym = paronym_parse.inflect(gr).word
            except AttributeError:
                final_paronym = paronym
        else:
            final_paronym = ''
        return final_paronym

    def predict(self, task):
        return self.predict_from_model(task)

    def fit(self, tasks):
        pass

    def load(self, path="data/models/solver5.pkl"):
        pass

    def save(self, path="data/models/solver5.pkl"):
        pass

    def get_score(self, a, b, paronym):
        return self.fill_mask(a, b, paronym.lower())

    def predict_from_model(self, task):
        description = task["text"].replace('НЕВЕРНО ', "неверно ")
        sents = []
        for line in self.toktok.sentenize(description):
            line_tok = self.toktok.tokenize(line)
            for idx, token in enumerate(line_tok):
                if token.isupper() and len(token) > 2:  # get CAPS paronyms
                    second_pair = self.find_paronyms(token)
                    line_before = ' '.join(line_tok[:idx])
                    line_after = ' '.join(line_tok[idx + 1:])
                    if second_pair != '':
                        score = self.get_score(
                            line_before, line_after, token) - self.get_score(
                                line_before, line_after, second_pair)
                        sents.append((score, token, second_pair))
        sents.sort()
        return sents[0][2].strip(punctuation + '\n')
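
An illustrative call to the paronym solver; the task text is invented, with the suspect word in CAPS as predict_from_model() expects.

solver = Solver()
task = {"text": "Укажите слово, которое употреблено НЕВЕРНО. "
                "Мальчик ОДЕЛ пальто и выбежал во двор."}
# Each CAPS token is scored against its pair from data/paronyms.csv;
# the replacement for the worst-scoring pair is returned, e.g. 'надел'.
print(solver.predict(task))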
Example #9
class Solver(BertEmbedder):
    def __init__(self, seed=42):
        super(Solver, self).__init__()
        self.is_train_task = False
        self.morph = pymorphy2.MorphAnalyzer()
        self.toktok = ToktokTokenizer()
        self.seed = seed
        self.init_seed()

    def init_seed(self):
        random.seed(self.seed)

    def predict(self, task):
        return self.predict_from_model(task)

    def get_num(self, text):
        lemmas = [
            self.morph.parse(word)[0].normal_form
            for word in self.toktok.tokenize(text)
        ]
        if 'указывать' in lemmas and 'предложение' in lemmas:
            w = lemmas[lemmas.index('указывать') + 1]  # first
            d = {'один': 1, 'два': 2, 'три': 3, 'четыре': 4, 'предложение': 1}
            if w in d:
                return d[w]
        elif 'указывать' in lemmas and 'вариант' in lemmas:
            return 'unknown'
        return 1

    def compare_text_with_variants(self, text, variants, num=1):
        text_vector = self.sentence_embedding([text])
        variant_vectors = self.sentence_embedding(variants)
        predictions = {}
        for i, variant_vector in enumerate(variant_vectors):
            sim = cosine_similarity(text_vector[0].reshape(1, -1),
                                    variant_vector.reshape(1, -1)).flatten()[0]
            predictions[i] = sim
        indexes = sorted(predictions.items(),
                         key=operator.itemgetter(1),
                         reverse=True)[:num]
        return sorted([str(i[0] + 1) for i in indexes])

    def sent_split(self, text):
        reg = r'\(*\d+\)'
        return re.split(reg, text)

    def process_task(self, task):
        first_phrase, task_text = re.split(r'\(*1\)', task['text'])[:2]
        variants = [t['text'] for t in task['question']['choices']]
        text, task = "", ""
        if 'Укажите' in task_text:
            text, task = re.split('Укажите ', task_text)
            task = 'Укажите ' + task
        elif 'Укажите' in first_phrase:
            text, task = task_text, first_phrase
        return text, task, variants

    def fit(self, tasks):
        pass

    def load(self, path=""):
        pass

    def save(self, path=''):
        pass

    def predict_from_model(self, task, num=2):
        text, task, variants = self.process_task(task)
        result = self.compare_text_with_variants(text, variants, num=num)
        return result
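
A sketch of the task shape this solver consumes; the text and variants are invented. process_task() splits the passage from the instruction on 'Укажите' and the '(1)' sentence marker, then the variants are ranked by cosine similarity of BERT sentence embeddings.

solver = Solver()
task = {
    "text": "(1)Лес затих. (2)Птицы умолкли. Укажите варианты ответов, "
            "в которых верно передана главная информация текста.",
    "question": {"choices": [
        {"id": "1", "text": "1) Лес затих, и птицы умолкли."},
        {"id": "2", "text": "2) Наступила весна."},
        {"id": "3", "text": "3) Птицы пели всю ночь."},
    ]},
}
print(solver.predict(task))  # numbers of the two closest variants, e.g. ['1', '2']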
Example #10
class Solver(object):
    def __init__(self, seed=42, data_path='data/'):
        self.is_train_task = False
        self.morph = pymorphy2.MorphAnalyzer()
        self.toktok = ToktokTokenizer()
        self.seed = seed
        self.init_seed()
        with open(os.path.join(data_path, 'synonyms.txt'), 'r', encoding='utf8') as f:
            self.synonyms = f.readlines()
        self.synonyms = [
            re.sub(r'\.', '', t.lower().strip('\n')).split(' ')
            for t in self.synonyms
        ]
        self.synonyms = [[t for t in l if t] for l in self.synonyms]
        with open(os.path.join(data_path, 'antonyms.txt'), 'r', encoding='utf8') as f:
            self.antonyms = f.readlines()
        self.antonyms = [t.strip(' \n').split(' - ') for t in self.antonyms]
        with open(os.path.join(data_path, 'phraseologs.txt'), 'r', encoding='utf8') as f:
            self.phraseology = f.readlines()
        self.phraseology = [[
            lemma for lemma in self.lemmatize(line) if lemma not in
            ['\n', ' ', '...', '', ',', '-', '.', '?', ' (', '/']
        ] for line in self.phraseology]

    def init_seed(self):
        random.seed(self.seed)

    def lemmatize(self, text):
        return [
            self.morph.parse(word)[0].normal_form
            for word in self.toktok.tokenize(text.strip())
        ]

    def predict(self, task):
        return self.predict_from_model(task)

    def get_word(self, text):
        try:
            return re.split('»', re.split('«', text)[1])[0]
        except IndexError:
            return ''

    def get_pos(self, text):
        lemmas = [l for l in self.lemmatize(text) if l != ' ']
        if 'фразеологизм' in lemmas:
            return "PHR"
        if 'синоним' in lemmas:
            return "SYN"
        if 'антоним' in lemmas or 'антонимический' in lemmas:
            return "ANT"
        if 'синонимический' in lemmas:
            return "SYN"
        return "DEF"

    def full_intersection(self, small_lst, big_lst):
        return all(value in big_lst for value in small_lst)

    def sent_split(self, text):
        reg = r'\(*\n*\d+\n*\)'
        return re.split(reg, text)

    def search(self, text_lemmas, lst):
        for l in lst:
            if self.full_intersection(l, text_lemmas):
                return ''.join(l)
        return ''

    def get_num(self, text):
        nums = []
        res = re.search(r'\d+–*-*\d*', text)
        if res:
            res = res[0]
            if '–' in res:
                nums = res.split('–')
                nums = list(range(int(nums[0]), int(nums[1]) + 1))
            elif '-' in res:
                nums = res.split('-')
                nums = list(range(int(nums[0]), int(nums[1]) + 1))
            else:
                nums = [int(res)]
        return nums

    def compare_text_with_variants(self, pos, text, nums=None, word=''):
        nums = nums or []
        sents = self.sent_split(text)
        lemmas_all = []
        for s in nums:
            lemmas = self.lemmatize(sents[s - 1])
            lemmas_all += [l for l in lemmas if l != ' ']
        lemmas_all = [l for l in lemmas_all if re.fullmatch(r'\w+', l)]

        if pos == 'SYN':
            variant = self.search(lemmas_all, self.synonyms)
        elif pos == 'ANT':
            variant = self.search(lemmas_all, self.antonyms)
        else:
            variant = self.search(lemmas_all, self.phraseology)
        if variant:
            return variant
        else:
            return str(random.choice(lemmas_all))

    def eat_json(self, task):
        try:
            firstphrase, tasktext = re.split(r'\(\n*1\n*\)', task['text'])
        except ValueError:
            parts = re.split(r'\(\n*1\n*\)', task['text'])
            firstphrase, tasktext = ' '.join(parts[:-1]), parts[-1]
        if 'Из предложени' in tasktext:
            text, task = re.split('Из предложени', tasktext)
            task = 'Из предложени ' + task
        else:
            text, task = tasktext, firstphrase
        nums = self.get_num(task)
        pos = self.get_pos(task)
        word = ''
        if pos == 'DEF':
            word = self.get_word(task)
        return text, task, pos, nums, word

    def fit(self, tasks):
        pass

    def load(self, path='data/models/solver24.pkl'):
        pass

    def save(self, path='data/models/solver24.pkl'):
        pass

    def predict_from_model(self, task):
        text, task, pos, nums, word = self.eat_json(task)
        result = self.compare_text_with_variants(pos,
                                                 text,
                                                 nums=nums,
                                                 word=word)
        return result
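
An illustrative task for the solver above; the data files and text are assumed. get_num() reads the sentence range from the prompt, get_pos() detects whether synonyms, antonyms, or phraseology are requested, and the matched pair is returned concatenated, in EGE answer style.

solver = Solver(data_path='data/')  # needs synonyms.txt, antonyms.txt, phraseologs.txt
task = {"text": "(1)Он храбрый человек. (2)Его смелый поступок всех удивил. "
                "Из предложений 1–2 выпишите синонимы."}
print(solver.predict(task))  # e.g. 'храбрыйсмелый' if the pair is listed in synonyms.txt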
Example #11
class SubSolver(object):
    """
    Classifier that routes between exam tasks.
    Works on Tfidf vectors and a multiclass SVM.

    Parameters
    ----------
    seed : int, optional (default=42)
        Random seed.
    ngram_range : tuple (min_n, max_n), optional (default=(1, 3))
        Used for TfidfVectorizer.
        The lower and upper boundary of the range of n-values for different n-grams to be extracted.
        All values of n such that min_n <= n <= max_n will be used.
    t : str, optional (default='text')
        Question type this sub-solver handles, matched against task['question']['type'].

    Examples
    --------
    >>> # Basic usage
    >>> import os
    >>> from solvers import classifier
    >>> from utils import read_config
    >>> clf = classifier.Solver()
    >>> tasks = []
    >>> dir_path = "data/"
    >>> for file_name in os.listdir(dir_path):
    ...     if file_name.endswith(".json"):
    ...         data = read_config(os.path.join(dir_path, file_name))
    ...         tasks.append(data)
    >>> clf = clf.fit(tasks)
    >>> # Predict for last file in dir
    >>> numbers_of_tasks = clf.predict(read_config(os.path.join(dir_path, file_name)))
    >>> numbers_of_tasks
    array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 12, 12, 13, 14, 15, 16, 17,
       18, 19, 17, 21, 22, 23, 24, 25, 26, 24])
    >>> # Save classifier
    >>> clf.save("clf.pickle")
    >>> # Load classifier
    >>> clf.load("clf.pickle")
    """
    def __init__(self, t='text', seed=42, ngram_range=(1, 3)):
        self.seed = seed
        self.ngram_range = ngram_range
        self.vectorizer = TfidfVectorizer(ngram_range=ngram_range)
        self.vectorizer2 = TfidfVectorizer(ngram_range=ngram_range)
        self.clf = LinearSVC(multi_class="ovr")
        self.init_seed()
        self.word_tokenizer = ToktokTokenizer()
        self.type = t

    def init_seed(self):
        np.random.seed(self.seed)
        random.seed(self.seed)

    def convert_to_text(self, task):
        text = self.word_tokenizer.tokenize(task['text'])
        if self.type in ["choice", "multiple_choice"]:
            choice_type = [
                t for t in task['question']['choices'][0].keys() if t != 'id'
            ][0]
            text.append(choice_type)
            for el in task['question']['choices']:
                text += self.word_tokenizer.tokenize(el[choice_type])
        text = ' '.join(text)
        return text

    def fit(self, tasks):
        texts = []
        classes = []
        for data in tasks:
            for task in data:
                if task['question']['type'] == self.type:
                    idx = int(task["id"])
                    if idx in range(17, 21):
                        idx = 17
                    texts.append(self.convert_to_text(task))
                    classes.append(idx)
        classes = np.array(classes)
        self.classes = np.unique(classes)
        if len(self.classes) > 1:
            vectors = self.vectorizer.fit_transform(texts)
            self.clf.fit(vectors, classes)
        return self

    def predict_one(self, task):
        if len(self.classes) == 1:
            return self.classes[0]
        text = self.convert_to_text(task)
        return int(
            self.clf.predict(self.vectorizer.transform([text])).ravel()[0])

    def fit_from_dir(self, dir_path):
        tasks = []
        for file_name in os.listdir(dir_path):
            if file_name.endswith(".json"):
                data = read_config(os.path.join(dir_path, file_name))
                tasks.append(data)
        tasks = [task for task in tasks if 'hint' not in task]
        return self.fit(tasks)

    def load(self, d):
        self.vectorizer = d['vec']
        self.clf = d['clf']
        self.classes = d['classes']

    def save(self):
        return {
            "vec": self.vectorizer,
            "clf": self.clf,
            "classes": self.classes
        }
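
Note that SubSolver.save() returns a plain dict and load() takes one, unlike the path-based wrapper shown in the docstring, so persistence is left to the caller. A sketch with joblib (file name hypothetical):

import joblib

sub = SubSolver(t='choice').fit(tasks)  # `tasks`: a list of task lists, as in fit()
joblib.dump(sub.save(), 'subsolver_choice.pkl')

restored = SubSolver(t='choice')
restored.load(joblib.load('subsolver_choice.pkl'))
assert restored.predict_one(tasks[0][0]) == sub.predict_one(tasks[0][0])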
Example #12
class Solver(object):

    def __init__(self, seed=42):
        self.is_train_task = False
        self.morph = pymorphy2.MorphAnalyzer()
        self.toktok = ToktokTokenizer()
        self.seed = seed
        self.init_seed()

    def init_seed(self):
        random.seed(self.seed)

    def lemmatize(self, text):
        return [self.morph.parse(word)[0].normal_form for word in
                self.toktok.tokenize(text.strip())]

    def predict(self, task):
        return self.predict_from_model(task)

    def get_word(self, text):
        try:
            return re.split('»', re.split('«', text)[1])[0]
        except IndexError:
            return ''

    def get_pos(self, text):
        pos = []
        lemmas = self.lemmatize(text)
        lemmas = [l for l in lemmas if l != ' ']
        if 'сочинительный' in lemmas:
            pos.append("CCONJ")
        if 'подчинительный' in lemmas:
            pos.append("SCONJ")
        if 'наречие' in lemmas:
            pos.append("ADV")
        if 'союзный' in lemmas:
            pos.append("ADVPRO")
        if 'частица' in lemmas:
            pos.append("PART")
        if 'определительный' in lemmas:
            pos.append("OPRO")
        if 'личный' in lemmas:
            pos.append("LPRO")
        if 'указательный' in lemmas:
            pos.append("UPRO")
        return pos
    
    def sent_split(self, text):
        reg = r'\(*\n*\d+\n*\)'
        return re.split(reg, text)

    def get_num(self, text):
        nums = []
        res = re.search(r'\d+[–—-]*\d*', text)
        if res:
            res = res[0]
            if re.search(r'[–—-]', res):
                nums = re.split(r'[–—-]', res)
                nums = list(range(int(nums[0]), int(nums[1]) + 1))
            else:
                nums = [int(res)]
        return nums

    def compare_text_with_variants(self, pos, text, nums=None):
        nums = nums or []
        indexes = []
        sents = self.sent_split(text)
        dic = {
            "CCONJ": ['но', 'а', 'и', 'да', 'тоже', 'также', 'зато', 'однако', 'же', 'или', 'либо'],
            "SCONJ": ['если', 'хотя', 'однако', 'когда', 'что', 'потомучто'],
            "ADV": ['сейчас', 'сегодня'],
            "ADVPRO": ['который', 'которая'],
            "OPRO": ['этот', 'это', 'эта', 'все', 'сам', 'самый', 'весь', 'всякий', 'каждый', 'любой', 'другой', 'иной', 'всяк', 'всяческий'],
            "LPRO": ['я', 'ты', 'он', 'она', 'оно', 'мы', 'вы', 'они'],
            "UPRO": ['этот', 'это', 'эта', 'все', 'тот', 'такой', 'таков', 'столько', 'сей', 'оный'],
            "PART": ['только', 'именно', 'не', 'ни', 'бы', 'лишь', 'пусть', 'дескать'],
        }
        if not pos:
            return [str(random.choice(nums))]
        for s in nums:
            lemmas = self.lemmatize(sents[s - 1])
            lemmas = [l for l in lemmas if l != ' ']
            conditions = 0
            for p in pos:
                variants = dic[p]
                if sum([v in lemmas for v in variants]):
                    conditions += 1
            if conditions == len(pos):
                indexes.append(s)
        if not indexes:
            indexes = [random.choice(nums)]

        return [str(i) for i in sorted(indexes)]

    def eat_json(self, task):
        try:
            firstphrase, tasktext = re.split(r'\(\n*1\n*\)', task['text'])
        except ValueError:
            parts = re.split(r'\(\n*1\n*\)', task['text'])
            firstphrase, tasktext = ' '.join(parts[:-1]), parts[-1]
        if 'Среди предложений' in tasktext:
            text, task = re.split('Среди предложений', tasktext)
            task = 'Среди предложений ' + task
        else:
            text, task = tasktext, firstphrase
        nums = self.get_num(task)
        pos = self.get_pos(task)
        return text, task, nums, pos

    def fit(self, tasks):
        pass

    def load(self, path='data/models/solver25.pkl'):
        pass

    def save(self, path='data/models/solver25.pkl'):
        pass

    def predict_from_model(self, task):
        text, task, nums, pos = self.eat_json(task)
        result = self.compare_text_with_variants(pos, text, nums=nums)
        return result
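
An invented example for the solver above: the prompt names the connective's part of speech, and the answer is the number of the sentence whose lemmas contain a word from that POS list.

solver = Solver()
task = {"text": "(1)Было холодно. (2)Однако мы пошли гулять. "
                "Среди предложений 1–2 найдите такое, которое связано с предыдущим "
                "с помощью сочинительного союза."}
print(solver.predict(task))  # ['2'] - 'однако' is in the CCONJ word list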
Example #13
class Solver(BertEmbedder):

    def __init__(self, seed=42):
        super(Solver, self).__init__()
        self.seed = seed
        self.init_seed()
        self.morph = pymorphy2.MorphAnalyzer()
        self.has_model = True
        self.toktok = ToktokTokenizer()
        self.mode = 1 # 1 - find wrong word, 2 - replace word

    def init_seed(self):
        random.seed(self.seed)

    def predict_random(self, task_desc):
        """Random variant"""
        task_desc = re.sub(r"[^а-я0-9\-]", " ", task_desc)
        result = random.choice(task_desc.split())
        return result

    def exclude_word(self, task_sent):
        """Make it with Bert"""
        tokens = self.toktok.tokenize(task_sent)

        to_tokens = []
        for token in tokens:
            parse_res = self.morph.parse(token)[0]
            if parse_res.tag.POS not in ["CONJ", "PREP", "PRCL", "INTJ", "PRED", "NPRO"]:
                if parse_res.normal_form != 'быть':
                    to_tokens.append((parse_res.word, parse_res.tag.POS))

        bigrams = list(ngrams(to_tokens, 2))

        results = []
        for bigram in bigrams:
            if bigram[0] != bigram[1] and bigram[0][1] == 'ADJF' and bigram[1][1] == 'NOUN':
                b1 = self.sentence_embedding([bigram[0][0]])[0].reshape(1, -1)
                b2 = self.sentence_embedding([bigram[1][0]])[0].reshape(1, -1)
                sim = cosine_similarity(b1, b2)[0][0]
                # keep the POS tags of both words for the NOUN+NOUN check below
                results.append((sim, bigram[0][0], bigram[1][0], bigram[0][1], bigram[1][1]))
        if not results:
            for bigram in bigrams:
                if bigram[0] != bigram[1]:
                    b1 = self.sentence_embedding([bigram[0][0]])[0].reshape(1, -1)
                    b2 = self.sentence_embedding([bigram[1][0]])[0].reshape(1, -1)
                    sim = cosine_similarity(b1, b2)[0][0]
                    results.append((sim, bigram[0][0], bigram[1][0], bigram[0][1], bigram[1][1]))
        results = sorted(results)
        final_pair = results[-1]
        if final_pair[-1] == 'NOUN' and final_pair[-2] == 'NOUN':
            return final_pair[2], tokens
        else:
            return final_pair[1], tokens

    def fit(self, tasks):
        pass
        
    def load(self, path="data/models/solver6.pkl"):
        pass

    def save(self, path="data/models/solver6.pkl"):
        pass

    def predict(self, task):
        if not self.has_model:
            return self.predict_random(task)
        else:
            return self.predict_from_model(task)

    def predict_from_model(self, task):
        description = task["text"]
        task_desc = ""
        if "заменив" in description:
            self.mode = 2
        else:
            self.mode = 1
        for par in description.split("\n"):
            for sentence in nltk.sent_tokenize(par):
                sentence = sentence.lower().rstrip(punctuation).replace('6.', "")
                if re.match('.*(отредактируйте|выпишите|запишите|исправьте|исключите).*', sentence):
                    continue
                else:
                    task_desc += sentence
        result, tokens = self.exclude_word(task_desc)
        return result.strip(punctuation)
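
An invented redundant-word task for the solver above. predict_from_model() drops the instruction sentence (it matches the 'отредактируйте|исключите' filter), and exclude_word() picks the ADJF+NOUN bigram with the highest embedding similarity, returning its adjective.

solver = Solver()
task = {"text": "Отредактируйте предложение: исключите лишнее слово.\n"
                "На собрании обсудили свободную вакансию инженера."}
print(solver.predict(task))  # plausibly 'свободную', the pleonastic adjective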
Example #14
class Solver(BertEmbedder):
    def __init__(self, seed=42):
        super(Solver, self).__init__()
        self.is_train_task = False
        self.morph = pymorphy2.MorphAnalyzer()
        self.toktok = ToktokTokenizer()
        self.seed = seed
        self.init_seed()

    def init_seed(self):
        random.seed(self.seed)

    def predict(self, task):
        return self.predict_from_model(task)

    def clean_text(self, text):
        newtext, logic = [], [
            "PREP", "CONJ", "Apro", "PRCL", "INFN", "VERB", "ADVB"
        ]
        for token in self.toktok.tokenize(text):
            if any(tag in self.morph.parse(token)[0].tag for tag in logic):
                newtext.append(self.morph.parse(token)[0].normal_form)
        return ' '.join(newtext)

    def get_pos(self, text):
        pos, lemmas = 'word', [
            self.morph.parse(word)[0].normal_form
            for word in self.toktok.tokenize(text)
        ]
        if 'сочинительный' in lemmas:
            pos = "CCONJ"
        elif 'подчинительный' in lemmas:
            pos = "SCONJ"
        elif 'наречие' in lemmas:
            pos = "ADV"
        elif 'союзный' in lemmas:
            pos = "ADVPRO"
        elif 'местоимение' in lemmas:
            pos = "PRO"
        elif 'частица' in lemmas:
            pos = "PART"
        return pos

    def get_num(self, text):
        lemmas = [
            self.morph.parse(word)[0].normal_form
            for word in self.toktok.tokenize(text)
        ]
        if 'слово' in lemmas and 'предложение' in lemmas:
            d = {
                'один': 1,
                'два': 2,
                'три': 3,
                'четыре': 4,
                'первый': 1,
                'второй': 2,
                'третий': 3,
                'четвертый': 4,
            }
            for i in lemmas:
                if i in d:
                    return d[i]
        return 1

    def sent_split(self, text):
        reg = r'\(\n*\d+\n*\)'
        return re.split(reg, text)

    def compare_text_with_variants(self, query_word, text, variants):
        sents = self.sent_split(text)
        text_vector = None
        for sent in sents:
            sent = re.sub('Прочитайте фрагмент.*', '', sent)
            words = self.toktok.tokenize(sent)
            lemmas = [self.morph.parse(word) for word in words]
            word_idx = None
            for idx in range(len(lemmas)):
                if query_word.lower() in [
                        el.normal_form for el in lemmas[idx]
                ]:
                    word_idx = idx
                    break
            if word_idx is not None:
                text = " ".join(words[:word_idx] +
                                ['|', query_word.lower(), '|'] +
                                words[word_idx + 1:])
                text_vector = self.contextual_word_embedding([text])[0]
                break
        else:
            text_vector = self.sentence_embedding([text])[0]
        pretext = query_word.lower() + ' - это '
        variants = [
            pretext + re.sub(r'\d+[.)]', '', variant) for variant in variants
        ]
        variant_vectors = self.sentence_embedding(variants)
        predictions = {}
        for i, variant_vector in enumerate(variant_vectors):
            sim = cosine_similarity(text_vector.reshape(1, -1),
                                    variant_vector.reshape(1, -1)).flatten()[0]
            predictions[i] = sim
        indexes = sorted(predictions.items(),
                         key=operator.itemgetter(1),
                         reverse=True)[:1]
        return sorted([str(i[0] + 1) for i in indexes])

    def process_task(self, task):
        try:
            first_phrase, task_text = re.split(r'\(\n*1\n*\)', task['text'])
        except ValueError:
            parts = re.split(r'\(\n*1\n*\)', task['text'])
            first_phrase, task_text = ' '.join(parts[:-1]), parts[-1]
        variants = [t['text'] for t in task['question']['choices']]
        text, task, word = "", "", ""
        if 'Определите' in task_text:
            text, task = re.split('Определите', task_text)
            task = 'Определите ' + task
            word = re.split(r'\.', re.split('значения слова ', text)[1])[0]
        elif 'Определите' in first_phrase:
            text, task = task_text, first_phrase
            word = re.split(r'\.', re.split('значения слова ', task)[1])[0]
        return text, task, variants, word

    def fit(self, tasks):
        pass

    def load(self, path="data/models/solver3.pkl"):
        pass

    def save(self, path='data/models/solver3.pkl'):
        pass

    def predict_from_model(self, task):
        text, task, variants, word = self.process_task(task)
        result = self.compare_text_with_variants(word, text, variants)
        return result
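
An invented task for the definition-matching solver above. process_task() extracts the query word from 'значения слова X.', compare_text_with_variants() marks its occurrence with '|' for a contextual embedding, and the closest dictionary definition wins.

solver = Solver()
task = {
    "text": "Определите значение, в котором это слово употреблено в тексте. "
            "Определите значения слова ГОРОД. (1)Город спал. (2)Улицы были пусты.",
    "question": {"choices": [
        {"id": "1", "text": "1) Крупный населённый пункт."},
        {"id": "2", "text": "2) Центральная часть чего-либо."},
    ]},
}
print(solver.predict(task))  # index of the closest definition, e.g. ['1']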