Example #1
from simple_elmo import ElmoModel  # assumed source of ElmoModel in this snippet
import tensorflow as tf


def _encode_corpus(path, corpus):
    """Load an ELMo model from `path` and return top-layer vectors for `corpus`."""
    model = ElmoModel()
    tf.reset_default_graph()  # TF1-style graph reset, as in the original snippet
    model.load(path)
    elmo_vectors = model.get_elmo_vectors(corpus, layers='top')
    return elmo_vectors
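A minimal usage sketch for the snippet above; the model directory path and the two tokenized sentences are illustrative assumptions, not part of the original example:

# Illustrative call only: replace the path and corpus with real data.
corpus = [['первое', 'предложение'],
          ['второе', 'предложение']]
vectors = _encode_corpus('path/to/elmo_model', corpus)
print(vectors.shape)  # (n_sentences, longest_sentence_length, elmo_dim)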
Example #2
class ConsultantPlusAnalyzer:
    def __init__(self, is_elmo_used=False):
        self.config = get_config('config.yml')
        self.parser = ConsultantPlusParser(config=self.config)
        self.model = ElmoModel()
        self.mystem = Mystem()
        self.spec_chars = string.punctuation + '\n\xa0«»\t—…'
        self.stop_words = stopwords.words("russian")
        self.stop_words.extend([
            'и',
            'в',
            'на',
            'n',
            'рф',
            'гк',
            'юридического',
            ' ',
            '1',
            'ред',
            '2',
            'ст',
            'также',
            'свой',
            'либо',
            'это',
            'текст',
            'закон',
            'который',
            'иной',
            'год',
            'мочь',
        ])
        if is_elmo_used:
            self.model.load(self.config['model_info_file'])
        self.navec = Navec.load(self.config['navec_news_v1_1B_250K_300d_100q'])
        self.syntax = Syntax.load(self.config['slovnet_syntax_news_v1'])
        self.syntax.navec(self.navec)

    def save_information_about_target_words_by_codex_type(
            self, codex_type, codex_id):
        raw_articles_info = self.parser.sorted_articles_info[codex_type]
        if os.path.exists(
                generate_file_name_with_postfix(
                    self.config['information_about_target_words'],
                    str(codex_id))):
            os.remove(
                generate_file_name_with_postfix(
                    self.config['information_about_target_words'],
                    str(codex_id)))
        with open(generate_file_name_with_postfix(
                self.config['information_about_target_words'], str(codex_id)),
                  mode='w') as information_about_target_words_file:
            information_about_target_words_writer = csv.writer(
                information_about_target_words_file,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL)
            information_about_target_words_writer.writerow([
                'article_id', 'article_title', 'parts_after_target_words',
                'sentences'
            ])
            for article_info in tqdm(raw_articles_info):
                text = self.parser.get_article_text_by_id(article_info.id)
                if text.find('если иное не предусмотрено') != -1:
                    text_parts = text.split('если иное не предусмотрено')
                    parts_before_target_words = list()
                    for i in range(0, len(text_parts) - 1):
                        parts_before_target_words.append(
                            text_parts[i].split('.')[-1])
                    parts_after_target_words = list()
                    for i in range(1, len(text_parts)):
                        parts_after_target_words.append(
                            text_parts[i].split('.')[0])
                    sentences = list()
                    for i in range(len(parts_before_target_words)):
                        sentences.append(parts_before_target_words[i] +
                                         'если иное не предусмотрено' +
                                         parts_after_target_words[i])
                    information_about_target_words_writer.writerow([
                        article_info.id, article_info.title,
                        '~'.join(parts_after_target_words), '~'.join(sentences)
                    ])

    def plot_word_vectors_graph(self,
                                proximity_threshold,
                                count_of_words=None):
        # TODO: find a suitable proximity threshold for each article
        articles_vectors_info = dict()
        articles_words_info = dict()
        with open(self.config['articles_vectors_info_file']
                  ) as articles_vectors_info_file_with_words:
            reader = csv.reader(articles_vectors_info_file_with_words)
            for row in tqdm(reader):
                article_id = int(row[0])
                vector = self.convert_vector_from_string_value(row[1:len(row) -
                                                                   1])
                word = row[-1]
                if article_id not in articles_vectors_info:
                    articles_vectors_info[article_id] = list()
                    articles_words_info[article_id] = list()
                articles_vectors_info[article_id].append(vector)
                articles_words_info[article_id].append(word)
        for article_id, article_vectors in tqdm(articles_vectors_info.items()):
            n = len(article_vectors) if not count_of_words else count_of_words
            dist_matrix = np.zeros((n, n))
            for i in range(n):
                for j in range(n):
                    dist_matrix[i][j] = self.get_euclidean_distance(
                        article_vectors[i], article_vectors[j])
            G = nx.Graph()
            edges_info = dict()
            for i in range(n):
                for j in range(n):
                    word_i = articles_words_info[article_id][i]
                    word_j = articles_words_info[article_id][j]
                    # Record each word pair only once, regardless of direction
                    if (dist_matrix[i][j] > proximity_threshold
                            and (word_i, word_j) not in edges_info
                            and (word_j, word_i) not in edges_info):
                        edges_info[(word_i, word_j)] = round(
                            dist_matrix[i][j], 2)
            G.add_weighted_edges_from([(item[0][0], item[0][1], item[1])
                                       for item in edges_info.items()])
            pos = nx.spring_layout(G)
            plt.figure(figsize=(50, 50))
            nx.draw(G, pos, node_size=10000, with_labels=True)
            nx.draw_networkx_edge_labels(G, pos, edge_labels=edges_info)
            plt.show()

    def save_syntax_analysis_by_text(self,
                                     text,
                                     file,
                                     is_many_sentences=False):
        # Redirect stdout into the output file while show_markup() prints
        original_stdout = sys.stdout
        f = open(file, 'a')
        sys.stdout = f
        print('-' * 100)
        if text != 'None':
            if not is_many_sentences:
                chunk = list()
                for sent in sentenize(text):
                    tokens = [_.text for _ in tokenize(sent.text)]
                    chunk.append(tokens)
                markup = next(self.syntax.map(chunk))
                words, deps = list(), list()
                for token in markup.tokens:
                    words.append(token.text)
                    source = int(token.head_id) - 1
                    target = int(token.id) - 1
                    if source > 0 and source != target:
                        deps.append([source, target, token.rel])
                show_markup(words, deps)
            else:
                for sentence in text.split('.'):
                    if len(sentence.split()) > 5:
                        chunk = list()
                        for sent in sentenize(sentence):
                            tokens = [_.text for _ in tokenize(sent.text)]
                            chunk.append(tokens)
                        markup = next(self.syntax.map(chunk))
                        words, deps = list(), list()
                        for token in markup.tokens:
                            words.append(token.text)
                            source = int(token.head_id) - 1
                            target = int(token.id) - 1
                            if source > 0 and source != target:
                                deps.append([source, target, token.rel])
                        show_markup(words, deps)
        else:
            print('None')
        print('-' * 100)
        # Restore stdout and release the file handle
        sys.stdout = original_stdout
        f.close()

    def get_words_matrix_variance(self):
        articles_vectors_info = dict()
        articles_words_info = dict()
        with open(self.config['articles_vectors_info_file']
                  ) as articles_vectors_info_file_with_words:
            reader = csv.reader(articles_vectors_info_file_with_words)
            for row in tqdm(reader):
                article_id = int(row[0])
                vector = self.convert_vector_from_string_value(row[1:len(row) -
                                                                   1])
                word = row[-1]
                if article_id not in articles_vectors_info:
                    articles_vectors_info[article_id] = list()
                    articles_words_info[article_id] = list()
                articles_vectors_info[article_id].append(vector)
                articles_words_info[article_id].append(word)
        for article_id, article_vectors in tqdm(articles_vectors_info.items()):
            mat = np.array(article_vectors)
            print(f'The variance in article {article_id} is {mat.var()}')

    def get_prediction(self, words=None, file_with_vectors=None):
        words_vectors = list()
        if file_with_vectors:
            with open(file_with_vectors) as file:
                reader = csv.reader(file)
                for row in reader:
                    words_vectors.append(
                        self.convert_vector_from_string_value(row))
        else:
            for word in tqdm(words):
                words_vectors.append(self.model.get_elmo_vectors(word)[0][0])
        articles_vectors_info = dict()
        articles_words_info = dict()
        with open(self.config['articles_vectors_info_file']
                  ) as articles_vectors_info_file_with_words:
            reader = csv.reader(articles_vectors_info_file_with_words)
            for row in tqdm(reader):
                article_id = int(row[0])
                vector = self.convert_vector_from_string_value(row[1:len(row) -
                                                                   1])
                word = row[-1]
                if article_id not in articles_vectors_info:
                    articles_vectors_info[article_id] = list()
                    articles_words_info[article_id] = list()
                articles_vectors_info[article_id].append(vector)
                articles_words_info[article_id].append(word)
        articles_distance_info = list()
        for word_vector in tqdm(words_vectors):
            articles_distance_info.append(dict())
            for article_id in tqdm(articles_vectors_info):
                if article_id not in articles_distance_info[-1]:
                    articles_distance_info[-1][article_id] = list()
                for vector in tqdm(articles_vectors_info[article_id]):
                    articles_distance_info[-1][article_id].append(
                        self.get_euclidean_distance(word_vector, vector))
        articles_average_distance_info = list()
        for info in tqdm(articles_distance_info):
            articles_average_distance_info.append(dict())
            for article_id in tqdm(info):
                articles_average_distance_info[-1][article_id] = np.average(
                    np.array(info[article_id]))
        prediction_articles_id = list()
        for info in articles_average_distance_info:
            id = -1
            min_dist = sys.maxsize
            for article_id, dist in info.items():
                if dist < min_dist:
                    id = article_id
                    min_dist = dist
            prediction_articles_id.append(id)
        print(prediction_articles_id)

    def save_unique_words_in_articles_analysis(self, codex_type, codex_id):
        raw_articles_info = self.parser.sorted_articles_info[codex_type]
        articles_info = list()
        for article_info in tqdm(raw_articles_info):
            text = self.parser.get_article_text_by_id(article_info.id)
            text = text.lower()
            text = self.remove_chars_from_text(text, self.spec_chars)
            article_tokens = word_tokenize(' '.join(
                self.mystem.lemmatize(text)))
            for stop_word in self.stop_words:
                while stop_word in article_tokens:
                    article_tokens.remove(stop_word)
            text = Text(article_tokens)
            f_dist = FreqDist(text)
            f_dist = list(filter(lambda item: item[1] == 1, f_dist.items()))
            articles_info.append(
                (article_info.id, len(f_dist) / len(article_tokens)))
        if os.path.exists(
                generate_file_name_with_postfix(
                    self.config['unique_words_in_articles_analysis_file'],
                    str(codex_id))):
            os.remove(
                generate_file_name_with_postfix(
                    self.config['unique_words_in_articles_analysis_file'],
                    str(codex_id)))
        with open(generate_file_name_with_postfix(
                self.config['unique_words_in_articles_analysis_file'],
                str(codex_id)),
                  mode='w') as unique_words_in_articles_analysis_file:
            unique_words_in_articles_analysis_writer = csv.writer(
                unique_words_in_articles_analysis_file,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL)
            unique_words_in_articles_analysis_writer.writerow(
                ['article_id', 'unique_words_frequency'])
            for frequency_info in articles_info:
                unique_words_in_articles_analysis_writer.writerow(
                    [frequency_info[0], frequency_info[1]])

    def save_most_popular_words_analysis(self, most_common_quantity):
        articles_tokens = list()
        for (codex_type, _) in tqdm(self.parser.codex_urls):
            raw_articles_info = self.parser.sorted_articles_info[codex_type]
            for article_info in tqdm(raw_articles_info):
                text = self.parser.get_article_text_by_id(article_info.id)
                text = text.lower()
                text = self.remove_chars_from_text(text, self.spec_chars)
                article_tokens = word_tokenize(' '.join(
                    self.mystem.lemmatize(text)))
                for stop_word in self.stop_words:
                    while stop_word in article_tokens:
                        article_tokens.remove(stop_word)
                articles_tokens.extend(article_tokens)
        text = Text(articles_tokens)
        f_dist = FreqDist(text)
        if os.path.exists(self.config['most_popular_words_analysis_file']):
            os.remove(self.config['most_popular_words_analysis_file'])
        with open(self.config['most_popular_words_analysis_file'],
                  mode='w') as most_popular_words_analysis_file:
            most_popular_words_analysis_writer = csv.writer(
                most_popular_words_analysis_file,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL)
            most_popular_words_analysis_writer.writerow(
                ['word', 'word_count', 'frequency'])
            for info in f_dist.most_common(most_common_quantity):
                most_popular_words_analysis_writer.writerow(
                    [info[0], info[1], info[1] / len(articles_tokens)])

    def save_unique_words_analysis(self, uniqueness_threshold):
        """Сохраняем информацию о количестве уникальных слов и количестве статей, в которых эти слова встречаются, а также информацию о заданном количестве уникальных слов"""
        articles_tokens = list()
        articles_words_info = dict()
        for (codex_type, _) in tqdm(self.parser.codex_urls):
            raw_articles_info = self.parser.sorted_articles_info[codex_type]
            for article_info in tqdm(raw_articles_info):
                text = self.parser.get_article_text_by_id(article_info.id)
                text = text.lower()
                text = self.remove_chars_from_text(text, self.spec_chars)
                article_tokens = word_tokenize(' '.join(
                    self.mystem.lemmatize(text)))
                for stop_word in self.stop_words:
                    while stop_word in article_tokens:
                        article_tokens.remove(stop_word)
                articles_words_info[self.get_unique_article_identifier(
                    codex_type, article_info.id)] = list(set(article_tokens))
                articles_tokens.extend(article_tokens)
        text = Text(articles_tokens)
        f_dist = FreqDist(text)
        f_dist = list(
            filter(lambda item: item[1] <= uniqueness_threshold,
                   f_dist.items()))
        unique_words_info = dict()
        # Store the data as
        # 'unique word': [count in the whole corpus, number of articles containing this word]
        for word_info in f_dist:
            if word_info[0] not in unique_words_info:
                unique_words_info[word_info[0]] = [word_info[1], 0]
            for article_id in tqdm(articles_words_info):
                if word_info[0] in articles_words_info[article_id]:
                    unique_words_info[word_info[0]][1] += 1
        if os.path.exists(self.config['articles_unique_words_info_file']):
            os.remove(self.config['articles_unique_words_info_file'])
        with open(self.config['articles_unique_words_info_file'],
                  mode='w') as articles_unique_words_info_file:
            articles_unique_words_info_writer = csv.writer(
                articles_unique_words_info_file,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL)
            articles_unique_words_info_writer.writerow(
                ['word', 'word_count', 'articles_count'])
            for info in unique_words_info.items():
                articles_unique_words_info_writer.writerow(
                    [info[0], info[1][0], info[1][1]])
        unique_words_metrics = dict()
        # Store the data as
        # 'given word count in the whole corpus': 'number of such words in the whole corpus'
        for value in unique_words_info.values():
            if value[0] not in unique_words_metrics:
                unique_words_metrics[value[0]] = value[1]
            else:
                unique_words_metrics[value[0]] += value[1]
        if os.path.exists(self.config['articles_unique_words_analysis_file']):
            os.remove(self.config['articles_unique_words_analysis_file'])
        with open(self.config['articles_unique_words_analysis_file'],
                  mode='w') as articles_unique_words_analysis_file:
            articles_unique_words_analysis_writer = csv.writer(
                articles_unique_words_analysis_file,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL)
            articles_unique_words_analysis_writer.writerow([
                'count_unique_words_frequency', 'count_unique_words_in_corpus'
            ])
            for info in unique_words_metrics.items():
                articles_unique_words_analysis_writer.writerow(
                    [info[0], info[1]])
        if os.path.exists(
                self.
                config['articles_unique_words_analysis_file_with_frequency']):
            os.remove(
                self.
                config['articles_unique_words_analysis_file_with_frequency'])
        with open(self.
                  config['articles_unique_words_analysis_file_with_frequency'],
                  mode='w') as articles_unique_words_analysis_file:
            articles_unique_words_analysis_writer = csv.writer(
                articles_unique_words_analysis_file,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL)
            articles_unique_words_analysis_writer.writerow([
                'count_unique_words_frequency',
                'count_unique_words_in_corpus_frequency'
            ])
            for info in unique_words_metrics.items():
                articles_unique_words_analysis_writer.writerow(
                    [info[0], info[1] / len(articles_tokens)])

    def save_codex_hist_info(self, codex_type, codex_id, constraint=None):
        """Сохранение частотности слов во всем корпусе"""
        raw_articles_info = self.parser.sorted_articles_info[codex_type]
        articles_tokens = list()
        for article_info in tqdm(raw_articles_info):
            text = self.parser.get_article_text_by_id(article_info.id)
            text = text.lower()
            text = self.remove_chars_from_text(text, self.spec_chars)
            article_tokens = word_tokenize(' '.join(
                self.mystem.lemmatize(text)))
            for stop_word in self.stop_words:
                while stop_word in article_tokens:
                    article_tokens.remove(stop_word)
            articles_tokens.extend(article_tokens)
        text = Text(articles_tokens)
        f_dist = FreqDist(text)
        if not constraint:
            if os.path.exists(
                    generate_file_name_with_postfix(
                        self.config['articles_frequency_info_file'],
                        str(codex_id))):
                os.remove(
                    generate_file_name_with_postfix(
                        self.config['articles_frequency_info_file'],
                        str(codex_id)))
            with open(generate_file_name_with_postfix(
                    self.config['articles_frequency_info_file'],
                    str(codex_id)),
                      mode='w') as articles_frequency_info_file:
                articles_frequency_info_writer = csv.writer(
                    articles_frequency_info_file,
                    delimiter=',',
                    quotechar='"',
                    quoting=csv.QUOTE_MINIMAL)
                articles_frequency_info_writer.writerow(['word', 'frequency'])
                for frequency_info in f_dist.most_common(100):
                    articles_frequency_info_writer.writerow([
                        frequency_info[0],
                        frequency_info[1] / len(articles_tokens)
                    ])
        else:
            if os.path.exists(
                    generate_file_name_with_postfix(
                        self.
                        config['articles_frequency_info_file_with_constraint'],
                        str(codex_id))):
                os.remove(
                    generate_file_name_with_postfix(
                        self.
                        config['articles_frequency_info_file_with_constraint'],
                        str(codex_id)))
            with open(generate_file_name_with_postfix(
                    self.
                    config['articles_frequency_info_file_with_constraint'],
                    str(codex_id)),
                      mode='w') as articles_frequency_info_file:
                articles_frequency_info_writer = csv.writer(
                    articles_frequency_info_file,
                    delimiter=',',
                    quotechar='"',
                    quoting=csv.QUOTE_MINIMAL)
                articles_frequency_info_writer.writerow(['word', 'frequency'])
                f_dist = list(
                    filter(lambda item: item[1] > constraint, f_dist.items()))
                for frequency_info in f_dist:
                    articles_frequency_info_writer.writerow([
                        frequency_info[0],
                        frequency_info[1] / len(articles_tokens)
                    ])

    def save_word_vectors_analysis_info(self, codex_type, most_common_count):
        articles_info = dict()
        raw_articles_info = self.parser.sorted_articles_info[codex_type]
        i = 0
        for article_info in tqdm(raw_articles_info):
            text = self.parser.get_article_text_by_id(article_info.id)
            text = text.lower()
            text = self.remove_chars_from_text(text, self.spec_chars)
            article_tokens = word_tokenize(' '.join(
                self.mystem.lemmatize(text)))
            for stop_word in self.stop_words:
                while stop_word in article_tokens:
                    article_tokens.remove(stop_word)
            article_vectors = list()
            article_words = list()
            text = Text(article_tokens)
            f_dist = FreqDist(text)
            for token in tqdm(f_dist.most_common(most_common_count)):
                vector = self.model.get_elmo_vectors(token[0])
                article_words.append(token[0])
                article_vectors.append(vector[0][0])
            articles_info[article_info.id] = [article_vectors, article_words]
            i += 1
            if i == 20:
                break
        if os.path.exists(self.config['articles_vectors_info_file']):
            os.remove(self.config['articles_vectors_info_file'])
        with open(self.config['articles_vectors_info_file'],
                  mode='w') as articles_vectors_info_file:
            articles_vectors_info_writer = csv.writer(
                articles_vectors_info_file,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL)
            for article_id, article_vectors_info in articles_info.items():
                for i, article_vector_info in enumerate(
                        article_vectors_info[0]):
                    articles_vectors_info_writer.writerow([
                        article_id, *article_vector_info,
                        article_vectors_info[1][i]
                    ])

    def frequency_analysis_of_words(self):
        articles_tokens = list()
        for i in tqdm(range(10)):
            text = self.parser.get_article_text_by_id(i)
            text = text.lower()
            text = self.remove_chars_from_text(text, self.spec_chars)
            articles_tokens.extend(
                word_tokenize(' '.join(self.mystem.lemmatize(text))))
        for stop_word in self.stop_words:
            while stop_word in articles_tokens:
                articles_tokens.remove(stop_word)
        raw_text = ' '.join(articles_tokens)
        word_cloud = WordCloud().generate(raw_text)
        plt.imshow(word_cloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()
        text = Text(articles_tokens)
        f_dist = FreqDist(text)
        f_dist.plot(30, cumulative=False)

    def links_on_target_words_analysis(self):
        for codex_id in tqdm(range(len(self.parser.codex_urls))):
            if os.path.exists(
                    generate_file_name_with_postfix(
                        self.
                        config['information_about_target_words_with_links'],
                        str(codex_id))):
                os.remove(
                    generate_file_name_with_postfix(
                        self.
                        config['information_about_target_words_with_links'],
                        str(codex_id)))
            with open(generate_file_name_with_postfix(
                    self.config['information_about_target_words_with_links'],
                    str(codex_id)),
                      mode='w') as information_about_target_words_file:
                information_about_target_words_writer = csv.writer(
                    information_about_target_words_file,
                    delimiter=',',
                    quotechar='"',
                    quoting=csv.QUOTE_MINIMAL)
                information_about_target_words_writer.writerow([
                    'article_id', 'article_title', 'article_url',
                    'links_on_target'
                ])
                target_words_info = pd.read_csv(
                    generate_file_name_with_postfix(
                        self.config['information_about_target_words'],
                        str(codex_id)))
                for row in tqdm(target_words_info.itertuples()):
                    links_on_target = list()
                    for part_of_target_words in row[3].split('~'):
                        if self.parser.get_links_on_target_words_by_id_and_target_words(
                                row[1], part_of_target_words):
                            links_on_target.append(
                                self.parser.
                                get_links_on_target_words_by_id_and_target_words(
                                    row[1], part_of_target_words))
                        else:
                            links_on_target.append('None')
                    information_about_target_words_writer.writerow([
                        row[1], row[2],
                        self.parser.get_article_url_by_id(row[1]),
                        ' '.join(links_on_target)
                    ])

    @staticmethod
    def save_syntax_analysis(analyzer):
        for codex_id in tqdm(range(len(analyzer.parser.codex_urls))):
            target_words_info = pd.read_csv(
                generate_file_name_with_postfix(
                    analyzer.config['information_about_target_words'],
                    str(codex_id)))
            for row in tqdm(target_words_info.itertuples()):
                for sentence in row[-1].split('~'):
                    analyzer.save_syntax_analysis_by_text(
                        sentence,
                        generate_file_name_with_postfix(
                            analyzer.
                            config['article_target_words_realation_info'],
                            str(row[1])))

    @staticmethod
    def save_syntax_analysis_in_links(analyzer):
        for codex_id in tqdm(range(len(analyzer.parser.codex_urls))):
            target_words_info = pd.read_csv(
                generate_file_name_with_postfix(
                    analyzer.
                    config['information_about_target_words_with_links'],
                    str(codex_id)))
            for row in tqdm(target_words_info.itertuples()):
                for url in row[-1].split(' '):
                    if url != 'None':
                        analyzer.save_syntax_analysis_by_text(
                            analyzer.parser.get_text_by_url(url),
                            generate_file_name_with_postfix(
                                analyzer.config[
                                    'article_target_words_in_links_realation_info'],
                                str(row[1])),
                            is_many_sentences=True)
                    else:
                        analyzer.save_syntax_analysis_by_text(
                            'None',
                            generate_file_name_with_postfix(
                                analyzer.config[
                                    'article_target_words_in_links_realation_info'],
                                str(row[1])))

    @staticmethod
    def plot_frequency_analysis_of_words(analyzer, is_constraint=None):
        """Построение частотности слов во всем корпусе"""
        if not is_constraint:
            for i in range(10):
                data = pd.read_csv(generate_file_name_with_postfix(
                    analyzer.config['articles_frequency_info_file'], str(i)),
                                   delimiter=',')
                data.plot(x='word',
                          y='frequency',
                          figsize=(50, 7),
                          kind='scatter')
                plt.xticks(rotation=60)
                plt.show()
        else:
            for i in range(10):
                data = pd.read_csv(generate_file_name_with_postfix(
                    analyzer.
                    config['articles_frequency_info_file_with_constraint'],
                    str(i)),
                                   delimiter=',')
                data = data.sort_values(by='frequency', axis='index')
                data.plot(x='word',
                          y='frequency',
                          figsize=(50, 7),
                          kind='scatter')
                plt.xticks(rotation=60)
                plt.show()

    @staticmethod
    def plot_unique_words_in_articles_analysis(analyzer):
        """Графики частотности уникальных слов в каждом кодексе по article_id"""
        for i in range(10):
            data = pd.read_csv(generate_file_name_with_postfix(
                analyzer.config['unique_words_in_articles_analysis_file'],
                str(i)),
                               delimiter=',')
            data = data.sort_values('unique_words_frequency')
            data.plot(x='article_id',
                      y='unique_words_frequency',
                      kind='scatter')
            plt.show()

    @staticmethod
    def plot_unique_words_in_articles_analysis_on_one_graph(analyzer):
        """График частотности уникальных слов в каждом кодексе на одном графике с отсортированной частотностью"""
        data = pd.read_csv(generate_file_name_with_postfix(
            analyzer.config['unique_words_in_articles_analysis_file'], str(0)),
                           delimiter=',')
        for i in range(1, 10):
            data = pd.concat([
                data,
                pd.read_csv(generate_file_name_with_postfix(
                    analyzer.config['unique_words_in_articles_analysis_file'],
                    str(i)),
                            delimiter=',')
            ])
        data['article_id'] = data.apply(
            lambda row: row['article_id'] / data['article_id'].max(), axis=1)
        data = data.sort_values('unique_words_frequency')
        data = data.reset_index()
        data.drop('article_id', axis='columns', inplace=True)
        data.drop('index', axis='columns', inplace=True)
        data.plot()
        plt.show()

    @staticmethod
    def plot_unique_words_analysis(analyzer, is_frequency_analysis=False):
        """Построение графика анализа уникальных слов"""
        if not is_frequency_analysis:
            data = pd.read_csv(
                analyzer.config['articles_unique_words_analysis_file'])
        else:
            data = pd.read_csv(
                analyzer.
                config['articles_unique_words_analysis_file_with_frequency'])
        data.plot(x='count_unique_words_frequency',
                  y='count_unique_words_in_corpus',
                  kind='scatter')
        plt.show()
        plt.hist(data.count_unique_words_frequency,
                 weights=data.count_unique_words_in_corpus)
        plt.show()

    @staticmethod
    def plot_most_popular_words_analysis(analyzer):
        """Построение графика частотности самых популярных во всем корпусе слов"""
        data = pd.read_csv(analyzer.config['most_popular_words_analysis_file'])
        plt.hist(data.word_count, weights=data.frequency)
        plt.show()
        data.plot(x='word', y='frequency', kind='scatter', figsize=(50, 7))
        plt.xticks(rotation=60)
        plt.show()

    @staticmethod
    def remove_chars_from_text(text, chars):
        return ''.join([ch for ch in text if ch not in chars])

    @staticmethod
    def convert_vector_from_string_value(vector):
        return list(map(lambda value: float(value), vector))

    @staticmethod
    def get_euclidean_distance(vector1, vector2):
        if len(vector1) != len(vector2):
            raise ConsultantPlusAnalyzerException(
                'It is not possible to compare vectors of different dimensions'
            )
        v1 = np.array(vector1)
        v2 = np.array(vector2)
        return np.linalg.norm(v1 - v2)

    @staticmethod
    def get_unique_article_identifier(codex_type, article_id):
        return codex_type + '_' + str(article_id)
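The excerpt above defines the full analyzer class. A hypothetical driver is sketched below; the codex_type value and the assumption that config.yml defines the referenced file paths are not part of the original code:

if __name__ == '__main__':
    # 'GK_RF' is a placeholder codex key; real keys come from the parser's codex_urls
    analyzer = ConsultantPlusAnalyzer(is_elmo_used=False)
    analyzer.save_codex_hist_info(codex_type='GK_RF', codex_id=0)
    ConsultantPlusAnalyzer.plot_frequency_analysis_of_words(analyzer)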
Example #3
    # Actually producing ELMo embeddings for our data:
    start = time.time()

    CACHE = 12800

    lines_processed = 0
    lines_cache = []
    with open(data_path, "r") as dataset:
        for line in dataset:
            res = line.strip().split()[:WORD_LIMIT]
            if target_words & set(res):
                lines_cache.append(res)
                lines_processed += 1
            if len(lines_cache) == CACHE:
                elmo_vectors = model.get_elmo_vectors(lines_cache,
                                                      layers=args.layers)
                for sent, matrix in zip(lines_cache, elmo_vectors):
                    for word, vector in zip(sent, matrix):
                        if word in vect_dict:
                            vect_dict[word][counters[word], :] = vector
                            counters[word] += 1
                lines_cache = []
                if lines_processed % 256 == 0:
                    logger.info(
                        f"{data_path}; Lines processed: {lines_processed}")
        if lines_cache:
            elmo_vectors = model.get_elmo_vectors(lines_cache,
                                                  layers=args.layers)
            for sent, matrix in zip(lines_cache, elmo_vectors):
                for word, vector in zip(sent, matrix):
                        if word in vect_dict:
                            vect_dict[word][counters[word], :] = vector
                            counters[word] += 1
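The excerpt does not show how vect_dict and counters were created. Based purely on how they are indexed above, a plausible initialization looks like the sketch below; the capacity and dimensionality values are assumptions:

import numpy as np

MAX_OCCURRENCES = 1000  # hypothetical cap on stored occurrences per target word
ELMO_DIM = 1024         # hypothetical ELMo vector dimensionality

# One pre-allocated matrix per target word, plus a per-word row counter
vect_dict = {word: np.zeros((MAX_OCCURRENCES, ELMO_DIM)) for word in target_words}
counters = {word: 0 for word in target_words}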
Example #4
            raw_sentences.append(res)
            if len(raw_sentences) > max_sentences:
                break
    sentences = [s.split()[:100] for s in raw_sentences]

    print('=====')
    print(f'{len(sentences)} sentences total')
    print('=====')

    model = ElmoModel()

    model.load(args.elmo, top=False)

    # Actually producing ELMo embeddings for our data:

    elmo_vectors = model.get_elmo_vectors(sentences)

    print('ELMo embeddings for your input are ready')
    print(f'Tensor shape: {elmo_vectors.shape}')

    # Due to batch processing, the above code produces for each sentence
    # the same number of token vectors, equal to the length of the longest sentence
    # (the 2nd dimension of the elmo_vector tensor).
    # If a sentence is shorter, the vectors for non-existent words are filled with zeroes.
    # Let's make a version without these redundant vectors:
    cropped_vectors = []
    for vect, sent in zip(elmo_vectors, sentences):
        cropped_vector = vect[:len(sent), :]
        cropped_vectors.append(cropped_vector)

    # A quick test:
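    # The original snippet is cut off here; one illustrative way such a quick
    # test could look (not the author's original code) is to confirm that each
    # cropped matrix has exactly one row per token:
    for sent, vect in zip(sentences[:3], cropped_vectors[:3]):
        print(len(sent), vect.shape)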
Example #5
    sentences = ["Привет, мир!", "Надо отправить сообщение. В цехе до сих пор не была установлена седьмая центрифуга, а по плану мы должны уже на ней работать"]

    sentences = [s.split()[:100] for s in raw_sentences]

    print("=====")
    print(f"{len(sentences)} sentences total")
    print("=====")

    model = ElmoModel()

    model.load("212")

    # Actually producing ELMo embeddings for our data:
    start = time.time()
    elmo_vectors = model.get_elmo_vectors(sentences, layers="average")
    end = time.time()

    processing_time = int(end - start)

    print(f"ELMo embeddings for your input are ready in {processing_time} seconds")
    print(f"Tensor shape: {elmo_vectors.shape}")

    # Due to batch processing, the above code produces for each sentence
    # the same number of token vectors, equal to the length of the longest sentence
    # (the 2nd dimension of the elmo_vector tensor).
    # If a sentence is shorter, the vectors for non-existent words are filled with zeroes.
    # Let's make a version without these redundant vectors:
    cropped_vectors = []
    for vect, sent in zip(elmo_vectors, sentences):
        cropped_vector = vect[: len(sent), :]
        cropped_vectors.append(cropped_vector)
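Consistent with the comment above about padded positions being zero-filled, a small illustrative sanity check (not part of the original snippet) could be:

    import numpy as np

    for vect, sent in zip(elmo_vectors, sentences):
        # Rows beyond the sentence's real length are expected to be all zeros
        assert not np.any(vect[len(sent):, :])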