Example #1
 def process_dataset(self, dataset, pickle_name):
     tfidf = TfidfPreprocess()
     pack = tfidf.preprocess_tfidf_dataset(dataset)
     sim_vector = pack[0]
     labels = pack[1]
     sim_vector = sim_vector.flatten()
     print(sim_vector)
     qwords = QwordPreprocess()
     question_vector = qwords.get_question_word_data(dataset)
     print(question_vector)
     we = WordEmbeddings()
     det_val_vector = we.get_det_val_dataset(dataset)
     print(det_val_vector)
     sum_val_vector = we.get_sum_vals_dataset(dataset)
     print(sum_val_vector)
     spacy_sim_vector = we.get_spacy_sim_dataset(dataset)
     print(spacy_sim_vector)
     ui = UnionIntersect()
     ui_vector = ui.get_percentage_dataset(dataset)
     print(ui_vector)
     matrix = np.vstack((sim_vector, question_vector, sum_val_vector, spacy_sim_vector, ui_vector))
     matrix = matrix.transpose()
     print(matrix)
     processed_data = dict(x=matrix, y=labels)
     with open(str(pickle_name) + ".p", "wb") as p:
         pickle.dump(processed_data, p)
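A minimal sketch (not part of the original example) of reading back the pickle written by process_dataset above; the file name "train_features.p" is a placeholder for whatever pickle_name was used.

import pickle

with open("train_features.p", "rb") as p:
    processed_data = pickle.load(p)
feature_matrix = processed_data["x"]  # one row of TF-IDF/embedding/overlap features per sample
labels = processed_data["y"]          # labels aligned with the rows of the matrix
print(feature_matrix.shape, len(labels))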
Example #2
 def __init__(self, text_utils):
     self.text_utils = text_utils
     self.logger = logging.getLogger('YesNoBot')
     self.relevancy_detector = LGB_RelevancyDetector()
     #self.yes_no_model = XGB_YesNoModel()
     self.yes_no_model = NN_YesNoModel()
     self.word_embeddings = WordEmbeddings()
     self.show_relevancy = True
Example #3
class YesNoBot:
    def __init__(self, text_utils):
        self.text_utils = text_utils
        self.logger = logging.getLogger('YesNoBot')
        self.relevancy_detector = LGB_RelevancyDetector()
        #self.yes_no_model = XGB_YesNoModel()
        self.yes_no_model = NN_YesNoModel()
        self.word_embeddings = WordEmbeddings()
        self.show_relevancy = True

    def load_models(self, models_folder, w2v_folder):
        self.logger.info(u'Loading models from {}'.format(models_folder))
        self.models_folder = models_folder
        self.relevancy_detector.load(models_folder)
        self.yes_no_model.load(models_folder)

        self.wordchar2vector_path = os.path.join(models_folder,
                                                 'wordchar2vector.dat')

        self.word_embeddings.load_wc2v_model(self.wordchar2vector_path)
        p = self.yes_no_model.w2v_path
        if p is not None:
            p = os.path.join(w2v_folder, os.path.basename(p))
            self.word_embeddings.load_w2v_model(p)

    def get_yes_answer(self):
        return self.text_utils.language_resources[u'да']

    def get_no_answer(self):
        return self.text_utils.language_resources[u'нет']

    def get_unknown_answer(self):
        return self.text_utils.language_resources[u'неопределено']

    def infer_answer(self, premises0, question0):
        premises = [self.text_utils.canonize_text(f) for f in premises0]
        question = self.text_utils.canonize_text(question0)

        rel = 1.0
        if len(premises) == 1:
            # Check that the premise entered by the user is relevant to the given question.
            premise = premises[0]
            rel = self.relevancy_detector.calc_relevancy1(
                premise, question, self.text_utils, self.word_embeddings)
            self.logger.debug('relevancy={}'.format(rel))

        y = self.yes_no_model.calc_yes_no(premises, question, self.text_utils,
                                          self.word_embeddings)
        self.logger.debug('y={}'.format(y))

        answer = None
        if y < 0.5:
            answer = self.get_no_answer()
        else:
            answer = self.get_yes_answer()
        return u'{} ({:4.2f})'.format(answer, rel)
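A hypothetical usage sketch for the class above; text_utils and the two folder paths are placeholders, and the premise/question strings stand in for real input.

bot = YesNoBot(text_utils)
bot.load_models('../tmp', '../data/w2v')
premises = [u'...']   # one or more premise sentences
question = u'...'     # a yes/no question about the premises
print(bot.infer_answer(premises, question))  # prints the yes/no answer with its relevancy score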
Example #4
 def process_run(self, query_candidates):
     tfidf = TfidfPreprocess()
     sim_vector = tfidf.preprocess_tfidf_runtime(query_candidates)
     sim_vector = sim_vector.flatten()
     qwords = QwordPreprocess()
     question_vector = qwords.get_question_word_run(query_candidates)
     we = WordEmbeddings()
     det_val_vector = we.get_det_vals_run(query_candidates)
     sum_val_vector = we.get_sum_vals_run(query_candidates)
     spacy_sim_vector = we.get_spacy_sim_run(query_candidates)
     ui = UnionIntersect()
     ui_vector = ui.get_percentage_run(query_candidates)
     feature_matrix = np.vstack((sim_vector, question_vector, sum_val_vector, spacy_sim_vector, ui_vector))
     feature_matrix = feature_matrix.transpose()
     return feature_matrix
Example #5
def main(args: argparse.Namespace) -> None:
    """Main entrypoint for the script."""
    temporal_embeddings = []
    paths = Path(args.checkpoint_root).glob(args.glob)
    for path in paths:
        path_name = path.stem
        path_name = path_name[path_name.find('-') + 1:path_name.find('_p')].replace('_', '-')
        path_date = parse(path_name, fuzzy=True).replace(day=1)

        # Load the word embeddings
        embeddings = WordEmbeddings(
            checkpoint_filepath=path,
            suffix_tree=False,
            nearest_neighbours=False
        )
        temporal_embeddings.append((path_date.date(), embeddings))

    figsize = (args.figure_width / args.figure_dpi, args.figure_height / args.figure_dpi)
    plt.figure(figsize=figsize, dpi=args.figure_dpi)

    # Draw the graph.
    plot_similarity_over_time(temporal_embeddings, args.word_a, args.word_b)

    if not args.output_path:
        plt.show()
    else:
        output_format = (args.output_path.suffix or 'png').replace('.', '')
        args.output_path.parent.mkdir(parents=True, exist_ok=True)
        if output_format == 'tex' or output_format == 'latex':
            tikzplotlib.save(args.output_path)
        else:
            plt.savefig(args.output_path, dpi=args.export_dpi)
        logger.info('Exported figure to {}'.format(args.output_path))
Example #6
def main(args: argparse.Namespace) -> None:
    """Main entrypoint for the script."""
    # Ensure that at least one data argument was provided
    if args.checkpoint_directory is None \
       and args.weights_filepath is None \
       and args.vocab_filepath is None:

        logger.error('One of --checkpoints / (--weights-filepath '
                     'and --vocab-filepath) is required!')
        exit(1)

    if args.checkpoint_directory is not None:
        weights_filepath = args.checkpoint_directory / 'proj_weights.npy'
        vocab_filepath = args.checkpoint_directory / 'vocab.txt'
    else:
        weights_filepath = args.weights_filepath
        vocab_filepath = args.vocab_filepath

    embeddings = WordEmbeddings(
        weights_filepath, vocab_filepath,
        name_metadata=weights_filepath.parent.stem
    )

    # Start the embedding projector
    embedding_projector([embeddings], debug=args.debug, port=args.port)
Example #7
def event_extraction(events_table, node, parent_verb):
    ancora = Ancora("ancora.vbs")
    if verb_not_aux(node):
        events_table[node.id] = Event(node, None, None, parent_verb)
    for child in node.children:
        node_child = child[1]
        relation_child = child[0]
        if is_verb(node):
            current_event = events_table[node.id]
            if relation_child == 'suj':
                current_event.subj = node_child
            elif relation_child == 'cd':
                current_event.obj = node_child
                if parent_verb is not None and verb_not_gerunde_nor_participle(
                        parent_verb):
                    parent_event = events_table[parent_verb.id]
                    if parent_event.obj is None:
                        unanimity_valence_obj = ancora.unanimity_argument(
                            parent_verb.form, parent_verb.lemma, SRL.OBJ)
                        if unanimity_valence_obj:
                            parent_event.obj = node_child
                        elif unanimity_valence_obj is None or (
                                unanimity_valence_obj is False
                                and ancora.one_category_argument(
                                    parent_verb.form, parent_verb.lemma,
                                    SRL.OBJ)):
                            word_embeddings = WordEmbeddings()
                            if word_embeddings.similar_words(
                                    parent_verb.form, node_child.form):
                                parent_event.obj = node_child
            elif relation_child in ('cpred', 'ci', 'cc', 'creg'):
                check_majority = ancora.check_majority_rule_category(
                    node.form, node.lemma, SRL.THIRD, node_child.tag[0])
                if is_location_or_time(node_child):
                    current_event.add_circumstance_complements(node_child)
                elif check_majority or check_majority is None:
                    current_event.complement(node_child)
                if parent_verb is not None:
                    parent_event = events_table[parent_verb.id]
                    if parent_event.complement is None:
                        check_unanimity_complement = ancora.check_unanimity_categories_argument_rule(
                            parent_verb.form, parent_verb.lemma, SRL.THIRD,
                            node_child.tag[0])
                        if check_unanimity_complement:
                            parent_event.complement = node_child
        event_extraction(events_table, node_child,
                         node if node.tag[0] == 'V' else parent_verb)
Example #8
def run_embedding_projector(root_directory: Union[str, Path]) -> None:
    """Run the embedding projector on ALL the trained embeddings in the output/word2vec folder.

    Args:
        root_directory: The directory containing all the embedding checkpoints.
    """
    root_directory = Path(root_directory)
    embeddings_list = [
        WordEmbeddings(checkpoint_filepath=path, name_metadata=path.stem)
        for path in root_directory.glob('*/')
    ]
    # We can't have no embeddings!
    assert len(embeddings_list) > 0
    embedding_projector(embeddings_list)
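A hypothetical call to the helper above; 'output/word2vec' is a placeholder for a directory containing one checkpoint sub-directory per trained embedding.

run_embedding_projector('output/word2vec')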
Example #9
def build_k_hop_graph(embeddings: WordEmbeddings,
                      target_word: str,
                      k: int,
                      alpha: Optional[float] = 0.50) -> nx.Graph:
    """Builds the k-hop graph for a word embeddings space.

    Args:
        embeddings: The word embeddings to generate the graph for.
        target_word: The word of interest.
        k: The number of 'hops' between the word of interest and every node
            in the graph. The resultant graph has the property that the word
            of interest is reachable from any node in at most k edges.
        alpha: The similarity threshold. Words that have a cosine similarity
            of at least this threshold are kept, and the rest are discarded.
    """
    # Verify the alpha threshold is <= max(similarity between interest word).
    max_alpha = embeddings.most_similar(target_word, k=1)[0][1]
    if alpha > max_alpha:
        raise ValueError(
            'Alpha threshold too high! The word of interest was not included '
            'in the graph. For the given target word, '
            '\'{}\', alpha can be AT MOST {}!'.format(target_word, max_alpha))

    graph = build_infinity_hop_graph(embeddings, alpha)

    # Get the word index of the word of interest.
    T = embeddings._vocabulary[target_word]

    # Compute the shortest paths from the word of interest to all reachable nodes.
    logger.info('Computing shortest paths')
    paths = nx.single_source_shortest_path_length(graph, T)

    logger.info('Building k-hop graph')
    nodes_to_delete = set()
    for node in tqdm.tqdm(graph.nodes):
        # Remove the node if the word of interest is not reachable in at most k edges.
        if node not in paths or paths[node] > k:
            nodes_to_delete.add(node)

    for node in nodes_to_delete:
        graph.remove_node(node)

    logger.info('Generated k-hop graph (nodes: {}, edges: {})'.format(
        len(graph.nodes), len(graph.edges)))
    return graph
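A hypothetical usage sketch, assuming embeddings is an already-loaded WordEmbeddings instance and 'network' is a word in its vocabulary; the returned object is a regular networkx graph.

graph = build_k_hop_graph(embeddings, 'network', k=2, alpha=0.6)
print('nodes:', graph.number_of_nodes(), 'edges:', graph.number_of_edges())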
Example #10
def main():
    print('Starting test model...')

    with tf.Graph().as_default() as g:
        #session = tf.Session()
        with tf.Session() as session:
            # saver = tf.train.Saver(tf.all_variables())
            model = Classifier()
            # model.session.run(tf.global_variables_initializer())
            saver = tf.train.import_meta_graph(model_path + '.meta')
            saver.restore(model.session, save_path='/tmp/' + model_path)
            # model.session.run(tf.global_variables_initializer())
            print('Model restored.')

            # print([v.op.name for v in tf.all_variables()])

            # print(model.session.run())

            # TODO complete this portion (feed in our own data, command line interface)
            embeddings = WordEmbeddings()

            while True:
                headline = input('Headline? ')
                article = input('Article? ')
                true_label = input('True label? ')

                h, a, t, l_h, l_a = get_articles_word_vectors(
                    headline, article, true_label, embeddings)
                g_f = generate_features(headline, article)

                pred_stances = model.session.run(
                    [model.pred_stance, model.train_fn],
                    {
                        model.inputs_articles: a,  # INSERT EMBEDDING
                        model.inputs_headlines: h,  # INSERT EMBEDDING
                        model.outputs: t,  # INSERT EMBEDDING
                        model.h_lengths: l_h,
                        model.a_lengths: l_a,
                        model.global_feats: g_f
                    })[0]

                print('predicted label = ' + str(LABELS[pred_stances[0]]) +
                      '\n')
Example #11
 def __init__(self):
     Helpers.__init__(self)
     WordEmbeddings.__init__(self)
     FeatureEngineering.__init__(self)
Example #12
# -*- coding: utf-8 -*-

from random import shuffle
from chinese import dist, silouhette_coefficent
from node import Node
from evento import Event
from etiquetadoeventos import Etiquetado_eventos
from word_embeddings import WordEmbeddings
from functools import reduce
from collections import Counter
import math

we = WordEmbeddings()

#Chinese Whispers Algorithm
# def CWA(event_list):
#     for i in range(len(event_list)):
#         event_list[i].type = i
#     not_converged = True
#     iteracion = 0
#     while not_converged:
#         # for e in event_list:
#         #     print(e)
#         iteracion += 1
#         print("==ITERACION==>"+str(iteracion))
#         shuffle(event_list)
#         clases_antes = [event.type for event in event_list]
#         for i in range(len(event_list)):
#             max_similarity = -1
#             for j in range(len(event_list)):
#                 if i != j:
Example #13
    def load_models(self, models_folder, w2v_folder):
        self.logger.info(u'Loading models from {}'.format(models_folder))
        self.models_folder = models_folder

        # Load the shared parameters for the neural network models
        with open(os.path.join(models_folder, 'qa_model_selector.config'),
                  'r') as f:
            model_config = json.load(f)
            self.max_inputseq_len = model_config['max_inputseq_len']
            self.wordchar2vector_path = self.get_model_filepath(
                models_folder, model_config['wordchar2vector_path'])
            self.PAD_WORD = model_config['PAD_WORD']
            self.word_dims = model_config['word_dims']

        self.qa_model_config = model_config

        # TODO: move the choice of the concrete implementation for each model type into the base
        # class by inspecting the 'engine' field of the model config. Neural network models will
        # have the value 'nn' there and gradient boosting models 'xgb'. This removes the
        # unnecessary coupling between this class and the concrete model implementations.

        # Premise-to-question relevancy detection based on an XGB model
        #self.relevancy_detector = XGB_RelevancyDetector()
        self.relevancy_detector = LGB_RelevancyDetector()
        self.relevancy_detector.load(models_folder)

        # Model that determines whether two phrases are synonymous
        #self.synonymy_detector = NN_SynonymyDetector()
        #self.synonymy_detector.load(models_folder)
        self.synonymy_detector = Jaccard_SynonymyDetector()

        self.interpreter = NN_Interpreter()
        self.interpreter.load(models_folder)

        # Determines whether the set of premises is sufficient to answer the question
        self.enough_premises = NN_EnoughPremisesModel()
        self.enough_premises.load(models_folder)

        # Composite model (a group of models) that generates the answer text
        self.answer_builder = AnswerBuilder()
        self.answer_builder.load_models(models_folder)

        # XGB-based grammatical person classifier
        self.person_classifier = XGB_PersonClassifierModel()
        self.person_classifier.load(models_folder)

        # Neural network model for changing the grammatical person
        self.person_changer = NN_PersonChange()
        self.person_changer.load(models_folder)

        # Load the word vector models
        self.word_embeddings = WordEmbeddings()
        self.word_embeddings.load_models(models_folder)
        self.word_embeddings.load_wc2v_model(self.wordchar2vector_path)
        for p in self.answer_builder.get_w2v_paths():
            p = os.path.join(w2v_folder, os.path.basename(p))
            self.word_embeddings.load_w2v_model(p)

        self.word_embeddings.load_w2v_model(
            os.path.join(w2v_folder,
                         os.path.basename(
                             self.enough_premises.get_w2v_path())))
        self.logger.debug('All models loaded')
Example #14
from node import Node
from word_embeddings import WordEmbeddings
from ancora import Ancora
from ancora_enum import SRL
from evento import Event

ancora = Ancora("ancora.vbs")
word_embeddings = WordEmbeddings()


def main(node, sn_root_list):
    events_table = {}
    event_extraction(events_table, node, None)
    sentenceList = []

    # resolve coreference
    for event in events_table.values():
        if event.subj is not None and is_pronoun(event.subj):
            event.subj = resolve_coref(event, sn_root_list, node)

    # augment subjects
    for verb_id, event in events_table.items():
        if event.subj is None and event.parent_verb is not None:
            augment_subject(events_table, verb_id, event.parent_verb.id, node)

    list_bow = [(event.bag_of_words(), event)
                for event in events_table.values()]
    erase_list = []
    for i, duple in enumerate(list_bow):
        for j in range(0, len(list_bow)):
            if i != j:
Example #15
from flask import request

from sentence_embeddings import get_sentence_embedding, operations
from word_embeddings import WordEmbeddings

ukp_embeddings_share_url = 'https://public.ukp.informatik.tu-darmstadt.de/arxiv2018-xling-sentence-embeddings'
ukp_xling_embeddings_share_url = '{}/xling-wordembeddings'.format(
    ukp_embeddings_share_url)
ukp_monoling_embeddings_share_url = '{}/monolingual-wordembeddings'.format(
    ukp_embeddings_share_url)

embeddings = {
    'en-de': [
        WordEmbeddings('mapped_bivcd_en_de',
                       ukp_xling_embeddings_share_url,
                       'mapped_bivcd_en_de.txt.gz',
                       approximate_filesize=101035275,
                       file_n_lines=86761,
                       lowercased=True),
        WordEmbeddings('mapped_attract_repel_en_de',
                       ukp_xling_embeddings_share_url,
                       'mapped_attract_repel_en_de.txt.gz',
                       approximate_filesize=270155631,
                       file_n_lines=234036,
                       lowercased=True),
        WordEmbeddings(
            # this is the small FT version. See full version in the comments below
            'mapped_fasttext_en_de',
            ukp_xling_embeddings_share_url,
            'mapped_fasttext_300k_en_de.txt.gz',
            approximate_filesize=680901729,
            file_n_lines=599959,
Example #16
def es_similar_we(arg_1, arg_2):
    res = WordEmbeddings().similar_words(arg_1.form(), arg_2.form())
    if arg_1.tag[0] == arg_2.tag[0] == 'v':
        res *= 0.2
    return res
Example #17
def load_embeddings(wordchar2vector_path, word2vector_path, computed_params):
    embeddings = WordEmbeddings.load_word_vectors(wordchar2vector_path,
                                                  word2vector_path)
    computed_params['word_dims'] = embeddings.get_vector_size()
    computed_params['word2vec'] = embeddings
    return embeddings
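A hypothetical usage sketch for load_embeddings above; both file paths are placeholders for the wordchar2vector and word2vector files expected by the surrounding project.

computed_params = {}
embeddings = load_embeddings('wordchar2vector.dat', 'w2v.bin', computed_params)
print(computed_params['word_dims'])  # vector size reported by the loaded embeddings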
Example #18
def main():
    # set up data
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)
    embeddings = WordEmbeddings()
    print('Created data set and word embeddings')

    # create classifier
    model = Classifier()
    print('Set up model')

    # get word vector data
    x_articles = {}
    x_headlines = {}
    y_vals = {}

    for fold in fold_stances:
        x_headlines[fold], x_articles[fold], y_vals[
            fold] = get_articles_word_vectors(fold_stances[fold], d,
                                              embeddings)

    test_x_headlines, test_x_articles, test_y = get_articles_word_vectors(
        hold_out_stances, d, embeddings)
    print('Finished separating folds')

    # TODO get global feature data

    # train LSTM (fold -> epoch -> batch)
    model.session.run(tf.global_variables_initializer())

    for fold in fold_stances:
        ids = list(range(len(folds)))
        del ids[fold]
        x_train_articles = np.vstack(tuple([x_articles[i] for i in ids]))
        x_train_headlines = np.vstack(tuple([x_headlines[i] for i in ids]))
        y_train = np.vstack(tuple([y_vals[i] for i in ids]))
        print('train articles shape = ' + str(x_train_articles.shape))
        print('train headlines shape = ' + str(x_train_headlines.shape))
        print('y train shape = ' + str(y_train.shape))

        x_valid_articles = x_articles[fold]
        x_valid_headlines = x_headlines[fold]
        y_valid = y_vals[fold]

        fold_error = 0
        print('Training fold ' + str(fold))
        for epoch in range(10):

            batch_size = 512
            article_batches = []
            headline_batches = []
            output_batches = []

            start = 0
            while start < len(x_train_articles):
                article_chunk = x_train_articles[start:start + batch_size]
                headline_chunk = x_train_headlines[start:start + batch_size]
                output_chunk = y_train[start:start + batch_size]
                article_batches.append(article_chunk)
                headline_batches.append(headline_chunk)
                output_batches.append(output_chunk)
                start += batch_size

            for i in range(len(article_batches)):
                # Training error
                print("inputs_a: " + str(type(article_batches[i])))
                print("inputs_h: " + str(headline_batches[i]))
                print("inputs_a: " + str(article_batches[i].shape))
                print("inputs_h: " + str(headline_batches[i].shape))
                print("outputs: " + str(output_batches[i][:, -1, :].shape))
                epoch_error = model.session.run(
                    [model.error, model.train_fn], {
                        model.inputs_articles: article_batches[i],
                        model.inputs_headlines: headline_batches[i],
                        model.outputs: output_batches[i][:, -1, :]
                    })[0]
                if i % 10 == 0:
                    print('\tEpoch error for batch ' + str(i) + ' = ' +
                          str(epoch_error))

                fold_error += epoch_error

        print('Training error (fold) = ' + str(fold_error / 10.0) + '\n')

        # cross-validation error
        valid_accuracy, pred_y_stances = model.session.run(
            [model.accuracy, model.pred_stance], {
                model.inputs_articles: x_valid_articles,
                model.inputs_headlines: x_valid_headlines,
                model.outputs: y_valid[:, -1, :]
            })

    # assess performance on validation set
    print('\n#### RUNNING ON HOLDOUT SET ####')

    test_accuracy, pred_y_stances = model.session.run(
        [model.accuracy, model.pred_stance], {
            model.inputs_articles: test_x_articles,
            model.inputs_headlines: test_x_headlines,
            model.outputs: test_y[:, -1, :]
        })

    simple_y = np.array([array[0].tolist().index(1) for array in test_y])
    f1_score = metrics.f1_score(simple_y, pred_y_stances, average='macro')
    print("F1 MEAN score: " + str(f1_score))
    f1_score_labels = metrics.f1_score(simple_y,
                                       pred_y_stances,
                                       labels=[0, 1, 2, 3],
                                       average=None)
    print("F1 LABEL scores: " + str(f1_score_labels))

    # Convert to string labels for FNC scoring metric
    label_map = {0: "agree", 1: "disagree", 2: "discuss", 3: "unrelated"}
    simple_y_str = [label_map[label] for label in simple_y]
    pred_y_stances_str = [label_map[label] for label in pred_y_stances]
    report_score(simple_y_str, pred_y_stances_str)
Example #19
def main():
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)
    embeddings = WordEmbeddings()
    print('Created data set and word embeddings')

    # with tf.Graph().as_default() as g:
    # create classifier
    model = Classifier()
    print('Set up model')

    with model.graph.as_default():
        # get word vector data
        x_articles = {}
        x_headlines = {}
        y_vals = {}
        lengths_a = {}
        lengths_h = {}

        x_global = {}
        y_global = {}

        for fold in fold_stances:
            x_headlines[fold], x_articles[fold], y_vals[fold], lengths_h[
                fold], lengths_a[fold] = get_articles_word_vectors(
                    fold_stances[fold], d, embeddings)
            x_global[fold], y_global[fold] = generate_features(
                fold_stances[fold], d, str(fold))

        test_x_headlines, test_x_articles, test_y, test_h_lengths, test_a_lengths = get_articles_word_vectors(
            hold_out_stances, d, embeddings)
        test_x_global, test_y_global = generate_features(
            hold_out_stances, d, 'holdout')
        print('Finished separating folds')

        # TODO get global feature data

        # train LSTM (fold -> epoch -> batch)
        # model.session.run(tf.initialize_all_variables())
        saver = tf.train.Saver(tf.all_variables())
        '''
		saver = tf.train.Saver({'cell_articles_fw' : model.cell_articles_fw,
														'cell_articles_bw' : model.cell_articles_bw,
														'cell_headlines_fw' : model.cell_headlines_fw,
														'cell_headlines_bw' : model.cell_headlines_bw
													})
		'''

        for fold in fold_stances:
            ids = list(range(len(folds)))
            del ids[fold]
            x_train_articles = np.vstack(tuple([x_articles[i] for i in ids]))
            x_train_headlines = np.vstack(tuple([x_headlines[i] for i in ids]))
            y_train = np.vstack(tuple([y_vals[i] for i in ids]))
            lengths_h_train = np.vstack(tuple([lengths_h[i] for i in ids]))
            lengths_a_train = np.vstack(tuple([lengths_a[i] for i in ids]))
            global_train = np.vstack(tuple([x_global[i] for i in ids]))
            # print('train articles shape = ' + str(x_train_articles.shape))
            # print('train headlines shape = ' + str(x_train_headlines.shape))
            # print('y train shape = ' + str(y_train.shape))

            x_valid_articles = x_articles[fold]
            x_valid_headlines = x_headlines[fold]
            y_valid = y_vals[fold]
            length_h_valid = lengths_h[fold]
            length_a_valid = lengths_a[fold]
            global_valid = x_global[fold]
            # Training batches
            article_batches_train, headline_batches_train, output_batches_train, length_h_batches_train, length_a_batches_train, global_batches_train = create_batches(
                x_train_articles, x_train_headlines, y_train, lengths_h_train,
                lengths_a_train, global_train)

            fold_error = 0
            print('Training fold ' + str(fold))
            j = 0
            for epoch in range(5):
                '''
				# Training batches
				article_batches_train,headline_batches_train,output_batches_train,length_h_batches_train,length_a_batches_train, global_batches_train = create_batches(x_train_articles, 
				x_train_headlines, 
				y_train, 
				lengths_h_train, 
				lengths_a_train, 
				global_train)
				'''

                print(len(article_batches_train))

                for i in range(len(article_batches_train)):
                    # Training error
                    epoch_error = model.session.run(
                        [model.error, model.train_fn], {
                            model.inputs_articles: article_batches_train[i],
                            model.inputs_headlines: headline_batches_train[i],
                            model.outputs: output_batches_train[i],
                            model.h_lengths: length_h_batches_train[i],
                            model.a_lengths: length_a_batches_train[i],
                            model.global_feats: global_batches_train[i]
                        })[0]
                    print('\tEpoch ' + str(j) + ' error = ' + str(epoch_error))

                    fold_error += epoch_error
                    j += 1

            print('Training error (fold) = ' + str(fold_error / j) + '\n')
            print('LSTM Cell Weights')
            print('\tFW articles ' + str(model.rnn_states_articles[0]))
            print('\tBW articles ' + str(model.rnn_states_articles[1]))
            print('\tFW headlines ' + str(model.rnn_states_headlines[0]))
            print('\tBW headlines ' + str(model.rnn_states_headlines[1]))

            # Validation batches
            article_batches_valid, headline_batches_valid, output_batches_valid, length_h_batches_valid, length_a_batches_valid, global_batches_valid = create_batches(
                x_valid_articles, x_valid_headlines, y_valid, length_h_valid,
                length_a_valid, global_valid)

            all_pred_y_stances = []
            for i in range(len(article_batches_valid)):
                # cross-validation error
                pred_y_stances = model.session.run(
                    [model.pred_stance], {
                        model.inputs_articles: article_batches_valid[i],
                        model.inputs_headlines: headline_batches_valid[i],
                        model.outputs: output_batches_valid[i],
                        model.h_lengths: length_h_batches_valid[i],
                        model.a_lengths: length_a_batches_valid[i],
                        model.global_feats: global_batches_valid[i]
                    })
                all_pred_y_stances = np.append(all_pred_y_stances,
                                               pred_y_stances)

            simple_y = np.array([array.tolist().index(1) for array in y_valid])
            '''
			f1_score = metrics.f1_score(simple_y, pred_y_stances, average='macro')
			print("F1 MEAN score: " + str(f1_score))
			f1_score_labels =  metrics.f1_score(simple_y, pred_y_stances, labels=[0, 1, 2, 3], average=None)
			print("F1 LABEL scores: " + str(f1_score_labels))
			'''
            # Convert to string labels for FNC scoring metric
            label_map = {
                0: "agree",
                1: "disagree",
                2: "discuss",
                3: "unrelated"
            }
            simple_y_str = [label_map[label] for label in simple_y]
            pred_y_stances_str = [
                label_map[label] for label in all_pred_y_stances
            ]
            report_score(simple_y_str, pred_y_stances_str)

        # assess performance on test set
        print('\n#### RUNNING ON HOLDOUT SET ####')

        # Test batches
        article_batches_test, headline_batches_test, output_batches_test, length_h_batches_test, length_a_batches_test, global_batches_test = create_batches(
            test_x_articles, test_x_headlines, test_y, test_h_lengths,
            test_a_lengths, test_x_global)

        all_pred_y_test = []
        for i in range(len(article_batches_test)):
            pred_y_stances = model.session.run(
                [model.pred_stance], {
                    model.inputs_articles: article_batches_test[i],
                    model.inputs_headlines: headline_batches_test[i],
                    model.outputs: output_batches_test[i],
                    model.h_lengths: length_h_batches_test[i],
                    model.a_lengths: length_a_batches_test[i],
                    model.global_feats: global_batches_test[i]
                })
            all_pred_y_test = np.append(all_pred_y_test, pred_y_stances)

        simple_y = np.array([array.tolist().index(1) for array in test_y])
        f1_score = metrics.f1_score(simple_y, all_pred_y_test, average='macro')
        print("F1 MEAN score: " + str(f1_score))
        f1_score_labels = metrics.f1_score(simple_y,
                                           all_pred_y_test,
                                           labels=[0, 1, 2, 3],
                                           average=None)
        print("F1 LABEL scores: " + str(f1_score_labels))

        # Convert to string labels for FNC scoring metric
        label_map = {0: "agree", 1: "disagree", 2: "discuss", 3: "unrelated"}
        simple_y_str = [label_map[label] for label in simple_y]
        pred_y_stances_str = [label_map[label] for label in all_pred_y_test]
        report_score(simple_y_str, pred_y_stances_str)

        h, b = [], []
        for stance in hold_out_stances:
            h.append(stance['Headline'])
            b.append(d.articles[stance['Body ID']])

        b = [" ".join(body) for body in b]

        print('### CLASSIFICATIONS ###')
        for i in range(len(b)):
            print('Pair ' + str(i))
            print('\tHeadline: ' + str(h[i]))
            print('\tBody: ' + str(b[i]))
            print('\tTrue label: ' + str(simple_y_str[i]))
            print('\tAssigned label: ' + str(pred_y_stances_str[i]))

        saver.save(model.session, '/tmp/' + model_path)
Example #20
class SimpleAnsweringMachine(BaseAnsweringMachine):
    """
    Chatbot built from a set of neural network and other models (https://github.com/Koziev/chatbot).
    """
    def __init__(self, facts_storage, text_utils):
        super(SimpleAnsweringMachine, self).__init__()
        self.facts_storage = facts_storage
        self.trace_enabled = False
        self.session_factory = SimpleDialogSessionFactory(self.facts_storage)
        self.text_utils = text_utils
        self.logger = logging.getLogger('SimpleAnsweringMachine')
        self.scripting = None
        self.enable_smalltalk = False
        self.enable_scripting = False

        # If the relevancy of a knowledge-base fact to the question is below this threshold,
        # the fact will not be used to generate a fact-based answer.
        self.min_premise_relevancy = 0.3

    def get_model_filepath(self, models_folder, old_filepath):
        """
        For internal use: rewrites the absolute path to a model data file
        so that it points into the specified folder.
        """
        _, tail = os.path.split(old_filepath)
        return os.path.join(models_folder, tail)

    def load_models(self, models_folder, w2v_folder):
        self.logger.info(u'Loading models from {}'.format(models_folder))
        self.models_folder = models_folder

        # Load the shared parameters for the neural network models
        with open(os.path.join(models_folder, 'qa_model_selector.config'),
                  'r') as f:
            model_config = json.load(f)
            self.max_inputseq_len = model_config['max_inputseq_len']
            self.wordchar2vector_path = self.get_model_filepath(
                models_folder, model_config['wordchar2vector_path'])
            self.PAD_WORD = model_config['PAD_WORD']
            self.word_dims = model_config['word_dims']

        self.qa_model_config = model_config

        # TODO: move the choice of the concrete implementation for each model type into the base
        # class by inspecting the 'engine' field of the model config. Neural network models will
        # have the value 'nn' there and gradient boosting models 'xgb'. This removes the
        # unnecessary coupling between this class and the concrete model implementations.

        # Premise-to-question relevancy detection based on an XGB model
        #self.relevancy_detector = XGB_RelevancyDetector()
        self.relevancy_detector = LGB_RelevancyDetector()
        self.relevancy_detector.load(models_folder)

        # Model that determines whether two phrases are synonymous
        #self.synonymy_detector = NN_SynonymyDetector()
        #self.synonymy_detector.load(models_folder)
        self.synonymy_detector = Jaccard_SynonymyDetector()

        self.interpreter = NN_Interpreter()
        self.interpreter.load(models_folder)

        # Determines whether the set of premises is sufficient to answer the question
        self.enough_premises = NN_EnoughPremisesModel()
        self.enough_premises.load(models_folder)

        # Composite model (a group of models) that generates the answer text
        self.answer_builder = AnswerBuilder()
        self.answer_builder.load_models(models_folder)

        # XGB-based grammatical person classifier
        self.person_classifier = XGB_PersonClassifierModel()
        self.person_classifier.load(models_folder)

        # Neural network model for changing the grammatical person
        self.person_changer = NN_PersonChange()
        self.person_changer.load(models_folder)

        # Load the word vector models
        self.word_embeddings = WordEmbeddings()
        self.word_embeddings.load_models(models_folder)
        self.word_embeddings.load_wc2v_model(self.wordchar2vector_path)
        for p in self.answer_builder.get_w2v_paths():
            p = os.path.join(w2v_folder, os.path.basename(p))
            self.word_embeddings.load_w2v_model(p)

        self.word_embeddings.load_w2v_model(
            os.path.join(w2v_folder,
                         os.path.basename(
                             self.enough_premises.get_w2v_path())))
        self.logger.debug('All models loaded')

    def set_scripting(self, scripting):
        self.scripting = scripting

    def start_conversation(self, interlocutor):
        """
        Start of the bot's conversation with interlocutor. No utterances have been made yet.
        The bot may greet the interlocutor or remind them of something if the session
        contains a pending reminder, etc. The phrase to show the interlocutor is put into
        the output phrase buffer via the say method; the outer processing loop then
        extracts it from there and prints it to the console, etc.

        :param interlocutor: string identifier of the interlocutor.
        :return: the reply string that the bot will say.
        """
        session = self.get_session(interlocutor)
        if self.scripting is not None and self.enable_scripting:
            phrase = self.scripting.start_conversation(self, session)
            if phrase is not None:
                self.say(session, phrase)

    def change_person(self, phrase, target_person):
        return self.person_changer.change_person(phrase, target_person,
                                                 self.text_utils,
                                                 self.word_embeddings)

    def get_session_factory(self):
        return self.session_factory

    def is_question(self, phrase):
        return phrase[-1] == u'?'

    def interpret_phrase(self, session, raw_phrase):
        interpreted = InterpretedPhrase(raw_phrase)
        phrase = raw_phrase
        phrase_is_question = self.is_question(raw_phrase)

        # the phrase history is available in session as conversation_history

        if len(session.conversation_history) > 0\
            and session.conversation_history[-1].is_bot_phrase\
            and session.conversation_history[-1].is_question\
            and not phrase_is_question\
            and self.interpreter is not None:
            # This branch handles the case where the bot asked a question and
            # the interlocutor gave a short answer. Using a dedicated model we
            # try to reconstruct the full text of the interlocutor's answer.
            context_phrases = []
            context_phrases.append(
                session.conversation_history[-1].interpretation)
            context_phrases.append(raw_phrase)
            phrase = self.interpreter.interpret(context_phrases,
                                                self.text_utils,
                                                self.word_embeddings)

            # determine the grammatical person of the resulting interpretation
            person = self.person_classifier.detect_person(
                phrase, self.text_utils, self.word_embeddings)

            if person == '2s':  # e.g. the interpretation "Your name is Ilya" was derived from "My name is Ilya"
                person = '1s'
            elif person == '1s':
                person = '2s'

            if self.trace_enabled:
                self.logger.debug('detected person={}'.format(person))
        else:
            # determine the grammatical person of the entered sentence.
            person = self.person_classifier.detect_person(
                raw_phrase, self.text_utils, self.word_embeddings)
            if self.trace_enabled:
                self.logger.debug('detected person={}'.format(person))

            # A change of grammatical person may be required.
            if person == '1s':
                phrase = self.change_person(raw_phrase, '2s')
            elif person == '2s':
                phrase = self.change_person(raw_phrase, '1s')

        interpreted.interpretation = phrase
        interpreted.is_question = phrase_is_question
        interpreted.phrase_person = person
        return interpreted

    def say(self, session, answer):
        answer_interpretation = InterpretedPhrase(answer)
        answer_interpretation.is_bot_phrase = True
        answer_interpretation.is_question = self.is_question(answer)
        session.add_to_buffer(answer)
        session.add_phrase_to_history(answer_interpretation)

    def push_phrase(self, interlocutor, phrase):
        question = self.text_utils.canonize_text(phrase)
        if question == u'#traceon':
            self.trace_enabled = True
            return
        elif question == u'#traceoff':
            self.trace_enabled = False
            return
        elif question == u'#facts':
            for fact, person, fact_id in self.facts_storage.enumerate_facts(
                    interlocutor):
                print(u'{}'.format(fact))
            return

        session = self.get_session(interlocutor)

        # Interpret the phrase taking previously received phrases into account,
        # so that we can resolve anaphora, make elided constituents explicit, etc.,
        # and determine whether the phrase is a question, a fact or an imperative statement.
        interpreted_phrase = self.interpret_phrase(session, question)

        # Interpreting phrases, and reacting to them in general, also depends on the
        # conversation history, so the interpretation result is added to the history right away.
        session.add_phrase_to_history(interpreted_phrase)

        answer_generated = False

        if not interpreted_phrase.is_question:
            # Add the statement as a fact to the knowledge base, into the section
            # for the current interlocutor.
            # TODO: facts about third parties should go into the shared section of the
            # knowledge base, not into the current interlocutor's section.
            fact_person = '3'
            if interpreted_phrase.phrase_person == '1s':
                fact_person = '2s'
            elif interpreted_phrase.phrase_person == '2s':
                fact_person = '1s'
            fact = interpreted_phrase.interpretation
            if self.trace_enabled:
                print(u'Adding [{}] to knowledge base'.format(fact))
            self.facts_storage.store_new_fact(
                interlocutor, (fact, fact_person, '--from dialogue--'))

            if self.scripting is not None and self.enable_scripting:
                answer = self.scripting.generate_response4nonquestion(
                    self, interlocutor, interpreted_phrase)
                if answer is not None:
                    answer_generated = True

            if not answer_generated:
                if self.enable_smalltalk:
                    # pick a suitable reply to the interlocutor's non-question (usually it is
                    # an answer to a question we asked earlier).
                    smalltalk_phrases = self.facts_storage.enumerate_smalltalk_replicas(
                    )
                    best_premise, best_rel = self.synonymy_detector.get_most_similar(
                        interpreted_phrase.interpretation,
                        [(item.query, -1, -1) for item in smalltalk_phrases],
                        self.text_utils, self.word_embeddings)

                    # if the relevancy of the found reply is too low, a different algorithm is needed...
                    for item in smalltalk_phrases:
                        if item.query == best_premise:
                            # pick a random answer option
                            # TODO: refine the choice by selecting the most relevant option, so that
                            # the returned reply takes into account either the current discourse, or ???...
                            # Note that the reply phrases in SmalltalkReplicas may be unnormalized,
                            # so they should be normalized first.
                            answer = np.random.choice(item.answers)
                            answer_generated = True
                            break

            if answer_generated:
                self.say(session, answer)
        else:
            # handle the question
            answers = self.build_answers(interlocutor, interpreted_phrase)
            for answer in answers:
                self.say(session, answer)

            # Besides the answer to the question, another reply may need to be produced,
            # for example to change the topic of the conversation.
            if len(answers) > 0:
                if self.scripting is not None and self.enable_scripting:
                    additional_speech = self.scripting.generate_after_answer(
                        self, interlocutor, interpreted_phrase, answers[-1])
                    if additional_speech is not None:
                        self.say(session, additional_speech)

    def build_answers0(self, interlocutor, interpreted_phrase):
        if self.trace_enabled:
            self.logger.debug(u'Question to process={}'.format(
                interpreted_phrase.interpretation))

        # Is a premise needed to answer the question?
        # Use a model that returns the probability that an empty
        # list of premises is sufficient.
        p_enough = self.enough_premises.is_enough(
            premise_str_list=[],
            question_str=interpreted_phrase.interpretation,
            text_utils=self.text_utils,
            word_embeddings=self.word_embeddings)
        if p_enough > 0.5:
            # A single answer can be built without a premise, e.g. for the question "What is 2 plus 2?"
            answer_rel = p_enough
            answers, answer_rels = self.answer_builder.build_answer_text(
                [u''], [1.0], interpreted_phrase.interpretation,
                self.text_utils, self.word_embeddings)
            if len(answers) != 1:
                self.logger.debug(
                    u'Exactly 1 answer was expected for question={}, got {}'.
                    format(interpreted_phrase.interpretation, len(answers)))

            return answers, answer_rels

        else:
            # find the most relevant premise
            memory_phrases = list(
                self.facts_storage.enumerate_facts(interlocutor))

            best_premises, best_rels = self.relevancy_detector.get_most_relevant(
                interpreted_phrase.interpretation,
                memory_phrases,
                self.text_utils,
                self.word_embeddings,
                nb_results=3)
            if self.trace_enabled:
                self.logger.debug(
                    u'Best premise is "{}" with relevancy={}'.format(
                        best_premises[0], best_rels[0]))

            premises2 = []
            premise_rels2 = []
            max_rel = max(best_rels)
            for premise, rel in itertools.izip(best_premises, best_rels):
                if rel >= self.min_premise_relevancy and rel >= 0.5 * max_rel:
                    premises2.append([premise])
                    premise_rels2.append(rel)

            # generate the answer based on the selected premises.
            answers, answer_rels = self.answer_builder.build_answer_text(
                premises2, premise_rels2, interpreted_phrase.interpretation,
                self.text_utils, self.word_embeddings)

            return answers, answer_rels

    def build_answers(self, interlocutor, interpreted_phrase):
        answers, answer_confidenses = self.build_answers0(
            interlocutor, interpreted_phrase)
        if len(answer_confidenses
               ) == 0 or max(answer_confidenses) < self.min_premise_relevancy:
            # Here we need an algorithm for generating an answer when the bot has
            # no suitable facts. This can be an "I don't know" answer, or a "no"
            # answer for certain categories of questions.
            if self.scripting is not None:
                answer = self.scripting.buid_answer(self, interlocutor,
                                                    interpreted_phrase)
                answers = [answer]

        return answers

    def pop_phrase(self, interlocutor):
        session = self.get_session(interlocutor)
        return session.extract_from_buffer()

    def get_session(self, interlocutor):
        return self.session_factory[interlocutor]
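A hypothetical usage sketch for SimpleAnsweringMachine above; facts_storage, text_utils, the folder paths and the interlocutor id are placeholders, and the assumption that pop_phrase returns an empty value once the output buffer is drained is not confirmed by the snippet.

machine = SimpleAnsweringMachine(facts_storage, text_utils)
machine.load_models('../tmp', '../data/w2v')
machine.push_phrase('user1', u'...')   # feed one phrase from the interlocutor
reply = machine.pop_phrase('user1')    # fetch the bot's reply, if any
while reply:
    print(reply)
    reply = machine.pop_phrase('user1')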
Example #21
import pickle
import operator
from collections import Counter
from word_embeddings import WordEmbeddings
from nltk import word_tokenize


def findKMostFrequentWords(data_x, k):
    data = []
    for ele in data_x:
        sentence = ele[1]
        sentence = word_tokenize(sentence)
        data.append(sentence)
    ctr = Counter(tuple([word for sublist in data for word in sublist]))
    sorted_ctr = sorted(ctr.items(), key=operator.itemgetter(1), reverse=True)
    return [item[0] for item in sorted_ctr[0:k]]


full_embeddings = WordEmbeddings()
full_embeddings.create_embeddings_from_file(args.embedding_path)

words = findKMostFrequentWords(train_dataset.final_data, args.vocab_size)
reduced_embeddings = WordEmbeddings()
reduced_embeddings.create_reduced_embeddings(full_embeddings, words)

Example #22
def visualise_k_hop_graph(target_word: str,
                          checkpoint: Optional[Union[str, Path]] = None,
                          weights_filepath: Optional[Union[str, Path]] = None,
                          vocab_filepath: Optional[Union[str, Path]] = None,
                          k: Optional[int] = 2,
                          alpha: Optional[float] = None,
                          min_node_size: Optional[float] = 20,
                          max_node_size: Optional[float] = 120,
                          min_font_size: Optional[float] = 6,
                          max_font_size: Optional[float] = 24,
                          node_alpha: Optional[float] = 1,
                          edge_alpha: Optional[float] = 0.15,
                          target_word_label_colour: Optional[str] = 'black',
                          colour_map: Optional[str] = 'tab20c',
                          output_path: Optional[Union[str, Path]] = None,
                          figure_width: Optional[int] = 800,
                          figure_height: Optional[int] = 600,
                          figure_dpi: Optional[int] = 96,
                          export_dpi: Optional[int] = 96,
                          verbose: Optional[bool] = False) -> None:
    """Visualise the k-hop graph for the given word embeddings and interest word.
    Requires one of checkpoint / (weights_filepath and vocab_filepath).

    If output_path is specified, then no preview window is drawn.
    """
    # Ensure that at least one data argument was provided
    if checkpoint is None and weights_filepath is None and vocab_filepath is None:
        logger.error(
            'One of checkpoint / (weights-filepath and vocab-filepath) is required!'
        )
        exit(1)

    if checkpoint is not None:
        checkpoint = Path(checkpoint)
        weights_filepath = checkpoint / 'proj_weights.npy'
        vocab_filepath = checkpoint / 'vocab.txt'
    else:
        weights_filepath = Path(weights_filepath)
        vocab_filepath = Path(vocab_filepath)

    if not verbose:
        logger.setLevel(logging.ERROR)

    embeddings = WordEmbeddings(weights_filepath,
                                vocab_filepath,
                                name_metadata=weights_filepath.parent.stem)

    figsize = (figure_width / figure_dpi, figure_height / figure_dpi)
    plt.figure(figsize=figsize, dpi=figure_dpi)

    draw_k_hop_graph(embeddings,
                     target_word,
                     k,
                     alpha=alpha,
                     min_node_size=min_node_size,
                     max_node_size=max_node_size,
                     min_font_size=min_font_size,
                     max_font_size=max_font_size,
                     node_alpha=node_alpha,
                     edge_alpha=edge_alpha,
                     target_word_label_colour=target_word_label_colour,
                     community_colour_map=colour_map)

    # Show the plot, or output it, depending on the mode.
    plt.axis('off')
    if not output_path:
        plt.show()
    else:
        output_path = Path(output_path)

        output_format = (output_path.suffix or 'png').replace('.', '')
        output_path.parent.mkdir(parents=True, exist_ok=True)
        if output_format == 'tex' or output_format == 'latex':
            tikzplotlib.save(output_path)
        else:
            plt.savefig(output_path, dpi=export_dpi)
        logger.info('Exported figure to {}'.format(output_path))
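A hypothetical call to visualise_k_hop_graph above; the checkpoint directory and output path are placeholders, and the directory is assumed to contain the proj_weights.npy and vocab.txt files the function expects.

visualise_k_hop_graph('network',
                      checkpoint='output/word2vec/checkpoint',
                      k=2,
                      output_path='figures/network_k2_hop.png')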
Example #23
def draw_k_hop_graph(embeddings: WordEmbeddings,
                     target_word: str,
                     k: int,
                     alpha: Optional[float] = 0.50,
                     min_node_size: Optional[float] = 20,
                     max_node_size: Optional[float] = 120,
                     min_font_size: Optional[float] = 6,
                     max_font_size: Optional[float] = 24,
                     node_alpha: Optional[float] = 1,
                     edge_alpha: Optional[float] = 0.05,
                     target_word_label_colour: Optional[str] = 'black',
                     community_colour_map: Optional[str] = 'plasma') -> None:
    """Draw the k-hop graph for the given word embeddings and interest word.
    This function DOES NOT show the matplotlib plot.

    Args:
        embeddings: The word embeddings to generate the graph for.
        target_word: The word of interest.
        k: The number of 'hops' between the word of interest and every node
            in the graph. The resultant graph has the property that the word
            of interest is reachable from any node in at most k edges.
        alpha: The similarity threshold. Words that have a cosine similarity
            of at least this threshold are kept, and the rest are discarded.
        min_node_size: The minimum size of a node, in pixels.
        max_node_size: The maximum size of a node, in pixels.
        min_font_size: The minimum size of a label, in pixels.
        max_font_size: The maximum size of a label, in pixels.
        node_alpha: The alpha/transparency to draw nodes with.
        edge_alpha: The alpha/transparency to draw edges with.
        target_word_label_colour: The colour of the target word label.
            Makes the target word stand out. Useless when there are many words.
        community_colour_map: The colour map to use when assigning colours to communities.
    """
    if alpha is None:
        _, similarity = embeddings.most_similar(target_word, k=1)[0]
        alpha = similarity - 0.05
        logger.info(
            'No alpha threshold provided. Using alpha = {}'.format(alpha))

    graph = build_k_hop_graph(embeddings, target_word, k, alpha=alpha)

    logger.info('Computing best partition (Louvain community detection)')
    # compute the best partition
    partition = community_louvain.best_partition(graph)

    logger.info('Computing layout (ForceAtlas2)')
    forceatlas2 = ForceAtlas2(outboundAttractionDistribution=True,
                              edgeWeightInfluence=1.0,
                              jitterTolerance=1.0,
                              barnesHutOptimize=True,
                              barnesHutTheta=1.2,
                              scalingRatio=2.0,
                              strongGravityMode=False,
                              gravity=1.0,
                              verbose=False)

    positions = forceatlas2.forceatlas2_networkx_layout(graph)

    logger.info('Rendering graph with matplotlib')
    cmap = cm.get_cmap(community_colour_map, max(partition.values()) + 1)

    degrees = dict(graph.degree)
    max_degree = max(degrees.values())
    size_multipliers = {i: degrees[i] / max_degree for i in positions}

    # Generate node sizes
    node_size = [
        max(max_node_size * size_multipliers[i], min_node_size)
        for i in positions
    ]

    # Draw the nodes
    nx.draw_networkx_nodes(graph,
                           positions,
                           partition.keys(),
                           node_size=node_size,
                           cmap=cmap,
                           node_color=list(partition.values()),
                           alpha=node_alpha)

    # Draw the edges with a bezier curve
    curves = curved_edges(graph, positions)
    # Remove nan values
    curves = np.nan_to_num(curves)

    # Assign a colour to each edge, based on the community of the source node.
    edge_color = [cmap(partition[a]) for a, _ in graph.edges]
    edge_lines = LineCollection(curves,
                                color=edge_color,
                                cmap=cmap,
                                alpha=edge_alpha,
                                linewidths=1)
    plt.gca().add_collection(edge_lines)

    # Draw node labels (words)
    for i, (x, y) in positions.items():
        # The size of the label is proportional to the degree of the node.
        fontsize = max(max_font_size * size_multipliers[i]**4, min_font_size)
        word = embeddings.words[i]
        colour = target_word_label_colour if word == target_word else 'black'
        plt.text(x,
                 y,
                 word,
                 fontsize=fontsize,
                 ha='center',
                 va='center',
                 color=colour)
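
A minimal usage sketch for the example above, assuming the same WordEmbeddings(weights_filepath, vocab_filepath, name_metadata=...) constructor that the export helper calls; the file paths and the target word below are hypothetical placeholders:

from pathlib import Path

import matplotlib.pyplot as plt

# Hypothetical embedding files; substitute your own.
weights_filepath = Path('models/example_run/weights.npy')
vocab_filepath = Path('models/example_run/vocab.txt')

embeddings = WordEmbeddings(weights_filepath,
                            vocab_filepath,
                            name_metadata=weights_filepath.parent.stem)

plt.figure(figsize=(12, 12), dpi=96)
# 'network' is an arbitrary example word of interest. Passing alpha=None lets
# draw_k_hop_graph pick a similarity threshold automatically (the target word's
# top similarity minus 0.05, as in the function body above).
draw_k_hop_graph(embeddings, 'network', k=2, alpha=None)
plt.axis('off')
plt.show()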
parser.add_argument('--save',  # '--save' inferred from args.save below; the snippet begins mid-way through the parser setup
                    help='Weights file path to save the pretrained model.',
                    type=str,
                    required=True)
#print(parser.format_help())

args = parser.parse_args()
word_embeddings_file_path = args.word2vec
pretrained_weights_file_path = args.save
epochs = args.epochs
df = read_SEMEVAL_data(args.data)

# initialize objects
print('Initializing objects ...')
print('Initializing word embeddings ...')
t1 = time.time()
word_embeddings = WordEmbeddings(word_embeddings_file_path)
t2 = time.time()
print('\tTook %f seconds' % (t2 - t1))
print('Initializing tokenizer ...')
tokenizer = Tokenizer()
print('Initializing vectorizer ...')
vectorizer = Vectorizer(word_embeddings, tokenizer)

#### training dataset ####
# vectorizing
ids, train_a_vectors, train_b_vectors, train_gold = vectorizer.vectorize_df(df)
train_max_a_length = len(max(train_a_vectors, key=len))
train_max_b_length = len(max(train_b_vectors, key=len))
print('maximum number of tokens per sentence A in training set is %d' %
      train_max_a_length)
print('maximum number of tokens per sentence B in training set is %d' %
      train_max_b_length)
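
The maximum token counts printed above are typically used to zero-pad every vectorized sentence to a common length before training; the remainder of this example is cut off in the source. The following is only a sketch of such a padding step, assuming (this is not shown in the snippet) that vectorize_df returns one embedding vector per token:

import numpy as np

def pad_to_length(vectors, max_length):
    # Zero-pad each sentence (a sequence of per-token embedding vectors) to max_length tokens.
    dim = len(vectors[0][0])  # assumes the first sentence is non-empty
    padded = np.zeros((len(vectors), max_length, dim), dtype=np.float32)
    for i, sentence in enumerate(vectors):
        if len(sentence) > 0:
            padded[i, :len(sentence), :] = sentence
    return padded

train_a_padded = pad_to_length(train_a_vectors, train_max_a_length)
train_b_padded = pad_to_length(train_b_vectors, train_max_b_length)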
Example #25
0
def main():
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances, hold_out_stances_small = get_stances_for_folds2(
        d, folds, hold_out)
    embeddings = WordEmbeddings()
    print('Created data set and word embeddings')

    # create classifier
    model = Classifier()
    print('Set up model')

    # get word vector data
    x_articles = {}
    x_headlines = {}
    y_vals = {}
    lengths_a = {}
    lengths_h = {}

    x_global = {}
    y_global = {}

    for fold in fold_stances:
        x_headlines[fold], x_articles[fold], y_vals[fold], lengths_h[
            fold], lengths_a[fold] = get_articles_word_vectors(
                fold_stances[fold], d, embeddings)
        x_global[fold], y_global[fold] = generate_features(
            fold_stances[fold], d, str(fold))

    test_x_headlines, test_x_articles, test_y, test_h_lengths, test_a_lengths = get_articles_word_vectors(
        hold_out_stances, d, embeddings)
    test_x_global, test_y_global = generate_features(hold_out_stances, d,
                                                     'holdout')
    print("hold_out_stances: " + str(len(hold_out_stances)))
    print("test_x_global: " + str(len(test_x_global)))
    print("test_y_global: " + str(len(test_y_global)))

    test_x_headlines_small, test_x_articles_small, test_y_small, test_h_lengths_small, test_a_lengths_small = get_articles_word_vectors(
        hold_out_stances_small, d, embeddings)
    test_x_global_small, test_y_global_small = generate_features(
        hold_out_stances_small, d, 'holdout_small')
    print("test_x_global_small: " + str(len(test_x_global_small)))
    print("test_y_global_small: " + str(len(test_y_global_small)))
    print('Finished separating folds')

    # train LSTM (fold -> epoch -> batch)
    model.session.run(tf.global_variables_initializer())

    for fold in fold_stances:
        ids = list(range(len(folds)))
        del ids[fold]
        x_train_articles = np.vstack(tuple([x_articles[i] for i in ids]))
        x_train_headlines = np.vstack(tuple([x_headlines[i] for i in ids]))
        y_train = np.vstack(tuple([y_vals[i] for i in ids]))
        lengths_h_train = np.vstack(tuple([lengths_h[i] for i in ids]))
        lengths_a_train = np.vstack(tuple([lengths_a[i] for i in ids]))
        global_train = np.vstack(tuple([x_global[i] for i in ids]))
        # print('train articles shape = ' + str(x_train_articles.shape))
        # print('train headlines shape = ' + str(x_train_headlines.shape))
        # print('y train shape = ' + str(y_train.shape))

        x_valid_articles = x_articles[fold]
        x_valid_headlines = x_headlines[fold]
        y_valid = y_vals[fold]
        length_h_valid = lengths_h[fold]
        length_a_valid = lengths_a[fold]
        global_valid = x_global[fold]

        # SVM 1 : distinguishing unrelated from related
        X_train = global_train
        y_train_round1 = []
        for item in y_train:
            # if unrelated
            if item[3] == 1:
                y_train_round1.append(0)
            else:
                y_train_round1.append(1)
        y_train_round1 = np.array(y_train_round1)

        X_valid = global_valid
        y_valid_round1 = []
        for item in y_valid:
            if item[3] == 1:
                y_valid_round1.append(0)
            else:
                y_valid_round1.append(1)
        y_valid_round1 = np.array(y_valid_round1)

        clf1 = svm.SVC()
        # NOTE: Train the SVM on the (smaller) validation split so that the
        # larger training split stays available for the LSTM.
        clf1.fit(X_valid, y_valid_round1)
        print("X_valid, y_valid_round1 shape: " + str(X_valid.shape) + ", " +
              str(y_valid_round1.shape))

        round1_pred = clf1.predict(X_train)
        round1_score = 0
        for i in range(len(round1_pred)):
            if round1_pred[i] == y_train_round1[i]:
                round1_score += 1
        round1_score = 1.0 * round1_score / len(round1_pred)
        print('round 1 score: ' + str(round1_score))

        # REFORMAT FOR BILSTM
        # Reformat y_train so it only has related labels (now only 3 labels)
        y_train_round2 = []
        for index, label in enumerate(round1_pred):
            if label == 1:
                y_train_round2.append(y_train[index][:-1])
            # If unrelated, append as all 0's
            else:
                y_train_round2.append([0, 0, 0])
        # Reformat y_valid so it only has related labels
        y_valid_round2 = []
        for label in y_valid:
            if label[3] != 1:
                y_valid_round2.append(label[:-1])
            # If unrelated, append as all 0's
            else:
                y_valid_round2.append([0, 0, 0])

        # TRAINING
        fold_error = 0
        print('Training fold ' + str(fold))
        j = 0
        # Train for 5 epochs per fold.
        for epoch in range(5):

            # Training batches
            article_batches_train, headline_batches_train, output_batches_train, length_h_batches_train, length_a_batches_train, global_batches_train = create_batches(
                x_train_articles, x_train_headlines, y_train_round2,
                lengths_h_train, lengths_a_train, global_train)

            for i in range(len(article_batches_train)):
                # Training error
                epoch_error = model.session.run(
                    [model.error, model.train_fn], {
                        model.inputs_articles: article_batches_train[i],
                        model.inputs_headlines: headline_batches_train[i],
                        model.outputs: output_batches_train[i],
                        model.h_lengths: length_h_batches_train[i],
                        model.a_lengths: length_a_batches_train[i],
                        model.global_feats: global_batches_train[i]
                    })[0]
                print('\tBatch ' + str(j) + ' error = ' + str(epoch_error))

                fold_error += epoch_error
                j += 1

        print('Training error (fold) = ' + str(fold_error / j) + '\n')

        # Validation batches
        article_batches_valid, headline_batches_valid, output_batches_valid, length_h_batches_valid, length_a_batches_valid, global_batches_valid = create_batches(
            x_valid_articles, x_valid_headlines, y_valid_round2,
            length_h_valid, length_a_valid, global_valid)

        all_pred_y_stances = []
        for i in range(len(article_batches_valid)):
            # cross-validation error
            pred_y_stances = model.session.run(
                [model.pred_stance], {
                    model.inputs_articles: article_batches_valid[i],
                    model.inputs_headlines: headline_batches_valid[i],
                    model.outputs: output_batches_valid[i],
                    model.h_lengths: length_h_batches_valid[i],
                    model.a_lengths: length_a_batches_valid[i],
                    model.global_feats: global_batches_valid[i]
                })
            all_pred_y_stances = np.append(all_pred_y_stances, pred_y_stances)

        # Merge related and unrelated labels together for final prediction
        final_pred = []
        all_pred_count = 0
        for label in y_valid_round1:
            # If unrelated
            if label == 0:
                final_pred.append(3)
            else:
                final_pred.append(all_pred_y_stances[all_pred_count])
                all_pred_count += 1

        simple_y = np.array([array.tolist().index(1) for array in y_valid])
        '''
		f1_score = metrics.f1_score(simple_y, pred_y_stances, average='macro')
		print("F1 MEAN score: " + str(f1_score))
		f1_score_labels =  metrics.f1_score(simple_y, pred_y_stances, labels=[0, 1, 2, 3], average=None)
		print("F1 LABEL scores: " + str(f1_score_labels))
		'''
        # Convert to string labels for FNC scoring metric
        label_map = {0: "agree", 1: "disagree", 2: "discuss", 3: "unrelated"}
        simple_y_str = [label_map[label] for label in simple_y]
        pred_y_stances_str = [label_map[label] for label in final_pred]
        report_score(simple_y_str, pred_y_stances_str)

    # assess performance on test set
    print('\n#### RUNNING ON HOLDOUT SET ####')

    # Reformat yvals for round1
    y_test_round1 = []
    for item in test_y:
        if item[3] == 1:
            y_test_round1.append(0)
        else:
            y_test_round1.append(1)
    y_test_round1 = np.array(y_test_round1)

    y_test_small_round1 = []
    for item in test_y_small:
        if item[3] == 1:
            y_test_small_round1.append(0)
        else:
            y_test_small_round1.append(1)
    y_test_small_round1 = np.array(y_test_small_round1)

    # ROUND 1 TESTING
    clf2 = svm.SVC()
    # NOTE: Fit the SVM on the smaller holdout subset; predictions are made
    # on the full holdout set below.
    clf2.fit(test_x_global_small, test_y_global_small)

    round1_pred = clf2.predict(test_x_global)
    round1_score = 0
    for i in range(len(round1_pred)):
        if round1_pred[i] == y_test_round1[i]:
            round1_score += 1
    round1_score = 1.0 * round1_score / len(round1_pred)
    print('round 1 score: ' + str(round1_score))

    # REFORMAT FOR BILSTM
    y_test_round2 = []
    for index, label in enumerate(round1_pred):
        if label == 1:
            y_test_round2.append(test_y_global[index][:-1])
        # If unrelated, append as all 0's
        else:
            y_test_round2.append([0, 0, 0])

    # Test batches
    article_batches_test, headline_batches_test, output_batches_test, length_h_batches_test, length_a_batches_test, global_batches_test = create_batches(
        test_x_articles, test_x_headlines, y_test_round2, test_h_lengths,
        test_a_lengths, test_x_global)

    all_pred_y_test = []
    for i in range(len(article_batches_test)):
        pred_y_stances = model.session.run(
            [model.pred_stance], {
                model.inputs_articles: article_batches_test[i],
                model.inputs_headlines: headline_batches_test[i],
                model.outputs: output_batches_test[i],
                model.h_lengths: length_h_batches_test[i],
                model.a_lengths: length_a_batches_test[i],
                model.global_feats: global_batches_test[i]
            })
        all_pred_y_test = np.append(all_pred_y_test, pred_y_stances)

    # Merge related and unrelated labels together for final prediction
    final_pred = []
    all_pred_count = 0
    for label in y_test_round1:
        # If unrelated
        if label == 0:
            final_pred.append(3)
        else:
            final_pred.append(all_pred_y_test[all_pred_count])
            all_pred_count += 1

    simple_y = np.array([array.tolist().index(1) for array in test_y])
    f1_score = metrics.f1_score(simple_y, final_pred, average='macro')
    print("F1 MEAN score: " + str(f1_score))
    f1_score_labels = metrics.f1_score(simple_y,
                                       final_pred,
                                       labels=[0, 1, 2, 3],
                                       average=None)
    print("F1 LABEL scores: " + str(f1_score_labels))

    # Convert to string labels for FNC scoring metric
    label_map = {0: "agree", 1: "disagree", 2: "discuss", 3: "unrelated"}
    simple_y_str = [label_map[label] for label in simple_y]
    pred_y_stances_str = [label_map[label] for label in final_pred]
    report_score(simple_y_str, pred_y_stances_str)
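
Both the per-fold validation pass and the holdout pass above finish with the same merge step: items judged unrelated in round 1 receive label 3, and the remaining items take the next round-2 stance prediction in order. A minimal, self-contained sketch of that merge (the helper name, and the assumption that the stance predictions are ordered one-per-related-item, are illustrative rather than taken from the original code):

def merge_two_stage_predictions(round1_pred, stance_pred):
    # round1_pred: 0/1 labels from the related/unrelated SVM (0 = unrelated).
    # stance_pred: stance labels (0 = agree, 1 = disagree, 2 = discuss) for the
    #              items judged related, in order.
    # Returns 4-way FNC labels, where 3 = unrelated.
    merged = []
    stances = iter(stance_pred)
    for related in round1_pred:
        if related == 0:
            merged.append(3)
        else:
            merged.append(int(next(stances)))
    return merged

# Example: two related items (agree, discuss) with an unrelated item in between.
# merge_two_stage_predictions([1, 0, 1], [0, 2]) -> [0, 3, 2]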