Example #1
def load_paths(corpus_prefix, dataset_keys, lemma_index):
    """
    Override load_paths from lstm_common to include (x, y) vectors
    :param corpus_prefix:
    :param dataset_keys:
    :return:
    """

    # Define the dictionaries
    pos_index = defaultdict(count(0).next)
    dep_index = defaultdict(count(0).next)
    dir_index = defaultdict(count(0).next)

    dummy = pos_index['#UNKNOWN#']
    dummy = dep_index['#UNKNOWN#']
    dummy = dir_index['#UNKNOWN#']

    # Load the resource (processed corpus)
    print 'Loading the corpus...'
    corpus = KnowledgeResource(corpus_prefix)
    print 'Done!'

    keys = [(corpus.get_id_by_term(str(x)), corpus.get_id_by_term(str(y))) for (x, y) in dataset_keys]
    paths_x_to_y = [{ vectorize_path(path, lemma_index, pos_index, dep_index, dir_index) : count
                      for path, count in get_paths(corpus, x_id, y_id).iteritems() }
                    for (x_id, y_id) in keys]
    paths_x_to_y = [ { p : c for p, c in paths_x_to_y[i].iteritems() if p is not None } for i in range(len(keys)) ]

    paths = paths_x_to_y

    empty = [dataset_keys[i] for i, path_list in enumerate(paths) if len(path_list.keys()) == 0]
    print 'Pairs without paths:', len(empty), ', total pairs in the dataset:', len(dataset_keys)

    pos_inverted_index = { i : p for p, i in pos_index.iteritems() }
    dep_inverted_index = { i : p for p, i in dep_index.iteritems() }
    dir_inverted_index = { i : p for p, i in dir_index.iteritems() }

    return paths, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, dir_inverted_index
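
A note on the index dictionaries built at the top of load_paths: defaultdict(count(0).next) hands out a fresh integer ID the first time a key is looked up, which is why '#UNKNOWN#' is touched once up front to reserve ID 0. A minimal sketch of the idiom (Python 2, matching the examples here; the tag names are illustrative only):

from collections import defaultdict
from itertools import count

# Every key gets the next integer ID on its first lookup.
pos_index = defaultdict(count(0).next)

dummy = pos_index['#UNKNOWN#']   # reserves ID 0 for unknown values
print pos_index['NOUN']          # 1 (first real tag seen)
print pos_index['VERB']          # 2
print pos_index['NOUN']          # still 1, IDs are stable

# Inverting the map recovers the string for an ID, as load_paths does above.
pos_inverted_index = {i: p for p, i in pos_index.iteritems()}
print pos_inverted_index[2]      # VERB
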
Example #2
def predict():

    # The LSTM-based integrated pattern-based and distributional method for multiclass semantic relations classification
    corpus_prefix = CORPUS
    dataset_prefix = DATA_MODEL
    model_file_prefix = MODEL_OUTPUT

    # Load the relations
    with codecs.open(dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
        relation_index = {relation: i for i, relation in enumerate(relations)}
    print relation_index

    # Load the datasets
    print 'Loading the dataset...'
    with codecs.open(DATA_PREDICT, 'r', 'utf-8') as f_in:
        dataset = [tuple(line.strip().split('\t')) for line in f_in]
        dataset = list(set(dataset))

    # Load the resource (processed corpus)
    print 'Loading the corpus...'
    corpus = KnowledgeResource(corpus_prefix)
    print 'Done!'

    # Load the pre-trained model file
    classifier, word_index, pos_index, dep_index, dir_index = load_model(
        model_file_prefix)

    # Load the paths and create the feature vectors
    print 'Loading path files...'
    x_y_vectors_test, X_test = load_paths_and_word_vectors(
        corpus, dataset, word_index, pos_index, dep_index, dir_index)

    lemma_inverted_index = {i: p for p, i in word_index.iteritems()}
    pos_inverted_index = {i: p for p, i in pos_index.iteritems()}
    dep_inverted_index = {i: p for p, i in dep_index.iteritems()}
    dir_inverted_index = {i: p for p, i in dir_index.iteritems()}

    pred = classifier.predict(X_test, x_y_vectors=x_y_vectors_test)
    # write out prediction results
    df = pd.read_csv(DATA_PREDICT, sep='\t', header=None, index_col=None)
    df['predict'] = pred
    df.to_csv(DATA_PREDICT, sep='\t', header=False, index=False)
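
predict() writes the raw output of classifier.predict back into the DATA_PREDICT file. Assuming, as the other examples in this listing suggest, that those predictions are indices into the relations list, mapping them back to label names is a one-liner. A tiny self-contained sketch (the relation names and predictions below are toy stand-ins, not values from the model):

# Toy stand-ins for the values produced inside predict() above.
relations = ['hypernym', 'meronym', 'random']   # as read from relations.txt
pred = [2, 0, 0, 1]                             # assumed: class indices

predicted_labels = [relations[p] for p in pred]
print predicted_labels   # ['random', 'hypernym', 'hypernym', 'meronym']
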
Example #3
def main():
    """
    Load a pre-trained model of the LSTM-based integrated path-based and distributional method
    for hypernymy detection, and test it on the test set
    :return:
    """
    corpus_prefix = sys.argv[3]
    dataset_prefix = sys.argv[4]
    model_file_prefix = sys.argv[5]

    # Load the datasets
    print 'Loading the dataset...'
    test_set = load_dataset(dataset_prefix + 'test.tsv')
    y_test = [1 if 'True' in label else 0 for label in test_set.values()]

    # Load the resource (processed corpus)
    print 'Loading the corpus...'
    corpus = KnowledgeResource(corpus_prefix)
    print 'Done!'

    # Load the model
    classifier, lemma_index, pos_index, dep_index, dir_index = load_model(
        model_file_prefix)

    # Load the paths and create the feature vectors
    print 'Loading path files...'
    x_y_vectors_test, X_test = load_paths(corpus, test_set.keys(), lemma_index,
                                          pos_index, dep_index, dir_index)

    print 'Evaluation:'
    pred = classifier.predict(X_test, x_y_vectors=x_y_vectors_test)
    p, r, f1, support = precision_recall_fscore_support(y_test,
                                                        pred,
                                                        average='binary')
    print 'Precision: %.3f, Recall: %.3f, F1: %.3f' % (p, r, f1)

    # Write the predictions to a file
    relations = ['False', 'True']
    output_predictions(model_file_prefix + '.test_predictions', relations,
                       pred, test_set.keys(), y_test)
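
The binary evaluation above relies on scikit-learn's precision_recall_fscore_support with average='binary', which returns scalar precision, recall and F1 for the positive class (support comes back as None in that mode). A small standalone check of the call, with made-up gold and predicted labels:

from sklearn.metrics import precision_recall_fscore_support

y_true = [1, 0, 1, 1]   # made-up gold labels
y_pred = [1, 0, 0, 1]   # made-up predictions: 2 true positives, 1 false negative

p, r, f1, support = precision_recall_fscore_support(y_true, y_pred, average='binary')
print 'Precision: %.3f, Recall: %.3f, F1: %.3f' % (p, r, f1)
# Precision: 1.000, Recall: 0.667, F1: 0.800
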
Example #4
def main():

    # The LSTM-based path-based method for multiclass semantic relations classification
    corpus_prefix = sys.argv[5]
    dataset_prefix = sys.argv[6]
    model_file_prefix = sys.argv[7]

    # Load the relations
    with codecs.open(dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
        relation_index = {relation: i for i, relation in enumerate(relations)}

    # Load the datasets
    print 'Loading the dataset...'
    test_set = load_dataset(dataset_prefix + '/test.tsv', relations)
    y_test = [relation_index[label] for label in test_set.values()]

    # Load the resource (processed corpus)
    print 'Loading the corpus...'
    corpus = KnowledgeResource(corpus_prefix)
    print 'Done!'

    # Load the pre-trained model file
    classifier, word_index, pos_index, dep_index, dir_index = load_model(
        model_file_prefix)

    # Load the paths and create the feature vectors
    print 'Loading path files...'
    X_test = load_paths(corpus, test_set.keys(), word_index, pos_index,
                        dep_index, dir_index)
    print 'Number of words: %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' % \
          (len(word_index), len(pos_index), len(dep_index), len(dir_index))

    print 'Evaluation:'
    pred = classifier.predict(X_test)
    precision, recall, f1, support = evaluate(y_test,
                                              pred,
                                              relations,
                                              do_full_reoprt=True)
    print 'Precision: %.3f, Recall: %.3f, F1: %.3f' % (precision, recall, f1)
Example #5
def main():

    # The LSTM-based integrated pattern-based and distributional method for multiclass semantic relations classification
    corpus_prefix = sys.argv[5]
    dataset_prefix = sys.argv[6]
    model_prefix_file = sys.argv[7]
    embeddings_file = sys.argv[8]
    num_hidden_layers = int(sys.argv[9])

    np.random.seed(133)

    # Load the relations
    with codecs.open(dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
        relation_index = {relation: i for i, relation in enumerate(relations)}

    # Load the datasets
    print 'Loading the dataset...'
    train_set = load_dataset(dataset_prefix + '/train.tsv', relations)
    print "Len of train set: " + str(len(train_set))
    val_set = load_dataset(dataset_prefix + '/val.tsv', relations)
    print "Len of val set: " + str(len(val_set))
    test_set = load_dataset(dataset_prefix + '/test.tsv', relations)
    print "Len of test set: " + str(len(test_set))
    y_train = [relation_index[label] for label in train_set.values()]
    y_val = [relation_index[label] for label in val_set.values()]
    y_test = [relation_index[label] for label in test_set.values()]
    dataset_keys = train_set.keys() + val_set.keys() + test_set.keys()
    print 'Done!'

    # Load the resource (processed corpus)
    print 'Loading the corpus...'
    corpus = KnowledgeResource(corpus_prefix)
    print 'Done!'

    # Get the vocabulary
    vocabulary = get_vocabulary(corpus, dataset_keys)

    # Load the word embeddings
    print 'Initializing word embeddings...'
    word_vectors, word_index = load_embeddings(embeddings_file, vocabulary)
    word_inverted_index = {i: w for w, i in word_index.iteritems()}

    # Load the paths and create the feature vectors
    print 'Loading path files...'
    x_y_vectors, dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, \
    dir_inverted_index = load_paths_and_word_vectors(corpus, dataset_keys, word_index)
    print 'Number of words: %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' % \
          (len(word_index), len(pos_index), len(dep_index), len(dir_index))

    X_train = dataset_instances[:len(train_set)]
    X_val = dataset_instances[len(train_set):len(train_set) + len(val_set)]
    X_test = dataset_instances[len(train_set) + len(val_set):]

    x_y_vectors_train = x_y_vectors[:len(train_set)]
    x_y_vectors_val = x_y_vectors[len(train_set):len(train_set) + len(val_set)]
    x_y_vectors_test = x_y_vectors[len(train_set) + len(val_set):]

    # Tune the hyper-parameters using the validation set
    alphas = [0.001]
    word_dropout_rates = [0.0, 0.2, 0.4]
    f1_results = []
    models = []
    descriptions = []

    for alpha in alphas:
        for word_dropout_rate in word_dropout_rates:

            # Create the classifier
            classifier = PathLSTMClassifier(
                num_lemmas=len(word_index),
                num_pos=len(pos_index),
                num_dep=len(dep_index),
                num_directions=len(dir_index),
                num_negation_markers=3,
                n_epochs=5,
                num_relations=len(relations),
                lemma_embeddings=word_vectors,
                dropout=word_dropout_rate,
                alpha=alpha,
                use_xy_embeddings=True,
                num_hidden_layers=num_hidden_layers)

            print 'Training with learning rate = %f, dropout = %f...' % (
                alpha, word_dropout_rate)
            classifier.fit(X_train, y_train, x_y_vectors=x_y_vectors_train)

            pred = classifier.predict(X_val, x_y_vectors=x_y_vectors_val)
            precision, recall, f1, support = evaluate(y_val,
                                                      pred,
                                                      relations,
                                                      do_full_reoprt=False)
            print 'Learning rate = %f, dropout = %f, Precision: %.3f, Recall: %.3f, F1: %.3f' % \
                  (alpha, word_dropout_rate, precision, recall, f1)
            f1_results.append(f1)
            models.append(classifier)

            # Save intermediate models
            classifier.save_model(
                model_prefix_file + '.' + str(word_dropout_rate),
                [word_index, pos_index, dep_index, dir_index])
            descriptions.append('Learning rate = %f, dropout = %f' %
                                (alpha, word_dropout_rate))

    best_index = np.argmax(f1_results)
    classifier = models[best_index]
    description = descriptions[best_index]
    print 'Best hyper-parameters: ' + description

    # Save the best model to a file
    print 'Saving the model...'
    classifier.save_model(model_prefix_file,
                          [word_index, pos_index, dep_index, dir_index])

    # Evaluate on the test set
    print 'Evaluation:'
    pred = classifier.predict(X_test, x_y_vectors=x_y_vectors_test)
    precision, recall, f1, support = evaluate(y_test,
                                              pred,
                                              relations,
                                              do_full_reoprt=True)
    print 'Precision: %.3f, Recall: %.3f, F1: %.3f' % (precision, recall, f1)

    # Write the predictions to a file
    output_predictions(model_prefix_file + '.predictions', relations, pred,
                       test_set.keys(), y_test)

    # Retrieve k-best scoring paths for each class
    all_paths = unique(
        [path for path_list in dataset_instances for path in path_list])
    top_k = classifier.get_top_k_paths(all_paths, relation_index, 0.7)

    for i, relation in enumerate(relations):
        with codecs.open(model_prefix_file + '.paths.' + relation, 'w',
                         'utf-8') as f_out:
            for path, score in top_k[i]:
                path_str = '_'.join([
                    reconstruct_edge(edge, word_inverted_index,
                                     pos_inverted_index, dep_inverted_index,
                                     dir_inverted_index) for edge in path
                ])
                print >> f_out, '\t'.join([path_str, str(score)])
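
The loop above turns each vectorized path edge back into a readable string with reconstruct_edge and the four inverted indexes. The real edge encoding is defined in the project's helpers; the rough idea, sketched with toy indexes and a hypothetical (lemma, POS, dependency, direction) tuple per edge, looks like this:

# Toy inverted indexes; the real ones are built from the corpus as above.
word_inverted_index = {0: 'X', 1: 'be', 2: 'Y'}
pos_inverted_index = {0: 'NOUN', 1: 'VERB'}
dep_inverted_index = {0: 'nsubj', 1: 'ROOT', 2: 'attr'}
dir_inverted_index = {0: '>', 1: '-', 2: '<'}

def toy_reconstruct_edge(edge, lemmas, poses, deps, dirs):
    # Hypothetical edge layout: (lemma_id, pos_id, dep_id, dir_id).
    lemma, pos, dep, direction = edge
    return '/'.join([lemmas[lemma], poses[pos], deps[dep], dirs[direction]])

path = [(0, 0, 0, 0), (1, 1, 1, 1), (2, 0, 2, 2)]
path_str = '_'.join([toy_reconstruct_edge(edge, word_inverted_index, pos_inverted_index,
                                          dep_inverted_index, dir_inverted_index)
                     for edge in path])
print path_str   # X/NOUN/nsubj/>_be/VERB/ROOT/-_Y/NOUN/attr/<
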
Example #6
def main():
    np.random.seed(133)
    # Fixing the seed makes the results repeatable.

    # Load the relations
    with codecs.open(args.dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
        relation_index = { relation : i for i, relation in enumerate(relations) }
        print('relation_index :')
        pprint.pprint(relation_index)

    # Load the datasets
    print 'Loading the dataset...'
    train_set = load_dataset(args.dataset_prefix + '/train.tsv', relations)
    val_set = load_dataset(args.dataset_prefix + '/val.tsv', relations)
    test_set = load_dataset(args.dataset_prefix + '/test.tsv', relations)

    print("test_set ", test_set)
    print('\n')

    y_train = [relation_index[label] for label in train_set.values()]
    y_val = [relation_index[label] for label in val_set.values()]
    y_test = [relation_index[label] for label in test_set.values()]

    print("y_test ", y_test)
    print('\n')

    print("test_set.keys() ", test_set.keys())
    dataset_keys = train_set.keys() + val_set.keys() + test_set.keys()

    print 'Done!'
    print('\n')

    # Load the resource (processed corpus)
    print 'Loading the corpus...'
    corpus = KnowledgeResource(args.corpus_prefix)
    print("corpus", corpus)
    print 'Done!'

    # Get the vocabulary
    vocabulary = get_vocabulary(corpus, dataset_keys)

    # Load the word embeddings
    print 'Initializing word embeddings...'
    word_vectors, word_index = load_embeddings(args.embeddings_file, vocabulary)
    # print('word_vectors', word_vectors)
    # print('word_index', word_index)
    word_inverted_index = { i : w for w, i in word_index.iteritems() }

    # Load the paths and create the feature vectors
    print 'Loading path files...'
    x_y_vectors, dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, \
    dir_inverted_index = load_paths_and_word_vectors(corpus, dataset_keys, word_index)

    print 'Number of words: %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' % \
          (len(word_index), len(pos_index), len(dep_index), len(dir_index))

    X_train = dataset_instances[:len(train_set)]
    X_val = dataset_instances[len(train_set):len(train_set)+len(val_set)]
    X_test = dataset_instances[len(train_set)+len(val_set):]
    print('X_test', X_test)

    x_y_vectors_train = x_y_vectors[:len(train_set)]
    x_y_vectors_val = x_y_vectors[len(train_set):len(train_set)+len(val_set)]
    x_y_vectors_test = x_y_vectors[len(train_set)+len(val_set):]
    print('x_y_vectors_test', x_y_vectors_test)

    # Tune the hyper-parameters using the validation set
    alphas = [0.001]
    word_dropout_rates = [0.0] # [0.0, 0.2, 0.4]
    f1_results = []
    models = []
    descriptions = []

    for alpha in alphas:
        for word_dropout_rate in word_dropout_rates:

            # Create the classifier
            classifier = PathLSTMClassifierKeras(num_lemmas=len(word_index), num_pos=len(pos_index),
                                            num_dep=len(dep_index), num_directions=len(dir_index),
                                            n_epochs=args.num_epochs,
                                            num_relations=len(relations), lemma_embeddings=word_vectors,
                                            dropout=word_dropout_rate, alpha=alpha, use_xy_embeddings=True,
                                            num_hidden_layers=args.num_hidden_layers)

            print 'Training with learning rate = %f, dropout = %f...' % (alpha, word_dropout_rate)
            classifier.fit(X_train, y_train, x_y_vectors=x_y_vectors_train)

            pred = classifier.predict(X_val, x_y_vectors=x_y_vectors_val)
            precision, recall, f1, support = evaluate(y_val, pred, relations, do_full_reoprt=False)
            print 'Learning rate = %f, dropout = %f, Precision: %.3f, Recall: %.3f, F1: %.3f' % \
                  (alpha, word_dropout_rate, precision, recall, f1)
            f1_results.append(f1)
            models.append(classifier)

            # Save intermediate models
            classifier.save_model(args.model_prefix_file + '.' + str(word_dropout_rate),
                                  [word_index, pos_index, dep_index, dir_index])
            descriptions.append('Learning rate = %f, dropout = %f' % (alpha, word_dropout_rate))

    best_index = np.argmax(f1_results)
    classifier = models[best_index]
    description = descriptions[best_index]
    print 'Best hyper-parameters: ' + description

    # Save the best model to a file
    print 'Saving the model...'
    classifier.save_model(args.model_prefix_file, [word_index, pos_index, dep_index, dir_index])

    # Evaluate on the test set
    print 'Evaluation:'
    pred = classifier.predict(X_test, x_y_vectors=x_y_vectors_test)
    precision, recall, f1, support = evaluate(y_test, pred, relations, do_full_reoprt=True)
    print 'Precision: %.3f, Recall: %.3f, F1: %.3f' % (precision, recall, f1)

    # Write the predictions to a file
    output_predictions(args.model_prefix_file + '.predictions', relations, pred, test_set.keys(), y_test)

    # Retrieve k-best scoring paths for each class
    all_paths = unique([path for path_list in dataset_instances for path in path_list])
    top_k = classifier.get_top_k_paths(all_paths, relation_index, 0.7)

    for i, relation in enumerate(relations):
        with codecs.open(args.model_prefix_file + '.paths.' + relation, 'w', 'utf-8') as f_out:
            for path, score in top_k[i]:
                path_str = '_'.join([reconstruct_edge(edge, word_inverted_index, pos_inverted_index,
                                                      dep_inverted_index, dir_inverted_index) for edge in path])
                print >> f_out, '\t'.join([path_str, str(score)])
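
Both training examples above slice dataset_instances and x_y_vectors back into train/val/test purely by position. That only works because dataset_keys was built by concatenating the three key lists in that same order and the feature loader returns one instance per key. A tiny sketch of that invariant with toy keys:

# Toy stand-ins; the real keys are (x, y) term-pairs from the .tsv files.
train_keys = [('cat', 'animal'), ('paris', 'france')]
val_keys = [('wheel', 'car')]
test_keys = [('oak', 'tree'), ('rome', 'italy')]

dataset_keys = train_keys + val_keys + test_keys
# Pretend feature extraction: one instance per key, in the same order.
dataset_instances = ['instance_%d' % i for i in range(len(dataset_keys))]

X_train = dataset_instances[:len(train_keys)]
X_val = dataset_instances[len(train_keys):len(train_keys) + len(val_keys)]
X_test = dataset_instances[len(train_keys) + len(val_keys):]

assert len(X_train) == len(train_keys)
assert len(X_val) == len(val_keys)
assert len(X_test) == len(test_keys)
print X_test   # ['instance_3', 'instance_4']
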
Example #7
def main():
    """
    Applies bi-directional search in two phases to find the shortest paths
    between every term-pair in the dataset: the first phase finds the nodes
    along the shortest paths, while the second reconstructs the paths themselves.
    """

    # Get the arguments
    args = docopt("""Find the shortest paths between every term-pair in the dataset.

    Usage:
        search.py <dataset_path> <resource_matrix_path>
        <resource_entities_path> <resource_properties_path>
        <resource_l2r_path> <max_path_length>
        <allow_reversed_edges> <find_relevant_nodes>
        <relevant_nodes_file> <paths_out_file>

        <dataset_path> = the dataset file
        <resource_matrix_path> = the resource adjacency matrix file (.mm/.npz)
        <resource_entities_path> = the entity str-id map file
        <resource_properties_path> = the property str-id map file
        <resource_l2r_path> = the edges file
        <max_path_length> = the maximum path length
        <allow_reversed_edges> = whether reversed edges are allowed in this resource
        <find_relevant_nodes> = whether to find the relevant nodes (or use the results file)
        <relevant_nodes_file> = relevant nodes file (input / output)
        <paths_out_file> = the paths file (output)
    """)

    dataset_file = args['<dataset_path>']
    resource_mat_file = args['<resource_matrix_path>']
    entity_map_file = args['<resource_entities_path>']
    property_map_file = args['<resource_properties_path>']
    edges_file = args['<resource_l2r_path>']
    max_length = int(args['<max_path_length>'])
    allow_reversed_edges = args['<allow_reversed_edges>'][0].upper() == 'T'
    do_find_relevant_nodes = args['<find_relevant_nodes>'][0].upper() == 'T'
    relevant_nodes_file = args['<relevant_nodes_file>']
    paths_file = args['<paths_out_file>']

    initialize_logger()

    # Find relevant nodes  
    if do_find_relevant_nodes:

        # Load the resource
        resource = KnowledgeResource(resource_mat_file, entity_map_file,
                                     property_map_file, edges_file, allow_reversed_edges)
        adjacency_matrix = resource.adjacency_matrix
        term_to_id = resource.term_to_id

        # Load the dataset
        dataset = load_data_labels(dataset_file, adjacency_matrix)

        node_finder = RelevantNodesFinder(adjacency_matrix)
        relevant_nodes = find_relevant_nodes(dataset, max_length, relevant_nodes_file, term_to_id, node_finder)
    else:

        # Load the resource partially according to the relevant nodes
        relevant_nodes = load_relevant_nodes(relevant_nodes_file)

        resource = KnowledgeResource(resource_mat_file, entity_map_file,
                                     property_map_file, edges_file, allow_reversed_edges, get_all_nodes(relevant_nodes))
        adjacency_matrix = resource.adjacency_matrix
        term_to_id = resource.term_to_id

        # Load the dataset
        dataset = load_data_labels(dataset_file, adjacency_matrix)

    path_finder = PathFinder(resource)
    paths_output = open(paths_file, 'w')

    pair_num = 0

    # For each term-pair, find relevant nodes and then find paths
    for (x, y) in dataset.keys():

        pair_num = pair_num + 1

        x_id = -1
        if x in term_to_id:
            x_id = term_to_id[x]

        y_id = -1
        if y in term_to_id:
            y_id = term_to_id[y]

        # Limit the search space using the relevant nodes and find paths
        nodes = relevant_nodes[(x_id, y_id)]
        l2r_edges, r2l_edges = resource.get_nodes_edges(nodes)
        paths = path_finder.find_shortest_paths(x_id, y_id, max_length, l2r_edges, r2l_edges)
        paths_output.write('pair number ' + str(pair_num) + ': ' + x + '->' + y + '\n')
        for path in paths:
            paths_output.write(nice_print_path(path, resource.id_to_prop) + '\n')

    paths_output.close()
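
The docstring of this example describes the search in two phases: first restrict the graph to nodes that can lie on a short enough path between the two terms, then reconstruct the actual paths inside that restricted subgraph. The project does this with RelevantNodesFinder and PathFinder over a sparse adjacency matrix; a rough, self-contained sketch of the same idea on a plain dict-of-lists graph (toy graph, hypothetical helper names, no claim to match the project's implementation) could look like this:

from collections import deque

def bfs_levels(graph, source, max_depth):
    # Phase 1 helper: distance from 'source' to every node within max_depth.
    levels = {source: 0}
    queue = deque([source])
    while queue:
        node = queue.popleft()
        if levels[node] == max_depth:
            continue
        for neighbor in graph.get(node, []):
            if neighbor not in levels:
                levels[neighbor] = levels[node] + 1
                queue.append(neighbor)
    return levels

def reverse(graph):
    # Reverse the direction of every edge, for the backward search from y.
    reversed_graph = {}
    for node, neighbors in graph.iteritems():
        for neighbor in neighbors:
            reversed_graph.setdefault(neighbor, []).append(node)
    return reversed_graph

def find_paths(graph, x, y, max_length):
    # Phase 1: keep only nodes that can sit on a path of length <= max_length.
    from_x = bfs_levels(graph, x, max_length)
    from_y = bfs_levels(reverse(graph), y, max_length)
    relevant = set(n for n in from_x
                   if n in from_y and from_x[n] + from_y[n] <= max_length)

    # Phase 2: reconstruct the actual paths inside the restricted subgraph.
    paths = []
    def extend(path):
        node = path[-1]
        if node == y:
            paths.append(path)
            return
        if len(path) > max_length:
            return
        for neighbor in graph.get(node, []):
            if neighbor in relevant and neighbor not in path:
                extend(path + [neighbor])
    extend([x])
    return paths

# Toy directed graph: cat -> animal both directly and via mammal.
graph = {'cat': ['mammal', 'animal'], 'mammal': ['animal'], 'animal': []}
print find_paths(graph, 'cat', 'animal', 2)
# [['cat', 'mammal', 'animal'], ['cat', 'animal']]
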
Example #8
def main():

    args = docopt("""The LSTM-based integrated pattern-based and distributional method for multiclass
    semantic relations classification

    Usage:
        parse_wikipedia.py <corpus_prefix> <dataset_prefix> <model_prefix_file> <embeddings_file> <num_hidden_layers>

        <corpus_prefix> = the prefix of the processed corpus files
        <dataset_prefix> = the directory of the dataset files (train.tsv, val.tsv, test.tsv, relations.txt)
        <model_prefix_file> = the prefix for the output model files
        <embeddings_file> = the pre-trained word embeddings file
        <num_hidden_layers> = the number of hidden layers
    """)

    corpus_prefix = args['<corpus_prefix>']
    dataset_prefix = args['<dataset_prefix>']
    model_prefix_file = args['<model_prefix_file>']
    embeddings_file = args['<embeddings_file>']
    num_hidden_layers = int(args['<num_hidden_layers>'])

    np.random.seed(133)

    # Load the relations
    with codecs.open(dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
        relation_index = { relation : i for i, relation in enumerate(relations) }

    # Load the datasets
    print 'Loading the dataset...'
    train_set = load_dataset(dataset_prefix + '/train.tsv', relations)
    val_set = load_dataset(dataset_prefix + '/val.tsv', relations)
    test_set = load_dataset(dataset_prefix + '/test.tsv', relations)
    y_train = [relation_index[label] for label in train_set.values()]
    y_val = [relation_index[label] for label in val_set.values()]
    y_test = [relation_index[label] for label in test_set.values()]
    dataset_keys = train_set.keys() + val_set.keys() + test_set.keys()
    print 'Done!'

    # Load the resource (processed corpus)
    print 'Loading the corpus...'
    corpus = KnowledgeResource(corpus_prefix)
    print 'Done!'

    # Get the vocabulary
    vocabulary = get_vocabulary(corpus, dataset_keys)

    # Load the word embeddings
    print 'Initializing word embeddings...'
    word_vectors, word_index = load_embeddings(embeddings_file, vocabulary)

    # Load the paths and create the feature vectors
    print 'Loading path files...'
    x_y_vectors, dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, \
    dir_inverted_index = load_paths_and_word_vectors(corpus, dataset_keys, word_index)
    print 'Number of words: %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' % \
          (len(word_index), len(pos_index), len(dep_index), len(dir_index))

    X_train = dataset_instances[:len(train_set)]
    X_val = dataset_instances[len(train_set):len(train_set)+len(val_set)]
    X_test = dataset_instances[len(train_set)+len(val_set):]

    x_y_vectors_train = x_y_vectors[:len(train_set)]
    x_y_vectors_val = x_y_vectors[len(train_set):len(train_set)+len(val_set)]
    x_y_vectors_test = x_y_vectors[len(train_set)+len(val_set):]

    # Tune the hyper-parameters using the validation set
    epochs = [10, 15, 20]
    word_dropout_rates = [0.0, 0.2, 0.4]
    f1_results = []
    descriptions = []
    model_prefixes = []

    for word_dropout_rate in word_dropout_rates:
        for n_epochs in epochs:

            # Create the classifier
            classifier = PathLSTMClassifier(num_lemmas=len(word_index), num_pos=len(pos_index), num_dep=len(dep_index),
                                            num_directions=len(dir_index), n_epochs=n_epochs, num_relations=len(relations),
                                            lemma_embeddings=word_vectors, dropout=word_dropout_rate,
                                            num_hidden_layers=num_hidden_layers)

            description = 'dropout = %.2f, num epochs = %d' % (word_dropout_rate, n_epochs)
            print 'Training with ' + description + '...'
            classifier.fit(X_train, y_train, x_y_vectors=x_y_vectors_train)

            pred = classifier.predict(X_val, x_y_vectors=x_y_vectors_val)
            precision, recall, f1, support = evaluate(y_val, pred, relations, do_full_reoprt=False)
            print 'Dropout = %f, num epochs = %d, Precision: %.3f, Recall: %.3f, F1: %.3f' % \
                  (word_dropout_rate, n_epochs, precision, recall, f1)
            f1_results.append(f1)

            # Save intermediate models
            curr_model_prefix = '%s_%.2f_%d' % (model_prefix_file, word_dropout_rate, n_epochs)
            model_prefixes.append(curr_model_prefix)
            classifier.save_model(curr_model_prefix, [word_index, pos_index, dep_index, dir_index])
            descriptions.append(description)
            classifier.close()

    best_index = np.argmax(f1_results)
    description = descriptions[best_index]
    print 'Best hyper-parameters: ' + description

    # Save the best model to a file
    print 'Saving the model...'
    best_model_prefix = model_prefixes[best_index]
    for file in glob.glob(best_model_prefix + '.*'):
        shutil.copy(file, model_prefix_file + file[file.index(best_model_prefix) + len(best_model_prefix):])

    classifier, word_index, pos_index, dep_index, dir_index = PathLSTMClassifier.load_model(model_prefix_file)

    # Evaluate on the test set
    print 'Evaluation:'
    pred = classifier.predict(X_test, x_y_vectors=x_y_vectors_test)
    precision, recall, f1, support = evaluate(y_test, pred, relations, do_full_reoprt=True)
    print 'Precision: %.3f, Recall: %.3f, F1: %.3f' % (precision, recall, f1)
    classifier.close()

    # Write the predictions to a file
    output_predictions(model_prefix_file + '.predictions', relations, pred, test_set.keys(), y_test)
Example #9
def main():

    np.random.seed(133)

    # Load the relations
    with codecs.open(args.dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
        relation_index = { relation : i for i, relation in enumerate(relations) }

    # Load the datasets
    print 'Loading the dataset...'
    train_set = load_dataset(args.dataset_prefix + '/train.tsv', relations)
    val_set = load_dataset(args.dataset_prefix + '/val.tsv', relations)
    test_set = load_dataset(args.dataset_prefix + '/test.tsv', relations)
    y_train = [relation_index[label] for label in train_set.values()]
    y_val = [relation_index[label] for label in val_set.values()]
    y_test = [relation_index[label] for label in test_set.values()]
    dataset_keys = train_set.keys() + val_set.keys() + test_set.keys()
    print 'Done!'

    # Load the resource (processed corpus)
    print 'Loading the corpus...'
    corpus = KnowledgeResource(args.corpus_prefix)
    print 'Done!'

    # Get the vocabulary
    vocabulary = get_vocabulary(corpus, dataset_keys)

    # Load the word embeddings
    print 'Initializing word embeddings...'
    word_vectors, lemma_index = load_embeddings(args.embeddings_file, vocabulary)
    lemma_inverted_index = { i : w for w, i in lemma_index.iteritems() }

    # Load the paths and create the feature vectors
    print 'Loading path files...'
    dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, \
    dir_inverted_index = load_paths(corpus, dataset_keys, lemma_index)
    print 'Number of words: %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' % \
          (len(lemma_index), len(pos_index), len(dep_index), len(dir_index))

    X_train = dataset_instances[:len(train_set)]
    X_val = dataset_instances[len(train_set):len(train_set)+len(val_set)]
    X_test = dataset_instances[len(train_set)+len(val_set):]

    # Tune the hyper-parameters using the validation set
    alphas = [0.001]
    word_dropout_rates = [0.0, 0.2, 0.4]
    f1_results = []
    models = []
    descriptions = []

    for alpha in alphas:
        for word_dropout_rate in word_dropout_rates:

            # Create the classifier
            classifier = PathLSTMClassifier(num_lemmas=len(lemma_index), num_pos=len(pos_index),
                                            num_dep=len(dep_index), num_directions=len(dir_index),
                                            n_epochs=args.num_epochs,
                                            num_relations=len(relations), lemma_embeddings=word_vectors,
                                            dropout=word_dropout_rate, alpha=alpha, use_xy_embeddings=False,
                                            num_hidden_layers=args.num_hidden_layers)

            print 'Training with learning rate = %f, dropout = %f...' % (alpha, word_dropout_rate)
            classifier.fit(X_train, y_train)

            pred = classifier.predict(X_val)
            precision, recall, f1, support = evaluate(y_val, pred, relations, do_full_reoprt=False)
            print 'Learning rate = %f, dropout = %f, Precision: %.3f, Recall: %.3f, F1: %.3f' % \
                  (alpha, word_dropout_rate, precision, recall, f1)
            f1_results.append(f1)
            models.append(classifier)

            # Save intermediate model
            classifier.save_model(args.model_prefix_file + '.' + str(word_dropout_rate),
                                  [lemma_index, pos_index, dep_index, dir_index])
            descriptions.append('Learning rate = %f, dropout = %f' % (alpha, word_dropout_rate))

    best_index = np.argmax(f1_results)
    classifier = models[best_index]
    description = descriptions[best_index]
    print 'Best hyper-parameters: ' + description

    # Save the best model to a file
    print 'Saving the model...'
    classifier.save_model(args.model_prefix_file, [lemma_index, pos_index, dep_index, dir_index])

    # Evaluate on the test set
    print 'Evaluation:'
    pred = classifier.predict(X_test)
    precision, recall, f1, support = evaluate(y_test, pred, relations, do_full_reoprt=True)
    print 'Precision: %.3f, Recall: %.3f, F1: %.3f' % (precision, recall, f1)
Example #10
File: train_RL.py    Project: xcgfth/TaxoRL
def main():
    print_config(opt)
    # Load the relations
    with codecs.open(args.dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
        relation_index = {relation: i for i, relation in enumerate(relations)}

    # Load the datasets
    if args.debug:
        trainname = '../datasets/wn-bo/train_sample.tsv'
        print 'Loading the dataset...', trainname, '*' * 10
        train_set = load_dataset(trainname, relations)
        val_set = load_dataset(trainname, relations)
        test_set = load_dataset(trainname, relations)
    else:
        trainname = '/' + args.trainname + '.tsv'
        valname = '/' + args.valname + '.tsv'
        testname = '/' + args.testname + '.tsv'
        print 'Loading the dataset...', trainname, '*' * 10
        train_set = load_dataset(args.dataset_prefix + trainname, relations)
        print 'Loading the dataset...', valname, '*' * 10
        val_set = load_dataset(args.dataset_prefix + valname, relations)
        print 'Loading the dataset...', testname, '*' * 10
        test_set = load_dataset(args.dataset_prefix + testname, relations)
    # y_train = [relation_index[label] for label in train_set.values()]
    # y_val = [relation_index[label] for label in val_set.values()]
    # y_test = [relation_index[label] for label in test_set.values()]
    dataset_keys = train_set.keys() + val_set.keys() + test_set.keys()
    # add (x, root) to dataset_keys
    vocab = set()
    for (x, y) in dataset_keys:
        vocab.add(x)
        vocab.add(y)
    dataset_keys += [(term, 'root007') for term in vocab]

    if not args.debug:
        trees = read_tree_file(
            "../datasets/wn-bo/wn-bo-trees-4-11-50-train533-lower.ptb",
            given_root=args.given_root_train, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_val = read_tree_file(
            "../datasets/wn-bo/wn-bo-trees-4-11-50-dev114-lower.ptb",
            given_root=args.given_root_test, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_test = read_tree_file(
            "../datasets/wn-bo/wn-bo-trees-4-11-50-test114-lower.ptb",
            given_root=args.given_root_test, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_semeval = read_edge_files("../datasets/SemEval-2016/original/",
                                        given_root=True, filter_root=args.filter_root, allow_up=False)
    else:
        trees = read_tree_file(
            "../datasets/wn-bo/train_sample.ptb2",
            given_root=args.given_root_train, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_val = read_tree_file(
            "../datasets/wn-bo/train_sample.ptb2",
            given_root=args.given_root_train, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_test = read_tree_file(
            "../datasets/wn-bo/train_sample.ptb2",
            given_root=args.given_root_test, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_semeval = read_tree_file(
            "../datasets/wn-bo/train_sample.ptb2",
            given_root=args.given_root_test, filter_root=args.filter_root, allow_up=args.allow_up)

    # Load the resource (processed corpus)
    print 'Loading the corpus...', args.corpus_prefix, '*' * 10
    corpus = KnowledgeResource(args.corpus_prefix)

    if not os.path.exists('pickled_data/preload_data_{}_debug{}.pkl'.format(args.model_prefix_file, args.debug)):
        print 'Loading the vocabulary...'
        # path_lemmas_name = "pickled_data/path_lemmas_3in1.pkl"
        # print 'reload path_lemmas from:', path_lemmas_name
        # path_lemmas = pickle.load(open(path_lemmas_name, 'rb'))
        path_lemmas, x_y_words, keys = get_vocabulary(corpus, dataset_keys, None)
        if not args.debug:
            pickle.dump(path_lemmas, open('pickled_data/path_lemmas_{}.pkl'.format(args.model_prefix_file), 'wb'))
            pickle.dump(x_y_words, open('pickled_data/x_y_words_{}.pkl'.format(args.model_prefix_file), 'wb'))

        # Load the word embeddings
        print 'Initializing word embeddings...'
        word_vectors, word_index, word_set = load_embeddings(args.embeddings_file, path_lemmas, x_y_words,
                                                             debug=args.debug)
        # Load the paths and create the feature vectors
        print 'Loading path files...'
        dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, \
        dir_inverted_index = load_paths_and_word_vectors(corpus, dataset_keys, word_index, keys)
        print 'saving pkl...'
        pickle.dump((word_vectors, word_index, word_set, dataset_instances, pos_index, dep_index, dir_index,
                     pos_inverted_index, dep_inverted_index, dir_inverted_index),
                    open('pickled_data/preload_data_{}_debug{}.pkl'.format(args.model_prefix_file, args.debug), 'wb'))
    else:
        print 'Data loaded from', 'pickled_data/preload_data_{}_debug{}.pkl'.format(args.model_prefix_file,
                                                                                    args.debug), 'make sure pkl is correct'
        (word_vectors, word_index, word_set, dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index,
         dep_inverted_index, dir_inverted_index) = pickle.load(
            open('pickled_data/preload_data_{}_debug{}.pkl'.format(args.model_prefix_file, args.debug), 'rb'))

    print 'Number of words: %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' % \
          (len(word_index), len(pos_index), len(dep_index), len(dir_index))

    # dataset_instances is now (paths, x_y_vectors, features)
    X_train = dataset_instances[:len(train_set)]
    X_val = dataset_instances[len(train_set):len(train_set) + len(val_set)]
    X_test = dataset_instances[len(train_set) + len(val_set):]
    print len(X_train), len(X_val), len(X_test)

    # check_data(train_set, X_train, word_set)
    # check_data(val_set, X_val, word_set)
    # check_data(test_set, X_test, word_set)
    # save_path_info(dataset_keys, dataset_instances)
    # scores_save = []
    # scores_save_test = []
    # prob_save = []
    # prob_save_test = []
    policy = Policy(dataset_keys, dataset_instances, num_lemmas=len(word_index), num_pos=len(pos_index),
                    num_dep=len(dep_index), num_directions=len(dir_index), opt=opt, num_relations=len(relations),
                    lemma_embeddings=word_vectors)
    trainer = dy.AdamTrainer(policy.model, alpha=args.lr)
    n_epoch = 1000  # same epoch budget whether or not args.debug is set
    best = [0] * 6
    best_idx = [0] * 6
    best_val = [0] * 6
    best_val_idx = [0] * 6
    best_test = [0] * 6
    best_test_idx = [0] * 6
    best_semeval = [0] * 6
    best_semeval_idx = [0] * 6
    policy_save_test = defaultdict(list)
    wrong_total_l = []

    # check_limit(trees, policy, policy.unk_hard)
    # check_limit(trees, policy, policy.unk_soft)
    # check_limit(trees_test, policy, policy.unk_hard)
    # check_limit(trees_test, policy, policy.unk_soft)
    # exit(0)

    # TRAIN / TEST START HERE
    if args.load_model_file is None:
        for epoch in range(n_epoch):
            best, best_idx = train(epoch, trees, policy, trainer, best, best_idx, wrong_total_l)
            # policy_save_test, best_test, best_test_idx = test(epoch, trees_test, policy, policy_save_test, best_test,
            #                                                   best_test_idx)
            _, best_val, best_val_idx = test_single(epoch, trees_val, policy, [], best_val, best_val_idx, wrong_total_l)
            policy_save_test, best_test, best_test_idx = test_single(epoch, trees_test, policy, policy_save_test,
                                                                     best_test, best_test_idx, wrong_total_l)
    else:
        load_candidate_from_pickle(trees_semeval)
        _, best_semeval, best_semeval_idx = test_single(0, trees_semeval, policy, [], best_semeval,
                                                        best_semeval_idx, wrong_total_l,
                                                        reward_type='print_each')