import networkx as nx if __name__ == "__main__": # Method variables #predict_threshold = 0.9 # prediction threshold input_file = "test-full.txt" output_file = "test-expanded-all-neighb.txt" # Generate graph from edges file G = ex.read_graph(c.output_dir + "edges-directed-6.txt") # Get feature space and target words features_index = u.read_features_file(c.output_dir + "feat-space.txt") target_words = u.get_target_words() # Get sentences/vectors of data to expand sentences, data = ex.get_expansion_data(input_file, features_index) # Get matrix of weight vectors print "generating weight matrix..." W, b_arr = u.get_weight_matrix(target_words) print "expanding feature vectors..." i = 0 for vect in data: #if i == 5: # break
# Get training data (sentences) to learn agreement with.
# We select different datasets for every pair of words to optimise the agreement measure.
# Dataset for pair x and y: N sents containing x, N sents containing y, and N sents containing neither.
import lib.utility as u
import config as c
import random
import itertools
import os
from time import time

if __name__ == "__main__":
    # Load target words into memory
    target_words = u.get_target_words()

    # Get highly related pairs of words to get agreement data for
    #pairs = u.get_lines(c.output_dir + "target-word-high-pairs.txt")

    # Store all sentences in memory
    sentences_with = set(u.get_lines(c.output_dir + "sentences-with.txt"))
    sentences_without = set(u.get_lines(c.output_dir + "sentences-without.txt"))

    # Get all sentences used already - using a dict
    # (so we only care about sentences used for a specific word)
    used_sentences = {}
    for word in target_words:
        # `fh` rather than `file`: `file` shadows a Python 2 builtin
        with open(c.train_data_dir + word + ".txt") as fh:
            # line[3:] strips a fixed 3-character prefix from each stored
            # sentence — NOTE(review): presumably a label/marker written by
            # the training-data generator; confirm against the writer side.
            used_sentences[word] = {line[3:].strip() for line in fh}
Predicts presence of word in each instance in dataset, and returns results array. """ file_name = c.rbf_data + word + '.txt' # Get predicted classes for word results = u.get_lines(file_name) results = results[1:] return results if __name__ == "__main__": # Read features and target words features_index = u.read_features_file(c.output_dir + "feat-space.txt") feat_size = len(features_index) #5000 target_words = u.get_target_words() # len = 700 # Set up two matrices - one for directed (asymmetric matrix) and one for undirected (symmetric matrix) agreements #dimensions = len(target_words) #SM = [[0.0000 for j in xrange(dimensions)] for i in xrange(dimensions)] #AM = [[0.0000 for j in xrange(dimensions)] for i in xrange(dimensions)] # Get pairs #pairs = u.get_lines(c.output_dir + "target-word-high-pairs.txt") # Arrays to hold agreements pos_agr_a = [] cond_prob_a = [] # Iterate through each pair, and compute two types of agreement # counter