def get_example_data():
    """Read the example tensor from the folder named in sys.argv[1].

    Builds one text sample (subject words) and one target (list of category
    column indices, so a sample may carry several labels) per need index.

    Returns:
        (data, target, target_names): list of space-joined subject strings,
        list of category-index lists, and the tensor's header list.
    """
    # read the tensor from the folder passed by args
    data_file_prefix = sys.argv[1]
    header_file = data_file_prefix + '/headers.txt'
    data_files = [data_file_prefix + "/connection.mtx",
                  data_file_prefix + "/needtype.mtx",
                  data_file_prefix + "/subject.mtx",
                  data_file_prefix + "/content.mtx",
                  data_file_prefix + "/category.mtx"]
    slices = [SparseTensor.CONNECTION_SLICE,
              SparseTensor.NEED_TYPE_SLICE,
              SparseTensor.ATTR_SUBJECT_SLICE,
              SparseTensor.ATTR_CONTENT_SLICE,
              SparseTensor.CATEGORY_SLICE]
    tensor = read_input_tensor(header_file, data_files, slices, False)

    data = []
    target = []
    # Store the chosen input into lists.
    for need_index in tensor.getNeedIndices():
        # all nonzero category columns of this need's row (multilabel target)
        category_index = tensor.getSliceMatrix(
            SparseTensor.CATEGORY_SLICE)[need_index, ].nonzero()[1].tolist()
        target.append(category_index)
        # join at C speed instead of quadratic "+=" concatenation; each word
        # keeps its trailing space, matching the original output exactly
        content = "".join(
            word + " " for word in tensor.getAttributesForNeed(
                need_index, SparseTensor.ATTR_SUBJECT_SLICE))
        data.append(content)

    # Print out the input, just a check.  Single-argument print(...) behaves
    # the same under Python 2 and 3, unlike the bare "print x" statements used
    # before.
    target_names = tensor.getHeaders()
    print("test")
    print(data)
    print(target_names)
    print(target)
    return data, target, target_names
def get_example_data():
    """Read the example tensor from the folder named in sys.argv[1].

    Builds one text sample (subject words) per need that has at least one
    category, using the first nonzero category column as the single label,
    then keeps only samples whose label occurs more than 50 times.

    Returns:
        (data, target, target_names): list of space-joined subject strings,
        list of single category indices, and the tensor's header list.
    """
    from collections import Counter

    # read the tensor from the folder passed by args
    data_file_prefix = sys.argv[1]
    header_file = data_file_prefix + '/headers.txt'
    data_files = [data_file_prefix + "/connection.mtx",
                  data_file_prefix + "/needtype.mtx",
                  data_file_prefix + "/subject.mtx",
                  data_file_prefix + "/content.mtx",
                  data_file_prefix + "/category.mtx"]
    slices = [SparseTensor.CONNECTION_SLICE,
              SparseTensor.NEED_TYPE_SLICE,
              SparseTensor.ATTR_SUBJECT_SLICE,
              SparseTensor.ATTR_CONTENT_SLICE,
              SparseTensor.CATEGORY_SLICE]
    tensor = read_input_tensor(header_file, data_files, slices, False)

    data = []
    target = []
    # Store the chosen input into lists.
    # NOTE(review): the original comment says this "if" should keep only
    # single-category samples (no multilabel), but the condition admits
    # needs with several categories and silently keeps the first one —
    # confirm whether ">= 1" should be "== 1".
    for need_index in tensor.getNeedIndices():
        categories = tensor.getAttributesForNeed(
            need_index, SparseTensor.CATEGORY_SLICE)
        if len(categories) >= 1:
            # first nonzero category column of this need's row
            category_index = tensor.getSliceMatrix(
                SparseTensor.CATEGORY_SLICE)[need_index, ].nonzero()[1][0]
            target.append(category_index)
            # join at C speed instead of quadratic "+=" concatenation
            content = "".join(
                word + " " for word in tensor.getAttributesForNeed(
                    need_index, SparseTensor.ATTR_SUBJECT_SLICE))
            data.append(content)

    # Include only few of all the categories (e.g. with samples > n).
    # Counting once with Counter replaces the original per-item
    # target.count(...) which made this filter O(n^2).
    label_counts = Counter(target)
    newdata = []
    newtarget = []
    for sample, label in zip(data, target):
        if label_counts[label] > 50:
            newtarget.append(label)
            newdata.append(sample)
    data = newdata
    target = newtarget

    # Print out the input, just a check.  Single-argument print(...) behaves
    # the same under Python 2 and 3.
    target_names = tensor.getHeaders()
    print("test")
    print(data)
    print(target_names)
    print(target)
    return data, target, target_names
help="threshold of rescal algorithm to produce hints") args = parser.parse_args() # load the tensor header_file = "headers.txt" slice_files = [] for file in os.listdir(args.inputfolder): if file.endswith(".mtx"): slice_files.append(file) header_input = args.inputfolder + "/" + header_file data_input = [] for slice in slice_files: data_input.append(args.inputfolder + "/" + slice) input_tensor = read_input_tensor(header_input, data_input, True) # execute rescal A, R = execute_extrescal(input_tensor, args.rank) # predict new hints _log.info("predict hints with threshold: %f" % args.threshold) mask_matrix = create_hint_mask_matrix(input_tensor) connection_prediction = predict_rescal_hints_by_threshold( A, R, args.threshold, mask_matrix) _log.info("number of hints created: %d" % len(connection_prediction.nonzero()[0])) # write the hint output matrix output = args.outputfolder + "/" + "hints.mtx" _log.info("write hint prediction output matrix: " + output)
# load the tensor: headers file, atom indices file, and every .mtx slice in
# the input folder
header_file = "headers.txt"
atom_indices_file = "atomIndices.txt"
# comprehension instead of the append loop; "fname" avoids shadowing the
# builtins "file" and "slice" that the original used as loop variables
slice_files = [fname for fname in os.listdir(args.inputfolder)
               if fname.endswith(".mtx")]
header_input = args.inputfolder + "/" + header_file
atom_indices_input = args.inputfolder + "/" + atom_indices_file
data_input = [args.inputfolder + "/" + fname for fname in slice_files]
input_tensor = read_input_tensor(header_input, atom_indices_input,
                                 data_input, True)

# execute rescal
A, R = execute_extrescal(input_tensor, args.rank)

# predict new hints above the configured threshold
_log.info("predict hints with threshold: %f" % args.threshold)
connection_prediction = predict_rescal_hints_by_threshold(
    A, R, args.threshold, input_tensor)
_log.info("number of hints created: %d" %
          len(connection_prediction.nonzero()[0]))

# write the hint output matrix
output = args.outputfolder + "/" + "hints.mtx"
_log.info("write hint prediction output matrix: " + output)
def __init__(self, args, output_folder, logger, ground_truth, start_time):
    """Set up the evaluation and load the prediction tensor.

    Delegates common setup to init(), then reads the prediction matrix
    file (a single CONNECTION slice) into self.file_prediction_tensor.
    """
    self.init(args, output_folder, logger, ground_truth, start_time)
    self.file_prediction_tensor = read_input_tensor(
        args.inputfolder + "/" + args.headers,
        [args.prediction_matrix_file],
        [SparseTensor.CONNECTION_SLICE],
        True)
# Resolve the output folder (explicit argument wins, otherwise a
# timestamped subfolder of the input folder) and attach a log file there.
outfolder = args.outputfolder if args.outputfolder else folder + "/out/" + start_time
if not os.path.exists(outfolder):
    os.makedirs(outfolder)
hdlr = logging.FileHandler(outfolder + "/eval_result_" + start_time + ".log")
_log.addHandler(hdlr)

# load the tensor input data: connection and needtype slices plus any
# additional slices passed on the command line
data_input = [folder + "/" + args.connection_slice,
              folder + "/" + args.needtype_slice]
data_input.extend(folder + "/" + extra_slice
                  for extra_slice in args.additional_slices)
header_input = folder + "/" + args.headers
slices = SparseTensor.defaultSlices + [SparseTensor.ATTR_CONTENT_SLICE,
                                       SparseTensor.CATEGORY_SLICE]
input_tensor = read_input_tensor(header_input, data_input, slices, True)

# TEST-PARAMETERS:
# ===================
# (10-)fold cross validation
FOLDS = args.folds
# True means: for testing mask all connections of random test needs (Test Case: Predict connections for new need
# without connections)
# False means: for testing mask random connections (Test Case: Predict connections for existing need which may
# already have connections)
MASK_ALL_CONNECTIONS_OF_TEST_NEED = not args.maskrandom
# by changing this parameter the number of training connections per need can be set. Choose a high value (e.g.
# 100) to use all connection in the connections file. Choose a low number to restrict the number of training