Example #1
def test_dbpedia_loading():
	adjpath = abspath(expanduser('~/Projects/truthy_data/dbpedia/2016-04/processed/kg/adjacency.npy'))
	shape = (6060993, 6060993, 663)
	dirpath = join(dirname(adjpath), '_undir')
	G = Graph.reconstruct(dirpath, shape, sym=True)
	assert np.all(G.csr.indices >= 0)

	# reverse graph
	dirpath = join(dirname(adjpath), '_revundir')
	revG = Graph.reconstruct(dirpath, shape, sym=True)
	assert np.all(revG.csr.indices >= 0)
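
A note on the assertion: for this graph the CSR column space is N * R = 6060993 * 663, roughly 4.02e9 column ids, which overflows a signed 32-bit integer, so negative indices would be the telltale sign of a too-narrow index dtype. A minimal sketch of the bound (the shape comes from the test; the overflow reading of the assert is an assumption):

import numpy as np

N, R = 6060993, 663            # nodes and relations from `shape`
max_col = N * R - 1            # largest CSR column id (col = r * N + o)
print(max_col)                 # 4018438358
print(np.iinfo(np.int32).max)  # 2147483647, so int64 indices are required
assert max_col > np.iinfo(np.int32).max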
Example #2
def test_graph1_creation():
    shape = np.asarray([4, 4, 2], dtype=np.int32)
    adj = np.asarray([
        [0, 1, 0],
        [0, 2, 1],
        [1, 2, 0],
        [1, 2, 1],
        [1, 3, 1],
        [2, 3, 0],
        [2, 1, 1],
        [2, 3, 1],
    ],
                     dtype=np.int32)
    values = np.arange(adj.shape[0]) + 10.

    # create graph
    expect_G = np.asarray([[0., 1., 0., 0., 0., 0., 1., 0.],
                           [1., 0., 1., 0., 0., 0., 1., 1.],
                           [0., 1., 0., 1., 1., 1., 0., 1.],
                           [0., 0., 1., 0., 0., 1., 1., 0.]])
    G = make_graph(adj, shape, sym=True, save_csc=True)
    assert np.array_equal(G.csr.toarray(), G.csc.toarray())
    dirpath = join(abspath(expanduser(os.curdir)), '_undir')
    if not exists(dirpath):
        os.mkdir(dirpath)
    G.save_graph(dirpath)
    assert np.array_equal(G.indeg_vec, np.asarray([2, 3, 3, 2]))
    assert np.array_equal(expect_G, G.csr.toarray())

    # rebuild graph
    G = Graph.reconstruct(dirpath, shape, sym=True, save_csc=True)
    assert np.array_equal(expect_G, G.csr.toarray())
    if exists(dirpath):
        shutil.rmtree(dirpath)
        print('Removed: %s' % dirpath)
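
For reference, the expected matrix above can be rebuilt directly from the triple list: with shape (N, N, R), the dense layout puts edge (s, o, r) at column r * N + o, and sym=True mirrors it. A standalone NumPy sketch:

import numpy as np

N, R = 4, 2
adj = [(0, 1, 0), (0, 2, 1), (1, 2, 0), (1, 2, 1),
       (1, 3, 1), (2, 3, 0), (2, 1, 1), (2, 3, 1)]
M = np.zeros((N, N * R))
for s, o, r in adj:
    M[s, r * N + o] = 1.  # forward edge
    M[o, r * N + s] = 1.  # mirrored edge (sym=True)
# M now equals expect_G from the test above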
Example #3
def test_dbpedia():
    adjpath = abspath(expanduser('./data/kg/adjacency.npy'))
    shape = (6060993, 6060993, 663)
    adj = np.load(adjpath)
    adj = adj.astype(np.int32)
    T = Graph(adj, shape, sym=True)

    # save graph
    print('Saving graph..')
    t1 = time()
    dirpath = join(dirname(adjpath), '_undir')
    if not exists(dirpath):
        os.makedirs(dirpath)
        print('* Created: %s' % dirpath)
    T.save_graph(dirpath)
    print('Graph saved in {:.4f} secs at: {} '.format(time() - t1, dirpath))
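
A natural round-trip check, mirroring Examples #1 and #4, is to reconstruct the graph from the directory just written; a short sketch:

G = Graph.reconstruct(dirpath, shape, sym=True)
assert np.all(G.csr.indices >= 0)  # guard against index-dtype overflow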
Example #4
def test_dbpedia():
	dirpath = abspath(expanduser('./data/kg/_undir/'))
	shape = (6060993, 6060993, 663)
	G = Graph.reconstruct(dirpath, shape, sym=True)
	cost_vec = np.log(G.indeg_vec)
	
	s, p, o = 2145431, 178, 459128 # Gravity, Alfonso Cuarón
	mincostflow = succ_shortest_path(G, cost_vec, s, p, o)
	print(mincostflow)
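
Judging by Example #9 below, the object returned by succ_shortest_path exposes the total flow and the path stream; a hedged sketch of inspecting the result (attribute names taken from Example #9, and the stream attribute may depend on the return_flow/npaths arguments used there):

print(mincostflow.flow)                  # total min-cost flow for (s, p, o)
print(len(mincostflow.stream['paths']))  # number of paths in the stream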
Example #5
def main(args=None):
    # parse arguments
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-m',
                        type=str,
                        required=True,
                        dest='method',
                        help='Method to use: stream, relklinker, klinker, '
                             'predpath, pra, katz, pathent, simrank, '
                             'adamic_adar, jaccard, degree_product.')
    parser.add_argument('-d',
                        type=str,
                        required=True,
                        dest='dataset',
                        help='Dataset to test on.')
    parser.add_argument('-o',
                        type=str,
                        required=True,
                        dest='outdir',
                        help='Path to the output directory.')
    args = parser.parse_args()

    # logging
    disable_logging(log.DEBUG)

    if args.method not in ('stream', 'relklinker', 'klinker', 'predpath',
                           'pra', 'katz', 'pathent', 'simrank', 'adamic_adar',
                           'jaccard', 'degree_product'):
        raise Exception('Invalid method specified.')

    # ensure the input file and output directory are valid.
    outdir = abspath(expanduser(args.outdir))
    assert exists(outdir)
    args.outdir = outdir
    datafile = abspath(expanduser(args.dataset))
    assert exists(datafile)
    args.dataset = datafile
    log.info('Launching {}..'.format(args.method))
    log.info('Dataset: {}'.format(basename(args.dataset)))
    log.info('Output dir: {}'.format(args.outdir))

    # read data
    df = pd.read_table(args.dataset, sep=',', header=0)
    log.info('Read data: {} {}'.format(df.shape, basename(args.dataset)))
    spo_df = df.dropna(axis=0, subset=['sid', 'pid', 'oid'])
    log.info('Note: Found non-NA records: {}'.format(spo_df.shape))
    df = spo_df[['sid', 'pid', 'oid']].values
    subs, preds, objs = df[:, 0].astype(_int), df[:, 1].astype(
        _int), df[:, 2].astype(_int)

    # load knowledge graph
    G = Graph.reconstruct(PATH, SHAPE, sym=True)  # undirected
    assert np.all(G.csr.indices >= 0)

    # relational similarity
    relsim = np.load(RELSIMPATH)

    # execute
    base = splitext(basename(args.dataset))[0]
    t1 = time()
    if args.method == 'stream':  # KNOWLEDGE STREAM (KS)
        # compute min. cost flow
        log.info('Computing KS for {} triples..'.format(spo_df.shape[0]))
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            outjson = join(args.outdir,
                           'out_kstream_{}_{}.json'.format(base, DATE))
            outcsv = join(args.outdir,
                          'out_kstream_{}_{}.csv'.format(base, DATE))
            mincostflows, times = compute_mincostflow(G, relsim, subs, preds,
                                                      objs, outjson)
            # save the results
            spo_df['score'] = mincostflows
            spo_df['time'] = times
            spo_df = normalize(spo_df)
            spo_df.to_csv(outcsv, sep=',', header=True, index=False)
            log.info('* Saved results: %s' % outcsv)
        log.info(
            'Mincostflow computation complete. Time taken: {:.2f} secs.\n'.
            format(time() - t1))
    elif args.method == 'relklinker':  # RELATIONAL KNOWLEDGE LINKER (KL-REL)
        log.info('Computing KL-REL for {} triples..'.format(spo_df.shape[0]))
        scores, paths, rpaths, times = compute_relklinker(
            G, relsim, subs, preds, objs)
        # save the results
        spo_df['score'] = scores
        spo_df['path'] = paths
        spo_df['rpath'] = rpaths
        spo_df['time'] = times
        spo_df = normalize(spo_df)
        outcsv = join(args.outdir,
                      'out_relklinker_{}_{}.csv'.format(base, DATE))
        spo_df.to_csv(outcsv, sep=',', header=True, index=False)
        log.info('* Saved results: %s' % outcsv)
        log.info(
            'Relational KL computation complete. Time taken: {:.2f} secs.\n'.
            format(time() - t1))
    elif args.method == 'klinker':
        log.info('Computing KL for {} triples..'.format(spo_df.shape[0]))
        scores, paths, rpaths, times = compute_klinker(G, subs, preds, objs)
        # save the results
        spo_df['score'] = scores
        spo_df['path'] = paths
        spo_df['rpath'] = rpaths
        spo_df['time'] = times
        spo_df = normalize(spo_df)
        outcsv = join(args.outdir, 'out_klinker_{}_{}.csv'.format(base, DATE))
        spo_df.to_csv(outcsv, sep=',', header=True, index=False)
        log.info('* Saved results: %s' % outcsv)
        log.info('KL computation complete. Time taken: {:.2f} secs.\n'.format(
            time() - t1))
    elif args.method == 'predpath':  # PREDPATH
        vec, model = predpath_train_model(G, spo_df)  # train
        print('Time taken: {:.2f}s\n'.format(time() - t1))
        # save model
        predictor = {'dictvectorizer': vec, 'model': model}
        try:
            outpkl = join(args.outdir,
                          'out_predpath_{}_{}.pkl'.format(base, DATE))
            with open(outpkl, 'wb') as g:
                pkl.dump(predictor, g, protocol=pkl.HIGHEST_PROTOCOL)
            print('Saved: {}'.format(outpkl))
        except IOError as e:
            raise e
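
The script expects a comma-separated file with sid, pid and oid columns (the predpath branch additionally needs a class label for training, as in Example #6). A minimal sketch of a valid input file; the ids reuse the Gravity / Alfonso Cuarón triple from Example #4, and the script name in the invocation is a placeholder:

import pandas as pd

df = pd.DataFrame({'sid': [2145431], 'pid': [178], 'oid': [459128],
                   'class': [1]})  # 'class' is only needed for predpath training
df.to_csv('sample_triples.csv', index=False)
# hypothetical invocation: python main.py -m stream -d sample_triples.csv -o ./output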
Example #6
class Pra(object):

    name = 'pra'

    HOME = abspath(expanduser('./data/'))

    if not exists(HOME):
        print('Data directory not found: %s' % HOME)
        print('Download data per instructions on:')
        print('\thttps://github.com/shiralkarprashant/knowledgestream#data')
        print('and enter the directory path below.')
        data_dir = input('\nPlease enter data directory path: ')
        if data_dir != '':
            data_dir = abspath(expanduser(data_dir))
        if not os.path.isdir(data_dir):
            raise Exception('Entered path "%s" not a directory.' % data_dir)
        if not exists(data_dir):
            raise Exception('Directory does not exist: %s' % data_dir)
        HOME = data_dir
    # raise Exception('Please set HOME to data directory in algorithms/__main__.py')
    PATH = join(HOME, 'kg/_undir/')
    assert exists(PATH)
    SHAPE = (6060993, 6060993, 663)
    WTFN = 'logdegree'

    # relational similarity using TF-IDF representation and cosine similarity
    RELSIMPATH = join(HOME, 'relsim/coo_mat_sym_2016-10-24_log-tf_tfidf.npy')
    assert exists(RELSIMPATH)

    # Date
    DATE = '{}'.format(date.today())

    # data types for int and float
    _short = np.int16
    _int = np.int32
    _int64 = np.int64
    _float = np.float64

    # load knowledge graph
    G = Graph.reconstruct(PATH, SHAPE, sym=True)  # undirected
    assert np.all(G.csr.indices >= 0)

    # relational similarity
    relsim = np.load(RELSIMPATH)

    # ================= PRA ALGORITHM IMPLEMENTATION ============

    @rpc  # Methods are exposed to the outside world with entrypoint decorators (RPC in our case)
    def stream(self, data, args=None):

        print('\nThe following request in RDF format was passed:')
        print(data)

        identification, theDate, suri, puri, ouri = extract.getValues(data)

        print('\nSURI, PURI and OURI are:')
        print(suri)
        print(puri)
        print(ouri)
        print('\n')

        # sid, pid, oid = self.uriToId(suri, puri, ouri)
        sid, pid, oid = mapping.convert(suri, puri, ouri)

        # required for passing it to compute_mincostflow
        sid, pid, oid = np.array([sid]), np.array([pid]), np.array([oid])

        t1 = time()

        print('\nTheir corresponding IDs are:')
        print(sid)
        print(pid)
        print(oid)
        print('\n')

        log.info('Computing Predpath for triple')
        int_sid = int(sid)
        int_pid = int(pid)
        int_oid = int(oid)

        print("The subject id is: %s " % int_sid)
        print("The predicate id is: %s" % int_pid)
        print("The object id is: %s" % int_oid)

        # Creating a dataframe
        data = {
            'sid': [int_sid],
            'pid': [int_pid],
            'oid': [int_oid],
            'class': [0]
        }

        #__________________test________________________
        dfObj = pd.DataFrame(data)
        test_spo_df = dfObj.dropna(axis=0,
                                   subset=['sid', 'pid', 'oid', 'class'])

        with open("./output/trained_pra_model.pkl", "rb") as f:
            test_model = pkl.load(f)

        with open("./output/pra_features_file.pkl", "rb") as f:
            test_features = pkl.load(f)
        with warnings.catch_warnings():
            try:
                warnings.simplefilter("ignore")

                # pra_predict() function is used to predict the triple's veracity
                array_value = pra_predict(self.G, test_features, test_model,
                                          test_spo_df)  # test
                val = str(array_value)[1:-1]
                log.info(
                    'Predpath computation complete. Time taken: {:.2f} secs.\n'
                    .format(time() - t1))
                result = '<http://swc2017.aksw.org/task2/dataset/s-' + str(
                    identification
                ) + '> <http://swc2017.aksw.org/hasTruthValue>\"' + str(
                    val) + '\"<http://www.w3.org/2001/XMLSchema#double> .'
                print('The result in RDF format is:')
                print(result)

            except MemoryError:
                print('\nA MemoryError is successfully caught.')
                result = 'MemoryError'

        return result
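
The @rpc decorator and the comment beside it suggest these classes are nameko services; assuming that, a client can call stream() through nameko's standalone RPC proxy (the AMQP URI is a placeholder, and rdf_request stands for the RDF payload shown above):

from nameko.standalone.rpc import ClusterRpcProxy

config = {'AMQP_URI': 'amqp://guest:guest@localhost'}
with ClusterRpcProxy(config) as cluster:
    # the service name comes from the class attribute `name = 'pra'`
    result = cluster.pra.stream(rdf_request)
    print(result)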
Example #7
def main(args=None):
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-d',
                        type=str,
                        required=True,
                        dest='dataset',
                        help='Dataset to test on.')
    parser.add_argument('-o',
                        type=str,
                        required=True,
                        dest='outdir',
                        help='Path to the output directory.')
    parser.add_argument('-m',
                        type=str,
                        required=True,
                        dest='method',
                        help='Method to use: stream, relklinker, klinker, '
                             'predpath, sm')
    args = parser.parse_args()

    relsim = np.load(RELSIMPATH)

    outdir = abspath(expanduser(args.outdir))
    assert exists(outdir)
    args.outdir = outdir
    datafile = abspath(expanduser(args.dataset))
    assert exists(datafile)
    args.dataset = datafile
    LOGPATH = join(HOME, '../logs')
    assert exists(LOGPATH)
    base = splitext(basename(args.dataset))[0]
    log_file = join(LOGPATH, 'log_{}_{}_{}.log'.format(args.method, base,
                                                       DATE))
    log.basicConfig(format='[%(asctime)s] %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S %p',
                    filename=log_file,
                    level=log.DEBUG)
    log.getLogger().addHandler(log.StreamHandler())
    log.info('Launching {}..'.format(args.method))
    log.info('Dataset: {}'.format(basename(args.dataset)))
    log.info('Output dir: {}'.format(args.outdir))

    # read data
    df = pd.read_table(args.dataset, sep=',', header=0)
    log.info('Read data: {} {}'.format(df.shape, basename(args.dataset)))
    spo_df = df.dropna(axis=0, subset=['sid', 'pid', 'oid'])
    log.info('Note: Found non-NA records: {}'.format(spo_df.shape))
    df = spo_df[['sid', 'pid', 'oid']].values
    subs, preds, objs = df[:, 0].astype(_int), df[:, 1].astype(
        _int), df[:, 2].astype(_int)

    # load knowledge graph
    G = Graph.reconstruct(PATH, SHAPE, sym=True)  # undirected
    assert np.all(G.csr.indices >= 0)

    t1 = time()

    if args.method == 'stream':  # KNOWLEDGE STREAM (KS)
        # compute min. cost flow
        log.info('Computing KS for {} triples..'.format(spo_df.shape[0]))
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            outjson = join(args.outdir,
                           'out_kstream_{}_{}.json'.format(base, DATE))
            outcsv = join(args.outdir,
                          'out_kstream_{}_{}.csv'.format(base, DATE))
            mincostflows, times = compute_mincostflow(G, relsim, subs, preds,
                                                      objs, outjson)
            # save the results
            spo_df['score'] = mincostflows
            spo_df['time'] = times
            spo_df = normalize(spo_df)
            spo_df.to_csv(outcsv, sep=',', header=True, index=False)
            log.info('* Saved results: %s' % outcsv)
        log.info(
            'Mincostflow computation complete. Time taken: {:.2f} secs.\n'.
            format(time() - t1))
    elif args.method == 'relklinker':  # RELATIONAL KNOWLEDGE LINKER (KL-REL)
        log.info('Computing KL-REL for {} triples..'.format(spo_df.shape[0]))
        scores, paths, rpaths, times = compute_relklinker(
            G, relsim, subs, preds, objs)
        # save the results
        spo_df['score'] = scores
        spo_df['path'] = paths
        spo_df['rpath'] = rpaths
        spo_df['time'] = times
        spo_df = normalize(spo_df)
        outcsv = join(args.outdir,
                      'out_relklinker_{}_{}.csv'.format(base, DATE))
        spo_df.to_csv(outcsv, sep=',', header=True, index=False)
        log.info('* Saved results: %s' % outcsv)
        log.info(
            'Relational KL computation complete. Time taken: {:.2f} secs.\n'.
            format(time() - t1))
    elif args.method == 'klinker':
        log.info('Computing KL for {} triples..'.format(spo_df.shape[0]))
        scores, paths, rpaths, times = compute_klinker(G, subs, preds, objs)
        # save the results
        spo_df['score'] = scores
        spo_df['path'] = paths
        spo_df['rpath'] = rpaths
        spo_df['time'] = times
        spo_df = normalize(spo_df)
        outcsv = join(args.outdir, 'out_klinker_{}_{}.csv'.format(base, DATE))
        spo_df.to_csv(outcsv, sep=',', header=True, index=False)
        log.info('* Saved results: %s' % outcsv)
        log.info('KL computation complete. Time taken: {:.2f} secs.\n'.format(
            time() - t1))
    elif args.method == 'predpath':  # PREDPATH
        vec, model = predpath_train_model(G, spo_df)  # train
        # vec, model = predpath_train_model(G, spo_df, relsim)
        print('Time taken: {:.2f}s\n'.format(time() - t1))
        # save model
        predictor = {'dictvectorizer': vec, 'model': model}
        try:
            outpkl = join(args.outdir,
                          'out_predpath_{}_{}.pkl'.format(base, DATE))
            with open(outpkl, 'wb') as g:
                pkl.dump(predictor, g, protocol=pkl.HIGHEST_PROTOCOL)
            print('Saved: {}'.format(outpkl))
        except IOError as e:
            raise e
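
A hypothetical invocation of this script (the script name is an assumption; the flags come from the parser above):

# python main.py -m relklinker -d ./datasets/sample.csv -o ./output
# results land in ./output/out_relklinker_sample_<date>.csv and a log file
# named log_relklinker_sample_<date>.log is written under the logs directory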
Example #8
class KnowledgeLinker(object):

    name = 'klinker'

    HOME = abspath(expanduser('./data/'))

    if not exists(HOME):
        print('Data directory not found: %s' % HOME)
        print('Download data per instructions on:')
        print('\thttps://github.com/shiralkarprashant/knowledgestream#data')
        print('and enter the directory path below.')
        data_dir = input('\nPlease enter data directory path: ')
        if data_dir != '':
            data_dir = abspath(expanduser(data_dir))
        if not os.path.isdir(data_dir):
            raise Exception('Entered path "%s" not a directory.' % data_dir)
        if not exists(data_dir):
            raise Exception('Directory does not exist: %s' % data_dir)
        HOME = data_dir
    # raise Exception('Please set HOME to data directory in algorithms/__main__.py')
    PATH = join(HOME, 'kg/_undir/')
    assert exists(PATH)
    SHAPE = (6060993, 6060993, 663)
    WTFN = 'logdegree'

    # relational similarity using TF-IDF representation and cosine similarity
    RELSIMPATH = join(HOME, 'relsim/coo_mat_sym_2016-10-24_log-tf_tfidf.npy')
    assert exists(RELSIMPATH)

    # Date
    DATE = '{}'.format(date.today())

    # data types for int and float
    _short = np.int16
    _int = np.int32
    _int64 = np.int64
    _float = np.float64

    # load knowledge graph
    G = Graph.reconstruct(PATH, SHAPE, sym=True)  # undirected
    assert np.all(G.csr.indices >= 0)

    # relational similarity
    relsim = np.load(RELSIMPATH)

    # ================= KNOWLEDGE LINKER ALGORITHM ============

    def compute_klinker(self, G, sid, pid, oid):
        """
        Parameters:
        -----------
        G: rgraph
            See `datastructures`.
        sid, pid, oid: sequence
            Sequences representing the subject, predicate and object of
            input triples.

        Returns:
        --------
        scores, paths, rpaths, times: sequence
            One sequence each for the proximity scores, shortest path in
            terms of nodes, shortest path in terms of relation sequence,
            and times taken.
        """
        # set weights
        indegsim = weighted_degree(G.indeg_vec, weight=self.WTFN).reshape(
            (1, G.N))
        indegsim = indegsim.ravel()
        targets = G.csr.indices % G.N
        specificity_wt = indegsim[targets]  # specificity
        G.csr.data = specificity_wt.copy()

        # back up
        data = G.csr.data.copy()
        indices = G.csr.indices.copy()
        indptr = G.csr.indptr.copy()

        # compute closure
        scores, paths, rpaths, times = [], [], [], []
        for idx, (s, p, o) in enumerate(zip(sid, pid, oid)):
            print('{}. Working on {}..'.format(idx + 1, (s, p, o)), end=' ')
            ts = time()
            rp = closure(G, s, p, o, kind='metric', linkpred=True)
            tend = time()
            print('time: {:.2f}s'.format(tend - ts))
            times.append(tend - ts)
            scores.append(rp.score)
            paths.append(rp.path)
            rpaths.append(rp.relational_path)

            # reset graph
            G.csr.data = data.copy()
            G.csr.indices = indices.copy()
            G.csr.indptr = indptr.copy()
            sys.stdout.flush()
        log.info('')
        return scores, paths, rpaths, times

    @rpc  # Methods are exposed to the outside world with entrypoint decorators (RPC in our case)
    def stream(self, data):

        print('\nThe following request in RDF format was passed:')
        print(data)

        identification, theDate, suri, puri, ouri = extract.getValues(data)

        print('\nSURI, PURI and OURI are:')
        print(suri)
        print(puri)
        print(ouri)
        print('\n')

        # sid, pid, oid = self.uriToId(suri, puri, ouri)
        sid, pid, oid = mapping.convert(suri, puri, ouri)

        # required for passing it to compute_mincostflow
        sid, pid, oid = np.array([sid]), np.array([pid]), np.array([oid])

        t1 = time()

        print('\nTheir corresponding IDs are:')
        print(sid)
        print(pid)
        print(oid)
        print('\n')

        log.info('Computing KL for triple')
        with warnings.catch_warnings():
            try:
                warnings.simplefilter("ignore")

                # compute klinker
                scores, paths, rpaths, times = self.compute_klinker(
                    self.G, sid, pid, oid)

                normalizedScore = normalization.score(scores[0])

                log.info(
                    'KLinker computation complete. Time taken: {:.2f} secs.\n'.
                    format(time() - t1))
                result = '<http://swc2017.aksw.org/task2/dataset/s-' + str(
                    identification
                ) + '> <http://swc2017.aksw.org/hasTruthValue>\"' + str(
                    normalizedScore
                ) + '\"<http://www.w3.org/2001/XMLSchema#double> .'
                print('The result in RDF format is:')
                print(result)

            except MemoryError:
                print('\nA MemoryError is successfully caught.')
                result = 'MemoryError'

        return result
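
compute_klinker above and compute_mincostflow in Example #9 both hand-roll the same backup/restore of the CSR arrays around each triple. A sketch of factoring that pattern into a context manager, so the graph is restored even if closure() raises (contextlib is standard library; G is a Graph as above):

from contextlib import contextmanager

@contextmanager
def csr_state(G):
    # snapshot the CSR arrays; restore them on exit, even on error
    data = G.csr.data.copy()
    indices = G.csr.indices.copy()
    indptr = G.csr.indptr.copy()
    try:
        yield G
    finally:
        G.csr.data, G.csr.indices, G.csr.indptr = data, indices, indptr

# usage inside the loop:
#     with csr_state(G):
#         rp = closure(G, s, p, o, kind='metric', linkpred=True)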
Example #9
class KnowledgeStream(object):

	name = 'kstream'

	HOME = abspath(expanduser('./data/'))

	if not exists(HOME):
		print('Data directory not found: %s' % HOME)
		print('Download data per instructions on:')
		print('\thttps://github.com/shiralkarprashant/knowledgestream#data')
		print('and enter the directory path below.')
		data_dir = input('\nPlease enter data directory path: ')
		if data_dir != '':
			data_dir = abspath(expanduser(data_dir))
		if not os.path.isdir(data_dir):
			raise Exception('Entered path "%s" not a directory.' % data_dir)
		if not exists(data_dir):
			raise Exception('Directory does not exist: %s' % data_dir)
		HOME = data_dir
	# raise Exception('Please set HOME to data directory in algorithms/__main__.py')
	PATH = join(HOME, 'kg/_undir/')
	assert exists(PATH)
	SHAPE = (6060993, 6060993, 663)
	WTFN = 'logdegree'

	# relational similarity using TF-IDF representation and cosine similarity
	RELSIMPATH = join(HOME, 'relsim/coo_mat_sym_2016-10-24_log-tf_tfidf.npy')
	assert exists(RELSIMPATH)

	# Date
	DATE = '{}'.format(date.today())

	# data types for int and float
	_short = np.int16
	_int = np.int32
	_int64 = np.int64
	_float = np.float64

	# load knowledge graph
	G = Graph.reconstruct(PATH, SHAPE, sym=True)  # undirected
	assert np.all(G.csr.indices >= 0)

	# relational similarity
	relsim = np.load(RELSIMPATH)

	# ================= KNOWLEDGE STREAM ALGORITHM ============

	def compute_mincostflow(self, G, relsim, sid, pid, oid):
		"""
		Parameters:
		-----------
		G: rgraph
			See `datastructures`.
		relsim: ndarray
			A square matrix containing relational similarity scores.
		sid, pid, oid: sequence
			Sequences representing the subject, predicate and object of
			the input triple.

		Returns:
		--------
		mincostflow: float
			Total min-cost flow for the triple.
		times: float
			Time taken to compute the stream of the triple.
		"""
		# take graph backup
		G_bak = {
			'data': G.csr.data.copy(),
			'indices': G.csr.indices.copy(),
			'indptr': G.csr.indptr.copy()
		}
		cost_vec_bak = np.log(G.indeg_vec).copy()

		# some set up
		G.sources = np.repeat(np.arange(G.N), np.diff(G.csr.indptr))
		G.targets = G.csr.indices % G.N
		cost_vec = cost_vec_bak.copy()
		indegsim = weighted_degree(G.indeg_vec, weight=self.WTFN)
		specificity_wt = indegsim[G.targets]  # specificity
		relations = (G.csr.indices - G.targets) // G.N  # integer division: relation id per edge

		s, p, o = [int(x) for x in (sid, pid, oid)]
		ts = time()
		print('{}. Working on {} .. '.format(1, (s, p, o)), end=' ')
		sys.stdout.flush()

		# set weights
		relsimvec = np.array(relsim[p, :])  # specific to predicate p
		relsim_wt = relsimvec[relations]
		G.csr.data = np.multiply(relsim_wt, specificity_wt)

		# compute
		mcflow = succ_shortest_path(
			G, cost_vec, s, p, o, return_flow=False, npaths=5
		)
		mincostflow = mcflow.flow
		tend = time()
		times = tend - ts
		print('mincostflow: {:.5f}, #paths: {}, time: {:.2f}s.'.format(
			mcflow.flow, len(mcflow.stream['paths']), tend - ts))

		# reset state of the graph
		np.copyto(G.csr.data, G_bak['data'])
		np.copyto(G.csr.indices, G_bak['indices'])
		np.copyto(G.csr.indptr, G_bak['indptr'])
		np.copyto(cost_vec, cost_vec_bak)
		return mincostflow, times

	@rpc	# Methods are exposed to the outside world with entrypoint decorators (RPC in our case)
	def stream(self, data):

		print('\nThe following request in RDF format was passed:')
		print(data)

		identification, theDate, suri, puri, ouri = extract.getValues(data)

		print('\nSURI, PURI and OURI are:')
		print(suri)
		print(puri)
		print(ouri)
		print('\n')

		# sid, pid, oid = self.uriToId(suri, puri, ouri)
		sid, pid, oid = mapping.convert(suri, puri, ouri)

		# required for passing it to compute_mincostflow
		sid, pid, oid = np.array([sid]), np.array([pid]), np.array([oid])

		t1 = time()

		print('\nTheir corresponding IDs are:')
		print(sid)
		print(pid)
		print(oid)
		print('\n')

		log.info('Computing KS for triple')
		with warnings.catch_warnings():
			try:
				warnings.simplefilter("ignore")
				# compute min. cost flow
				mincostflows, times = self.compute_mincostflow(self.G, self.relsim, sid, pid, oid)
				# spo_df = self.normalize(spo_df)

				normalizedScore = normalization.score(mincostflows)

				log.info('Mincostflow computation complete. Time taken: {:.2f} secs.\n'.format(time() - t1))
				result = '<http://swc2017.aksw.org/task2/dataset/s-' + str(
					identification) + '> <http://swc2017.aksw.org/hasTruthValue>\"' + str(
					normalizedScore) + '\"<http://www.w3.org/2001/XMLSchema#double> .'
				print('The result in RDF format is:')
				print(result)

			except MemoryError:
				print('\nA MemoryError is successfully caught.')
				result = 'MemoryError'

		return result
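
The G.sources line in compute_mincostflow uses a standard CSR idiom: np.diff(G.csr.indptr) gives the number of stored entries per row, so repeating each row id that many times yields the source node of every edge. A self-contained sketch on a toy matrix:

import numpy as np
from scipy.sparse import csr_matrix

A = csr_matrix(np.array([[0, 1, 1],
                         [0, 0, 1],
                         [1, 0, 0]]))
sources = np.repeat(np.arange(A.shape[0]), np.diff(A.indptr))
print(sources)    # [0 0 1 2] -- row index of each stored entry
print(A.indices)  # [1 2 2 0] -- column index of each stored entry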
Example #10
class DegreeProduct(object):

    name = 'degree_product'

    HOME = abspath(expanduser('./data/'))

    if not exists(HOME):
        print('Data directory not found: %s' % HOME)
        print('Download data per instructions on:')
        print('\thttps://github.com/shiralkarprashant/knowledgestream#data')
        print('and enter the directory path below.')
        data_dir = input('\nPlease enter data directory path: ')
        if data_dir != '':
            data_dir = abspath(expanduser(data_dir))
        if not os.path.isdir(data_dir):
            raise Exception('Entered path "%s" not a directory.' % data_dir)
        if not exists(data_dir):
            raise Exception('Directory does not exist: %s' % data_dir)
        HOME = data_dir
    # raise Exception('Please set HOME to data directory in algorithms/__main__.py')
    PATH = join(HOME, 'kg/_undir/')
    assert exists(PATH)
    SHAPE = (6060993, 6060993, 663)
    WTFN = 'logdegree'

    # relational similarity using TF-IDF representation and cosine similarity
    RELSIMPATH = join(HOME, 'relsim/coo_mat_sym_2016-10-24_log-tf_tfidf.npy')
    assert exists(RELSIMPATH)

    # Date
    DATE = '{}'.format(date.today())

    # data types for int and float
    _short = np.int16
    _int = np.int32
    _int64 = np.int64
    _float = np.float64

    # load knowledge graph
    G = Graph.reconstruct(PATH, SHAPE, sym=True)  # undirected
    assert np.all(G.csr.indices >= 0)

    # relational similarity
    relsim = np.load(RELSIMPATH)

    # ================= DEGREE PRODUCT ALGORITHM ============

    def compute_degree_product(self, G, subs, preds, objs):
        """
        Performs link prediction via preferential attachment (the product
        of the endpoint degrees).

        Parameters:
        -----------
        G: rgraph
            See `datastructures`.
        subs, preds, objs: sequence
            Sequences representing the subject, predicate and object of
            input triples.

        Returns:
        --------
        scores, times: sequence
            One sequence each for the proximity scores and times taken.
        """
        measure_map = {
            'degree_product': {
                'measure': preferential_attachment,
                'tag': 'PA'
            }
        }

        selected_measure = 'degree_product'

        # back up
        data = G.csr.data.copy()
        indices = G.csr.indices.copy()
        indptr = G.csr.indptr.copy()

        # compute closure
        measure_name = measure_map[selected_measure]['tag']
        measure = measure_map[selected_measure]['measure']
        log.info('Computing {} for {} triples..'.format(
            measure_name, len(subs)))
        t1 = time()
        scores, times = [], []
        for idx, (s, p, o) in enumerate(zip(subs, preds, objs)):
            print('{}. Working on {}..'.format(idx + 1, (s, p, o)), end=' ')
            sys.stdout.flush()
            ts = time()
            score = measure(G, s, p, o, linkpred=True)
            tend = time()
            print('score: {:.5f}, time: {:.2f}s'.format(score, tend - ts))
            times.append(tend - ts)
            scores.append(score)

            # reset graph
            G.csr.data = data.copy()
            G.csr.indices = indices.copy()
            G.csr.indptr = indptr.copy()
            sys.stdout.flush()
        print('')
        return scores, times

    @rpc  # Methods are exposed to the outside world with entrypoint decorators (RPC in our case)
    def stream(self, data):

        print('\nThe following request in RDF format was passed:')
        print(data)

        identification, theDate, suri, puri, ouri = extract.getValues(data)

        print('\nSURI, PURI and OURI are:')
        print(suri)
        print(puri)
        print(ouri)
        print('\n')

        # sid, pid, oid = self.uriToId(suri, puri, ouri)
        sid, pid, oid = mapping.convert(suri, puri, ouri)

        # required for passing it to compute_degree_product
        sid, pid, oid = np.array([sid]), np.array([pid]), np.array([oid])

        t1 = time()

        print('\nTheir corresponding IDs are:')
        print(sid)
        print(pid)
        print(oid)
        print('\n')

        log.info('Computing PA for triple')
        with warnings.catch_warnings():
            try:
                warnings.simplefilter("ignore")
                # compute degree_product
                scores, times = self.compute_degree_product(
                    self.G, sid, pid, oid)

                normalizedScore = normalization.score(scores[0])

                log.info(
                    'DegreeProduct computation complete. Time taken: {:.2f} secs.\n'
                    .format(time() - t1))
                result = '<http://swc2017.aksw.org/task2/dataset/s-' + str(
                    identification
                ) + '> <http://swc2017.aksw.org/hasTruthValue>\"' + str(
                    normalizedScore
                ) + '\"<http://www.w3.org/2001/XMLSchema#double> .'
                print('The result in RDF format is:')
                print(result)

            except MemoryError:
                print('\nA MemoryError is successfully caught.')
                result = 'MemoryError'

        return result
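
For intuition: preferential attachment scores a candidate edge by the product of the endpoint degrees, ranking high-degree pairs as more likely links. A toy sketch of that score, assuming this is what preferential_attachment computes here (the linkpred machinery is omitted):

def pa_score(indeg_vec, s, o):
    # classic preferential-attachment proximity: deg(s) * deg(o)
    return float(indeg_vec[s]) * float(indeg_vec[o])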
Example #11
class Predpath(object):

    name = 'predpath'

    HOME = abspath(expanduser('./data/'))

    if not exists(HOME):
        print('Data directory not found: %s' % HOME)
        print('Download data per instructions on:')
        print('\thttps://github.com/shiralkarprashant/knowledgestream#data')
        print('and enter the directory path below.')
        data_dir = input('\nPlease enter data directory path: ')
        if data_dir != '':
            data_dir = abspath(expanduser(data_dir))
        if not os.path.isdir(data_dir):
            raise Exception('Entered path "%s" not a directory.' % data_dir)
        if not exists(data_dir):
            raise Exception('Directory does not exist: %s' % data_dir)
        HOME = data_dir
    # raise Exception('Please set HOME to data directory in algorithms/__main__.py')
    PATH = join(HOME, 'kg/_undir/')
    assert exists(PATH)
    SHAPE = (6060993, 6060993, 663)
    WTFN = 'logdegree'

    # relational similarity using TF-IDF representation and cosine similarity
    RELSIMPATH = join(HOME, 'relsim/coo_mat_sym_2016-10-24_log-tf_tfidf.npy')
    assert exists(RELSIMPATH)

    # Date
    DATE = '{}'.format(date.today())

    # data types for int and float
    _short = np.int16
    _int = np.int32
    _int64 = np.int64
    _float = np.float64

    # load knowledge graph
    G = Graph.reconstruct(PATH, SHAPE, sym=True)  # undirected
    assert np.all(G.csr.indices >= 0)

    # relational similarity
    relsim = np.load(RELSIMPATH)

    #__________________train_______________________

    # ensure the input file and output directory are valid.
    outdir = abspath(expanduser('./output'))
    assert exists(outdir)

    # sample data file of records used to train the model.
    datafile = abspath(expanduser('./datasets/sample_data_pra.csv'))
    assert exists(datafile)
    log.info('Dataset: {}'.format(basename(datafile)))


    # read data
    df = pd.read_table(datafile, sep=',', header=0)
    log.info('Read data: {} {}'.format(df.shape, basename(datafile)))
    spo_df = df.dropna(axis=0, subset=['sid', 'pid', 'oid'])
    log.info('Note: Found non-NA records: {}'.format(spo_df.shape))

    # execute
    base = splitext(basename(datafile))[0]
    t1 = time()
    log.info('Computing pra for {} triples..'.format(spo_df.shape[0]))

    # train the model
    features, model = pra_train_model(G, spo_df)  # train
    print('Time taken: {:.2f}s\n'.format(time() - t1))

    # save model
    predictor = {'dictvectorizer': features, 'model': model}
    try:
        outpkl = join(outdir, 'trained_pra_model.pkl')
        with open(outpkl, 'wb') as g:
            pkl.dump(model, g, protocol=pkl.HIGHEST_PROTOCOL)

        outpkl_features = join(outdir, 'pra_features_file.pkl')
        with open(outpkl_features, 'wb') as g:
            pkl.dump(features, g, protocol=pkl.HIGHEST_PROTOCOL)

    except IOError as e:
        raise e
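
The two pickles written above are what Example #6 reads back at prediction time; a sketch of the load side (paths as written above):

import pickle as pkl

with open('./output/trained_pra_model.pkl', 'rb') as f:
    model = pkl.load(f)
with open('./output/pra_features_file.pkl', 'rb') as f:
    features = pkl.load(f)
# pass (features, model) to pra_predict as in Example #6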