Пример #1
0
 def __init__(self, datafile, modelfile, usePR=False):
     """Load the triples, the trained model and the derived statistics.

     Args:
         datafile: Handed to ``Triples`` to load the dataset.
         modelfile: Handed to ``Model.loadModel`` to load the trained model.
         usePR: Forwarded unchanged to ``Stats`` as its second argument.
     """
     self.datafile, self.modelfile = datafile, modelfile

     triples = Triples(datafile)
     embeddings = Model()
     embeddings.loadModel(modelfile)

     self.t = triples
     self.model = embeddings
     self.stats = Stats(triples, usePR)

     # Column-wise means of the model's E and R matrices.
     self.meanE = embeddings.E.mean(axis=0)
     self.meanR = embeddings.R.mean(axis=0)
Пример #2
0
def add_sup_triples(triples1, triples2, sup_ent1, sup_ent2):
    """Extend both triple collections with triples generated from the supervision entities.

    The extra triples come from ``generate_sup_triples``; each returned
    ``Triples`` object holds the union of the old and the generated triples and
    records the previous triple set through ``ori_triples``.

    Returns:
        The pair ``(triples1, triples2)`` of rebuilt ``Triples`` objects.
    """
    extra1, extra2 = generate_sup_triples(triples1, triples2,
                                          sup_ent1, sup_ent2)

    original1 = triples1.triples
    merged1 = Triples(original1 | extra1, ori_triples=triples1.triples)

    original2 = triples2.triples
    merged2 = Triples(original2 | extra2, ori_triples=triples2.triples)

    size1 = len(merged1.triples)
    size2 = len(merged2.triples)
    print("now triples: {}, {}".format(size1, size2))
    return merged1, merged2
Пример #3
0
def read_input(folder):
    """Read the two triple files and the alignment files stored under ``folder``.

    ``folder`` is used as a plain string prefix (file names are appended with
    ``+``), so it has to end with a path separator.

    Returns:
        A 9-tuple ``(triples1, triples2, sup_ent1, sup_ent2, ref_ent1,
        ref_ent2, total_triples_num, total_ent_num, total_rel_num)``.

    Raises:
        AssertionError: If the two reference entity lists differ in length.
    """
    raw_sets = [read_triples(folder + name)
                for name in ('triples_1', 'triples_2')]
    triples1, triples2 = [Triples(raw) for raw in raw_sets]

    # Sizes of the merged entity / relation vocabularies and the triple total.
    total_ent_num = len(triples1.ents | triples2.ents)
    total_rel_num = len(triples1.props | triples2.props)
    total_triples_num = len(triples1.triple_list) + len(triples2.triple_list)

    print('total ents:', total_ent_num)
    print('total rels:', len(triples1.props), len(triples2.props),
          total_rel_num)
    triple_counts = (len(triples1.triples), len(triples2.triples),
                     total_triples_num)
    print('total triples: %d + %d = %d' % triple_counts)

    references = read_references(folder + 'ref_ent_ids')
    ref_ent1, ref_ent2 = references
    assert len(ref_ent1) == len(ref_ent2)
    print("To aligned entities:", len(ref_ent1))

    supervision = read_references(folder + 'sup_ent_ids')
    sup_ent1, sup_ent2 = supervision

    return (triples1, triples2, sup_ent1, sup_ent2, ref_ent1, ref_ent2,
            total_triples_num, total_ent_num, total_rel_num)
Пример #4
0
def read_input(folder):
    """Read the Twitter / Foursquare triples and split the ground-truth alignment.

    ``folder`` is used as a plain string prefix (file names are appended with
    ``+``). The first ``int(len(truth) * P.train_ratio)`` ground-truth pairs
    become the supervision pairs, the remaining ones the "real" pairs; the
    reference entities are obtained from ``get_ref_ent``.

    Returns:
        An 11-tuple ``(triples1, triples2, sup_ent1, sup_ent2, ref_ent1,
        ref_ent2, total_triples_num, total_ent_num, total_rel_num, real_ent1,
        real_ent2)``.
    """
    raw_sets = [read_triples(folder + name)
                for name in ('twitter_triples', 'foursquare_triples')]
    triples1, triples2 = [Triples(raw) for raw in raw_sets]

    # Sizes of the merged entity / relation vocabularies and the triple total.
    total_ent_num = len(triples1.ents | triples2.ents)
    total_rel_num = len(triples1.props | triples2.props)
    total_triples_num = len(triples1.triple_list) + len(triples2.triple_list)

    print('total ents:', total_ent_num)
    print('total rels:', len(triples1.props), len(triples2.props),
          total_rel_num)
    triple_counts = (len(triples1.triples), len(triples2.triples),
                     total_triples_num)
    print('total triples: %d + %d = %d' % triple_counts)

    all_truth_1, all_truth_2 = read_references(folder + 'truth_ents_ids')

    # Leading slice is used for supervision, trailing slice is held out.
    train_num = int(len(all_truth_1) * P.train_ratio)
    real_ent1 = all_truth_1[train_num:]
    real_ent2 = all_truth_2[train_num:]
    sup_ent1 = all_truth_1[:train_num]
    sup_ent2 = all_truth_2[:train_num]

    ref_ent1, ref_ent2 = get_ref_ent(sup_ent1, sup_ent2,
                                     triples1.ent_list, triples2.ent_list)
    print("To aligned entities:", len(ref_ent1), len(ref_ent2))

    return (triples1, triples2, sup_ent1, sup_ent2, ref_ent1, ref_ent2,
            total_triples_num, total_ent_num, total_rel_num,
            real_ent1, real_ent2)
Пример #5
0
def main():
    """Group relations by rank band and report their names and categories.

    For every rank band in ``rRanges`` the relation indices are obtained from
    ``Stats.getRels``. For each band the relation names are collected, and the
    second ``/``-separated component of each name is counted as its category.

    When ``args.outfile`` is set, two text files are written:
    ``<outfile>.rel.txt`` (band header followed by one relation name per line)
    and ``<outfile>.cat.txt`` (band header followed by tab-separated
    ``category:count`` pairs).

    The function always ends in a ``pdb`` session so that ``rels`` and ``cats``
    can be inspected interactively.
    """
    parser = getParser()
    try:
        args = parser.parse_args()
    except SystemExit as exc:
        # Assumes an argparse-style parser, which reports both usage errors
        # and -h/--help through SystemExit. A clean exit (help already
        # printed) is passed through instead of printing the help twice.
        if not exc.code:
            raise
        parser.print_help()
        sys.exit(1)

    t = Triples(args.datafile)
    stats = Stats(t)
    # Each entry is ((first, last) rank band, second argument for getRels).
    rRanges = [((0, 50), 50), ((50, 100), 50), ((100, 200), 100),
               ((200, 500), 300), ((500, t.nr), t.nr - 500)]
    idxSets = [stats.getRels(rankBand, ns) for rankBand, ns in rRanges]

    rels = []
    cats = []
    for idxSet in idxSets:
        cur_rels = [t.rNames[idx] for idx in idxSet]
        cur_cats = {}
        for rel in cur_rels:
            # Category = second "/"-separated component of the relation name.
            cat = rel.split("/")[1]
            cur_cats[cat] = cur_cats.get(cat, 0) + 1
        rels.append(cur_rels)
        cats.append(cur_cats)

    if args.outfile:
        with open(args.outfile + '.rel.txt', "w") as fout:
            for idx, cur_rels in enumerate(rels):
                fout.write("%s\n" % str(rRanges[idx][0]))
                for rel in cur_rels:
                    fout.write("%s\n" % rel)
        with open(args.outfile + '.cat.txt', "w") as fout:
            for idx, cur_cats in enumerate(cats):
                fout.write("%s\n" % str(rRanges[idx][0]))
                # .items() works on both Python 2 and 3 (iteritems is 2-only).
                for cat, count in cur_cats.items():
                    fout.write("%s:%d\t" % (cat, count))
                fout.write("\n")

    # Kept from the original: without an outfile this is the only way to look
    # at the collected results.
    import pdb
    pdb.set_trace()
Пример #6
0
class Analyser:
    """Analyse a trained embedding model against rank-band statistics of its data.

    The instance keeps the loaded ``Triples`` (``self.t``), the loaded
    ``Model`` (``self.model``), a ``Stats`` object (``self.stats``) and the
    column-wise means of the model's ``E`` and ``R`` matrices.

    The code is written to run unchanged on Python 2 and Python 3.
    """

    def __init__(self, datafile, modelfile, usePR=False):
        """Load data, model and statistics.

        Args:
            datafile: Handed to ``Triples``.
            modelfile: Handed to ``Model.loadModel``; its base name is also
                used to derive the names of all output files.
            usePR: Forwarded unchanged to ``Stats`` as its second argument.
        """
        self.datafile = datafile
        self.modelfile = modelfile
        self.t = Triples(datafile)
        self.model = Model()
        self.model.loadModel(modelfile)
        self.stats = Stats(self.t, usePR)
        self.meanE = self.model.E.mean(axis=0)
        self.meanR = self.model.R.mean(axis=0)

    def _modelNameParts(self):
        """Return the model file's base name split on "." without its last part."""
        return os.path.split(self.modelfile)[1].split(".")[:-1]

    def _outputPath(self, opdir, *suffixes):
        """Return ``opdir/<model name parts + suffixes joined by ".">``."""
        return os.path.join(
            opdir, ".".join(self._modelNameParts() + list(suffixes)))

    def getEntIdxs(self, ranges):
        """Return ``self.stats.getEnts(rankBand, ns)`` for every ``(rankBand, ns)`` pair."""
        return [self.stats.getEnts(rankBand, ns) for rankBand, ns in ranges]

    def getRelIdxs(self, ranges):
        """Return ``self.stats.getRels(rankBand, ns)`` for every ``(rankBand, ns)`` pair."""
        return [self.stats.getRels(rankBand, ns) for rankBand, ns in ranges]

    def entPerf(self, opdir):
        """Compute per-rank-band entity performance on the test triples.

        The test ranks stored in ``self.model.fpos_test`` are attached to the
        head and tail entity of every test triple, grouped by the entity rank
        bands in ``eRanges`` and summarised with ``getPerfFromRanks``.

        Two files are written to ``opdir``, named after the model file:
        ``*.ent_perf.p`` (pickle with keys ``"ent_perf"`` and ``"all_perf"``)
        and ``*.ent_perf.txt`` (tab-separated table of the ``"all"`` figures).

        Args:
            opdir: Directory receiving the output files.
        """
        eRanges = [(0, 100), (100, 500), (500, 5000), (5000, self.t.ne)]
        # The second argument to getEnts is the full width of the band.
        entIndices = [
            self.stats.getEnts(rankband, rankband[1] - rankband[0])
            for rankband in eRanges
        ]

        rel_triples = self.t.groupByRelation("test")
        # ranks[entity]['head' | 'tail'] -> list of (head rank, tail rank)
        # pairs for the test triples in which the entity has that role.
        ranks = {}
        for rel, val in self.model.fpos_test.items():
            for idx, (h, t) in enumerate(rel_triples[rel]):
                pair = (val['head'][idx], val['tail'][idx])
                ranks.setdefault(h, {}).setdefault('head', []).append(pair)
                ranks.setdefault(t, {}).setdefault('tail', []).append(pair)

        ent_perf = {}
        all_ranks = []
        for rangeIdx, idxSet in enumerate(entIndices):
            cur_head_ranks = []
            cur_tail_ranks = []
            for idx in idxSet:
                entity_ranks = ranks.get(idx, {})
                cur_head_ranks.extend(entity_ranks.get('head', []))
                cur_tail_ranks.extend(entity_ranks.get('tail', []))
            cur_all_ranks = cur_head_ranks + cur_tail_ranks
            all_ranks.extend(cur_all_ranks)
            ent_perf[eRanges[rangeIdx]] = {
                "head":
                getPerfFromRanks(np.array(cur_head_ranks, dtype=np.int32)),
                "tail":
                getPerfFromRanks(np.array(cur_tail_ranks, dtype=np.int32)),
                "all":
                getPerfFromRanks(np.array(cur_all_ranks, dtype=np.int32)),
            }
        all_perf = getPerfFromRanks(np.array(all_ranks, dtype=np.int32))

        with open(self._outputPath(opdir, "ent_perf", "p"), "wb") as fout:
            pickle.dump({"ent_perf": ent_perf, "all_perf": all_perf}, fout)
        with open(self._outputPath(opdir, "ent_perf", "txt"), "w") as fout:
            fout.write("Range\t\tMR\tMRR\tHits@1\tHits@3\tHits@10\tHits@100\n")
            for a in eRanges:
                perf = ent_perf[a]['all']
                # Element [1] of every metric is reported (the rank pairs are
                # (head, tail), so this is presumably the tail figure).
                line = "%10s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n" % (
                    str(a), perf['MR'][1], perf['MRR'][1], perf['Hits@1'][1],
                    perf['Hits@3'][1], perf['Hits@10'][1], perf['Hits@100'][1])
                fout.write(line)

    def relPerf(self, opdir):
        """Compute per-rank-band relation performance from the tail ranks.

        Relations are grouped in consecutive rank bands of width 4; for each
        band the ``'tail'`` ranks in ``self.model.fpos_test`` are summarised
        with ``getPerfFromRanks``.

        Two files are written to ``opdir``, named after the model file:
        ``*.rel_perf.p`` (pickle of the per-band results) and
        ``*.rel_perf.txt`` (tab-separated table).

        Args:
            opdir: Directory receiving the output files.
        """
        interval = 4
        # NOTE(review): the last band may extend past self.t.nr, and the
        # "nr - 1" stop can leave the final relation out; kept as in the
        # original - confirm against Stats.getRels.
        rRanges = [((i, i + interval), interval)
                   for i in range(0, self.t.nr - 1, interval)]
        # The original fetched these index sets twice and discarded one copy.
        idxSets = self.getRelIdxs(rRanges)

        rel_perf = {}
        all_ranks = self.model.fpos_test
        for rangeIdx, idxSet in enumerate(idxSets):
            cur_ranks = []
            for idx in idxSet:
                cur_ranks.extend(all_ranks.get(idx, {}).get('tail', []))
            rel_perf[rRanges[rangeIdx][0]] = getPerfFromRanks(
                np.array(cur_ranks, dtype=np.int32))

        with open(self._outputPath(opdir, "rel_perf", "p"), "wb") as fout:
            pickle.dump(rel_perf, fout)
        with open(self._outputPath(opdir, "rel_perf", "txt"), "w") as fout:
            fout.write("Range\t\tMR\tMRR\tHits@1\tHits@3\tHits@10\tHits@100\n")
            for band, _ in rRanges:
                perf = rel_perf[band]
                line = "%10s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n" % (
                    str(band), perf['MR'], perf['MRR'], perf['Hits@1'],
                    perf['Hits@3'], perf['Hits@10'], perf['Hits@100'])
                fout.write(line)

    def run(self, vectorType, sampleMean, isnormalized, outputdir, showplot):
        """Plot the density of dot products between sampled vectors and a mean vector.

        One curve is drawn per rank band. The figure is saved as
        ``<outputdir>/<model name>[.rel].png`` and a summary dict (keys
        ``"model"``, ``"dim"``, ``"neg"``, ``"dots"``) is pickled next to it
        with the ``.p`` extension. The model name, dimension and negative
        count are parsed from the "."-separated model file name (parts 1, 3
        and 2; the latter two without their first character).

        Args:
            vectorType: ``"ent"`` analyses entity vectors, anything else the
                relation vectors; ``"rel"`` additionally appends ``.rel`` to
                the output name.
            sampleMean: Forwarded to ``getInnerProducts`` for entities only.
            isnormalized: Forwarded to ``getInnerProducts`` as ``normalized``.
            outputdir: Directory receiving the output files.
            showplot: When true, print the output path and show the figure.
        """
        outputfile = self._outputPath(outputdir)
        if os.path.exists(outputfile):
            # Only a warning: the early return is disabled in the original.
            print("File already exists. Exitting...")
            print(outputfile)

        # Finalize the sets to be analysed. Both index lists are always
        # requested, exactly as in the original.
        nSamples = 100
        eRanges = [((0, 100), nSamples), ((100, 500), nSamples),
                   ((500, 5000), nSamples), ((5000, -1), nSamples)]
        entIndices = self.getEntIdxs(eRanges)
        rRanges = [((0, 100), nSamples), ((100, 500), nSamples),
                   ((500, -1), nSamples)]
        relIndices = self.getRelIdxs(rRanges)

        useEntities = vectorType in ["ent"]
        # Label every curve with the band it was really sampled from (the
        # original always used the entity bands, even for relations).
        plottedRanges = eRanges if useEntities else rRanges
        legendLabels = ["%d-%d" % (band[0], band[1])
                        for band, _ in plottedRanges]
        markers = "+.x3ov^<>p"

        if useEntities:
            gp, lp = self.getInnerProducts(entIndices,
                                           sampleMean=sampleMean,
                                           normalized=isnormalized)
        else:
            gp, lp = self.getInnerProducts(relIndices,
                                           ent=False,
                                           normalized=isnormalized)

        params = self._modelNameParts()
        products = " ".join(["%.4f" % lpp for lpp in lp])
        modelTag = params[1]
        dim = int(params[3][1:])
        neg = int(params[2][1:])
        outstr = "%s %d %d %s" % (modelTag, dim, neg, products)
        print(outstr)

        plt.figure(3)
        plt.title(self.model.modelName)
        plt.xlim(-1.0, 1.0)
        if "trans" in self.model.modelName.lower():
            maxy = 3.0
        else:
            maxy = 8.0  # relations
        plt.ylim(0, maxy)
        plt.yticks(np.arange(maxy))
        for i, gpi in enumerate(gp):
            density = scistats.gaussian_kde(gpi)
            # Floor division keeps the bin count an int on Python 3 as well.
            _, binEdges = np.histogram(gpi, nSamples // 2)
            # NOTE(review): `colors` is not defined in this method; it has to
            # be provided at module level - confirm.
            plt.plot(binEdges,
                     density(binEdges),
                     c=colors[i],
                     label=legendLabels[i],
                     marker=markers[i])

        if vectorType in ['rel']:
            outputfile += ".rel"
        fig = plt.gcf()
        fig.set_size_inches(16, 10)
        plt.savefig(outputfile + ".png", dpi=72)
        summary = {
            "model": modelTag,
            "dim": dim,
            "neg": neg,
            "dots": products
        }
        # Context manager so the pickle file is flushed and closed.
        with open(outputfile + ".p", "wb") as fout:
            pickle.dump(summary, fout)
        if showplot:
            print(outputfile)
            plt.show()

    def runTSNE(self, indices, ent=True):
        """Scatter-plot a joint 2-D t-SNE embedding of the selected vectors.

        All index groups are embedded together with ``TSNE`` and each group is
        drawn in its own colour on the current matplotlib figure.

        Args:
            indices: Sequence of index groups (rows of ``E`` or ``R``).
            ent: Use the entity matrix ``E`` when true, otherwise ``R``.
        """
        vectors = self.model.E if ent else self.model.R
        colors = ['r', 'g', 'b', 'c']

        allIndices = []
        for idxs in indices:
            allIndices.extend(idxs)

        embedded = TSNE(n_components=2).fit_transform(vectors[allIndices, :])
        # Rows of `embedded` follow the concatenation order, so a running
        # offset finds each group even when the groups differ in size.
        offset = 0
        for iteration, idxs in enumerate(indices):
            nSamples = len(idxs)
            plt.scatter(embedded[offset:offset + nSamples, 0],
                        embedded[offset:offset + nSamples, 1],
                        c=colors[iteration % len(colors)],
                        marker="o")
            offset += nSamples

    def getInnerProducts(self,
                         indices,
                         sampleMean=False,
                         ent=True,
                         normalized=False):
        """Dot every selected vector with a mean vector.

        Args:
            indices: Sequence of index groups (rows of ``E`` or ``R``).
            sampleMean: When true, replace the stored global mean by the mean
                of the per-group mean vectors.
            ent: Use ``E`` / ``meanE`` when true, otherwise ``R`` / ``meanR``.
            normalized: When true, pass the vectors through ``normalize`` and
                scale the mean vector to unit L2 norm before the dot product.

        Returns:
            ``(globalProducts, meanDotProducts)``: the first holds one array
            of dot products per group; the second holds the mean dot product
            of every group followed by the mean of those group means.
        """
        if ent:
            vectors = self.model.E
            mean = self.meanE
        else:
            vectors = self.model.R
            mean = self.meanR

        if sampleMean:
            means = [vectors[index, :].mean(axis=0) for index in indices]
            mean = np.mean(means, axis=0)

        if normalized:
            vectors = normalize(vectors)
            mean = mean / np.linalg.norm(mean)

        globalProducts = []
        meanDotProducts = []
        for index in indices:
            x = np.dot(vectors[index, :], mean)
            globalProducts.append(x)
            meanDotProducts.append(x.mean())

        # Last entry: average over the per-group means.
        meanDotProducts.append(np.mean(meanDotProducts))
        return globalProducts, meanDotProducts

    def getLengths(self, indices, ent=True):
        """Compute the L2 norm of every selected vector.

        Args:
            indices: Sequence of index groups (rows of ``E`` or ``R``).
            ent: Use the entity matrix ``E`` when true, otherwise ``R``.

        Returns:
            ``(vectorLengths, meanVectorLengths)``: the first holds one array
            of norms per group; the second holds the mean norm of every group
            followed by the mean of those group means.
        """
        vectors = self.model.E if ent else self.model.R

        vectorLengths = []
        meanVectorLengths = []
        for index in indices:
            x = np.linalg.norm(vectors[index, :], axis=1, ord=2)
            vectorLengths.append(x)
            meanVectorLengths.append(x.mean())

        # Last entry: average over the per-group means.
        meanVectorLengths.append(np.mean(meanVectorLengths))
        return vectorLengths, meanVectorLengths

    def runPCA(self, entIndices, ent=True):
        """Scatter-plot a 2-component PCA projection of every index group.

        The PCA is fitted separately on each group, and each group is drawn in
        its own colour on the current matplotlib figure.

        Args:
            entIndices: Sequence of index groups (rows of ``E`` or ``R``).
            ent: Use the entity matrix ``E`` when true, otherwise ``R``.
        """
        nComponents = 2
        colors = ['r', 'g', 'b', 'c']
        pca = PCA(n_components=nComponents)
        vectors = self.model.E if ent else self.model.R
        for iteration, idxs in enumerate(entIndices):
            projected = pca.fit_transform(vectors[idxs, :])
            plt.scatter(projected[:, 0],
                        projected[:, 1],
                        c=colors[iteration % len(colors)],
                        marker="v")