# Example 1
    def test_inspect_adjacency(self):
        """Build a tiny text graph and expand its sparse edge list into a dense
        adjacency-weight matrix for manual inspection.

        This "test" is best viewed in a debugger: put a breakpoint at
        ``print(df)`` and look at the resulting adjacency-weight matrix.
        You will see that it makes sense.
        """
        t2g = Text2GraphTransformer(min_df=1, window_size=3, rm_stopwords=True)

        # Four short documents to build the graph from.
        X = [
            "Time is an illusion. Lunchtime doubly so.",
            "The ships hung in the sky in much the same way that bricks don't.",
            "If there's anything more important than my ego around, I want it caught and shot now.",
            "Would it save you a lot of time if I just gave up and went mad now?"
        ]

        # NOTE(review): y has only 3 labels for 4 documents — presumably
        # tolerated by fit_transform, but verify against its implementation.
        res = t2g.fit_transform(X, y=[1, 0, 1], test_idx=[2])

        # Expand the sparse (edge_index, edge_attr) representation into a
        # dense adjacency matrix; the extra 4 rows/cols are document nodes.
        A = np.zeros((t2g.n_vocabs_ + 4, t2g.n_vocabs_ + 4))

        for (e1, e2), weight in zip(res.edge_index.T, res.edge_attr):
            A[e1, e2] = weight

        # Invert the vocabulary (word -> id) so rows/cols can be labelled,
        # then name the trailing document nodes explicitly.
        inv_map = {v: k for k, v in t2g.vocabulary.items()}
        inv_map[t2g.n_vocabs_] = "Document 1"
        inv_map[t2g.n_vocabs_ + 1] = "Document 2"
        inv_map[t2g.n_vocabs_ + 2] = "Document 3"
        inv_map[t2g.n_vocabs_ + 3] = "Document 4"
        names = [inv_map[i] for i in range(t2g.n_vocabs_ + 4)]
        df = pd.DataFrame(data=A, columns=names, index=names)
        print(df)
# Example 2
# Randomly hold out a fraction of the training documents for validation.
val_idx = np.random.choice(len(x), int(train_val_split * len(x)), replace=False)
# FIX: the original comprehension reused `x` as its loop variable, shadowing
# the outer document list. Use `i`, and test membership against a set
# (O(1) per lookup) instead of the ndarray (O(n) per lookup).
val_idx_set = set(val_idx)
train_idx = np.array([i for i in range(len(x)) if i not in val_idx_set])

x_test = test['Text'].tolist()
y_test = test[labels].tolist()

# Test documents are appended after the training ones, so their node indices
# start at len(x).
test_idx = np.arange(len(x), len(x) + len(x_test))

# Combine training & test data set
x = x + x_test
y = y + y_test

# Encode string labels as integer class ids.
y = LabelEncoder().fit_transform(y)
print("Data loaded!")

t2g = Text2GraphTransformer(n_jobs=8, min_df=5, save_path=None, verbose=1, max_df=max_df, window_size=window_size)
# t2g = Text2GraphTransformer(n_jobs=8, min_df=1, save_path=save_path, verbose=1, max_df=1.0)
ls = os.listdir("textgcn/graphs")
# NOTE(review): the load-from-disk branch is forced off (`if True:` overrides
# the commented `if not ls:` check), so the graph is always rebuilt.
# if not ls:
if True:
    g = t2g.fit_transform(x, y, test_idx=test_idx, val_idx=val_idx)
    print("Graph built!")
else:
    g = t2g.load_graph(os.path.join(save_path, ls[0]))
    print(f"Graph loaded from {os.path.join(save_path, ls[0])}!")
    print(f"n_classes={len(np.unique(g.y))}")

# gcn = JumpingKnowledgeNetwork(g.x.shape[1], len(np.unique(y)), n_hidden_gcn=100, dropout=dropout, activation=th.nn.SELU)
# gcn = EGCN(g.x.shape[1], len(np.unique(y)), n_hidden_gcn=100, embedding_dim=2000, dropout=dropout)
gcn = model(g.x.shape[1], len(np.unique(y)), n_hidden_gcn=100, dropout=dropout)
# Example 3
# Append the test labels so train and test share one label encoding.
y += y_test

# Encode string labels as integer class ids.
y = LabelEncoder().fit_transform(y)
print("Data loaded!")

################################################  Text to Graph ################################################

# Hyper-parameter sweep: k-fold splitter plus a timestamped CSV for results.
kf = KFold(n_splits=k_split, shuffle=True)
frameIterator = 0
timestamp = datetime.now().strftime("%d_%b_%y_%H_%M_%S")
csv_name = "DBPEDIA_Flat_HypOpt_" + lable_category + "_" + timestamp + ".csv"

# Grid search over max_df values; a fresh graph is built for each one.
for mdf in dfs:
    t2g = Text2GraphTransformer(n_jobs=1,
                                min_df=50,
                                save_path=None,
                                verbose=1,
                                max_df=mdf)

    g = t2g.fit_transform(x, y, test_idx=test_idx, val_idx=val_idx)
    print("Graph built!")

    # Mask for doc nodes
    # (document nodes are those in either the train or the test mask)
    mask = th.logical_or(g.train_mask, g.test_mask)

    indizes = th.nonzero(mask)
    # Inner grid over dropout, learning rate and model class.
    for dropout in dos:
        for lr in lrs:
            for model in models:
                model_name = "GCN" if model == GCN else "EGCN"
                # NOTE(review): the body of this `try` is truncated in this
                # excerpt; it continues beyond the visible lines.
                try:
# Example 4
# Encode both label hierarchies (top-1 and top-2 categories) as integer ids.
y_top1 = LabelEncoder().fit_transform(y_top1)
y_top2 = LabelEncoder().fit_transform(y_top2)

# Free the raw splits we no longer need before building the (large) graph.
del x_val
del x_test
del y_top1_val
del y_top2_val
del y_top1_test
del y_top2_test

print("Data loaded!")

t2g = Text2GraphTransformer(n_jobs=1,
                            min_df=100,
                            save_path=save_path,
                            verbose=1,
                            max_df=max_df,
                            max_length=MAX_LENGTH,
                            window_size=window_size)

# Build the graph for the top-level (coarse) labels; no extra hierarchy
# features are injected here.
g1 = t2g.fit_transform(x,
                       y_top1,
                       test_idx=test_idx,
                       val_idx=val_idx,
                       hierarchy_feats=None)

print("Graph built!")

# NOTE(review): this call is truncated in this excerpt; the argument list
# continues beyond the visible lines.
gcn1 = model(g1.x.shape[1],
             len(np.unique(y_top1)),
             n_hidden_gcn=n_hidden,
# Example 5
# Hierarchical setup: 'Cat2' holds the fine-grained labels, 'Cat1' the
# top-level labels.
x_test = test['Text'].tolist()
y_test = test['Cat2'].tolist()
y_test_top = test['Cat1'].tolist()

# Test documents are appended after the training ones, so their node indices
# start at len(x).
test_idx = np.arange(len(x), len(x) + len(x_test))

y = y + y_test
y_top = y_top + y_test_top
x = x + x_test

# Encode both label levels as integer class ids.
y = LabelEncoder().fit_transform(y)
y_top = LabelEncoder().fit_transform(y_top)
print("Data loaded!")

t2g = Text2GraphTransformer(n_jobs=8, min_df=5, save_path=save_path, verbose=1, max_df=0.6)

# Stage 1: predict the top-level category.
g1 = t2g.fit_transform(x, y_top, test_idx=test_idx, val_idx=val_idx, hierarchy_feats=None)
g1 = g1.to(device)
gcn1 = gcn1.to(device)

with th.no_grad():
    pred_test = np.argmax(gcn1(g1)[g1.test_mask].cpu().detach().numpy(), axis=1)

gcn1 = gcn1.to(cpu)

# Stage 2: build the graph for the fine-grained labels.
g2 = t2g.fit_transform(x, y, test_idx=test_idx, val_idx=val_idx, hierarchy_feats=None)
g2 = g2.to(device)

# Initialise every prediction to the sentinel -1 ("not predicted yet").
# (np.full replaces the original two-step zeros-then-fill idiom.)
predictions = np.full(len(g2.y), -1.0)