strings_y=["QAutf8"],
                              feat_y=["QApos",]) #"QAaspect", "QAperson", "QAgender", "QAnumber", "QAcase", "QAvoice", "QAmood", "QAstate"])
    source = list(itertools.chain(*sawarefData.quran_sent))
    df = pd.DataFrame(source,
                      columns=["sid", "aid", "wid", "mid"] +
                      feat_x +
                      ["embeddings","word","QAutf8"] +
                      feat_y)
    df["embeddings"] = df["embeddings"].apply(truncate)
    df = flattencolumns(df, ["embeddings"])
    df.set_index(["sid", "aid", "wid", "mid"], inplace=True)
    df.sort_index(inplace=True)

    ## 2. Pad the rows according to the longest word (in # of morphemes)
    SENTLEN = max(df.index.get_level_values("mid"))
    df = df.reindex(padIndexes(
        df, max(df.index.get_level_values("mid"))), fill_value=0).sort_index()

    ## 3. Get the one-hot encoding of all categorical data (see columns attr)
    dumm = pd.get_dummies(df, columns=feat_x +
                          feat_y)

    ## 4. Add two-level columns for easy indexing later (wid, mid)
    EXAMPLES_LEN = df.shape[0]//SENTLEN
    new_columns = []
    for x in dumm.columns:
        new_columns.append(re.sub('(_.*|[0-9]*)', '', x))
    dumm.columns = [new_columns, dumm.columns]
    dumm.index = [[x for x in range(EXAMPLES_LEN) for _ in range(SENTLEN)],
                  [x for _ in range(EXAMPLES_LEN) for x in range(SENTLEN)]]
    dumm = dumm.sort_index(axis=1)
예제 #2
0
    df = pd.concat(
        [pd.DataFrame(df1[x].values.tolist()).add_prefix(x) for x in cols],
        axis=1)
    return pd.concat([df, df1.drop(cols, axis=1)], axis=1)


def truncate(x):
    """Return the leading ``EMBEDDINGS`` items of ``x``.

    ``x`` is anything that supports slicing; the result is ``x[:EMBEDDINGS]``,
    so inputs shorter than ``EMBEDDINGS`` come back unchanged in length.
    ``EMBEDDINGS`` is a module-level name looked up at call time.
    """
    head = x[:EMBEDDINGS]
    return head


# Shorten every embedding with truncate(), then let flattencolumns() replace
# the "embeddings" column with its expanded form (presumably one
# "embeddings<i>" column per component -- confirm against flattencolumns).
df["embeddings"] = df["embeddings"].apply(truncate)
df = flattencolumns(df, ["embeddings"])

# Key the rows by (sid, aid, wid, mid) and keep them ordered by that key.
df = df.set_index(["sid", "aid", "wid", "mid"]).sort_index()

# SENTLEN is the largest "mid" value present in the index.  The same value is
# handed to padIndexes() to build the target index; every row that reindex()
# has to create is filled with 0.
SENTLEN = max(df.index.get_level_values("mid"))
df = df.reindex(padIndexes(df, SENTLEN), fill_value=0).sort_index()

# One-hot encode the columns listed in features_map_x and features_map_y.
dumm = pd.get_dummies(
    df,
    columns=sawarefData.features_map_x + sawarefData.features_map_y,
)
# Disabled follow-up kept from the original:
#   .reset_index().set_index("mid")  #.drop(["sid", "aid", "wid"], 1)
print("Done")

# dumm = dumm.reindex(padIndexes(dumm, max(df.index.get_level_values("mid"))), fill_value=0.0).sort_index()
# x_columns = [k + "_" + xx for k, x in sawarefData.features_set_x.items()
#              for xx in x]
# y_columns = [k + "_" + xx for k, x in sawarefData.features_set_y.items()
#              for xx in x]
# Input columns: every dummy column that pd.get_dummies() generated for a
# feature in features_map_x (named "<feature>_<value>" with the default
# separator), followed by the embeddings0 .. embeddings<EMBEDDINGS-1> columns.
#
# The prefix is tested with startswith() rather than by deleting the feature
# name and inspecting the first remaining character: that older check raised
# IndexError when nothing was left after the deletion, and it accepted any
# column that merely began with "_" or repeated the feature name.
x_columns = [
    column
    for feature in sawarefData.features_map_x
    for column in dumm.columns
    if column.startswith(feature + "_")
] + ["embeddings" + str(i) for i in range(EMBEDDINGS)]