Example #1
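# Restrict the Big Five label lists to the first subsetSize samples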
yO = yO[0:subsetSize]
yC = yC[0:subsetSize]
yE = yE[0:subsetSize]
yA = yA[0:subsetSize]
yN = yN[0:subsetSize]

# Save the label lists because transformTextForTraining() modifies them
old_yO = yO
old_yC = yC
old_yE = yE
old_yA = yA
old_yN = yN

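# Embed every post with each word-vector aggregation strategy:
# sum, max, min, avg, and concatenation of the FastText word embeddings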
[sumE, yO, yC, yE, yA,
 yN] = embeddings.transformTextForTraining(wordDictionary, post_threshold,
                                           posts, old_yO, old_yC, old_yE,
                                           old_yA, old_yN, "sum", transform)
maxE = embeddings.transformTextForTraining(wordDictionary, post_threshold,
                                           posts, old_yO, old_yC, old_yE,
                                           old_yA, old_yN, "max", transform)[0]
minE = embeddings.transformTextForTraining(wordDictionary, post_threshold,
                                           posts, old_yO, old_yC, old_yE,
                                           old_yA, old_yN, "min", transform)[0]
avgE = embeddings.transformTextForTraining(wordDictionary, post_threshold,
                                           posts, old_yO, old_yC, old_yE,
                                           old_yA, old_yN, "avg", transform)[0]
conE = embeddings.transformTextForTraining(wordDictionary, post_threshold,
                                           posts, old_yO, old_yC, old_yE,
                                           old_yA, old_yN, "conc",
                                           transform)[0]

Example #2

subsetSize = 1000
posts = posts[0:subsetSize]
yO = yO[0:subsetSize]
yC = yC[0:subsetSize]
yE = yE[0:subsetSize]
yA = yA[0:subsetSize]
yN = yN[0:subsetSize]

old_yO = yO
old_yC = yC
old_yE = yE
old_yA = yA
old_yN = yN
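# Embed the posts with the chosen aggregation method and get back the matching label lists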
[conE, yO, yC, yE, yA,
 yN] = embeddings.transformTextForTraining(wordDictionary, post_threshold,
                                           posts, old_yO, old_yC, old_yE,
                                           old_yA, old_yN, method, transform)
print("Embeddings computed.")

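# Hold out the last 15% of the embedded posts as the test set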
split_index = round(len(conE) * 0.85)
data_train = conE[:split_index]
data_test = conE[split_index:]

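# Compute SVM results separately for each Big Five trait (O, C, E, A, N)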
l = 1
for labels in [yO, yC, yE, yA, yN]:

    if l == 1:
        big5trait = "O"
        print("[SVM] computing results for Openness...")
    elif l == 2:
        big5trait = "C"

Example #3

dfs = df.sample(3000)
print("Training set shuffled.")

print("Loading embeddings dataset...")
wordDictionary = dsu.parseFastText(dataset_path)
print("Dataset correctly laoded.")

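# Status texts and the five trait scores: openness (ope), conscientiousness (con),
# extraversion (ext), agreeableness (agr), neuroticism (neu)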
posts = dfs["message"]
yO = np.array(dfs["ope"], dtype=np.float64)
yC = np.array(dfs["con"], dtype=np.float64)
yE = np.array(dfs["ext"], dtype=np.float64)
yA = np.array(dfs["agr"], dtype=np.float64)
yN = np.array(dfs["neu"], dtype=np.float64)
[conE, yO, yC, yE, yA,
 yN] = embeddings.transformTextForTraining(wordDictionary, post_threshold,
                                           posts, yO, yC, yE, yA, yN, "conc",
                                           True)
print("\tEmbeddings computed.")

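# Train one model per trait, with per-trait gamma and C values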
trait = 1
for labels in [yO, yC, yE, yA, yN]:

    if trait == 1:
        big5trait = "O"
        gamma = 1
        C = 1
        print("   Training model for Openness...")
    elif trait == 2:
        big5trait = "C"
        gamma = 1
        C = 1

Example #4

yA = np.array(yA)
yN = np.array(yN)

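# Shuffle tweets and labels with the same random permutation so they stay aligned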
s = np.arange(filteredTweets.shape[0])
np.random.shuffle(s)
filteredTweets = filteredTweets[s]
yO = yO[s]
yC = yC[s]
yE = yE[s]
yA = yA[s]
yN = yN[s]
print("Data shuffled.")

[conE, yO, yC, yE, yA,
 yN] = embeddings.transformTextForTraining(wordDictionary, tweet_threshold,
                                           filteredTweets, yO, yC, yE, yA, yN,
                                           "conc", True)
print("Embeddings computed.")

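# 4-fold cross-validation, training one model per Big Five trait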
l = 1
k_fold = KFold(n_splits=4)
for labels in [yO, yC, yE, yA, yN]:

    if l == 1:
        big5trait = "O"
        gamma = 1
        C = 1
        print("Training model for Openness...")
    elif l == 2:
        big5trait = "C"
        gamma = 1