예제 #1
0
def features_from(i):
    """Fit a 10-component GMM on one survey's features and save ``X_<i>.csv``.

    Args:
        i: Index of the survey: 0 = fiscalite, 1 = democratie, 2 = ecologie,
            3 = organisation. It selects the feature file
            ``responses <survey>_all_questions.tsv`` and names the output
            file ``X_<i>.csv``.

    Raises:
        IndexError: If ``i`` is outside the four known surveys.

    The previous version also loaded the survey JSON and packed the frames
    into ``np.array`` tables next to ``0`` placeholders. Nothing read those
    tables except to fetch the label string, and building such ragged arrays
    is fragile, so only the label lookup is kept.
    """
    response_labels = (
        "responses fiscalite",
        "responses democratie",
        "responses ecologie",
        "responses organisation",
    )
    # read features
    features = np.loadtxt(response_labels[i] + '_all_questions.tsv',
                          delimiter='\t')
    # Fit GMM
    gmm = GaussianMixture(n_components=10)
    gmm.fit(np.array(features))
    # NOTE(review): fill_X reads module-level ``gmm`` / ``features``, while the
    # ones fitted here are local to this function. Presumably ``initializer``
    # (defined elsewhere) sets up the worker state -- confirm.
    local_pool = multiprocessing.Pool(20, initializer)
    try:
        # One task per author index; range() needs the number of ids, not the
        # id collection itself. The mapped rows are the matrix to save.
        X = np.array(
            local_pool.map(fill_X, range(len(four_surveys_taken_auth_ids))))
    finally:
        # Release the workers even when a task raises.
        local_pool.close()
        local_pool.join()
    np.savetxt("X_" + str(i) + ".csv", X, delimiter=",")
def extract_features():
    """Build and save the per-author embedding matrix for survey index 1.

    Reads the module-level ``dfs`` and ``dfs_responses`` tables. For every
    author id (sorted, unique) and every open-question id, the first matching
    ``formattedValue`` is embedded with ``s.get_features`` and written into a
    300-wide slice of that author's row. Pairs without a response keep the
    zeros the matrix was initialised with. The matrix is saved as
    ``<label>_all_questions.tsv`` (tab-separated), where ``<label>`` is
    ``dfs_responses[k, 0]``.
    """
    for k in [1]:
        ids_questions = get_ids_open_reponses(dfs[k, 1])
        responses = dfs_responses[k, 1]
        ids_auth = np.sort(list(set(responses['authorId'].values)))
        # Extract embeddings for sentences
        features = np.zeros((len(ids_auth), 300 * len(ids_questions)))
        for i, auth_id in enumerate(ids_auth):
            # Filter by author once per author rather than twice per question.
            author_rows = responses[responses['authorId'] == auth_id]
            # ids_questions is indexed by position exactly as before, because
            # its container type is not visible here.
            for j in range(len(ids_questions)):
                question_rows = author_rows[
                    author_rows['questionId'] == ids_questions[j]]
                response_unique = question_rows.formattedValue.values.tolist()
                if response_unique:
                    features[i][300 * j:300 * (j + 1)] = s.get_features(
                        response_unique[0])
        np.savetxt(dfs_responses[k, 0] + '_all_questions.tsv', features,
                   delimiter='\t')
예제 #3
0

def fill_X(auth_index):
    """Return the GMM component probabilities for one author.

    ``auth_index`` is a position in the module-level
    ``four_surveys_taken_auth_ids``. The author id stored there is located in
    the module-level ``ids_auth``; its position selects the row of
    ``features`` that is passed, as a single sample, to ``gmm.predict_proba``.

    Raises:
        ValueError: If the author id does not occur in ``ids_auth``.
    """
    # The module-level names are only read, so no ``global`` statement is
    # required.
    author_id = four_surveys_taken_auth_ids[auth_index]
    row = list(ids_auth).index(author_id)
    sample = features[row].reshape(1, -1)  # one sample, all feature columns
    probabilities = gmm.predict_proba(sample)
    return probabilities[0]


# Fit a GMM on the precomputed features of the LA_TRANSITION_ECOLOGIQUE survey
# and compute component probabilities for every author id listed in
# four_surveys_taken_auth_ids.csv.
n_compo = 10
df_ecologie = read_data('data/LA_TRANSITION_ECOLOGIQUE.json')
df_resp_eco = get_open_reponses(df_ecologie)
df_ids_eco = get_ids_open_reponses(df_ecologie)
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv",
                                         delimiter=",",
                                         dtype=str)
# Sorted unique author ids; fill_X uses an id's position in this array as the
# row index into `features`.
# NOTE(review): presumably the feature file was written in this same author
# order -- confirm.
ids_auth = np.sort(list(set(df_resp_eco['authorId'].values)))
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")
# read features
features = np.loadtxt('responses ecologie_all_questions.tsv', delimiter='\t')
# Fit GMM
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)
# pool
# fill_X reads gmm / ids_auth / features / four_surveys_taken_auth_ids as
# module globals, so they must all be defined before the workers are created.
local_pool = multiprocessing.Pool(10)
try:
    # One row of component probabilities per listed author.
    X = np.array(
        local_pool.map(fill_X, range(len(four_surveys_taken_auth_ids))))
finally:
    # Release the workers even when a task raises, and wait for them to exit.
    local_pool.close()
    local_pool.join()
예제 #4
0
def fill_X(auth_index):
    """Return the GMM component probabilities for one author.

    ``auth_index`` is a position in the module-level
    ``four_surveys_taken_auth_ids``. The author id stored there is located in
    the module-level ``ids_auth``; its position selects the row of
    ``features`` that is passed, as a single sample, to ``gmm.predict_proba``.

    Raises:
        ValueError: If the author id does not occur in ``ids_auth``.
    """
    # The module-level names are only read, so no ``global`` statement is
    # required.
    author_id = four_surveys_taken_auth_ids[auth_index]
    row = list(ids_auth).index(author_id)
    sample = features[row].reshape(1, -1)  # one sample, all feature columns
    probabilities = gmm.predict_proba(sample)
    return probabilities[0]


# Fit a GMM on the precomputed features of the
# ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS survey and compute component
# probabilities for every author id listed in four_surveys_taken_auth_ids.csv.
n_compo = 10
df_organisation = read_data(
    'data/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json')
df_resp_org = get_open_reponses(df_organisation)
df_ids_org = get_ids_open_reponses(df_organisation)
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv",
                                         delimiter=",",
                                         dtype=str)
# Sorted unique author ids; fill_X uses an id's position in this array as the
# row index into `features`.
# NOTE(review): presumably the feature file was written in this same author
# order -- confirm.
ids_auth = np.sort(list(set(df_resp_org['authorId'].values)))
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")
# read features
features = np.loadtxt('responses organisation_all_questions.tsv',
                      delimiter='\t')
# Fit GMM
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)
# pool
# fill_X reads gmm / ids_auth / features / four_surveys_taken_auth_ids as
# module globals, so they must all be defined before the workers are created.
local_pool = multiprocessing.Pool(10)
try:
    # One row of component probabilities per listed author.
    X = np.array(
        local_pool.map(fill_X, range(len(four_surveys_taken_auth_ids))))
finally:
    # The pool was previously never closed; release the workers here, even
    # when a task raises, and wait for them to exit.
    local_pool.close()
    local_pool.join()
예제 #5
0

def fill_X(auth_index):
    """Return the GMM component probabilities for one author.

    ``auth_index`` is a position in the module-level
    ``four_surveys_taken_auth_ids``. The author id stored there is located in
    the module-level ``ids_auth``; its position selects the row of
    ``features`` that is passed, as a single sample, to ``gmm.predict_proba``.

    Raises:
        ValueError: If the author id does not occur in ``ids_auth``.
    """
    # The module-level names are only read, so no ``global`` statement is
    # required.
    author_id = four_surveys_taken_auth_ids[auth_index]
    row = list(ids_auth).index(author_id)
    sample = features[row].reshape(1, -1)  # one sample, all feature columns
    probabilities = gmm.predict_proba(sample)
    return probabilities[0]


# Fit a GMM on the precomputed features of the DEMOCRATIE_ET_CITOYENNETE
# survey and compute component probabilities for every author id listed in
# four_surveys_taken_auth_ids.csv.
n_compo = 10
df_democratie = read_data('data/DEMOCRATIE_ET_CITOYENNETE.json')
df_resp_dem = get_open_reponses(df_democratie)
df_ids_dem = get_ids_open_reponses(df_democratie)
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv",
                                         delimiter=",",
                                         dtype=str)
# Sorted unique author ids; fill_X uses an id's position in this array as the
# row index into `features`.
# NOTE(review): presumably the feature file was written in this same author
# order -- confirm.
ids_auth = np.sort(list(set(df_resp_dem['authorId'].values)))
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")
# read features
features = np.loadtxt('responses democratie_all_questions.tsv', delimiter='\t')
# Fit GMM
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)
# pool
# fill_X reads gmm / ids_auth / features / four_surveys_taken_auth_ids as
# module globals, so they must all be defined before the workers are created.
local_pool = multiprocessing.Pool(10)
try:
    # One row of component probabilities per listed author.
    X = np.array(
        local_pool.map(fill_X, range(len(four_surveys_taken_auth_ids))))
finally:
    # Release the workers even when a task raises, and wait for them to exit.
    local_pool.close()
    local_pool.join()
예제 #6
0
from sklearn.mixture import GaussianMixture

def fill_X(auth_index):
    """Return the GMM component probabilities for one author.

    ``auth_index`` is a position in the module-level
    ``four_surveys_taken_auth_ids``. The author id stored there is located in
    the module-level ``ids_auth``; its position selects the row of
    ``features`` that is passed, as a single sample, to ``gmm.predict_proba``.

    Raises:
        ValueError: If the author id does not occur in ``ids_auth``.
    """
    # The module-level names are only read, so no ``global`` statement is
    # required.
    author_id = four_surveys_taken_auth_ids[auth_index]
    row = list(ids_auth).index(author_id)
    sample = features[row].reshape(1, -1)  # one sample, all feature columns
    probabilities = gmm.predict_proba(sample)
    return probabilities[0]


# Fit a GMM on the precomputed features of the
# LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES survey, compute component
# probabilities for every author id listed in four_surveys_taken_auth_ids.csv,
# and save them to X_fiscalite.csv.
n_compo = 10
df_fiscalite = read_data('data/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json')
df_resp_fis = get_open_reponses(df_fiscalite)
df_ids_fis = get_ids_open_reponses(df_fiscalite)
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv", delimiter=",", dtype=str)
# Sorted unique author ids; fill_X uses an id's position in this array as the
# row index into `features`.
# NOTE(review): presumably the feature file was written in this same author
# order -- confirm.
ids_auth = np.sort(list(set(df_resp_fis['authorId'].values)))
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")
# read features
features = np.loadtxt('responses fiscalite_all_questions.tsv', delimiter='\t')
# Fit GMM
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)
# pool
# fill_X reads gmm / ids_auth / features / four_surveys_taken_auth_ids as
# module globals, so they must all be defined before the workers are created.
local_pool = multiprocessing.Pool(10)
try:
    # One row of component probabilities per listed author.
    X = np.array(
        local_pool.map(fill_X, range(len(four_surveys_taken_auth_ids))))
finally:
    # Release the workers even when a task raises, and wait for them to exit.
    local_pool.close()
    local_pool.join()
np.savetxt("X_fiscalite.csv", X, delimiter=",")