def appendMatrix(matrix01, matrix02):
    # Column-wise appends matrix02's columns to matrix01, aligning rows by user ID.
    # Relies on the module-level arrays riskFactor_userIDs (row order of matrix01)
    # and regex_userIDs (row order of matrix02). Returns the combined CSR matrix.
    hold_matrix = csr_matrix((1, csr_matrix.get_shape(matrix02)[1]))  # empty matrix with one row

    for user in riskFactor_userIDs:
        if user in regex_userIDs:
            # Copy the matching matrix02 row for this user
            hold_matrix = vstack([hold_matrix, matrix02[numpy.where(regex_userIDs == user)[0][0], :]])
            continue
        # No matching row in matrix02: pad with zeros
        hold_matrix = vstack([hold_matrix, numpy.zeros(csr_matrix.get_shape(matrix02)[1])])

    hold_matrix = csr_matrix(hold_matrix)[1:, :]  # convert to CSR and drop the placeholder first row
    matrix01 = csr_matrix(hstack([matrix01, hold_matrix]))
    return matrix01
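For context, a minimal usage sketch of appendMatrix on toy data; the user-ID arrays, matrices, and values below are hypothetical and only illustrate the row alignment (they are not part of the original script):

import numpy
from scipy.sparse import csr_matrix, vstack, hstack

# Hypothetical module-level user-ID arrays assumed by appendMatrix
riskFactor_userIDs = numpy.array([101, 102, 103])
regex_userIDs = numpy.array([103, 101])

matrix01 = csr_matrix(numpy.arange(6).reshape(3, 2))        # 3 users x 2 features
matrix02 = csr_matrix(numpy.array([[7, 8, 9], [4, 5, 6]]))  # 2 users x 3 features

combined = appendMatrix(matrix01, matrix02)
print(combined.shape)      # (3, 5)
print(combined.toarray())  # user 102's appended columns are all zeros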
Example #2
    def _compute_heat_diffusion(self, lap):
        # Applies the heat diffusion operator exp(-tau * lap) to the initial
        # condition, once for every scale tau in self.taus_.
        eps = 1e-9
        n_simplices = csr_matrix.get_shape(lap)[0]

        norm = np.vectorize(lambda x: 0 if np.abs(x) < eps else x)
        n_filters = len(self.taus_)

        if self.proc_ == 'exact':
            # Exact computation via the eigendecomposition of the Laplacian
            eigenvals, U = self._get_eigens(lap)

            heat = list()
            for i in range(n_filters):
                temp = U.dot(np.diagflat(
                    np.exp(- self.taus_[i] * eigenvals).flatten())).dot(U.T).\
                    dot(self.initial_condition)
                heat.append(norm(temp))
        else:
            # Chebyshev polynomial approximation of the heat kernel
            heat = [sp.sparse.csc_matrix((n_simplices, n_simplices)) for i in
                    range(n_filters)]
            # Chebyshev polynomials evaluated at (lap - I), built with the
            # standard recurrence T_k = 2 * (lap - I) * T_{k-1} - T_{k-2}
            monome = {0: sp.sparse.eye(n_simplices),
                      1: lap - sp.sparse.eye(n_simplices)}
            for k in range(2, self.order_ + 1):
                monome[k] = 2 * (lap - sp.sparse.eye(n_simplices)).dot(
                    monome[k - 1]) - monome[k - 2]
            for i in range(n_filters):
                coeffs = self._compute_cheb_coeff_basis(
                    self.taus_[i], self.order_)
                temp = sp.sum([coeffs[k] * monome[k] for k in
                               range(0, self.order_ + 1)])
                heat[i] = norm(temp.A)  # cleans up the small coefficients
        return heat
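To make the exact branch concrete, here is a standalone sketch that applies the heat kernel exp(-tau * L) to an initial condition via an eigendecomposition; the tiny path-graph Laplacian, tau, and initial condition are made up for illustration:

import numpy as np
from scipy.sparse import csr_matrix

# Hypothetical Laplacian of a 3-node path graph (degree matrix minus adjacency)
lap = csr_matrix(np.array([[ 1., -1.,  0.],
                           [-1.,  2., -1.],
                           [ 0., -1.,  1.]]))
n_simplices = csr_matrix.get_shape(lap)[0]

tau = 0.5
initial_condition = np.eye(n_simplices)  # diffuse heat from each node separately

# Exact heat kernel: U * diag(exp(-tau * lambda)) * U^T, as in the 'exact' branch above
eigenvals, U = np.linalg.eigh(lap.toarray())
heat = U.dot(np.diagflat(np.exp(-tau * eigenvals))).dot(U.T).dot(initial_condition)

print(heat.round(3))  # each row sums to 1: total heat is conserved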
Example #3

import os
import codecs
import numpy
from scipy.sparse import hstack, csr_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score


# Loading pickle file data into numpy array called data
f = open("smoking_1_analytic_data_mapreduce.pkl", "rb")
data = numpy.load(f)
data = numpy.array(data)
f.close()

# Loading smoking posts_matrix data
posts_matrix = data[0]  # 11616 rows x 605107 columns
rows = csr_matrix.get_shape(posts_matrix)[0]
users_vector = data[3]
labels_vector = data[4]
keywords_vector = data[2]

# Empty matrix to hold the loaded regex-feature columns
loader_matrix = numpy.empty([rows, 1])

# List of RegExs
with open("collocation_smoker_regexs.txt", "r") as f:
    queries = [l.strip() for l in f]

keywords_vector.extend(queries)  # Extend the keywords list so all column names are accessible

# Appends each regex column to loader_matrix
for query in queries:
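The loop body is cut off at this point in the listing. As a rough, self-contained sketch of the idea (counting regex matches per user and stacking each count column onto loader_matrix), using toy posts and regexes that are not from the original data:

import re
import numpy
from scipy.sparse import csr_matrix

# Hypothetical stand-ins for the script's state
queries = [r"smok\w+", r"cigarette[s]?"]
user_posts = ["I quit smoking last year", "no cigarettes today", "nothing relevant"]
loader_matrix = numpy.empty([len(user_posts), 1])

for query in queries:
    # One column per regex: number of matches in each user's text
    counts = numpy.array([[len(re.findall(query, text))] for text in user_posts])
    loader_matrix = numpy.hstack([loader_matrix, counts])

# Drop the placeholder first column before converting to sparse
regex_matrix = csr_matrix(loader_matrix[:, 1:])
print(regex_matrix.toarray())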
Example #4

import codecs
import os
import string
import numpy
from scipy.sparse import csr_matrix
from nltk import sent_tokenize, word_tokenize

#Loading pickle file data into numpy array called data
f = open('smoking_1_analytic_data_mapreduce.pkl', 'rb')
data = numpy.load(f)
data = numpy.array(data)
f.close()

#Loading smoking posts_matrix data
posts_matrix = data[0] #11616 rows x 605107 columns
rows = csr_matrix.get_shape(posts_matrix)[0]
users_vector = data[3]
labels_vector = data[4]
keywords_vector = data[2]

# Empty 1 x 7 matrix to accumulate the computed feature rows
loader_matrix = numpy.empty([1, 7])


def word_count(tokens):
    count = len([token for token in tokens if token not in string.punctuation])
    return count
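
# Quick usage sketch for word_count; the sample sentence is hypothetical and
# word_tokenize needs nltk's punkt tokenizer data to be available.
sample_tokens = word_tokenize("Trying to quit smoking, again!")
# -> ['Trying', 'to', 'quit', 'smoking', ',', 'again', '!']
print(word_count(sample_tokens))  # 5 -- the ',' and '!' tokens are excluded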


def sent_per_status(file_text):
    content = file_text.readlines()