예제 #1
0
def displayHistoOfMatrix(fname):
    """Show a histogram of the column sums of the matrix stored in a file.

    The file is loaded with ``readFile`` and split into its arrays with
    ``extractArrays``; only the matrix is used here.

    Args:
        fname: Path of the data file to load.
    """
    _titles, _words, counts = extractArrays(readFile(fname))

    # One total per column (presumably one column per word -- confirm
    # against extractArrays).
    column_totals = counts.sum(axis=0)

    # 120 bins over the full data range; passing range=(0, 120) to pl.hist
    # would restrict the x-axis instead.
    pl.hist(column_totals, bins=120)
    pl.show()
예제 #2
0
def whatever():
    """Explore what filtering out low-total columns does to the data.

    Loads 'data3/data10.json', prints the titles, shows a histogram of the
    per-column totals of the matrix, then prints the matrix shape, the word
    array, and the shapes obtained when only the columns whose total is
    greater than 1 are kept. Nothing is modified on disk or returned.

    Matrix layout (from the original notes): each row holds the values for
    one film and each column the value for one word, e.g.::

        np.array([
            [0, 0, 0, 12, 2, 0],  # values for the 1st film
            [0, 0, 0, 12, 2, 0],  # values for the 2nd film
            [0, 0, 0, 12, 2, 0],
        ])
    """
    fname = 'data3/data10.json'

    infos = readFile(fname)
    titles, words, matrix = extractArrays(infos)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(titles)

    # Column totals are computed once and reused for both the histogram
    # and the filter mask.
    word_counts = matrix.sum(axis=0)

    pl.hist(word_counts, bins=140)
    pl.show()

    # True for every column whose total is greater than 1.
    word_mask = word_counts > 1

    # Convert so that boolean-mask indexing works even if words is a
    # plain list.
    words = np.array(words)

    print(matrix.shape)
    print(words)
    print(matrix[:, word_mask].shape)
    print(words[word_mask].shape)
예제 #3
0
	print 'Taille matrix avant : ' + str(matrix.shape)
	print 'Taille words avant : ' + str(words.shape)

	return words.tolist(), matrix

if __name__ == '__main__':

    # This script loads three arrays ('titles', 'words' and 'matrix') from
    # a file in the folder 'data3', filters them, and saves the result in
    # the folder 'data4'.

    # Dataset identifiers (the N in 'dataN.json') to process. Other values
    # found in earlier, commented-out configurations:
    # 1, 3, 5, 10, 50, 100, 500, 3393.
    dataset = [3]

    # The loop body is indented with spaces only: the previous mix of tabs
    # and spaces relied on Python 2 tab expansion and raises TabError on
    # Python 3.
    for n in dataset:
        fname = 'data3/data' + str(n) + '.json'
        infos = readFile(fname)
        titles, words, matrix = extractArrays(infos)

        # Apply the filter.
        words, matrix = removeLonelyWords(words, matrix)

        # Convert to a plain list before saving (presumably so that it can
        # be serialized -- confirm against saveToFile).
        matrix = matrix.tolist()

        output_fname = 'data4/data' + str(n) + '.json'
        saveToFile(titles, words, matrix, output_fname)

        print('File \'' + output_fname + '\' saved.')
        
예제 #4
0
def printSizeOfMatrix(fname):
    """Print the shape of the matrix stored in the given data file.

    The file is loaded with ``readFile`` and split into its arrays with
    ``extractArrays``; only the matrix is used here.

    Args:
        fname: Path of the data file to load.
    """
    infos = readFile(fname)
    titles, words, matrix = extractArrays(infos)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(matrix.shape)