# Candidate sizes to sweep over: 5, 10, ..., 100.
ns = numpy.arange(5, 105, 5)

# The command-line flag selects LDA; LSI is used whenever it is not set.
runLSI = not args.runLDA
dataset = ArnetMinerDataset(runLSI=runLSI)

# Input corpus. Alternative dumps are kept here so they can be swapped in
# by (un)commenting a single line.
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt"
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-1000000.txt"
dataset.dataFilename = dataset.dataDir + "DBLP-citation-1000000.txt"
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-7000000.txt"
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-Feb21.txt"

# Set every overwrite flag before (re)building the vectoriser.
# NOTE(review): presumably these force cached artefacts to be regenerated
# rather than reused -- confirm against ArnetMinerDataset.
dataset.overwriteGraph = True
dataset.overwriteModel = True
dataset.overwriteVectoriser = True

dataset.vectoriseDocuments()
dataset.loadVectoriser()

# Load the saved matrix in Matrix Market format and convert it to CSC.
X = scipy.io.mmread(dataset.docTermMatrixFilename + ".mtx")
X = X.tocsc()

# Binarise: every stored entry becomes 1, so the sums below count stored
# entries rather than adding up their original values.
X.data[:] = 1
# Sanity check: both printed values should be 1 after binarisation.
print(numpy.max(X.data), numpy.min(X.data))

# The builtin ``int`` replaces ``numpy.int``: the latter was a plain alias
# for ``int`` that was deprecated in NumPy 1.20 and removed in NumPy 1.24,
# so the resulting dtype is unchanged.
#
# NOTE: the names below do not match the axes. ``X.sum(0)`` collapses the
# rows and therefore yields one total per *column*, while ``X.sum(1)``
# yields one total per *row*. The names are kept as they are because code
# further down the script may refer to them.
rowSums = numpy.array(X.sum(0), int).flatten()
colSums = numpy.array(X.sum(1), int).flatten()

# Column indices ordered by ascending per-column total.
sortedIndsColSums = numpy.argsort(rowSums)

# NOTE(review): if dataset.vectoriser is a scikit-learn vectoriser,
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() -- confirm the vectoriser type before changing.
featureNames = dataset.vectoriser.get_feature_names()

#Print out lowest frequency terms