# Command-line switch: the script runs LSI by default; passing -r/--runLDA
# selects LDA instead (see the runLSI assignment below).
parser.add_argument(
    "-r",
    "--runLDA",
    action="store_true",
    help="Run Latent Dirchlet Allocation",
)
args = parser.parse_args()

# Experiment parameters.
averagePrecisionN = 50
similarityCutoff = 0.30
ns = numpy.arange(5, 105, 5)  # 5, 10, ..., 100

# LSI is used unless the LDA flag was given on the command line.
runLSI = not args.runLDA
dataset = ArnetMinerDataset(runLSI=runLSI)

# Input citation file. Alternative inputs are kept here for reference;
# exactly one assignment should be active.
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt"
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-1000000.txt"
dataset.dataFilename = dataset.dataDir + "DBLP-citation-1000000.txt"
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-7000000.txt"
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-Feb21.txt"

# Overwrite flags for the graph, model and vectoriser.
# NOTE(review): presumably these force regeneration of cached artefacts --
# confirm against ArnetMinerDataset.
dataset.overwriteGraph = True
dataset.overwriteModel = True
dataset.overwriteVectoriser = True

dataset.vectoriseDocuments()
dataset.loadVectoriser()

# Read the document-term matrix from its Matrix Market file and convert it to
# CSC format.
X = scipy.io.mmread(dataset.docTermMatrixFilename + ".mtx")
X = X.tocsc()

# Binarise: every explicitly stored entry becomes 1.
X.data[:] = 1
# Sanity check on the stored values: both numbers printed should be 1.
print(numpy.max(X.data), numpy.min(X.data))

# Marginal totals of the binarised matrix. The builtin ``int`` is used as the
# dtype because the ``numpy.int`` alias (which was just ``int``) has been
# removed from recent NumPy releases.
# NOTE(review): sum(0) collapses axis 0 and so yields one total per *column*,
# while sum(1) yields one total per *row*; the variable names look swapped
# relative to that. They are left unchanged in case later code relies on
# them -- confirm against the usages.
rowSums = numpy.array(X.sum(0), dtype=int).flatten()
colSums = numpy.array(X.sum(1), dtype=int).flatten()