def _classify_batch(model, rows):
    """Classify one batch of rows with *model* and return the labels as a list."""
    data = mx.DataSet()
    data.fromArray(np.array(rows))
    # The trailing positional arguments are forwarded unchanged.
    # NOTE(review): presumably labels=None, entropy_cutoff=None, silent=1 --
    # confirm against the model's classify() signature.
    return list(model.classify(data, None, None, 1))


def assign_labels(model, data_file, cols, batch_size=100000):
    """Classify every row of *data_file* with *model*, one batch at a time.

    Rows are read from ``prep.gen_file_stream(data_file, cols)`` and buffered;
    each time ``batch_size`` rows have accumulated they are wrapped in an
    ``mx.DataSet`` and classified, so the whole file never has to be held
    in memory as a single array.

    Args:
        model: Object exposing ``classify(data, None, None, 1)`` whose result
            can be converted to a list of labels.
        data_file: Path (or handle) forwarded to ``prep.gen_file_stream``.
        cols: Column selection forwarded to ``prep.gen_file_stream``.
        batch_size: Number of rows classified per batch. Defaults to 100000,
            the value that used to be hard-coded.

    Returns:
        A list with the labels of all rows, in the order they were read.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    labels = []
    table = []
    for count, row in enumerate(prep.gen_file_stream(data_file, cols), 1):
        table.append(row)
        if count % batch_size == 0:
            # Progress output; the single-argument parenthesised form prints
            # the same text under Python 2 and Python 3.
            print(count)
            print('table size: %d' % len(table))
            labels += _classify_batch(model, table)
            # Empty the buffer in place so the batch can be garbage-collected.
            del table[:]

    # Process the trailing entries that did not fill a complete batch.
    if table:
        labels += _classify_batch(model, table)
    return labels
def classify_data_kmeans(k, cols, path, centers):
    """Write a k-means label file for every ``.train`` file found under *path*.

    For each file returned by ``prep.gen_file_list(path)`` whose name ends in
    ``.train``, every row from ``prep.gen_file_stream(f, cols)`` is passed to
    ``assign_center(row, centers)`` and the resulting label is written, one
    per line, to ``.<k>.labels`` in the same directory as the input file.

    Args:
        k: Value used only to build the output file name (``.<k>.labels``).
        cols: Column selection forwarded to ``prep.gen_file_stream``.
        path: Location forwarded to ``prep.gen_file_list``.
        centers: Centers forwarded to ``assign_center`` for every row.

    Returns:
        None. The result is the label files written to disk.
    """
    # Local import so this block does not depend on the module's import list.
    import os

    for f in prep.gen_file_list(path):
        if not f.endswith('.train'):
            continue
        print('classifying %s' % f)

        # os.path.dirname also handles a bare file name (no '/'), where the
        # previous f[:f.rfind('/')] slice silently dropped the last character.
        # NOTE(review): the output name depends only on the directory and k,
        # so two .train files in one directory overwrite each other's labels.
        labels_path = os.path.join(os.path.dirname(f), '.' + str(k) + '.labels')

        # The context manager closes the file even if a row fails mid-way.
        with open(labels_path, 'w') as fw:
            for prog, row in enumerate(prep.gen_file_stream(f, cols)):
                if prog % 10000 == 0:
                    print('progress: %d' % prog)
                fw.write(str(assign_center(row, centers)) + '\n')