# Parse the command-line flags; `parser` is an argparse.ArgumentParser built
# earlier in this file (expects at least: create, wiki, train).
args = parser.parse_args()

if args.create:
    # Build the corpus dictionary and the cooccurrence matrix.
    print('Pre-processing corpus')

    # Choose the reader: wikipedia-dump reader vs. the plain-text reader.
    # Both are generator factories defined elsewhere in this file, so each
    # call below produces a fresh pass over the data.
    if args.wiki:
        print('Using wikipedia corpus')
        get_data = read_wikipedia_corpus
    else:
        get_data = read_corpus

    corpus_model = Corpus()
    # Two passes: first build the token dictionary, then the cooccurrence
    # matrix (window of 10 tokens) using that dictionary.
    corpus_model.fit_dictionary(get_data(args.create))
    corpus_model.fit_matrix(get_data(args.create), window=10)
    corpus_model.save('corpus.model')

    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

if args.train:
    # Train the GloVe model and save it to disk.
    if not args.create:
        # Corpus wasn't built in this run — try to load one from disk.
        print('Reading corpus statistics')
        corpus_model = Corpus.load('corpus.model')

        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)