# NOTE(review): This is a whitespace-mangled paste — many statements collapsed onto two
# physical lines. It is NOT valid Python in this form and is truncated at both ends:
#   - the `except:` below belongs to a `try:` that is outside this view;
#   - line 1 ends mid-call (`logging.info(` unterminated);
#   - line 2 begins mid-call (`sublinear_tf=True, stop_words="english"` are arguments to a
#     vectorizer constructor whose opening is outside this view, presumably a
#     TfidfVectorizer — TODO confirm) and ends mid-expression (`X_s = Dict(sorted({...`).
# Line 2 largely duplicates line 1's content (same fit/index sequence), then continues.
#
# What the visible code does (as far as can be told from here):
#   1. Fits local TF-IDF weights: `vectorizer.fit(streamer(args.input))`; on any failure
#      the bare `except:` logs that the IDF model file at `args.idf` is missing and exits.
#      NOTE(review): bare `except:` swallows every error (incl. KeyboardInterrupt) and the
#      message assumes the cause is a missing IDF file — should be narrowed when restored.
#   2. Builds or loads a file index: `indexing.file_index(input_file=args.input,
#      index_file=outputf, vectorizer=vectorizer, mmap=True, ...)`; calls `index.fit()`
#      only when the DB file `outputf` did not already exist (`DBexists`).
#   3. (Line 2 only) Computes `sparse_word_centroids = wordCentroids(db=index,
#      vect=vectorizer)` and starts fitting sparse random projections; the final
#      `X_s = Dict(sorted({w: v for w, v in sparse_word_centroids` is cut off here.
#
# Embedded Spanish comment on line 2, translated: "Maybe I can load the sparse
# word_centroids matrix into RAM and do NMF."
#
# TODO(review): restore original line breaks/indentation before any behavioral change;
# no code bytes are altered here because the fragment's true structure cannot be
# recovered from this view without guessing.
logging.info("Fitting local TFIDF weights from: %s ..." % args.input) lines = streamer(args.input) vectorizer.fit(lines) except: logging.info("IDF model file does not exist in: %s ..." % args.idf) exit() DBexists = os.path.exists(outputf) logging.info("Instantiating index object...") index = indexing.file_index( input_file=args. input, #'/almac/ignacio/data/INEXQA2012corpus/wikiEn_sts_clean_ph2.txt', index_file=outputf, vectorizer=vectorizer, mmap=True, wsize=args.wsize, sampsize=args.samples, n_jobs=1, chunk_size=args.chunk, verbose=args.verbo) if not DBexists: logging.info("Starting to build index into DB file %s" % outputf) index.fit() logging.info("Index fitted!!") logging.info("Output database: {}".format(outputf)) else: logging.info("Index loaded from DB file %s" % outputf) if index.vocab_size < args.bsize: logging.info(
sublinear_tf = True,# if args.localw.startswith("subl") else False, stop_words = "english" #if args.stop == 'ost' else None ) logging.info("Fitting local TFIDF weights from: %s ..." % args.input) lines = streamer(args.input) vectorizer.fit(lines) except: logging.info("IDF model file does not exist in: %s ..." % args.idf) exit() DBexists = os.path.exists(outputf) logging.info("Instantiating index object...") index = indexing.file_index(input_file = args.input, index_file = outputf, vectorizer=vectorizer, mmap=True, wsize=args.wsize, sampsize=args.samples, n_jobs=1, chunk_size=args.chunk, verbose=args.verbo) if not DBexists: logging.info("Starting to build index into DB file %s" % outputf) index.fit() logging.info("Index fitted!!") logging.info("Output database: {}".format(outputf)) else: logging.info("Index loaded from DB file %s" % outputf) sparse_word_centroids = wordCentroids(db=index, vect=vectorizer) # Tal vez pueda cargar la matrix dipersa de word_centroids en ram y hacer NMF. logging.info("Fitting Sparse Random Projections for sparse coding ...") X_s = Dict(sorted({w: v for w, v in sparse_word_centroids