def build_doc_vector(dir_name, model, build_option, process_option=None,
                     save=True, save_file="doc_vector.bin",
                     to_normalize=False, to_scale=False,
                     cluster_factor=20, num_cpus=-2):
    """Build one fixed-length vector per document under *dir_name*.

    Parameters:
        dir_name: directory of documents consumed by ``Sentences``.
        model: word-embedding model passed through to the per-strategy builders.
        build_option: 1 = plain average, 2 = tf-idf weighted average,
            anything else = cluster-based vectors.
        process_option: preprocessing settings; a fresh ``ProcessOption()`` is
            created per call when omitted (avoids the shared-mutable-default trap).
        save / save_file: whether and where the builders persist the result.
        to_normalize: L2-normalize the vectors in place when True.
        to_scale: standard-scale the vectors (copying) when True.
        cluster_factor, num_cpus: forwarded to the cluster builder only.

    Returns:
        The document-vector matrix produced by the selected builder,
        optionally normalized and/or scaled.
    """
    if process_option is None:
        process_option = ProcessOption()
    sentences = Sentences(dir_name)
    docs = sentences.paragraph_iterator()
    doc_num = sentences.doc_num
    stop_words = set(stopwords.words("english"))
    tknzr = TweetTokenizer(preserve_case=False)
    post_docs = util.process_sentences(docs, tknzr, process_option, stop_words)

    if build_option == 1:    # average of word vectors
        doc_vector = build_average_dv(post_docs, doc_num, model, save, save_file)
    elif build_option == 2:  # tf-idf weighted average (comment was mislabeled "cluster")
        doc_vector = build_av_tf_idf_dv(post_docs, doc_num, model, save, save_file)
    else:                    # cluster-based document vectors
        doc_vector = build_cluster_dv(post_docs, doc_num, model, cluster_factor,
                                      num_cpus, save, save_file)

    # BUG FIX: the original tested `if (normalize):` / `if (scale):` — the
    # imported function objects are always truthy, so the to_normalize and
    # to_scale flags were silently ignored and both transforms always ran.
    if to_normalize:
        doc_vector = normalize(doc_vector, copy=False)
    if to_scale:
        doc_vector = scale(doc_vector, copy=True)
    return doc_vector
def build_doc_vector(dir_name, model, build_option, process_option=None,
                     save=True, save_file="doc_vector.bin",
                     to_normalize=False, to_scale=False,
                     cluster_factor=20, num_cpus=-2):
    """Build one fixed-length vector per document under *dir_name*.

    Parameters:
        dir_name: directory of documents consumed by ``Sentences``.
        model: word-embedding model passed through to the per-strategy builders.
        build_option: 1 = plain average, 2 = tf-idf weighted average,
            anything else = cluster-based vectors.
        process_option: preprocessing settings; a fresh ``ProcessOption()`` is
            created per call when omitted (avoids the shared-mutable-default trap).
        save / save_file: whether and where the builders persist the result.
        to_normalize: L2-normalize the vectors in place when True.
        to_scale: standard-scale the vectors (copying) when True.
        cluster_factor, num_cpus: forwarded to the cluster builder only.

    Returns:
        The document-vector matrix produced by the selected builder,
        optionally normalized and/or scaled.
    """
    if process_option is None:
        process_option = ProcessOption()
    sentences = Sentences(dir_name)
    docs = sentences.paragraph_iterator()
    doc_num = sentences.doc_num
    stop_words = set(stopwords.words("english"))
    tknzr = TweetTokenizer(preserve_case=False)
    post_docs = util.process_sentences(docs, tknzr, process_option, stop_words)

    if build_option == 1:    # average of word vectors
        doc_vector = build_average_dv(post_docs, doc_num, model, save, save_file)
    elif build_option == 2:  # tf-idf weighted average (comment was mislabeled "cluster")
        doc_vector = build_av_tf_idf_dv(post_docs, doc_num, model, save, save_file)
    else:                    # cluster-based document vectors
        doc_vector = build_cluster_dv(post_docs, doc_num, model, cluster_factor,
                                      num_cpus, save, save_file)

    # BUG FIX: the original tested `if(normalize):` / `if(scale):` — the
    # imported function objects are always truthy, so the to_normalize and
    # to_scale flags were silently ignored and both transforms always ran.
    if to_normalize:
        doc_vector = normalize(doc_vector, copy=False)
    if to_scale:
        doc_vector = scale(doc_vector, copy=True)
    return doc_vector
def build_doc_vector(dir_name, model, build_option, process_option=None,
                     cluster_factor=20, num_cpus=-2):
    """Build one fixed-length vector per document under *dir_name*.

    Simpler variant: no tokenizer argument is passed to the preprocessor and
    no save/normalize/scale options are exposed.

    Parameters:
        dir_name: directory of documents consumed by ``Sentences``.
        model: word-embedding model passed through to the per-strategy builders.
        build_option: 1 = plain average, 2 = tf-idf weighted average,
            anything else = cluster-based vectors.
        process_option: preprocessing settings; a fresh ``ProcessOption()`` is
            created per call when omitted (avoids the shared-mutable-default trap).
        cluster_factor, num_cpus: forwarded to the cluster builder only.

    Returns:
        The document-vector matrix produced by the selected builder.
    """
    if process_option is None:
        process_option = ProcessOption()
    sentences = Sentences(dir_name)
    docs = sentences.paragraph_iterator()
    doc_num = sentences.doc_num
    stop_words = set(stopwords.words("english"))
    post_docs = util.process_sentences(docs, process_option, stop_words)

    if build_option == 1:    # average of word vectors
        doc_vector = build_average_dv(post_docs, doc_num, model)
    elif build_option == 2:  # tf-idf weighted average (comment was mislabeled "cluster")
        doc_vector = build_av_tf_idf_dv(post_docs, doc_num, model)
    else:                    # cluster-based document vectors
        doc_vector = build_cluster_dv(post_docs, doc_num, model, cluster_factor, num_cpus)
    return doc_vector