# Imports used by the snippets below:
from gensim import corpora, models
import numpy as np
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram
from matplotlib.pyplot import show
import myutil

def _init_model(self, datafiles):
    # Build a bag-of-words corpus from every data file, then fit TF-IDF on it.
    corpus = []
    for datafile in datafiles:
        corpus.extend([self._dictionary.doc2bow(tokens)
                       for tokens in myutil.tokenize_file(datafile)])
    return models.TfidfModel(corpus)

def make_dictionary(datafile, savefile, filter=None):
    if filter is None:  # avoid a mutable default argument
        filter = {}
    dictionary = corpora.Dictionary(
        [tokens for tokens in myutil.tokenize_file(datafile, filter=filter)])
    min_count = filter.get("min_count", 5)  # drop tokens in fewer than min_count documents
    max_rate = filter.get("max_rate", 0.3)  # drop tokens in more than max_rate of documents
    dictionary.filter_extremes(no_below=min_count, no_above=max_rate)
    # print(myutil.pp(dictionary.token2id))
    print(datafile + ": " + str(len(dictionary)) + " tokens")
    dictionary.save_as_text(savefile)

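# Usage sketch for make_dictionary: the data path and filter values below are
# illustrative assumptions, not taken from the original script. The saved file
# is what the classification scripts later reload with
# corpora.Dictionary.load_from_text.
make_dictionary("data/it1.txt", "work/dictionary",
                filter={"min_count": 3, "max_rate": 0.5})
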
def _init_model(self, datafiles):
    # Build a bag-of-words corpus, then fit an LDA topic model on it.
    corpus = []
    for datafile in datafiles:
        corpus.extend([self._dictionary.doc2bow(tokens)
                       for tokens in myutil.tokenize_file(datafile)])
    return models.LdaModel(corpus=corpus, id2word=self._dictionary,
                           num_topics=self._num_topics)

def classify_hcluster(datafiles, model, num_disp=-1):
    feature_vecs = []
    lines = []
    for datafile in datafiles:
        for (tokens, line) in myutil.tokenize_file(datafile, include_line=True):
            feature_vecs.append(model.to_feature_vec(tokens))
            lines.append(line)
    # num_disp < 0 means "show everything"; slicing with [0:num_disp] would
    # then silently drop the last document, so map it to None instead.
    end = None if num_disp < 0 else num_disp
    result = linkage(feature_vecs[:end], metric="chebyshev", method="average")
    # print(result)
    dendrogram(result, labels=lines[:end])
    show()

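# Usage sketch for classify_hcluster; the file list and num_disp are
# illustrative, and "model" is assumed to be one of the feature models built
# later in the script (e.g. TFIDFModel). Draws a dendrogram of the first 30
# documents using average-linkage clustering on Chebyshev distance.
classify_hcluster(["data/it1.txt", "data/it2.txt"], model, num_disp=30)
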
def classify_best(datafiles, model, map):
    # Assign each line to the index of its highest-scoring feature (topic).
    for datafile in datafiles:
        for (tokens, line) in myutil.tokenize_file(datafile, include_line=True):
            feature_vec = model.to_feature_vec(tokens)
            category_candidate = -1
            max_score = 0
            for i in range(len(feature_vec)):
                if feature_vec[i] > max_score:
                    max_score = feature_vec[i]
                    category_candidate = i
            map.setdefault(category_candidate, []).append(line)

def classify_kmeans(datafiles, model, map, num_categories):
    feature_vecs = []
    lines = []
    for datafile in datafiles:
        for (tokens, line) in myutil.tokenize_file(datafile, include_line=True):
            feature_vecs.append(model.to_feature_vec(tokens))
            lines.append(line)
    features = np.array(feature_vecs)
    # random_state is fixed so the clustering is reproducible
    kmeans_model = KMeans(n_clusters=num_categories, random_state=10).fit(features)
    for label, line in zip(kmeans_model.labels_, lines):
        map.setdefault(label, []).append(line)

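# Usage sketch for classify_kmeans; the names below are illustrative. The
# passed-in dict is filled with cluster label -> list of original lines.
category_map = {}
classify_kmeans(["data/it1.txt", "data/it2.txt"], model, category_map, 2)
for label, texts in sorted(category_map.items()):
    print("cluster %d: %d lines" % (label, len(texts)))
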
print("num_categories=%d, dictionary=%s, infile=%s, outfile=%s" % (num_categories, options.dict_file, options.infile, options.outfile)) # ワード辞書 dictionary = corpora.Dictionary.load_from_text(options.dict_file) # 学習/評価用データセットリスト datasets = [options.infile] # 特徴抽出モデル #model = HDPModel(dictionary, datasets, num_categories); #model = LDAModel(dictionary, datasets, num_categories); # model = LSIModel(dictionary, datasets, num_categories); model = TFIDFModel(dictionary, datasets) # model = BoWModel(dictionary) # 特徴量(またはトピック内容)の表示 model.show_topics() tokens = myutil.tokenize("ジョブズが最新の携帯モデルを発表する") print(len(model.to_feature_vec(tokens))) print(model.to_feature_vec(tokens)) tokens_line = [] for (tokens, line) in myutil.tokenize_file("data/it1.txt", include_line=True): tokens_line.extend(tokens) print(len(model.to_feature_vec(tokens_line))) print(model.to_feature_vec(tokens_line))
def _init_model(self, datafiles):
    corpus = []
    for datafile in datafiles:
        corpus.extend([self._dictionary.doc2bow(tokens)
                       for tokens in myutil.tokenize_file(datafile)])
    # Weight the bag-of-words corpus with TF-IDF before fitting LSI.
    tfidf = models.TfidfModel(corpus)
    return models.LsiModel(corpus=tfidf[corpus], id2word=self._dictionary,
                           num_topics=self._num_topics)

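# Minimal self-contained sketch of the TF-IDF -> LSI chaining used above,
# with a toy corpus (documents, dictionary, and topic count are illustrative):
from gensim import corpora, models

docs = [["apple", "iphone"], ["apple", "mac"], ["linux", "kernel"]]
toy_dict = corpora.Dictionary(docs)
bow = [toy_dict.doc2bow(doc) for doc in docs]
tfidf = models.TfidfModel(bow)  # reweight raw counts by document frequency
lsi = models.LsiModel(corpus=tfidf[bow], id2word=toy_dict, num_topics=2)
print(lsi[tfidf[bow[0]]])  # 2-dimensional LSI vector for the first document
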
optParser.add_option("-o", dest="outfile", default="work/out.tsv"); optParser.add_option("-c", dest="num_categories", default="2"); optParser.add_option("-d", dest="dict_file", default="work/dictionary"); (options, args) = optParser.parse_args(); num_categories = int(options.num_categories); print("num_categories=%d, dictionary=%s, infile=%s, outfile=%s" % (num_categories, options.dict_file, options.infile, options.outfile)); # ワード辞書 dictionary = corpora.Dictionary.load_from_text(options.dict_file); # 学習/評価用データセットリスト datasets = [options.infile] model = TFIDFModel(dictionary, datasets) feature_vec_list = {} tokens_line = [] for (tokens, line) in myutil.tokenize_file("data/it1.txt", include_line=True): tokens_line.extend(tokens) feature_vec_list.update({"it1":model.to_feature_vec(tokens_line)}) tokens_line = [] for (tokens, line) in myutil.tokenize_file("data/it2.txt", include_line=True): tokens_line.extend(tokens) feature_vec_list.update({"it2":model.to_feature_vec(tokens_line)}) tokens_line = [] for (tokens, line) in myutil.tokenize_file("data/it3.txt", include_line=True): tokens_line.extend(tokens) feature_vec_list.update({"it3":model.to_feature_vec(tokens_line)}) tokens_line = [] for (tokens, line) in myutil.tokenize_file("data/it4.txt", include_line=True):