def load_dataset(exp):
    # Load the pickled dataset splits, vocabularies, and embedding matrix
    # produced during preprocessing from the experiment directory.
    dataset_train, dataset_val, dataset_test = load_pickle(exp.experiment_dir.joinpath('datasets.pkl'))
    vocab, style_vocab = load_pickle(exp.experiment_dir.joinpath('vocabs.pkl'))
    W_emb = load_pickle(exp.experiment_dir.joinpath('W_emb.pkl'))

    print(f'Train: {len(dataset_train)}, val: {len(dataset_val)}, test: {len(dataset_test)}')
    print(f'Vocab: {len(vocab)}, style vocab: {len(style_vocab)}')
    print(f'W_emb: {W_emb.shape}')

    return dataset_train, dataset_val, dataset_test, vocab, style_vocab, W_emb
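
# Usage sketch: `load_dataset` only needs an object exposing an `experiment_dir`
# pathlib.Path, so any stand-in works. The directory below is hypothetical; the
# real one comes from the project's Experiment setup.
if __name__ == '__main__':
    from pathlib import Path
    from types import SimpleNamespace

    exp = SimpleNamespace(experiment_dir=Path('output/exp1'))  # hypothetical path
    train, val, test, vocab, style_vocab, W_emb = load_dataset(exp)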
import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import torch
import torch.nn as nn
import torch.nn.functional as F

from preprocess import cutwords

# `pwd` and `load_pickle` are project-local helpers defined elsewhere in this repo.
output_dir = os.path.join(pwd(__file__), './output')

if __name__ == '__main__':
    N = 100  # each category contributes 100 documents to the comparison
    types = [
        'fraudsters', 'intentkill', 'thieves', 'rape', 'traffic',
        'rob', 'position', 'drug', 'damage'
    ]
    num = [N for _ in range(len(types))]
    data_pkl = os.path.join(pwd(__file__), './data/data.pkl')
    datas = load_pickle(data_pkl)
    documents = []  # the documents used for comparison
    for i, atype in enumerate(types):
        for apeople in datas:
            if apeople["type"] == atype and num[i] > 0:
                documents.append(apeople["note"])
                num[i] = num[i] - 1
            if num[i] == 0:
                break
    print("Number of documents used for comparison:", len(documents))
    # adata = "乔元,56岁,游手好闲,抢劫他人手机,犯抢劫罪"  # record to classify
    adata = "14216黄明英,29岁,家庭教养方式不足,加入盗窃团伙,实施盗窃"  # record to classify
    documents.append(adata)
    # print(documents)
    docs = cutwords(documents)  # word segmentation (see preprocess.cutwords)
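
    # What likely follows (a sketch, not the project's confirmed code): build
    # TF-IDF vectors over the segmented documents and rank the labelled
    # documents by cosine similarity to the appended query record. `docs` is
    # assumed to be a list of whitespace-joined token strings from `cutwords`.
    from sklearn.metrics.pairwise import cosine_similarity

    counts = CountVectorizer().fit_transform(docs)    # raw term counts
    tfidf = TfidfTransformer().fit_transform(counts)  # TF-IDF weighting
    sims = cosine_similarity(tfidf)[-1, :-1]          # query vs. all labelled docs
    print("Most similar document:", sims.argmax(), "score:", sims.max())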
@classmethod
def _load_config(cls, experiment_dir):
    # Read the pickled experiment configuration stored alongside the
    # other experiment artifacts.
    filename = experiment_dir.joinpath(Experiment._CONFIG_FILENAME)
    config = load_pickle(filename)
    return config
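
# Usage sketch (assumes `_load_config` is a @classmethod on `Experiment`, as the
# `cls` parameter suggests, and that an experiment was saved earlier; the
# directory below is hypothetical):
if __name__ == '__main__':
    from pathlib import Path
    config = Experiment._load_config(Path('output/exp1'))  # hypothetical path
    print(config)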
args = parser.parse_args()
if args.c not in ['kmeans', 'dp']:
    raise ValueError('Illegal clustering method')
if args.c == 'kmeans':
    clusterer = KMeansClusterer()
else:
    clusterer = DPClusterer()

# Tally the ground truth: collect the label set, then, for every label,
# the list of ids of all documents carrying it.
docs = load_pickle(docs_file)
true_categories = set()
for doc in docs:
    true_categories.add(doc['type'])
print(true_categories)
true_clusters = []
for category in true_categories:
    cluster = []
    for globalidx, doc in enumerate(docs):
        if doc['type'] == category:
            cluster.append(globalidx)
    true_clusters.append(cluster)  # one id list per true category

representer_file = os.path.join(
    output_dir, "{}_representer.pkl".format(args.c))
districts_file = os.path.join(
    output_dir, "{}_districts.pkl".format(args.c))  # filename assumed by analogy with representer_file
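
# Evaluation sketch (not from the source): with `true_clusters` in hand, one
# common way to score predicted clusters is purity, crediting each predicted
# cluster with its best-matching true cluster. `pred_clusters` is assumed to
# use the same global document ids as `true_clusters`.
def purity(pred_clusters, true_clusters, n_docs):
    matched = 0
    for pred in pred_clusters:
        pred_set = set(pred)
        # Size of the largest overlap with any true cluster.
        matched += max(len(pred_set & set(true)) for true in true_clusters)
    return matched / n_docs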
# some dirty work: make sure the output and log directories exist
if not os.path.isdir(output_dir):
    os.mkdir(output_dir)
if not os.path.isdir(log_dir):
    os.mkdir(log_dir)

if args.p:
    logger.info("Multiple process mode")
else:
    logger.info("Single process mode")

data_pkl = os.path.join(pwd(__file__), './data/data.pkl')
if os.path.isfile(data_pkl):
    logger.info("Existing pkl, loading...")
    datas = load_pickle(data_pkl)
else:
    logger.info("Loading from json")
    loader = DataLoader()
    datas = loader()
    logger.info("Shuffle data")
    random.shuffle(datas)
    logger.info("Serialize data")
    dump_pickle(datas, data_pkl)
logger.info("Loaded data {}".format(len(datas)))

# data_per_worker = int(1e2)*6
data_per_worker = int(args.s)
num_worker = round(len(datas) / data_per_worker)
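
# Partitioning sketch (an assumption: `num_worker` is used to slice `datas`
# into roughly equal chunks, one per worker process). A straightforward way:
def split_for_workers(datas, data_per_worker):
    return [datas[i:i + data_per_worker]
            for i in range(0, len(datas), data_per_worker)]

# e.g. chunks = split_for_workers(datas, data_per_worker)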
    representers = voters[:n_representers]
    districts.append({"voters": voters, "representers": representers})

    # Flatten all voters back out of the districts.
    voters = []
    for district in districts:
        voters.extend(district['voters'])
    logger.info("{} districts in total".format(len(districts)))
    return districts


if __name__ == "__main__":
    logger = defaultlogger
    datas = load_pickle(docs_file)
    docs = [w['note'] for w in datas]

    argparser = argparse.ArgumentParser()
    argparser.add_argument("-c", help="clusterer: 'kmeans' or 'dp'", type=str, default='kmeans')
    args = argparser.parse_args()
    if args.c not in ['kmeans', 'dp']:
        raise ValueError("Illegal args")
    logger.info('{} democracy clustering'.format(args.c))

    docs_representes = []
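    # Continuation sketch (everything below is an assumption): run the
    # district-building step, then keep only each district's representer
    # documents for the downstream comparison. `build_districts` is a
    # hypothetical name for the function whose tail appears above.
    districts = build_districts(docs, clusterer=args.c)  # hypothetical helper
    for district in districts:
        docs_representes.append([docs[i] for i in district['representers']])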