Example #1
def load_dataset(exp):
    """Load the pickled datasets, vocabularies, and embedding matrix from the experiment directory."""
    dataset_train, dataset_val, dataset_test = load_pickle(exp.experiment_dir.joinpath('datasets.pkl'))
    vocab, style_vocab = load_pickle(exp.experiment_dir.joinpath('vocabs.pkl'))
    W_emb = load_pickle(exp.experiment_dir.joinpath('W_emb.pkl'))

    print(f'Dataset: {len(dataset_train)}, val: {len(dataset_val)}, test: {len(dataset_test)}')
    print(f'Vocab: {len(vocab)}, style vocab: {len(style_vocab)}')
    print(f'W_emb: {W_emb.shape}')

    return dataset_train, dataset_val, dataset_test, vocab, style_vocab, W_emb
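
All six examples call load_pickle, and Example #5 also calls dump_pickle; the utility module that defines them is not shown on this page. A minimal sketch of what such helpers typically look like, assuming they are plain wrappers around the standard pickle module:

import pickle


def load_pickle(filename):
    # Deserialize one object from a pickle file.
    with open(filename, 'rb') as f:
        return pickle.load(f)


def dump_pickle(obj, filename):
    # Serialize an object to a pickle file.
    with open(filename, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)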
Example #2
import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import torch
import torch.nn as nn
import torch.nn.functional as F
from preprocess import cutwords

output_dir = os.path.join(pwd(__file__), './output')
if __name__ == '__main__':
    N = 100  # 100 documents from each category take part in the comparison
    types = [
        'fraudsters', 'intentkill', 'thieves', 'rape', 'traffic', 'rob',
        'position', 'drug', 'damage'
    ]
    num = [N for _ in range(len(types))]
    data_pkl = os.path.join(pwd(__file__), './data/data.pkl')
    datas = load_pickle(data_pkl)
    documents = []  # documents used for the comparison
    for i, atype in enumerate(types):
        for apeople in datas:
            if apeople["type"] == atype and num[i] > 0:
                documents.append(apeople["note"])
                num[i] = num[i] - 1
                if num[i] == 0:
                    break
    print("用于对比的文本规模:", len(documents))

    # adata="乔元,56岁,游手好闲,抢劫他人手机,犯抢劫罪"        ##需要预测的数据
    adata = "14216黄明英,29岁,家庭教养方式不足,加入盗窃团伙,实施盗窃"  ##需要预测的数据
    documents.append(adata)
    # print(documents)
    docs = cutwords(documents)
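
Example #2 leans on two project helpers the snippet does not define: pwd, which appears to resolve the directory of the current file, and cutwords from preprocess, which segments the Chinese documents into tokens before vectorization. A hedged sketch of plausible implementations; the jieba-based segmentation is an assumption, not confirmed by the source:

import os

import jieba  # assumption: jieba is used for Chinese word segmentation


def pwd(file):
    # Assumed behavior: absolute directory containing the given file.
    return os.path.dirname(os.path.abspath(file))


def cutwords(documents):
    # Assumed behavior: segment each document into space-separated
    # tokens so that CountVectorizer can split on whitespace.
    return [' '.join(jieba.cut(doc)) for doc in documents]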
Example #3
    def _load_config(cls, experiment_dir):
        filename = experiment_dir.joinpath(Experiment._CONFIG_FILENAME)
        config = load_pickle(filename)

        return config
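
Example #3 is excerpted from inside an Experiment class: the decorator above the method and the _CONFIG_FILENAME attribute it references fall outside the snippet. A minimal sketch of the assumed surrounding class; the attribute's value is a placeholder:

class Experiment:
    _CONFIG_FILENAME = 'config.pkl'  # assumption: name of the pickled config

    @classmethod
    def _load_config(cls, experiment_dir):
        # experiment_dir is assumed to be a pathlib.Path, given .joinpath().
        filename = experiment_dir.joinpath(Experiment._CONFIG_FILENAME)
        return load_pickle(filename)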
Example #4
    args = parser.parse_args()

    if args.c not in ['kmeans', 'dp']:
        raise Exception('Illegal clustering method')

    if args.c == 'kmeans':
        clusterer = KMeansClusterer()
    else:
        clusterer = DPClusterer()

    # Tally the ground-truth clustering
    docs = load_pickle(docs_file)
    true_categories = set()
    for doc in docs:
        true_categories.add(doc['type'])
    print(true_categories)
    true_clusters = []
    for category in true_categories:
        cluster = []
        for globalidx, doc in enumerate(docs):
            if doc['type'] == category:
                cluster.append(globalidx)

        true_clusters.append(cluster)   ### each true category maps to the list of ids of all documents in that category
    representer_file = os.path.join(
        output_dir, "{}_representer.pkl".format(args.c))
    districts_file = os.path.join(
Example #5
    # housekeeping: make sure the output and log directories exist
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    if not os.path.isdir(log_dir):
        os.mkdir(log_dir)

    if args.p:
        logger.info("Multiple process mode")
    else:
        logger.info("Single process mode")

    data_pkl = os.path.join(pwd(__file__), './data/data.pkl')

    if os.path.isfile(data_pkl):
        logger.info("Exsiting pkl,loading...")
        datas = load_pickle(data_pkl)
    else:
        logger.info("Loading from json")
        loader = DataLoader()
        datas = loader()
        logger.info("Shuffle data")
        random.shuffle(datas)
        random.shuffle(datas)
        logger.info("Serialize data")
        dump_pickle(datas, data_pkl)

    logger.info("Loaded data {}".format(len(datas)))
    # data_per_worker = int(1e2)*6
    data_per_worker = int(args.s)
    num_worker = round(len(datas) / data_per_worker)
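
The snippet stops after computing num_worker, so the actual split is not shown. For illustration only, one common way to materialize such a partition; this continuation is hypothetical and not part of the original code:

# Hypothetical continuation: slice datas into per-worker chunks.
# Note that round() can make num_worker * data_per_worker differ from
# len(datas), so the last chunk may be short or a small tail may remain.
chunks = [
    datas[i * data_per_worker:(i + 1) * data_per_worker]
    for i in range(num_worker)
]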
Example #6
                representers = voters[:n_representers]

            districts.append({"voters": voters, "representers": representers})

    voters = []
    for district in districts:
        voters.extend(district['voters'])

    logger.info("{} districts in total".format(len(districts)))

    return districts


if __name__ == "__main__":
    logger = defaultlogger
    datas = load_pickle(docs_file)
    docs = [w['note'] for w in datas]
    argparser = argparse.ArgumentParser()
    argparser.add_argument("-c",
                           help="clusterer:'kmeans' or 'dp'",
                           type=str,
                           default='kmeans')
    args = argparser.parse_args()

    if args.c not in ['kmeans', 'dp']:
        raise Exception("Illegal args")

    logger.info('{} democracy clustering'.format(args.c))

    docs_representers = []