Пример #1
0
def __20ng_classification():
    """Run the SVM classification job on the 20ng_bydate document vectors.

    Steps, in order:
      1. ``dataarange.split_vecs`` is called with the full vector file, the
         split-label file and the train/test vector file names.
      2. ``doc_classification_svm`` is called with the train vectors, train
         labels and test vectors; its return value is kept as predictions.
      3. The predictions are passed to ``get_scores_label_file`` together
         with the test label file, then to ``save_labels``.

    All paths are hard-coded relative to the dataset root below.
    """
    root = 'e:/data/emadr/20ng_bydate'

    def under(rel_path):
        # Resolve a path relative to the dataset root.
        return os.path.join(root, rel_path)

    full_vecs = under('vecs/dew-vecs-0_8-50.bin')
    split_labels = under('bindata/dataset-split-labels.bin')
    train_vecs = under('bindata/train-dedw-vecs.bin')
    test_vecs = under('bindata/test-dedw-vecs.bin')
    train_labels = under('bindata/train-labels.bin')
    test_labels = under('bindata/test-labels.bin')
    pred_out = under('bindata/ypred-emadr.bin')

    dataarange.split_vecs(full_vecs, split_labels, train_vecs, test_vecs,
                          train_label=0, test_label=2)

    # NOTE(review): the meaning of the trailing 0 and -1 is defined by
    # doc_classification_svm, which is not visible here.
    predictions = doc_classification_svm(train_vecs, train_labels, test_vecs,
                                         0, -1)
    get_scores_label_file(test_labels, predictions)
    save_labels(predictions, pred_out)
Пример #2
0
def __job_text_vecs_to_bin_classification():
    """Convert RSM text vectors to a binary file, then split and classify.

    ``text_vecs_to_bin`` is called first on the ``rsm-hidden-<minoc>.txt``
    file; the produced ``.bin`` file is then handed to
    ``dataarange.split_vecs``, and the train/test vector files are passed to
    ``doc_classification_svm``.  The predictions are finally given to
    ``get_scores_label_file`` together with the test label file.
    """
    root = 'e:/data/emadr/nyt-less-docs/world'
    minoc = 100

    def under(rel_path):
        # Resolve a path relative to the dataset root.
        return os.path.join(root, rel_path)

    # Stage 1: text vectors -> binary vector file.
    src_text_vecs = under('rsm/rsm-hidden-%d.txt' % minoc)
    bin_vecs = under('rsm/rsm-vecs-%d.bin' % minoc)
    text_vecs_to_bin(src_text_vecs, bin_vecs)

    # Stage 2: split the binary vectors and classify.
    split_labels = under('bindata/dataset-split-labels.bin')
    train_vecs = under('rsm/train-rsm-vecs.bin')
    test_vecs = under('rsm/test-rsm-vecs.bin')
    train_labels = under('bindata/train-labels.bin')
    test_labels = under('bindata/test-labels.bin')

    dataarange.split_vecs(bin_vecs, split_labels, train_vecs, test_vecs,
                          train_label=0, test_label=2)

    # NOTE(review): the meaning of the trailing 0 and -1 is defined by
    # doc_classification_svm, which is not visible here.
    predictions = doc_classification_svm(train_vecs, train_labels, test_vecs,
                                         0, -1)
    get_scores_label_file(test_labels, predictions)
Пример #3
0
def __job_split_vecs():
    """Call ``dataarange.split_vecs`` for the nyt-less-docs/business vectors.

    The full vector file and the split-label file are read from the dataset
    root; the train and test vector file names are passed as the third and
    fourth arguments, with ``train_label=0`` and ``test_label=2``.
    """
    root = 'e:/data/emadr/nyt-less-docs/business/'

    # (full vectors, split labels, train vectors, test vectors), in the
    # positional order split_vecs is called with.
    relative_paths = (
        'vecs/dew-vecs-100-0_15-40.bin',
        'bindata/dataset-split-labels.bin',
        'vecs/train-dedw-vecs.bin',
        'vecs/test-dedw-vecs.bin',
    )
    full_vecs, split_labels, train_vecs, test_vecs = [
        os.path.join(root, rel) for rel in relative_paths
    ]

    dataarange.split_vecs(full_vecs, split_labels, train_vecs, test_vecs,
                          train_label=0, test_label=2)
Пример #4
0
def __job_train_classification():
    """Train doc2vec vectors over a parameter grid and log SVM scores.

    For every combination of ``min_counts`` x ``def_alphas`` x ``nss`` this:
      1. calls ``train_doc_vectors`` on the tokenized 20ng_bydate documents,
      2. stores the vectors with ``save_doc2vec_vectors``,
      3. calls ``dataarange.split_vecs`` with the train/test file names,
      4. calls ``doc_classification_svm`` and passes its predictions to
         ``get_scores_label_file``,
      5. prints the parameters and appends parameters plus the four returned
         scores to the results file.

    The results file is handled by a ``with`` block so it is closed even when
    training or classification raises part-way through the grid.
    """
    docs_file = 'e:/data/emadr/20ng_bydate/tokenizedlc/docs-tokenized-lc-2.txt'
    # Switch to 'pvdm' to train the other model variant.
    method = 'pvdbow'
    dm = 1 if method == 'pvdm' else 0
    dst_vecs_file = 'e:/data/emadr/20ng_bydate/bindata/%s-vecs.bin' % method

    # NOTE(review): the results file name says "pvdm" regardless of `method`;
    # confirm whether that is intended before relying on the file name.
    dst_result_file = 'e:/data/emadr/20ng_bydate/pvdm-results.txt'

    split_labels_file_name = 'e:/data/emadr/20ng_bydate/bindata/dataset-split-labels.bin'
    train_vecs_file_name = 'e:/data/emadr/20ng_bydate/bindata/train-%s-vecs.bin' % method
    test_vecs_file_name = 'e:/data/emadr/20ng_bydate/bindata/test-%s-vecs.bin' % method

    data_dir = 'e:/data/emadr/20ng_bydate/bindata/'
    train_label_file = os.path.join(data_dir, 'train-labels.bin')
    test_label_file = os.path.join(data_dir, 'test-labels.bin')

    # Wider grids tried previously: min_counts = [2, 5, 10, 20],
    # def_alphas = [0.1, 0.01, 0.001], nss = [0, 5, 10, 15].
    min_counts = [5]
    def_alphas = [0.01]
    nss = [0]
    niters = 100

    # Binary mode is kept from the original so the written bytes are the same
    # as before (no newline translation).
    with open(dst_result_file, 'wb') as fout:
        for min_count in min_counts:
            for def_alpha in def_alphas:
                for ns in nss:
                    model = train_doc_vectors(docs_file, min_count=min_count, def_alpha=def_alpha,
                                              ns=ns, dm=dm, niter=niters)
                    save_doc2vec_vectors(model, dst_vecs_file)
                    dataarange.split_vecs(dst_vecs_file, split_labels_file_name,
                                          train_vecs_file_name, test_vecs_file_name,
                                          train_label=0, test_label=2)

                    # NOTE(review): the meaning of the trailing 0 and -1 is
                    # defined by doc_classification_svm, not visible here.
                    y_pred_test = doc_classification_svm(train_vecs_file_name, train_label_file,
                                                         test_vecs_file_name, 0, -1)
                    acc, prec, recall, f1 = get_scores_label_file(test_label_file, y_pred_test)

                    # Single-argument parenthesised form: same output as the
                    # print statement, and valid syntax on Python 2 and 3.
                    print('%d\t%f\t%d\t%d' % (min_count, def_alpha, ns, niters))
                    fout.write('%d\t%f\t%d\n' % (min_count, def_alpha, ns))
                    fout.write('%f\t%f\t%f\t%f\n' % (acc, prec, recall, f1))
                    # Flush per configuration so partial results survive a
                    # later failure.
                    fout.flush()
Пример #5
0
def __train_pv_20ng():
    """Train paragraph vectors on 20ng_bydate, save them and split them.

    ``train_doc_vectors`` is called on the tokenized documents with the
    fixed parameters below; the model is handed to ``save_doc2vec_vectors``
    and the saved vector file is then passed to ``dataarange.split_vecs``
    along with the train/test vector file names.
    """
    method = 'pvdm'
    docs_file = 'e:/data/emadr/20ng_bydate/tokenizedlc/docs-tokenized-lc-2.txt'
    vecs_out = 'e:/data/emadr/20ng_bydate/bindata/%s-vecs.bin' % method
    split_labels = 'e:/data/emadr/20ng_bydate/bindata/dataset-split-labels.bin'
    train_vecs = 'e:/data/emadr/20ng_bydate/bindata/train-%s-vecs.bin' % method
    test_vecs = 'e:/data/emadr/20ng_bydate/bindata/test-%s-vecs.bin' % method

    # Keyword arguments forwarded to train_doc_vectors.
    train_params = {
        'min_count': 5,
        'def_alpha': 0.01,
        'ns': 0,
        'dm': int(method == 'pvdm'),  # 1 for 'pvdm', 0 otherwise
        'niter': 40,
    }

    trained = train_doc_vectors(docs_file, **train_params)
    save_doc2vec_vectors(trained, vecs_out)

    dataarange.split_vecs(vecs_out, split_labels, train_vecs, test_vecs,
                          train_label=0, test_label=2)
Пример #6
0
def __nyt_classification():
    """Run the SVM classification job on the nyt-less-docs/world vectors.

    Calls ``dataarange.split_vecs`` with the full vector file and the
    train/test vector file names, then ``doc_classification_svm`` with the
    train vectors, train labels and test vectors, and finally passes the
    predictions to ``get_scores_label_file`` with the test label file.
    """
    root = 'f:/data/emadr/nyt-less-docs/world/'

    def under(rel_path):
        # Resolve a path relative to the dataset root.
        return os.path.join(root, rel_path)

    full_vecs = under('vecs/dew-vecs-0_9-40.bin')
    split_labels = under('bindata/dataset-split-labels.bin')
    train_vecs = under('vecs/train-dedw-vecs.bin')
    test_vecs = under('vecs/test-dedw-vecs.bin')
    train_labels = under('bindata/train-labels.bin')
    test_labels = under('bindata/test-labels.bin')

    dataarange.split_vecs(full_vecs, split_labels, train_vecs, test_vecs,
                          train_label=0, test_label=2)

    # NOTE(review): the meaning of the trailing 0 and -1 is defined by
    # doc_classification_svm, which is not visible here.
    predictions = doc_classification_svm(train_vecs, train_labels, test_vecs,
                                         0, -1)
    get_scores_label_file(test_labels, predictions)
Пример #7
0
def __train_pv_nyt():
    """Train paragraph vectors on nyt-less-docs/business, save and split.

    ``train_doc_vectors`` is called on the tokenized documents with the
    fixed parameters below; the model is handed to ``save_doc2vec_vectors``
    and the saved vector file is then passed to ``dataarange.split_vecs``
    along with the train/test vector file names.
    """
    method = 'pvdm'
    root = 'e:/data/emadr/nyt-less-docs/business/'

    def under(rel_path):
        # Resolve a path relative to the dataset root.
        return os.path.join(root, rel_path)

    docs_file = under('tokenizedlc/docs-tokenized-lc-2.txt')
    vecs_out = under('bindata/%s-vecs.bin' % method)
    split_labels = under('bindata/dataset-split-labels.bin')
    train_vecs = under('bindata/train-%s-vecs.bin' % method)
    test_vecs = under('bindata/test-%s-vecs.bin' % method)

    # Keyword arguments forwarded to train_doc_vectors.
    train_params = {
        'min_count': 5,
        'def_alpha': 0.01,
        'ns': 0,
        'dm': int(method == 'pvdm'),  # 1 for 'pvdm', 0 otherwise
        'niter': 40,
    }

    trained = train_doc_vectors(docs_file, **train_params)
    save_doc2vec_vectors(trained, vecs_out)

    dataarange.split_vecs(vecs_out, split_labels, train_vecs, test_vecs,
                          train_label=0, test_label=2)
Пример #8
0
def __nyt_classification():
    """Run the LR classification job on the nyt-less-docs/arts RSM vectors.

    Calls ``dataarange.split_vecs`` with the full RSM vector file and the
    train/test vector file names, then ``doc_classification_lr`` with the
    train vectors, train labels, test vectors and test labels.  The return
    value of ``doc_classification_lr`` is not used.
    """
    root = 'e:/data/emadr/nyt-less-docs/arts'

    def under(rel_path):
        # Resolve a path relative to the dataset root.
        return os.path.join(root, rel_path)

    full_vecs = under('rsm/rsm-vecs-70.bin')
    split_labels = under('bindata/dataset-split-labels.bin')
    train_vecs = under('rsm/train-rsm-vecs.bin')
    test_vecs = under('rsm/test-rsm-vecs.bin')
    train_labels = under('bindata/train-labels.bin')
    test_labels = under('bindata/test-labels.bin')

    dataarange.split_vecs(full_vecs, split_labels, train_vecs, test_vecs,
                          train_label=0, test_label=2)

    # NOTE(review): the meaning of the trailing 0 and -1 is defined by
    # doc_classification_lr, which is not visible here.
    doc_classification_lr(train_vecs, train_labels, test_vecs, test_labels,
                          0, -1)
Пример #9
0
def __classification():
    """Split, classify with SVM and save predictions for one PV method.

    Using the ``<method>-vecs.bin`` file in the 20ng_bydate ``bindata``
    directory, this calls ``dataarange.split_vecs``, then
    ``doc_classification_svm``, passes the predictions to
    ``get_scores_label_file`` and stores them with ``ioutils.save_labels``
    as ``ypred-<method>.bin``.
    """
    root = 'e:/data/emadr/20ng_bydate/bindata/'
    # Switch to 'pvdm' to evaluate the other model variant.
    method = 'pvdbow'

    def under(file_name):
        # Resolve a file name inside the data directory.
        return os.path.join(root, file_name)

    full_vecs = under('%s-vecs.bin' % method)
    split_labels = under('dataset-split-labels.bin')
    train_vecs = under('train-%s-vecs.bin' % method)
    test_vecs = under('test-%s-vecs.bin' % method)
    train_labels = under('train-labels.bin')
    test_labels = under('test-labels.bin')
    pred_out = under('ypred-%s.bin' % method)

    dataarange.split_vecs(full_vecs, split_labels, train_vecs, test_vecs,
                          train_label=0, test_label=2)

    # NOTE(review): the meaning of the trailing 0 and -1 is defined by
    # doc_classification_svm, which is not visible here.
    predictions = doc_classification_svm(train_vecs, train_labels, test_vecs,
                                         0, -1)
    get_scores_label_file(test_labels, predictions)
    ioutils.save_labels(predictions, pred_out)