Example 1
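None of these snippets include their imports. A plausible shared preamble is sketched below, assuming `fh` refers to a small project helper module (e.g. Scholar's `file_handling`) that provides `load_sparse`, `read_json`, and `read_text`; later examples additionally rely on pathlib.Path, shutil, argparse, datetime, tqdm, torch, and pandas.

import os
import sys

import numpy as np
import pandas as pd

import file_handling as fh  # assumed helper exposing load_sparse(), read_json(), read_text()
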
def load_data(input_dir, input_prefix, log_file, vocab=None):
    print("Loading data")
    temp = fh.load_sparse(os.path.join(input_dir,
                                       input_prefix + '.npz')).todense()
    X = np.array(temp, dtype='float32')
    if vocab is None:
        vocab = fh.read_json(
            os.path.join(input_dir, input_prefix + '.vocab.json'))
    #lists_of_indices = fh.read_json(os.path.join(input_dir, input_prefix + '.indices.json'))
    #index_arrays = [np.array(l, dtype='int32') for l in lists_of_indices]
    n_items, vocab_size = X.shape
    #print(n_items, len(index_arrays))
    assert vocab_size == len(vocab)
    #assert n_items == len(index_arrays)
    print(X[0, :])

    label_file = os.path.join(input_dir, input_prefix + '.labels.npz')
    if os.path.exists(label_file):
        print("Loading labels")
        temp = fh.load_sparse(label_file).todense()
        labels = np.array(temp, dtype='float32')
    else:
        print("Label file not found")
        labels = np.zeros([n_items, 1], dtype='float32')
    assert len(labels) == n_items

    counts_sum = X.sum(axis=0)
    order = list(np.argsort(counts_sum).tolist())
    order.reverse()
    print("Most common words: ", ' '.join([vocab[i] for i in order[:10]]))

    return X, vocab, labels
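
A minimal usage sketch for this loader, assuming a hypothetical data/ directory containing train.npz, train.vocab.json, and optionally train.labels.npz:

# hypothetical paths; load_data expects <prefix>.npz and <prefix>.vocab.json inside input_dir
X, vocab, labels = load_data('data', 'train', log_file=None)
print(X.shape, len(vocab), labels.shape)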
Example 2
def load_word_counts(input_dir, input_prefix, vocab=None):
    print("Loading data")
    # load the word counts and keep them as a sparse CSR matrix (the dense conversion below is left commented out)
    #temp = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).todense()
    #X = np.array(temp, dtype='float32')
    X = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).tocsr()
    # load the vocabulary
    if vocab is None:
        vocab = fh.read_json(os.path.join(input_dir, input_prefix + '.vocab.json'))
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    ids = fh.read_json(os.path.join(input_dir, input_prefix + '.ids.json'))

    # filter out empty documents and return a boolean selector for filtering labels and covariates
    #row_selector = np.array(X.sum(axis=1) > 0, dtype=bool)
    row_sums = np.array(X.sum(axis=1)).reshape((n_items,))
    row_selector = np.array(row_sums > 0, dtype=bool)

    print("Found %d non-empty documents" % np.sum(row_selector))
    X = X[row_selector, :]
    ids = [doc_id for i, doc_id in enumerate(ids) if row_selector[i]]

    return X, vocab, row_selector, ids
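
The returned row_selector is a boolean mask over the original documents, so the caller can filter label or covariate arrays that were indexed the same way. A small sketch, assuming a hypothetical labels array aligned with the unfiltered matrix:

X, vocab, row_selector, ids = load_word_counts('data', 'train')  # hypothetical paths
labels = np.load('data/train_labels.npy')                        # hypothetical; one row per original document
labels = labels[row_selector]                                    # keep labels only for non-empty documents
assert labels.shape[0] == X.shape[0]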
Example 3
def load_and_compute_npmi(topics_file,
                          ref_vocab_file,
                          ref_counts_file,
                          n_vals,
                          cols_to_skip=0,
                          output_file=None):
    print("Loading reference counts")
    ref_vocab = fh.read_json(ref_vocab_file)
    ref_counts = fh.load_sparse(ref_counts_file).tocsc()
    compute_npmi(topics_file, ref_vocab, ref_counts, n_vals, cols_to_skip,
                 output_file)
Example 4
def load_data(input_dir: str,
              input_prefix: str,
              vocab_size=None,
              vocab=None,
              col_sel=None):
    print("Loading data")
    temp = fh.load_sparse(os.path.join(input_dir,
                                       input_prefix + '.npz')).todense()
    n_items, temp_size = temp.shape
    print("Loaded %d documents with %d features" % (n_items, temp_size))

    if vocab is None:
        col_sel = None
        vocab = fh.read_json(
            os.path.join(input_dir, input_prefix + '.vocab.json'))
        # filter vocabulary by word frequency
        if vocab_size is not None:
            print("Filtering vocabulary to the most common %d terms" %
                  int(vocab_size))
            col_sums = np.array(temp.sum(axis=0)).reshape((len(vocab), ))
            order = list(np.argsort(col_sums))
            order.reverse()
            col_sel = np.array(np.zeros(len(vocab)), dtype=bool)
            for i in range(int(vocab_size)):
                col_sel[order[i]] = True
            temp = temp[:, col_sel]
            vocab = [word for i, word in enumerate(vocab) if col_sel[i]]

    elif col_sel is not None:
        print("Using given vocabulary")
        temp = temp[:, col_sel]

    X = np.array(temp, dtype='float32')
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    # filter out empty documents
    non_empty_sel = X.sum(axis=1) > 0
    print("Found %d non-empty documents" % np.sum(non_empty_sel))
    X = X[non_empty_sel, :]

    counts_sum = X.sum(axis=0)
    order = list(np.argsort(counts_sum).tolist())
    order.reverse()
    print("Most common words: ", ' '.join([vocab[i] for i in order[:10]]))
    # the 200 most common words, returned alongside X, vocab, and col_sel
    num = [vocab[i] for i in order[:200]]
    return X, vocab, col_sel, num
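
Because the function returns both the (possibly filtered) vocabulary and the column selector, a second split can be loaded into exactly the same feature space. A sketch, assuming train and test share a directory and file-naming scheme:

# first call filters the vocabulary to the 2000 most frequent training terms (hypothetical size)
train_X, vocab, col_sel, top_words = load_data('data', 'train', vocab_size=2000)
# second call reuses that vocabulary and column selection so test features line up with train
test_X, _, _, _ = load_data('data', 'test', vocab=vocab, col_sel=col_sel)
assert train_X.shape[1] == test_X.shape[1]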
Example 5
def load_word_counts(input_dir, input_prefix, vocab=None):
    print("Loading data")
    # load the word counts and convert to a dense matrix
    temp = fh.load_sparse(os.path.join(input_dir,
                                       input_prefix + '.npz')).todense()
    X = np.array(temp, dtype='float32')
    # load the vocabulary
    if vocab is None:
        vocab = fh.read_json(
            os.path.join(input_dir, input_prefix + '.vocab.json'))
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    # filter out empty documents and return a boolean selector for filtering labels and covariates
    row_selector = X.sum(axis=1) > 0
    print("Found %d non-empty documents" % np.sum(row_selector))
    X = X[row_selector, :]

    return X, vocab, row_selector
Example 6
def load_data(input_dir, input_prefix, label_file_name=None, covar_file_names=None, vocab_size=None, vocab=None, col_sel=None):
    print("Loading data")
    temp = fh.load_sparse(os.path.join(input_dir, input_prefix + '.npz')).todense()
    n_items, temp_size = temp.shape
    print("Loaded %d documents with %d features" % (n_items, temp_size))

    if vocab is None:
        col_sel = None
        vocab = fh.read_json(os.path.join(input_dir, input_prefix + '.vocab.json'))
        # filter vocabulary by word frequency
        if vocab_size is not None:
            print("Filtering vocabulary to the most common %d terms" % int(vocab_size))
            col_sums = np.array(temp.sum(axis=0)).reshape((len(vocab), ))
            order = list(np.argsort(col_sums))
            order.reverse()
            col_sel = np.array(np.zeros(len(vocab)), dtype=bool)
            for i in range(int(vocab_size)):
                col_sel[order[i]] = True
            temp = temp[:, col_sel]
            vocab = [word for i, word in enumerate(vocab) if col_sel[i]]

    elif col_sel is not None:
        print("Using given vocabulary")
        temp = temp[:, col_sel]

    X = np.array(temp, dtype='float32')
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    # filter out empty documents
    non_empty_sel = X.sum(axis=1) > 0
    print("Found %d non-empty documents" % np.sum(non_empty_sel))
    X = X[non_empty_sel, :]
    n_items, vocab_size = X.shape

    if label_file_name is not None:
        label_file = os.path.join(input_dir, input_prefix + '.' + label_file_name + '.csv')
        if os.path.exists(label_file):
            print("Loading labels from", label_file)
            temp = pd.read_csv(label_file, header=0, index_col=0)
            label_names = temp.columns
            if 'NA' in label_names:
                na_label_index = list(label_names).index('NA')
            else:
                na_label_index = len(label_names) + 1
            labels = np.array(temp.values)
            labels = labels[non_empty_sel, :]
            n, n_labels = labels.shape
            assert n == n_items
            print("%d labels" % n_labels)
        else:
            print("Label file not found:", label_file)
            sys.exit()
        if (np.sum(labels, axis=1) == 1).all() and (np.sum(labels == 0) + np.sum(labels == 1) == labels.size):
            label_type = 'categorical'
        elif np.sum(labels == 0) + np.sum(labels == 1) == labels.size:
            label_type = 'bernoulli'
        else:
            label_type = 'real'
        print("Found labels of type %s" % label_type)

    else:
        labels = None
        label_names = None
        label_type = None
        na_label_index = None

    if covar_file_names is not None:
        covariate_list = []
        covariate_names_list = []
        covar_file_names = covar_file_names.split(',')
        for covar_file_name in covar_file_names:
            covariates_file = os.path.join(input_dir, input_prefix + '.' + covar_file_name + '.csv')
            if os.path.exists(covariates_file):
                print("Loading covariates from", covariates_file)
                temp = pd.read_csv(covariates_file, header=0, index_col=0)
                covariate_names = temp.columns
                covariates = np.array(temp.values, dtype=np.float32)
                covariates = covariates[non_empty_sel, :]
                n, n_covariates = covariates.shape
                assert n == n_items
                covariate_list.append(covariates)
                covariate_names_list.extend(covariate_names)
            else:
                print("Covariates file not found:", covariates_file)
                sys.exit()
        covariates = np.hstack(covariate_list)
        covariate_names = covariate_names_list
        n, n_covariates = covariates.shape

        if (np.sum(covariates, axis=1) == 1).all() and (np.sum(covariates == 0) + np.sum(covariates == 1) == covariates.size):
            covariates_type = 'categorical'
        else:
            covariates_type = 'other'

        print("Found covariates of type %s" % covariates_type)

        assert n == n_items
        print("%d covariates" % n_covariates)
    else:
        covariates = None
        covariate_names = None
        covariates_type = None

    counts_sum = X.sum(axis=0)
    order = list(np.argsort(counts_sum).tolist())
    order.reverse()
    print("Most common words: ", ' '.join([vocab[i] for i in order[:10]]))

    return X, vocab, labels, label_names, na_label_index, label_type, covariates, covariate_names, covariates_type, col_sel
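
The label-type heuristic can be read off a small example: rows that are one-hot (each sums to 1, all values 0/1) are treated as categorical, rows that are 0/1 but not one-hot are bernoulli, and anything else is real-valued. A toy illustration of the same checks, outside the loader:

one_hot   = np.array([[1, 0, 0], [0, 1, 0]])   # categorical: each row sums to 1, all values 0/1
multi_hot = np.array([[1, 1, 0], [0, 1, 1]])   # bernoulli: 0/1 values but rows may sum to more than 1
scores    = np.array([[0.2, 3.5, 1.0]])        # real: arbitrary values

for labels in (one_hot, multi_hot, scores):
    binary = np.sum(labels == 0) + np.sum(labels == 1) == labels.size
    if (np.sum(labels, axis=1) == 1).all() and binary:
        print('categorical')
    elif binary:
        print('bernoulli')
    else:
        print('real')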
Example 7
def load_data(input_dir,
              input_prefix,
              label_file_name=None,
              covar_file_names=None,
              vocab=None):
    print("Loading data")
    temp = fh.load_sparse(os.path.join(input_dir,
                                       input_prefix + '.npz')).todense()
    X = np.array(temp, dtype='float32')
    if vocab is None:
        vocab = fh.read_json(
            os.path.join(input_dir, input_prefix + '.vocab.json'))
    n_items, vocab_size = X.shape
    assert vocab_size == len(vocab)
    print("Loaded %d documents with %d features" % (n_items, vocab_size))

    # filter out empty documents
    non_empty_sel = X.sum(axis=1) > 0
    print("Found %d non-empty documents" % np.sum(non_empty_sel))
    X = X[non_empty_sel, :]
    n_items, vocab_size = X.shape

    if label_file_name is not None:
        label_file = os.path.join(
            input_dir, input_prefix + '.' + label_file_name + '.csv')
        if os.path.exists(label_file):
            print("Loading labels from", label_file)
            temp = pd.read_csv(label_file, header=0, index_col=0)
            label_names = temp.columns
            labels = np.array(temp.values)
            labels = labels[non_empty_sel, :]
            n, n_labels = labels.shape
            assert n == n_items
            print("%d labels" % n_labels)
        else:
            print("Label file not found:", label_file)
            sys.exit()
        if (np.sum(labels, axis=1) == 1).all() and (np.sum(labels == 0) + np.sum(labels == 1) == labels.size):
            label_type = 'categorical'
        elif np.sum(labels == 0) + np.sum(labels == 1) == labels.size:
            label_type = 'bernoulli'
        else:
            label_type = 'real'
        print("Found labels of type %s" % label_type)

    else:
        labels = None
        label_names = None
        label_type = None

    if covar_file_names is not None:
        covariate_list = []
        covariate_names_list = []
        covar_file_names = covar_file_names.split(',')
        for covar_file_name in covar_file_names:
            covariates_file = os.path.join(
                input_dir, input_prefix + '.' + covar_file_name + '.csv')
            if os.path.exists(covariates_file):
                print("Loading covariates from", covariates_file)
                temp = pd.read_csv(covariates_file, header=0, index_col=0)
                covariate_names = temp.columns
                covariates = np.array(temp.values, dtype=np.float32)
                covariates = covariates[non_empty_sel, :]
                n, n_covariates = covariates.shape
                assert n == n_items
                covariate_list.append(covariates)
                covariate_names_list.extend(covariate_names)
            else:
                print("Covariates file not found:", covariates_file)
                sys.exit()
        covariates = np.hstack(covariate_list)
        covariate_names = covariate_names_list
        n, n_covariates = covariates.shape

        if (np.sum(covariates, axis=1) == 1).all() and (np.sum(covariates == 0) + np.sum(covariates == 1) == covariates.size):
            covariates_type = 'categorical'
        else:
            covariates_type = 'other'

        print("Found covariates of type %s" % covariates_type)

        assert n == n_items
        print("%d covariates" % n_covariates)
    else:
        covariates = None
        covariate_names = None
        covariates_type = None

    counts_sum = X.sum(axis=0)
    order = list(np.argsort(counts_sum).tolist())
    order.reverse()
    print("Most common words: ", ' '.join([vocab[i] for i in order[:10]]))

    return X, vocab, labels, label_names, label_type, covariates, covariate_names, covariates_type
    run_parser.add_argument("--dev-folds", type=int)
    run_parser.add_argument("--npmi-words", type=int, default=10)
    run_parser.add_argument("--min-acceptable-npmi", type=float, default=0.)
    run_parser.add_argument(
        "--ext-counts-fpath",
    )
    run_parser.add_argument(
        "--ext-vocab-fpath",
    )
    run_args, additional_args = run_parser.parse_known_args()

    outdir_parser = argparse.ArgumentParser()
    outdir_parser.add_argument("-o")
    outdir_args, _ = outdir_parser.parse_known_args(additional_args)

    nyt_counts = fh.load_sparse(run_args.ext_counts_fpath)
    nyt_vocab = fh.read_json(run_args.ext_vocab_fpath)
    
    np.random.seed(run_args.global_seed)
    run_seeds = iter([
        121958, 671155, 131932, 365838, 259178, 921881, 616685, 919314, 130398,
        5591, 11235, 2020, 19, 8000, 1001, 12345,
    ])
    
    # copy over code
    Path(outdir_args.o).mkdir(parents=True, exist_ok=True)
    shutil.copy("run_scholar.py", Path(outdir_args.o, "run_scholar.py"))
    shutil.copy("scholar.py", Path(outdir_args.o, "scholar.py"))

    if Path(outdir_args.o, "dev_metrics.csv").exists():
        old_path = Path(outdir_args.o, "dev_metrics.csv")
Example 9
def get_results_data(
        basedir,
        pattern,
        ignore_cols_with_same_vals=True,
        coherence_reference_dir="/fs/clip-political/scholar/congress_votes_dwnom"
):
    """
    Get the results data in folders matching `pattern` in `basedir`
    """
    dirs = [(p.name, p) for p in Path(basedir).glob(pattern) if p.is_dir()]

    ref_vocab = fh.read_json(Path(coherence_reference_dir, "train.vocab.json"))
    ref_counts = fh.load_sparse(Path(coherence_reference_dir,
                                     "test.npz")).tocsc()

    experiments = pd.DataFrame()
    column_names = []
    for run_name, run_dir in tqdm.tqdm(dirs):

        model_path = Path(run_dir, 'torch_model.pt')
        try:
            checkpoint = torch.load(model_path, map_location='cpu')
        except FileNotFoundError:
            continue

        npmi_internal = None
        try:
            topics = fh.read_text(Path(run_dir, "topic.txt"))
        except FileNotFoundError:
            print(
                f"topic.txt not found for {run_name}. Will not calculate npmi"
            )
        else:
            npmi_internal = compute_npmi_at_n(
                topics=topics,
                ref_vocab=ref_vocab,
                ref_counts=ref_counts,
                n=10,  # could change?
                silent=True,
            )

        model_time = datetime.fromtimestamp(
            model_path.stat().st_mtime).strftime('%Y-%m-%d %H:%M')
        run_data = {
            'run_name': run_name,
            'git_hash': checkpoint['git_hash'],
            'date': model_time,

            # hyperparameters
            **checkpoint['options'].__dict__,  # works if we switch to argparse as well

            # results
            'saved_at_epoch': checkpoint['epoch'],
            'accuracy_train': read_result_from_file(Path(run_dir, 'accuracy.train.txt')),
            'accuracy_dev': read_result_from_file(Path(run_dir, 'accuracy.dev.txt')),
            'accuracy_dev_from_chkpt': checkpoint['dev_metrics']['accuracy'],
            'accuracy_test': read_result_from_file(Path(run_dir, 'accuracy.test.txt')),
            'perplexity_dev': read_result_from_file(Path(run_dir, 'perplexity.dev.txt')),
            'perplexity_test': read_result_from_file(Path(run_dir, 'perplexity.test.txt')),
            'maw': read_result_from_file(Path(run_dir, 'maw.txt')),
        }

        # keep longest set of cols for data ordering (python>=3.6 keeps dict key order)
        if len(run_data.keys()) > len(column_names):
            column_names = list(run_data.keys())

        # DataFrame.append was removed in pandas 2.0; concat is the forward-compatible equivalent
        experiments = pd.concat([experiments, pd.DataFrame([run_data])], ignore_index=True)

    # reorder columns
    experiments = experiments[column_names]
    if ignore_cols_with_same_vals:
        # remove any columns where the values have not been altered run-to-run
        # see https://stackoverflow.com/a/39658662/5712749
        nunique_vals = experiments.apply(pd.Series.nunique)
        cols_to_drop = nunique_vals[nunique_vals <= 1].index
        experiments = experiments.drop(cols_to_drop, axis=1)

    return experiments.sort_values(by=['date'])
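
A usage sketch, with a hypothetical results directory and glob pattern; the coherence reference directory defaults to the hard-coded path above, so it would usually be overridden:

results = get_results_data(
    basedir='outputs',               # hypothetical directory of run folders
    pattern='scholar_k50_*',         # hypothetical naming scheme for runs
    coherence_reference_dir='data/congress_votes',  # hypothetical location of reference vocab/counts
)
results.to_csv('results_summary.csv', index=False)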