Example #1
def preprocess_data(train_infile, test_infile, output_dir, vocab_size, use_mallet_stopwords=False, replace_num=False, lemmatize=False, log_transform=False, keep_nonalphanum=False, only_alpha=False, min_length=1):

    print("Loading SpaCy")
    parser = English()
    train_X, train_vocab, train_indices, train_y, label_list, word_freqs, train_dat, train_mallet_strings, train_sage_output, train_svm_strings = load_and_process_data(train_infile, vocab_size, parser, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, lemmatize=lemmatize, log_transform=log_transform, keep_nonalphanum=keep_nonalphanum, only_alpha=only_alpha, min_length=min_length)
    test_X, _, test_indices, test_y, _, _, test_dat, test_mallet_strings, test_sage_output, test_svm_strings = load_and_process_data(test_infile, vocab_size, parser, vocab=train_vocab, label_list=label_list, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, lemmatize=lemmatize, log_transform=log_transform, keep_nonalphanum=keep_nonalphanum, only_alpha=only_alpha, min_length=min_length)
    fh.save_sparse(train_X, os.path.join(output_dir, 'train.npz'))
    fh.write_to_json(train_vocab, os.path.join(output_dir, 'train.vocab.json'))
    fh.write_to_json(train_indices, os.path.join(output_dir, 'train.indices.json'))
    fh.save_sparse(train_y, os.path.join(output_dir, 'train.labels.npz'))
    fh.save_sparse(test_X, os.path.join(output_dir, 'test.npz'))
    fh.write_to_json(test_indices, os.path.join(output_dir, 'test.indices.json'))
    fh.save_sparse(test_y, os.path.join(output_dir, 'test.labels.npz'))
    n_labels = len(label_list)
    label_dict = dict(zip(range(n_labels), label_list))
    fh.write_to_json(label_dict, os.path.join(output_dir, 'train.label_list.json'))
    fh.write_to_json(word_freqs.tolist(), os.path.join(output_dir, 'train.word_freq.json'))

    # save output for David Blei's lda-c code
    fh.write_list_to_text(train_dat, os.path.join(output_dir, 'train.dat'))
    fh.write_list_to_text(test_dat, os.path.join(output_dir, 'test.dat'))

    # save output for Mallet
    fh.write_list_to_text(train_mallet_strings, os.path.join(output_dir, 'train.mallet.txt'))
    fh.write_list_to_text(test_mallet_strings, os.path.join(output_dir, 'test.mallet.txt'))

    # save output for Jacob Eisenstein's SAGE code:
    train_sage_output['te_data'] = test_sage_output['tr_data']
    train_sage_output['te_aspect'] = test_sage_output['tr_aspect']
    savemat(os.path.join(output_dir, 'sage.mat'), train_sage_output)

    # save output in SVM format
    fh.write_list_to_text(train_svm_strings, os.path.join(output_dir, 'train.svm.txt'))
    fh.write_list_to_text(test_svm_strings, os.path.join(output_dir, 'test.svm.txt'))
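The function above is an entry point rather than a library routine. Below is a minimal sketch of how it might be driven, assuming the surrounding module already imports os, scipy.io's savemat, spaCy's English, the project's fh file-handling helpers, and load_and_process_data; the paths and vocabulary size are hypothetical placeholders, not values from the source.

# Hypothetical driver; all paths and sizes are placeholders.
if __name__ == '__main__':
    preprocess_data(
        train_infile='data/train.jsonlist',
        test_infile='data/test.jsonlist',
        output_dir='processed',
        vocab_size=2000,
        lemmatize=True,
        min_length=2,
    )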
Example #2
def preprocess_data(train_infile, output_dir, vocab_size, label_type, test_prop, use_mallet_stopwords=False, replace_num=False, group_size=1, only_alpha=False, min_length=3):

    print("Loading SpaCy")
    parser = English()

    with codecs.open(train_infile, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    n_items = len(lines)
    n_test = int(test_prop * n_items)
    n_train = n_items - n_test
    train_indices = np.random.choice(range(n_items), n_train, replace=False)
    test_indices = list(set(range(n_items)) - set(train_indices))

    train_X, train_vocab, train_indices, train_y, label_list, word_freqs, train_dat, train_mallet_strings, train_sage_output, train_svm_strings, label_index = load_and_process_data(train_infile, vocab_size, parser, label_type, train_indices, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, group_size=group_size, only_alpha=only_alpha, min_length=min_length)
    test_X, _, test_indices, test_y, _, _, test_dat, test_mallet_strings, test_sage_output, test_svm_strings, _ = load_and_process_data(train_infile, vocab_size, parser, label_type, test_indices, vocab=train_vocab, label_list=label_list, label_index=label_index, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, group_size=group_size, only_alpha=only_alpha, min_length=min_length)
    fh.save_sparse(train_X, os.path.join(output_dir, 'train.npz'))
    fh.write_to_json(train_vocab, os.path.join(output_dir, 'train.vocab.json'))
    fh.write_to_json(train_indices, os.path.join(output_dir, 'train.indices.json'))
    fh.save_sparse(train_y, os.path.join(output_dir, 'train.labels.npz'))
    fh.save_sparse(test_X, os.path.join(output_dir, 'test.npz'))
    fh.write_to_json(test_indices, os.path.join(output_dir, 'test.indices.json'))
    fh.save_sparse(test_y, os.path.join(output_dir, 'test.labels.npz'))
    fh.write_to_json(word_freqs.tolist(), os.path.join(output_dir, 'train.word_freq.json'))
    fh.write_list_to_text(train_dat, os.path.join(output_dir, 'train.dat'))
    n_labels = len(label_list)
    label_dict = dict(zip(range(n_labels), label_list))
    fh.write_to_json(label_dict, os.path.join(output_dir, 'train.label_list.json'))

    fh.write_list_to_text(train_mallet_strings, os.path.join(output_dir, 'train.mallet.txt'))
    fh.write_list_to_text(test_mallet_strings, os.path.join(output_dir, 'test.mallet.txt'))

    train_sage_output['te_data'] = test_sage_output['tr_data']
    train_sage_output['te_aspect'] = test_sage_output['tr_aspect']
    savemat(os.path.join(output_dir, 'sage.mat'), train_sage_output)

    fh.write_list_to_text(train_svm_strings, os.path.join(output_dir, 'train.svm.txt'))
    fh.write_list_to_text(test_svm_strings, os.path.join(output_dir, 'test.svm.txt'))
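Unlike Example #1, this variant carves its own held-out set from a single input file. The split logic is easy to check in isolation; here is a minimal sketch on a toy size, assuming only numpy (the numbers are illustrative).

import numpy as np

# toy version of the train/test split used above
n_items = 10
test_prop = 0.2
n_test = int(test_prop * n_items)          # 2 documents held out
n_train = n_items - n_test                 # 8 documents kept for training
train_indices = np.random.choice(range(n_items), n_train, replace=False)
test_indices = list(set(range(n_items)) - set(train_indices))
assert len(test_indices) == n_test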
Example #3
def process_subset(items, parsed, label_fields, label_lists, vocab, output_dir, output_prefix):
    n_items = len(items)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    ids = []
    for i, item in enumerate(items):
        if 'id' in item:
            ids.append(item['id'])
    if len(ids) != n_items:
        ids = [str(i) for i in range(n_items)]

    # create a label index using string representations
    for label_field in label_fields:
        label_list = label_lists[label_field]
        n_labels = len(label_list)
        label_list_strings = [str(label) for label in label_list]
        label_index = dict(zip(label_list_strings, range(n_labels)))

        # convert labels to a data frame
        if n_labels > 0:
            label_matrix = np.zeros([n_items, n_labels], dtype=int)
            label_vector = np.zeros(n_items, dtype=int)

            for i, item in enumerate(items):
                label = item[label_field]
                label_matrix[i, label_index[str(label)]] = 1
                label_vector[i] = label_index[str(label)]

            labels_df = pd.DataFrame(label_matrix, index=ids, columns=label_list_strings)
            labels_df.to_csv(os.path.join(output_dir, output_prefix + '.' + label_field + '.csv'))
            label_vector_df = pd.DataFrame(label_vector, index=ids, columns=[label_field])
            if n_labels == 2:
                label_vector_df.to_csv(os.path.join(output_dir, output_prefix + '.' + label_field + '_vector.csv'))

    rows = []
    cols = []
    vals = []

    dat_strings = []
    dat_labels = []
    mallet_strings = []
    fast_text_lines = []

    counter = Counter()
    word_counter = Counter()
    doc_lines = []
    print("Converting to count representations")
    for i, words in enumerate(parsed):
        # get the vocab indices of words that are in the vocabulary
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if len(counter.keys()) > 0:
            # record this document in each output format
            mallet_strings.append(str(i) + '\t' + 'en' + '\t' + ' '.join(word_subset))

            dat_string = str(int(len(counter))) + ' '
            dat_string += ' '.join([str(k) + ':' + str(int(v)) for k, v in zip(list(counter.keys()), list(counter.values()))])
            dat_strings.append(dat_string)

            # for the .dat format, assume just one label is given (the last label field)
            if len(label_fields) > 0:
                label = items[i][label_fields[-1]]
                dat_labels.append(str(label_index[str(label)]))

            rows.extend([i] * len(counter))
            token_indices = sorted(counter.keys())
            cols.extend(list(token_indices))
            vals.extend([counter[k] for k in token_indices])

    # convert to a sparse representation
    sparse_X = sparse.coo_matrix((vals, (rows, cols)), shape=(n_items, vocab_size)).tocsr()
    fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + '.npz'))

    print("Size of {:s} document-term matrix:".format(output_prefix), sparse_X.shape)

    fh.write_to_json(ids, os.path.join(output_dir, output_prefix + '.ids.json'))

    # save output for Mallet
    fh.write_list_to_text(mallet_strings, os.path.join(output_dir, output_prefix + '.mallet.txt'))

    # save output for David Blei's LDA/SLDA code
    fh.write_list_to_text(dat_strings, os.path.join(output_dir, output_prefix + '.data.dat'))
    if len(dat_labels) > 0:
        fh.write_list_to_text(dat_labels, os.path.join(output_dir, output_prefix + '.' + label_field + '.dat'))

    # save output for Jacob Eisenstein's SAGE code:
    #sparse_X_sage = sparse.csr_matrix(X, dtype=float)
    vocab_for_sage = np.zeros((vocab_size,), dtype=object)
    vocab_for_sage[:] = vocab

    # for SAGE, assume only a single label has been given
    if len(label_fields) > 0:
        # convert array to vector of labels for SAGE
        sage_aspect = np.argmax(np.array(labels_df.values, dtype=float), axis=1) + 1
    else:
        sage_aspect = np.ones([n_items, 1], dtype=float)
    sage_no_aspect = np.array([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    return sparse_X, sage_aspect, sage_no_aspect, widx, vocab_for_sage
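The heart of this version of process_subset is accumulating (row, column, value) triplets from per-document Counters and building a single sparse matrix at the end, which avoids allocating a dense n_items x vocab_size array. A self-contained sketch of that pattern on a toy vocabulary (scipy and collections only; the documents are made up):

from collections import Counter
from scipy import sparse

vocab = ['cat', 'dog', 'fish']
vocab_index = {w: j for j, w in enumerate(vocab)}
docs = [['cat', 'cat', 'dog'], ['fish'], ['dog', 'platypus']]   # 'platypus' is out of vocabulary

rows, cols, vals = [], [], []
for i, words in enumerate(docs):
    counter = Counter(vocab_index[w] for w in words if w in vocab_index)
    for j in sorted(counter):
        rows.append(i)
        cols.append(j)
        vals.append(counter[j])

X = sparse.coo_matrix((vals, (rows, cols)), shape=(len(docs), len(vocab))).tocsr()
print(X.toarray())   # [[2 1 0]
                     #  [0 0 1]
                     #  [0 1 0]]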
Example #4
def process_subset(items, parsed, label_field, label_list, vocab, output_dir,
                   output_prefix):
    n_items = len(items)
    n_labels = len(label_list)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    ids = []
    for i, item in enumerate(items):
        if 'id' in item:
            ids.append(item['id'])
    if len(ids) != n_items:
        ids = [str(i) for i in range(n_items)]

    # create a label index using string representations
    label_list_strings = [str(label) for label in label_list]
    label_index = dict(zip(label_list_strings, range(n_labels)))

    # convert labels to a data frame
    if n_labels > 0:
        label_matrix = np.zeros([n_items, n_labels], dtype=int)
        label_vector = np.zeros(n_items, dtype=int)

        for i, item in enumerate(items):
            label = item[label_field]
            label_matrix[i, label_index[str(label)]] = 1
            label_vector[i] = label_index[str(label)]

        labels_df = pd.DataFrame(label_matrix,
                                 index=ids,
                                 columns=label_list_strings)
        labels_df.to_csv(
            os.path.join(output_dir,
                         output_prefix + '.' + label_field + '.csv'))
        label_vector_df = pd.DataFrame(label_vector,
                                       index=ids,
                                       columns=[label_field])
        label_vector_df.to_csv(
            os.path.join(output_dir, output_prefix + '.label_vector.csv'))

    else:
        print("No labels found")

    X = np.zeros([n_items, vocab_size], dtype=int)

    dat_strings = []
    dat_labels = []
    mallet_strings = []
    fast_text_lines = []

    counter = Counter()
    word_counter = Counter()
    doc_lines = []
    print("Converting to count representations")
    for i, words in enumerate(parsed):
        # get the vocab indices of words that are in the vocabulary
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if len(counter.keys()) > 0:
            # record this document in each output format
            mallet_strings.append(
                str(i) + '\t' + 'en' + '\t' + ' '.join(word_subset))

            dat_string = str(int(len(counter))) + ' '
            dat_string += ' '.join([
                str(k) + ':' + str(int(v))
                for k, v in zip(list(counter.keys()), list(counter.values()))
            ])
            dat_strings.append(dat_string)

            if label_field is not None:
                label = items[i][label_field]
                dat_labels.append(str(label_index[str(label)]))

            values = list(counter.values())
            X[np.ones(len(counter.keys()), dtype=int) * i,
              list(counter.keys())] += values

    # convert to a sparse representation
    sparse_X = sparse.csr_matrix(X)
    fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + '.npz'))

    print("Size of {:s} document-term matrix:".format(output_prefix), sparse_X.shape)
    print("Number of non-empty documents:", len(dat_strings))

    fh.write_to_json(ids, os.path.join(output_dir,
                                       output_prefix + '.ids.json'))

    # save output for Mallet
    fh.write_list_to_text(
        mallet_strings, os.path.join(output_dir,
                                     output_prefix + '.mallet.txt'))

    # save output for David Blei's LDA/SLDA code
    fh.write_list_to_text(
        dat_strings, os.path.join(output_dir, output_prefix + '.data.dat'))
    if len(dat_labels) > 0:
        fh.write_list_to_text(
            dat_labels,
            os.path.join(output_dir,
                         output_prefix + '.' + label_field + '.dat'))

    # save output for Jacob Eisenstein's SAGE code:
    sparse_X_sage = sparse.csr_matrix(X, dtype=float)
    vocab_for_sage = np.zeros((vocab_size,), dtype=object)
    vocab_for_sage[:] = vocab
    if n_labels > 0:
        # convert array to vector of labels for SAGE
        sage_aspect = np.argmax(np.array(labels_df.values, dtype=float),
                                axis=1) + 1
    else:
        sage_aspect = np.ones([n_items, 1], dtype=float)
    sage_no_aspect = np.array([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    return sparse_X_sage, sage_aspect, sage_no_aspect, widx, vocab_for_sage
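This variant encodes the labels in two parallel forms: a one-hot matrix with one column per class and a single integer vector. A small sketch of that encoding with invented labels, assuming only numpy and pandas:

import numpy as np
import pandas as pd

label_list = ['neg', 'pos']                      # illustrative classes
label_index = {str(label): k for k, label in enumerate(label_list)}
items = [{'label': 'pos'}, {'label': 'neg'}, {'label': 'pos'}]

label_matrix = np.zeros([len(items), len(label_list)], dtype=int)
label_vector = np.zeros(len(items), dtype=int)
for i, item in enumerate(items):
    k = label_index[str(item['label'])]
    label_matrix[i, k] = 1                       # one-hot row
    label_vector[i] = k                          # integer class id

labels_df = pd.DataFrame(label_matrix, columns=[str(l) for l in label_list])
print(labels_df)        # columns 'neg' and 'pos' with 0/1 entries
print(label_vector)     # [1 0 1]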
Example #5
def main():
    usage = "%prog infile.txt output_dir output_prefix"
    parser = OptionParser(usage=usage)
    parser.add_option('-m', dest='max_lines', default=None,
                      help='Quit after processing this many lines (documents): default=%default')
    #parser.add_option('--lower', action="store_true", dest="lower", default=False,
    #                  help='Lower case words: default=%default')

    (options, args) = parser.parse_args()

    infile = args[0]
    output_dir = args[1]
    output_prefix = args[2]

    max_lines = options.max_lines
    if max_lines is not None:
        max_lines = int(max_lines)

    vocab = []
    vocab_index = {}

    counter = Counter()

    # start by converting each document into a dict of word counts, building a vocab as we go
    rows = []
    cols = []
    values = []
    n_docs = 0
    print("Counting words...")
    with codecs.open(infile, 'r', encoding='utf-8') as f:
        for line_i, line in enumerate(f):
            line = line.strip()
            if len(line) > 0:
                if max_lines is not None and line_i >= max_lines:
                    print("Quitting after processing %d lines" % line_i)
                    break
                if n_docs % 1000 == 0 and n_docs > 0:
                    print(n_docs)
                # split on white space
                words = line.split()
                # filter out everything that's not just letters, and lower case
                words = [
                    word.lower() for word in words
                    if re.match('^[a-zA-Z]*$', word) is not None
                ]
                # look for new words and add them to the vocabulary
                new_words = [word for word in words if word not in vocab_index]
                if len(new_words) > 0:
                    vocab_size = len(vocab)
                    #print("Adding %d words to vocab" % len(new_words))
                    #print("New total should be %d" % (vocab_size + len(new_words)))
                    vocab.extend(new_words)
                    vocab_index.update(
                        dict(
                            zip(new_words,
                                range(vocab_size,
                                      vocab_size + len(new_words)))))
                indices = [vocab_index[word] for word in words]
                counter.clear()
                counter.update(indices)
                keys = counter.keys()
                counts = counter.values()
                # use the running document count (not the raw line number) as the row index,
                # so blank lines don't produce out-of-range rows in the sparse matrix
                rows.extend([n_docs] * len(keys))
                cols.extend(keys)
                values.extend(counts)
                n_docs += 1

    print("Processed %d documents" % n_docs)
    print("Size of final vocab = %d" % len(vocab))
    print("Saving counts...")

    # now convert these count vectors in to a giant sparse matrix
    counts = sparse.coo_matrix((values, (rows, cols)),
                               shape=(n_docs, len(vocab)))
    fh.save_sparse(counts, os.path.join(output_dir, output_prefix + '.npz'))
    fh.write_to_json(vocab,
                     os.path.join(output_dir, output_prefix + '.vocab.json'))
    print("Done")
Example #6
def process_subset(
    items,
    ids,
    parsed,
    labels,
    label_fields,
    label_lists,
    vocab,
    output_dir,
    output_prefix,
    count_dtype=int,
):
    n_items = len(items)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    if not ids or len(ids) != n_items:
        ids = [str(i) for i in range(n_items)]

    # create a label index using string representations
    if labels:
        labels_df = pd.DataFrame.from_records(labels, index=ids)

        for label_field in label_fields:
            labels_df_subset = pd.get_dummies(labels_df[label_field])

            # for any classes not present in the subset, add 0 columns
            # (handles case where classes appear in only one of train or test)
            for category in label_lists[label_field]:
                if category not in labels_df_subset:
                    labels_df_subset[category] = 0

            labels_df_subset.to_csv(
                os.path.join(output_dir,
                             output_prefix + "." + label_field + ".csv"))
            if labels_df[label_field].nunique() == 2:
                labels_df_subset.iloc[:, 1].to_csv(
                    os.path.join(
                        output_dir,
                        output_prefix + "." + label_field + "_vector.csv"),
                    header=[label_field],
                )
            # used later for the .dat label output: maps class name -> column index
            label_index = dict(
                zip(labels_df_subset.columns,
                    range(len(labels_df_subset.columns))))
    X = np.zeros([n_items, vocab_size], dtype=count_dtype)

    dat_strings = []
    dat_labels = []
    mallet_strings = []
    fast_text_lines = []

    counter = Counter()
    word_counter = Counter()
    doc_lines = []
    print("Converting to count representations")
    for i, words in enumerate(parsed):
        # get the vocab indices of words that are in the vocabulary
        words = words.split()

        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if len(counter.keys()) > 0:
            # record this document in each output format
            mallet_strings.append(
                str(i) + "\t" + "en" + "\t" + " ".join(word_subset))

            dat_string = str(int(len(counter))) + " "
            dat_string += " ".join([
                str(k) + ":" + str(int(v))
                for k, v in zip(list(counter.keys()), list(counter.values()))
            ])
            dat_strings.append(dat_string)

            # for the .dat format, assume just one label is given (the last label field)
            if len(label_fields) > 0:
                label = labels[i][label_fields[-1]]
                dat_labels.append(str(label_index[str(label)]))
            values = np.array(list(counter.values()), dtype=count_dtype)
            X[np.ones(len(counter.keys()), dtype=int) * i,
              list(counter.keys())] += values

    # convert to a sparse representation
    sparse_X = sparse.csr_matrix(X)
    fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + ".npz"))

    print("Size of {:s} document-term matrix:".format(output_prefix),
          sparse_X.shape)

    fh.write_to_json(ids, os.path.join(output_dir,
                                       output_prefix + ".ids.json"))

    # save output for Mallet
    fh.write_list_to_text(
        mallet_strings, os.path.join(output_dir,
                                     output_prefix + ".mallet.txt"))

    # save output for David Blei's LDA/SLDA code
    fh.write_list_to_text(
        dat_strings, os.path.join(output_dir, output_prefix + ".data.dat"))
    if len(dat_labels) > 0:
        fh.write_list_to_text(
            dat_labels,
            os.path.join(output_dir,
                         output_prefix + "." + label_field + ".dat"),
        )

    # save output for Jacob Eisenstein's SAGE code:
    sparse_X_sage = sparse.csr_matrix(X, dtype=float)
    vocab_for_sage = np.zeros((vocab_size,), dtype=object)
    vocab_for_sage[:] = vocab

    # for SAGE, assume only a single label has been given
    if len(label_fields) > 0:
        # convert array to vector of labels for SAGE
        sage_aspect = (
            np.argmax(np.array(labels_df_subset.values, dtype=float), axis=1) +
            1)
    else:
        sage_aspect = np.ones([n_items, 1], dtype=float)
    sage_no_aspect = np.array([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    return sparse_X_sage, sage_aspect, sage_no_aspect, widx, vocab_for_sage
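This last variant one-hot encodes labels with pandas and then back-fills a zero column for any class that happens to be missing from the subset, so the train and test splits end up with identical columns. A minimal sketch of that pattern, using an invented label field:

import pandas as pd

label_lists = {'sentiment': ['neg', 'neu', 'pos']}                 # full class list (hypothetical)
labels_df = pd.DataFrame({'sentiment': ['pos', 'pos', 'neg']})     # 'neu' absent in this subset

labels_df_subset = pd.get_dummies(labels_df['sentiment'])
for category in label_lists['sentiment']:
    if category not in labels_df_subset:
        labels_df_subset[category] = 0                             # add the missing class as all zeros

print(labels_df_subset[['neg', 'neu', 'pos']])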