def preprocess_data(train_infile, test_infile, output_dir, vocab_size, use_mallet_stopwords=False, replace_num=False, lemmatize=False, log_transform=False, keep_nonalphanum=False, only_alpha=False, min_length=1):
    print("Loading SpaCy")
    parser = English()

    (train_X, train_vocab, train_indices, train_y, label_list, word_freqs, train_dat,
     train_mallet_strings, train_sage_output, train_svm_strings) = load_and_process_data(train_infile, vocab_size, parser, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, lemmatize=lemmatize, log_transform=log_transform, keep_nonalphanum=keep_nonalphanum, only_alpha=only_alpha, min_length=min_length)

    (test_X, _, test_indices, test_y, _, _, test_dat,
     test_mallet_strings, test_sage_output, test_svm_strings) = load_and_process_data(test_infile, vocab_size, parser, vocab=train_vocab, label_list=label_list, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, lemmatize=lemmatize, log_transform=log_transform, keep_nonalphanum=keep_nonalphanum, only_alpha=only_alpha, min_length=min_length)

    fh.save_sparse(train_X, os.path.join(output_dir, 'train.npz'))
    fh.write_to_json(train_vocab, os.path.join(output_dir, 'train.vocab.json'))
    fh.write_to_json(train_indices, os.path.join(output_dir, 'train.indices.json'))
    fh.save_sparse(train_y, os.path.join(output_dir, 'train.labels.npz'))
    fh.save_sparse(test_X, os.path.join(output_dir, 'test.npz'))
    fh.write_to_json(test_indices, os.path.join(output_dir, 'test.indices.json'))
    fh.save_sparse(test_y, os.path.join(output_dir, 'test.labels.npz'))

    n_labels = len(label_list)
    label_dict = dict(zip(range(n_labels), label_list))
    fh.write_to_json(label_dict, os.path.join(output_dir, 'train.label_list.json'))
    fh.write_to_json(list(word_freqs.tolist()), os.path.join(output_dir, 'train.word_freq.json'))

    # save output for David Blei's lda-c code
    fh.write_list_to_text(train_dat, os.path.join(output_dir, 'train.dat'))
    fh.write_list_to_text(test_dat, os.path.join(output_dir, 'test.dat'))

    # save output for Mallet
    fh.write_list_to_text(train_mallet_strings, os.path.join(output_dir, 'train.mallet.txt'))
    fh.write_list_to_text(test_mallet_strings, os.path.join(output_dir, 'test.mallet.txt'))

    # save output for Jacob Eisenstein's SAGE code:
    train_sage_output['te_data'] = test_sage_output['tr_data']
    train_sage_output['te_aspect'] = test_sage_output['tr_aspect']
    savemat(os.path.join(output_dir, 'sage.mat'), train_sage_output)

    # save output in SVM format
    fh.write_list_to_text(train_svm_strings, os.path.join(output_dir, 'train.svm.txt'))
    fh.write_list_to_text(test_svm_strings, os.path.join(output_dir, 'test.svm.txt'))
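# The train/test SVM strings written above are produced inside load_and_process_data,
# which is not shown in this excerpt. As an assumed illustration only, SVM-light /
# liblinear style lines conventionally look like "<label> <index>:<count> ...", which
# could be built from a token-count dict by a hypothetical helper like this (the real
# formatting used by load_and_process_data may differ):
def _example_svm_line(label, token_counts):
    feats = ' '.join('{:d}:{:d}'.format(idx, cnt)
                     for idx, cnt in sorted(token_counts.items()))
    return '{} {}'.format(label, feats)
# _example_svm_line(1, {0: 2, 5: 1})  ->  '1 0:2 5:1'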
def preprocess_data(train_infile, output_dir, vocab_size, label_type, test_prop, use_mallet_stopwords=False, replace_num=False, group_size=1, only_alpha=False, min_length=3):
    print("Loading SpaCy")
    parser = English()

    with codecs.open(train_infile, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # randomly hold out test_prop of the documents as a test set
    n_items = len(lines)
    n_test = int(test_prop * n_items)
    n_train = n_items - n_test
    train_indices = np.random.choice(range(n_items), n_train, replace=False)
    test_indices = list(set(range(n_items)) - set(train_indices))

    (train_X, train_vocab, train_indices, train_y, label_list, word_freqs, train_dat,
     train_mallet_strings, train_sage_output, train_svm_strings, label_index) = load_and_process_data(train_infile, vocab_size, parser, label_type, train_indices, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, group_size=group_size, only_alpha=only_alpha, min_length=min_length)

    (test_X, _, test_indices, test_y, _, _, test_dat,
     test_mallet_strings, test_sage_output, test_svm_strings, _) = load_and_process_data(train_infile, vocab_size, parser, label_type, test_indices, vocab=train_vocab, label_list=label_list, label_index=label_index, use_mallet_stopwords=use_mallet_stopwords, replace_num=replace_num, group_size=group_size, only_alpha=only_alpha, min_length=min_length)

    fh.save_sparse(train_X, os.path.join(output_dir, 'train.npz'))
    fh.write_to_json(train_vocab, os.path.join(output_dir, 'train.vocab.json'))
    fh.write_to_json(train_indices, os.path.join(output_dir, 'train.indices.json'))
    fh.save_sparse(train_y, os.path.join(output_dir, 'train.labels.npz'))
    fh.save_sparse(test_X, os.path.join(output_dir, 'test.npz'))
    fh.write_to_json(test_indices, os.path.join(output_dir, 'test.indices.json'))
    fh.save_sparse(test_y, os.path.join(output_dir, 'test.labels.npz'))
    fh.write_to_json(list(word_freqs.tolist()), os.path.join(output_dir, 'train.word_freq.json'))

    # save output for David Blei's lda-c code
    fh.write_list_to_text(train_dat, os.path.join(output_dir, 'train.dat'))

    n_labels = len(label_list)
    label_dict = dict(zip(range(n_labels), label_list))
    fh.write_to_json(label_dict, os.path.join(output_dir, 'train.label_list.json'))

    # save output for Mallet
    fh.write_list_to_text(train_mallet_strings, os.path.join(output_dir, 'train.mallet.txt'))
    fh.write_list_to_text(test_mallet_strings, os.path.join(output_dir, 'test.mallet.txt'))

    # save output for Jacob Eisenstein's SAGE code
    train_sage_output['te_data'] = test_sage_output['tr_data']
    train_sage_output['te_aspect'] = test_sage_output['tr_aspect']
    savemat(os.path.join(output_dir, 'sage.mat'), train_sage_output)

    # save output in SVM format
    fh.write_list_to_text(train_svm_strings, os.path.join(output_dir, 'train.svm.txt'))
    fh.write_list_to_text(test_svm_strings, os.path.join(output_dir, 'test.svm.txt'))
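# Usage sketch for the preprocess_data variant above, under stated assumptions: the
# import locations (spacy.lang.en for English, scipy.io for savemat, scipy.sparse,
# and a project-local file_handling helper aliased as fh) are inferred from how those
# names are used in this file, and the input path and label field below are purely
# hypothetical.
#
#   import os
#   import codecs
#   import numpy as np
#   from collections import Counter
#   from scipy import sparse
#   from scipy.io import savemat
#   from spacy.lang.en import English
#   import file_handling as fh  # assumed project-local helper
def example_preprocess_call():
    """Hypothetical invocation; the paths and label_type are made up for illustration."""
    preprocess_data('data/reviews.jsonlist', 'data/processed',
                    vocab_size=2000, label_type='sentiment', test_prop=0.2)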
def process_subset(items, parsed, label_fields, label_lists, vocab, output_dir, output_prefix):
    n_items = len(items)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    # use the given document ids if all items have one; otherwise fall back to line numbers
    ids = []
    for i, item in enumerate(items):
        if 'id' in item:
            ids.append(item['id'])
    if len(ids) != n_items:
        ids = [str(i) for i in range(n_items)]

    # create a label index using string representations
    for label_field in label_fields:
        label_list = label_lists[label_field]
        n_labels = len(label_list)
        label_list_strings = [str(label) for label in label_list]
        label_index = dict(zip(label_list_strings, range(n_labels)))

        # convert labels to a data frame
        if n_labels > 0:
            label_matrix = np.zeros([n_items, n_labels], dtype=int)
            label_vector = np.zeros(n_items, dtype=int)

            for i, item in enumerate(items):
                label = item[label_field]
                label_matrix[i, label_index[str(label)]] = 1
                label_vector[i] = label_index[str(label)]

            labels_df = pd.DataFrame(label_matrix, index=ids, columns=label_list_strings)
            labels_df.to_csv(os.path.join(output_dir, output_prefix + '.' + label_field + '.csv'))
            label_vector_df = pd.DataFrame(label_vector, index=ids, columns=[label_field])
            if n_labels == 2:
                label_vector_df.to_csv(os.path.join(output_dir, output_prefix + '.' + label_field + '_vector.csv'))

    rows = []
    cols = []
    vals = []

    dat_strings = []
    dat_labels = []
    mallet_strings = []
    fast_text_lines = []

    counter = Counter()
    word_counter = Counter()
    doc_lines = []
    print("Converting to count representations")
    for i, words in enumerate(parsed):
        # get the vocab indices of words that are in the vocabulary
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if len(counter.keys()) > 0:
            # update the counts
            mallet_strings.append(str(i) + '\t' + 'en' + '\t' + ' '.join(word_subset))

            dat_string = str(int(len(counter))) + ' '
            dat_string += ' '.join([str(k) + ':' + str(int(v)) for k, v in zip(list(counter.keys()), list(counter.values()))])
            dat_strings.append(dat_string)

            # for the dat format, assume just one label is given
            if len(label_fields) > 0:
                label = items[i][label_fields[-1]]
                dat_labels.append(str(label_index[str(label)]))

            values = list(counter.values())
            rows.extend([i] * len(counter))
            token_indices = sorted(counter.keys())
            cols.extend(list(token_indices))
            vals.extend([counter[k] for k in token_indices])

    # convert to a sparse representation
    sparse_X = sparse.coo_matrix((vals, (rows, cols)), shape=(n_items, vocab_size)).tocsr()
    fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + '.npz'))
    print("Size of {:s} document-term matrix:".format(output_prefix), sparse_X.shape)
    fh.write_to_json(ids, os.path.join(output_dir, output_prefix + '.ids.json'))

    # save output for Mallet
    fh.write_list_to_text(mallet_strings, os.path.join(output_dir, output_prefix + '.mallet.txt'))

    # save output for David Blei's LDA/SLDA code
    fh.write_list_to_text(dat_strings, os.path.join(output_dir, output_prefix + '.data.dat'))
    if len(dat_labels) > 0:
        fh.write_list_to_text(dat_labels, os.path.join(output_dir, output_prefix + '.' + label_field + '.dat'))

    # save output for Jacob Eisenstein's SAGE code:
    #sparse_X_sage = sparse.csr_matrix(X, dtype=float)
    vocab_for_sage = np.zeros((vocab_size,), dtype=object)
    vocab_for_sage[:] = vocab

    # for SAGE, assume only a single label has been given
    if len(label_fields) > 0:
        # convert the one-hot label matrix to a vector of labels for SAGE
        sage_aspect = np.argmax(np.array(labels_df.values, dtype=float), axis=1) + 1
    else:
        sage_aspect = np.ones([n_items, 1], dtype=float)
    sage_no_aspect = np.array([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    return sparse_X, sage_aspect, sage_no_aspect, widx, vocab_for_sage
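# Toy illustration of the per-document strings built in process_subset above: the
# lda-c style line is "<num_unique_terms> <index>:<count> ...", and the Mallet line
# is "<id>\t<language>\t<tokens>". The vocabulary and tokens below are made up;
# Counter is the same collections.Counter used throughout this file.
def _example_doc_strings():
    toy_vocab_index = {'topic': 0, 'model': 1, 'word': 2}
    toy_words = ['topic', 'model', 'topic', 'word']
    counter = Counter(toy_vocab_index[w] for w in toy_words if w in toy_vocab_index)
    dat_string = str(len(counter)) + ' ' + ' '.join(
        '{:d}:{:d}'.format(k, v) for k, v in sorted(counter.items()))
    mallet_string = '0' + '\t' + 'en' + '\t' + ' '.join(w for w in toy_words if w in toy_vocab_index)
    return dat_string, mallet_string  # ('3 0:2 1:1 2:1', '0\ten\ttopic model topic word')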
def process_subset(items, parsed, label_field, label_list, vocab, output_dir, output_prefix):
    n_items = len(items)
    n_labels = len(label_list)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    # use the given document ids if all items have one; otherwise fall back to line numbers
    ids = []
    for i, item in enumerate(items):
        if 'id' in item:
            ids.append(item['id'])
    if len(ids) != n_items:
        ids = [str(i) for i in range(n_items)]

    # create a label index using string representations
    label_list_strings = [str(label) for label in label_list]
    label_index = dict(zip(label_list_strings, range(n_labels)))

    # convert labels to a data frame
    if n_labels > 0:
        label_matrix = np.zeros([n_items, n_labels], dtype=int)
        label_vector = np.zeros(n_items, dtype=int)

        for i, item in enumerate(items):
            id = ids[i]
            label = item[label_field]
            label_matrix[i, label_index[str(label)]] = 1
            label_vector[i] = label_index[str(label)]

        labels_df = pd.DataFrame(label_matrix, index=ids, columns=label_list_strings)
        labels_df.to_csv(os.path.join(output_dir, output_prefix + '.' + label_field + '.csv'))
        label_vector_df = pd.DataFrame(label_vector, index=ids, columns=[label_field])
        label_vector_df.to_csv(os.path.join(output_dir, output_prefix + '.label_vector.csv'))
    else:
        print("No labels found")

    X = np.zeros([n_items, vocab_size], dtype=int)

    dat_strings = []
    dat_labels = []
    mallet_strings = []
    fast_text_lines = []

    counter = Counter()
    word_counter = Counter()
    doc_lines = []
    print("Converting to count representations")
    for i, words in enumerate(parsed):
        # get the vocab indices of words that are in the vocabulary
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if len(counter.keys()) > 0:
            # update the counts
            mallet_strings.append(str(i) + '\t' + 'en' + '\t' + ' '.join(word_subset))

            dat_string = str(int(len(counter))) + ' '
            dat_string += ' '.join([str(k) + ':' + str(int(v)) for k, v in zip(list(counter.keys()), list(counter.values()))])
            dat_strings.append(dat_string)

            if label_field is not None:
                label = items[i][label_field]
                dat_labels.append(str(label_index[str(label)]))

            values = list(counter.values())
            X[np.ones(len(counter.keys()), dtype=int) * i, list(counter.keys())] += values

    # convert to a sparse representation
    sparse_X = sparse.csr_matrix(X)
    fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + '.npz'))
    print(sparse_X.shape)
    print(len(dat_strings))
    fh.write_to_json(ids, os.path.join(output_dir, output_prefix + '.ids.json'))

    # save output for Mallet
    fh.write_list_to_text(mallet_strings, os.path.join(output_dir, output_prefix + '.mallet.txt'))

    # save output for David Blei's LDA/SLDA code
    fh.write_list_to_text(dat_strings, os.path.join(output_dir, output_prefix + '.data.dat'))
    if len(dat_labels) > 0:
        fh.write_list_to_text(dat_labels, os.path.join(output_dir, output_prefix + '.' + label_field + '.dat'))

    # save output for Jacob Eisenstein's SAGE code:
    sparse_X_sage = sparse.csr_matrix(X, dtype=float)
    vocab_for_sage = np.zeros((vocab_size,), dtype=object)
    vocab_for_sage[:] = vocab

    if n_labels > 0:
        # convert the one-hot label matrix to a vector of labels for SAGE
        sage_aspect = np.argmax(np.array(labels_df.values, dtype=float), axis=1) + 1
    else:
        sage_aspect = np.ones([n_items, 1], dtype=float)
    sage_no_aspect = np.array([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    return sparse_X_sage, sage_aspect, sage_no_aspect, widx, vocab_for_sage
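# The dense update in the loop above uses integer fancy indexing to scatter one
# document's counts into row i of X: the row index i is repeated len(counter) times
# and paired with the document's token indices. A small self-contained check of that
# pattern (the 2x5 matrix and counts are made up):
def _example_row_scatter():
    X = np.zeros([2, 5], dtype=int)
    i = 1
    counter = Counter({0: 2, 3: 1})  # token index -> count for document i
    values = list(counter.values())
    X[np.ones(len(counter.keys()), dtype=int) * i, list(counter.keys())] += values
    return X  # row 1 becomes [2, 0, 0, 1, 0]; row 0 stays all zeros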
def main():
    usage = "%prog infile.txt output_dir output_prefix"
    parser = OptionParser(usage=usage)
    parser.add_option('-m', dest='max_lines', default=None,
                      help='Quit after processing this many lines (documents): default=%default')
    #parser.add_option('--lower', action="store_true", dest="lower", default=False,
    #                  help='Lower case words: default=%default')

    (options, args) = parser.parse_args()

    infile = args[0]
    output_dir = args[1]
    output_prefix = args[2]

    max_lines = options.max_lines
    if max_lines is not None:
        max_lines = int(max_lines)

    vocab = []
    vocab_index = {}
    counter = Counter()

    # start by converting each document into a dict of word counts, building a vocab as we go
    rows = []
    cols = []
    values = []
    n_docs = 0
    print("Counting words...")
    with codecs.open(infile, 'r', encoding='utf-8') as f:
        for line_i, line in enumerate(f):
            line = line.strip()
            if len(line) > 0:
                if max_lines is not None and line_i >= max_lines:
                    print("Quitting after processing %d lines" % (line_i + 1))
                    break
                if n_docs % 1000 == 0 and n_docs > 0:
                    print(n_docs)

                # split on white space
                words = line.split()
                # filter out everything that's not just letters, and lower case
                words = [word.lower() for word in words if re.match('^[a-zA-Z]*$', word) is not None]

                # look for new words and add them to the vocabulary
                new_words = [word for word in words if word not in vocab_index]
                if len(new_words) > 0:
                    vocab_size = len(vocab)
                    #print("Adding %d words to vocab" % len(new_words))
                    #print("New total should be %d" % (vocab_size + len(new_words)))
                    vocab.extend(new_words)
                    vocab_index.update(dict(zip(new_words, range(vocab_size, vocab_size + len(new_words)))))

                # count the words in this document
                indices = [vocab_index[word] for word in words]
                counter.clear()
                counter.update(indices)
                keys = counter.keys()
                counts = counter.values()

                rows.extend([line_i] * len(keys))
                cols.extend(keys)
                values.extend(counts)
                n_docs += 1

    print("Processed %d documents" % n_docs)
    print("Size of final vocab = %d" % len(vocab))
    print("Saving counts...")

    # now convert these count vectors into a giant sparse matrix
    counts = sparse.coo_matrix((values, (rows, cols)), shape=(n_docs, len(vocab)))
    fh.save_sparse(counts, os.path.join(output_dir, output_prefix + '.npz'))
    fh.write_to_json(vocab, os.path.join(output_dir, output_prefix + '.vocab.json'))
    print("Done")
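# fh refers to a project-local file-handling helper module that is not shown in this
# excerpt. A minimal sketch of what the two calls used above could look like, assuming
# they simply wrap scipy.sparse.save_npz and the standard json module; the real helpers
# may well differ:
def _example_save_sparse(matrix, outfile):
    """Assumed behaviour of fh.save_sparse: write a scipy sparse matrix to a .npz file."""
    sparse.save_npz(outfile, matrix.tocsr())

def _example_write_to_json(obj, outfile):
    """Assumed behaviour of fh.write_to_json: dump a JSON-serialisable object as UTF-8."""
    import json
    with codecs.open(outfile, 'w', encoding='utf-8') as f:
        json.dump(obj, f, indent=2)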
def process_subset(items, ids, parsed, labels, label_fields, label_lists, vocab, output_dir, output_prefix, count_dtype=int):
    n_items = len(items)
    vocab_size = len(vocab)
    vocab_index = dict(zip(vocab, range(vocab_size)))

    if not ids or len(ids) != n_items:
        ids = [str(i) for i in range(n_items)]

    # create a label index using string representations
    if labels:
        labels_df = pd.DataFrame.from_records(labels, index=ids)

        for label_field in label_fields:
            labels_df_subset = pd.get_dummies(labels_df[label_field])

            # for any classes not present in the subset, add 0 columns
            # (handles case where classes appear in only one of train or test)
            for category in label_lists[label_field]:
                if category not in labels_df_subset:
                    labels_df_subset[category] = 0
            labels_df_subset.to_csv(os.path.join(output_dir, output_prefix + "." + label_field + ".csv"))

            if labels_df[label_field].nunique() == 2:
                labels_df_subset.iloc[:, 1].to_csv(
                    os.path.join(output_dir, output_prefix + "." + label_field + "_vector.csv"),
                    header=[label_field],
                )

        # map class (column) names to column positions; used later for the dat labels
        label_index = dict(zip(labels_df_subset.columns, range(len(labels_df_subset.columns))))

    X = np.zeros([n_items, vocab_size], dtype=count_dtype)

    dat_strings = []
    dat_labels = []
    mallet_strings = []
    fast_text_lines = []

    counter = Counter()
    word_counter = Counter()
    doc_lines = []
    print("Converting to count representations")
    for i, words in enumerate(parsed):
        # get the vocab indices of words that are in the vocabulary
        words = words.split()
        indices = [vocab_index[word] for word in words if word in vocab_index]
        word_subset = [word for word in words if word in vocab_index]

        counter.clear()
        counter.update(indices)
        word_counter.clear()
        word_counter.update(word_subset)

        if len(counter.keys()) > 0:
            # update the counts
            mallet_strings.append(str(i) + "\t" + "en" + "\t" + " ".join(word_subset))

            dat_string = str(int(len(counter))) + " "
            dat_string += " ".join([str(k) + ":" + str(int(v)) for k, v in zip(list(counter.keys()), list(counter.values()))])
            dat_strings.append(dat_string)

            # for the dat format, assume just one label is given
            if len(label_fields) > 0:
                label = labels[i][label_fields[-1]]
                dat_labels.append(str(label_index[str(label)]))

            values = np.array(list(counter.values()), dtype=count_dtype)
            X[np.ones(len(counter.keys()), dtype=int) * i, list(counter.keys())] += values

    # convert to a sparse representation
    sparse_X = sparse.csr_matrix(X)
    fh.save_sparse(sparse_X, os.path.join(output_dir, output_prefix + ".npz"))
    print("Size of {:s} document-term matrix:".format(output_prefix), sparse_X.shape)
    fh.write_to_json(ids, os.path.join(output_dir, output_prefix + ".ids.json"))

    # save output for Mallet
    fh.write_list_to_text(mallet_strings, os.path.join(output_dir, output_prefix + ".mallet.txt"))

    # save output for David Blei's LDA/SLDA code
    fh.write_list_to_text(dat_strings, os.path.join(output_dir, output_prefix + ".data.dat"))
    if len(dat_labels) > 0:
        fh.write_list_to_text(dat_labels, os.path.join(output_dir, output_prefix + "." + label_field + ".dat"))

    # save output for Jacob Eisenstein's SAGE code:
    sparse_X_sage = sparse.csr_matrix(X, dtype=float)
    vocab_for_sage = np.zeros((vocab_size,), dtype=object)
    vocab_for_sage[:] = vocab

    # for SAGE, assume only a single label has been given
    if len(label_fields) > 0:
        # convert the one-hot label matrix to a vector of labels for SAGE
        sage_aspect = np.argmax(np.array(labels_df_subset.values, dtype=float), axis=1) + 1
    else:
        sage_aspect = np.ones([n_items, 1], dtype=float)
    sage_no_aspect = np.array([n_items, 1], dtype=float)
    widx = np.arange(vocab_size, dtype=float) + 1

    return sparse_X_sage, sage_aspect, sage_no_aspect, widx, vocab_for_sage