def test_random_resampling():
    '''Two label_sample calls with identically seeded random states must agree.'''
    # Five True-labelled and two False-labelled records, each a distinct dict
    positives = [{'key': True, 'data': 123} for _ in range(5)]
    negatives = [{'key': False, 'data': 123} for _ in range(2)]
    data = positives + negatives

    first_draw = label_sample(
        data, 'key', random_state=np.random.RandomState(41))
    second_draw = label_sample(
        data, 'key', random_state=np.random.RandomState(41))

    assert first_draw == second_draw
def gen_observations(all_clusters, lookup_order, document_data, features,
                     parameters, vocab, full_vocab, encoder_decoder, lda_model,
                     tf_session, w2v_model, hdf5_path=None, dtype=np.float32):
    '''
    Generates observations for each cluster found in JSON file and calculates
    the specified features.

    Every document after the first one in a cluster (in arrival order) yields
    one observation: that document is the query and all documents that arrived
    before it in the same cluster form the background.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order; maps a cluster ID to
            pairs whose first item is the sort key and whose second item is an
            index into document_data
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters; 'seed' and
            'hdf5_save_frequency' are always read, 'resampling' is optional,
            and 'hdf5_use_existing' is read when hdf5_path is given
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop
            words and punctuation
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors
        lda_model (sklearn.???): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.word2vec): trained word2vec model
        hdf5_path (str): when not None, feature vectors and labels are
            appended to this HDF5 file in batches of 'hdf5_save_frequency'
            observations instead of being returned in memory
        dtype: type the concatenated feature vector of each observation is
            cast to

    Returns:
        data(list): contains for each observation the features of the
            document vs corpus which could include: tfidf sum, cosine
            similarity, bag of words vectors, skip thoughts, lda, w2v or,
            onehot cnn encoding
        labels(list): the labels for each document where a one is novel and
            zero is duplicate
        ids(list): one "C<cluster_id>_P<post_id>" string per observation

        When 'mem_net' is in features the first element is instead a dict
        with the keys 'inputs', 'questions', 'input_masks' and 'answers'.
        When hdf5_path is given (and 'mem_net' is not requested) the first two
        elements are both hdf5_path.

        NOTE: if parameters['hdf5_use_existing'] is set and hdf5_path already
        exists, the function returns early with only the two-element tuple
        (hdf5_path, hdf5_path) -- no ids are produced on that path.
    '''
    # Prepare to store results of feature assessments
    data = list()
    labels = list()

    # Inputs for the mem_net algorithm; only filled when 'mem_net' is requested
    inputs = []
    input_masks = []
    questions = []

    corpus_unprocessed = list()

    # HDF5-related parameters
    hdf5_save_frequency = parameters['hdf5_save_frequency']
    data_key = 'data'
    labels_key = 'labels'

    def write_batch(batch_data, batch_labels):
        # Append one batch of feature vectors and labels to the HDF5 file
        with h5py.File(hdf5_path, 'a') as h5:
            data_np = np.array(batch_data)
            labels_np = np.reshape(np.array(batch_labels), (-1, 1))
            add_to_hdf5(h5, data_np, data_key)
            add_to_hdf5(h5, labels_np, labels_key, np.uint8)

    # Truncate any existing files at save location, or return early if
    # using existing files
    if hdf5_path is not None:
        if parameters['hdf5_use_existing'] and os.path.isfile(hdf5_path):
            return hdf5_path, hdf5_path
        with open(hdf5_path, 'w'):
            pass

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [
            entry[1]
            for entry in sorted(lookup_order[cluster], key=lambda x: x[0])
        ]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            # Shallow copy: snapshot of the cluster up to and including
            # next_doc, so later appends do not alter this observation
            corpus_unprocessed.append({
                'novelty': next_doc['novelty'],
                'data': copy.copy(observations),
            })

    # Resample if necessary
    # If oversampling +/- replacement, sample up
    # to larger class size for both classes, with replacement
    # If -oversampling, sample down to
    # smaller class size for both classes with or w/o replacement
    if 'resampling' in parameters:
        resampling_parameters = parameters['resampling']
        oversample = bool(resampling_parameters.get('over', False))
        # np.inf instead of the np.Inf alias, which NumPy 2.0 removed
        desired_size = None if oversample else -np.inf
        # Oversampling forces replacement; decided locally so the caller's
        # parameters dict is left untouched
        replacement = oversample or bool(
            resampling_parameters.get('replacement', False))
        logger.debug("Replacement: {}, Desired size: {}".format(
            replacement, desired_size))
        logger.debug("Size of data: {}, Number of clusters: {}".format(
            len(corpus_unprocessed), len(all_clusters)))
        corpus = sampling.label_sample(corpus_unprocessed, "novelty",
                                       replacement, desired_size, random_state)
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here bc docs will appear multiple times
    # across observations
    clusterids = []
    postids = []
    for case in corpus:
        records = case['data']

        # Create raw and normalized document arrays
        case_docs_raw = [record['body_text'] for record in records]
        case_docs_normalized = [
            normalize.xml_normalize(body_text) for body_text in case_docs_raw
        ]
        case_docs_no_stop_words = [
            normalize.normalize_and_remove_stop_words(body_text)
            for body_text in case_docs_raw
        ]

        # Create ids for individual data points: post id of the query
        # document, cluster id of the first document
        postids.append(records[-1]['post_id'])
        clusterids.append(records[0]['cluster_id'])

        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]
        doc_no_stop_words = case_docs_no_stop_words[-1]

        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_docs_no_stop_words = case_docs_no_stop_words[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_no_stop_words = '\n'.join(bkgd_docs_no_stop_words)

        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(
                doc_raw, bkgd_text_raw, bkgd_sentences_full,
                features['mem_net'], vocab, full_vocab, w2v_model,
                encoder_decoder)

            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)
        else:
            if 'bow' in features:
                feature_vectors = bow(doc_no_stop_words,
                                      bkgd_text_no_stop_words,
                                      bkgd_docs_no_stop_words, vocab,
                                      features['bow'], feature_vectors)
            if 'st' in features:
                # One flat list holding the items returned for every
                # background document
                sentences = [
                    item
                    for doc in bkgd_docs_raw
                    for item in get_first_and_last_sentence(doc)
                ]
                feature_vectors = st(doc_raw, sentences, encoder_decoder,
                                     features['st'], feature_vectors)
            if 'lda' in features:
                feature_vectors = lda(doc_no_stop_words,
                                      bkgd_text_no_stop_words, vocab,
                                      lda_model, features['lda'],
                                      feature_vectors)
            if 'w2v' in features:
                feature_vectors = w2v(doc_normalized, bkgd_docs_normalized,
                                      w2v_model, features['w2v'],
                                      feature_vectors)
            if 'cnn' in features:
                # NOTE(review): run_cnn is not handed feature_vectors, so its
                # result replaces anything collected above -- confirm intended
                feature_vectors = run_cnn(
                    normalize.xml_normalize(doc_raw),
                    normalize.xml_normalize(bkgd_text_raw), tf_session)
            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw,
                                             full_vocab,
                                             features['wordonehot'],
                                             feature_vectors)

            # Save features
            feature_vectors = np.concatenate(
                feature_vectors, axis=0).astype(dtype)
            data.append(feature_vectors)

        # Save label
        labels.append(1 if case["novelty"] else 0)

        # Save to HDF5 if desired. The non-empty check matters on the mem_net
        # path, where data never grows: without it 0 % frequency == 0 would
        # flush on every iteration and discard the collected labels.
        if (hdf5_path is not None and data
                and len(data) % hdf5_save_frequency == 0):
            write_batch(data, labels)
            labels = list()
            data = list()

    # Save off any remainder
    if hdf5_path is not None and data:
        write_batch(data, labels)

    mem_net_features = {
        'inputs': inputs,
        'questions': questions,
        'input_masks': input_masks,
        'answers': labels,
    }

    ids = [
        "C" + str(clusterid) + "_P" + str(postid)
        for clusterid, postid in zip(clusterids, postids)
    ]

    if 'mem_net' in features:
        return mem_net_features, labels, ids
    if hdf5_path is not None:
        return hdf5_path, hdf5_path, ids
    return data, labels, ids
def gen_observations(all_clusters, lookup_order, document_data, features,
                     parameters, vocab, full_vocab, encoder_decoder, lda_model,
                     tf_session, w2v_model):
    '''
    Generates observations for each cluster found in JSON file and calculates
    the specified features.

    Every document after the first one in a cluster (in arrival order) yields
    one observation: that document is the query and all documents that arrived
    before it in the same cluster form the background.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order; maps a cluster ID to
            pairs whose first item is the sort key and whose second item is an
            index into document_data
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters; 'seed' is
            always read, and the presence of the keys 'resampling', 'over'
            and 'replacement' controls resampling
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop
            words and punctuation
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors
        lda_model (sklearn.???): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.word2vec): trained word2vec model

    Returns:
        data(list): contains for each observation the features of the
            document vs corpus which could include: tfidf sum, cosine
            similarity, bag of words vectors, skip thoughts, lda, w2v or,
            onehot cnn encoding
        labels(list): the labels for each document where a one is novel and
            zero is duplicate
        ids(list): one "C<cluster_id>_P<post_id>" string per observation

        When 'mem_net' is in features the first element is instead a dict
        with the keys 'inputs', 'questions', 'input_masks' and 'answers'.
    '''
    # Prepare to store results of feature assessments
    data = list()
    labels = list()

    # Inputs for the mem_net algorithm; only filled when 'mem_net' is requested
    inputs = []
    input_masks = []
    questions = []

    corpus_unprocessed = list()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [
            entry[1]
            for entry in sorted(lookup_order[cluster], key=lambda x: x[0])
        ]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            # Shallow copy: snapshot of the cluster up to and including
            # next_doc, so later appends do not alter this observation
            corpus_unprocessed.append({
                'novelty': next_doc['novelty'],
                'data': copy.copy(observations),
            })

    # Resample if necessary
    # If oversampling +/- replacement, sample up
    # to larger class size for both classes, with replacement
    # If -oversampling, sample down to
    # smaller class size for both classes with or w/o replacement
    if 'resampling' in parameters:
        oversample = 'over' in parameters
        # np.inf instead of the np.Inf alias, which NumPy 2.0 removed
        desired_size = None if oversample else -np.inf
        # Oversampling forces replacement; decided locally so the caller's
        # parameters dict is left untouched
        replacement = oversample or 'replacement' in parameters
        corpus = sampling.label_sample(corpus_unprocessed, "novelty",
                                       replacement, desired_size, random_state)
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here bc docs will appear multiple times
    # across observations
    clusterids = []
    postids = []
    for case in corpus:
        records = case['data']

        # Create raw and normalized document arrays
        case_docs_raw = [record['body_text'] for record in records]
        case_docs_normalized = [
            normalize.normalize_and_remove_stop_words(body_text)
            for body_text in case_docs_raw
        ]

        # Create ids for individual data points: post id of the query
        # document, cluster id of the first document
        postids.append(records[-1]['post_id'])
        clusterids.append(records[0]['cluster_id'])

        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]

        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized)

        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(
                doc_raw, bkgd_text_raw, bkgd_sentences_full,
                features['mem_net'], vocab, full_vocab, w2v_model,
                encoder_decoder)

            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)
        else:
            if 'bow' in features:
                feature_vectors = bow(doc_normalized, bkgd_text_raw,
                                      bkgd_docs_normalized, vocab,
                                      features['bow'], feature_vectors)
            if 'st' in features:
                # One flat list holding the items returned for every
                # background document
                sentences = [
                    item
                    for doc in bkgd_docs_raw
                    for item in get_first_and_last_sentence(doc)
                ]
                feature_vectors = st(doc_raw, sentences, encoder_decoder,
                                     features['st'], feature_vectors)
            if 'lda' in features:
                feature_vectors = lda(doc_normalized, bkgd_text_normalized,
                                      vocab, lda_model, features['lda'],
                                      feature_vectors)
            if 'w2v' in features:
                feature_vectors = w2v(doc_raw, bkgd_text_normalized,
                                      w2v_model, features['w2v'],
                                      feature_vectors)
            if 'cnn' in features:
                # NOTE(review): run_cnn is not handed feature_vectors, so its
                # result replaces anything collected above -- confirm intended
                feature_vectors = run_cnn(doc_normalized,
                                          bkgd_text_normalized, tf_session)
            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw,
                                             full_vocab,
                                             features['wordonehot'],
                                             feature_vectors)

            # Save features
            feature_vectors = np.concatenate(feature_vectors, axis=0)
            data.append(feature_vectors)

        # Save label
        labels.append(1 if case["novelty"] else 0)

    mem_net_features = {
        'inputs': inputs,
        'questions': questions,
        'input_masks': input_masks,
        'answers': labels,
    }

    ids = [
        "C" + str(clusterid) + "_P" + str(postid)
        for clusterid, postid in zip(clusterids, postids)
    ]

    if 'mem_net' in features:
        return mem_net_features, labels, ids
    return data, labels, ids
def gen_observations(all_clusters, lookup_order, document_data, features,
                     parameters, vocab, full_vocab, encoder_decoder, lda_model,
                     tf_session, w2v_model):
    '''
    Generates observations for each cluster found in JSON file and calculates
    the specified features.

    Every document after the first one in a cluster (in arrival order) yields
    one observation: that document is the query and all documents that arrived
    before it in the same cluster form the background.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order; maps a cluster ID to
            pairs whose first item is the sort key and whose second item is an
            index into document_data
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters; 'seed' is
            always read, and the presence of the keys 'resampling', 'over'
            and 'replacement' controls resampling
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop
            words and punctuation
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors
        lda_model (sklearn.???): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.word2vec): trained word2vec model

    Returns:
        data(list): contains for each observation the features of the
            document vs corpus which could include: tfidf sum, cosine
            similarity, bag of words vectors, skip thoughts, lda, w2v or,
            onehot cnn encoding
        labels(list): the labels for each document where a one is novel and
            zero is duplicate

        When 'mem_net' is in features the first element is instead a dict
        with the keys 'inputs', 'questions', 'input_masks' and 'answers'.
    '''
    # Prepare to store results of feature assessments
    data = list()
    labels = list()

    # Inputs for the mem_net algorithm; only filled when 'mem_net' is requested
    inputs = []
    input_masks = []
    questions = []

    corpus_unprocessed = list()

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [
            entry[1]
            for entry in sorted(lookup_order[cluster], key=lambda x: x[0])
        ]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            # Shallow copy: snapshot of the cluster up to and including
            # next_doc, so later appends do not alter this observation
            corpus_unprocessed.append({
                'novelty': next_doc['novelty'],
                'data': copy.copy(observations),
            })

    # Resample if necessary
    # If oversampling +/- replacement, sample up
    # to larger class size for both classes, with replacement
    # If -oversampling, sample down to
    # smaller class size for both classes with or w/o replacement
    if 'resampling' in parameters:
        oversample = 'over' in parameters
        # np.inf instead of the np.Inf alias, which NumPy 2.0 removed
        desired_size = None if oversample else -np.inf
        # Oversampling forces replacement; decided locally so the caller's
        # parameters dict is left untouched
        replacement = oversample or 'replacement' in parameters
        corpus = sampling.label_sample(corpus_unprocessed, "novelty",
                                       replacement, desired_size, random_state)
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here bc docs will appear multiple times
    # across observations
    for case in corpus:
        # Create raw and normalized document arrays
        case_docs_raw = [record['body_text'] for record in case['data']]
        case_docs_normalized = [
            normalize.normalize_and_remove_stop_words(body_text)
            for body_text in case_docs_raw
        ]

        # Pull out query documents; the normalized query must come from the
        # normalized list so it matches the normalized background it is
        # compared against
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]

        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_normalized = '\n'.join(bkgd_docs_normalized)

        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(
                doc_raw, bkgd_text_raw, bkgd_sentences_full,
                features['mem_net'], vocab, full_vocab, w2v_model,
                encoder_decoder)

            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)
        else:
            if 'bow' in features:
                feature_vectors = bow(doc_normalized, bkgd_text_raw,
                                      bkgd_docs_normalized, vocab,
                                      features['bow'], feature_vectors)
            if 'st' in features:
                # st receives one flat list of sentences rather than one
                # sub-list per background document
                sentences = [
                    item
                    for doc in bkgd_docs_raw
                    for item in get_first_and_last_sentence(doc)
                ]
                feature_vectors = st(doc_raw, sentences, encoder_decoder,
                                     features['st'], feature_vectors)
            if 'lda' in features:
                feature_vectors = lda(doc_normalized, bkgd_text_normalized,
                                      vocab, lda_model, features['lda'],
                                      feature_vectors)
            if 'w2v' in features:
                feature_vectors = w2v(doc_raw, bkgd_text_normalized,
                                      w2v_model, features['w2v'],
                                      feature_vectors)
            if 'cnn' in features:
                # NOTE(review): run_cnn is not handed feature_vectors, so its
                # result replaces anything collected above -- confirm intended
                feature_vectors = run_cnn(doc_normalized,
                                          bkgd_text_normalized, tf_session)
            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw,
                                             full_vocab,
                                             features['wordonehot'],
                                             feature_vectors)

            # Save features
            feature_vectors = np.concatenate(feature_vectors, axis=0)
            data.append(feature_vectors)

        # Save label
        labels.append(1 if case["novelty"] else 0)

    mem_net_features = {
        'inputs': inputs,
        'questions': questions,
        'input_masks': input_masks,
        'answers': labels,
    }

    if 'mem_net' in features:
        return mem_net_features, labels
    return data, labels
def gen_observations(all_clusters, lookup_order, document_data, features,
                     parameters, vocab, full_vocab, encoder_decoder, lda_model,
                     tf_session, w2v_model, hdf5_path=None, dtype=np.float32):
    '''
    Generates observations for each cluster found in JSON file and calculates
    the specified features.

    Every document after the first one in a cluster (in arrival order) yields
    one observation: that document is the query and all documents that arrived
    before it in the same cluster form the background.

    Args:
        all_clusters (set): cluster IDs
        lookup_order (dict): document arrival order; maps a cluster ID to
            pairs whose first item is the sort key and whose second item is an
            index into document_data
        document_data (array): parsed JSON documents
        features (dict): the specified features to be calculated
        parameters (dict): data structure with run parameters; 'seed' and
            'hdf5_save_frequency' are always read, 'resampling' is optional,
            and 'hdf5_use_existing' is read when hdf5_path is given
        vocab (dict): the vocabulary of the data set
        full_vocab (dict): the vocabulary of the data set including stop
            words and punctuation
        encoder_decoder (???): the encoder/decoder for skipthoughts vectors
        lda_model (sklearn.???): trained LDA model
        tf_session: active TensorFlow session
        w2v_model (gensim.word2vec): trained word2vec model
        hdf5_path (str): when not None, feature vectors and labels are
            appended to this HDF5 file in batches of 'hdf5_save_frequency'
            observations instead of being returned in memory
        dtype: type the concatenated feature vector of each observation is
            cast to

    Returns:
        data(list): contains for each observation the features of the
            document vs corpus which could include: tfidf sum, cosine
            similarity, bag of words vectors, skip thoughts, lda, w2v or,
            onehot cnn encoding
        labels(list): the labels for each document where a one is novel and
            zero is duplicate
        ids(list): one "C<cluster_id>_P<post_id>" string per observation

        When 'mem_net' is in features the first element is instead a dict
        with the keys 'inputs', 'questions', 'input_masks' and 'answers'.
        When hdf5_path is given (and 'mem_net' is not requested) the first two
        elements are both hdf5_path.

        NOTE: if parameters['hdf5_use_existing'] is set and hdf5_path already
        exists, the function returns early with only the two-element tuple
        (hdf5_path, hdf5_path) -- no ids are produced on that path.
    '''
    # Prepare to store results of feature assessments
    data = list()
    labels = list()

    # Inputs for the mem_net algorithm; only filled when 'mem_net' is requested
    inputs = []
    input_masks = []
    questions = []

    corpus_unprocessed = list()

    # HDF5-related parameters
    hdf5_save_frequency = parameters['hdf5_save_frequency']
    data_key = 'data'
    labels_key = 'labels'

    def write_batch(batch_data, batch_labels):
        # Append one batch of feature vectors and labels to the HDF5 file
        with h5py.File(hdf5_path, 'a') as h5:
            data_np = np.array(batch_data)
            labels_np = np.reshape(np.array(batch_labels), (-1, 1))
            add_to_hdf5(h5, data_np, data_key)
            add_to_hdf5(h5, labels_np, labels_key, np.uint8)

    # Truncate any existing files at save location, or return early if
    # using existing files
    if hdf5_path is not None:
        if parameters['hdf5_use_existing'] and os.path.isfile(hdf5_path):
            return hdf5_path, hdf5_path
        with open(hdf5_path, 'w'):
            pass

    # Create random state
    random_state = np.random.RandomState(parameters['seed'])

    # Iterate through clusters found in JSON file, generate observations
    # pairing data and label
    for cluster in all_clusters:
        # Determine arrival order in this cluster
        sorted_entries = [
            entry[1]
            for entry in sorted(lookup_order[cluster], key=lambda x: x[0])
        ]
        observations = [document_data[sorted_entries[0]]]
        for index in sorted_entries[1:]:
            next_doc = document_data[index]
            observations.append(next_doc)
            # Shallow copy: snapshot of the cluster up to and including
            # next_doc, so later appends do not alter this observation
            corpus_unprocessed.append({
                'novelty': next_doc['novelty'],
                'data': copy.copy(observations),
            })

    # Resample if necessary
    # If oversampling +/- replacement, sample up
    # to larger class size for both classes, with replacement
    # If -oversampling, sample down to
    # smaller class size for both classes with or w/o replacement
    if 'resampling' in parameters:
        resampling_parameters = parameters['resampling']
        oversample = bool(resampling_parameters.get('over', False))
        # np.inf instead of the np.Inf alias, which NumPy 2.0 removed
        desired_size = None if oversample else -np.inf
        # Oversampling forces replacement; decided locally so the caller's
        # parameters dict is left untouched
        replacement = oversample or bool(
            resampling_parameters.get('replacement', False))
        logger.debug("Replacement: {}, Desired size: {}".format(
            replacement, desired_size))
        logger.debug("Size of data: {}, Number of clusters: {}".format(
            len(corpus_unprocessed), len(all_clusters)))
        corpus = sampling.label_sample(corpus_unprocessed, "novelty",
                                       replacement, desired_size, random_state)
    else:
        corpus = corpus_unprocessed

    # Featurize each observation
    # Some duplication of effort here bc docs will appear multiple times
    # across observations
    clusterids = []
    postids = []
    for case in corpus:
        records = case['data']

        # Create raw and normalized document arrays
        case_docs_raw = [record['body_text'] for record in records]
        case_docs_normalized = [
            normalize.xml_normalize(body_text) for body_text in case_docs_raw
        ]
        case_docs_no_stop_words = [
            normalize.normalize_and_remove_stop_words(body_text)
            for body_text in case_docs_raw
        ]

        # Create ids for individual data points: post id of the query
        # document, cluster id of the first document
        postids.append(records[-1]['post_id'])
        clusterids.append(records[0]['cluster_id'])

        # Pull out query documents
        doc_raw = case_docs_raw[-1]
        doc_normalized = case_docs_normalized[-1]
        doc_no_stop_words = case_docs_no_stop_words[-1]

        # Create lists of background documents
        bkgd_docs_raw = case_docs_raw[:-1]
        bkgd_docs_normalized = case_docs_normalized[:-1]
        bkgd_docs_no_stop_words = case_docs_no_stop_words[:-1]
        bkgd_text_raw = '\n'.join(bkgd_docs_raw)
        bkgd_text_no_stop_words = '\n'.join(bkgd_docs_no_stop_words)

        feature_vectors = list()

        if 'mem_net' in features:
            # Get all sentences for the memory network algorithm
            bkgd_sentences_full = tokenize.punkt_sentences(bkgd_text_raw)
            doc_input, doc_questions, doc_masks = gen_mem_net_observations(
                doc_raw, bkgd_text_raw, bkgd_sentences_full,
                features['mem_net'], vocab, full_vocab, w2v_model,
                encoder_decoder)

            # Now add all of the input docs to the primary list
            inputs.append(doc_input)
            questions.append(doc_questions)
            input_masks.append(doc_masks)
        else:
            if 'bow' in features:
                feature_vectors = bow(doc_no_stop_words,
                                      bkgd_text_no_stop_words,
                                      bkgd_docs_no_stop_words, vocab,
                                      features['bow'], feature_vectors)
            if 'st' in features:
                # One flat list holding the items returned for every
                # background document
                sentences = [
                    item
                    for doc in bkgd_docs_raw
                    for item in get_first_and_last_sentence(doc)
                ]
                feature_vectors = st(doc_raw, sentences, encoder_decoder,
                                     features['st'], feature_vectors)
            if 'lda' in features:
                feature_vectors = lda(doc_no_stop_words,
                                      bkgd_text_no_stop_words, vocab,
                                      lda_model, features['lda'],
                                      feature_vectors)
            if 'w2v' in features:
                feature_vectors = w2v(doc_normalized, bkgd_docs_normalized,
                                      w2v_model, features['w2v'],
                                      feature_vectors)
            if 'cnn' in features:
                # NOTE(review): run_cnn is not handed feature_vectors, so its
                # result replaces anything collected above -- confirm intended
                feature_vectors = run_cnn(
                    normalize.xml_normalize(doc_raw),
                    normalize.xml_normalize(bkgd_text_raw), tf_session)
            if 'wordonehot' in features:
                feature_vectors = wordonehot(doc_raw, bkgd_text_raw,
                                             full_vocab,
                                             features['wordonehot'],
                                             feature_vectors)

            # Save features
            feature_vectors = np.concatenate(
                feature_vectors, axis=0).astype(dtype)
            data.append(feature_vectors)

        # Save label
        labels.append(1 if case["novelty"] else 0)

        # Save to HDF5 if desired. The non-empty check matters on the mem_net
        # path, where data never grows: without it 0 % frequency == 0 would
        # flush on every iteration and discard the collected labels.
        if (hdf5_path is not None and data
                and len(data) % hdf5_save_frequency == 0):
            write_batch(data, labels)
            labels = list()
            data = list()

    # Save off any remainder
    if hdf5_path is not None and data:
        write_batch(data, labels)

    mem_net_features = {
        'inputs': inputs,
        'questions': questions,
        'input_masks': input_masks,
        'answers': labels,
    }

    ids = [
        "C" + str(clusterid) + "_P" + str(postid)
        for clusterid, postid in zip(clusterids, postids)
    ]

    if 'mem_net' in features:
        return mem_net_features, labels, ids
    if hdf5_path is not None:
        return hdf5_path, hdf5_path, ids
    return data, labels, ids