def cascade(self, instance):
    """Cascade the index from this instance to others that depend on it.

    This causes index_instance() to be called on each instance that
    depends on the instance supplied.

    """
    for descriptor in self.cascades:
        cascade_inst = None
        # find the instance we're being told to cascade the reindex onto
        try:
            if callable(descriptor):
                cascade_inst = descriptor(instance)
            elif isinstance(descriptor, str):
                cascade_inst = getattr(instance, descriptor)
        except Exception:
            cascade_inst = None

        # if we found one, check if it's searchable, check if it
        # wants to accept the cascade, and if so, reindex it
        if cascade_inst:
            # If it's not an iterable already, make it into one
            if not hasattr(cascade_inst, '__iter__'):
                cascade_insts = [cascade_inst]
            else:
                cascade_insts = cascade_inst
            for cascade_inst in cascade_insts:
                indexer = get_indexer(cascade_inst)
                if indexer and indexer.reindex_on_cascade(instance, cascade_inst):
                    indexer.index_instance(cascade_inst, with_cascade=False)
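# Illustrative sketch only (not from the original code): cascade() above
# accepts descriptors that are either attribute names (strings) or callables
# mapping an instance to the dependent instance(s) to reindex. A hypothetical
# indexer might therefore declare its cascades like this; the class and
# attribute names below are made up for illustration.
class ExampleCommentIndexer(object):
    cascades = [
        'thread',                      # string: cascade to instance.thread
        lambda comment: comment.tags,  # callable: may return an iterable
    ]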
def main():
    train_data = get_train_data_from_csv('data/train_15_dns.csv')
    #shuffle(train_data)
    train_data = train_data[0:50000]
    dev_data = get_dev_data_from_csv('data/dev_15_dns.csv')
    #shuffle(dev_data)
    dev_data = dev_data[0:10000]

    print('len of training data:', len(train_data))
    print('len of dev data:', len(dev_data))

    vocab_size = get_vocab_size(train_data)
    print('calculated vocab size:', vocab_size)

    indexer = get_indexer('indexer_15_dups.csv')
    model = train_rnn_classifier(train_data, vocab_size, indexer)
    print_evaluation(dev_data, model)
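# Hedged sketch only: train_rnn_classifier() is not shown in this file. Given
# the Keras layers imported elsewhere in the project (Embedding, LSTM, Dense,
# Dropout, Sequential), it would presumably build and fit a model of roughly
# this shape; the layer sizes, max_len, and num_classes below are illustrative
# assumptions, not the author's actual hyperparameters.
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM

def build_rnn_classifier(vocab_size, max_len=50, num_classes=15):
    model = Sequential()
    model.add(Embedding(vocab_size, 128, input_length=max_len))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model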
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import json
import pandas as pd
from utils import get_train_data_from_csv, get_dev_data_from_csv, get_test_data_from_csv, Indexer, get_indexer
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import classification_report

include_test = True

tknr = TweetTokenizer()
indexer = get_indexer('indexer_15_dups.csv')
word_indexer = Indexer()
word_indexer.add_and_get_index("UNK")

train_data = get_train_data_from_csv('data/train_15_ds.csv')[0:1000]
dev_data = get_dev_data_from_csv('data/dev_15_ds.csv')[:200]
test_data = get_test_data_from_csv('data/test_15_ds.csv')[0:200]

X_train = []
Y_train = []
X_dev = []
Y_dev = []
Y_dev_true = []
X_test = []
Y_test = []
Y_test_true = []
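# Hedged sketch of the preprocessing the empty lists above are set up for
# (this step is not part of the excerpt): tokenize each example with tknr,
# map tokens to ids with word_indexer ("UNK" added first as the fallback id),
# pad to a fixed length, and one-hot the labels. The ex.text / ex.label
# fields and max_len are assumptions.
max_len = 50  # illustrative
for ex in train_data:
    tokens = tknr.tokenize(ex.text)
    X_train.append([word_indexer.add_and_get_index(tok) for tok in tokens])
    Y_train.append(indexer.index_of(ex.label))
X_train = pad_sequences(X_train, maxlen=max_len)
Y_train = to_categorical(Y_train, num_classes=len(indexer))
# dev/test would be mapped the same way, but using index_of() with the "UNK"
# fallback for unseen tokens instead of growing the vocabulary.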
def reindex_index(indexname, suffix):
    """Reindex a named index.

    """
    if not hasattr(settings, 'ENABLE_SEARCHIFY') or not settings.ENABLE_SEARCHIFY:
        return

    models = _index_models.get(indexname, None)
    if models is None:
        raise KeyError("Index %r is not known" % indexname)

    try:
        # Get the index-wide settings.
        index_settings = {}

        def merge_dicts(path, a, b):
            for (k, v) in b.iteritems():
                if k not in a:
                    a[k] = v
                    continue
                if isinstance(v, dict):
                    merge_dicts('%s.%s' % (path, k), a[k], v)
                    continue
                if a[k] == v:
                    continue
                raise ValueError("Conflicting values in index_settings (at %s)"
                                 % path[1:])

        for model in models:
            indexer = get_indexer(model)
            merge_dicts('.', index_settings, indexer.index_settings)

        created = False
        for model in models:
            print "Indexing %s to %s, using suffix %s" % (model, indexname, suffix)
            indexer = get_indexer(model)
            try:
                indexer.client.set_suffix(suffix)
                if not created:
                    #print "Creating index with settings %r" % index_settings
                    indexer.client.create_index(index_settings)
                    created = True
                indexer.apply_mapping()
                indexer.index_all(with_cascade=False)
            finally:
                indexer.client.set_suffix()
                indexer.client.flush()

        # Get the old value of the alias.
        try:
            old_index = client.get_alias(indexname)[0]
        except IndexError:
            old_index = None
        if old_index == indexname:
            # Old index wasn't an alias; we have to delete it and then set the
            # new alias for it.
            print "Warning: no alias in use, so must delete in-use index"
            old_index = None
            client.delete_index(indexname)

        print "Setting alias to make new index live"
        client.set_alias(indexname, indexname + suffix)
    except:
        try:
            client.delete_index(indexname + suffix)
        except Exception:
            # Ignore any normal exceptions, so we report the original error.
            pass
        raise

    if old_index:
        print "Removing old index: %s" % old_index
        client.delete_index(old_index)
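# Illustrative usage only (not from the original module): reindex_index()
# builds the new index under a suffixed name and then flips the alias, so a
# unique (e.g. timestamped) suffix is the natural argument. The index name
# 'documents' below is a hypothetical example.
import datetime

def rebuild_documents_index():
    suffix = datetime.datetime.now().strftime('-%Y%m%d%H%M%S')
    reindex_index('documents', suffix)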
import sys

import numpy as np
from sklearn.metrics import accuracy_score

# Assumed to come from the project's utils module, as in the other scripts.
from utils import (Indexer, get_indexer, get_train_data_from_csv,
                   get_dev_data_from_csv, get_test_data_from_csv)


class FeatureExtractor:
    # Assumed: these methods belong to the FeatureExtractor used below.
    def __init__(self):
        self.indexer = Indexer()

    def get_indexer(self):
        return self.indexer

    def extract_features(self, ex):
        # Bag-of-words count vector over the indexer's vocabulary.
        feature_vector = np.zeros(len(self.indexer))
        for word in ex.text:
            index = self.indexer.index_of(word)
            feature_vector[index] += 1
        return feature_vector


filename = sys.argv[1]
indexer = get_indexer('data/indexer_' + filename)
train_set = get_train_data_from_csv('data/train_' + filename)
dev_set = get_dev_data_from_csv('data/dev_' + filename)
test_set = get_test_data_from_csv('data/test_' + filename)

p = PerceptronClassifier(indexer, FeatureExtractor())
p.train(train_set)

y_pred = []
y_true = []
for ex in dev_set:
    y_true.append(ex.label)
    y_pred.append(p.predict(ex))

print("Dev Set Results: ")
print("Accuracy: ", accuracy_score(y_true, y_pred))
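# Illustrative follow-up (not in the original fragment): test_set is loaded
# above but never evaluated; the analogous held-out report would mirror the
# dev-set loop.
y_pred_test = []
y_true_test = []
for ex in test_set:
    y_true_test.append(ex.label)
    y_pred_test.append(p.predict(ex))

print("Test Set Results: ")
print("Accuracy: ", accuracy_score(y_true_test, y_pred_test))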