def main():
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--k-folds', type=int, default=10,
                        help='How many folds of the data to test on')
    parser.add_argument('--max-data', type=int, default=10000,
                        help='Maximum data points to train and test on')
    parser.add_argument('--include-none', type=int, default=0,
                        help='Include None in the confusion matrix.')
    parser.add_argument('--threshold', type=int, default=10,
                        help='Threshold for the number of gold labels classified.')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    #   <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.
    query = DBSession.query(TokenizedLabel).limit(opts.max_data)
    X_y = ((featurize(item.tokens, crf_feature_functions), item.labels) for item in query)
    # unzip into static lists: X is read once per fold, so it cannot stay a
    # one-shot generator
    X, y = zip(*X_y)
    X = map(flatMap, X)

    categories = dict((label.id, label.text) for label in DBSession.query(Label))
    print 'categories', categories

    N = len(y)
    index = 0
    for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True):
        train_X = [X[i] for i in train_indices]
        train_y = [y[i] for i in train_indices]
        test_X = [X[i] for i in test_indices]
        test_y = [y[i] for i in test_indices]
        classifier = CRF()  # print_gloss=True
        index += 1
        evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y, index, opts)
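# NOTE: flatMap and evaluateSequenceClassifier are imported from elsewhere in
# tweedr and are not shown here. Below is a minimal sketch of what flatMap
# plausibly does, assuming featurize() yields one iterable of features per
# feature function for each token (hypothetical, not the verified tweedr
# definition):
from itertools import chain


def flatMap(token_features):
    # Collapse each token's per-function feature iterables into one flat list,
    # e.g. [['low'], ['DT', '3']] -> ['low', 'DT', '3'], so that X can be
    # indexed repeatedly by the k-fold loop above.
    return [list(chain.from_iterable(groups)) for groups in token_features]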
def test_tweedr_models_no_ci():
    from tweedr.models import DBSession, TokenizedLabel, Label
    Tables = [TokenizedLabel, Label]
    for Table in Tables:
        row_count = DBSession.query(Table).count()
        # Table.name on a mapped class would resolve to a column attribute (or
        # raise AttributeError), not the table's name; use the class name.
        assert row_count > 0, 'There should be more than 0 rows in the table "%s"' % Table.__name__
def first(limit):
    print >> sys.stderr, 'First %d tweets.' % limit
    for tokenized_label in DBSession.query(TokenizedLabel).limit(limit):
        # collapse embedded whitespace so each tweet prints on a single line
        tokenized_label_text = unicode(tokenized_label).translate(
            whitespace_unicode_translations).encode('utf8')
        token_type_object = tokenized_label.token_type_object
        print token_type_object.id, '\t', token_type_object.text, '\t', tokenized_label_text
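# NOTE: whitespace_unicode_translations is defined elsewhere in the script.
# Given its name and its use with unicode.translate, a plausible definition
# (an assumption, not the verified original) is:
import string

# Map every ASCII whitespace ordinal to a space so tabs and newlines inside a
# tweet cannot break the tab-separated output above.
whitespace_unicode_translations = dict((ord(c), u' ') for c in string.whitespace)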
def read_tweets():
    '''Read labeled tweets from the database.'''
    logger.info('Reading labeled tweets from database...')
    labeled_tweets = np.array(
        DBSession.query(DamageClassification)
        .filter(DamageClassification.mturk_code == 'QCRI')
        .limit(opts.max_data)
        .all())
    logger.info('Read %d tweets', len(labeled_tweets))
    return labeled_tweets
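# NOTE: read_tweets() reads logger, opts, np, DBSession and
# DamageClassification from module scope. A sketch of the assumed setup
# (names carried over from the trainers above, not verified):
import logging

import numpy as np
from tweedr.models import DBSession, DamageClassification

logger = logging.getLogger(__name__)
# opts is assumed to be produced by a module-level argparse parser exposing at
# least a --max-data option:
# opts = parser.parse_args()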
def main():
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--k-folds', type=int, default=10,
                        help='How many folds of the data to test on')
    parser.add_argument('--max-data', type=int, default=10000,
                        help='Maximum data points to train and test on')
    parser.add_argument('--adjacent', type=int, default=0,
                        help='Set to 1 to use the adjacent-token feature functions')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    #   <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.
    # A Python-level `is not None` here would compare the Column object itself
    # and always be truthy; isnot(None) emits a SQL IS NOT NULL instead.
    query = DBSession.query(TokenizedLabel).\
        filter(TokenizedLabel.tweet.isnot(None)).\
        filter(TokenizedLabel.tweet != '').\
        limit(opts.max_data)

    if opts.adjacent == 0:
        X_y = ((featurize(item.tokens, crf_feature_functions), item.labels) for item in query)
    else:
        X_y = ((featurize_adjacent(item.tokens, crf_feature_functions), item.labels) for item in query)

    # unzip into static lists: X is read once per fold, so it cannot stay a
    # one-shot generator
    X, y = zip(*X_y)
    X = map(flatMap, X)

    N = len(y)
    for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True):
        train_X = [X[i] for i in train_indices]
        train_y = [y[i] for i in train_indices]
        test_X = [X[i] for i in test_indices]
        test_y = [y[i] for i in test_indices]
        classifier = CRF()  # print_gloss=True
        evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y)
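# For reference, how a Python identity check differs from a SQL NULL test on a
# mapped column (the rendered table name below is illustrative):
# `is not None` is resolved by Python before SQLAlchemy can intercept it:
print TokenizedLabel.tweet is not None         # True -- a plain Python bool
# != and isnot() are overloaded to build SQL expressions instead:
print str(TokenizedLabel.tweet != None)        # tokenized_labels.tweet IS NOT NULL
print str(TokenizedLabel.tweet.isnot(None))    # tokenized_labels.tweet IS NOT NULL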
def default(cls, feature_functions, retrain=False, limit=10000):
    # Is it messy to have this method here, since it depends on tweedr.models.*
    # and on a specific filepath in the local filesystem?
    model_filepath = '/tmp/tweedr.ml.crf.classifier-max%d.model' % limit
    if os.path.exists(model_filepath) and not retrain:
        return cls.from_file(model_filepath)
    else:
        from tweedr.models import DBSession, TokenizedLabel
        # use the `limit` argument rather than a hard-coded 10000, so the
        # cache filename and the training set size stay in sync
        query = DBSession.query(TokenizedLabel).limit(limit)
        crf = cls.from_data(query, feature_functions)
        crf.save(model_filepath)
        return crf
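# A hypothetical call site (the import path for crf_feature_functions is an
# assumption):
from tweedr.ml.features import crf_feature_functions

# Load the cached model from /tmp if present, otherwise train on up to 10000
# TokenizedLabel rows and cache the result:
tagger = CRF.default(crf_feature_functions)

# Ignore any cached file and retrain on a smaller sample:
tagger = CRF.default(crf_feature_functions, retrain=True, limit=1000)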
def main():
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--k-folds', type=int, default=10,
                        help='How many folds of the data to test on')
    parser.add_argument('--max-data', type=int, default=10000,
                        help='Maximum data points to train and test on')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    #   <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes. Materialize the query once so every
    # feature subset trains on the same rows.
    items = DBSession.query(TokenizedLabel).limit(opts.max_data).all()

    categories = dict((label.id, label.text) for label in DBSession.query(Label))
    print 'categories', categories

    # evaluate every subset of the feature functions (the full power set)
    for L in range(0, len(crf_feature_functions) + 1):
        for subset in itertools.combinations(crf_feature_functions, L):
            sub = list(subset)
            print sub
            X_y = ((featurize(item.tokens, sub), item.labels) for item in items)
            # unzip into static lists: X is read once per fold
            X, y = zip(*X_y)
            X = map(flatMap, X)

            N = len(y)
            # k-fold cross-validation on each subset (k defaults to 10)
            for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True):
                train_X = [X[i] for i in train_indices]
                train_y = [y[i] for i in train_indices]
                test_X = [X[i] for i in test_indices]
                test_y = [y[i] for i in test_indices]
                classifier = CRF()
                evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y)
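# The two nested loops above enumerate the power set of crf_feature_functions
# (2**n subsets, including the empty set), so runtime grows exponentially with
# the number of feature functions. A minimal demonstration of the pattern:
import itertools

fns = ['unigrams', 'pos_tags', 'gazetteer']  # stand-ins for feature functions
subsets = [list(s)
           for L in range(0, len(fns) + 1)
           for s in itertools.combinations(fns, L)]
print len(subsets)  # 2 ** 3 == 8, from [] up to the full set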
def tokenized_labels_sample():
    total = DBSession.query(TokenizedLabel).count()
    index = random.randrange(total)
    logger.debug('/tokenized_labels/sample: choosing #%d out of %d', index, total)
    tokenized_label = DBSession.query(TokenizedLabel).offset(index).limit(1).first()
    return tokenized_label.__json__()
def count():
    print >> sys.stderr, 'Tweet count started.'
    print 'There are %d labels in the database.' % DBSession.query(Label).count()
    print 'There are %d tokenized labels in the database.' % DBSession.query(TokenizedLabel).count()
def __init__(self):
    labeled_tweets = np.array(
        DBSession.query(DamageClassification)
        .filter(DamageClassification.mturk_code == 'QCRI')
        .limit(1000)
        .all())
    # reduce each row to a (text, integer label) pair
    labeled_tweets = map(lambda x: (x.text, int(x.label)), labeled_tweets)
    self.dataset = labeled_tweets