Example #1
def main():
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k',
                        '--k-folds',
                        type=int,
                        default=10,
                        help='How many folds of the data to test on')
    parser.add_argument('--max-data',
                        type=int,
                        default=10000,
                        help='Maximum data points to train and test on')
    parser.add_argument('--include-none',
                        type=int,
                        default=0,
                        help='Include None in Confusion Matrix.')
    parser.add_argument('--threshold',
                        type=int,
                        default=10,
                        help='Threshold for number of gold labels classified.')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.
    query = DBSession.query(TokenizedLabel).limit(opts.max_data)
    X_y = ((featurize(item.tokens, crf_feature_functions), item.labels)
           for item in query)
    # unzip and flatten into static list
    X, y = zip(*X_y)
    # we need to read X multiple times, so make sure it's all static
    X = map(flatMap, X)

    categories = dict(
        (label.id, label.text) for label in DBSession.query(Label))
    print 'categories', categories

    N = len(y)
    index = 0
    for train_indices, test_indices in cross_validation.KFold(N,
                                                              opts.k_folds,
                                                              shuffle=True):
        # train, test = tokenized_labels[train_indices], tokenized_labels[test_indices]
        train_X = [X[i] for i in train_indices]
        train_y = [y[i] for i in train_indices]
        test_X = [X[i] for i in test_indices]
        test_y = [y[i] for i in test_indices]
        classifier = CRF()
        # print_gloss=True
        index = index + 1
        evaluateSequenceClassifier(classifier, train_X, train_y, test_X,
                                   test_y, index, opts)
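featurize, flatMap, and crf_feature_functions are imported from elsewhere in tweedr and are not shown on this page. Purely as an illustration of the shape of the data that flows into zip(*X_y) above, and not tweedr's actual implementation, a feature function in this style maps a token sequence to one feature list per token:

# Illustrative sketch only -- names and behavior are assumptions, not tweedr code.
def is_capitalized(tokens):
    # emit one feature list per token
    return [['capitalized'] if token[:1].isupper() else [] for token in tokens]

def featurize_sketch(tokens, feature_functions):
    # merge the per-token features produced by each feature function
    features = [[] for _ in tokens]
    for f in feature_functions:
        for token_features, new in zip(features, f(tokens)):
            token_features.extend(new)
    return features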
Example #2
def test_tweedr_models_no_ci():
    from tweedr.models import DBSession, TokenizedLabel, Label

    Tables = [TokenizedLabel, Label]
    for Table in Tables:
        row_count = DBSession.query(Table).count()
        assert row_count > 0, 'There should be more than 0 rows in the table "%s"' % Table.__name__
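The _no_ci suffix presumably marks this test as one to skip on continuous integration, since it needs a live database. If you wanted that guard to be explicit rather than a naming convention, one sketch (pytest and the DATABASE_URL variable are assumptions, not part of tweedr's setup):

# Sketch: skip explicitly when no database is configured (assumes pytest).
import os
import pytest

@pytest.mark.skipif('DATABASE_URL' not in os.environ,
                    reason='requires a live QCRI MySQL database')
def test_tweedr_models():
    from tweedr.models import DBSession, TokenizedLabel, Label
    for Table in [TokenizedLabel, Label]:
        assert DBSession.query(Table).count() > 0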
Example #3
File: example.py Project: Priya22/tweedr
def first(limit):
    print >> sys.stderr, 'First %d tweets.' % limit
    for tokenized_label in DBSession.query(TokenizedLabel).limit(limit):
        # print repr(tokenized_label)
        tokenized_label_text = unicode(tokenized_label).translate(
            whitespace_unicode_translations).encode('utf8')
        token_type_object = tokenized_label.token_type_object
        print token_type_object.id, '\t', token_type_object.text, '\t', tokenized_label_text
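whitespace_unicode_translations is defined elsewhere in the project; judging from its use with unicode.translate() here, it maps whitespace code points to something that keeps each tweet on one output line. A minimal sketch of such a table (the exact contents are an assumption):

# Sketch: collapse tabs and newlines to spaces before printing.
# In Python 2, unicode.translate() takes a dict keyed by code point.
whitespace_unicode_translations = {ord(u'\t'): u' ',
                                   ord(u'\n'): u' ',
                                   ord(u'\r'): u' '}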
Example #4
def read_tweets():
    '''Read labeled tweets from database'''

    logger.info('Reading labeled tweets from database...')
    labeled_tweets = np.array(
        DBSession.query(DamageClassification)
        .filter(DamageClassification.mturk_code == 'QCRI')
        .limit(opts.max_data).all())
    logger.info('Read %d tweets', len(labeled_tweets))
    return labeled_tweets
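read_tweets() returns the ORM rows themselves; downstream code usually wants (text, label) pairs, as in Example #10 below:

# Convert rows to (text, label) tuples, mirroring Example #10.
pairs = [(t.text, int(t.label)) for t in read_tweets()]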
Example #5
File: evaluate.py Project: Priya22/tweedr
def main():
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k',
                        '--k-folds',
                        type=int,
                        default=10,
                        help='How many folds of the data to test on')
    parser.add_argument('--max-data',
                        type=int,
                        default=10000,
                        help='Maximum data points to train and test on')
    parser.add_argument(
        '--adjacent',
        type=int,
        default=0,
        help='Set adjacent to 1 if adjacent functions want to be used')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.
    # Note: `TokenizedLabel.tweet is not None` would evaluate to a plain Python
    # True and silently match every row; isnot() emits a SQL IS NOT NULL.
    query = DBSession.query(TokenizedLabel).\
        filter(TokenizedLabel.tweet.isnot(None)).\
        filter(TokenizedLabel.tweet != '').\
        limit(opts.max_data)
    if opts.adjacent == 0:
        X_y = ((featurize(item.tokens, crf_feature_functions), item.labels)
               for item in query)
    else:
        X_y = ((featurize_adjacent(item.tokens,
                                   crf_feature_functions), item.labels)
               for item in query)
    # unzip and flatten into static list
    X, y = zip(*X_y)
    # we need to read X multiple times, so make sure it's all static
    X = map(flatMap, X)

    N = len(y)
    for train_indices, test_indices in cross_validation.KFold(N,
                                                              opts.k_folds,
                                                              shuffle=True):
        # train, test = tokenized_labels[train_indices], tokenized_labels[test_indices]
        train_X = [X[i] for i in train_indices]
        train_y = [y[i] for i in train_indices]
        test_X = [X[i] for i in test_indices]
        test_y = [y[i] for i in test_indices]
        classifier = CRF()
        # print_gloss=True
        evaluateSequenceClassifier(classifier, train_X, train_y, test_X,
                                   test_y)
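featurize_adjacent is not shown on this page; the --adjacent help text implies it also draws features from neighboring tokens. A rough sketch of that idea (illustrative only, not tweedr's implementation):

# Illustrative sketch: extend each token's features with its neighbors',
# prefixed so the CRF can tell them apart.
def featurize_adjacent_sketch(tokens, feature_functions):
    features = featurize(tokens, feature_functions)
    augmented = []
    for i, token_features in enumerate(features):
        combined = list(token_features)
        if i > 0:
            combined += ['prev:' + f for f in features[i - 1]]
        if i + 1 < len(features):
            combined += ['next:' + f for f in features[i + 1]]
        augmented.append(combined)
    return augmented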
Example #6
    @classmethod
    def default(cls, feature_functions, retrain=False, limit=10000):
        # Is it messy to have this method here, since it depends on
        # tweedr.models.* and on a specific filepath in the local filesystem?
        model_filepath = '/tmp/tweedr.ml.crf.classifier-max%d.model' % limit
        if os.path.exists(model_filepath) and not retrain:
            # reuse the model cached for this limit
            return cls.from_file(model_filepath)
        else:
            from tweedr.models import DBSession, TokenizedLabel
            query = DBSession.query(TokenizedLabel).limit(limit)
            crf = cls.from_data(query, feature_functions)
            crf.save(model_filepath)
            return crf
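Assuming default() is defined on the same CRF class the other examples instantiate, the caching pattern would be used like this (the call site is a sketch, not shown in tweedr's code here):

# Sketch: load the model cached for this limit, or train and cache it.
classifier = CRF.default(crf_feature_functions, limit=10000)
# retrain=True bypasses the cache and refits from the database.
fresh = CRF.default(crf_feature_functions, retrain=True)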
Example #7
def main():
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--k-folds',
        type=int, default=10, help='How many folds of the data to test on')
    parser.add_argument('--max-data',
        type=int, default=10000, help='Maximum data points to train and test on')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.
    query = DBSession.query(TokenizedLabel).limit(opts.max_data)

    for L in range(0, len(crf_feature_functions) + 1):
        for subset in itertools.combinations(crf_feature_functions, L):
            sub = list(subset)
            print sub
            X_y = ((featurize(item.tokens, sub), item.labels) for item in query)
            # unzip and flatten into static list
            X, y = zip(*X_y)
            # we need to read X multiple times, so make sure it's all static
            X = map(flatMap, X)
            categories = dict((label.id, label.text) for label in DBSession.query(Label))
            print 'categories', categories

            N = len(y)
            # evaluate each feature subset with k-fold cross-validation (k defaults to 10)
            for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True):
                train_X = [X[i] for i in train_indices]
                train_y = [y[i] for i in train_indices]
                test_X = [X[i] for i in test_indices]
                test_y = [y[i] for i in test_indices]
                classifier = CRF()
                evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y)
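Note that this sweep runs k-fold cross-validation once per subset of crf_feature_functions, i.e. 2**n times, including the featureless empty subset at L == 0. A quick check of that count:

# The nested loops above enumerate every subset: sum of C(n, L) over L is 2**n.
import itertools
n = len(crf_feature_functions)
n_subsets = sum(1 for L in range(0, n + 1)
                for _ in itertools.combinations(range(n), L))
assert n_subsets == 2 ** n  # includes the empty subset at L == 0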
Example #8
File: crf.py Project: Priya22/tweedr
def tokenized_labels_sample():
    total = DBSession.query(TokenizedLabel).count()
    index = random.randrange(total)
    logger.debug('/tokenized_labels/sample: choosing #%d out of %d', index, total)
    tokenized_label = DBSession.query(TokenizedLabel).offset(index).limit(1).first()
    return tokenized_label.__json__()
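The offset() trick picks one row uniformly at random but makes MySQL skip past index rows first. Since these examples target MySQL, the pick can also be pushed into SQL; a sketch of an equivalent one-query version (ORDER BY RAND() still scans the table, so this is a convenience, not an optimization):

# Sketch: let MySQL choose the random row in a single query.
from sqlalchemy import func

def tokenized_labels_sample_sql():
    row = DBSession.query(TokenizedLabel).order_by(func.rand()).first()
    return row.__json__()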
Example #9
File: example.py Project: Priya22/tweedr
def count():
    print >> sys.stderr, 'Tweet count started.'
    label_count = DBSession.query(Label).count()
    tokenized_label_count = DBSession.query(TokenizedLabel).count()
    print 'There are %d labels in the database.' % label_count
    print 'There are %d tokenized labels in the database.' % tokenized_label_count
Example #10
    def __init__(self):
        # up to 1000 QCRI-labeled tweets, reduced to (text, label) pairs
        labeled_tweets = np.array(
            DBSession.query(DamageClassification)
            .filter(DamageClassification.mturk_code == 'QCRI')
            .limit(1000).all())
        labeled_tweets = map(lambda x: (x.text, int(x.label)), labeled_tweets)
        self.dataset = labeled_tweets