Example #1
import random

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


def test_classifier():
    print("Testing classifier on all labeled tweets")

    labeled_data = get_training_data()

    # based on the desired training size each round, estimate how many rounds
    # of random splits are needed to cover roughly all of the labeled data
    train_size = 0.9
    num_loops_for_full_coverage = int(1.0 / (1.0 - train_size))

    random.seed()
    accuracy_sum = 0
    for _ in range(num_loops_for_full_coverage):
        # generate a random state to make the splitting each round random
        rs = random.randint(1, 100)
        XTrain, XTest, yTrain, yTest = train_test_split(labeled_data.tweets,
                                                        labeled_data.labels,
                                                        train_size=train_size,
                                                        random_state=rs)

        pipeline = get_pipeline()

        pipeline.fit(XTrain, yTrain)
        results = pipeline.predict(XTest)

        accuracy = accuracy_score(yTest, results)
        # print("Accuracy:", accuracy)
        accuracy_sum += accuracy

    print "Classifier accuracy: %s%%" % (float(accuracy_sum) /
                                         num_loops_for_full_coverage * 100)
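
For context, the tweet examples (#1, #3, and #5) rely on project-local helpers, get_training_data() and get_pipeline(), that are not shown in this listing. The sketch below is an assumption about their shape, inferred from how the examples call them: the LabeledData container name and the TF-IDF plus Naive Bayes pipeline are illustrative choices, not the original implementation.

# Hypothetical sketch of the helpers used above; names and model choice are
# assumptions inferred from how the examples call them.
from collections import namedtuple

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

LabeledData = namedtuple("LabeledData", ["tweets", "labels"])


def get_training_data():
    # placeholder; the real project loads many labeled tweets from disk
    return LabeledData(
        tweets=["Hurricane damage reported downtown", "Nice weather today"],
        labels=["+", "-"],
    )


def get_pipeline():
    # a plain bag-of-words text classifier; the original pipeline may differ
    return Pipeline([
        ("vectorizer", TfidfVectorizer()),
        ("classifier", MultinomialNB()),
    ])
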
Example #2
    def train_model(self):
        """Creates and trains a CatBoost algorithm on the sample query data."""
        labeled_data = get_training_data()

        features = [self.get_features(query) for query in labeled_data.keys()]
        intents = [intent.name for intent in labeled_data.values()]

        self.model.fit(features, intents)
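
Example #2 assumes that self.model already exists and that get_training_data() returns a mapping from query strings to intent objects exposing a .name attribute. The sketch below shows one plausible owner class for the method; the IntentClassifier name, the feature extraction, and the CatBoost parameters are illustrative assumptions, not the original code.

# Hypothetical owner of train_model(); names and settings are assumptions.
from catboost import CatBoostClassifier


class IntentClassifier:
    def __init__(self):
        # CatBoost settings here are illustrative, not the original values
        self.model = CatBoostClassifier(iterations=100, verbose=False)

    def get_features(self, query):
        # stand-in features; the real project likely extracts richer ones
        return [len(query), len(query.split())]

    # train_model() from Example #2 would be defined here


# usage sketch
clf = IntentClassifier()
# clf.train_model()
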
Example #3
def classify(tweets):
    training_data = get_training_data()
    testing_tweets = [tweet.text for tweet in tweets]

    pipeline = get_pipeline()

    pipeline.fit(training_data.tweets, training_data.labels)
    results = pipeline.predict(testing_tweets)

    relevant_tweets = []
    for index, result in enumerate(results):
        # print(result, testing_tweets[index])
        if result == "+":
            relevant_tweets.append(tweets[index])

    return relevant_tweets
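
A minimal usage sketch for classify(), assuming the incoming tweets are objects that expose a .text attribute; the SimpleTweet container and the sample texts below are illustrative only.

# Hypothetical usage; the real tweet objects likely carry more fields.
from collections import namedtuple

SimpleTweet = namedtuple("SimpleTweet", ["text"])

incoming = [
    SimpleTweet("Flooding reported after the hurricane"),
    SimpleTweet("Great day at the beach"),
]
relevant = classify(incoming)
for tweet in relevant:
    print(tweet.text)
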
Example #4
    def get_compound_strings(self) -> List[str]:
        # toggle to treat each individual example as its own document,
        # instead of compounding the examples per intent, to compare which
        # representation works better
        use_individual = False

        if use_individual:
            data = get_training_data()
            return self.clean_strings(query for query, intent in data.items())

        examples = defaultdict(list)

        with open("query.txt") as infile:
            lines = infile.readlines()

        for line in lines:
            # each line is "|"-delimited; field 1 holds the query text and
            # field 3 holds the intent name
            components = line.strip().split("|")

            query = components[1]
            intent = components[3]

            examples[intent].append(query)

        strings = [" ".join(queries) for queries in examples.values()]

        return self.clean_strings(strings)
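
get_compound_strings() expects query.txt to be "|"-delimited, with the query text in field 1 and the intent name in field 3; the other fields are not used here and their meaning is unknown. The line below is a made-up illustration of that assumed layout, not real project data.

# Hypothetical illustration of the assumed query.txt layout; only fields 1
# (query) and 3 (intent) are read, the other columns' contents are unknown.
sample_line = "17|what is the weather tomorrow|en|get_weather"
components = sample_line.strip().split("|")
print(components[1])  # what is the weather tomorrow
print(components[3])  # get_weather
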
Example #5
import ast
import json
import os
import re
import string
import sys


def filter(dataset_filename):
    if not os.path.isfile(dataset_filename):
        print("Dataset does not exist:", dataset_filename)
        sys.exit(1)

    training_tweets = set(get_training_data().tweets)

    filtered_tweets = []

    # build the hurricane regular expression from the defined keywords by
    # joining the word-bounded keywords with the regex "or" operator
    hurricane_re_str = '|'.join(r'\b%s\b' % keyword
                                for keyword in disaster_keywords['hurricane'])
    hurricane_re = re.compile(hurricane_re_str, re.IGNORECASE)

    count = 0
    num_tweets_with_errors = 0
    for line in open(dataset_filename, 'r'):
        try:
            # the file may contain either Python-literal dicts or JSON
            # objects, so try ast.literal_eval first and fall back to
            # json.loads
            try:
                tweetdict = ast.literal_eval(line)
            except ValueError:
                tweetdict = json.loads(line)

            tweet_text = tweetdict['text'].replace('\n', ' ')

            if hurricane_re.findall(tweet_text):
                # ensure the model isn't biased by including tweets from the training data in the test data set
                if tweet_text not in training_tweets:
                    # replace urls with <url>
                    # tweet_text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '<url>', tweet_text)

                    # ensure the tweet contains only printable characters
                    tweet_text = ''.join(
                        [c for c in tweet_text if c in string.printable])
                    filtered_tweets.append(
                        Tweet(tweet_text, tweetdict['created_at'],
                              tweetdict['geo']))
                    count += 1

        except (SyntaxError, ValueError):
            # count lines that neither parser could handle
            num_tweets_with_errors += 1

        # if count >= 400:
        # break

    if num_tweets_with_errors > 0:
        print("Number of tweets unable to be parsed:", num_tweets_with_errors)

    print('Number of hurricane tweets after filtering:', len(filtered_tweets))

    return filtered_tweets
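
filter() depends on a project-level disaster_keywords mapping and a Tweet container that are not shown in this listing. The sketch below shows how they might look, with the keyword list and the field names as illustrative assumptions inferred from how filter() uses them.

# Hypothetical definitions; the actual keyword list and Tweet fields in the
# original project may differ.
from collections import namedtuple

Tweet = namedtuple("Tweet", ["text", "created_at", "geo"])

disaster_keywords = {
    "hurricane": ["hurricane", "storm surge", "evacuation"],
}

# usage sketch
# hurricane_tweets = filter("hurricane_dataset.json")
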