import random

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


def test_classifier():
    print("Testing classifier on all labeled tweets")
    labeled_data = get_training_data()
    # based on the desired training size each round, calculate how many rounds
    # of accuracy testing are needed to approximate full coverage of the test
    # data (approximate, since each round's split is random rather than disjoint)
    train_size = 0.9
    num_loops_for_full_coverage = int(1.0 / (1.0 - train_size))
    random.seed()
    accuracy_sum = 0
    for i in range(num_loops_for_full_coverage):
        # generate a random state to make the splitting each round random
        rs = random.randint(1, 100)
        XTrain, XTest, yTrain, yTest = train_test_split(
            labeled_data.tweets, labeled_data.labels,
            train_size=train_size, random_state=rs)
        pipeline = get_pipeline()
        pipeline.fit(XTrain, yTrain)
        results = pipeline.predict(XTest)
        # accuracy_score takes (y_true, y_pred)
        accuracy = accuracy_score(yTest, results)
        # print("Accuracy:", accuracy)
        accuracy_sum += accuracy
    print("Classifier accuracy: %s%%" %
          (float(accuracy_sum) / num_loops_for_full_coverage * 100))
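# get_pipeline() is used above but not shown; a minimal sketch of what it
# might look like, assuming a standard scikit-learn bag-of-words setup.
# The vectorizer and classifier choices here are illustrative assumptions,
# not the project's confirmed configuration.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


def get_pipeline():
    # vectorize the raw tweet text, then classify; any estimator exposing
    # fit/predict would slot in as the final step
    return Pipeline([
        ('vectorizer', TfidfVectorizer()),
        ('classifier', MultinomialNB()),
    ])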
def train_model(self):
    """Creates and trains a CatBoost algorithm on the sample query data."""
    labeled_data = get_training_data()
    features = [self.get_features(query) for query in labeled_data.keys()]
    intents = [intent.name for intent in labeled_data.values()]
    self.model.fit(features, intents)
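# The class that owns train_model() is not shown; a minimal sketch of how
# self.model and self.get_features might be set up, assuming the catboost
# package's CatBoostClassifier. The hyperparameters and the placeholder
# featurizer are illustrative assumptions, not the project's settings.
from catboost import CatBoostClassifier


class IntentClassifier:
    def __init__(self):
        # multiclass classifier over the intent names produced by train_model
        self.model = CatBoostClassifier(iterations=200, verbose=False)

    def get_features(self, query):
        # placeholder featurizer; the real feature extraction isn't shown here
        return [len(query), query.count(' ')]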
def classify(tweets):
    training_data = get_training_data()
    testing_tweets = [tweet.text for tweet in tweets]
    pipeline = get_pipeline()
    pipeline.fit(training_data.tweets, training_data.labels)
    results = pipeline.predict(testing_tweets)
    relevant_tweets = []
    for index, result in enumerate(results):
        # print(result, testing_tweets[index])
        if result == "+":
            relevant_tweets.append(tweets[index])
    return relevant_tweets
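# A quick usage sketch for classify(), assuming the Tweet(text, created_at,
# geo) structure that filter() below constructs; the sample text is
# fabricated purely for illustration.
def demo_classify():
    sample = [Tweet('flooding reported downtown', None, None),
              Tweet('great game last night', None, None)]
    # only tweets the trained pipeline labels "+" (relevant) come back
    for tweet in classify(sample):
        print(tweet.text)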
from collections import defaultdict
from typing import List


def get_compound_strings(self) -> List[str]:
    # flag to check whether using the individual examples as the documents,
    # instead of compounding them per intent, is any better
    use_individual = False
    if use_individual:
        # get_training_data() returns a query -> intent mapping (see train_model)
        data = get_training_data()
        return self.clean_strings(query for query, intent in data.items())
    examples = defaultdict(list)
    with open("query.txt") as infile:
        for line in infile:
            components = line.split("|")
            query = components[1]
            intent = components[3]
            examples[intent].append(query)
    strings = [" ".join(queries) for queries in examples.values()]
    return self.clean_strings(strings)
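# get_compound_strings() assumes a pipe-delimited query.txt layout where
# field 1 holds the query text and field 3 the intent name; the other fields
# in this sketch are hypothetical placeholders, since their meaning isn't
# shown anywhere above.
def demo_query_line():
    line = "42|what's the weather tomorrow|0.93|weather"
    components = line.split("|")
    # -> ("what's the weather tomorrow", "weather")
    return components[1], components[3]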
import ast
import json
import os
import re
import string
import sys


def filter(dataset_filename):
    if not os.path.isfile(dataset_filename):
        print("Dataset does not exist:", dataset_filename)
        sys.exit(1)
    training_tweets = set(get_training_data().tweets)
    filtered_tweets = []
    # build up the disaster regular expression based on the defined keywords,
    # joining each word-bounded keyword with the "or" regex operator
    hurricane_re_str = '|'.join(r'\b%s\b' % keyword
                                for keyword in disaster_keywords['hurricane'])
    hurricane_re = re.compile(hurricane_re_str, re.IGNORECASE)
    count = 0
    num_tweets_with_errors = 0
    for line in open(dataset_filename, 'r'):
        try:
            # try reading the line two different ways depending on the json format
            try:
                tweetdict = ast.literal_eval(line)
            except ValueError:
                tweetdict = json.loads(line)
            tweet_text = tweetdict['text'].replace('\n', ' ')
            if hurricane_re.findall(tweet_text):
                # ensure the model isn't biased by including tweets from the
                # training data in the test data set
                if tweet_text not in training_tweets:
                    # replace urls with <url>
                    # tweet_text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '<url>', tweet_text)
                    # ensure the tweet contains only printable characters
                    tweet_text = ''.join(
                        c for c in tweet_text if c in string.printable)
                    filtered_tweets.append(
                        Tweet(tweet_text, tweetdict['created_at'],
                              tweetdict['geo']))
                    count += 1
        # count lines that neither parser could read instead of crashing
        # (json.loads raises ValueError, ast.literal_eval raises SyntaxError)
        except (SyntaxError, ValueError):
            num_tweets_with_errors += 1
        # if count >= 400:
        #     break
    if num_tweets_with_errors > 0:
        print("Number of tweets unable to be parsed:", num_tweets_with_errors)
    print('Number of hurricane tweets after filtering:', len(filtered_tweets))
    return filtered_tweets
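# filter() relies on two names defined elsewhere in the project; a minimal
# sketch of what they might look like. The namedtuple fields follow the
# Tweet(...) call above, and the keyword list is an illustrative guess, not
# the project's actual keyword set.
from collections import namedtuple

Tweet = namedtuple('Tweet', ['text', 'created_at', 'geo'])

disaster_keywords = {
    # word-bounded terms that mark a tweet as hurricane-related
    'hurricane': ['hurricane', 'storm surge', 'evacuation'],
}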