import time

from nltk.classify import SklearnClassifier
from nltk.classify.util import accuracy
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC


def searchSGDClassifier_classifier(title, train_departments):
    """Classify a title with an SGD classifier trained on department data.

    :param title: text to classify
    :param train_departments: list of (feature dict, label) training pairs
    :return: [label, probability, accuracy, classify time, training time]
    """
    # Train the classifier and measure how long training takes.
    timeTraining = time.time()
    # loss='log' gives logistic regression, so prob_classify() is available.
    # (Newer scikit-learn versions spell this loss 'log_loss'.)
    classifier = SklearnClassifier(SGDClassifier(loss='log'))
    classifier.train(train_departments)
    timeTraining = time.time() - timeTraining

    # word_feats() is assumed to be defined elsewhere in the module.
    test_sent_features = word_feats(title)
    timeClassify = time.time()
    found_department = classifier.classify(test_sent_features)
    timeClassify = time.time() - timeClassify

    probability = classifier.prob_classify(test_sent_features)
    print(probability.prob(found_department))
    return [
        found_department,
        probability.prob(found_department),
        # Note: this measures accuracy on a slice of the training data,
        # not on a held-out test set.
        accuracy(classifier, train_departments[1000:]),
        timeClassify,
        timeTraining,
    ]

def searchLinearSVC(title, train_departments):
    """Classify a title with a linear SVC trained on department data.

    :param title: text to classify
    :param train_departments: list of (feature dict, label) training pairs
    :return: [label, probability, accuracy, classify time, training time]
    """
    timeTraining = time.time()
    # LinearSVC does not support predict_proba(), so SVC with a linear
    # kernel and probability=True is used instead to keep prob_classify()
    # working.
    # classifier = SklearnClassifier(LinearSVC(probability=True))
    classifier = SklearnClassifier(SVC(kernel='linear', probability=True))
    classifier.train(train_departments)
    timeTraining = time.time() - timeTraining

    test_sent_features = word_feats(title)
    timeClassify = time.time()
    found_department = classifier.classify(test_sent_features)
    timeClassify = time.time() - timeClassify

    probability = classifier.prob_classify(test_sent_features)
    print(probability.prob(found_department))
    return [
        found_department,
        probability.prob(found_department),
        accuracy(classifier, train_departments[1000:]),
        timeClassify,
        timeTraining,
    ]

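# A minimal usage sketch for the two search functions above. The
# bag-of-words word_feats() below is a hypothetical stand-in (the real
# implementation lives elsewhere in the original project), and the toy
# training set is repeated so the train_departments[1000:] accuracy slice
# is non-empty.
def word_feats(text):
    # Hypothetical extractor: mark each lowercase token as present.
    return {word: True for word in text.lower().split()}


train_departments = [
    (word_feats('refund for my order'), 'billing'),
    (word_feats('cannot reset my password'), 'support'),
    (word_feats('the invoice amount is wrong'), 'billing'),
    (word_feats('the app crashes on start'), 'support'),
] * 300  # 1200 labelled examples

print(searchSGDClassifier_classifier('password reset request', train_departments))
print(searchLinearSVC('wrong invoice', train_departments))
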
import nltk


def LG_gender(train_set, test_set):
    """Train a scikit-learn logistic regression (MaxEnt) gender classifier
    and print its label probabilities and test-set accuracy."""
    print('== SkLearn MaxEnt ==')
    from nltk.classify import SklearnClassifier
    from sklearn.linear_model import LogisticRegression
    # A very large C effectively disables regularisation.
    sklearn_classifier = SklearnClassifier(
        LogisticRegression(C=10e5)).train(train_set)
    # gender_features() is assumed to be defined elsewhere; _prob_dict is a
    # private attribute of the returned distribution, peeked at here for
    # debugging (prob_classify(...).prob(label) is the public API).
    print(sklearn_classifier.prob_classify(gender_features('mark'))._prob_dict)
    print(nltk.classify.accuracy(sklearn_classifier, test_set))

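# A minimal sketch of driving LG_gender(), assuming the NLTK names corpus
# has been downloaded (nltk.download('names')) and using a simple
# hypothetical gender_features() extractor; neither is part of the
# original code.
import random
from nltk.corpus import names


def gender_features(name):
    # Hypothetical extractor: the last letter of a name is a strong cue.
    return {'last_letter': name[-1].lower()}


labeled_names = ([(n, 'male') for n in names.words('male.txt')] +
                 [(n, 'female') for n in names.words('female.txt')])
random.shuffle(labeled_names)
featuresets = [(gender_features(n), g) for (n, g) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
LG_gender(train_set, test_set)
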
import random

from nltk.classify import SklearnClassifier
from sklearn.svm import SVC


def run_cv(featureset, k=100):
    """Run k-fold cross-validation over (features, 'R'/'D') pairs and
    return the list of per-fold accuracy percentages."""
    random.shuffle(featureset)
    trainset = featureset  # all folds are drawn from the full shuffled set
    subSize = int(len(trainset) / k)
    perc = []
    for i in range(k):
        print("Testing slice " + str(i) + "...")
        correct = 0
        # Fold i is the test slice; everything else is training data.
        test = trainset[i * subSize:][:subSize]
        train = trainset[:i * subSize] + trainset[(i + 1) * subSize:]
        # probability=True is required, since prob_classify() relies on
        # the underlying estimator's predict_proba().
        the_classifier = SklearnClassifier(SVC(probability=True),
                                           sparse=False).train(train)
        for item in test:
            # Classify once and compare the two class probabilities.
            dist = the_classifier.prob_classify(item[0])
            choice = 'R' if dist.prob('R') > dist.prob('D') else 'D'
            if choice == item[1]:
                correct += 1
        perc.append((correct / len(test)) * 100)
    return perc

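# A minimal sketch of calling run_cv(). The bag_of_words() extractor and
# the toy speech data are assumptions, not part of the original code; the
# toy set is repeated so every fold is non-empty.
def bag_of_words(text):
    return {word: True for word in text.lower().split()}


speeches = [
    ('cut taxes and shrink government', 'R'),
    ('strong national defense and border security', 'R'),
    ('expand healthcare for working families', 'D'),
    ('invest in clean energy and public schools', 'D'),
] * 50  # 200 labelled examples

featureset = [(bag_of_words(text), party) for text, party in speeches]
fold_scores = run_cv(featureset, k=10)
print(sum(fold_scores) / len(fold_scores))
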
import json
import os
import pickle

import numpy
from nltk.classify import SklearnClassifier
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline


class RandomForestCascadeClassifier:
    """Random-forest classifier over retweet cascades, combining
    bag-of-words features of the root tweet with structural features of
    the first k cascade nodes."""

    def __init__(self, dataset, k, user_followers=True, users_reachable=True,
                 average_time=True, time_to_k=True):
        self.k = k
        self._twtokenize = TweetTokenizer(strip_handles=True)
        self._dataset = dataset
        self._user_followers = user_followers
        self._users_reachable = users_reachable
        self._average_time = average_time
        self._time_to_k = time_to_k
        self._stopwords = stopwords.words('english')
        self._stemmer = PorterStemmer()
        # Per-feature accumulators, kept so classify_cascades() can print
        # summary statistics afterwards.
        self._f_count = []
        self._r_count = []
        self._rt_count = []
        self._avg = []
        self._time = []
        self._train()

    def _tokenize(self, tweet_text):
        return [
            self._stemmer.stem(token)
            for token in self._twtokenize.tokenize(tweet_text)
            if token not in self._stopwords
        ]

    def _sorted_cascade_nodes(self, cascade):
        # Cascade nodes are keyed by (string) position; sort numerically.
        nodes = cascade['cascade']
        cascade_nodes = [(int(key), nodes[key]) for key in nodes.keys()]
        return sorted(cascade_nodes, key=lambda x: x[0])

    def _tweet_length_feature(self, cascade):
        return len(cascade['root_tweet']['text'])

    def _user_followers_feature(self, cascade):
        followers = cascade['root_tweet']['user']['followers_count']
        self._f_count.append(followers)
        return followers

    def _users_reachable_feature(self, nodes):
        # Total audience reachable through the first k+1 nodes.
        reachable = 0
        for kth, node in zip(range(self.k + 1), nodes):
            reachable += node[1]['user_followees_count']
        self._r_count.append(reachable)
        return reachable

    def _average_time_feature(self, nodes):
        # Mean gap between the first k+1 node timestamps; the timestamps
        # are in milliseconds, so divide by 1000 for seconds.
        timestamps = [
            int(node[1]['created_at'])
            for kth, node in zip(range(self.k + 1), nodes)
        ]
        average = (sum(numpy.diff(timestamps)) / float(len(timestamps))) / 1000
        self._avg.append(average)
        return average

    def _users_retweet_feature(self, cascade):
        retweets = cascade['root_tweet']['retweet_count']
        self._rt_count.append(retweets)
        return retweets

    def _time_to_k_feature(self, nodes):
        # Seconds elapsed between the first node and the k-th node
        # (or the last node, if the cascade is shorter than k).
        first = int(nodes[0][1]['created_at'])
        kth_node = nodes[min(self.k, len(nodes) - 1)]
        kth = int(kth_node[1]['created_at'])
        diff = (kth - first) / 1000
        self._time.append(diff)
        return diff

    def _extract_features(self, cascade):
        # Bag-of-words features are only extracted for English root tweets.
        if cascade['root_tweet']['lang'] == 'en':
            tweet_tokens = self._tokenize(cascade['root_tweet']['text'])
            features = {
                "contains({0})".format(token): True
                for token in tweet_tokens
            }
        else:
            features = {}
        features['tweet_length'] = self._tweet_length_feature(cascade)
        # features['rtweet'] = self._users_retweet_feature(cascade)
        if self._user_followers:
            features["user_followers"] = self._user_followers_feature(cascade)
        cascade_nodes = self._sorted_cascade_nodes(cascade)
        if self._users_reachable:
            features['reachable'] = self._users_reachable_feature(
                cascade_nodes)
        if self._average_time:
            features['average'] = self._average_time_feature(cascade_nodes)
        if self._time_to_k:
            features['timetok'] = self._time_to_k_feature(cascade_nodes)
        return features

    def _train(self):
        # Load a previously trained classifier if one was pickled;
        # otherwise train a tf-idf -> chi2 feature selection -> random
        # forest pipeline and pickle the result. The with-statements close
        # the files automatically.
        pickle_filename = "{0}.pickle".format(self.__class__.__name__)
        if os.path.isfile(pickle_filename):
            with open(pickle_filename, "rb") as classifier_f:
                self._classifier = pickle.load(classifier_f)
        else:
            train_set = [(self._extract_features(cascade), cascade['label'])
                         for cascade in self._dataset]
            pipeline = Pipeline([('tfidf', TfidfTransformer()),
                                 ('chi2', SelectKBest(chi2, k=1000)),
                                 ('rf', RandomForestClassifier(
                                     n_estimators=1000))])
            self._classifier = SklearnClassifier(
                pipeline, sparse=False).train(train_set)
            with open(pickle_filename, "wb") as save_classifier:
                pickle.dump(self._classifier, save_classifier)

    def classify(self, cascade):
        features = self._extract_features(cascade)
        return self._classifier.classify(features)

    def classify_prob(self, cascade):
        features = self._extract_features(cascade)
        result = self._classifier.prob_classify(features)
        return {"positive": result.prob(True), "negative": result.prob(False)}

    def _metrics(self, results):
        print(
            metrics.classification_report(results['actual'],
                                          results['prediction']))

    def classify_cascades(self, test_dataset):
        results = {"prediction": [], "actual": []}
        for cascade in test_dataset:
            result = self.classify(cascade)
            actual = cascade['label']
            results["prediction"].append(result)
            results["actual"].append(actual)
        self._metrics(results)
        # Summary statistics for each accumulated feature.
        print("Followers - Average: {0}, Median: {1}, Std: {2}".format(
            numpy.average(self._f_count), numpy.median(self._f_count),
            numpy.std(self._f_count)))
        print("Reachable - Average: {0}, Median: {1}, Std: {2}".format(
            numpy.average(self._r_count), numpy.median(self._r_count),
            numpy.std(self._r_count)))
        print("Avg gap - Average: {0}, Median: {1}, Std: {2}".format(
            numpy.average(self._avg), numpy.median(self._avg),
            numpy.std(self._avg)))
        print("Time to k - Average: {0}, Median: {1}, Std: {2}".format(
            numpy.average(self._time), numpy.median(self._time),
            numpy.std(self._time)))

    def classify_cascades_prob_export(self, test_dataset):
        export = "dataset/" + self.__class__.__name__ + "_results.json"
        results = {}
        for cascade in test_dataset:
            results[cascade['url']] = self.classify_prob(cascade)
        with open(export, 'w') as export_file:
            export_file.write(json.dumps(results))

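# A minimal sketch of driving the cascade classifier, assuming JSON files
# of cascades shaped like the fields accessed above ('root_tweet',
# 'cascade', 'label', 'url'); the file paths and dataset layout are
# assumptions, not part of the original code.
with open('dataset/cascades_train.json') as f:
    train_cascades = json.load(f)
with open('dataset/cascades_test.json') as f:
    test_cascades = json.load(f)

# Train (or unpickle) a classifier that looks at the first 10 cascade nodes.
clf = RandomForestCascadeClassifier(train_cascades, k=10)
clf.classify_cascades(test_cascades)
clf.classify_cascades_prob_export(test_cascades)
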