def test_1NN(digitsdat, selected, all_test_m):
    for testm in all_test_m:
        classifier = Classifier()
        classifier.build_model(digitsdat.X[selected[0:testm], :],
                               digitsdat.y[selected[0:testm]])
        print("m=%d error=%f" % (
            testm, classifier.classify(digitsdat.testX, digitsdat.testy)))
class TagCountBolt(Bolt):
    outputs = ['cls', 'tag', 'date', 'hour']

    def initialize(self, conf, ctx):
        self.counter = 0
        self.pid = os.getpid()
        self.total = 0
        self.classifier = Classifier()
        self.directory = str(os.getcwd()) + "/Tweet_Images"
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)
            # self.logger.info("------CREATED FOLDER--------")

    def _increment(self, word, inc_by):
        self.counter[word] += inc_by
        self.total += inc_by

    def process(self, tup):
        data = json.loads(tup.values[0].encode('utf-8'))
        self.logger.info(data)
        if 'img_url' in data:
            path = "{}/{}.jpg".format(self.directory, self.counter)
            try:
                urllib.urlretrieve(data['img_url'], path)
                self.counter = self.counter + 1
                self.classifier.load_image(path)
                predicted_class = self.classifier.classify()
                # self.logger.info("\n [INFO_BOLT_PREDICTION] : " + " ".join(predicted_class))
                if len(data['hash']) > 0:
                    tags = [str(li['text']) for li in data['hash']
                            if li['text'][0:1] != "\\"]
                    # self.logger.info("\n [INFO_BOLT_TAGS] : " + " ".join(tags))
                    now = datetime.datetime.now()
                    now_date = "{:04}-{:02}-{:02}".format(now.year, now.month, now.day)
                    for cls in predicted_class:
                        if len(tags) > 0:
                            for tag in tags:
                                self.emit([cls, tag, now_date, str(now.hour)])
                                # self.logger.info("{0}/{1}".format(cls, tag))
                os.remove(path)
            except (KeyboardInterrupt, Exception) as exc:
                self.logger.info(exc)
        else:
            self.logger.info("NO IMG URL!!!")
            # self.logger.info(json.dumps(data))
        if self.counter % 10 == 0:
            self.logger.info("Processed [{:,}] tweets".format(self.counter))
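# Hypothetical topology sketch: the bolt above follows the streamparse Bolt
# API (outputs / initialize / process / emit), so it would typically be wired
# into a topology roughly like this. TweetSpout and both module paths are
# placeholder names, not part of the original code.
from streamparse import Topology

from spouts.tweets import TweetSpout        # hypothetical module path
from bolts.tag_count import TagCountBolt    # hypothetical module path


class TweetImageTopology(Topology):
    tweet_spout = TweetSpout.spec()
    tag_count_bolt = TagCountBolt.spec(inputs=[tweet_spout])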
def test_1NN(digitsdat, selected, all_test_m):
    for testm in all_test_m:
        classifier = Classifier()
        # model = build(digitsdat.X[selected[0:testm], :], digitsdat.y[selected[0:testm]])
        classifier.build_model(digitsdat.X[selected[0:testm], :],
                               digitsdat.y[selected[0:testm]])
        error = classifier.classify(digitsdat.testX, digitsdat.testy)
        # accuracy = res(model)
        # print("m=%d error=%f" % (testm, 100 - accuracy))
        print("m=%d error=%f" % (testm, error))
        # global M, e
        M.append(testm)
        e.append(error)
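# Hypothetical driver for test_1NN above (not from the original code): it
# assumes a digits dataset object exposing X, y, testX, testy and the
# module-level result lists M and e that the variant above appends to.
# load_digits_data is a placeholder loader, not a real function here.
import numpy as np

M, e = [], []

def run_1nn_experiment():
    digitsdat = load_digits_data()                           # hypothetical loader
    selected = np.random.permutation(digitsdat.X.shape[0])   # random training order
    test_1NN(digitsdat, selected, all_test_m=[100, 500, 1000, 5000])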
from classify import Classifier

cl = Classifier()
print(cl.classify('./test/'))
class Validator:
    def __init__(self, restrictions):
        self.attributes = []
        self.true_pos = 0
        self.true_neg = 0
        self.false_pos = 0
        self.false_neg = 0
        if len(restrictions) > 0:
            self.restr = restrictions.restr
        else:
            self.restr = restrictions

    def train(self, domain, class_data):
        document = xml.dom.minidom.Document()
        node = document.createElement('Tree')
        document.appendChild(node)
        d = Trainer(domain, class_data, document)
        partial_atts = d.attributes
        partial_atts.remove("Id")
        partial_atts.remove("Vote")
        print partial_atts
        if len(self.restr) > 0:
            d.rem_restrictions(self.restr)
        d.c45(d.data, d.attributes, node, 0)
        self.classifier = Classifier()
        if len(class_data.category) > 0:
            self.classifier.has_category = True
        for row in d.data:
            self.classifier.classify(document.documentElement, row, class_data.attributes)
        self.classifier.print_stats()

    # def print_stats(self):

    def recall(self):
        TP = self.classifier.true_pos
        FN = self.classifier.false_neg
        return float(TP) / (TP + FN)

    def precision(self):
        TP = self.classifier.true_pos
        FP = self.classifier.false_pos
        return float(TP) / (TP + FP)

    def pf(self):
        TN = self.classifier.true_neg
        FP = self.classifier.false_pos
        return float(FP) / (FP + TN)

    def fmeasure(self):
        beta = 2
        return float(beta * self.precision() * self.recall()) / (self.precision() + self.recall())

    def confusion_matrix(self):
        print "###### CONFUSION MATRIX #######"
        print " | Classified Positive | Classified Negative |"
        print "Actual Positive | " + str(self.classifier.true_pos) + " | " + str(self.classifier.false_neg) + " |"
        print "Actual Negative | " + str(self.classifier.false_pos) + " | " + str(self.classifier.true_neg) + " |"
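# Hypothetical usage sketch for Validator (not part of the original file): it
# assumes the caller has already built the `domain` and `class_data` objects
# that Trainer expects; only the Validator calls below come from the class above.
def evaluate(domain, class_data, restrictions=[]):
    validator = Validator(restrictions)
    validator.train(domain, class_data)
    print "recall:    %f" % validator.recall()
    print "precision: %f" % validator.precision()
    print "f-measure: %f" % validator.fmeasure()
    validator.confusion_matrix()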
# -*- coding: utf-8 -*-
from classify import Classifier
import urllib

clf = Classifier()
clf.load_image("/home/hari/Documents/Big_data/bits-please/car.jpg")
predicted_class = clf.classify()
print(" ".join(predicted_class))
import sys
import argparse
import logging

# :: Logging level ::
loggingLevel = logging.INFO
logger = logging.getLogger()
logger.setLevel(loggingLevel)

ch = logging.StreamHandler(sys.stdout)
ch.setLevel(loggingLevel)
formatter = logging.Formatter('%(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

parser = argparse.ArgumentParser()
parser.add_argument('--dataset', help='Location of word dataset files', type=str, required=True)
parser.add_argument('--BIO', help='State if its bio_tag', type=bool, default=False)
args = parser.parse_args()

# Build the graph
pmi = PMI(args.dataset)
connected = pmi.build_graph()

# Performs classification
classifier = Classifier(args.dataset, args.BIO)
classifier.classify()
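# Usage note (the script name below is illustrative, not from the source):
#   python run_classifier.py --dataset data/words --BIO 1
# Note that argparse's type=bool treats any non-empty string as True, so
# "--BIO False" would still parse as True; action='store_true' is the usual
# alternative if strict flag behaviour is wanted.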
class Controller(threading.Thread):
    def __init__(self, thread_name, event):
        super(Controller, self).__init__()
        self.name = thread_name
        self.threadEvent = event
        self.logger_info = utils.initlog('Console', 'Console.log')
        self.schedule = sched.scheduler(time.time, time.sleep)
        self.profiles_name, profiles = utils.load_profiles('profiles')
        self.related_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets_ids = set([])
        self.related_tweets_hash = set([])
        self.classifier = Classifier()
        self.ranker = self.load_ranker()
        self.pusher = Pusher()

    def load_ranker(self):
        self.logger_info.info('loading ranker...')
        gateway = JavaGateway()
        ranker = gateway.entry_point
        self.logger_info.info('ranker loaded!')
        return ranker

    def run(self):
        self.logger_info.info('%s is starting...' % self.name)
        self.threadEvent.wait()
        self.logger_info.info('%s is running...' % self.name)
        self.schedule.enter(0, 0, self.dump_schedule, ())
        self.schedule.run()
        self.process()

    def process(self):
        start_day = time.gmtime(time.time()).tm_mday
        for line in sys.stdin:
            tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(line)
            simhash_value = Simhash(tweet_text).value
            if simhash_value in self.related_tweets_hash \
                    or tid_origin in self.pushed_tweets_ids \
                    or tid_retweet in self.pushed_tweets_ids:
                continue
            topic_id, similarity = self.classifier.classify(tweet_text)
            if topic_id == '':
                continue
            tweet_json['similarity'] = similarity
            evaluate_score = self.ranker.predict(json.dumps(tweet_json))
            total_score = similarity * evaluate_score
            if total_score < 0.15:
                continue
            is_pushed = self.pusher.push(evaluate_score, topic_id)
            if is_pushed:
                delivery_time = time.time()
                self.pushed_tweets[topic_id].append([tid_origin, str(delivery_time)[:10], similarity, total_score, tweet_text])
                self.pushed_tweets_ids.add(tid_retweet)
            struct_time = time.gmtime(float(timestamp[:-3]))
            utc_time = time.strftime('%Y%m%d', struct_time)
            self.related_tweets[topic_id].append([utc_time, tid_origin, total_score, tweet_text])
            self.related_tweets_hash.add(simhash_value)
            if struct_time.tm_mday != start_day:
                self.dump_result(start_day)
                start_day = struct_time.tm_mday

    def dump_result(self, file_name):
        self.logger_info.info('saving result...')
        with open('submit/task-b/' + str(file_name), 'w') as fw:
            for index, records in enumerate(self.related_tweets):
                pid = str(index + 226)
                sorted_records = sorted(records, key=lambda item: -item[2])
                for rank, record in enumerate(sorted_records):
                    if rank >= 100:
                        break
                    fw.write('%s\tMB%s\tQ0\t%s\t%d\t%f\t%s\n' % (record[0], pid, record[1], rank + 1, record[2], 'CSSNA'))
        with open('submit/task-a/' + str(file_name), 'w') as fw:
            with open('submit/task-a_extr/' + str(file_name), 'w') as fw_extr:
                for index, records in enumerate(self.pushed_tweets):
                    pid = str(index + 226)
                    for record in records:
                        fw.write('MB%s\t%s\t%s\tCSSNA\n' % (pid, record[0], record[1]))
                        fw_extr.write('MB%s\t%s\t%s\tCSSNA\t%s\t%s\t%s\n' % (pid, record[0], record[1], record[2], record[3], record[4]))
        self.related_tweets = [[] for _ in range(225)]
        self.pushed_tweets = [[] for _ in range(225)]
class Controller(threading.Thread):
    def __init__(self, thread_name, event):
        super(Controller, self).__init__()
        self.name = thread_name
        self.threadEvent = event
        self.logger_info = utils.initlog('Console', 'Console.log')
        self.schedule = sched.scheduler(time.time, time.sleep)
        self.profiles_name, profiles = utils.load_profiles('profiles')
        self.related_tweets = [[] for _ in range(len(profiles))]  # current-day related tweets, kept for offline analysis
        self.pushed_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets_ids = set([])
        self.related_tweets_hash = set([])
        self.classifier = Classifier()
        self.ranker = self.load_ranker()
        self.pusher = Pusher()

    def load_ranker(self):
        self.logger_info.info('loading ranker...')
        gateway = JavaGateway()
        ranker = gateway.entry_point
        self.logger_info.info('ranker loaded!')
        return ranker

    def run(self):
        self.logger_info.info('%s is starting...' % self.name)
        self.threadEvent.wait()
        self.logger_info.info('%s is running...' % self.name)
        # self.schedule.enter(0, 0, self.dump_schedule, ())
        # self.schedule.run()
        self.process()

    def process(self):
        data_file_path = sys.argv[1]
        files = os.listdir(data_file_path)
        files.sort()
        for f in files:
            filename = os.path.join(data_file_path, f)
            logging.info(filename)
            count = 0
            for line in open(filename, 'rb'):
                start = time.clock()
                tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(line)
                simhash_value = Simhash(tweet_text).value
                if simhash_value in self.related_tweets_hash \
                        or tid_origin in self.pushed_tweets_ids \
                        or tid_retweet in self.pushed_tweets_ids:
                    continue
                topic_id, similarity = self.classifier.classify(tweet_text)
                if topic_id == '':
                    continue
                count += 1
                if count % 10000 == 0:
                    logging.info('%d' % count)
                tweet_json['similarity'] = similarity
                evaluate_score = self.ranker.predict(json.dumps(tweet_json))
                total_score = (evaluate_score ** 0.5) * similarity
                # if total_score < 0.15:
                #     continue
                timestruct = time.gmtime(int(timestamp[:-3]))
                is_pushed = self.pusher.push(total_score, topic_id, timestruct)
                if is_pushed:
                    delivery_time = float(timestamp) / 1000.0 + (time.clock() - start)
                    self.pushed_tweets[topic_id].append([tid_origin, str(delivery_time)[:10], similarity, total_score, tweet_text])
                utc_time = time.strftime('%Y%m%d', timestruct)
                self.related_tweets[topic_id].append([utc_time, tid_origin, total_score, tid_retweet, timestamp[:-3], tweet_text])
                self.related_tweets_hash.add(simhash_value)
                self.pushed_tweets_ids.add(tid_retweet)
            self.dump_result(f)
            self.pusher = Pusher()
        self.logger_info.info('\n=======finished!=======\n')

    def dump_result(self, file_name):
        self.logger_info.info('saving result...')
        with open('submit/task-b/b_submit', 'a') as fw:
            with open('submit/task-b/b_review/B_candidateday_' + file_name[-2:], 'w') as fw_review:
                for index, records in enumerate(self.related_tweets):
                    pid = str(index + 226)
                    sorted_records = sorted(records, key=lambda item: -item[2])
                    for rank, record in enumerate(sorted_records):
                        if rank >= 100:
                            break
                        fw.write('%s\tMB%s\tQ0\t%s\t%d\t%f\t%s\n' % (record[0], pid, record[1], rank + 1, record[2], 'CSSNA'))
                        fw_review.write('%s\tMB%s\tQ0\t%s\t%f\tSNACS\t%s\t%s\t%s\n' % (record[0], pid, record[1], record[2], record[3], record[4], record[5]))
        with open('submit/task-a/a_submit', 'a') as fw:
            with open('submit/task-a/a_review', 'a') as fw_review:
                for index, records in enumerate(self.pushed_tweets):
                    pid = str(index + 226)
                    for record in records:
                        fw.write('MB%s\t%s\t%s\tCSSNA\n' % (pid, record[0], record[1]))
                        fw_review.write('MB%s\t%s\t%s\tCSSNA\t%s\t%s\t%s\n' % (pid, record[0], record[1], record[2], record[3], record[4]))
        self.related_tweets = [[] for _ in range(225)]  # clear the previous day's related-tweet records
        self.pushed_tweets = [[] for _ in range(225)]

    def dump_schedule(self):
        self.logger_info.info('saving result...')
        utc_time = time.strftime('%Y%m%d', time.gmtime())
        for index, records in enumerate(self.related_tweets):
            pid = str(index + 226)
            with open('profile_MB' + pid, 'w') as fw:
                for record in records:
                    fw.write(utc_time + '\t' + pid + '\tQ0\t' + record + '\n')
        self.related_tweets = [[] for _ in range(226)]  # clear the previous day's related-tweet records
        self.schedule.enter(24 * 60 * 60, 0, self.dump_schedule, ())

    def detect_tweet_stream(self, year, month, d, h, m, s, ms):
        start = datetime.datetime(year, month, d, h, m, s, ms)
        delta = (start - datetime.datetime.now()).seconds
        self.logger_info.info('waiting seconds: ' + str(delta))
        time.sleep(delta)
        self.logger_info.info('tweet stream is ready')
        is_ready = True
        return is_ready
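# Hypothetical driver sketch (not in the original sources): the Controller
# above is a threading.Thread gated on an Event, so a caller presumably wires
# it up roughly like this before feeding the tweet stream.
import threading

if __name__ == '__main__':
    ready = threading.Event()
    controller = Controller('controller-0', ready)
    controller.start()   # run() blocks on the event before processing
    ready.set()          # signal that the tweet stream is ready
    controller.join()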