Example #1
def test_1NN(digitsdat, selected, all_test_m):
    # For each training-set size m, fit a 1-NN model on the first m selected
    # samples and print the resulting test-set error.
    for testm in all_test_m:
        classifier = Classifier()
        classifier.build_model(digitsdat.X[selected[0:testm], :],
                               digitsdat.y[selected[0:testm]])
        print("m=%d error=%f" % (testm, classifier.classify(digitsdat.testX, digitsdat.testy)))
Example #2
import datetime
import json
import os
import urllib  # Python 2: urllib.urlretrieve (use urllib.request on Python 3)

from streamparse import Bolt  # assumed streamparse import path

from classify import Classifier  # same project-local module as in Example #6


class TagCountBolt(Bolt):
    outputs = ['cls', 'tag', 'date', 'hour']

    def initialize(self, conf, ctx):
        self.counter = 0  # images processed so far; also used to name the files
        self.pid = os.getpid()
        self.total = 0
        self.word_counts = {}  # per-word tallies used by _increment below
        self.classifier = Classifier()
        self.directory = str(os.getcwd()) + "/Tweet_Images"
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)
            #self.logger.info("------CREATED FOLDER--------")

    def _increment(self, word, inc_by):
        # the original indexed self.counter (an int) by word, which would raise
        # a TypeError; tally into the word_counts dict instead
        self.word_counts[word] = self.word_counts.get(word, 0) + inc_by
        self.total += inc_by

    def process(self, tup):
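        # Decode the tweet JSON, download its image, run the classifier, and
        # emit one (class, tag, date, hour) tuple per predicted class/hashtag.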
        data = json.loads(tup.values[0].encode('utf-8'))
        self.logger.info(data)
        if 'img_url' in data:
            path = "{}/{}.jpg".format(self.directory, self.counter)
            try:
                urllib.urlretrieve(data['img_url'], path)
                self.counter = self.counter + 1
                self.classifier.load_image(path)
                predicted_class = self.classifier.classify()
                #self.logger.info("\n [INFO_BOLT_PREDICTION] : "+ " ".join(predicted_class))
                if len(data['hash']) > 0:
                    tags = [
                        str(li['text']) for li in data['hash']
                        if li['text'][0:1] != "\\"
                    ]
                    #self.logger.info("\n [INFO_BOLT_TAGS] : "+ " ".join(tags))

                    now = datetime.datetime.now()
                    now_date = "{:04}-{:02}-{:02}".format(
                        now.year, now.month, now.day)
                    for cls in predicted_class:
                        if len(tags) > 0:
                            for tag in tags:
                                self.emit([cls, tag, now_date, str(now.hour)])
                                #self.logger.info("{0}/{1}".format(cls,tag))

                os.remove(path)

            except (KeyboardInterrupt, Exception) as exc:
                self.logger.info(exc)  # log the exception instance, not the Exception class

        else:
            self.logger.info("NO IMG URL!!!")
            #self.logger.info(json.dumps(data))

        if self.counter % 10 == 0:
            self.logger.info("Processed [{:,}] tweets".format(self.counter))
Example #3
def test_1NN(digitsdat, selected, all_test_m):
    for testm in all_test_m:
        classifier = Classifier()
        # model = build(digitsdat.X[selected[0:testm], :], digitsdat.y[selected[0:testm]])
        classifier.build_model(digitsdat.X[selected[0:testm], :],
                               digitsdat.y[selected[0:testm]])
        error = classifier.classify(digitsdat.testX, digitsdat.testy)
        # accuracy = res(model)
        # print("m=%d error=%f" % (testm, 100-accuracy))
        print("m=%d error=%f" % (testm, error))
        # global M, e
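        # M and e are module-level result lists; list.append mutates in place,
        # so the commented-out global statement was never needed.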
        M.append(testm)
        e.append(error)
Example #4
from classify import Classifier

cl = Classifier()
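# classify() is given a directory path here; presumably it labels the files under ./test/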
print(cl.classify('./test/'))
Example #5
import xml.dom.minidom

# Trainer and Classifier are project-local modules whose import lines
# are not shown in this snippet.


class Validator:
    def __init__(self, restrictions):
        self.attributes = []
        self.true_pos = 0
        self.true_neg = 0
        self.false_pos = 0
        self.false_neg = 0

        # `restrictions` apparently supports len() and, when non-empty,
        # carries the actual restriction list in its .restr attribute
        if len(restrictions) > 0:
            self.restr = restrictions.restr
        else:
            self.restr = restrictions

    def train(self, domain, class_data):
        document = xml.dom.minidom.Document()
        node = document.createElement('Tree')
        document.appendChild(node)
        d = Trainer(domain, class_data, document)
        partial_atts = d.attributes
        partial_atts.remove("Id")
        partial_atts.remove("Vote")
        print(partial_atts)

        if len(self.restr) > 0:
            d.rem_restrictions(self.restr)

        d.c45(d.data, d.attributes, node, 0)

        self.classifier = Classifier()

        if len(class_data.category) > 0:
            self.classifier.has_category = True

        for row in d.data:
            self.classifier.classify(document.documentElement, row,
                                     class_data.attributes)

        self.classifier.print_stats()

    #def print_stats(self):

    def recall(self):
        TP = self.classifier.true_pos
        FN = self.classifier.false_neg

        return float(TP) / (TP + FN)

    def precision(self):
        TP = self.classifier.true_pos
        FP = self.classifier.false_pos

        return float(TP) / (TP + FP)

    def pf(self):
        TN = self.classifier.true_neg
        FP = self.classifier.false_pos

        return float(FP) / (FP + TN)

    def fmeasure(self):
        beta = 2
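        # with beta = 2 the expression below reduces to 2PR / (P + R),
        # i.e. the standard F1 score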
        return float(beta * self.precision() *
                     self.recall()) / (self.precision() + self.recall())

    def confusion_matrix(self):
        # the counts are ints, so convert them before concatenating with strings
        print("###### CONFUSION MATRIX #######")
        print("                | Classified Positive | Classified Negative |")
        print("Actual Positive |          " + str(self.classifier.true_pos) +
              "           |          " + str(self.classifier.false_neg) + "           |")
        print("Actual Negative |          " + str(self.classifier.false_pos) +
              "           |           " + str(self.classifier.true_neg) + "          |")
Example #6
# -*- coding: utf-8 -*-

from classify import Classifier

clf = Classifier()
clf.load_image("/home/hari/Documents/Big_data/bits-please/car.jpg")
predicted_class = clf.classify()
print(" ".join(predicted_class))
Example #7
import argparse
import logging
import sys

# PMI and Classifier are project-local modules whose import lines
# are not shown in this snippet.

# :: Logging level ::
loggingLevel = logging.INFO
logger = logging.getLogger()
logger.setLevel(loggingLevel)

ch = logging.StreamHandler(sys.stdout)
ch.setLevel(loggingLevel)
formatter = logging.Formatter('%(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
parser = argparse.ArgumentParser()

parser.add_argument('--dataset',
                    help='Location of word dataset files',
                    type=str,
                    required=True)
# NOTE: the original used type=bool, which treats any non-empty string
# (including "False") as True; store_true makes --BIO a proper flag.
parser.add_argument('--BIO',
                    help='Set if BIO tagging should be used',
                    action='store_true',
                    default=False)

args = parser.parse_args()
# Build the graph
pmi = PMI(args.dataset)
connected = pmi.build_graph()
# Performs classification
classifier = Classifier(args.dataset, args.BIO)
classifier.classify()
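# Hypothetical invocation (script name assumed):
#   python classify_words.py --dataset ./data/words --BIO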
Example #8
import json
import sched
import sys
import threading
import time

from py4j.java_gateway import JavaGateway
from simhash import Simhash

# utils, Classifier, and Pusher are project-local modules whose import
# lines are not shown in this snippet.


class Controller(threading.Thread):
    def __init__(self, thread_name, event):
        super(Controller, self).__init__()
        self.name = thread_name
        self.threadEvent = event
        self.logger_info = utils.initlog('Console', 'Console.log')
        self.schedule = sched.scheduler(time.time, time.sleep)

        self.profiles_name, profiles = utils.load_profiles('profiles')
        self.related_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets_ids = set([])
        self.related_tweets_hash = set([])

        self.classifier = Classifier()
        self.ranker = self.load_ranker()
        self.pusher = Pusher()

    def load_ranker(self):
        self.logger_info.info('loading ranker...')
        gateway = JavaGateway()
        ranker = gateway.entry_point
        self.logger_info.info('ranker loaded!')
        return ranker

    def run(self):
        self.logger_info.info('%s is starting...' % self.name)
        self.threadEvent.wait()
        self.logger_info.info('%s is running...' % self.name)
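        # dump_schedule is not defined in this snippet; a version of it appears
        # in the same class in the next example.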
        self.schedule.enter(0, 0, self.dump_schedule, ())
        self.schedule.run()
        self.process()

    def process(self):
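        # Stream tweets from stdin: skip near-duplicates (simhash) and already
        # pushed ids, classify against the tracked topics, weight similarity by
        # the ranker score, and push/record tweets that clear the threshold.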
        start_day = time.gmtime(time.time()).tm_mday
        for line in sys.stdin:
            tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(
                line)
            simhash_value = Simhash(tweet_text).value
            if (simhash_value in self.related_tweets_hash
                    or tid_origin in self.pushed_tweets_ids
                    or tid_retweet in self.pushed_tweets_ids):
                continue

            topic_id, similarity = self.classifier.classify(tweet_text)
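            # an empty topic_id means the tweet matched none of the tracked topics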
            if topic_id == '':
                continue

            tweet_json['similarity'] = similarity
            evaluate_score = self.ranker.predict(json.dumps(tweet_json))
            total_score = similarity * evaluate_score
            if total_score < 0.15:
                continue

            is_pushed = self.pusher.push(evaluate_score, topic_id)
            if is_pushed:
                delivery_time = time.time()
                self.pushed_tweets[topic_id].append([
                    tid_origin,
                    str(delivery_time)[:10], similarity, total_score,
                    tweet_text
                ])
                self.pushed_tweets_ids.add(tid_retweet)

            struct_time = time.gmtime(float(timestamp[:-3]))
            utc_time = time.strftime('%Y%m%d', struct_time)
            self.related_tweets[topic_id].append(
                [utc_time, tid_origin, total_score, tweet_text])
            self.related_tweets_hash.add(simhash_value)

            if struct_time.tm_mday != start_day:
                self.dump_result(start_day)
                start_day = struct_time.tm_mday

    def dump_result(self, file_name):
        self.logger_info.info('saving result...')
        # file_name arrives as an int day-of-month from process(), so convert it
        with open('submit/task-b/' + str(file_name), 'w') as fw:
            for index, records in enumerate(self.related_tweets):
                pid = str(index + 226)
                sorted_records = sorted(records, key=lambda item: -item[2])
                for rank, record in enumerate(sorted_records):
                    if rank >= 100:
                        break
                    fw.write('%s\tMB%s\tQ0\t%s\t%d\t%f\t%s\n' %
                             (record[0], pid, record[1], rank + 1, record[2],
                              'CSSNA'))
        with open('submit/task-a/' + str(file_name), 'w') as fw:
            with open('submit/task-a_extr/' + str(file_name), 'w') as fw_extr:
                for index, records in enumerate(self.pushed_tweets):
                    pid = str(index + 226)
                    for record in records:
                        fw.write('MB%s\t%s\t%s\tCSSNA\n' %
                                 (pid, record[0], record[1]))
                        fw_extr.write('MB%s\t%s\t%s\tCSSNA\t%s\t%s\t%s\n' %
                                      (pid, record[0], record[1], record[2],
                                       record[3], record[4]))

        self.related_tweets = [[] for _ in range(225)]
        self.pushed_tweets = [[] for _ in range(225)]
Example #9
import datetime
import json
import logging
import os
import sched
import sys
import threading
import time

from py4j.java_gateway import JavaGateway
from simhash import Simhash

# utils, Classifier, and Pusher are project-local modules whose import
# lines are not shown in this snippet.


class Controller(threading.Thread):
    def __init__(self, thread_name, event):
        super(Controller, self).__init__()
        self.name = thread_name
        self.threadEvent = event
        self.logger_info = utils.initlog('Console', 'Console.log')
        self.schedule = sched.scheduler(time.time, time.sleep)

        self.profiles_name, profiles = utils.load_profiles('profiles')
        self.related_tweets = [[] for _ in range(len(profiles))]    # the day's related tweets, kept for offline analysis
        self.pushed_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets_ids = set([])
        self.related_tweets_hash = set([])

        self.classifier = Classifier()
        self.ranker = self.load_ranker()
        self.pusher = Pusher()

    def load_ranker(self):
        self.logger_info.info('loading ranker...')
        gateway = JavaGateway()
        ranker = gateway.entry_point
        self.logger_info.info('ranker loaded!')
        return ranker

    def run(self):
        self.logger_info.info('%s is starting...' % self.name)
        self.threadEvent.wait()
        self.logger_info.info('%s is running...' % self.name)
        # self.schedule.enter(0, 0, self.dump_schedule, ())
        # self.schedule.run()
        self.process()

    def process(self):
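        # Replay archived tweet files in sorted order to simulate the live
        # stream; results are dumped and the Pusher reset after each file.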
        data_file_path = sys.argv[1]
        files = os.listdir(data_file_path)
        files.sort()
        for f in files:
            filename = os.path.join(data_file_path, f)
            logging.info(filename)
            count = 0
            for line in open(filename, 'rb'):
                start = time.clock()
                tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(line)
                simhash_value = Simhash(tweet_text).value
                if (simhash_value in self.related_tweets_hash
                        or tid_origin in self.pushed_tweets_ids
                        or tid_retweet in self.pushed_tweets_ids):
                    continue

                topic_id, similarity = self.classifier.classify(tweet_text)
                if topic_id == '':
                    continue

                count += 1
                if count % 10000 == 0:
                    logging.info('%d' % count)

                tweet_json['similarity'] = similarity
                evaluate_score = self.ranker.predict(json.dumps(tweet_json))
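                # evaluate_score ** 0.5 compresses the ranker score's range,
                # so the classifier similarity dominates the combined score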
                total_score = (evaluate_score ** 0.5) * similarity
                # if total_score < 0.15:
                #     continue

                timestruct = time.gmtime(int(timestamp[:-3]))
                is_pushed = self.pusher.push(total_score, topic_id, timestruct)
                if is_pushed:
                    delivery_time = float(timestamp) / 1000.0 + (time.clock() - start)
                    self.pushed_tweets[topic_id].append([tid_origin, str(delivery_time)[:10], similarity, total_score, tweet_text])

                utc_time = time.strftime('%Y%m%d', timestruct)
                self.related_tweets[topic_id].append([utc_time, tid_origin, total_score, tid_retweet, timestamp[:-3], tweet_text])

                self.related_tweets_hash.add(simhash_value)
                self.pushed_tweets_ids.add(tid_retweet)
            self.dump_result(f)
            self.pusher = Pusher()
        self.logger_info.info('\n=======finished!=======\n')

    def dump_result(self, file_name):
        self.logger_info.info('saving result...')
        with open('submit/task-b/b_submit', 'a') as fw:
            with open('submit/task-b/b_review/B_candidateday_' + file_name[-2:], 'w') as fw_review:
                for index, records in enumerate(self.related_tweets):
                    pid = str(index+226)
                    sorted_records = sorted(records, key=lambda item: -item[2])
                    for rank, record in enumerate(sorted_records):
                        if rank >= 100:
                            break
                        fw.write('%s\tMB%s\tQ0\t%s\t%d\t%f\t%s\n' % (record[0], pid, record[1], rank+1, record[2], 'CSSNA'))
                        fw_review.write('%s\tMB%s\tQ0\t%s\t%f\tSNACS\t%s\t%s\t%s\n' % (record[0], pid, record[1], record[2], record[3], record[4], record[5]))

        with open('submit/task-a/a_submit', 'a') as fw:
            with open('submit/task-a/a_review', 'a') as fw_review:
                for index, records in enumerate(self.pushed_tweets):
                    pid = str(index+226)
                    for record in records:
                        fw.write('MB%s\t%s\t%s\tCSSNA\n' % (pid, record[0], record[1]))
                        fw_review.write('MB%s\t%s\t%s\tCSSNA\t%s\t%s\t%s\n' % (pid, record[0], record[1], record[2], record[3], record[4]))

        self.related_tweets = [[] for _ in range(225)]    # clear the previous day's related-tweet records
        self.pushed_tweets = [[] for _ in range(225)]


    def dump_schedule(self):
        # Legacy daily dump; its scheduling is commented out in run(), and it
        # still expects string records, while process() now appends lists.
        self.logger_info.info('saving result...')
        utc_time = time.strftime('%Y%m%d', time.gmtime())
        for index, records in enumerate(self.related_tweets):  # enumerate was missing
            pid = str(index + 226)
            with open('profile_MB' + pid, 'w') as fw:
                for record in records:
                    fw.write(utc_time + '\t' + pid + '\tQ0\t' + record + '\n')
        self.related_tweets = [[] for _ in range(226)]    # clear the previous day's related-tweet records
        self.schedule.enter(24 * 60 * 60, 0, self.dump_schedule, ())

    def detect_tweet_stream(self, year, month, d, h, m, s, ms):
        start = datetime.datetime(year, month, d, h, m, s, ms)
        # total_seconds() replaces .seconds, which silently drops whole days
        delta = (start - datetime.datetime.now()).total_seconds()
        self.logger_info.info('waiting seconds: ' + str(delta))
        time.sleep(delta)
        self.logger_info.info('tweet stream is ready')
        is_ready = True
        return is_ready