Example #1
import pickle

# `utils` and `Classifier` come from the surrounding project.
def classify_pages(in_path, out_path):
    classifier = Classifier()

    # Stream (site, html) pairs and keep only the pages the classifier accepts.
    with open(out_path, 'wb') as f:
        for site, html in utils.read_file_multiple(in_path):
            if classifier.classify(html):
                pickle.dump((site, html), f)
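The loop assumes `utils.read_file_multiple` yields `(site, html)` pairs from a file of back-to-back pickles, matching the format classify_pages itself writes. A minimal sketch of such a reader under that assumption (the helper body is reconstructed from the call site, not taken from the project):

import pickle

def read_file_multiple(path):
    # Yield successive pickled objects until end of file.
    with open(path, 'rb') as f:
        while True:
            try:
                yield pickle.load(f)
            except EOFError:
                return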
Example #2
import datetime
import re

from shapely.geometry import Point, shape
from stop_words import get_stop_words
from gensim import utils

# `configuration` and `Classifier` come from the surrounding project.
class Annotator:

    def __init__(self, db):
        self.db = db

        self.city = self.db["area"].find_one({
            "name": configuration.AREA
        })

        self.classifier = Classifier(self.db)


    def tokenize(self, tweet):
        # Strip @mentions and URLs, then tokenize and drop English stop words.
        stop_words_list = get_stop_words("en")

        tweet_text = tweet["text"]

        if tweet["truncated"]:
            tweet_text = tweet["extended_tweet"]["full_text"]

        tweet_text = re.sub(r"(?:\@|https?\://)\S+", "", tweet_text)

        tokens = [token for token in utils.simple_preprocess(
            tweet_text, deacc=False, min_len=3) if token not in stop_words_list]

        tweet["tokens"] = tokens

        return tweet

    def add_date(self, tweet):
        # timestamp_ms is in milliseconds; convert to seconds for fromtimestamp.
        tweet["date"] = datetime.datetime.fromtimestamp(int(tweet["timestamp_ms"]) // 1000)
        return tweet

    def annotate_tweet_location(self, tweet):
        # Nothing to do if the tweet carries no location information.
        if tweet["geo"] is None and tweet["place"] is None:
            return tweet

        point = None
        if tweet["geo"] is not None:
            # Twitter's "geo" coordinates are (lat, lon); shapely Points are (x=lon, y=lat).
            point = Point(tweet["geo"]["coordinates"][1], tweet["geo"]["coordinates"][0])

        for a in self.city["geojson"]["features"]:
            area = shape(a["geometry"])
            # Guard the place lookup: "place" can be None when only "geo" is set.
            if (point is not None and area.contains(point)) or \
                    (tweet["place"] is not None and a["properties"]["name"] == tweet["place"]["name"]):
                tweet["area_name"] = a["properties"]["name"]
                #tweet["area_id"] = a["id"]
                print("Found a tweet in", tweet["area_name"])
                break

        return tweet

    def classify_tweet(self, tweet):
        return self.classifier.classify(tweet)


    def classify_offline(self):
        tweets = list(self.db["tweet"].find())

        print("Classifying tweets")
        for t in tweets:
            print(t["id"])
            c_tweet = self.classifier.classify(t)
            # Collection.update() was removed in modern pymongo; use update_one().
            self.db["tweet"].update_one({"id": t["id"]}, {"$set": {"categories": c_tweet["categories"]}})

        print("Done")

    def tokenize_offline(self):
        tweets = list(self.db["tweet"].find())

        print("Updating tweets")
        for t in tweets:
            # Reuse the same cleaning/tokenization as the streaming path
            # instead of duplicating it here.
            t = self.tokenize(t)

            query = {
                "_id": t["_id"]
            }

            update = {
                "$set": {
                    "tokens": t["tokens"]
                }
            }
            self.db["tweet"].update_one(query, update)

        print("Done")
Example #3
    @classmethod
    def classify_pattern(cls, pattern):
        # Map the argmax of the classifier's score vector to a label
        # through the class-level PATTERN_MAPPING table.
        # np is numpy; Classifier and PATTERN_MAPPING come from the surrounding module.
        prediction = Classifier.classify(pattern)
        return cls.PATTERN_MAPPING[int(np.argmax(prediction))]
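For context, the same argmax-to-label lookup in a self-contained form (the class name, labels, and scores here are hypothetical, not from the original):

import numpy as np

class PatternLookup:
    # Hypothetical label table; the original PATTERN_MAPPING is defined elsewhere.
    PATTERN_MAPPING = {0: "flat", 1: "rising", 2: "falling"}

    @classmethod
    def classify_pattern(cls, prediction):
        return cls.PATTERN_MAPPING[int(np.argmax(prediction))]

print(PatternLookup.classify_pattern(np.array([0.1, 0.7, 0.2])))  # -> rising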
__author__ = 'dungdt'

import time
from classifier.classifier import Classifier
from classifier.data.dictionary import Dictionary
from classifier.data_reader import DataReader

if __name__ == '__main__':
    dictionary = Dictionary()
    dataReader = DataReader(dictionary)
    classifier = Classifier(dataReader, trainingDataPath='data/training',
                            testDataPath='data/test')

    print('Training...')
    t = time.time()
    classifier.train()
    print('Training time: %d' % (time.time() - t))

    t = time.time()
    print('Testing...')
    print('Accuracy: %s%%' % ('{:4.2f}'.format(classifier.test() * 100)))
    print('Testing time: %d' % (time.time() - t))

    testData = classifier.dataReader.readTestData(classifier.testDataPath)
    print(classifier.classify(testData[0][0]))
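The repeated time.time() bookkeeping in the script can be factored into a small helper; a sketch of that design choice (the timed helper is illustrative, not part of the original project):

import time

def timed(label, fn, *args, **kwargs):
    # Run fn, report the wall-clock duration, and pass the result through.
    t = time.perf_counter()
    result = fn(*args, **kwargs)
    print('%s time: %.2f s' % (label, time.perf_counter() - t))
    return result

# e.g. timed('Training', classifier.train)
#      accuracy = timed('Testing', classifier.test)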