Пример #1
0
def pipeline_dw():
    """Analyze all DW articles available from START_DAY and store the results.

    Articles are requested from the ``ElasticWrapper`` in steps of 1000,
    passed through ``analyze_doc``, tagged with ``source = "dw"`` and handed
    to ``elastic.insert(..., "analyzed", "article")``. Progress is reported
    through ``logging`` and appended to the file at ``DAYS_LOG_PATH``.
    """
    classificator = Classificator(taxonomy)
    ner = initializeNer()
    elastic = ElasticWrapper()
    day = START_DAY
    day_label = day.strftime("%Y-%m-%dT%H:%M:%SZ")
    # The offset advances by a fixed step per request.
    # NOTE(review): presumably this equals the wrapper's page size - confirm.
    page_step = 1000

    # The context manager closes the log file even when a query, the
    # analysis or the insert raises.
    with open(DAYS_LOG_PATH, 'a') as days_log:
        t0 = datetime.now()
        logging.info("Quering for day: %s", day_label)
        days_log.write("Quering for day: %s\n" % day_label)
        articles_count = elastic.articles_count_from(day)
        offset = 0
        while offset < articles_count:
            articles = elastic.get_articles_from(day, offset)
            # Progress is the request offset plus what this request returned;
            # both the logger and the log file report the same figure.
            fetched = offset + len(articles)
            offset += page_step
            logging.info("Received %s articles", len(articles))
            logging.info("Got %s/%s articles", fetched, articles_count)
            days_log.write("Got %s/%s articles\n" % (fetched, articles_count))
            result_articles = []
            for article in articles:
                logging.debug("Analyzing article: %s", article)
                result_article = analyze_doc(classificator, ner, article)
                logging.debug("Result article: %s", result_article)
                result_article["source"] = "dw"
                result_articles.append(result_article)
            elastic.insert(result_articles, "analyzed", "article")
        elapsed = datetime.now() - t0
        logging.info("This day took %s", elapsed)
        days_log.write("This day took %s\n" % elapsed)
        logging.info("Finished!!!")
        # Mirror the logger message instead of repeating the timing line.
        days_log.write("Finished!!!\n")
 def get(self):
     """Classify the ``text`` argument and write its concepts as JSON.

     When a ``taxonomy`` argument is supplied it is parsed as JSON and used
     to build a one-off ``Classificator``; otherwise the handler's
     ``default_classificator`` is used. The response body is a JSON list of
     concepts, or ``["N/A"]`` when no concept was found.
     """
     logging.debug("Received GET request")
     # service_id and lang are read but not used further in this method.
     service_id = self.get_argument('service_id', default=None)
     text = self.get_argument('text')
     lang = self.get_argument('lang', default=None)
     raw_taxonomy = self.get_argument('taxonomy', default=None)
     if not raw_taxonomy:
         classificator = self.default_classificator
     else:
         classificator = Classificator(json.loads(raw_taxonomy))
     classification = classificator.classify(text)
     logging.debug(classification)
     concepts = self.__get_concepts_from_classification(classification)
     payload = concepts if concepts else ['N/A']
     self.write(json.dumps(payload))
Пример #3
0
 def get(self):
     """Handle a GET request: classify ``text`` and respond with JSON.

     A ``taxonomy`` argument, if present, is decoded from JSON and wrapped
     in a fresh ``Classificator``; without it ``self.default_classificator``
     does the work. The written body is the JSON-encoded list of concepts,
     falling back to ``["N/A"]`` when that list is empty.
     """
     logging.debug("Received GET request")
     # These two optional arguments are fetched but unused below.
     service_id = self.get_argument('service_id', default=None)
     text = self.get_argument('text')
     lang = self.get_argument('lang', default=None)
     taxonomy_json = self.get_argument('taxonomy', default=None)
     classificator = (
         Classificator(json.loads(taxonomy_json))
         if taxonomy_json
         else self.default_classificator
     )
     classification = classificator.classify(text)
     logging.debug(classification)
     found = self.__get_concepts_from_classification(classification)
     if not found:
         found = ['N/A']
     self.write(json.dumps(found))
Пример #4
0
class ClassificationService(RequestHandler):
    """Request handler that classifies text against a taxonomy.

    GET classifies a single ``text`` argument (optionally with a
    caller-supplied taxonomy); POST classifies every line of the body with
    the default classificator.
    """

    def initialize(self):
        """Create the default classificator from ``example_taxonomy``."""
        # NOTE: a TaxonomyCache attribute was sketched here but is disabled.
        self.default_classificator = Classificator(example_taxonomy)

    def get(self):
        """Classify the ``text`` argument and write its concepts as JSON.

        A ``taxonomy`` argument, when given, is parsed as JSON and used to
        build a one-off ``Classificator``. The response is a JSON list of
        concepts, or ``["N/A"]`` when none were found.
        """
        logging.debug("Received GET request")
        # service_id and lang are read but not used further in this method.
        service_id = self.get_argument('service_id', default=None)
        text = self.get_argument('text')
        lang = self.get_argument('lang', default=None)
        raw_taxonomy = self.get_argument('taxonomy', default=None)
        if not raw_taxonomy:
            classificator = self.default_classificator
        else:
            classificator = Classificator(json.loads(raw_taxonomy))
        classification = classificator.classify(text)
        logging.debug(classification)
        concepts = self.__get_concepts_from_classification(classification)
        payload = concepts if concepts else ['N/A']
        self.write(json.dumps(payload))

    def post(self):
        """Classify each newline-separated line of the request body.

        Only the first tab-separated field of a line is used; it is
        URL-decoded twice with ``unquote_plus``. The response is
        ``{"response": [{"text": ..., "topics": [...]}, ...]}`` with one
        entry per input line.
        """
        collected = []
        logging.debug("Received POST request")
        body_text = str(self.request.body, 'utf8')
        for line in body_text.split('\n'):
            first_field = line.partition('\t')[0]
            text = unquote_plus(unquote_plus(first_field))
            logging.debug("Classificating %s" % text)
            classification = self.default_classificator.classify(text)
            topics = self.__get_concepts_from_classification(classification)
            collected.append({"text": text, "topics": topics})
        self.write({"response": collected})

    def __get_concepts_from_classification(self, classification):
        """Return the distinct concepts across all words as a list."""
        merged = set()
        for _word, concepts in classification.items():
            merged.update(set(concepts))
        return list(merged)

    def __format_post_classification(self, classification):
        """Join the distinct concepts with ``;;``, or return ``'N/A'``."""
        concepts = self.__get_concepts_from_classification(classification)
        return ';;'.join(concepts) if concepts else 'N/A'
Пример #5
0
class ClassificationService(RequestHandler):
    """Handler exposing text classification over GET and POST.

    GET works on one ``text`` argument and accepts an optional JSON
    ``taxonomy``; POST processes the request body line by line using the
    default classificator.
    """

    def initialize(self):
        """Set up ``default_classificator`` from ``example_taxonomy``."""
        # NOTE: a TaxonomyCache attribute was sketched here but is disabled.
        self.default_classificator = Classificator(example_taxonomy)

    def get(self):
        """Handle a GET request: classify ``text`` and respond with JSON.

        Without a ``taxonomy`` argument ``self.default_classificator`` is
        used; otherwise the argument is decoded from JSON and wrapped in a
        fresh ``Classificator``. An empty concept list is reported as
        ``["N/A"]``.
        """
        logging.debug("Received GET request")
        # These two optional arguments are fetched but unused below.
        service_id = self.get_argument('service_id', default=None)
        text = self.get_argument('text')
        lang = self.get_argument('lang', default=None)
        taxonomy_json = self.get_argument('taxonomy', default=None)
        classificator = (
            Classificator(json.loads(taxonomy_json))
            if taxonomy_json
            else self.default_classificator
        )
        classification = classificator.classify(text)
        logging.debug(classification)
        found = self.__get_concepts_from_classification(classification)
        if not found:
            found = ['N/A']
        self.write(json.dumps(found))

    def post(self):
        """Handle a POST request: classify every line of the body.

        Each line contributes its first tab-separated field, decoded twice
        with ``unquote_plus``. One ``{"text", "topics"}`` entry per line is
        written back under the ``"response"`` key.
        """
        entries = []
        logging.debug("Received POST request")
        lines = str(self.request.body, 'utf8').split('\n')
        for raw_line in lines:
            encoded = raw_line.partition('\t')[0]
            text = unquote_plus(unquote_plus(encoded))
            logging.debug("Classificating %s" % text)
            classification = self.default_classificator.classify(text)
            entry = {"text": text}
            entry["topics"] = self.__get_concepts_from_classification(
                classification)
            entries.append(entry)
        self.write({"response": entries})

    def __get_concepts_from_classification(self, classification):
        """Collect the concepts of every word into a duplicate-free list."""
        seen = set()
        for _word, concepts in classification.items():
            seen.update(set(concepts))
        return list(seen)

    def __format_post_classification(self, classification):
        """Return the concepts joined by ``;;``, or ``'N/A'`` if empty."""
        concepts = self.__get_concepts_from_classification(classification)
        if not concepts:
            return 'N/A'
        return ';;'.join(concepts)
Пример #6
0
def pipeline():
    """Analyze stored tweets day by day, from START_DAY up to now.

    For each day the tweets are requested from the ``ElasticWrapper`` in
    steps of 1000, passed through ``analyze_doc``, tagged with
    ``source = "twitter"`` and handed to
    ``elastic.insert(..., "analyzed", "tweet")``. Progress and per-day
    timings are reported through ``logging`` and appended to the file at
    ``DAYS_LOG_PATH``.
    """
    classificator = Classificator(taxonomy)
    ner = initializeNer()
    elastic = ElasticWrapper()
    day = START_DAY
    today = datetime.now()
    # The offset advances by a fixed step per request.
    # NOTE(review): presumably this equals the wrapper's page size - confirm.
    page_step = 1000

    # The context manager closes the log file even when a query, the
    # analysis or the insert raises.
    with open(DAYS_LOG_PATH, 'a') as days_log:
        while day < today:
            t0 = datetime.now()
            day_label = day.strftime("%Y-%m-%dT%H:%M:%SZ")
            logging.info("Quering for day: %s", day_label)
            days_log.write("Quering for day: %s\n" % day_label)
            tweets_count = elastic.tweets_count_for_day(day)
            offset = 0
            while offset < tweets_count:
                tweets = elastic.get_day_tweets(day, offset)
                # Progress is the request offset plus what this request
                # returned; logger and log file report the same figure.
                fetched = offset + len(tweets)
                offset += page_step
                logging.info("Received %s tweets", len(tweets))
                logging.info("Got %s/%s tweets", fetched, tweets_count)
                days_log.write("Got %s/%s tweets\n" % (fetched, tweets_count))
                result_tweets = []
                for tweet in tweets:
                    logging.debug("Analyzing tweet: %s", tweet)
                    result_tweet = analyze_doc(classificator, ner, tweet)
                    logging.debug("Result tweet: %s", result_tweet)
                    result_tweet["source"] = "twitter"
                    result_tweets.append(result_tweet)
                elastic.insert(result_tweets, "analyzed", "tweet")
            elapsed = datetime.now() - t0
            logging.info("This day took %s", elapsed)
            days_log.write("This day took %s\n" % elapsed)
            day = day + timedelta(days=1)
        logging.info("Finished!!!")
        # Mirror the logger message. The timing variables do not exist when
        # the day loop never ran, so they must not be referenced here.
        days_log.write("Finished!!!\n")
Пример #7
0
 def initialize(self):
     """Create the default classificator from ``example_taxonomy``."""
     # NOTE: a TaxonomyCache attribute was sketched here but is disabled.
     default = Classificator(example_taxonomy)
     self.default_classificator = default
 def initialize(self):
     """Create the default classificator from ``taxonomy``."""
     default = Classificator(taxonomy)
     self.default_classificator = default
Пример #9
0
 def initialize(self):
     """Set up ``default_classificator`` from ``example_taxonomy``."""
     # NOTE: a TaxonomyCache attribute was sketched here but is disabled.
     built = Classificator(example_taxonomy)
     self.default_classificator = built