import logging
from datetime import datetime

# Project-local imports assumed: Classificator, taxonomy, initializeNer,
# ElasticWrapper, analyze_doc, START_DAY, DAYS_LOG_PATH.


def pipeline_dw():
    classificator = Classificator(taxonomy)
    ner = initializeNer()
    elastic = ElasticWrapper()
    day = START_DAY
    days_log = open(DAYS_LOG_PATH, 'a')

    t0 = datetime.now()
    logging.info("Querying for day: %s" % day.strftime("%Y-%m-%dT%H:%M:%SZ"))
    days_log.write("Querying for day: %s\n" % day.strftime("%Y-%m-%dT%H:%M:%SZ"))
    articles_count = elastic.articles_count_from(day)

    count = 0
    while count < articles_count:
        articles = elastic.get_articles_from(day, count)
        if not articles:
            # Guard against an infinite loop if Elasticsearch returns no more hits.
            break
        count += len(articles)
        logging.info("Received %s articles" % len(articles))
        logging.info("Got %s/%s articles" % (count, articles_count))
        days_log.write("Got %s/%s articles\n" % (count, articles_count))

        result_articles = []
        for article in articles:
            logging.debug("Analyzing article: %s" % article)
            result_article = analyze_doc(classificator, ner, article)
            logging.debug("Result article: %s" % result_article)
            result_article["source"] = "dw"
            result_articles.append(result_article)
        elastic.insert(result_articles, "analyzed", "article")

    t1 = datetime.now()
    logging.info("This day took %s" % (t1 - t0))
    days_log.write("This day took %s\n" % (t1 - t0))

    logging.info("Finished!!!")
    days_log.write("Finished!!!\n")
    days_log.close()
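# Neither pipeline defines ElasticWrapper; the sketch below shows the minimal
# contract pipeline_dw() relies on (articles_count_from, get_articles_from,
# insert), assuming the official elasticsearch-py client. The index names
# ("dw", "analyzed"), the "published" date field, and the host are
# hypothetical -- none of them appear in the source.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk


class ElasticWrapper:
    """Thin pagination / bulk-insert layer over elasticsearch-py (sketch)."""

    PAGE_SIZE = 1000  # matches the 1000-document pages the pipelines expect

    def __init__(self, hosts=None):
        self.es = Elasticsearch(hosts or ["localhost:9200"])

    def articles_count_from(self, day):
        # Count every article published on or after `day`.
        query = {"query": {"range": {"published": {
            "gte": day.strftime("%Y-%m-%dT%H:%M:%SZ")}}}}
        return self.es.count(index="dw", body=query)["count"]

    def get_articles_from(self, day, offset):
        # Fetch one page of matching articles, starting `offset` documents in.
        query = {"query": {"range": {"published": {
            "gte": day.strftime("%Y-%m-%dT%H:%M:%SZ")}}}}
        res = self.es.search(index="dw", body=query,
                             from_=offset, size=self.PAGE_SIZE)
        return [hit["_source"] for hit in res["hits"]["hits"]]

    def insert(self, docs, index, doc_type):
        # Bulk-index the analyzed documents.
        actions = [{"_index": index, "_type": doc_type, "_source": doc}
                   for doc in docs]
        bulk(self.es, actions)

# Note: from_/size pagination is capped at 10,000 hits by default
# (index.max_result_window); a production version would likely use the
# scan/scroll helpers instead.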
import json
import logging
from urllib.parse import unquote_plus

from tornado.web import RequestHandler

# Project-local imports assumed: Classificator, example_taxonomy.


class ClassificationService(RequestHandler):

    def initialize(self):
        # self.cache = TaxonomyCache()
        self.default_classificator = Classificator(example_taxonomy)

    def get(self):
        logging.debug("Received GET request")
        service_id = self.get_argument('service_id', default=None)  # currently unused
        text = self.get_argument('text')
        lang = self.get_argument('lang', default=None)  # currently unused
        taxonomy = self.get_argument('taxonomy', default=None)
        if taxonomy:
            # A caller-supplied taxonomy (JSON) overrides the default one.
            taxonomy = json.loads(taxonomy)
            classificator = Classificator(taxonomy)
        else:
            classificator = self.default_classificator
        classification = classificator.classify(text)
        logging.debug(classification)
        result = self.__get_concepts_from_classification(classification)
        if result:
            self.write(json.dumps(result))
        else:
            self.write(json.dumps(['N/A']))

    def post(self):
        results = list()
        logging.debug("Received POST request")
        for line in str(self.request.body, 'utf8').split('\n'):
            fields = line.split('\t')
            # The payload arrives double URL-encoded, hence the two unquote_plus calls.
            text = unquote_plus(unquote_plus(fields[0]))
            logging.debug("Classifying %s" % text)
            classification = self.default_classificator.classify(text)
            result = {
                "text": text,
                "topics": self.__get_concepts_from_classification(classification)
            }
            results.append(result)
        self.write({"response": results})

    def __get_concepts_from_classification(self, classification):
        # classification maps each word to a list of concepts; collect them uniquely.
        unique_concepts = set()
        for word, concepts in classification.items():
            unique_concepts |= set(concepts)
        return list(unique_concepts)

    def __format_post_classification(self, classification):
        concepts = self.__get_concepts_from_classification(classification)
        if concepts:
            return ';;'.join(concepts)
        else:
            return 'N/A'
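# A handler like ClassificationService is normally mounted in a Tornado
# Application; a minimal launcher sketch follows. The "/classify" route and
# port 8888 are assumptions -- the source does not show the URL mapping.

import tornado.ioloop
import tornado.web


def make_app():
    return tornado.web.Application([
        (r"/classify", ClassificationService),  # hypothetical route
    ])


if __name__ == "__main__":
    app = make_app()
    app.listen(8888)  # hypothetical port
    tornado.ioloop.IOLoop.current().start()

# With this wiring, a GET request would look like:
#   /classify?text=some+text&taxonomy=%7B...url-encoded%20JSON...%7D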
import logging
from datetime import datetime, timedelta


def pipeline():
    classificator = Classificator(taxonomy)
    ner = initializeNer()
    elastic = ElasticWrapper()
    day = START_DAY
    today = datetime.now()
    days_log = open(DAYS_LOG_PATH, 'a')

    while day < today:
        t0 = datetime.now()
        logging.info("Querying for day: %s" % day.strftime("%Y-%m-%dT%H:%M:%SZ"))
        days_log.write("Querying for day: %s\n" % day.strftime("%Y-%m-%dT%H:%M:%SZ"))
        tweets_count = elastic.tweets_count_for_day(day)

        count = 0
        while count < tweets_count:
            tweets = elastic.get_day_tweets(day, count)
            if not tweets:
                # Guard against an infinite loop if Elasticsearch returns no more hits.
                break
            count += len(tweets)
            logging.info("Received %s tweets" % len(tweets))
            logging.info("Got %s/%s tweets" % (count, tweets_count))
            days_log.write("Got %s/%s tweets\n" % (count, tweets_count))

            result_tweets = []
            for tweet in tweets:
                logging.debug("Analyzing tweet: %s" % tweet)
                result_tweet = analyze_doc(classificator, ner, tweet)
                logging.debug("Result tweet: %s" % result_tweet)
                result_tweet["source"] = "twitter"
                result_tweets.append(result_tweet)
            elastic.insert(result_tweets, "analyzed", "tweet")

        t1 = datetime.now()
        logging.info("This day took %s" % (t1 - t0))
        days_log.write("This day took %s\n" % (t1 - t0))
        day = day + timedelta(days=1)

    logging.info("Finished!!!")
    days_log.write("Finished!!!\n")
    days_log.close()
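# Both pipelines delegate the per-document work to analyze_doc(), whose
# implementation is not shown. The sketch below is a plausible reading, not
# the real function: it assumes the document is a dict with a "text" field
# and that the NER object is callable on raw text -- both are guesses.


def analyze_doc(classificator, ner, doc):
    result = dict(doc)  # keep all original fields
    text = doc.get("text", "")
    # classify() returns {word: [concepts]}; flatten it into a unique topic
    # list, mirroring ClassificationService.__get_concepts_from_classification.
    classification = classificator.classify(text)
    topics = set()
    for concepts in classification.values():
        topics |= set(concepts)
    result["topics"] = list(topics)
    result["entities"] = ner(text)  # hypothetical NER call
    return result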
def initialize(self):
    # Variant of initialize() that loads the full taxonomy instead of example_taxonomy.
    self.default_classificator = Classificator(taxonomy)