예제 #1
0
    def handle(self, *args, **options):
        """Generate and store recommendations for recently published articles.

        Reads ``options['days']`` and ``options["domain"]``. For every
        ``Domain`` whose ``domain_id`` is in the requested list, the
        ``article`` index is scanned (newest first) for documents published
        inside the range returned by ``self.get_date_range(days)``; each hit
        gets a recommendation document written to the ``recommendation``
        index, one document per ingest call.

        Raises:
            CommandError: if no domain was supplied.
        """
        # The recommendation index must exist before anything is written.
        create_index("recommendation")
        days = options['days']
        requested_domains = options["domain"]

        if not requested_domains:
            raise CommandError('Domain name is required')

        start, end = self.get_date_range(days)

        for domain_obj in Domain.objects.filter(domain_id__in=requested_domains):
            # Articles of this domain with start <= published_on < end.
            search_body = {
                "query": {
                    "bool": {
                        "must": [
                            {"term": {"domain": domain_obj.domain_id}},
                            {"range": {"published_on": {"gte": start, "lt": end}}},
                        ]
                    }
                },
                "sort": [{"published_on": {"order": "desc"}}],
            }
            hits = scan(es, index='article', query=search_body,
                        preserve_order=True)

            for hit in hits:
                article_id = hit['_source']['id']
                title = hit['_source']['title']
                # Kept in its own name so the outer loop variable is not
                # rebound while iterating.
                article_domain = hit['_source']['domain']

                document = {
                    'id': article_id,
                    'recommendation': self.get_recommendations(
                        title, article_domain),
                }
                ingest_to_elastic(
                    [document], "recommendation", "recommendation", "id")

                if not self.DEBUG:
                    continue

                print(f"Generated Recommendation for: {title}")
                for item in document['recommendation']:
                    print("\t", item['title'])
예제 #2
0
    def handle(self, *args, **options):
        """Continuously ingest pickled article files announced via redis.

        Reads ``options['source']`` and ``options['index']``. Starts a
        metrics HTTP server on port 8686, makes sure the target index exists,
        then loops forever:

        * when ``self.get_data_from_redis(source)`` yields a file path, the
          file is decompressed, unpickled and handed to
          ``self.create_model_obj``; the ``source_ingest`` counter is then
          incremented, and ``self.now`` is refreshed when the current date
          (Asia/Kolkata) differs from it;
        * otherwise the command sleeps 10 seconds, and once at least 60
          seconds of idle time have accumulated any pending ``self.batch``
          is flushed to the index.

        The loop ends with exit status 0 on ``KeyboardInterrupt``.

        Raises:
            CommandError: if ``--source`` was not given.
        """
        if options['source'] is None:
            raise CommandError("Option `--source=...` must be specified.")

        # start prometheus http server for metrics
        start_http_server(8686)

        source = options['source']
        index = options['index']
        create_index(index)
        domain = Domain.objects.get(domain_id="newscout")
        try:
            while True:
                file_path = self.get_data_from_redis(source)

                if not file_path:
                    # Nothing queued: idle, and flush a pending batch once
                    # enough idle time has accumulated.
                    self.task_state.state("waiting")
                    print("Sleeping...!!!")
                    time.sleep(10)
                    self.sleep_time += 10
                    if self.sleep_time >= 60 and self.batch:
                        ingest_to_elastic(self.batch, index, index, 'id')
                        print("Ingesting Final Batch...!!!")
                        self.batch = []
                        # The idle counter is only reset after an actual
                        # flush, as before.
                        self.sleep_time = 0
                    continue

                date = datetime.now(
                    pytz.timezone("Asia/Kolkata")).strftime("%Y-%m-%d")
                self.task_state.state("running")
                self.sleep_time = 0

                if not os.path.isfile(file_path):
                    msg = "Data file not found: {0}".format(file_path)
                    print(msg)
                    continue

                # NOTE(review): unpickling executes arbitrary code embedded
                # in the file; confirm these files only ever come from a
                # trusted producer.
                # The context manager closes the handle even if
                # decompression or unpickling fails.
                with open(file_path, "rb") as data_file:
                    doc = cPickle.loads(zlib.decompress(data_file.read()))

                try:
                    self.create_model_obj(doc, domain, index)
                    if date != self.now:
                        # Day rolled over: remember the new date.
                        self.now = datetime.now(
                            pytz.timezone("Asia/Kolkata")).strftime(
                                "%Y-%m-%d")
                        # self.reset_stats()
                    self.source_ingest.labels(
                        source=doc.get("source", "source"),
                        category=doc.get("category", "category")).inc()
                except Exception as e:
                    print("error in doc read")
                    print(e)
        except KeyboardInterrupt:
            sys.exit(0)
 def ingest(self, *args, **options):
     """Serialize every ``Article`` and push it to the ``article`` index.

     Articles are streamed from the database, serialized with
     ``ArticleSerializer`` and accumulated in ``self.batch``. A non-empty
     ``hash_tags`` value is replaced by the result of ``self.get_tags``.
     The batch is sent and emptied each time it reaches 999 entries;
     whatever remains is sent once the iteration is finished.
     """
     print("Ingesting Data from Database\n")
     index_name = 'article'
     create_index(index_name)

     for record in Article.objects.all().iterator():
         payload = ArticleSerializer(record).data
         if payload["hash_tags"]:
             payload["hash_tags"] = self.get_tags(payload["hash_tags"])
         self.batch.append(payload)

         if len(self.batch) != 999:
             continue

         # Batch is full: send it and start a fresh one.
         ingest_to_elastic(self.batch, index_name, index_name, 'id')
         self.batch = []
         print("Ingesting Batch...!!!")

     # Send the remainder that never reached the batch size.
     ingest_to_elastic(self.batch, index_name, index_name, 'id')
     print("Ingesting Final Batch...!!!")
 def ingest(self, *args, **options):
     """Populate the ``auto_suggestions`` index from domains, sources and categories.

     For each ``Domain`` (by ``domain_name``), ``Source`` (by ``name``) and
     ``Category`` (by ``name``), in that order, every non-null, non-empty
     name becomes one suggestion document with the keys ``desc``,
     ``name_suggest`` and ``id`` (the MD5 hex digest of the UTF-8 encoded
     name). Documents accumulate in ``self.batch``, which is sent and
     emptied whenever it reaches 999 entries; the remainder is sent at the
     end.
     """
     print("Ingesting Data from Database\n")
     index = 'auto_suggestions'
     create_index(index, auto_suggestion_mapping)

     # (model, name attribute) pairs, processed in this order. The three
     # passes were identical apart from these two values.
     suggestion_sources = (
         (Domain, "domain_name"),
         (Source, "name"),
         (Category, "name"),
     )

     for model, field in suggestion_sources:
         rows = model.objects.filter(**{field + "__isnull": False}).iterator()
         for row in rows:
             label = getattr(row, field)
             # The queryset already excludes NULL; this also skips "".
             if not label:
                 continue

             self.batch.append({
                 "desc": label,
                 "name_suggest": label,
                 "id": md5(str(label).encode("utf-8")).hexdigest(),
             })

             if len(self.batch) == 999:
                 ingest_to_elastic(self.batch, index, index, 'id')
                 self.batch = []
                 print("Ingesting Batch...!!!")

     # Send whatever is left after the last full batch.
     ingest_to_elastic(self.batch, index, index, 'id')
     print("Ingesting Final Batch...!!!")
예제 #5
0
    def handle(self, *args, **options):
        """Ingest every ``.dat`` file found under a directory tree.

        Reads ``options['source']``, ``options['json']`` (the directory to
        walk), ``options['index']``, ``options['domain_name']`` and
        ``options['domain_id']``. Each ``.dat`` file is decompressed,
        unpickled and passed to ``self.create_model_obj``; errors raised by
        that call are printed and the walk continues. Any documents left in
        ``self.batch`` afterwards are sent to the index. Exits with status 0
        on ``KeyboardInterrupt``.

        Raises:
            CommandError: if ``--source``, ``--domain_name`` or
                ``--domain_id`` is missing.
        """
        # ``source`` is only validated here; it is not used further below.
        if options['source'] is None:
            raise CommandError("Option `--source=...` must be specified.")

        json_files = options['json']
        index = options['index']
        domain_name = options["domain_name"]
        domain_id = options["domain_id"]
        if not domain_name:
            raise CommandError("Option `--domain_name=...` must be specified.")

        if not domain_id:
            raise CommandError("Option `--domain_id=...` must be specified.")

        create_index(index)
        domain, _ = Domain.objects.get_or_create(domain_name=domain_name,
                                                 domain_id=domain_id)
        try:
            for root, _dirs, files in os.walk(json_files):
                for f in files:
                    if not f.endswith(".dat"):
                        continue

                    file_path = os.path.join(root, f)
                    if not os.path.isfile(file_path):
                        msg = "Data file not found: {0}".format(
                            file_path)
                        print(msg)
                        continue

                    # NOTE(review): unpickling executes arbitrary code
                    # embedded in the file; confirm the directory only
                    # contains trusted data.
                    # Binary mode is required: zlib.decompress needs bytes,
                    # and text mode would try to decode the compressed data.
                    with open(file_path, "rb") as data_file:
                        doc = cPickle.loads(
                            zlib.decompress(data_file.read()))

                    try:
                        self.create_model_obj(doc, index, domain)
                    except Exception as e:
                        print(e)

            if self.batch:
                ingest_to_elastic(self.batch, index, index, 'id')
                print("Ingesting Final Batch...!!!")
                self.batch = []
        except KeyboardInterrupt:
            sys.exit(0)
 def handle(self, *args, **options):
     """Ingest up to 200 articles per category for the ``newscout`` domain.

     Reads ``options['index']``. For each ``Category`` that has at least
     one article in the ``newscout`` domain, the first 200 matching
     articles are serialized with ``ArticleSerializer`` (a non-empty
     ``hash_tags`` value is replaced by the result of ``self.get_tags``)
     and accumulated in ``self.batch``. The batch is sent and emptied when
     it reaches 200 entries, and again after each category if anything is
     left.
     """
     print("Ingesting Data from Database\n")
     index = options['index']
     create_index(index)
     categories = Category.objects.all()
     domain = Domain.objects.get(domain_id="newscout")
     for cat in categories:
         cat_articles = Article.objects.filter(category=cat, domain=domain)
         if not cat_articles.exists():
             continue

         for article in cat_articles[:200]:
             serializer = ArticleSerializer(article)
             json_data = serializer.data
             if json_data["hash_tags"]:
                 tag_list = self.get_tags(json_data["hash_tags"])
                 json_data["hash_tags"] = tag_list
             self.batch.append(json_data)
             if len(self.batch) == 200:
                 ingest_to_elastic(self.batch, index, index, 'id')
                 self.batch = []
                 print("Ingesting Batch...!!!")

         # Flush this category's remainder. The batch is cleared afterwards
         # so these documents are not sent again with the next category,
         # and an empty batch is not sent at all.
         if self.batch:
             ingest_to_elastic(self.batch, index, index, 'id')
             print("Ingesting Final Batch...!!!")
             self.batch = []