def spider_ended(spider, reason):
    """Signal-handler for spider shutdown.

    Logs the spider's name and close *reason*, then publishes one message
    per start URL on the output topic, carrying the URL's extracted
    domain and this worker's spider ID.
    """
    print('Spider ended:', spider.name, reason)
    for start_url in spider.start_urls:
        payload = {
            'domain': str(text_parser.extract_domain_from_url(start_url)),
            'spider_id': ID,
        }
        kafka.send_message(producer=producer, topic=TOPIC_OUTPUT,
                           value=payload)
def main():
    """Consume domain-cluster messages, filter 'product' pages through the
    classifier, persist the filtered set to Mongo, and forward it to Kafka.

    Loads the model once, connects a Kafka consumer and producer, then
    loops forever over incoming messages.

    Fix: the original fell through after a failed broker connection and
    later crashed with a NameError on the unbound ``consumer``/``producer``;
    we now return early when either connection cannot be established.
    """
    print('Loading the model...')
    model = classifier.load_classifier(model=MODEL,
                                       parquet=TRAINING_PARQUET,
                                       training_set=TRAINING_SET)
    print('Running Consumer...')
    try:
        consumer = kafka.connectConsumer(topic=TOPIC_INPUT,
                                         server=KAFKA_BROKER_URL)
        print("Consumer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Consumer")
        print(ex)
        return  # cannot continue without a consumer
    try:
        producer = kafka.connectProducer(server=KAFKA_BROKER_URL)
        print("Producer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Producer")
        print(ex)
        return  # cannot continue without a producer
    working = True
    while working:
        message_dict = kafka.consume(consumer=consumer)
        if not message_dict:  # nothing polled this round
            continue
        for topic, messages in message_dict.items():
            for message in messages:
                print('Received message: ' + str(message.value['domain']))
                domain = message.value['domain']
                domain_clusters = cluster_utils.parse_cluster(
                    domain, message.value['TaggedClusters'])
                filtered_list = []
                for page_dict in domain_clusters:
                    label = page_dict['cluster_label']
                    if label == 'product':
                        # product pages are kept only when the classifier
                        # accepts them (prediction == [1])
                        page_text = page_dict['text']
                        prediction = classifier.predict(model=model,
                                                        input=page_text)
                        if prediction == [1]:
                            filtered_list.append(page_dict)
                    else:
                        # non-product pages pass through unfiltered
                        filtered_list.append(page_dict)
                content = {
                    'domain': domain,
                    'filtered_pages': filtered_list
                }
                content_json = json.dumps(content)
                mongo.put(domain, content_json)
                print('Data saved on db: collection: ' + str(domain))
                kafka.send_message(producer=producer,
                                   topic=TOPIC_OUTPUT,
                                   message=content)
def main():
    """Classify incoming page messages and fan the accepted ones out to a
    partitioned Kafka topic, persisting each to the 'Classifier' collection.

    Loads the model, builds a round-robin partitioner for the output
    topic, connects a consumer and a producer, then loops forever;
    ``i`` is the monotonically increasing partition key.

    Fix: the original fell through after a failed partitioner/consumer/
    producer setup and later crashed with a NameError on the unbound
    name inside the loop; we now return early on any setup failure.
    """
    print('Loading the model...')
    model = classifier.load_classifier(model=MODEL,
                                       parquet=TRAINING_PARQUET,
                                       training_set=TRAINING_SET)
    print('Running Consumer...')
    try:
        partitioner = kafka.get_RoundRobin_partitioner_for_topic(
            TOPIC_OUTPUT, KAFKA_BROKER_URL)
    except Exception as ex:
        print('Error with topic partitions')
        print(ex)
        return  # producer setup below requires the partitioner
    try:
        consumer = kafka.connectConsumer(topic=TOPIC_INPUT,
                                         server=KAFKA_BROKER_URL)
        print("Consumer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Consumer")
        print(ex)
        return  # cannot continue without a consumer
    try:
        producer = kafka.connectProducer(server=KAFKA_BROKER_URL,
                                         partitioner=partitioner)
        print("Producer connected")
    except Exception as ex:
        print("Error connecting kafka broker as Producer")
        print(ex)
        return  # cannot continue without a producer
    i = 0
    working = True
    while working:
        message_dict = kafka.consume(consumer=consumer)
        if not message_dict:  # nothing polled this round
            continue
        for topic, messages in message_dict.items():
            for message in messages:
                # only forward pages the classifier accepts
                if classifier.predict(
                        model=model, input=message.value['url_page']) == 1:
                    collection = 'Classifier'
                    mongo.put(collection, json.dumps(message.value))
                    print('Data saved on db: collection: ' +
                          str(collection) + ' url: ' +
                          message.value['url_page'])
                    kafka.send_message(producer=producer,
                                       key=i,
                                       topic=TOPIC_OUTPUT,
                                       message=message.value)
                    i = i + 1
DATABASE_READ, collection_name) except: print('#########################################') print('#######ERROR tryng to read from db#######') print('#########################################') try: clusters = clustering.structural_clustering( collection, threshold) except: print('#########################################') print('########ERROR tryng to cluster###########') print('#########################################') if clusters: content = { 'domain': collection_name, 'clusters': clusters } content_json = json.dumps(content) mongo.put(collection_name, content_json) print('#########################################') print('############Data saved on DB#############') print('#########################################') kafka.send_message(producer=producer, topic=TOPIC_OUTPUT, value=content) print('Sent message for domain: ' + str(content['domain'])) except: print('#############ERROR clusterizer ##############')