Пример #1
0
    def spider_ended(spider, reason):
        """Announce that a spider finished and publish one message per start URL.

        Prints the spider's name and the given reason, then, for every URL in
        ``spider.start_urls``, sends a dict holding the URL's domain and the
        enclosing scope's ``ID`` to ``TOPIC_OUTPUT`` via ``kafka.send_message``.

        NOTE(review): the (spider, reason) signature looks like a spider-closed
        signal handler -- confirm at the place where this callback is registered.
        """
        print('Spider ended:', spider.name, reason)
        for start_url in spider.start_urls:
            domain = str(text_parser.extract_domain_from_url(start_url))
            payload = {'domain': domain, 'spider_id': ID}
            kafka.send_message(producer=producer,
                               topic=TOPIC_OUTPUT,
                               value=payload)
Пример #2
0
def main():
    """Filter clustered pages with the classifier and forward the result.

    Loads the classifier, connects a Kafka consumer and producer, then polls
    forever. For every received message the domain's clusters are parsed with
    ``cluster_utils.parse_cluster``; pages whose ``cluster_label`` is
    ``'product'`` are kept only when ``classifier.predict`` returns ``[1]``
    for their text, every other page is kept unchanged. The resulting
    ``{'domain', 'filtered_pages'}`` dict is stored as JSON through
    ``mongo.put`` and sent to ``TOPIC_OUTPUT``.

    Raises:
        Exception: whatever ``kafka.connectConsumer`` or
            ``kafka.connectProducer`` raised. The error is printed and then
            re-raised, because the loop cannot work without both connections.
    """
    print('Loading the model...')
    model = classifier.load_classifier(model=MODEL,
                                       parquet=TRAINING_PARQUET,
                                       training_set=TRAINING_SET)
    print('Running Consumer...')

    # Fail fast on connection errors: swallowing them would leave `consumer`
    # or `producer` unbound and surface later as an unrelated
    # UnboundLocalError (for the producer, only after data was already
    # written to the database).
    try:
        consumer = kafka.connectConsumer(topic=TOPIC_INPUT,
                                         server=KAFKA_BROKER_URL)
    except Exception as ex:
        print("Error connecting kafka broker as Consumer")
        print(ex)
        raise
    else:
        print("Consumer connected")

    try:
        producer = kafka.connectProducer(server=KAFKA_BROKER_URL)
    except Exception as ex:
        print("Error connecting kafka broker as Producer")
        print(ex)
        raise
    else:
        print("Producer connected")

    # No exit condition: runs until the process is interrupted or an
    # exception escapes the loop.
    while True:
        message_dict = kafka.consume(consumer=consumer)
        if not message_dict:
            # Nothing was returned by this poll; try again.
            continue
        for messages in message_dict.values():
            for message in messages:
                domain = message.value['domain']
                print('Received message: ' + str(domain))
                domain_clusters = cluster_utils.parse_cluster(
                    domain, message.value['TaggedClusters'])

                filtered_list = []
                for page_dict in domain_clusters:
                    if page_dict['cluster_label'] == 'product':
                        # Pages labelled 'product' must also be confirmed
                        # by the classifier before they are kept.
                        prediction = classifier.predict(
                            model=model, input=page_dict['text'])
                        if prediction == [1]:
                            filtered_list.append(page_dict)
                    else:
                        filtered_list.append(page_dict)

                content = {
                    'domain': domain,
                    'filtered_pages': filtered_list,
                }
                # The database receives the JSON string; Kafka receives the
                # dict itself.
                mongo.put(domain, json.dumps(content))
                print('Data saved on db: collection: ' + str(domain))
                kafka.send_message(producer=producer,
                                   topic=TOPIC_OUTPUT,
                                   message=content)
Пример #3
0
def main():
    """Classify incoming page messages and forward the positive ones.

    Loads the classifier, builds a partitioner for ``TOPIC_OUTPUT``, connects
    a Kafka consumer and producer, then polls forever. Every message whose
    ``url_page`` value makes ``classifier.predict`` return ``1`` is stored as
    JSON in the ``'Classifier'`` collection through ``mongo.put`` and sent to
    ``TOPIC_OUTPUT``, keyed by a running message counter.

    Raises:
        Exception: whatever ``kafka.get_RoundRobin_partitioner_for_topic``,
            ``kafka.connectConsumer`` or ``kafka.connectProducer`` raised.
            The error is printed and then re-raised, because the loop cannot
            work without all three.
    """
    print('Loading the model...')
    model = classifier.load_classifier(model=MODEL,
                                       parquet=TRAINING_PARQUET,
                                       training_set=TRAINING_SET)
    print('Running Consumer...')

    # Fail fast on setup errors: swallowing them would leave `partitioner`,
    # `consumer` or `producer` unbound and surface later as an unrelated
    # UnboundLocalError (a failed partitioner lookup used to be reported as
    # a producer connection error).
    try:
        partitioner = kafka.get_RoundRobin_partitioner_for_topic(
            TOPIC_OUTPUT, KAFKA_BROKER_URL)
    except Exception as ex:
        print('Error with topic partitions')
        print(ex)
        raise

    try:
        consumer = kafka.connectConsumer(topic=TOPIC_INPUT,
                                         server=KAFKA_BROKER_URL)
    except Exception as ex:
        print("Error connecting kafka broker as Consumer")
        print(ex)
        raise
    else:
        print("Consumer connected")

    try:
        producer = kafka.connectProducer(server=KAFKA_BROKER_URL,
                                         partitioner=partitioner)
    except Exception as ex:
        print("Error connecting kafka broker as Producer")
        print(ex)
        raise
    else:
        print("Producer connected")

    collection = 'Classifier'
    # Counts every consumed message, not only the forwarded ones, so the keys
    # of forwarded messages are not necessarily consecutive.
    i = 0
    # No exit condition: runs until the process is interrupted or an
    # exception escapes the loop.
    while True:
        message_dict = kafka.consume(consumer=consumer)
        if not message_dict:
            # Nothing was returned by this poll; try again.
            continue
        for messages in message_dict.values():
            for message in messages:
                if classifier.predict(
                        model=model, input=message.value['url_page']) == 1:
                    mongo.put(collection, json.dumps(message.value))
                    print('Data saved on db: collection: ' +
                          str(collection) + ' url: ' +
                          message.value['url_page'])

                    kafka.send_message(producer=producer,
                                       key=i,
                                       topic=TOPIC_OUTPUT,
                                       message=message.value)
                i = i + 1
Пример #4
0
                                DATABASE_READ, collection_name)
                        except:
                            print('#########################################')
                            print('#######ERROR tryng to read from db#######')
                            print('#########################################')

                        try:
                            clusters = clustering.structural_clustering(
                                collection, threshold)
                        except:
                            print('#########################################')
                            print('########ERROR tryng to cluster###########')
                            print('#########################################')
                        if clusters:
                            content = {
                                'domain': collection_name,
                                'clusters': clusters
                            }
                            content_json = json.dumps(content)
                            mongo.put(collection_name, content_json)
                            print('#########################################')
                            print('############Data saved on DB#############')
                            print('#########################################')
                            kafka.send_message(producer=producer,
                                               topic=TOPIC_OUTPUT,
                                               value=content)
                            print('Sent message for domain: ' +
                                  str(content['domain']))
                    except:
                        print('#############ERROR clusterizer ##############')