Example #1
def __queue_smart_crawl_start(workspace_id, job_id, page_limit, broadness,
                              urls, page_model):
    '''
    {
        "workspace_id": "workspace id",
        "id": "crawl id",
        "page_limit": 100,
        "broadness": "BROAD",  # valid codes are ["N10", "N100", "N1000", "N10000", "BROAD"]
        "urls": ["http://example.com", "http://example.com/2"],
        "page_model": "b64-encoded page classifier"
    }
    '''

    message = {
        'workspace_id': workspace_id,  # Singleton.getInstance().mongo_instance.get_current_workspace_name()
        'id': job_id,
        'page_limit': page_limit,
        'broadness': broadness,
        'urls': urls,
        'page_model': page_model,
    }
    logging.info(message)
    Singleton.getInstance().broker_service.add_message_to_dd_crawler_input(
        message)
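
# Usage sketch (illustrative, not part of the original module): the docstring
# above documents the message schema, so a caller might assemble the arguments
# roughly as below. The workspace id, classifier bytes and URLs are
# placeholders; base64 and uuid are assumed to be imported by this module.
def example_start_smart_crawl():
    page_model = base64.b64encode(b"<serialized page classifier>")  # placeholder model
    __queue_smart_crawl_start(
        workspace_id="example-workspace",  # placeholder workspace id
        job_id=str(uuid.uuid1()),
        page_limit=100,
        broadness="N100",  # one of "N10", "N100", "N1000", "N10000", "BROAD"
        urls=["http://example.com", "http://example.com/2"],
        page_model=page_model)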
def queue_broad_crawl(workspace_id, job_id, num_to_fetch,
                      broad_crawler_provider, broad_crawler_sources):
    # keywords = dao_get_keywords()
    keywords = dao_get_keywords_by_relevance(workspace_id)
    categorized_urls = get_seeds_urls_categorized(workspace_id)
    existent_url = get_relevant_or_neutral_seeds_urls_url(workspace_id)

    # jobId = str(uuid.uuid1())
    logging.info("sending broad crawl message for %s urls with keywords %s" %
                 (str(num_to_fetch), str(keywords)))
    message = {
        'included': keywords['included'],
        'excluded': keywords['excluded'],
        'relevantUrl': categorized_urls['relevant'],
        'irrelevantUrl': categorized_urls['irrelevant'],
        'nResults': int(num_to_fetch),
        'existentUrl': existent_url,  # get_seeds_urls_url()
        # 'appInstance': Singleton.getInstance().app_instance,
        'workspace': workspace_id,  # Singleton.getInstance().mongo_instance.get_current_workspace_name()
        'jobId': job_id,
        'crawlProvider': broad_crawler_provider,
        'crawlSources': broad_crawler_sources
    }

    logging.info(message)
    Singleton.getInstance().broker_service.add_message_to_broadcrawler(
        message, broad_crawler_provider)
Example #3
def get_screenshot(crawl_type, id):
    if crawl_type == "broadcrawl":
        collection = Singleton.getInstance(
        ).mongo_instance.get_broad_crawler_collection()
    elif crawl_type == "deepcrawl":
        collection = Singleton.getInstance(
        ).mongo_instance.get_deep_crawler_collection()
    elif crawl_type == "deepcrawl-domains":
        collection = Singleton.getInstance(
        ).mongo_instance.get_deep_crawler_domains_collection()
    elif crawl_type == "keywords":
        collection = Singleton.getInstance(
        ).mongo_instance.get_seed_urls_collection()
    else:
        logging.info("invalid crawl type: " + crawl_type)
        return ""

    res = collection.find({"_id": ObjectId(id)})
    docs = list(res)

    screenshot = ""

    if len(docs) > 0:
        url = docs[0]["url"]
        encoded = Singleton.getInstance().es_client.get_screenshoot(url)
        if encoded:
            screenshot = base64.b64decode(encoded)

    return screenshot
def queue_deep_crawl_stop(workspace_id, job_id):
    logging.info("preparing stop deep-crawler")
    message = {
        'id': job_id,
        # 'workspace_id': workspace_id,
        'stop': True
    }

    logging.info(message)
    Singleton.getInstance().broker_service.add_message_to_deepcrawler(message)
Example #5
def queue_login(workspace_id, job_id, credentials):
    message = {
        'workspace_id': workspace_id,
        'job_id': job_id,
        'id': credentials["_id"],
        'domain': credentials["domain"],
        'url': credentials["url"],
        'key_values': credentials["keyValues"]
    }

    Singleton.getInstance().broker_service.add_message_to_login(message)
def queue_deep_crawl_start(workspace_id, job_id, num_to_fetch, urls,
                           login_credentials):
    logging.info("preparing deep crawl message")
    message = {
        'id': job_id,
        'workspace_id': workspace_id,
        'page_limit': int(num_to_fetch),
        'urls': urls,
        'login_credentials': login_credentials
    }

    logging.info(message)
    Singleton.getInstance().broker_service.add_message_to_deepcrawler(message)
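
# Usage sketch (illustrative, not part of the original module): queue a deep
# crawl for a couple of seed URLs. The ids and URLs are placeholders, and uuid
# is assumed to be imported; stored credentials (see get_logins in a later
# example) could be passed as login_credentials instead of the empty list.
def example_start_deep_crawl(workspace_id):
    queue_deep_crawl_start(workspace_id=workspace_id,
                           job_id=str(uuid.uuid1()),
                           num_to_fetch=100,
                           urls=["http://example.com", "http://example.org"],
                           login_credentials=[])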
def register_scraping_subscriber():
    # Callback
    def action1(args):
        logging.info("read_topic_from_scraping run with: " + str(args))

    # def save_urls(obj):
    #     url = obj['url']
    #     print("save_url with: " + url)
    #     print("save_url object with: " + str(obj))
    #     dao_update_url(url=url, obj=obj)

    # Singleton.getInstance().broker_service.read_topic_from_broadcrawler(callback=save_urls)
    Singleton.getInstance().broker_service.read_topic_from_scraping(callback=action1)
Example #8
def get_screenshot(crawl_type, id):
    if crawl_type == "broadcrawl":
        collection = Singleton.getInstance(
        ).mongo_instance.get_broad_crawler_collection()
    elif crawl_type == "keywords":
        collection = Singleton.getInstance(
        ).mongo_instance.get_seed_urls_collection()
    else:
        logging.info("invalid crawl type: " + crawl_type)
        return ""

    res = collection.find({"_id": ObjectId(id)})
    url = res[0]["url"]
    encoded = Singleton.getInstance().es_client.get_screenshoot(url)
    return encoded
Example #9
def get_deep_crawl_domains_by_domain_name(workspace_id, job_id, domain_name,
                                          limit, last_id):

    collection = Singleton.getInstance(
    ).mongo_instance.get_deep_crawler_collection()

    and_source_conditions = []

    workspace_search_object = {'workspaceId': workspace_id}
    and_source_conditions.append(workspace_search_object)

    job_search_object = {'jobId': job_id}
    and_source_conditions.append(job_search_object)

    domain_name_search_object = {'domain': domain_name}
    and_source_conditions.append(domain_name_search_object)

    if last_id is not None:
        page_search_object = {"_id": {"$gt": ObjectId(last_id)}}
        and_source_conditions.append(page_search_object)

    query = {'$and': and_source_conditions}
    cursor = collection.find(query) \
            .sort('_id', pymongo.ASCENDING) \
            .limit(limit)

    docs = list(cursor)
    return docs
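
# Pagination sketch (illustrative, not part of the original module): the
# "$gt last _id" filter above is keyset pagination, so a caller can walk every
# page by feeding the last _id of each batch back in.
def example_iterate_domain_pages(workspace_id, job_id, domain_name,
                                 page_size=50):
    last_id = None
    while True:
        docs = get_deep_crawl_domains_by_domain_name(
            workspace_id, job_id, domain_name, page_size, last_id)
        if not docs:
            break
        for doc in docs:
            yield doc
        last_id = str(docs[-1]["_id"])  # resume after the last document seen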
Example #10
def get_logins(workspace_id, domains):

    collection = Singleton.getInstance().mongo_instance.get_login_collection()

    and_source_conditions = []

    workspace_search_object = {'workspaceId': workspace_id}
    and_source_conditions.append(workspace_search_object)

    domains_search_object = {'domain': {'$in': domains}}
    and_source_conditions.append(domains_search_object)

    key_values_search_object = {'keyValues': {"$exists": True}}
    and_source_conditions.append(key_values_search_object)

    query = {'$and': and_source_conditions}
    fields = {
        'url': 1,
        'domain': 1,
        'keysOrder': 1,
        'keyValues': 1,
        'result': 1,
        '_id': 1
    }
    cursor = collection.find(query, fields)
    docs = list(cursor)
    for doc in docs:
        doc["_id"] = str(doc["_id"])

    return docs
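
# Usage sketch (illustrative, not part of the original module): the documents
# returned by get_logins carry the fields queue_login reads ("_id", "domain",
# "url", "keyValues"), so stored credentials can be re-queued one by one. The
# job id is a placeholder and uuid is assumed to be imported.
def example_requeue_logins(workspace_id, domains):
    job_id = str(uuid.uuid1())
    for credentials in get_logins(workspace_id, domains):
        queue_login(workspace_id, job_id, credentials)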
Example #11
def save_login(credentials):
    collection = Singleton.getInstance().mongo_instance.get_login_collection()

    login_search_object = {'_id': ObjectId(credentials["_id"])}

    operation = {'$set': {"keyValues": credentials["keyValues"]}}
    collection.update(login_search_object, operation)
def get_max_id(workspace_id, job_id):

    ws_object = {}
    ws_object["workspaceId"] = workspace_id

    search_object = {}
    if job_id:
        search_object = {"jobId": job_id}

    # res = Singleton.getInstance().mongo_instance.get_broad_crawler_output_collection() \
    # collection = Singleton.getInstance().mongo_instance.get_broad_crawler_output_collection_by_workspace_id(workspace_id)

    collection = Singleton.getInstance(
    ).mongo_instance.get_broad_crawler_collection()
    res = collection \
        .find({'$and': [ws_object, search_object]}, ['_id']) \
        .sort('_id', pymongo.DESCENDING) \
        .limit(1)

    res_list = list(res)
    if len(res_list) > 0:
        max_id = str(res_list[0]["_id"])
    else:
        max_id = None

    return max_id
Example #13
def create_account(username):
    password = request.json['password']
    encrypted_password = utils.encrypt_password(password)

    try:
        Singleton.getInstance().user_datastore.create_user(
            email=username,
            password=encrypted_password,
            roles=[],
            active=True,
            login_count=0)
    except NotUniqueError:
        raise InvalidUsage('A username with that email already exists',
                           status_code=409)

    return Response("{}", mimetype="application/json")
    def __init__(self):
        self.counter = 0
        self.sample_size = 50

        def action1(args):
            print("callback run with: " + str(args))
            print("current counter:" + str(self.counter))
            self.counter = self.counter + 1

        kafka_host_port = "9092"
        kafka_host_name = "localhost"
        # topic_group = "test-group"
        # topic = 'splash'
        # topic_input = topic + '-input'
        # topic_output = topic + '-input'#+ '-output'

        app_instance = str(uuid.uuid1())
        app_instance = 'app-instance'  #str(uuid.uuid1())
        instance = Singleton.getInstance()
        instance.app_instance = app_instance
        instance.mongo_instance = MongoInstance()

        kafka_connector = KafkaConnector(kafka_host_name, kafka_host_port)
        # kafka_connector.create_topic(topic_input)

        broker_service = BrokerService(app_instance, kafka_host_name,
                                       kafka_host_port)

        #subscribe
        # broker_service.read_from_queue(action1, topic_group, topic_output)
        broker_service.read_topic_from_splash(action1)
        time.sleep(1)

        #produce
        message = {}
        i = 0
        while i < self.sample_size:
            message['index'] = i
            message['text'] = 'test'
            # message['date'] =  strftime("%Y-%m-%d %H:%M:%S", gmtime())
            message['splash_url_path'] = \
                "/render.json?png=1&html=1&url=http://www.hyperiongray.com"

            # broker_service.post_to_queue(message, topic_input, topic_output)
            broker_service.add_message_to_splash(message)
            print 'sending: ' + str(i)
            i = i + 1

        time.sleep(5)
        print "Finish all"
        print('Samples produced: ' + str(self.sample_size))
        print('Samples consumed: ' + str(self.counter))
        if self.sample_size == self.counter:
            print 'Test passed'
        else:
            print 'test failed!!!'

        time.sleep(1000000)
Example #15
    def get_metadata(self, workspace_id):
        metadata = {}
        metadata['workspace'] = workspace_id
        metadata['source'] = Singleton.getInstance().app_instance
        metadata['callbackQueue'] = "callback_queue_not_used"
        metadata['timestamp'] = time.time()
        metadata['strTimestamp'] = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        return metadata
def get_existing_categories_service(workspace_id):

    collection = Singleton.getInstance(
    ).mongo_instance.get_broad_crawler_collection()
    search_object = {}
    search_object["workspaceId"] = workspace_id
    categories = list(collection.find(search_object).distinct("categories"))
    languages = list(collection.find(search_object).distinct("language"))
    return categories, languages
def scraping_publication(url):

    def register_scraping_subscriber():

        logging.info('registering register_scraping_subscriber')

    # Callback
    def action1(args):
        logging.info("action1 callback run with: " + str(args))

    logging.info('publishing for scraping: ' + url)
    message = {
        'url': url,
        'nResults': 2,
        'workspace': Singleton.getInstance().mongo_instance.get_current_workspace_name()
    }
    Singleton.getInstance().broker_service.add_message_to_scraping(message)
    return 282828
Example #18
    def post_to_queue(self, message, input_queue, callback_queue):

        # add more headers to the message
        message['source'] = Singleton.getInstance().app_instance  # self.app_instance
        message['callbackQueue'] = callback_queue
        message['timestamp'] = time.time()
        message['strTimestamp'] = strftime("%Y-%m-%d %H:%M:%S", gmtime())

        json_message = json.dumps(message)
        self.kafka_connector.send_message(input_queue, json_message)
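
    # Usage sketch (illustrative, not part of the original class): after the
    # call, the message also carries the 'source', 'callbackQueue', 'timestamp'
    # and 'strTimestamp' headers added above. Queue names are placeholders.
    def example_post(self):
        message = {'url': 'http://example.com', 'nResults': 2}
        self.post_to_queue(message, 'example-input-queue', 'example-callback-queue')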
Example #19
def pin_service(workspace_id, id, is_pinned):
    logging.info("PINNING: %s" % id + " as " + str(is_pinned))
    collection = Singleton.getInstance(
    ).mongo_instance.get_broad_crawler_collection()
    update_object = {"pinned": is_pinned}
    collection.update({"_id": ObjectId(id)}, {'$set': update_object}, upsert=True)

    publish_to_events_queue(workspace_id, "bookmark", "changed", {
        "id": id,
        "pinned": is_pinned
    })
Example #20
def get_job_by_id(job_id):
    collection = Singleton.getInstance(
    ).mongo_instance.get_crawl_job_collection()

    and_source_conditions = []

    job_search_object = {'_id': ObjectId(job_id)}
    and_source_conditions.append(job_search_object)

    query = {'$and': and_source_conditions}
    cursor = collection.find(query)
    docs = list(cursor)
    return docs[0]
Example #21
def get_domains_by_job_id(workspace_id, job_id):

    collection = Singleton.getInstance(
    ).mongo_instance.get_deep_crawler_domains_collection()

    and_source_conditions = []

    workspace_search_object = {'workspaceId': workspace_id}
    and_source_conditions.append(workspace_search_object)

    job_search_object = {'jobId': job_id}
    and_source_conditions.append(job_search_object)

    query = {'$and': and_source_conditions}
    cursor = collection.find(query)
    docs = list(cursor)
    return docs
def get_relevant_or_neutral_seeds_urls_url(workspace_id):
    # res = Singleton.getInstance().mongo_instance.get_broad_crawler_output_collection()\
    # collection = Singleton.getInstance().mongo_instance.get_broad_crawler_output_collection_by_workspace_id(workspace_id)
    collection = Singleton.getInstance(
    ).mongo_instance.get_seed_urls_collection()
    res = collection.find(
        {'relevant': {'$ne': False}, "workspaceId": workspace_id},
        {'url': 1})
    docs = list(res)
    urls = []
    for doc in docs:
        # urls.append({"id": str(doc["_id"]), "url": doc["url"]})
        urls.append(doc["url"])

    return urls
Example #23
def __get_urls(workspace_id):
    collection = Singleton.getInstance(
    ).mongo_instance.get_seed_urls_collection()
    relevant_urls_result = collection.find(
        {
            'workspaceId': workspace_id,
            'relevant': True,
            'deleted': {'$exists': False}
        },
        {'_id': 0, 'url': 1})
    relevant_urls = []
    for url_doc in relevant_urls_result:
        if 'url' in url_doc:
            relevant_urls.append(url_doc['url'])

    return relevant_urls
Example #24
def get_smart_crawler_results(workspace_id, page_size, input_search_query):

    and_conditions = []
    and_conditions.append({'workspaceId': workspace_id})
    and_conditions.append({'deleted': {'$exists': False}})
    and_conditions.append({'crawlEntityType': 'DD'})
    #    and_conditions.append({'crawlType': 'SMARTCRAWL'})

    if input_search_query.get('job_id') is not None:
        job_search_object = {'jobId': input_search_query["job_id"]}
        and_conditions.append(job_search_object)

    # max_id restricts the results to the current set of returned results so
    # that pagination by score is not disturbed
    if input_search_query.get('last_id') is not None:
        last_id_search_object = {
            "_id": {"$gt": ObjectId(input_search_query['last_id'])}
        }
        and_conditions.append(last_id_search_object)

    if "search_text" in input_search_query:
        search_text = input_search_query['search_text']
        url_search_condition = {'url': {'$regex': search_text}}
        host_search_condition = {'host': {'$regex': search_text}}
        and_conditions.append(
            {'$or': [url_search_condition, host_search_condition]})

    # and_source_conditions = {'$and': and_conditions}

    collection = Singleton.getInstance(
    ).mongo_instance.get_broad_crawler_collection()
    query = {'$and': and_conditions}
    cursor = collection.find(query) \
            .sort('_id', pymongo.ASCENDING) \
            .limit(page_size)

    docs = list(cursor)
    return docs
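
# Polling sketch (illustrative, not part of the original module): because the
# query above only returns documents with _id greater than last_id, a caller
# can fetch smart-crawl results incrementally by remembering the last _id seen.
# time and logging are assumed to be imported by this module.
def example_poll_new_results(workspace_id, job_id, page_size=20):
    last_id = None
    while True:
        docs = get_smart_crawler_results(workspace_id, page_size,
                                         {'job_id': job_id,
                                          'last_id': last_id})
        for doc in docs:
            logging.info(doc)  # replace with real handling of each new result
        if docs:
            last_id = str(docs[-1]['_id'])
        time.sleep(5)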
def dao_aggregate_broadcrawl_results(workspace_id):

    source_search_conditions = []

    workspace_search_object = {'workspaceId': workspace_id}
    source_search_conditions.append(workspace_search_object)

    delete_search_object = {'deleted': {'$exists': False}}
    source_search_conditions.append(delete_search_object)

    source_search_object = {'$and': source_search_conditions}

    collection = Singleton.getInstance(
    ).mongo_instance.get_broad_crawler_collection()

    try:
        res = collection.aggregate([
            # '$group': {'_id': '$crawlEntityType', "count": {"$sum": 1}}
            {'$match': source_search_object},
            {'$group': {
                '_id': {'crawlEntityType': '$crawlEntityType'},
                "count": {"$sum": 1}
            }}
        ])
    except Exception as e:
        print e
        raise

    return res["result"]
    def __init__(self):

        app_instance = str(uuid.uuid1())
        app_instance = 'app-instance'  #str(uuid.uuid1())
        instance = Singleton.getInstance()
        instance.app_instance = app_instance
        instance.mongo_instance = MongoInstance()

        search_query = {}
        search_text = 'abellan'
        print ''
        print 'searching: ' + search_text
        search_query['search_text'] = search_text
        search_results = get_search_results(search_query)
        print(search_results)
        print 'searching complete for: ' + search_text

        # in host
        print ''
        search_text = 'Guingueta'
        print 'searching: ' + search_text
        search_query['search_text'] = search_text
        search_results = get_search_results(search_query)
        print(search_results)
        print 'searching complete for: ' + search_text

        # in words
        print ''
        search_text = 'restaurantes'
        print 'searching: ' + search_text
        search_query['search_text'] = search_text
        search_results = get_search_results(search_query)
        print(search_results)
        print 'searching complete for: ' + search_text

        print 'done!'
Example #27
def __get_page_model(workspace_id):
    encoded_model = Singleton.getInstance().es_client.get_modeler_model(workspace_id)
    return encoded_model
Example #28
def __queue_smart_crawler_stop(workspace_id, job_id):

    message = {'id': job_id, 'stop': True}
    logging.info(message)
    Singleton.getInstance().broker_service.add_message_to_dd_crawler_input(
        message)
def count_service(workspace_id):
    collection = Singleton.getInstance(
    ).mongo_instance.get_broad_crawler_collection()
    return collection.find({"workspaceId": workspace_id}).count()
def pin_service(workspace_id, id, is_pinned):
    logging.info("PINNING: %s" % id + " as " + str(is_pinned))
    collection = Singleton.getInstance(
    ).mongo_instance.get_broad_crawler_collection()
    update_object = {"pinned": is_pinned}
    collection.update({"_id": ObjectId(id)}, {'$set': update_object}, upsert=True)