def dao_aggregate_urls(workspace_id):
    # Count seed URLs per (crawlEntityType, relevant) pair for a workspace,
    # skipping documents flagged as deleted.
    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    source_search_object = {'$and': [
        {'workspaceId': workspace_id},
        {'deleted': {'$exists': False}},
    ]}
    try:
        res = collection.aggregate([
            {'$match': source_search_object},
            {'$group': {
                '_id': {
                    'crawlEntityType': '$crawlEntityType',
                    'relevant': '$relevant'
                },
                'count': {'$sum': 1}
            }}
        ])
        return res['result']
    except Exception as e:
        print(e)
        return []
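# A minimal sketch of consuming dao_aggregate_urls (the workspace id and the
# result shape shown here are illustrative assumptions, not taken from real
# data): each group row carries the compound _id emitted by the $group stage
# plus its count.
def _example_counts_by_source(workspace_id='0123456789abcdef01234567'):
    counts = {}
    for row in dao_aggregate_urls(workspace_id):
        source = row['_id'].get('crawlEntityType')
        relevant = row['_id'].get('relevant')  # missing when never labeled
        counts[(source, relevant)] = row['count']
    return counts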
def unlabel(url_id, user_defined_category):
    # Remove a user-defined category label from a seed URL document.
    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    operation = {'$pull': {'userDefinedCategories': user_defined_category}}
    collection.update({'_id': ObjectId(url_id)}, operation)
def dao_get_workspace_by_id(workspace_id):
    return Singleton.getInstance().mongo_instance.workspace_collection \
        .find_one({'_id': ObjectId(workspace_id)})
def get_seeds_urls_to_label_dao(workspace_id, page_size, sources, relevances,
                                categories, keyword_source_type, last_id,
                                last_source):
    and_condition_list = []

    # sources
    if len(sources) > 0:
        source_search_conditions = []
        for source in sources:
            if source == "searchengine":
                source_search_conditions.append({'crawlEntityType': 'BING'})
                source_search_conditions.append({'crawlEntityType': 'GOOGLE'})
            elif source == "tor":
                source_search_conditions.append({'crawlEntityType': 'TOR'})
            elif source == "imported":
                source_search_conditions.append({'crawlEntityType': 'MANUAL'})
            elif source == "deepdeep":
                source_search_conditions.append({'crawlEntityType': 'DD'})
            else:
                print("no valid source was provided: " + source)
        and_condition_list.append({'$or': source_search_conditions})

    # relevances
    if len(relevances) > 0:
        relevances_search_conditions = []
        for relevance in relevances:
            if relevance == "unset":
                relevances_search_conditions.append(
                    {'relevant': {'$exists': False}})
            else:
                relevances_search_conditions.append({'$and': [
                    {'relevant': relevance},
                    {'relevant': {'$exists': True}},
                ]})
        and_condition_list.append({'$or': relevances_search_conditions})

    # page types
    if len(categories) > 0:
        categories_search_conditions = [
            {'categories': category} for category in categories]
        and_condition_list.append({'$or': categories_search_conditions})

    # keyset pagination: either a larger _id from the same source,
    # or any document from another source
    if last_id is not None and last_source is not None:
        and_condition_list.append({'$or': [
            {'$and': [
                {'_id': {'$gt': ObjectId(last_id)}},
                {'crawlEntityType': last_source},
            ]},
            {'crawlEntityType': {'$ne': last_source}},
        ]})

    and_condition_list.append({'deleted': {'$exists': False}})
    and_condition_list.append({'workspaceId': workspace_id})

    sort_dict = OrderedDict()
    sort_dict['order'] = 1
    sort_dict['_id'] = 1

    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    res = collection.aggregate([
        {'$project': {
            '_id': 1, 'host': 1, 'crawlEntityType': 1, 'url': 1, 'title': 1,
            'relevant': 1, 'workspaceId': 1, 'deleted': 1,
            # rank sources for sorting: DD, TOR, GOOGLE, BING, everything else
            'order': {'$cond': {
                'if': {'$eq': ['$crawlEntityType', 'DD']}, 'then': 1,
                'else': {'$cond': {
                    'if': {'$eq': ['$crawlEntityType', 'TOR']}, 'then': 2,
                    'else': {'$cond': {
                        'if': {'$eq': ['$crawlEntityType', 'GOOGLE']}, 'then': 3,
                        'else': {'$cond': {
                            'if': {'$eq': ['$crawlEntityType', 'BING']},
                            'then': 4, 'else': 5
                        }}
                    }}
                }}
            }}
        }},
        {'$match': {'$and': and_condition_list}},
        {'$sort': sort_dict},
        {'$limit': page_size},
        {'$project': {
            '_id': 1, 'host': 1, 'crawlEntityType': 1, 'url': 1,
            'title': 1, 'relevant': 1, 'order': 1
        }},
    ])
    return list(res['result'])
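# A minimal sketch of requesting the follow-up page with the DAO above (the
# workspace id, page size and filter values are hypothetical). The caller
# hands back the _id and crawlEntityType of the last row of the previous
# page; the '$or' clause built above then resumes after that row within the
# same source while still admitting rows from other sources.
def _example_next_page(previous_page):
    last = previous_page[-1]
    return get_seeds_urls_to_label_dao(
        '0123456789abcdef01234567', 50,
        sources=['tor', 'deepdeep'], relevances=['unset'], categories=[],
        keyword_source_type=None,
        last_id=str(last['_id']), last_source=last['crawlEntityType'])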
def dao_delete(user_id):
    Singleton.getInstance().mongo_instance.get_user_collection().remove(
        {'_id': ObjectId(user_id)})
def get_tasks_by_job(job_id):
    # note: despite the plural name, this returns a single task document
    return Singleton.getInstance().mongo_instance.get_crawl_task_collection() \
        .find_one({'jobId': job_id})
def get_last_job_by_workspace_dao(workspace_id, crawl_type):
    # the server applies sort before limit, so this yields the newest job
    docs = Singleton.getInstance().mongo_instance.get_crawl_job_collection() \
        .find({'workspaceId': workspace_id, 'crawlType': crawl_type}) \
        .sort('_id', pymongo.DESCENDING) \
        .limit(1)
    return list(docs)
def cancel_job(job_id):
    collection = Singleton.getInstance().mongo_instance.get_crawl_job_collection()
    collection.update({'_id': ObjectId(job_id)},
                      {'$set': {'status': 'CANCELLED'}})
def dao_list_workspace():
    docs = Singleton.getInstance().mongo_instance.workspace_collection \
        .find({}).sort('created', pymongo.ASCENDING)
    return list(docs)
def __get_seeds_url_by_selection(workspace_id, selection):
    # selection maps an arbitrary key to one choice per keyword source:
    #   {'source': <keywordSourceType>, 'allSelected': bool,
    #    'selected': [ids], 'unselected': [ids]}
    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    or_sources_conditions = []
    for key, value in selection.items():
        and_source_conditions = [
            {'workspaceId': workspace_id},
            {'keywordSourceType': value['source']},
        ]
        if value['allSelected']:
            # everything except the explicitly unselected ids
            object_ids = [ObjectId(url_id) for url_id in value['unselected']]
            and_source_conditions.append({'_id': {'$nin': object_ids}})
        else:
            # only the explicitly selected ids
            object_ids = [ObjectId(url_id) for url_id in value['selected']]
            and_source_conditions.append({'_id': {'$in': object_ids}})
        or_sources_conditions.append({'$and': and_source_conditions})
    cursor = collection.find({'$or': or_sources_conditions},
                             {'_id': 0, 'url': 1})
    return [item['url'] for item in cursor]
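# A hypothetical selection payload for the helper above (keys and ids are
# made up for illustration): the FETCHED source keeps everything except one
# excluded row, while the MANUAL source contributes a single picked row.
_example_selection = {
    'fetched': {
        'source': 'FETCHED',
        'allSelected': True,
        'selected': [],
        'unselected': ['0123456789abcdef01234567'],
    },
    'manual': {
        'source': 'MANUAL',
        'allSelected': False,
        'selected': ['89abcdef0123456789abcdef'],
        'unselected': [],
    },
}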
def get_seeds_urls_url(workspace_id):
    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    res = collection.find({'workspaceId': workspace_id}, {'_id': 0, 'url': 1})
    return list(res)
def get_seeds_urls_by_workspace_dao(workspace_id):
    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    return list(collection.find({'workspaceId': workspace_id}))
def dao_delete_seed_url(workspace_id, url_id):
    # soft delete: flag the document rather than removing it, so the
    # 'deleted': {'$exists': False} filters elsewhere exclude it
    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    collection.update({'_id': ObjectId(url_id)}, {'$set': {'deleted': True}},
                      upsert=True)
def dao_update_relevanceByid(workspace_id, url_id, relevance):
    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    collection.update({'_id': ObjectId(url_id)},
                      {'$set': {'relevant': relevance}}, upsert=True)
def upsert(workspace_id, user_defined_category):
    # $addToSet keeps userDefinedCategories free of duplicates
    operation = {'$addToSet': {'userDefinedCategories': user_defined_category}}
    ws = Singleton.getInstance().mongo_instance.get_workspace_by_id(workspace_id)
    Singleton.getInstance().mongo_instance.workspace_collection.update(
        {'_id': ObjectId(ws['_id'])}, operation)
def dao_count_workspace():
    return Singleton.getInstance().mongo_instance.workspace_collection.count()
def delete(workspace_id, user_defined_category):
    ws = Singleton.getInstance().mongo_instance.get_workspace_by_id(workspace_id)
    operation = {'$pull': {'userDefinedCategories': user_defined_category}}
    Singleton.getInstance().mongo_instance.workspace_collection.update(
        {'_id': ObjectId(ws['_id'])}, operation)
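# A minimal sketch of the category lifecycle using upsert/delete above (the
# workspace id is hypothetical): $addToSet makes repeated adds idempotent,
# and $pull removes the value again.
def _example_category_roundtrip(workspace_id='0123456789abcdef01234567'):
    upsert(workspace_id, 'forum')
    upsert(workspace_id, 'forum')   # second add stores no duplicate
    delete(workspace_id, 'forum')   # label removed again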
if __name__ == "__main__": # usage: $ python runserver.py --logging-level=debug --logging-file=debug.log parser = optparse.OptionParser() parser.add_option('-l', '--logging-level', help='Logging level') parser.add_option('-f', '--logging-file', help='Logging file name') (options, args) = parser.parse_args() logging_level = LOGGING_LEVELS.get(options.logging_level, logging.INFO) logging.basicConfig(level=logging_level, filename=options.logging_file, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') app_instance = str(uuid.uuid1()) instance = Singleton.getInstance() instance.app_instance = app_instance instance.mongo_instance = MongoInstance(app.config['MONGO_HOST_NAME'], app.config['MONGO_HOST_PORT']) instance.broker_service = BrokerService(app_instance, app.config['KAFKA_HOST_NAME'], app.config['KAFKA_HOST_PORT']) instance.es_client = ElasticsearchClient(app) instance.broker_service.init_subscribers() # Create database connection object app.config['MONGODB_HOST'] = app.config['MONGO_HOST_NAME'] app.config['MONGODB_PORT'] = app.config['MONGO_HOST_PORT'] app.config['MONGODB_DB'] = 'MemexHack' db = MongoEngine(app)
def get_jobs_by_workspace_dao(workspace_id):
    docs = Singleton.getInstance().mongo_instance.get_crawl_job_collection() \
        .find({'workspaceId': workspace_id}) \
        .sort('_id', pymongo.DESCENDING)
    return list(docs)
def get_job_dao(job_id):
    # lookup by the Mongo _id
    return Singleton.getInstance().mongo_instance.get_crawl_job_collection() \
        .find_one({'_id': ObjectId(job_id)})

def get_job_by_job_id_dao(job_id):
    # lookup by the jobId field; named distinctly from get_job_dao above so
    # the second definition no longer silently replaces the first
    return Singleton.getInstance().mongo_instance.get_crawl_job_collection() \
        .find_one({'jobId': job_id})
def dao_count_jobs(input_search_query):
    collection = Singleton.getInstance().mongo_instance.get_crawl_job_collection()
    query = {'workspaceId': input_search_query['workspace_id']}
    return collection.find(query).count()
def dao_get_all():
    docs = Singleton.getInstance().mongo_instance.get_user_collection() \
        .find().sort('email', pymongo.ASCENDING)
    return list(docs)
def dao_get_blur_level():
    # default to 0 when no workspace is selected or no blur level is set
    ws = Singleton.getInstance().mongo_instance.get_current_workspace()
    if ws is None or ws.get('blur_level') is None:
        return 0
    return ws['blur_level']
def dao_get_roles_all():
    docs = Singleton.getInstance().mongo_instance.get_role_collection() \
        .find().sort('name', pymongo.ASCENDING)
    return list(docs)
def get_seeds_urls_all_labeled_dao(workspace_id, page_size, sources,
                                   relevances, last_id):
    and_condition_list = []

    # sources
    if len(sources) > 0:
        source_search_conditions = []
        for source in sources:
            if source == "searchengine":
                source_search_conditions.append({'crawlEntityType': 'BING'})
                source_search_conditions.append({'crawlEntityType': 'GOOGLE'})
            elif source == "tor":
                source_search_conditions.append({'crawlEntityType': 'TOR'})
            elif source == "imported":
                source_search_conditions.append({'crawlEntityType': 'MANUAL'})
            elif source == "deepdeep":
                source_search_conditions.append({'crawlEntityType': 'DD'})
            else:
                print("no valid source was provided: " + source)
        and_condition_list.append({'$or': source_search_conditions})

    # relevances
    if len(relevances) > 0:
        relevances_search_conditions = []
        for relevance in relevances:
            if relevance == "unset":
                relevances_search_conditions.append(
                    {'relevant': {'$exists': False}})
            else:
                relevances_search_conditions.append({'$and': [
                    {'relevant': relevance},
                    {'relevant': {'$exists': True}},
                ]})
        and_condition_list.append({'$or': relevances_search_conditions})

    # keyset pagination on _id
    if last_id:
        and_condition_list.append({'_id': {'$gt': ObjectId(last_id)}})

    and_condition_list.append({'deleted': {'$exists': False}})
    and_condition_list.append({'workspaceId': workspace_id})

    collection = Singleton.getInstance().mongo_instance.get_seed_urls_collection()
    res = collection.find({'$and': and_condition_list}) \
        .sort('_id', pymongo.ASCENDING) \
        .limit(page_size)
    return list(res)
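# A minimal sketch of walking every labeled page with the DAO above (the
# workspace id and page size are hypothetical). Because the query sorts on
# _id alone, feeding the last _id of each page back in resumes cleanly.
def _example_iter_all_labeled(workspace_id='0123456789abcdef01234567'):
    last_id = None
    while True:
        docs = get_seeds_urls_all_labeled_dao(
            workspace_id, 100, sources=[], relevances=[], last_id=last_id)
        if not docs:
            break
        for doc in docs:
            yield doc
        last_id = str(docs[-1]['_id'])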