from database import DBObject
# NOTE: d_range is a project helper defined elsewhere in this repo; it expands
# a date-range string into (month, year) pairs used by the date filters below.


def searchPostHtml(request: dict):
    # Per-site URL patterns for each supported post type.
    site_type_re = {
        "nha.chotot.com": {
            "land": r"^.*/mua-ban-dat/.*$",
            "house": r"^.*/mua-ban-nha-dat/.*$",
            "apartment": r"^.*/mua-ban-can-ho-chung-cu/.*$"
        },
        "nhadat247.com.vn": {
            "land": r"^.*nhadat247\.com\.vn/ban-dat.*$",
            "apartment": r"^.*nhadat247\.com\.vn/ban-can-ho-chung-cu.*$",
            "house": r"^.*nhadat247\.com\.vn/ban-nha.*$"
        },
        "batdongsan.com.vn": {
            "land": r"^.*batdongsan\.com\.vn/ban-dat.*$",
            "apartment": r"^.*batdongsan\.com\.vn/ban-can-ho-chung-cu.*$",
            "house": r"^.*batdongsan\.com\.vn/ban-nha.*$"
        }
    }
    try:
        db = DBObject()
        _site = request.get("site")
        _crawl_date = request.get("crawl_date")
        _post_date = request.get("post_date")
        _type = request.get("type", "all")
        _limit = int(request["limit"]) if request.get("limit") else 0

        list_filter = []
        if _site in site_type_re:
            list_filter.append({"url": {"$regex": "^https://%s/.*$" % _site}})
            if _type in site_type_re[_site]:
                list_filter.append(
                    {"url": {"$regex": site_type_re[_site][_type]}})
            else:
                # "all" (or an unknown type): match any of the site's types.
                list_filter.append({
                    "$or": [{"url": {"$regex": site_type_re[_site][_t]}}
                            for _t in site_type_re[_site]]
                })

        # Crawl-date filter: any day ("[0-9]{2}") within each (month, year)
        # pair returned by d_range.
        _d_range = d_range(_crawl_date)
        if len(_d_range) > 0:
            list_filter.append({
                "$or": [{"date": {"$regex": "^[0-9]{2}/%s/%s$" % (m, y)}}
                        for m, y in _d_range]
            })

        # Same filter applied to the post date.
        _d_range = d_range(_post_date)
        if len(_d_range) > 0:
            list_filter.append({
                "$or": [{"post_date": {"$regex": "^[0-9]{2}/%s/%s$" % (m, y)}}
                        for m, y in _d_range]
            })

        # Guard against an empty $and, which MongoDB rejects.
        query = {"$and": list_filter} if list_filter else {}
        query_return = []
        for post in db.query_html_db(query_dict=query, limit=_limit):
            # Replace the raw HTML with a placeholder and drop the Mongo id
            # so the response stays small and JSON-serializable.
            post.pop("_id", None)
            post["html"] = "content is eliminated"
            query_return.append(post)
        return {"code": 200, "message": "successful", "content": query_return}
    except Exception:
        # traceback.print_exc()
        return {"code": 404, "message": "failed", "content": []}
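
# Usage sketch (assumption: searchPostHtml is called by the API layer with a
# plain dict of query parameters; the crawl_date value below only illustrates
# a range string for d_range, whose exact input format is defined elsewhere).
if __name__ == "__main__":
    demo_request = {
        "site": "batdongsan.com.vn",
        "type": "apartment",            # land / house / apartment, or "all"
        "crawl_date": "01/2021-03/2021",
        "limit": "50",
    }
    result = searchPostHtml(demo_request)
    print(result["code"], result["message"], len(result["content"]))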
import re
import traceback
from itertools import chain
from time import time
from datetime import date

import pandas as pd

from database import DBObject

db = DBObject()


def strip_text(text):
    # Collapse tabs and newlines, then trim surrounding whitespace.
    return text.replace("\t", "").replace("\n", "").strip()


def stringify_children(node):
    # Recursively concatenate a node's text, its children's text (appending a
    # newline after each <div>), and its tail; None parts are filtered out.
    parts = ([node.text] +
             list(chain(*((stringify_children(c) +
                           ("\n" if str(c.tag) == "div" else ""))
                          for c in node.getchildren()))) +
             [node.tail])
    return ''.join(filter(None, parts))


def clean_trash(html):
    # Assumed minimal implementation (the original body is not shown in this
    # excerpt): strip <script>/<style> blocks and HTML comments from raw HTML.
    html = re.sub(r"<(script|style)[\s\S]*?</\1\s*>", "", html)
    return re.sub(r"<!--[\s\S]*?-->", "", html)
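
# Usage sketch for stringify_children/strip_text. Assumption: nodes come from
# lxml (which provides the .text, .tail, .tag and .getchildren() members used
# above) and lxml is installed; the HTML fragment is illustrative only.
if __name__ == "__main__":
    import lxml.html
    node = lxml.html.fromstring(
        "<div>Gia ban: 2 ty <div>Dien tich: 80 m2</div> lien he ngay</div>")
    print(strip_text(stringify_children(node)))
    # -> Gia ban: 2 ty Dien tich: 80 m2 lien he ngay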
import time
import hashlib
from datetime import datetime, date

import pandas as pd

from ParserObject import ParserObject
from ParserModelSelector import ParserModelSelector
from LibFunc import clean_trash
from database import DBObject
from Settings import Settings

# =============================================================================
# =============================================================================

database = DBObject()


def parse(posts_data,
          site=None,
          type=None,
          num=None,
          many: bool = False,
          model_name=None,
          resume=False):
    print("Go to Parsing Data")
    the_status = "parsing"
    __failed_urls = []
    __saved_post = []
    # Use the start timestamp as the task id.
    task_id = int(time.time())
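
# Usage sketch (assumption): parse() is normally launched by worker.py; the
# call below fetches a few raw posts through the same query_html_db API used
# by the search endpoint and hands them to the parser. The shape of
# posts_data (a list of raw post documents) is an assumption.
if __name__ == "__main__":
    sample_posts = list(database.query_html_db(
        query_dict={"url": {"$regex": r"^https://nha\.chotot\.com/.*$"}},
        limit=5))
    parse(sample_posts, site="nha.chotot.com", type="apartment",
          model_name="auto", many=True)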
def __init__(self,
             date_from=None,
             date_to=None,
             post_type=None,
             all_date: bool = False,
             resume=False,
             limit=-1):
    self.limit = int(limit)
    self.db_object = DBObject()
    the_status = "crawling"
    worker_info = self.db_object.query_wokers_info(Settings.worker_id)
    self.resume = resume
    if self.resume:
        # Resume a paused crawling task: restore its parameters from the
        # worker info saved in the database.
        try:
            info_ = worker_info
            status_ = info_["status"]
            task_id = info_["task_id"]
            info_str_ = info_["str_info"]
            if not ("(pause)" in status_ and "crawling" in status_):
                print(">>", status_)
                return
            info_dict_ = {
                _i_.split(": ")[0]: _i_.split(": ")[1]
                for _i_ in info_str_.lower().split(", ")
            }
            if info_dict_["site"] != "nhadat247.com.vn":
                return
            date_from = info_dict_["date"].split("-")[0]
            date_to = info_dict_["date"].split("-")[1]
            try:
                self.limit = int(info_dict_["limit"])
            except ValueError:
                self.limit = -1
            post_type = info_dict_["type"]
            the_status = status_.replace("(pause)", "")
            print("Internal loading data to resume")
        except Exception:
            traceback.print_exc()
            return

    self.__str_info = ("Site: nhadat247.com.vn, Type: %s, Date: %s-%s, "
                       "Limit: %s, ") % (
                           post_type, date_from, date_to,
                           str(self.limit) if isinstance(self.limit, int)
                           and self.limit > 0 else "No")
    self.__str_info += "Numpost: %d, Error: %d"
    self.post_type = post_type
    self.buffer = []
    self.seed_url = NhaDat247.get_seed_url(post_type)
    self.__current_url = ""
    self.__failed_urls = []
    self.__saved_post = []
    self.file_log_visited_url = ("visited_post_log_nhadat247_%s.txt" %
                                 self.post_type)
    self.file_log_new_url = "local_urls_log_nhadat247_%s.txt" % self.post_type
    # URL shapes: listing pages vs. single post pages (...pr<id>.html).
    self.regex_sub_url = re.compile(
        r"([a-z][-a-z]*)?ban-[-a-z]+((\.html)|(/[0-9]+))?")
    self.regex_post = re.compile(
        r"([a-z][-a-z]*)?ban-[-a-z0-9]+/[-a-z0-9]+pr[0-9]+\.html")
    self.key_type = NhaDat247.get_key_from_type(self.post_type)
    try:
        # date_from/date_to are "mm/yyyy"; extend date_to to the last day of
        # its month (calendar.monthrange returns (weekday, number_of_days)).
        last_day_to = calendar.monthrange(int(date_to.split("/")[1]),
                                          int(date_to.split("/")[0]))[1]
        self.post_date_range = {
            "from": datetime.strptime("1/" + date_from, '%d/%m/%Y').date(),
            "to": datetime.strptime(str(last_day_to) + "/" + date_to,
                                    '%d/%m/%Y').date()
        }
        print("-" * 200, "\n", self.post_date_range)
    except Exception:
        traceback.print_exc()
        self.post_date_range = None

    self.browser = Browser(headless=False)
    if not self.resume:
        task_id = int(time.time())
    self.__crawling_info = {
        "task_id": task_id,
        "status": the_status,
        "str_info": ""
    }
    self.__crawling_log = {
        "worker_id": Settings.worker_id,
        "task_id": task_id,
        "task_info": self.__str_info % (0, 0),
        "saved_posts": [],
        "error_posts": []
    }
    if not self.resume:
        print("Create log")
        self.db_object.create_wokers_log(self.__crawling_log)
        self.update_crawling_status_info(0, 0)
    else:
        log = self.db_object.query_wokers_logs(Settings.worker_id, task_id)
        print("Get log: ", log if log else "null")
        if log is not None:
            self.__saved_post = log["saved_posts"]
            self.__failed_urls = log["error_posts"]
    print("Init crawler")
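
# Usage sketch (assumption: this __init__ belongs to the NhaDat247 crawler
# class referenced via NhaDat247.get_seed_url above; dates use the mm/yyyy
# form that __init__ parses, and the start method name is hypothetical).
if __name__ == "__main__":
    crawler = NhaDat247(date_from="01/2021", date_to="03/2021",
                        post_type="house", limit=100)
    # crawler.run()  # hypothetical entry point; not shown in this excerpt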
def callback(ch, method, properties, body):
    command = "nothing"
    try:
        body = body.decode('ascii')
        message = message_loads(body)
        command = message["command"]
        if command == "crawl":
            # Only start a new worker process if none is already running
            # (data.lock holds the pid of the current worker).
            pid = int(open("data.lock", "r").read())
            if not psutil.pid_exists(pid):
                Popen(['python', 'worker.py', body])
            else:
                command = "is running"
        elif command == "parse":
            pid = int(open("data.lock", "r").read())
            if not psutil.pid_exists(pid):
                # Hand the post list to the worker through a data file.
                file = open("parse_posts.data", "w")
                file.write(message["posts"])
                file.close()
                model = message.get("model", "auto")
                type = message.get("type", "all")
                site = message.get("site", "all")
                Popen([
                    'python', 'worker.py',
                    "command:parse site:%s type:%s model:%s" %
                    (site, type, model)
                ])
            else:
                command = "is running"
        elif command == "stop":
            # Cancel the task in the DB, then kill the worker and the
            # Chrome/chromedriver processes it spawned.
            db = DBObject()
            db.cancel_task(Settings.worker_id)
            try:
                pid = int(open("data.lock", "r").read())
                os.kill(pid, signal.SIGTERM)
            except OSError:
                pass
            subprocess.call("TASKKILL /f /IM CHROMEDRIVER.EXE")
            subprocess.call("TASKKILL /f /IM CHROME.EXE")
        elif command == "pause":
            db = DBObject()
            pid = int(open("data.lock", "r").read())
            _working, _as = db.workAs(Settings.worker_id)
            if _working:
                # A task is running: pause it and stop its processes.
                db.pause_task(Settings.worker_id)
                try:
                    os.kill(pid, signal.SIGTERM)
                except OSError:
                    pass
                subprocess.call("TASKKILL /f /IM CHROME.EXE")
                subprocess.call("TASKKILL /f /IM CHROMEDRIVER.EXE")
            else:
                # No task running: "pause" acts as resume for the paused task.
                if not psutil.pid_exists(pid):
                    Popen(['python', 'worker.py',
                           "command:%s resume:1" % _as])
                else:
                    command = "is running"
        elif command == "shield":
            # Toggle: an explicit shield=1 enables it; otherwise flip state.
            shield_on = (("shield" in message and int(message["shield"]) == 1)
                         or not Settings.isShieldEnable())
            Settings.enableShield(shield_on)
        else:
            command = "nothing"
    except Exception:
        traceback.print_exc()
    print(" [x] Received \n -> Do %s" % command)
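
# Wiring sketch (assumption): callback() matches pika's on_message_callback
# signature (ch, method, properties, body), so it can be attached to a
# blocking consumer like this; the queue name and host are illustrative,
# not taken from the source.
if __name__ == "__main__":
    import pika
    connection = pika.BlockingConnection(
        pika.ConnectionParameters("localhost"))
    channel = connection.channel()
    channel.queue_declare(queue="worker_commands")
    channel.basic_consume(queue="worker_commands",
                          on_message_callback=callback,
                          auto_ack=True)
    print(" [*] Waiting for commands. To exit press CTRL+C")
    channel.start_consuming()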