Пример #1
0
def execute_one() -> None:
    """Claim a single pending import job and run it.

    Sleeps and returns when no job is available or when the job cannot be
    locked. If the download or the import raises, the job's lock timestamp
    is reset to 0 and saved. Any other error is logged, not propagated.
    """
    try:
        os.makedirs("./tmp/", exist_ok=True)

        helper = DBHelper()
        pending_id = helper.get_import_job()
        if pending_id is None:
            print("no job, wait")
            time.sleep(const.NO_JOB_WAIT)
            return

        try:
            locked_doc = helper.lock_import_job(pending_id)
        except Exception as lock_err:
            log("unable to lock, skip: ", lock_err)
            # Random backoff of 10-400 ms.
            time.sleep(0.01 * random.randint(1, 40))
        else:
            try:
                download(locked_doc)
                add_to_db(locked_doc, helper)
            except Exception as run_err:
                log("execution error: ", run_err)
                # Release the lock.
                locked_doc["lock_timestamp"] = 0
                locked_doc.save()
    except Exception as outer_err:
        log("unknown error: ", outer_err)
    finally:
        # Only a log line is emitted here; this function deletes nothing.
        log("clean up tmp folder")
Пример #2
0
def get_xxx_detail(text, table_name):
    """Parse the menu list out of an HTML page and save each menu item.

    The direct children of the ``div.rstdtl-menu-lst`` element are walked in
    order. A child whose first class is ``rstdtl-menu-lst__heading`` sets the
    menu type for the items that follow it; every other child is saved via
    ``save_data`` with its type, name, price, description and, when it has an
    image, the value returned by ``down_file``.

    Args:
        text: HTML source of the page.
        table_name: Table name passed through to ``save_data``.
    """
    dbHelp = DBHelper(user_config.db_host, user_config.db_port,
                      user_config.db_user, user_config.db_password,
                      user_config.db_database)
    # try/finally so the connection is closed even if parsing or saving fails.
    try:
        soup = BeautifulSoup(text, "html.parser")
        # With attrs given, find() matches tags that carry that attribute.
        menu_lst_tag = soup.find(name="div",
                                 attrs={"class": "rstdtl-menu-lst"})
        if menu_lst_tag is None:
            # No menu list on this page: nothing to save.
            return

        menu_type = ""
        # Only the direct children are needed.
        for child in menu_lst_tag.findChildren(recursive=False):
            # get("class") is None for a tag without a class attribute.
            classes = child.get("class") or []
            if classes and classes[0] == "rstdtl-menu-lst__heading":
                menu_type = my_util.getTagText(child)
                continue

            info = {}
            info["type"] = menu_type
            info["name"] = my_util.getTagText(
                child.find(name="p",
                           attrs={"class": "rstdtl-menu-lst__menu-title"}))
            img_tag = child.find(name="img")
            img_href = img_tag.get("src") if img_tag else ""
            if img_href != "":
                # The file name is everything after the last "/".
                info["img"] = down_file(img_href,
                                        img_href[img_href.rfind("/") + 1:])
            info["price"] = my_util.getTagText(
                child.find(name="p",
                           attrs={"class": "rstdtl-menu-lst__price"}))
            info["description"] = my_util.getTagText(
                child.find(name="p",
                           attrs={"class": "rstdtl-menu-lst__ex"}))
            save_data(table_name, info, dbHelp)
    finally:
        dbHelp.closeDB()
Пример #3
0
        def get_host_ip(user_id, project_name, service_name):
            """Look up the host IP stored for a service of a user's project.

            The project id is selected from ``projects`` by project name and
            user id, then used to select ``IP`` from ``services``.

            Args:
                user_id: Value matched against ``projects.userID``.
                project_name: Value matched against ``projects.name``.
                service_name: Value matched against ``services.name``.

            Returns:
                Whatever ``Db.exec_one`` returns for the second query.
            """
            # The WHERE clause opens no parenthesis, so it must not close one.
            prj_id = Db.exec_one("select id from projects "
                                 "where name='%s' and userID='%s'",
                                 (project_name, user_id))

            return Db.exec_one("select IP from services "
                               "where name='%s' and projectID='%s'",
                               (service_name, prj_id))
Пример #4
0
        def delete_all_services(user_name, project_name):
            """Delete every ``services`` row that belongs to a user's project.

            Raises:
                Exception: If no project matches the user and project name.
            """
            find_project = ("select id from projects "
                            "where name='%s' and userID = (select id from user where name = '%s')")
            project_id = Db.exec_one(find_project, (project_name, user_name))
            if project_id is not None:
                Db.exec_cmd("delete from services where projectID = '%s'", project_id)
                return
            raise Exception("Project does not exist for %s, %s" % (user_name, project_name))
Пример #5
0
        def get_list(user_id, project_name):
            """Return the names of the services of a project.

            NOTE(review): ``user_id`` is compared with ``user.name`` in the
            query - confirm callers really pass a user name here.

            Raises:
                Exception: If no project matches the user and project name.
            """
            find_project = ("select id from projects "
                            "where name = '%s' and userID=(select id from user where name='%s')")
            project_id = Db.exec_one(find_project, (project_name, user_id))

            if project_id is not None:
                return Db.exec_list("select name from services where projectID='%s'", project_id)

            raise Exception("Project does not exist for %s, %s" % (user_id, project_name))
Пример #6
0
 def get_list(user_id, start_index, count):
     """
     Return the projects (name, url) of one user, for pagination.
     If count <= 0, start_index is ignored and the full list is returned.
     """
     if count > 0:
         # One page: "limit start_index,count".
         return Db.exec_list("select name, url from projects "
                             "where userID = '%s' limit %s,%s",
                             (user_id, start_index, count))

     return Db.exec_list("select name, url from projects "
                         "where userID = '%s'",
                         user_id)
Пример #7
0
def build_event_sm(events):
    """Recreate the database tables and replay events through a new EventSM.

    Args:
        events: Iterable of events; each one is passed to ``EventSM.digest``.

    Returns:
        The ``EventSM`` instance after every event has been digested.

    Raises:
        Exception: Whatever ``digest`` raises. The 1-based index of the
            offending event is printed before the exception propagates.
    """
    d = DBHelper()
    d.recreate_tables()
    sm = EventSM()
    for event_i, event in enumerate(events, 1):
        try:
            sm.digest(event)
        except Exception as e:
            # Parenthesised single-argument print works on Python 2 and 3.
            print("Exception in event #%d: %s" % (event_i, str(e)))
            # A bare raise keeps the original traceback; on Python 2
            # ``raise e`` would restart it at this line.
            raise
    return sm
Пример #8
0
def get_party_detail(links):
    """Scrape the course page behind each link and save it to STORE_PARTY.

    For every link the page is fetched with ``get_html``. Links for which it
    returns "EOF" or "ERR" are reported and skipped. From the other pages the
    title, image, description and selected rows of the data table are
    collected into one dict and saved via ``save_data``.

    Args:
        links: Iterable of page URLs.
    """
    # Table header text -> key under which the cell text is stored.
    info_map = {
        "コース料金": "price",
        "品数": "num",
        "滞在可能時間": "free_time",
        "コース内容": "content"
    }

    dbHelp = DBHelper(user_config.db_host, user_config.db_port,
                      user_config.db_user, user_config.db_password,
                      user_config.db_database)
    # try/finally so the connection is closed even if one page fails.
    try:
        for link in links:
            info = {}
            info["link"] = link
            text = get_html(link)
            if text in ("EOF", "ERR"):
                print("获取失败:" + link)
                continue

            soup = BeautifulSoup(text, "html.parser")
            # With attrs given, find() matches tags that carry that attribute.
            title_tag = soup.find(name="h3",
                                  attrs={"class": "course-dtl__course-title"})
            info["name"] = my_util.getTagText(title_tag)

            img_div_tag = soup.find(name="div",
                                    attrs={"class": "course-dtl__img"})
            img_tag = img_div_tag.find(name="img") if img_div_tag else None
            img_href = img_tag.get("src") if img_tag else ""
            if img_href != "":
                # The file name is everything after the last "/".
                info["img"] = down_file(img_href,
                                        img_href[img_href.rfind("/") + 1:])

            desc_tag = soup.find(name="div",
                                 attrs={"class": "course-dtl__desc"})
            info["description"] = my_util.getTagText(desc_tag)

            table_tag = soup.find(
                name="table",
                attrs={"class": "c-table c-table--form course-dtl__data-table"})
            if table_tag:
                for tr in table_tag.select("tbody tr"):
                    th_text = my_util.getTagText(tr.find(name="th"))
                    if th_text in info_map:
                        info[info_map.get(th_text)] = my_util.getTagText(
                            tr.find(name="td"))

            save_data("STORE_PARTY", info, dbHelp)
    finally:
        dbHelp.closeDB()
Пример #9
0
 def __init__(self):
     """Create the DB helper, set up counters and callbacks, and reset the alone/away fields."""
     self.db_helper = DBHelper()
     self._init_counters()
     self._init_callbacks()
     self._where_away = None
     self._alone_time = self._away_ts = 0
Пример #10
0
def get_project_by_id(id):
    """Return the first ``projects`` row with the given id, or None if there is none.

    NOTE: ``id`` is interpolated directly into the SQL text.
    """
    query = u"SELECT * FROM projects WHERE id=%s;" % (id)
    rows = DBHelper().fetch(query)
    return rows[0] if len(rows) else None
Пример #11
0
def get_requirements_by_date(project_id, base_date):
    """Fetch the requirements of a project whose ``added`` value is >= base_date.

    Both arguments are interpolated directly into the SQL text.
    """
    sql_template = (u" SELECT 	    r.*"
                    u" FROM 		requirements r"
                    u" INNER JOIN   projects p ON p.id = r.project_id"
                    u" WHERE	    p.id = %s"
                    u" AND 		    r.added >= '%s'")
    return DBHelper().fetch(sql_template % (project_id, base_date))
Пример #12
0
def insert_recommendation(project_id, risk_id, base_date, distance, sample,
                          steps, type):
    """Insert one row into ``recommendations`` built from the given values.

    All values are interpolated directly into the SQL text; ``base_date``
    and ``type`` are wrapped in single quotes.
    """
    values = (project_id, risk_id, base_date, distance, sample, steps, type)
    statement = (
        u" INSERT INTO recommendations"
        u"             (project_id, risk_id, base_date, distance, sample, steps, type)"
        u" VALUES      (%s, %s, '%s', %s, %s, %s, '%s');")
    DBHelper().execute(statement % values)
Пример #13
0
def get_risk_by_id(id):
    """Return the first ``risks`` row with the given id, or None if there is none.

    NOTE: ``id`` is interpolated directly into the SQL text.
    """
    query = u"SELECT * FROM risks WHERE id=%s;" % (id)
    rows = DBHelper().fetch(query)
    return rows[0] if len(rows) else None
Пример #14
0
def add_import_job(start_date_s: str, end_date_s: str) -> None:
    """Create one import-job document per day of an inclusive date range.

    A document is created in ``db.client["import_job"]`` for each day whose
    ISO date is not already present there; created days are printed.

    Args:
        start_date_s: First day, in ISO format (YYYY-MM-DD).
        end_date_s: Last day, in ISO format (YYYY-MM-DD).

    Raises:
        ValueError: If either argument is not a valid ISO date.
    """
    current = date.fromisoformat(start_date_s)
    last = date.fromisoformat(end_date_s)
    db = DBHelper()
    one_day = timedelta(days=1)

    while current <= last:
        # The curl template takes the same "year,month,day" text twice.
        day_text = "{},{},{}".format(current.year, current.month, current.day)
        curl_cmd = config.curl_command_template.format(day_text, day_text).strip()
        doc_id = current.isoformat()

        if doc_id not in db.client["import_job"]:
            print(current)
            db.client["import_job"].create_document({
                '_id': doc_id,
                'curl_cmd': curl_cmd,
                'finished': False,
                'lock_timestamp': 0,
                'work_node': None,
                'total_num': None,
                'import_num': None,
            })

        current += one_day
Пример #15
0
def get_risks_by_date(project_id, base_date):
    """Fetch the risks joined to a project by ``code`` whose ``added`` value is >= base_date.

    Both arguments are interpolated directly into the SQL text.
    """
    sql_template = (u" SELECT 	    r.*"
                    u" FROM 		risks r"
                    u" INNER JOIN   projects p ON p.code = r.code"
                    u" WHERE	    p.id = %s"
                    u" AND 		    r.added >= '%s'")
    return DBHelper().fetch(sql_template % (project_id, base_date))
def main(argv):
	"""Download a public profile for every requested name.

	When ``argv`` holds only one element the names come from
	``IrishNameParser().names``; otherwise ``argv[1:]`` is used. A running
	counter grows by ``NUM`` per download; whenever it exceeds 500 the
	function sleeps for 120 seconds and subtracts 500 from the counter.
	"""
	names = IrishNameParser().names if len(argv) == 1 else argv[1:]

	pending = 0
	downloader = PublicProfileDownloader()
	for name in names:
		downloader.download(name)
		pending += NUM
		if pending > 500:
			time.sleep(120)
			pending -= 500

	DBHelper.commitAndClose()
Пример #17
0
def get_requirement_by_id(id):
    """Return the first ``requirements`` row with the given id, or None if there is none.

    NOTE: ``id`` is interpolated directly into the SQL text.
    """
    query = u"SELECT * FROM requirements WHERE id=%s;" % (id)
    rows = DBHelper().fetch(query)
    return rows[0] if len(rows) else None
Пример #18
0
def delete_recommendations(distance, sample, steps, type):
    """Delete the recommendations that match the given distance, sample, steps and type.

    The three numeric columns are compared after both sides are cast to
    DECIMAL(5,3). All values are interpolated directly into the SQL text.
    """
    statement = (
        u" DELETE "
        u" FROM   recommendations "
        u" WHERE  CAST(distance AS DECIMAL(5,3))=CAST(%s AS DECIMAL(5,3))"
        u" AND    CAST(sample AS DECIMAL(5,3))=CAST(%s AS DECIMAL(5,3))"
        u" AND    CAST(steps AS DECIMAL(5,3))=CAST(%s AS DECIMAL(5,3))"
        u" AND    type='%s';")
    DBHelper().execute(statement % (distance, sample, steps, type))
Пример #19
0
def get_project_by_rand():
    """Return one ``projects`` row picked by ``ORDER BY RAND()``, or None if the table yields no row."""
    rows = DBHelper().fetch(
        u"SELECT * FROM projects ORDER BY RAND() LIMIT 1;")
    return rows[0] if len(rows) else None
Пример #20
0
def harvest_single_user(maintask: MainTask, api: tweepy.API,
                        doc: cloudant.document, db: DBHelper) -> bool:
    """Fetch a user's tweets newer than the last harvested one and store them.

    Pages of ``api.user_timeline`` are requested until an empty page comes
    back; each later request is capped with ``max_id`` just below the
    smallest id of the previous page. Every tweet is passed to
    ``db.add_tweet``. Afterwards ``last_harvest_tweet_id`` and
    ``last_harvest`` are updated on ``doc`` and the document is saved.

    Returns:
        False if the timeline call raised (``doc`` is then not updated),
        True otherwise.
    """
    newest_id = 0
    oldest_of_prev_page = None
    stored = 0

    while True:
        query = dict(user_id=doc["_id"],
                     since_id=int(doc["last_harvest_tweet_id"]) + 1,
                     include_rts="false")
        if oldest_of_prev_page is not None:
            query["max_id"] = str(oldest_of_prev_page - 1)

        try:
            page = api.user_timeline(**query)
        except Exception as e:
            maintask.log("user tweets: twitter api error, backoff", e)
            return False

        if len(page) == 0:
            break

        page_ids = []
        for status in page:
            stored += 1
            tweet = status._json
            tweet_id = int(tweet["id_str"])
            if tweet_id > newest_id:
                newest_id = tweet_id
            page_ids.append(tweet_id)
            db.add_tweet(tweet)

        oldest_of_prev_page = min(page_ids)
        maintask.log("user tweets: ids from ", oldest_of_prev_page, "to",
                     max(page_ids))

    # Never move the stored marker backwards.
    previous_marker = int(doc["last_harvest_tweet_id"])
    doc["last_harvest_tweet_id"] = str(max(newest_id, previous_marker))
    doc["last_harvest"] = int(time.time())
    doc.save()

    maintask.log("user tweets: got tweets", stored)
    return True
Пример #21
0
 def login(email, password):
     """Check the credentials and, on success, store the email in the session.

     Returns True when ``DBHelper.get_user(email)`` yields a user whose
     stored password equals ``get_md5_value(password)``, otherwise False.
     """
     hashed = get_md5_value(password)
     user = DBHelper.get_user(email)
     if user and user.password == hashed:
         session['email'] = email
         return True
     return False
Пример #22
0
def get_requirements_distance(req_a_id, req_b_id):
    """Return the first ``requirements_distance`` row for the id pair, or None if there is none.

    Both ids are interpolated directly into the SQL text.
    """
    sql_template = (u" SELECT * "
                    u" FROM   requirements_distance "
                    u" WHERE  req_a_id=%s "
                    u" AND    req_b_id=%s;")
    rows = DBHelper().fetch(sql_template % (req_a_id, req_b_id))
    return rows[0] if len(rows) else None
Пример #23
0
 def __init__(self):
     """Create the DB helper and the bot, then pass this object's handlers to the bot's message loop."""
     self.db = DBHelper()
     self.bot = Bot()
     self.controls = {}
     handlers = {
         'chat': self.handle,
         'callback_query': self.on_callback,
     }
     self.bot.message_loop(handlers)
Пример #24
0
def dump(filename: str, keep_auth: bool = False) -> None:
    """Write the design documents of each database to a JSON file.

    Databases whose name starts with "_" are skipped. Unless ``keep_auth``
    is true, entries whose ``id`` is ``_design/auth`` are left out.

    Args:
        filename: Path of the JSON file to write.
        keep_auth: Keep the ``_design/auth`` entries when true.
    """
    dbh = DBHelper()
    db_names = [name for name in dbh.client.all_dbs() if name[0] != '_']

    data = {}
    for db in db_names:
        design_docs = dbh.client[db].design_documents()
        if not keep_auth:
            design_docs = [dd for dd in design_docs
                           if dd['id'] != '_design/auth']
        data[db] = design_docs

    with open(filename, 'w') as outfile:
        json.dump(data, outfile)
Пример #25
0
def get_project_by_rand():
    """Return one ``projects`` row with id < 18 picked by ``ORDER BY rand()``, or None if there is none."""
    query = (u" SELECT    *"
             u" FROM 	    projects"
             u" WHERE     id < 18"
             u" ORDER BY	rand()"
             u" LIMIT     1")
    rows = DBHelper().fetch(query)
    return rows[0] if len(rows) else None
Пример #26
0
        def delete(user_id, password, project_name):
            """Delete a project and its services from the database.

            The host IP of every service is looked up first, but nothing is
            done with it yet (container removal is still a TODO).

            NOTE(review): the statements below use the tables ``service`` and
            ``project`` - confirm these names against the schema.
            """
            # todo: should container removal be handled by an upper layer?
            srv_list = DBModel.Service.get_list(user_id, password, project_name)
            if len(srv_list) != 0:
                for service_name in srv_list:
                    ip = DBModel.Service.get_host_ip(user_id, project_name, service_name)
                    if ip == '-':
                        continue
                    # TODO: stop and remove the container of this service,
                    # collecting its logs first.

            Db.exec_cmd("delete from service where project='%s'", project_name)
            Db.exec_cmd("delete from project where name='%s'", project_name)
Пример #27
0
def get_risks_distance(risk_a_id, risk_b_id):
    """Return the first ``risks_distance`` row for the id pair, or None if there is none.

    Both ids are interpolated directly into the SQL text.
    """
    sql_template = (u" SELECT * "
                    u" FROM   risks_distance "
                    u" WHERE  risk_a_id=%s "
                    u" AND    risk_b_id=%s;")
    rows = DBHelper().fetch(sql_template % (risk_a_id, risk_b_id))
    return rows[0] if len(rows) else None
Пример #28
0
def sync_tasks(data):
    '''
    Delete stored tasks of a list that are missing from the given task data.

    The list id is read from the first element of ``data``. Rows of the
    tasks table for that list whose task_id is not among the ids in ``data``
    are deleted. When ``data`` is empty this is reported on stdout and
    nothing else happens.

    :param data: sequence of task dicts, each with 'id' and 'list_id'
    '''
    try:
        list_id = data[0]['list_id']
        rows = DBHelper.fetchall(
            "select task_id from {0} where list_id = {1}".format(
                tb_tasks, list_id))
        # A set gives O(1) membership tests.
        remote_ids = set(o['id'] for o in data)
        stale_ids = [
            row['task_id'] for row in rows
            if row['task_id'] not in remote_ids
        ]
        for task_id in stale_ids:
            # Parenthesised single-argument print works on Python 2 and 3.
            print('now delete..')
            sql = "delete from %s where task_id = '%s'" % (tb_tasks, task_id)
            print(sql)
            DBHelper.delete(sql)
    except IndexError as e:
        print('index error ,msg:{}'.format(e))
        print('data:{}'.format(data))
Пример #29
0
 def __init__(self, reactor, pool, init_url, conf, use_pool=False):
     """Store the collaborators, parse the configuration and build the helper objects.

     ``conf`` is passed to ``_parse_conf``, ``DBHelper`` and ``URLDedup``.
     """
     self.logger = logging.getLogger("")
     self.reactor, self.pool = reactor, pool
     self._parse_conf(conf)
     self.db_helper = DBHelper(conf)
     self.url_dedup = URLDedup(conf)
     self.cleaner = lxml.html.clean.Cleaner(
         style=True,
         scripts=True,
         page_structure=False,
         safe_attrs_only=False,
     )
     self.html_parser = lxml.html.HTMLParser(encoding='utf-8')
     self.init_url, self.use_pool = init_url, use_pool
     self.flow_control = TimeFlowControl(1, 60)
Пример #30
0
def process_a_image(url: str, db: DBHelper, tweet_id: str) -> Dict[str, Any]:
    """Download the ":small" variant of an image, run the detector on it and store the result.

    Args:
        url: Image URL; ":small" is appended for the download.
        db: Helper that receives the URL, detection results, image bytes
            and tweet id.
        tweet_id: Id of the tweet the image belongs to.

    Returns:
        The value returned by ``detector.detect``.

    Raises:
        Exception: If the download does not answer with HTTP status 200.
    """
    with tempfile.NamedTemporaryFile(dir=TMP_FOLDER, delete=True,
                                     suffix=".jpg") as image_file:
        response = requests.get(url + ":small")
        if response.status_code != 200:
            raise Exception("unable to donwload image")

        content = response.content
        image_file.write(content)
        # The detector is given the file name, so push the bytes out first.
        image_file.flush()

        # run yolo while holding the module-level lock
        lock.acquire()
        try:
            results = detector.detect(image_file.name)
        finally:
            lock.release()

    db.add_tweet_image_with_yolo(url, results, content, tweet_id)
    return results
Пример #31
0
def getPublicProfiles(limit=1000):
	"""Return the paths under ``user_raw`` of the files reported by ``DBHelper.getNotRDFedFileName(limit)``."""
	base_dir = 'user_raw'
	return [join(base_dir, file_name)
			for file_name in DBHelper.getNotRDFedFileName(limit)]
Пример #32
0
def get_projects_non_processed(distance, sample, steps, type):
    """Fetch the projects that have no recommendation for the given parameters.

    A project is excluded when ``recommendations`` holds a row for it with
    matching distance, sample and steps (each cast to DECIMAL(5,1)) and
    type. Results are ordered by project id. All values are interpolated
    directly into the SQL text.
    """
    sql_template = (
        u" SELECT   p.*"
        u" FROM     projects p"
        u" WHERE    p.id NOT IN("
        u"              SELECT project_id"
        u"              FROM   recommendations"
        u"              WHERE  CAST(distance AS DECIMAL(5,1)) = %s"
        u"              AND    CAST(sample AS DECIMAL(5,1)) = %s"
        u"              AND	   CAST(steps AS DECIMAL(5,1)) = %s"
        u"              AND    type = '%s'"
        u"          )"
        u" ORDER BY p.id ASC")
    return DBHelper().fetch(sql_template % (distance, sample, steps, type))
Пример #33
0
        def delete_user_and_projects(user_name):
            """
            Delete a user together with all of the user's projects and their services.

            Args:
                user_name: Value matched against ``user.name``.
            """
            # todo: should an upper layer delete the projects and services,
            #       or should this run inside a transaction?
            # todo: merge the sql commands below?
            user_id = Db.exec_one("select id from user where name='%s'", user_name)

            Db.exec_cmd("delete from services where projectID in "
                        "(select id from projects where userID='%s')", user_id)
            Db.exec_cmd("delete from projects where userID='%s'", user_id)
            # The name column must be compared with the name, not with the id
            # looked up above, or the user row is never deleted.
            Db.exec_cmd("delete from user where name='%s'", user_name)
Пример #34
0
def harvest_twitter_tweet_process_meta_update() -> None:
    """Rewrite old-format ``process_meta`` entries in ``harvest_twitter_tweet``.

    A document whose ``process_meta`` contains the key "locked" gets it
    replaced by ``{'lock_timestamp': 0, 'processed': False}`` and is saved.
    Progress is printed after every 100 rewritten documents.
    """
    db = DBHelper()
    migrated = 0
    for doc in db.client["harvest_twitter_tweet"]:
        if "locked" not in doc["process_meta"]:
            continue

        # old version
        doc["process_meta"] = {'lock_timestamp': 0, 'processed': False}
        doc.save()

        migrated += 1
        if migrated % 100 == 0:
            print(migrated)

    print("finished.", migrated)
Пример #35
0
def add_to_db(job_doc: cloudant.document, db: DBHelper) -> None:
    """Import the tweets of ``./tmp/twitter.json`` and record progress on the job.

    Each line after the first is parsed as JSON and its "doc" value, minus
    "_id" and "_rev", is passed to ``db.add_tweet_import``. Lines that fail
    to parse are logged and skipped. Every ``const.JOB_UPDATE_PER_TWEET``
    parsed tweets the counters are saved to ``job_doc``; if that save raises,
    the import stops. At the end the job is marked finished.

    Args:
        job_doc: Job document that receives total_num, import_num, finished.
        db: Helper used to import each tweet.
    """
    seen = 0
    imported = 0

    with open('./tmp/twitter.json', 'r') as dump_file:
        # The first line is a header such as
        # {"total_rows":3877777,"offset":805584,"rows":[
        dump_file.readline()
        for raw_line in dump_file:
            if raw_line.strip() == ']}':
                # closing line of the dump
                continue

            try:
                # one document per line, possibly followed by a comma
                tweet = json.loads(raw_line.rstrip(",\r\n "))["doc"]
                tweet.pop("_id", None)
                tweet.pop("_rev", None)
            except Exception as parse_err:
                log("unknow parse error, skip: ", parse_err)
                continue

            seen += 1
            if db.add_tweet_import(tweet):
                imported += 1

            if seen % const.JOB_UPDATE_PER_TWEET != 0:
                continue

            print("t: ", seen, "i", imported)
            # Saving the job mainly checks whether someone else took it over.
            try:
                job_doc["total_num"] = seen
                job_doc["import_num"] = imported
                job_doc.save()
            except Exception as save_err:
                log("lock conflict: ", save_err)
                return

    # complete
    try:
        job_doc["total_num"] = seen
        job_doc["import_num"] = imported
        job_doc["finished"] = True
        job_doc.save()
        log("finished")
    except Exception as finish_err:
        log("unable to finish: ", finish_err)
Пример #36
0
class OrderDAO():
    """Data-access object for rows of the ``orders`` table."""

    def __init__(self):
        self.db_helper = DBHelper()
        self.db_helper.open_conn()

    def __del__(self):
        # __init__ may have failed before db_helper was assigned.
        helper = getattr(self, 'db_helper', None)
        if helper is not None:
            helper.close_conn()

    @staticmethod
    def _row_to_order(row):
        """Build an Order from a result row; columns 0, 1, 4 and 5 are used."""
        products_num = int(row[4]) if row[4] else 0
        amt = float(row[5]) if row[5] else 0
        return Order(row[0], row[1], products_num, amt)

    def query_all_order(self):
        '''
            Query orders (the statement is limited to the first 10 rows).
        :return: list of Order entities, or None when the query yields nothing
        '''
        sql = 'SELECT * FROM orders LIMIT 0,10'
        result = self.db_helper.do_query(sql)
        if not result:
            print('查询结果为空')
            return None

        return [self._row_to_order(row) for row in result]

    def query_by_id(self, id):
        '''
            Query a single order by its order_id.
        :param id: order id; it is interpolated directly into the SQL text
        :return: the matching Order, or None when the query yields nothing
        '''
        sql = 'select * from orders WHERE order_id = %s' % (id)
        result = self.db_helper.do_query(sql)
        # Check for an empty result before indexing into it.
        if not result or not result[0]:
            print('查询结果为空')
            return None
        return self._row_to_order(result[0])
Пример #37
0
def dump_all(dump_dir: str) -> None:
    """Download all documents of each database into ``dump_dir`` with curl.

    Databases whose name starts with "_" are skipped. Each remaining
    database is written to ``<dump_dir>/<db>.json`` from its ``_all_docs``
    endpoint, with documents and attachments included.

    Args:
        dump_dir: Directory that receives one JSON file per database.

    Raises:
        Exception: If curl cannot be started or exits with a non-zero status.
    """
    dbh = DBHelper()
    dbs = [name for name in dbh.client.all_dbs() if name[0] != '_']

    url_obj = furl(config.couchdb_host)
    url_obj.username = config.couchdb_user
    url_obj.password = config.couchdb_auth_token
    url = url_obj.url

    for db in dbs:
        _url = url + db + "/_all_docs?include_docs=true&attachments=true"
        # NOTE(review): this prints the URL including the credentials.
        print(_url)

        target = os.path.join(dump_dir, db + ".json")
        # An argument list (no shell) keeps characters in the credentials or
        # database name from being interpreted by a shell.
        try:
            exit_code = subprocess.call(["curl", _url, "-G", "-o", target])
        except OSError as err:
            # e.g. curl is not installed; keep the failure type callers see.
            raise Exception("unable to download") from err
        if exit_code != 0:
            raise Exception("unable to download")
Пример #38
0
def sync_lists():
    '''
    Synchronise the lists table with the lists returned by get_lists().

    Stored rows whose list_id is no longer returned are deleted. Each
    returned list is then inserted, or - when a row exists and its
    updated_at differs from today's date - updated.

    NOTE(review): titles and the JSON content are interpolated directly into
    the SQL text, so a quote character in them breaks the statement.

    :return data: list data, json, ref https://developer.wunderlist.com/documentation/endpoints/list
    '''
    data = get_lists()
    # Parenthesised single-argument print works on Python 2 and 3.
    print(data)
    rows = DBHelper.fetchall("select list_id from %s" % tb_lists)
    # A set gives O(1) membership tests.
    remote_ids = set(o['id'] for o in data)
    stale_ids = [row['list_id'] for row in rows
                 if row['list_id'] not in remote_ids]
    for list_id in stale_ids:
        print('now delete..')
        sql = "delete from %s where list_id = '%s'" % (tb_lists, list_id)
        print(sql)
        DBHelper.delete(sql)

    for o in data:
        q = DBHelper.fetchall("select * from %s"
                              " where list_id = '%s'" % (tb_lists, o['id']))
        # The remote updated_at is deprecated, so today's date is used.
        updated_at = datetime.now().date()
        created_at = dateutil.parser.parse(o['created_at']).date()
        if q:
            q = q[0]
            if str(q['updated_at']) == str(updated_at):
                print('no update!')
            else:
                print('%s update now...' % o['title'])
                update_sql = "update %s set title='%s', created_at = '%s', updated_at='%s', content='%s' where list_id='%s'" % (
                    tb_lists, o['title'], created_at, updated_at,
                    json.dumps(o), o['id'])
                DBHelper.update(update_sql)
        else:
            print('insert now...')
            sql = '''insert into %s (title,created_at,updated_at,content,list_id) values('%s','%s','%s', '%s','%s')''' % (
                tb_lists, o['title'], created_at, updated_at, json.dumps(o),
                o['id'])
            print(sql)
            DBHelper.update(sql)
    print('success sync lists')
    return data
Пример #39
0
def tweet_data_melb_time_update() -> None:
    """Add ``created_at_melb_time`` to the ``tweet_data`` documents that lack it.

    ``created_at`` is parsed, its timezone is set to UTC and the result is
    converted to Australia/Melbourne. The value is stored as
    ``[year, month, day, hour, minute, second]`` and the document is saved.
    Progress is printed after every 100 updated documents.
    """
    db = DBHelper()
    updated = 0

    for doc in tqdm.tqdm(db.client["tweet_data"],
                         total=db.client["tweet_data"].doc_count()):
        if "created_at_melb_time" in doc["data"]:
            continue

        # old version
        created_raw = doc["data"]["created_at"]
        parsed = datetime.strptime(created_raw, '%a %b %d %H:%M:%S %z %Y')
        as_utc = parsed.replace(tzinfo=timezone.utc)
        melb_time = as_utc.astimezone(pytz.timezone('Australia/Melbourne'))

        doc["data"]["created_at_melb_time"] = [
            melb_time.year, melb_time.month, melb_time.day,
            melb_time.hour, melb_time.minute, melb_time.second,
        ]
        doc.save()

        updated += 1
        if updated % 100 == 0:
            print(updated)

    print("finished.", updated)
Пример #40
0
def handle_tweet_media(tweet_json: Dict[str, Any], worker: Worker,
                       db: DBHelper) -> Optional[List[Dict[str, Any]]]:
    """Collect detection results for the JPEG images attached to a tweet.

    Only media URLs that start with "https://pbs.twimg.com/" and end with
    ".jpg" are handled. A result already stored in ``db`` is reused;
    otherwise the image is processed with ``process_a_image``.

    Returns:
        A list with one entry per handled image; an empty list when the
        tweet has no media or an unexpected error occurred; None when
        processing an image failed.
    """
    try:
        collected = []
        has_media = ("extended_entities" in tweet_json
                     and "media" in tweet_json["extended_entities"])
        if not has_media:
            return []

        for item in tweet_json["extended_entities"]["media"]:
            try:
                img_url: str = item["media_url_https"]
                is_twitter_jpg = (img_url.startswith("https://pbs.twimg.com/")
                                  and img_url.endswith(".jpg"))
                if not is_twitter_jpg:
                    continue

                worker.log("handle_tweet_media: image", img_url)

                cached = db.get_tweet_image_with_yolo(img_url)
                if cached is not None:
                    collected.append(cached)
                    continue

                # Not stored yet: download and run detection now.
                try:
                    yolo_res = process_a_image(img_url, db,
                                               tweet_json["id_str"])
                    collected.append({"url": img_url, "yolo": yolo_res})
                except Exception as e:
                    worker.log("handle_tweet_media: process a image error",
                               e)
                    return None

            except Exception as e:
                # A broken media entry is logged; the loop continues.
                worker.log("handle_tweet_media: single media error", e)
    except Exception as e:
        worker.log("handle_tweet_media: unknown error", e)
        return []

    return collected
Пример #41
0
def run(num, file_name):
	"""Parse up to ``num`` profiles, add them to an RDF graph and save it as XML.

	Each parsed profile is added to the graph and flagged through
	``DBHelper.dataSetRDF``. An exception raised while processing is printed
	and ends the loop; the graph is saved and closed and the database is
	committed and closed in either case.
	"""
	rg = RG(file_name)
	profile_paths = getPublicProfiles(limit=num)
	try:
		for path in profile_paths:
			profile = ProfileParser(path).parseHtml()
			rg.add(profile)
			DBHelper.dataSetRDF(profile.file_name, rdf=1)
	except Exception:
		traceback.print_exc()

	# Reached after a clean run and after a printed failure alike.
	rg.save(format='xml', file_name=file_name)
	rg.close()
	DBHelper.commitAndClose()
	def get_company_profile(self, url, company_name):
		"""Download and parse a company page, register it via DBHelper and return the parsed content."""
		downloaded_path = CPD.downloadByUrl(url, company_name)
		company_profile = CPP(downloaded_path).parseHtml().content
		DBHelper.dataAddEntry(company_profile['file_name'], url, exist=1, type='COMPANY')
		return company_profile
	def add_experience_triple(self, profile, person):
		"""Add position and employer triples for each experience entry of a profile.

		For every entry of ``profile.experience_list`` this may set the
		profile's city, add a Position node with occupation and from/to
		values, describe the company (on first sight also its cities and the
		details of its downloaded company profile), and link the company and
		the person to the position node.

		Args:
			profile: Parsed profile; ``experience_list`` and ``city`` are read.
			person: Graph term of the person the experiences belong to.
		"""
		for experience in profile.experience_list:
			# Reset per entry so that a company without a job title is never
			# linked to the position node of a previous entry.
			term = None

			if profile.city is None:
				if 'city' in experience:
					self.set_profile_city(person, profile, experience['city'])

			if 'job_title' in experience:
				job_title = self.position_helper(experience['job_title'])
				term = BNode()

				self.graph_add(term, RDF.type, self.schema.Position)
				self.graph_add(term, self.schema.occupation, Literal(job_title))
				try:
					if experience['from'] and self.check_datetime_format(experience['from']):
						self.graph_add(term, self.schema.from_value, Literal(experience['from'], datatype=XSD.date))
				except KeyError:
					pass
				try:
					if experience['to']:
						if self.check_datetime_format(experience['to']):
							self.graph_add(term, self.schema.to_time, Literal(experience['to'], datatype=XSD.date))
						# Case-insensitive match; comparing an upper-cased
						# string with 'now' could never succeed.
						elif experience['to'].lower() in ('current', 'now'):
							self.graph_add(term, self.schema.to_time, Literal('now', datatype=XSD.string))
				except KeyError:
					pass

			if 'company_name' in experience:
				company_name = self.company_name_helper(experience['company_name'])
				company = self.schema.get_term(company_name)

				self.graph_add(company, RDFS.label, Literal(company_name, datatype=XSD.string))

				# we need to define this company
				if company_name not in self.companies:
					self.graph_add(company, RDF.type, self.schema.Organization)
					self.companies.add(company_name)

					# add city info
					cities = self.get_cities_by_company_name(company_name)
					for city in cities:
						self.graph_add(company, self.schema.city, self.schema.get_term(city))

					if profile.city is None:
						if cities:
							self.set_profile_city(person, profile, cities[0])

					# extra processing when the company page is known
					if 'company_url' in experience:
						company_profile = self.get_company_profile(experience['company_url'], company_name)

						if 'Founded' in company_profile:
							self.graph_add(company, self.schema.formation_year, Literal(company_profile['Founded'], datatype=XSD.gYear))

						if 'Company Size' in company_profile:
							# NOTE(review): min/max size are stored under
							# from_value/to_time - confirm this is intended.
							mini, maxi = self.get_company_size(company_profile['Company Size'])
							self.graph_add(company, self.schema.from_value, Literal(mini, datatype=XSD.integer))
							self.graph_add(company, self.schema.to_time, Literal(maxi, datatype=XSD.integer))

						if 'Type' in company_profile:
							self.graph_add(company, self.schema.organization_type, Literal(company_profile['Type'], datatype=XSD.string))

						if 'Industry' in company_profile:
							self.graph_add(company, self.schema.industry, self.schema.get_term(company_profile['Industry']))

						DBHelper.dataSetRDF(company_profile['file_name'], rdf=1)

				# Without a job title there is no position node to link.
				if term is not None:
					self.graph_add(company, self.schema.has_position, term)
					self.graph_add(person, self.schema.works_as, term)
Пример #44
0
 def get(user_id):
     """Select ``net`` from ``info`` by name and return the first element of the result."""
     rows = Db.exec_cmd("select net from info where name='%s'",
                        user_id)  # todo: check tuple usage
     return rows[0]
Пример #45
0
        def exists(user_name, project_name):
            """Tell whether the named user has a project with the given name."""
            match = Db.exec_one(
                "select 1 from projects "
                "where name='%s' and userID = (select id from user where name = '%s')",
                (project_name, user_name))

            if match is None:
                return False
            return True
Пример #46
0
 def __init__(self, conf):
     """Create the DB helper from ``conf`` plus the HTML parser and cleaner used by this object."""
     self.db_helper = DBHelper(conf)
     self.parser = HTMLParser(
         encoding='utf-8', remove_comments=True, remove_blank_text=True)
     cleaner_options = dict(style=True, scripts=True, page_structure=False,
                            safe_attrs_only=False, comments=True, javascript=True)
     self.cleaner = lxml.html.clean.Cleaner(**cleaner_options)
Пример #47
0
config = Configurations()
# Apply configurations
env_lang = config.get_language()
set_proto_lang(env_lang)
set_msg_lang(env_lang)
# Get network IP addr
if cctv_url == '':
    cctv_url = 'http://'
    # Ask the OpenDNS resolver for this host's external address.
    tempstr = subprocess.check_output('dig +short myip.opendns.com @resolver1.opendns.com', shell=True)
    # strip() removes the trailing line break. Strings are immutable, so
    # calling replace() without using its result would change nothing.
    cctv_url += tempstr.strip()
    cctv_url += ':' + cctv_port
# Parenthesised single-argument print works on Python 2 and 3.
print('Found external IP = ' + cctv_url)
# Initialize DB
dbhelper = DBHelper(MYSQL_USER, MYSQL_PASS, MYSQL_DB)
dbhelper.connect()
dbhelper.checktables()
# Connect serial
t_ser = SerialThread(recv_queue, send_queue, incoming_cmd_callback)
t_ser.connect()
# Make worker thread
t_dev = DeviceManagerThread(dbhelper, recv_queue, send_queue, device_thread_callback)

# Start main loop
# NOTE(review): the handlers of the try below are not part of this excerpt.
if __name__ == '__main__':
    try:
        # Start worker thread
        # Call the device manager's three loaders in this order
        # (load_noti presumably loads notifications - confirm).
        t_dev.load_devices()
        t_dev.load_noti()
        t_dev.load_macro()
Пример #48
0
 def get_info(user_id, project_name):
     """Return the name and url of the project with the given user id and project name."""
     # todo: the name is already known; is url the URL of the git repo?
     query = "select name, url from projects where userID = '%s' and name = '%s'"
     return Db.exec_one(query, (user_id, project_name))
Пример #49
0
class EventSM:
    """State machine that digests timestamped events and records the results.

    Sleep-related events move the machine between ``AWAKE``,
    ``PUTTING_TO_SLEEP`` and ``SLEEPING``; every transition is written
    through ``db_helper.add_sleep_state``. "Alone" and "away" periods are
    tracked independently of the sleep state and written through
    ``db_helper.add_time_alone`` / ``db_helper.add_time_away``.
    """

    AWAKE = "awake"
    PUTTING_TO_SLEEP = "put to sleep"
    SLEEPING = "sleeping"

    def __init__(self):
        self.db_helper = DBHelper()
        self._init_counters()
        self._init_callbacks()
        # A value of 0 / None means "no alone/away period currently open".
        self._alone_time = 0
        self._where_away = None
        self._away_ts = 0

    def _init_counters(self):
        """Reset the sleep state, the accumulated durations and the seen days."""
        self.state = EventSM.AWAKE
        self.last_rec_ts = 0
        self.accum_sleeping_time = 0
        self.accum_put_to_sleep_time = 0
        self.days = set()

    def _init_callbacks(self):
        """Build the event-type -> handler dispatch table used by ``digest``."""
        self.cb = {
            'putting to sleep': self._put_to_sleep,
            'failed putting to sleep': self._failed_to_put_to_sleep,
            'slept': self._fell_asleep,
            'woke up': self._woke_up,
            'megi left': self._alone,
            'megi\'s back': self._not_alone,
            'away': self._away,
            'back home': self._back_home
        }

    def _add_sleep_state(self, old_state, end_ts, new_state):
        """Record the period spent in ``old_state`` and switch to ``new_state``.

        The period runs from ``last_rec_ts`` to ``end_ts``. It is flagged as
        unsuccessful only for the PUTTING_TO_SLEEP -> AWAKE transition.

        Returns:
            The length of the recorded period (``end_ts - last_rec_ts``).
        """
        success = not (old_state == EventSM.PUTTING_TO_SLEEP and new_state == EventSM.AWAKE)
        length_secs = end_ts - self.last_rec_ts
        self.db_helper.add_sleep_state(old_state, self.last_rec_ts, length_secs, success)
        self.last_rec_ts = end_ts
        self.state = new_state
        return length_secs

    def _put_to_sleep(self, ts, event):
        """Handle 'putting to sleep'; only valid while AWAKE."""
        if self.state != EventSM.AWAKE:
            # The guard rejects every state *except* AWAKE, so the message
            # has to say so.
            raise Exception('Cannot put to sleep a baby that is not awake...')
        self._add_sleep_state(EventSM.AWAKE, ts, EventSM.PUTTING_TO_SLEEP)

    def _failed_to_put_to_sleep(self, ts, event):
        """Handle 'failed putting to sleep'; only valid while PUTTING_TO_SLEEP."""
        if self.state != EventSM.PUTTING_TO_SLEEP:
            raise Exception('Cannot fail to put baby to sleep without trying first...')
        s = self._add_sleep_state(EventSM.PUTTING_TO_SLEEP, ts, EventSM.AWAKE)
        self.accum_put_to_sleep_time += s

    def _fell_asleep(self, ts, event):
        """Handle 'slept'; only valid while PUTTING_TO_SLEEP."""
        if self.state != EventSM.PUTTING_TO_SLEEP:
            raise Exception('Baby cannot sleep without being put to sleep...')
        s = self._add_sleep_state(EventSM.PUTTING_TO_SLEEP, ts, EventSM.SLEEPING)
        self.accum_put_to_sleep_time += s

    def _woke_up(self, ts, event):
        """Handle 'woke up'; only valid while SLEEPING."""
        if self.state != EventSM.SLEEPING:
            raise Exception('Cannot wake up without sleeping first...')
        s = self._add_sleep_state(EventSM.SLEEPING, ts, EventSM.AWAKE)
        self.accum_sleeping_time += s

    def _alone(self, ts, event):
        """Open an "alone" period starting at ``ts``."""
        if self._alone_time != 0:
            raise Exception('Already alone, cannot be left alone again...')
        self._alone_time = ts

    def _not_alone(self, ts, event):
        """Close the open "alone" period and record its length."""
        if self._alone_time == 0:
            raise Exception('Cannot be not alone without being left alone first...')
        t = ts - self._alone_time
        self._alone_time = 0
        self.db_helper.add_time_alone(ts, t)

    def _away(self, ts, event):
        """Open an "away" period at the place given by the event subtype.

        If an away period is already open it is recorded first, so moving
        from one place to another closes the previous stay.
        """
        if self._where_away is not None:
            if self._away_ts == 0:
                raise Exception('Cannot go away again without leaving first...')
            self.db_helper.add_time_away(self._away_ts, ts - self._away_ts, self._where_away)
        self._where_away = event.get_subtype()
        self._away_ts = ts

    def _back_home(self, ts, event):
        """Close the open "away" period and record it."""
        if self._away_ts == 0 or self._where_away is None:
            raise Exception('Cannot get back home without leaving...')
        self.db_helper.add_time_away(self._away_ts, ts - self._away_ts, self._where_away)
        self._where_away = None
        self._away_ts = 0

    def digest(self, event):
        """Store ``event`` and dispatch it to the handler for its type.

        Events whose type has no registered handler are stored and counted
        towards the seen days, but otherwise ignored. Handlers raise
        ``Exception`` when the event is not valid in the current state.
        """
        self.db_helper.add_event(event)

        event_date = event.get_date()
        event_ts = event.get_ts()
        event_type = event.get_type()
        event_subtype = event.get_subtype()

        # The very first event anchors the start of the first recorded period.
        if self.last_rec_ts == 0:
            self.last_rec_ts = event_ts

        self.days.add(event_date)

        # ``in`` works on both Python 2 and 3; dict.has_key exists only on 2.
        if event_type not in self.cb:
            return
        cb = self.cb[event_type]
        # A dict entry means the handler is selected by the event subtype.
        if isinstance(cb, dict):
            cb = cb[event_subtype]
        cb(event_ts, event)

    def get_state(self):
        """Return the current sleep state."""
        return self.state

    def get_different_days(self):
        """Return the number of distinct event dates seen so far."""
        return len(self.days)

    def get_sleep_time(self):
        """Return the accumulated time spent in the SLEEPING state."""
        return self.accum_sleeping_time

    def get_put_to_sleep_time(self):
        """Return the accumulated time spent in the PUTTING_TO_SLEEP state."""
        return self.accum_put_to_sleep_time

    # Original (misspelled) name, kept so existing callers continue to work.
    get_put_to_slee_time = get_put_to_sleep_time
Пример #50
0
 def create(user_id, volume_path):
     """Store ``volume_path`` in ``info.volume`` for the row named ``user_id``.

     The statement is executed through ``Db.exec_cmd``; nothing is returned.
     """
     # Parameters follow placeholder order: the value for ``volume`` first,
     # then the ``name`` used in the WHERE clause.
     # NOTE(review): "insert ... where" is not valid in standard SQL; this
     # probably wants to be an UPDATE -- confirm against the schema.
     Db.exec_cmd("insert into info(volume) values('%s') where name='%s'",
                 (volume_path, user_id))
Пример #51
0
 def create(user_name, service_name, machine_ip, project_name):
     """Register a service for one of ``user_name``'s projects.

     First looks up the project's ``id`` via ``Db.exec_one``, then inserts
     a ``services`` row carrying the service name, that lookup result and
     the machine IP via ``Db.exec_cmd``.
     """
     lookup_sql = ("select id from projects "
                   "where name='%s' and userID in (select id from user where name = '%s')")
     insert_sql = "insert into services(name, projectID, IP) values('%s', %s, '%s')"

     project_id = Db.exec_one(lookup_sql, (project_name, user_name))
     Db.exec_cmd(insert_sql, (service_name, project_id, machine_ip))
Пример #52
0
 def add_user(user_name, email):
     """Insert a ``user`` row with the given name and email via ``Db.exec_cmd``."""
     insert_sql = "insert into user(name, email) values('%s', '%s')"
     params = (user_name, email)
     Db.exec_cmd(insert_sql, params)
Пример #53
0
class SeedMiner(object):

    def __init__(self, conf):
        self.db_helper = DBHelper(conf)
        self.parser = HTMLParser(encoding='utf-8', remove_comments=True, remove_blank_text=True)
        self.cleaner = lxml.html.clean.Cleaner(style=True, scripts=True,
            page_structure=False, safe_attrs_only=False, comments=True, javascript=True)

    def read_html(self, object_id, url):
        # object_id = 'index.html'
        try:
            fin = open('../data/mining_page/'+object_id)
        except IOError:
            return None
        body = fin.read()
        fin.close()
        #body = body.decode('gbk').encode('utf-8')
        tree = document_fromstring(body, parser=self.parser)
        tree = remove_display_none(tree)
        tree = self.cleaner.clean_html(tree)
        tree.make_links_absolute(url)
        return tree

    def maybe_hub(self, url, tree):
        if self.match_filter_url(url):
            return False, []

        block, matched_a, paths = self.get_hub_block(url, tree)
        tree = self.remove_p_aside_a(block, tree, matched_a)
        content_tree = self.get_readability_content(url, tree)
        content = unicode(content_tree.text_content().strip())
        content = re.sub(ur'\s', u'', content)
        chinese_content = filter_string(content, False, True, True)
        a_content = sum([len(a.text.strip()) for a in matched_a])
        ratio = len(chinese_content)*1.0/(a_content or 0.001)
        print 'url:%s matched_a:%d match content/link:%f' % (url, len(matched_a), ratio)
        print len(chinese_content), content.encode('utf-8')

        #import pdb;pdb.set_trace()
        if len(matched_a) > 20 and len(chinese_content) < 200 and ratio < 0.2:
            return True, paths
        else:
            return False, paths

    def get_readability_content(self, url, tree):
        body = lxml.html.tostring(tree)
        doc = Document(body)
        content = doc.summary(True)
        content_tree = fromstring(content, parser=self.parser)
        return content_tree

    def get_hub_block(self, url, tree):
        a_elements = valid_a_elements(tree.xpath('//a'), url)
        visited_a = set()
        all_a = set(a_elements)
        long_a = set([a for a in a_elements if a.text and len(a.text.strip()) >= 10])
        block = []
        max_div = 2
        max_depth = 8
        min_link_number = 4
        for start_a in long_a:
            if start_a in visited_a:
                continue
            path = '/a'
            iter_node = start_a
            div_count = 0
            loop_flag = True
            for _ in xrange(max_depth):
                if not loop_flag:
                    break
                if div_count > max_div or iter_node.tag == 'body':
                    break
                iter_node = iter_node.getparent()
                if iter_node is None:
                    break
                if iter_node.tag in BLOCK_TAG and len(iter_node.getchildren()) > 1:
                    div_count += 1
                    sibling = iter_node.xpath('.'+path)
                    if len(sibling) >= min_link_number and \
                        all([x in all_a for x in sibling]):
                        long_a_sibling = [x for x in sibling if x in long_a]
                        block.append((iter_node, path, long_a_sibling))
                        [visited_a.add(x) for x in sibling]
                        loop_flag = False

                path = '/' + iter_node.tag + path

        matched_a = [a for a in long_a if a in visited_a]

        paths = []
        for node, path, long_a in block:
            paths.append(get_html_path(node) + path)
        print len(block)
        #import pdb;pdb.set_trace()
        return block, matched_a, paths

    def remove_p_aside_a(self, block, tree, matched_a):
        matched_a = set(matched_a)
        for node, path, long_a in block:
            for e in node.iter():
                text = e.text or ''
                text = text.strip()
                if len(text) < 100 and e not in matched_a:
                    e.text = ''

        return tree

    URL_FILTER_RX = re.compile('''news.xinhuanet.com/video/.+/c_\d+.htm''')

    def match_filter_url(self, url):
        if self.URL_FILTER_RX.search(url):
            return True
        else:
            return False

    def test(self):
        fout =  open('../data/mining_result.txt', 'w')
        for obj in self.db_helper.get_some_mining_task(0, 180000):
            url = obj.get('url')
            _id = str(obj.get('_id'))
            tree = self.read_html(_id, url)
            if tree is not None:
                try:
                    flag, paths = self.maybe_hub(url, tree)
                    aline = [str(flag), url, str(paths)]
                    fout.write('\t'.join(aline) + '\n')
                except KeyboardInterrupt:
                    sys.exit(1)
                except:
                    print "ERROR!"
                    traceback.print_exc()
            else:
                fout.write('\n')
        fout.close()
Пример #54
0
 def get_machine_list():
     """Return what ``Db.exec_list`` yields for the ``ip`` column of ``machine``."""
     query = "select ip from machine"
     return Db.exec_list(query)
Пример #55
0
 def get(user_id):
     """Return the ``volume`` entries of ``info`` for the row named ``user_id``.

     The result of ``Db.exec_list`` is passed through unchanged.
     """
     query = "select volume from info where name='%s'"
     volumes = Db.exec_list(query, user_id)
     return volumes
Пример #56
0
 def add_machine_list(ip_list):
     """Insert one ``machine`` row per entry of ``ip_list`` via ``Db.exec_cmd``."""
     insert_sql = "insert into machine(ip) values('%s')"
     for machine_ip in ip_list:
         Db.exec_cmd(insert_sql, machine_ip)
Пример #57
0
def downloadMoreProfiles(limit=3000):
	"""Download up to ``limit`` entries reported by ``DBHelper.getNotExistFileNames``.

	Each entry is handed to ``PublicProfileDownloader.downloadAndAnalyze``
	with analysis switched off.
	"""
	profile_downloader = PublicProfileDownloader()
	for pending_url in DBHelper.getNotExistFileNames(limit=limit):
		profile_downloader.downloadAndAnalyze(pending_url, analysis=False)
Пример #58
0
 def create(user_id, net_id):
     """Store ``net_id`` in ``info.net`` for the row named ``user_id``.

     The statement is executed through ``Db.exec_cmd``; nothing is returned.
     """
     # todo: check tuple usage
     statement = "insert into info(net) values('%s') where name='%s'"
     params = (net_id, user_id)
     Db.exec_cmd(statement, params)
Пример #59
0
from conf import TOKEN, DB_NAME
from db_helper import DBHelper

# Captions of the main reply-keyboard buttons.
BTN_TODAY = '⌛️ Bgun'
BTN_TOMORROW = '⏳ Erta'
BTN_MONTH = "📅 To'liq taqvim"
BTN_REGION = '🇺🇿 Mintaqalar'
BTN_DUA = '🤲 Duo'

# Keyboard layout, one inner list per row.
main_buttons = ReplyKeyboardMarkup(
    [
        [BTN_TODAY],
        [BTN_TOMORROW, BTN_MONTH],
        [BTN_REGION],
        [BTN_DUA],
    ],
    resize_keyboard=True,
)

# presumably conversation-state identifiers -- confirm against the handlers
STATE_REGION = 1
STATE_CALENDAR = 2

# presumably maps a user to the chosen region -- confirm against the handlers
user_region = {}
db = DBHelper(DB_NAME)


def region_buttons():
    """Build inline-keyboard rows for every region returned by the database.

    Each region becomes one ``InlineKeyboardButton`` labelled with the
    region's ``name`` and carrying its ``id`` as callback data.

    Returns:
        A list of rows with two buttons each; when the number of regions is
        odd, the final row holds the single remaining button.
    """
    regions = db.get_regions()
    buttons = []
    row = []
    for region in regions:
        row.append(
            InlineKeyboardButton(region['name'], callback_data=region['id']))
        if len(row) == 2:
            buttons.append(row)
            row = []
    # An odd number of regions leaves one button in a partial row; without
    # this the last region would never be shown.
    if row:
        buttons.append(row)
    return buttons

Пример #60
0
 def get_machine(index):
     """Return the ``ip`` of the ``machine`` row at offset ``index``.

     ``index`` is interpolated into the ``limit`` clause and the query is
     run through ``Db.exec_one``.
     """
     # todo: what is this?
     # todo: used for random scheduling?
     query = "select ip from machine limit %s,1" % index
     return Db.exec_one(query)