def make_final_diffs():
    log.info('Make diff_time')
    #db.execute("INSERT INTO `diff_time` (`id_1`, `id_2`, `diff`) VALUES(1, NULL, 2);")
    #db.execute("INSERT INTO `diff_time` (`id_1`, `id_2`, `diff`) SELECT `a`.`id`, `b`.`id`, " +
    #           "TIME_TO_SEC(TIMEDIFF(`a`.`datetime`, `b`.`datetime`)) FROM `measurement_points` AS `a` " +
    #           "JOIN `measurement_points` AS `b` ON `a`.`id`-1=`b`.`id` WHERE `a`.`id` > 1;")
    #db.execute("INSERT INTO `diff_time` (`id_1`, `id_2`, `diff`) VALUES(NULL, " +
    #           "(SELECT MAX(id) FROM `measurement_points`), 2);")
    #db.execute("TRUNCATE `diff_buffer`;")
    for table in cache.get('value_types', lambda: []):
        log.info('Make diff_%s' % table['name'])
        db.execute("INSERT INTO `diff_buffer`(`original_id`, `value`) SELECT `id`, `value` FROM `measurements` " +
                   "WHERE device='%s' AND type='%s' AND level=2 ORDER BY `measurement_point_id`;" %
                   (table['original'][0], table['original'][1]))
        break
        #db.execute("INSERT INTO `diff_%s` (`id_1`, `id_2`, `diff`) VALUES(1, NULL, 2);" % table['name'])
        #db.execute("INSERT INTO `diff_%s` (`id_1`, `id_2`, `diff`) SELECT `a`.`id`, `b`.`id`, " % table['name'] +
        #           "`a`.`value`-`b`.`value` FROM `diff_buffer` AS `a` JOIN `diff_buffer` AS `b` " +
        #           "ON `a`.`id` - 1=`b`.`id` WHERE `a`.`id` > 1;")
        #db.execute("INSERT INTO `diff_%s` (`id_1`, `id_2`, `diff`) VALUES(NULL, " % table['name'] +
        #           "(SELECT MAX(id) FROM `diff_buffer`), 2);")
        #
        #db.execute("TRUNCATE `diff_buffer`;")
def fix_stats_monster():
    infura_client = InfuraClient(INFURA_API_URL)
    data_contract = infura_client.getDataContract()
    monster_records = EtheremonDB.EmaMonsterDataTab.objects.filter(
        Q(b0=0) | Q(b1=0) | Q(b2=0) | Q(b3=0) | Q(b4=0) | Q(b5=0)).all()
    for monster in monster_records:
        if monster.monster_id < 32599:
            continue
        base_stats = []
        for index in xrange(0, 6):
            stat_value = data_contract.call().getElementInArrayType(
                DataArrayType.STAT_BASE, monster.monster_id, index)
            base_stats.append(stat_value)
        if 0 in base_stats:
            log.error("fix_monster_invalid_stat|monster_id=%s,base_stats=%s",
                      monster.monster_id, base_stats)
            continue
        monster.b0 = base_stats[0]
        monster.b1 = base_stats[1]
        monster.b2 = base_stats[2]
        monster.b3 = base_stats[3]
        monster.b4 = base_stats[4]
        monster.b5 = base_stats[5]
        monster.exp = 0
        monster.save()
        _sync_monster_id(data_contract, monster.monster_id)
        time.sleep(0.05)
        log.info("fix_monster_stats|monster_id=%s", monster.monster_id)
def crawl_contacts(self):
    """Crawl the list pages of users followed by the group user and write them
    into the table named after the current user id."""
    try:
        self.login()
        # Contacts table; the table name is the current user id
        user_id = config.get('user', 'id')
        contacts = ContactsTable(table_name=user_id)
        c = ContactsList(self.s)
        total_members, total_pages = c.total_members, c.total_pages
        log.info('Following {} users in total; the list has {} pages'.format(c.total_members, c.total_pages))
        #
        for page_num in range(1, total_pages + 1):
            log.info('Progress: [{}/{}]'.format(page_num, total_pages))
            try:
                page_members = c.get_contacts_from_page(page_num)
            except Exception as e:
                raise Exception('Because of {}, failed to crawl pages {} to {}'.format(
                    e, page_num, total_pages))
            else:
                contacts.insert(page_members)
                # Sleep to avoid getting the IP or account banned
                time.sleep(random.randint(3, 20))
    except Exception as e:
        raise Exception('crawl_contacts: {}'.format(e))
def crawl_group(self):
    """Crawl the group member pages."""
    group_name = config.get('group', 'id')
    log.info(group_name)
    group = GroupList()
    if group.total_members == 0 or group.total_pages == 0:
        # Failed to fetch the group member pages
        raise Exception('Group: {}\t total members: {}\t total pages: {}\n'
                        'The account has most likely been banned'.format(
                            group_name, group.total_members, group.total_pages))
    else:
        log.info('Group: {}\t total members: {}\t total pages: {}'.format(
            group_name, group.total_members, group.total_pages))
    # Set the starting position; crawl backwards
    start_page = (lambda page_num: group.total_pages if page_num == -1 else page_num)(int(
        config.get('group', 'start_page')))
    end_page = (lambda page_num: 0 if page_num == -1 else page_num)(int(
        config.get('group', 'end_page')))
    # Start crawling; switch headers and proxies every `step` pages crawled backwards
    step = int(config.get('group', 'skip_page'))
    for page_range in range(start_page, end_page, step):
        # Switch to a new headers/proxies pair
        group = GroupList()
        try:
            self.crawl_group_members(page_range, page_range + step, group)
        except Exception as e:
            raise Exception('crawl_group: {}'.format(e))
def execute(query):
    try:
        log.info("Execute query: %s" % (str(query)))
        return DBConnection.connection().execute(query)
    except Exception, exc:
        log.error("db.execute: %(error)s" % {'error': exc.message})
        return None
def _func(keys, **kwargs):
    if not keys:
        return {}
    keys = list(keys)
    force_query = kwargs.get("force_query", False)
    result_data = {}
    if not force_query:
        cache_key_map = {cache_prefix % key: key for key in keys}
        cached_data_dict = cache.get_cache(cache_name).get_many(
            cache_key_map.keys())
        for cached_key, cached_data in cached_data_dict.iteritems():
            key = cache_key_map[cached_key]
            result_data[key] = cached_data
            keys.remove(key)
        log.info("key_cache_hit|cached_key=%s", ','.join(cached_data_dict.keys()))
    if keys:
        response_data = func(keys)
        if response_data:
            data_to_cache = {
                cache_prefix % key: data
                for key, data in response_data.iteritems()
            }
            cache.get_cache(cache_name).set_many(data_to_cache, expiry_time)
            return dict(result_data.items() + response_data.items())
    return result_data
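# Illustrative sketch (an assumption, not part of the original module) of the same
# read-through pattern _func implements: serve keys from a cache, call a loader only
# for the misses, then backfill the cache. The names below are hypothetical.
def read_through(keys, cache, loader):
    hits = {k: cache[k] for k in keys if k in cache}   # served from the cache
    misses = [k for k in keys if k not in hits]        # still need a real query
    if misses:
        fresh = loader(misses)                         # e.g. a batched DB or API call
        cache.update(fresh)                            # backfill for the next call
        hits.update(fresh)
    return hits

# read_through(["a", "b"], {"a": 1}, lambda ks: {k: 0 for k in ks}) -> {"a": 1, "b": 0}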
def wait_element_clickable(self, loc, img_info, timeout=15, poll_frequency=0.5):
    """
    Wait until an element is clickable.
    :param loc: locator expression for the element
    :param img_info: screenshot file name used when an error occurs
    :param timeout: wait timeout in seconds
    :param poll_frequency: polling frequency
    :return: the element, or a timeout error
    """
    # Record the start time
    start_time = time()
    try:
        ele = WebDriverWait(self.driver, timeout, poll_frequency).until(EC.element_to_be_clickable(loc))
    except Exception as e:
        # Log the error
        log.error("Timed out waiting for element {} to become clickable".format(loc))
        log.exception(e)
        # Take a screenshot of the current error page
        self.screen_shot(img_info)
        raise e
    else:
        # Log the wait time and return the element
        end_time = time()
        log.info('Element {} is clickable, waited {} seconds'.format(loc, end_time - start_time))
        return ele
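# Hypothetical usage sketch (the page object, locator and file name below are made-up
# examples, not the project's real test data):
#
#   from selenium.webdriver.common.by import By
#   login_btn = page.wait_element_clickable((By.ID, 'login-submit'), 'login_click_timeout')
#   login_btn.click()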
def download_media_thumbnail(media_id, url):
    '''
        Downloads an image from a URL and saves it as a local thumbnail
        attached to a Media instance.
    '''
    try:
        media = Media.objects.get(pk=media_id)
    except Media.DoesNotExist:
        # Task triggered but the media no longer exists, do nothing
        return
    width = getattr(settings, 'MEDIA_THUMBNAIL_WIDTH', 430)
    height = getattr(settings, 'MEDIA_THUMBNAIL_HEIGHT', 240)
    i = get_remote_image(url)
    log.info(f'Resizing {i.width}x{i.height} thumbnail to '
             f'{width}x{height}: {url}')
    i = resize_image_to_height(i, width, height)
    image_file = BytesIO()
    i.save(image_file, 'JPEG', quality=85, optimize=True, progressive=True)
    image_file.seek(0)
    media.thumb.save(
        'thumb',
        SimpleUploadedFile(
            'thumb',
            image_file.read(),
            'image/jpeg',
        ),
        save=True
    )
    log.info(f'Saved thumbnail for: {media} from: {url}')
    return True
def get_login_mode(self, option):
    """
    query login mode
    :param option: query option (0: mock, 1: query market price, 2: employee/customer)
    :return: login mode
    """
    log.info('[api] call - GetLoginMode({})'.format(str(option)))
    return self.dynamicCall('GetLoginMode(nOption)', option)
def verify(self):
    assert_data = self.assert_data.split(" ")
    if assert_data[2].isdigit():
        expect = int(assert_data[2])
    else:
        expect = assert_data[2]
    try:
        if assert_data[1] == "==":
            assert eval(assert_data[0]) == expect
        elif assert_data[1] == "<":
            assert eval(assert_data[0]) < expect
        elif assert_data[1] == ">":
            assert eval(assert_data[0]) > expect
        elif assert_data[1] == "in":
            assert expect in eval(assert_data[0])
        elif assert_data[1] == "!=":
            assert eval(assert_data[0]) != expect
        log.info("Checkpoint verification succeeded")
    except Exception as e:
        log.error("Checkpoint verification failed! Expected: {}, actual: {}".format(
            self.assert_data,
            assert_data[0] + " " + assert_data[1] + " " + str(eval(assert_data[0]))))
        raise e
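# Illustrative sketch (not part of the original module) of the "expr op expected" string
# format that verify() parses, e.g. "status_code == 200". The function and variable
# names below are hypothetical, not the project's real API.
import operator

def check_assertion(assert_data, namespace):
    expr, op, expected = assert_data.split(" ")
    expected = int(expected) if expected.isdigit() else expected
    actual = eval(expr, {}, namespace)  # evaluated against caller-supplied variables
    checks = {"==": operator.eq, "!=": operator.ne, "<": operator.lt, ">": operator.gt,
              "in": lambda a, b: b in a}
    return checks[op](actual, expected)

# check_assertion("status_code == 200", {"status_code": 200}) -> True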
def get_login_state(self) -> bool:
    """
    query login state
    :return: login state
    """
    log.info('[api] call - GetLoginState()')
    result = self.dynamicCall('GetLoginState()')
    return bool(result)
def test_new_supercargo(self, init):
    """Add a new supercargo."""
    self.driver = init
    try:
        log.info("-----> Start adding a new supercargo")
        self.driver.find_element(By.ID, allData.get_element_info(0)).click()
        self.driver.find_element(By.ID, allData.get_element_info(1)).click()
        self.driver.find_elements(By.ID, allData.get_element_info(2))[1].click()
        self.driver.find_element(By.ID, allData.get_element_info(3)).click()
        self.driver.find_element(By.ID, allData.get_element_info(4)).send_keys(new_supercargo)
        self.driver.find_element(By.ID, allData.get_element_info(5)).send_keys(new_supercargo_num)
        self.driver.find_element(By.ID, allData.get_element_info(6)).click()
        self.driver.find_element(By.XPATH, reuseData.get_element_info(0)).click()
        self.driver.find_element(By.ID, reuseData.get_element_info(1)).click()
        self.driver.find_elements(By.ID, reuseData.get_element_info(2))[0].click()
        time.sleep(1)
        pageView.adb_tap((110, 260))
        time.sleep(1)
        pageView.adb_tap((668, 46))
        self.driver.find_element(By.ID, reuseData.get_element_info(3)).click()
        self.driver.find_element(By.ID, reuseData.get_element_info(4)).click()
        self.driver.find_element(By.ID, reuseData.get_element_info(5)).click()
        self.driver.find_element(By.ID, reuseData.get_element_info(7)).click()
        time.sleep(3)
    except Exception as e:
        log.error("Unexpected error, details -> : {0}".format(e))
        screen_shot(self.driver, allData.get_id() + '.png')
def release_request_id(self, rid):
    """
    release request id
    :param rid: request id
    :return: void
    """
    log.info('[api] call - ReleaseRqId({})'.format(str(rid)))
    self.dynamicCall('ReleaseRqId(nRqId)', rid)
def get_fid_output_count(self, rid):
    """
    FID count of data inquiry response data
    :param rid: request id
    :return: count of data
    """
    log.info('[api] call - GetFidOutputRowCnt({})'.format(str(rid)))
    return self.dynamicCall('GetFidOutputRowCnt(nRequestId)', rid)
def source_pre_delete(sender, instance, **kwargs):
    # Triggered before a source is deleted, delete all media objects to trigger
    # the Media models post_delete signal
    for media in Media.objects.filter(source=instance):
        log.info(
            f'Deleting media for source: {instance.name} item: {media.name}')
        media.delete()
def comm_init(self) -> bool:
    """
    initialize communication module
    :return: initialize successful
    """
    log.info('[api] call - CommInit()')
    result = self.dynamicCall('CommInit()')
    return result == 0
def get_comm_state(self) -> bool:
    """
    communication module status inquiry
    :return: normal operation status of communication module
    """
    log.info('[api] call - CommGetConnectState()')
    state = self.dynamicCall('CommGetConnectState()')
    return state == 1
def __init__(self, url, browse):
    self.driver_browse(browse)
    log.info("Opened the browser")
    self.driver.get(url)
    log.info("Opened the url")
    self.driver.implicitly_wait(20)
    self.driver.maximize_window()
def post(self):
    response = ""
    try:
        # Parameters are a dict (form data), no file upload
        if self.data_type == "data" and self.upload_file == "":
            response = self.session.post(url=self.url, data=self.parameter)
        # Parameters are a dict (form data), with file upload
        elif self.data_type == "data" and self.upload_file != "":
            response = self.session.post(url=self.url, data=self.parameter, files=self.upload_file)
        # Parameters are json, with file upload
        elif self.data_type == "json" and self.upload_file != "":
            response = self.session.post(url=self.url, json=self.parameter, files=self.upload_file)
        # Parameters are json, no file upload
        elif self.data_type == "json" and self.upload_file == "":
            response = self.session.post(url=self.url, json=self.parameter)
        log.debug("POST request succeeded, request parameters: {}".format(self.parameter))
        log.info("POST request succeeded")
    except Exception as e:
        log.error("POST request failed! Error: {}".format(e))
        log.error("POST request failed! Request parameters: {}".format(self.parameter))
        raise e
    return response.json()
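# Hypothetical usage sketch (the wrapper class name and attribute values below are
# assumptions, not the project's real API): data_type selects a form-encoded or JSON
# body, and a non-empty upload_file switches to a multipart upload.
#
#   req = HttpRequest()                                   # assumed constructor
#   req.url = 'https://example.com/api/login'
#   req.data_type = 'json'                                # 'data' -> form body, 'json' -> JSON body
#   req.parameter = {'user': 'alice', 'password': 'secret'}
#   req.upload_file = ''                                  # or {'file': open('report.pdf', 'rb')}
#   body = req.post()                                     # returns response.json()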
def set_proxies(self):
    p = ProxyTable()
    member = p.fetch_proxy()
    if member is None:
        raise Exception('set_proxies: {}'.format('member is None'))
    else:
        self.proxies = {'http': 'http://' + member.ip_port}
        log.info('{}'.format(self.proxies))
def create_request_id(self):
    """
    create request id
    :return: request id
    """
    request_id = self._api.create_request_id()
    log.info('[create_request_id] Request id created: {}'.format(str(request_id)))
    return request_id
def media_pre_delete(sender, instance, **kwargs):
    # Triggered before media is deleted, delete any scheduled tasks
    log.info(f'Deleting tasks for media: {instance.name}')
    delete_task_by_media('sync.tasks.download_media', (str(instance.pk),))
    thumbnail_url = instance.thumbnail
    if thumbnail_url:
        delete_task_by_media('sync.tasks.download_media_thumbnail',
                             (str(instance.pk), thumbnail_url))
def print_contacts_table(self):
    """Print the crawled contacts (followed users) list data."""
    members = ContactsTable()
    items = members.fetch_all()
    num_existed = len(items)
    log.info('Currently stored: {}'.format(num_existed))
    log.info('Printing the info of one random user')
    members.fetch_one_basic_infos().print_basic_infos()
def print_group_table(self):
    """Print the crawled group member page data."""
    members = MembersTable(table_name=config.get('group', 'id'))
    items = members.fetch_all()
    num_existed = len(items)
    log.info('Currently stored: {}'.format(num_existed))
    log.info('Printing the info of one random user')
    members.fetch_one_basic_infos().print_basic_infos()
def read_xml(file_path):
    element_tree = ElementTree()
    if not os.path.exists(file_path):
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        init(file_path)
        log.info("init file:{}".format(file_path))
    element_tree.parse(file_path)
    return element_tree
def logout(self) -> bool:
    """
    logout
    :return: logout successful
    """
    log.info('[api] call - CommLogout(*)')
    result = self.dynamicCall('CommLogout(sUserId)', self._CREDENTIALS['id'])
    return result == 0
def send(transaction_object) -> bool:
    """
    forward to the message pipe cache
    :param transaction_object: transaction object
    :return: send message successful
    """
    log.info('[send] transaction: {}'.format(str(transaction_object)))
    # send to MQ or storage
    return True
def get_yt_opts():
    opts = copy(_defaults)
    cookie_file = settings.COOKIES_FILE
    if cookie_file.is_file():
        cookie_file_path = str(cookie_file.resolve())
        log.info(f'[youtube-dl] using cookies.txt from: {cookie_file_path}')
        opts.update({'cookiefile': cookie_file_path})
    return opts
def on_agent_event_handler(self, event_type, param, value):
    """
    agent event loop handler
    :param event_type: event type
    :param param: param
    :param value: value
    :return: void
    """
    log.info('[api] on event({}) {} - {}'.format(str(event_type), str(param), str(value)))
def media_post_delete(sender, instance, **kwargs):
    # Schedule a task to update media servers
    for mediaserver in MediaServer.objects.all():
        log.info(f'Scheduling media server updates')
        verbose_name = _('Request media server rescan for "{}"')
        rescan_media_server(
            str(mediaserver.pk),
            priority=0,
            verbose_name=verbose_name.format(mediaserver),
            remove_existing_tasks=True
        )
def calibrateCameraAndLidar():
    res = {"code": "99", "msg": "", "result": {}}
    data2d = []
    data3d = []
    xyz = (0, 0, 0)
    try:
        data = json.loads(request.get_data(as_text=True))
        token = data['token']
        prjId = data['prjId']
        raw3d = data['coordinateData0']
        raw2d = data['coordinateData1']
        xyz = (float(data['BLH']["x"]), float(data['BLH']["y"]), float(data['BLH']["z"]))
        mtx = np.array(data["para"]["mtx"])
        dist = np.array(data["para"]["dist"])
        assert (len(raw2d) == len(raw3d)), "The received coordinate-pair lists have different lengths"
        assert (len(raw2d) >= 6), "Fewer than 6 coordinate pairs received"
        for i in range(0, len(raw2d)):
            tmpraw2dU = float(raw2d[i]["axisX"])
            tmpraw2dV = float(raw2d[i]["axisY"])
            tmpraw3dX = float(raw3d[i]["axisX"])
            tmpraw3dY = float(raw3d[i]["axisY"])
            tmpraw3dZ = float(raw3d[i]["axisZ"])
            data2d.append((tmpraw2dU, tmpraw2dV))
            data3d.append((tmpraw3dX, tmpraw3dY, tmpraw3dZ))
        log.info("Successful transfer of points")
    except Exception as e:
        log.error(e)
        res['msg'] = 'The server receives a bad json'
        return get_result_response(EasyDict(res))
    log.info("ip:{}".format(request.remote_addr))
    rotM, tvec, rvec, Cx, Cy, Cz, thetaX, thetaY, thetaZ = calibrate_camera_and_lidar(
        xyz, data2d, data3d, mtx, dist)
    result = {
        "rotM": list(map(np.ndarray.tolist, rotM)),
        "tvec": list(map(np.ndarray.tolist, tvec)),
        "rvec": list(map(np.ndarray.tolist, rvec)),
        "Cx": Cx.tolist()[0],
        "Cy": Cy.tolist()[0],
        "Cz": Cz.tolist()[0],
        "thetaX": thetaX,
        "thetaY": thetaY,
        "thetaZ": thetaZ
    }
    res["result"] = result
    res["code"] = "00"
    res["msg"] = "Success"
    return get_result_response(EasyDict(res))
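# Illustrative request payload for calibrateCameraAndLidar. The field names come from the
# parsing code above; the concrete values and the exact shapes of "mtx" and "dist" are
# assumptions, not confirmed by the source.
#
#   {
#     "token": "...", "prjId": "...",
#     "BLH": {"x": "0", "y": "0", "z": "0"},
#     "para": {"mtx": [[...], [...], [...]], "dist": [...]},
#     "coordinateData0": [{"axisX": "...", "axisY": "...", "axisZ": "..."}, ...],   # >= 6 lidar points
#     "coordinateData1": [{"axisX": "...", "axisY": "..."}, ...]                    # matching image points
#   }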
def register_real(self) -> bool:
    """
    subscribe to real register api
    :return: subscribe successful
    """
    log.info('[api] call - RegisterReal({}, {})'.format(
        str(self._REAL_NAME), str(self._SYMBOL)))
    result = self.dynamicCall('RegisterReal(strRealName, strRealKey)',
                              self._REAL_NAME, self._SYMBOL)
    return result == 0
def filter_before():
    """
    Filter data before reordering so that the data can be merged.
    Prefiltering is supposed to exclude data files in which the same UT time occurs.
    """
    log.info("Data prefiltering")
    truncate = "TRUNCATE TABLE %(table_name)s"
    db.execute(truncate % {'table_name': ShortDiffNACS.__tablename__})
    db.execute(truncate % {'table_name': ShortDiffWATS.__tablename__})
    db.execute(truncate % {'table_name': BasicReorderNACS.__tablename__})
    db.execute(truncate % {'table_name': BasicReorderWATS.__tablename__})
def make_order():
    """
    Making order after prefiltering of data
    """
    ordering_query = 'INSERT INTO `%(destination_table)s` (%(fields_insert)s) SELECT %(fields_select)s FROM ' + \
                     '`%(source_table)s` as `st` JOIN `source_files` as `sf` ON `st`.`source_id`=`sf`.`id` ' + \
                     'WHERE `sf`.`ignored`=0 ORDER BY DATE_ADD(CONCAT(FROM_DAYS(TO_DAYS(CONCAT(`year`, ' + \
                     '"-01-01")) + `day_of_year` - 1), " 00:00:00"), INTERVAL ut/1000 SECOND_MICROSECOND) ASC;'
    nacs_select = ["st.id", "source_id",
                   SQLCommand("DATE_ADD(CONCAT(FROM_DAYS(TO_DAYS(CONCAT(`year`, '-01-01')) + "
                              "`day_of_year` - 1), ' 00:00:00'), INTERVAL ut/1000 SECOND_MICROSECOND)"),
                   SQLCommand("`ut` %% 1000"),
                   "year", "day_of_year", "ut", "orbit", "o_density", "o_density_err", "n2_density",
                   "n2_density_err", "he_density", "he_density_err", "n_density", "n_density_err",
                   "ar_density", "ar_density_err", "alt", "lat", "long", "lst", "lmt", "l_sh", "inv_lat", "sza"]
    nacs_insert = ["original_id", "source_id", "date_general", "date_ms", "date_original_year",
                   "date_original_day_of_year", "date_original_ut", "orbit", "o_density", "o_density_err",
                   "n2_density", "n2_density_err", "he_density", "he_density_err", "n_density", "n_density_err",
                   "ar_density", "ar_density_err", "alt", "lat", "long", "lst", "lmt", "l_sh", "inv_lat", "sza"]
    wats_select = ["st.id", "source_id",
                   SQLCommand("DATE_ADD(CONCAT(FROM_DAYS(TO_DAYS(CONCAT(`year`, '-01-01')) + "
                              "`day_of_year` - 1), ' 00:00:00'), INTERVAL ut/1000 SECOND_MICROSECOND)"),
                   SQLCommand("`ut` %% 1000"),
                   "year", "day_of_year", "ut", "mode", "mode_horizontal", "slot", "outin", "mass", "density",
                   "tn", "tn_correction", "v_s", "c1", "c2", "t1", "t2", "v_geo", "v_geo_correction", "orbit",
                   "altitude", "latitude", "longitude", "lst", "lmt", "l", "inv_lat", "sza"]
    wats_insert = ["original_id", "source_id", "date_general", "date_ms", "date_original_year",
                   "date_original_day_of_year", "date_original_ut", "mode", "mode_horizontal", "slot", "outin",
                   "mass", "density", "tn", "tn_correction", "v_s", "c1", "c2", "t1", "t2", "v_geo",
                   "v_geo_correction", "orbit", "altitude", "latitude", "longitude", "lst", "lmt", "l",
                   "inv_lat", "sza"]
    db.execute(ordering_query % {
        'destination_table': BasicReorderNACS.__tablename__,
        'source_table': NeutralGasNACSnT1s.__tablename__,
        'fields_insert': prepare_fields(nacs_insert),
        'fields_select': prepare_fields(nacs_select)
    })
    db.execute(ordering_query % {
        'destination_table': BasicReorderWATS.__tablename__,
        'source_table': NeutralGasWATSnTv2s.__tablename__,
        'fields_insert': prepare_fields(wats_insert),
        'fields_select': prepare_fields(wats_select)
    })
    log.info("Making order in satellite data")
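# Illustrative sketch (an assumption, not part of the original pipeline) of what the SQL
# date expression above appears to compute: a full timestamp built from `year`,
# `day_of_year` and `ut` (milliseconds since midnight), with the millisecond remainder
# stored separately as date_ms.
from datetime import datetime, timedelta

def reorder_timestamp(year, day_of_year, ut_ms):
    base = datetime(year, 1, 1) + timedelta(days=day_of_year - 1)
    return base + timedelta(seconds=ut_ms / 1000.0), ut_ms % 1000

# reorder_timestamp(1982, 32, 3600500) -> (datetime(1982, 2, 1, 1, 0, 0, 500000), 500)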
def make_conversion(data_type, chunk_size, do_search=True):
    count = s.query(data_type).count()
    log.info("%i elements to be converted" % count)
    iterations = count / chunk_size
    if count % chunk_size:
        iterations += 1
    for i in range(0, iterations):
        data = convert(s.query(data_type).slice(i * chunk_size, (i + 1) * chunk_size - 1).all(), do_search)
        for item in data:
            s.add(item)
        s.commit()
def main():
    infoList = []
    oldtime = time.time()
    infoList += getExtraPageInfo(40)
    for info in infoList:
        try:
            table.InsertItemDict(ctable, info)
            # print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(info['loadtime'])), info['title']
        except:
            logging.error('encoding not supported')
    msg = 'sina has crawled %s records, time cost: %s (seconds)' % (len(infoList), time.time() - oldtime)
    print msg
    log.info(msg)
def parse_all():
    log.info('Parsing `plasma lang`')
    walk('%s/plasma_lang/Ne_Te_500ms_ascii/' % data_path, 'asc', True, NeTe500Ms)
    log.info('Parsing `neutral gas nacs`')
    walk('%s/neutral_gas_nacs/n_T_1s_ascii/data/' % data_path, 'asc', True, NT1s)
    log.info('Parsing `neutral gas wats`')
    walk('%s/neutral_gas_wats/n_T_v_2s_ascii/' % data_path, 'asc', True, NTV2s)
    log.info('DONE')
def main_single():
    infoList = []
    oldtime = time.time()
    # pages range from 0 to 5, each representing a different category
    for page in range(0, 6):
        infoList += getPageInfo(page)
    for info in infoList:
        try:
            table.InsertItemDict(ctable, info)
            # print info['loadtime'], info['title']
        except:
            logging.error('encoding not supported')
    msg = 'qq has crawled %s records, time cost: %s (seconds)' % (len(infoList), time.time() - oldtime)
    print msg
    log.info(msg)
def filter_after():
    """
    Filter data after reordering so that the data can be merged.
    Not sure what else should be done here, but keep it just in case.
    """
    ## Compute difference between timestamps
    diff_insert = ["first_original_id", "first_source_id", "second_original_id", "second_source_id", "time_diff"]
    diff_select = ["e1.original_id", "e1.source_id", "e2.original_id", "e2.source_id",
                   SQLCommand("TIME_TO_SEC(TIMEDIFF(%s, %s)) * 1000 + %s - %s" %
                              (q("e2.date_general"), q("e1.date_general"), q("e2.date_ms"), q("e1.date_ms")))]
    log.info("Data postfiltering")
    short_diff_maker = "INSERT INTO %(diff_destination)s (%(diff_insert)s) SELECT %(diff_select)s " + \
                       "FROM %(diff_source)s as `e2` JOIN %(diff_source)s as `e1` ON `e1`.`id` = `e2`.`id` - 1 " + \
                       " WHERE `e2`.`id` > 1;"
    db.execute(short_diff_maker % {
        'diff_destination': q(ShortDiffNACS.__tablename__),
        'diff_insert': prepare_fields(diff_insert),
        'diff_select': prepare_fields(diff_select),
        'diff_source': q(BasicReorderNACS.__tablename__)
    })
    db.execute(short_diff_maker % {
        'diff_destination': q(ShortDiffWATS.__tablename__),
        'diff_insert': prepare_fields(diff_insert),
        'diff_select': prepare_fields(diff_select),
        'diff_source': q(BasicReorderWATS.__tablename__)
    })
    ## Select zero-diff elements
    fetcher = "SELECT `first_source_id`, `second_source_id` FROM %(table_name)s WHERE `time_diff` = 0;"
    source = []
    for i in db.execute(fetcher % {'table_name': ShortDiffNACS.__tablename__}).fetchall():
        source.extend([str(i[0]), str(i[1])])
    for i in db.execute(fetcher % {'table_name': ShortDiffWATS.__tablename__}).fetchall():
        source.extend([str(i[0]), str(i[1])])
    source_ids = []
    for i in source:
        if i not in source_ids:
            source_ids.append(i)
    ## Mark files
    if len(source_ids) > 0:
        db.execute("UPDATE `source_files` SET `ignored`=1 WHERE id IN (%s)" % (', '.join(source_ids)))
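# Illustrative sketch (an assumption, not part of the original pipeline) of what the
# self-join above computes: the time difference between each row and its predecessor in
# milliseconds; a zero diff flags both source files involved so they can be ignored.
def consecutive_diffs_ms(rows):
    """rows: list of (source_id, datetime, ms) tuples ordered by id."""
    flagged = set()
    for (sid1, dt1, ms1), (sid2, dt2, ms2) in zip(rows, rows[1:]):
        diff = int((dt2 - dt1).total_seconds()) * 1000 + ms2 - ms1
        if diff == 0:
            flagged.update((sid1, sid2))   # same UT time in two files -> ignore both
    return flagged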
def main():
    infoList = []
    oldtime = time.time()
    pool = process_dummy.Pool()  # default pool size is the number of cores
    for page in range(1, 4):
        infoList += getPageInfo(page, pool)
    pool.close()
    pool.join()
    for info in infoList:
        try:
            table.InsertItemDict(ctable, info)
            # print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(info['loadtime'])), info['title']
        except:
            logging.error('encoding not supported')
    msg = 'ifeng has crawled %s records, time cost: %s (seconds)' % (len(infoList), time.time() - oldtime)
    print msg
    log.info(msg)
def main():
    infoList = []
    oldtime = time.time()
    pool = process_dummy.Pool()  # default is cpu_count()
    results = pool.map(getMainPageInfo, categories.iterkeys())
    pool.close()
    pool.join()
    for result in results:
        infoList += result
    for info in infoList:
        try:
            table.InsertItemDict(ctable, info)
            # print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(info['loadtime'])), info['title']
        except:
            logging.error('encoding not supported')
    msg = 'sohu has crawled %s records, time cost: %s (seconds)' % (len(infoList), time.time() - oldtime)
    print msg
    log.info(msg)
def main_map():
    # A multiprocessing implementation of main(); about 3 pages are updated every hour.
    # Uses cpu_count() processes by default; roughly 4 times faster than the single-process version.
    infoList = []
    oldtime = time.time()
    #pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool = multiprocessing.Pool()  # defaults to cpu_count() processes
    results = pool.map(getPageInfo, range(0, 6))
    pool.close()
    pool.join()
    for result in results:
        infoList += result
    for info in infoList:
        try:
            # table.InsertItemDict(ctable, info)
            print info['loadtime'], info['title']
        except:
            logging.error('encoding not supported')
    msg = 'qq has crawled %s records, time cost: %s (seconds)' % (len(infoList), time.time() - oldtime)
    print msg
    log.info(msg)
def merge():
    """
    Merge the NACS and WATS data together.
    The question is how to fit the NACS data (1 s resolution) to the WATS data (2 s resolution).
    """
    s = db.session()

    def make_conversion(data_type, chunk_size, do_search=True):
        count = s.query(data_type).count()
        log.info("%i elements to be converted" % count)
        iterations = count / chunk_size
        if count % chunk_size:
            iterations += 1
        for i in range(0, iterations):
            data = convert(s.query(data_type).slice(i * chunk_size, (i + 1) * chunk_size - 1).all(), do_search)
            for item in data:
                s.add(item)
            s.commit()

    chunk_size = 1000
    make_conversion(BasicReorderNACS, chunk_size, False)
    make_conversion(BasicReorderWATS, chunk_size)
    s.close()
    log.info("Merging data")
def main():
    # A multiprocessing.dummy implementation of main(); about 3 pages are updated every hour.
    # Uses cpu_count() threads by default.
    infoList = []
    oldtime = time.time()
    #pool = multiprocessing.Pool(multiprocessing.cpu_count())
    # try:
    pool = process_dummy.Pool()  # defaults to cpu_count() threads
    results = pool.map(getPageInfo, range(0, 6))
    pool.close()
    pool.join()
    for result in results:
        infoList += result
    for info in infoList:
        try:
            table.InsertItemDict(ctable, info)
            # print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(info['loadtime'])), info['title']
        except:
            logging.error('encoding not supported')
    # except:
    #     print 'error on distributing tasks'
    msg = 'qq has crawled %s records, time cost: %s (seconds)' % (len(infoList), time.time() - oldtime)
    print msg
    log.info(msg)
def resample():
    """NACS data resampling from 1 s to 2 s to match the WATS data.
    Perhaps it would be more logical to work from the WATS side instead.
    """
    def merge_pair(mp1, mp2=None):
        """Merge a pair of MeasurementPoints.

        Check whether the point has a neighbour within a time difference of about 2 seconds.
        If it doesn't, take the values of the original point; otherwise take the middle value
        of this point and its neighbour.

        Keyword arguments:
        mp1 -- 'nacs' measurement point conjuncted with 'wats'
        mp2 -- neighbour measurement point. Default 'None' means the neighbour does not exist

        Return: mp1
        """
        def find_value_model(mp, type, level=1, device='nacs'):
            """Find the measurement from mp2 conjuncted with a measurement from mp1.

            Keyword arguments:
            mp -- measurement point (mp2)
            type -- type of the measurement from mp1
            level -- level of the measurement from mp1
            device -- device of the measurement from mp1

            Return: the Measurement object conjuncted with the selected value
            """
            for value_model in mp.data:
                if value_model.type == type and value_model.level == level and value_model.device == device:
                    return value_model
            return None

        update = []
        if not mp2 or (mp1.datetime.python_type() - mp2.datetime.python_type()).seconds > 3:
            if not mp2:
                log.debug("[wats:%i:%s] edge point does not exist" % (mp1.id, str(mp1.datetime)))
            else:
                log.debug("[wats:%i:%s]&[wats:%i:%s] are too far apart in the time dimension" %
                          (mp1.id, str(mp1.datetime), mp2.id, str(mp2.datetime)))
            for measurement in mp1.data:
                if measurement.device == 'nacs':
                    nm = Measurement(measurement)
                    nm.level = 2
                    update.append(nm)
        else:
            log.debug("[wats:%i:%s]&[wats:%i:%s] are going to be resampled" %
                      (mp1.id, str(mp1.datetime), mp2.id, str(mp2.datetime)))
            for measurement in mp1.data:
                ms = find_value_model(mp2, measurement.type)
                nm = Measurement(measurement)
                nm.level = 2
                nm.value = (nm.value + ms.value) / 2
                nm.error = (nm.error + ms.error) / 2
                nm.correction = (nm.correction + ms.correction) / 2
                update.append(nm)
        mp1.data.extend(update)
        session_instance.commit()

    session_instance = db.session()
    ids_ = session_instance.query(Measurement.measurement_point_id).filter(Measurement.device == 'wats').all()
    ids = []
    for i in ids_:
        if i[0] not in ids:
            ids.append(i[0])
    chunk_size = 100
    iterations = [ids[i * chunk_size:(i + 1) * chunk_size] for i in range(0, len(ids) / chunk_size)]
    log.info("WATS data in %i elements going to be processed in %i iterations" % (len(ids), len(iterations)))
    for points in iterations:
        log.info("Processing ids in range [%s..%s](%i)" % (str(points[0]), str(points[-1]), len(points)))
        extended_points = points
        extended_points.extend([j - 1 for j in points])
        data = session_instance.query(MeasurementPoint).join(Measurement).\
            filter(Measurement.type == 'nacs').filter(MeasurementPoint.id.in_(extended_points)).\
            order_by(Measurement.measurement_point_id).order_by(Measurement.type).all()
        data = {row.id: row for row in data}
        for key, row in data.items():
            if key in points:
                merge_pair(row, data.get(key - 1, None))
        #session_instance.commit()
    # Generating level 2 for 'wats' measurements
    db.execute("INSERT INTO `measurements` (`measurement_point_id`, `device`,`type`, `level`, `value`, `error`, " +
               "`correction`) SELECT `measurement_point_id`, `device`,`type`, 2, `value`, `error`, `correction` " +
               "FROM `measurements` WHERE `device`='wats';")
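# Illustrative sketch (not part of the original module) of the per-measurement rule that
# merge_pair applies: with a valid neighbour, value/error/correction are replaced by the
# pair's average; without one, the original values are copied through unchanged. The
# dicts below are hypothetical stand-ins for Measurement rows.
def merge_values(current, neighbour=None):
    if neighbour is None:
        return dict(current)
    return {key: (current[key] + neighbour[key]) / 2.0
            for key in ('value', 'error', 'correction')}

# merge_values({'value': 4.0, 'error': 0.2, 'correction': 0.0},
#              {'value': 6.0, 'error': 0.4, 'correction': 0.0})
# -> {'value': 5.0, 'error': 0.3, 'correction': 0.0}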