def render(categoryId):
    """Render the HTML page for *categoryId* if the category exists.

    Looks the category up via Management; on success builds a depth-limited
    query, fetches the matching rows and writes them out with the configured
    template. On failure prints the lookup message instead.

    The original wrapped the whole body in ``try/except Exception as e:
    raise e`` — a no-op wrapper that only truncated the traceback; it has
    been removed so exceptions propagate with their full context.
    """
    managment = Management()
    message = managment.existCategory(categoryId)
    # existCategory() returns 1 when the category exists; anything else is
    # presumably an error/status message -- confirm against Management.
    if message == 1:
        util = Utils('')
        # 'level-limit' bounds the category-tree depth used by the query.
        level = int(config.get('keys', 'level-limit'))
        template = config.get('template', 'folder')
        query = util.getQuery(level)
        all_rows = managment.getCategory(categoryId, query)
        util.setHtml(categoryId, all_rows, level, template)
    else:
        # Surface the non-success message to the operator.
        print(message)
def __rebuild():
    """Rebuild the category table from the remote source.

    Fetches the root category page in a worker thread, expands the partial
    lists to the configured depth, then recreates the table and bulk-inserts
    the flattened category list.
    """
    categoryList = []
    categoryPartList = []
    # Bug fix: config values are strings, so the original comparison
    # ``level_limit > 1`` was always true on Py2 and a TypeError on Py3.
    # Cast to int, matching how render() reads the same key.
    level_limit = int(config.get('keys', 'level-limit'))
    parent = config.get('keys', 'parent-id')
    pool = int(config.get('threads', 'pool'))  # NOTE(review): read but unused here
    if level_limit > 1:
        # Fetch the root page on a worker thread and wait for it.
        thread = Request(parent, 1, 0)
        thread.start()
        thread.join()
        if thread.response is not None:
            categoryPartList = getCategoryPartLists(parent, thread.response)
            thread.response = None  # release the (possibly large) response body
            if categoryPartList:
                categoryList.extend(
                    getFullList(categoryPartList, parent, level_limit))
    categories = categoryList
    managment = Management()
    managment.createTable()
    util = Utils('')
    categories = util.listToInsert(categories)
    # NOTE(review): 'isertList' typo is part of Management's public API.
    managment.isertList(categories)
diff = 0 for i in xrange(k): diff += distance.euclidean(newCentroids[i], oldCentroids[i]) logger.debug( 'means total diff %f:' % diff) if diff < delta: break else: oldCentroids[:] = newCentroids np.savetxt(centroidsinputfile, newCentroids) logger.debug( 'total time: %f' %total) #Calculate SSE SSE = Utils.calcSSE(points, newCentroids) logger.info('%.f' % SSE) logger.debug( 'Sum of Squared Error: %s' % SSE) #per data batch #finds the distance to nearest cluster distmatrix = distance.cdist(points, newCentroids, metric='euclidean') labels = distmatrix.argmin(axis=1) #Local #points = np.loadtxt(inputfile1) # Plot.plotPoints(points, labels, title='final kmeans2') # Plot.plotMeans(newCentroids) # strnow = datetime.now().strftime("%Y-%m-%d_%H%M%S") # plt.savefig('%s_%s.png' % (strnow,fileoutpostfix))
def svm_estimation(df_gender):
    """Train an SVC on the gender data frame and print its test accuracy."""
    # Utils.split_data owns the train/test split policy.
    features_train, features_test, labels_train, labels_test = (
        Utils.split_data(df_gender))
    classifier = svm.SVC(gamma='scale')
    classifier.fit(features_train, labels_train)
    predictions = classifier.predict(features_test)
    accuracy = accuracy_score(labels_test, predictions)
    print("SVM acc: ", accuracy)
def onEdit(self, event):
    # Write a textual description of the edit event (built by the util
    # helper) into the bound control widget.
    self.control.write(str(util.obtainEventInfo(event)))
else: params.epochs = 1 params.patience = 1 params.dropout = 0.5 params.use_pickle = True params.save_loc = "." params.outfile = 'gtsrb_kaggle.csv' #params.train_pickle = params.save_loc + '/train_balanced_preprocessed.p' params.train_pickle = params.save_loc + '/train.p' params.extra_debug = False from util import Utils utils = Utils() from model import IDSIANetwork # In[16]: class Trainer: def __init__(self, params, train_data=None, val_data=None): self.params = params self.train_data = train_data self.val_data = val_data
from util import Utils as ut
from shann import SHANN

# Sequence-length caps used for padding/truncation.
max_len_doc_sents = 20  # avg 29
max_len_summ_sents = 4
max_len_doc_sent_words = 25  # avg 28
max_len_summ_sent_words = 15
padding_val = 0.
pos_pairs = 32
neg_pairs = 32
path_models = "./chkp_models/"
name_models = "model_"
# Word2vec embeddings; d is the embedding dimensionality fed to the model.
w2v = ut.load_word2vec("../../Embeddings/cnndaily_w2v.model")
d = w2v.vector_size
# Target similarity labels for positive / negative pairs.
similar_val = 0.9999
non_similar_val = 0.0001
steps_per_epoch = 500  # 196961 samples
epochs = 200
validation_steps = 150
shann_obj = SHANN(max_len_doc_sents, max_len_doc_sent_words,
                  max_len_summ_sents, max_len_summ_sent_words, d,
                  path_models, name_models)
shann_obj._set_model()
# NOTE(review): train_file points at dev.csv (same as dev_file) --
# looks like a smoke-test setting; confirm before a real training run.
train_file = "../../Corpora/CNNDM/dev.csv"
dev_file = "../../Corpora/CNNDM/dev.csv"
x_tr, y_tr = ut.load_csv_samples(train_file)
x_dv, y_dv = ut.load_csv_samples(dev_file)
X = Personality.normalize(df) y = df[df.columns[-1:]] reg.fit(X, y) pickle.dump(reg, open("resources/LinearRegression_ext_v2.sav", 'wb')) @staticmethod def normalize(df): X = df.iloc[:, 0:-1] # independent columns X = np.log(X + 1) X = (X - X.min()) / (X.max() - X.min()) X.fillna(0, inplace=True) return X if __name__ == '__main__': util = Utils() PERSONALITY = Personality() df = PERSONALITY.get_data(labels=['userid', 'ext']) df = df.filter( ['positive', 'negative', 'anger_x', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they', 'ipron', 'future', 'affect', 'posemo', 'negemo', 'anx', 'incl', 'work', 'death', 'assent', 'nonfl', 'Quote', 'Apostro', 'ext'], axis=1) reg = linear_model.LinearRegression() X = df.iloc[:, 0:-1] # independent columns X = np.log(X + 1) X = (X - X.min()) / (X.max() - X.min()) X.fillna(0, inplace=True) y = df[df.columns[-1:]]
class ajkLoadDataAndInsert(): city_list = [] user_agents = [] headers = {} utils = Utils() list_data = [] ips = [] ipIndex = 0 ip = {} PROXYNAME = 'ipProxy' COLUMENAME = 'active_ajk_sec' def __init__(self): self.Logger = Logger('getAjkData') self.user_agents = Headers().user_agents self.headers = Headers().headers self.cfg = self.utils.pathToConfig() self.mysql = Mysql(self.cfg.get('DB', 'DBHOST'), int(self.cfg.get('DB', 'DBPORT')), self.cfg.get('DB', 'DBUSER'), self.cfg.get('DB', 'DBPWD'), 3, 5) def load_detail_info_sec(self): self.Logger.Info(u'>>>>> 开始抓取详细数据 <<<<<') self.ip = self.ips[0] for city in self.city_list: for page in range(0, int(city['ajk_sec_pages'])): city_list_url = city['ajk_sec_url'].replace( '?from=navigation', 'p' + str(int(page) + 1) + '/#filtersort') self.Logger.Info(u'>>>>> 开始抓取:' + city['city_name'] + '|url:' + str(city_list_url) + '|ip:' + self.ip['ip'] + '<<<<<') oneCityGetDown = True while oneCityGetDown: try: self.Logger.Info(u'>>>>> 使用ip:' + str(self.ip['ip']) + '<<<<<') proxies = { 'http': self.ip['ip'], 'https': self.ip['ip'] } head = self.headers head['user-agent'] = random.choice(self.user_agents) r = requests.get(city_list_url, timeout=10, proxies=proxies, headers=head) time.sleep(random.random() * 10) soup = BeautifulSoup(r.text, "html.parser") title = soup.find('title').get_text() if '二手房' in title: self.Logger.Info(u'>>>>> ip:' + str(self.ip['ip']) + u'可用|' + title + '<<<<<') list = soup.find(attrs={ 'id': 'houselist-mod-new' }).find_all('li') for l in list[0:]: oneDetailGetDown = True while oneDetailGetDown: house_title = l.find(attrs={ 'class': 'house-title' }).find('a').attrs['title'].strip() price = l.find(attrs={ 'class': 'price-det' }).get_text().strip() try: detail_url = l.find( attrs={ 'class': 'house-title' }).find('a').attrs['href'] self.Logger.Info( u'>>>>> 开始抓取:' + house_title + '|' + detail_url.split('view/') [1].split('?')[0] + '|ip:' + self.ip['ip'] + u'|数据<<<<<') proxies = { 'http': self.ip['ip'], 'https': 
self.ip['ip'] } head['user-agent'] = random.choice( self.user_agents) r_detail = requests.get( detail_url.split('now_time')[0], timeout=10, proxies=proxies, headers=head) time.sleep(random.random() * 20) soup_detail = BeautifulSoup( r_detail.text, "html.parser") title_detail = soup_detail.find( 'title').get_text() if '58安居客' in title_detail and '访问验证' not in title_detail: try: self.Logger.Info( u'>>>>> 开始从列表页获取详情中需要的数据|' + title_detail + '<<<<<') detail_dict = self.get_data( soup_detail) detail_dict['city_id'] = city[ 'city_id'] detail_dict[ 'city_name'] = city[ 'city_name'] detail_dict['source'] = 'ajk' detail_dict[ 'house_id'] = detail_url.split( 'view/')[1].split( '?')[0] detail_dict[ 'link_url'] = detail_url.split( '?')[0] detail_dict[ 'title'] = house_title detail_dict[ 'price'] = self.utils.str_to_num( price) oneDetailGetDown = False self.insert_update_data( detail_dict) except BaseException, e: self.Logger.Info( u'>>>>> 从列表页获取详情中需要的数据出错' + str(e) + '<<<<<') elif '可能被删除' in title_detail: self.Logger.Info(u'>>>>> 该链接失效|' + title_detail + '<<<<<') oneDetailGetDown = False else: self.Logger.Info( u'>>>>> ip for detail:' + str(self.ip['ip']) + u'不可用|' + str(title_detail) + '<<<<<') result_ip = self.utils.get_active_ip( self.ips, self.ip, self.Logger, self.PROXYNAME, self.mysql) self.ip = result_ip['active_ip'] self.ips = result_ip['ips'] except BaseException, e: self.Logger.Info( u'>>>>> ip for detail:' + str(self.ip['ip']) + u'不可用,超时|' + str(e) + '<<<<<') result_ip = self.utils.get_active_ip( self.ips, self.ip, self.Logger, self.PROXYNAME, self.mysql) self.ip = result_ip['active_ip'] self.ips = result_ip['ips'] oneCityGetDown = False self.Logger.Info(u'>>>>> ========== city:' + city['city_name'] + u'第' + str(int(page) + 1) + u'页' + u'抓取完成 ========== <<<<<') else:
def __init__(self, driver, username):
    # Keep the webdriver, a Utils helper bound to it, and our own handle.
    self.driver = driver
    self.utils = Utils(driver)
    self.username = username
def go(self):
    """Log in to mp.dayu.com, upload a video with title/tags/cover and
    publish it, then scrape the article/video ids of the new post.

    Sets self.msg and returns early on quota/clickbait rejections;
    sets self.status, self.aid, self.vid on success.
    """
    # Accounts containing '@' log in directly on dayu; others go through
    # the Youku partner-login flow.
    user_type = self.account.split('@')
    self.browser.maximize_window()
    if len(user_type) > 1:
        self.browser.get("https://mp.dayu.com/")
        time.sleep(3)
        # The login form lives inside an iframe.
        iframe = self.browser.find_element_by_tag_name("iframe")
        self.browser.switch_to.frame(iframe)
        self.browser.find_element_by_xpath("//input[@id='login_name']").send_keys(self.account)
        self.browser.find_element_by_xpath("//input[@id='password']").send_keys(self.password)
        # Drag the slider captcha in small randomized steps.
        slide = self.browser.find_element_by_xpath("//span[@id='nc_1_n1z']")
        action_chain = self.action_list(300)
        ActionChains(self.browser).click_and_hold(slide).perform()
        for action in action_chain:
            time.sleep(0.1)
            ActionChains(self.browser).move_by_offset(xoffset=action, yoffset=0).perform()
        ActionChains(self.browser).release().perform()
        # Wait for the "verification passed" banner before submitting.
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//b[contains(string(), '验证通过')]")))
        self.browser.find_element_by_xpath("//input[@id='submit_btn']").click()
    else:
        # Youku partner login, redirected back to mp.dayu.com.
        self.browser.get('https://account.youku.com/partnerLogin.htm?pid=20170512PLF000867&callback=https%3A%2F%2Fmp.dayu.com%2Fyt-login-callback%3Fredirect_url%3D')
        time.sleep(3)
        self.browser.find_element_by_xpath('//*[@id="YT-ytaccount"]').send_keys(self.account)
        self.browser.find_element_by_xpath('//*[@id="YT-ytpassword"]').send_keys(self.password)
        self.browser.find_element_by_xpath('//*[@id="YT-nloginSubmit"]').click()
        time.sleep(3)
        try:
            # Dismiss an optional post-login dialog if present.
            self.browser.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div/button[1]').click()
            time.sleep(2)
        except Exception:
            pass
    self.browser.maximize_window()
    self.cookies = self.get_cookies()
    # Navigate to the video-upload page (menu -> video write).
    time.sleep(1)
    self.browser.implicitly_wait(30)
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//a[@id='w-menu-']"))).click()
    time.sleep(2)
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//a[@data-path='/dashboard/video/write']"))).click()
    time.sleep(1)
    # ----- end -----
    try:
        # Second attempt at the same navigation, tolerated to fail --
        # presumably a retry in case the first click did not land.
        time.sleep(1)
        self.browser.implicitly_wait(30)
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//a[@id='w-menu-']"))).click()
        time.sleep(2)
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//a[@data-path='/dashboard/video/write']"))).click()
        time.sleep(1)
        # ----- end -----
    except Exception as e:
        pass
    # Video upload starts here.
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='article-write_video-container-upload-local']"))).click()
    time.sleep(1)
    Utils.upload(self.video_path)
    time.sleep(2)
    try:
        # Retry feeding the path to the native file dialog.
        Utils.upload(self.video_path)
    except Exception as e:
        print(e)
        print("第二次打视频地址")
    # Remaining-upload quota counter; 0 means we cannot publish today.
    if int(self.browser.find_element_by_xpath(
            '/html/body/div[1]/div[4]/div/div[2]/div/div/div[2]/div/div[2]/span').text) == 0:
        self.msg = "次数不足"
        return
    # ---- end -----
    # Wait (up to 3 min) for "upload succeeded, processing".
    WebDriverWait(self.browser, 180).until(
        EC.presence_of_element_located((By.XPATH, "//p[contains(string(), '视频上传成功,处理中')]")))
    time.sleep(2)
    # Clear the title field and type the new title (backspaces flush any
    # auto-filled text).
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='w-form-field-content']/input"))).clear()
    self.browser.find_element_by_xpath("//div[@class='w-form-field-content']/input").clear()
    time.sleep(0.1)
    self.browser.find_element_by_xpath("//div[@class='w-form-field-content']/input").send_keys([Keys.BACKSPACE for i in range(1, 100)] + [i for i in self.title])
    time.sleep(0.1)
    try:
        # Scroll to the bottom so the following fields are interactable.
        js = "var q=document.documentElement.scrollTop=100000"
        self.browser.execute_script(js)
    except Exception as e:
        print(e)
        print("滑动至底部失败")
    # Description textarea (note: filled with self.title, not a separate
    # description -- presumably intentional; confirm).
    self.browser.find_element_by_xpath("//div[@class='w-form-field-content']/textarea").clear()
    time.sleep(0.1)
    self.browser.find_element_by_xpath("//div[@class='w-form-field-content']/textarea").send_keys([Keys.BACKSPACE for i in range(1, 100)] + [i for i in self.title])
    time.sleep(0.1)
    self.build_tags(self.tags)
    try:
        js = "var q=document.documentElement.scrollTop=100000"
        self.browser.execute_script(js)
    except Exception as e:
        print(e)
        print("滑动至底部失败")
    # Pick the video category.
    try:
        self.browser.find_element_by_xpath("//div[@class='widgets-selects_container']").click()
    except Exception as e:
        print("选择分类失败")
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='widgets-selects_select_container']/a[contains(string(), '{}')]".format(self.video_type)))).click()
    # Cover image: scroll the cover widget into view and hover it.
    image_div = self.browser.find_element_by_xpath("//div[@class='article-write_box-form-coverImg']")
    self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    ActionChains(self.browser).move_to_element(image_div).perform()
    try:
        ActionChains(self.browser).move_to_element(image_div).perform()
        ActionChains(self.browser).move_to_element(image_div).perform()
        ActionChains(self.browser).move_to_element(image_div).perform()
    except Exception as e:
        print("这是去点击上传封面图片的框,")
    time.sleep(0.1)
    # "Choose from local" -> feed the image path to the file dialog.
    image_div.find_element_by_xpath("//button[contains(string(), '从本地选择')]").click()
    time.sleep(0.5)
    print(self.image_path)
    Utils.upload(self.image_path)
    print("打图片地址上去")
    try:
        Utils.upload(self.image_path)
    except Exception as e:
        print("丢图片地址异常")
    time.sleep(2)
    # Save the cropped cover once the save button is enabled.
    WebDriverWait(self.browser, 30).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='article-material-image-dialog_btn']//button[contains(string(), '保存') and not(@disabled)]"))).click()
    time.sleep(2)
    time.sleep(5)
    # Click "publish".
    WebDriverWait(self.browser, 30).until(
        EC.presence_of_element_located((By.XPATH, "//button[contains(string(), '发表')]"))).click()
    time.sleep(1)
    # Clickbait ("标题党") detection: either warning variant aborts with a
    # message for the caller.
    try:
        WebDriverWait(self.browser, 3).until(
            EC.text_to_be_present_in_element((By.XPATH, "/html/body/div[4]/div/div[2]/div/div[1]"), '平台')
        )
        self.msg = "标题党嫌疑,请再改改"
        return
    except Exception as e:
        print("暂无标题党嫌疑")
    try:
        WebDriverWait(self.browser, 4).until(
            EC.element_to_be_clickable((By.XPATH, '/html/body/div[4]/div/div[2]/div/div[2]/div/button[1]'))
        )
        self.msg = "标题党嫌疑,请你改下,并重新提交"
        return
    except Exception as e:
        logger.info("无标题党嫌疑")
    # ---- end ----
    # Confirm publication.
    time.sleep(2)
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//button[contains(string(), '确认发表')]"))).click()
    # ---- end ----
    # Open the newly published item and scrape the article/video ids from
    # the preview iframes.
    time.sleep(2)
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//ul[@class='w-list']/li[1]//div[@class='w-list-item-content-detail']/h3/a"))).click()
    self.status = True
    time.sleep(2)
    ahref = self.browser.find_element_by_xpath("//div[@class='contents-publish-article-preview']/iframe").get_attribute("src")
    info_frame = self.browser.find_element_by_xpath("//div[@class='contents-publish-article-preview']/iframe")
    self.browser.switch_to.frame(info_frame)
    vhref = self.browser.find_element_by_xpath("//div[@class='article-content simple-ui']//iframe").get_attribute("src")
    self.aid = self.get_aid(ahref)
    self.vid = self.get_vid(vhref)
class Home:
    """Automates the Twitter home timeline: find posts by other users,
    like them and optionally reply with a canned comment."""

    def __init__(self, driver, username):
        self.driver = driver
        self.utils = Utils(driver)
        # Our own handle -- used to exclude our own posts from targeting.
        self.username = username

    def __find_post(self):
        # Poll the timeline for a post authored by someone other than us.
        tries = 0
        while True:
            tries += 1
            try:
                post = WebDriverWait(self.driver, 5).until(
                    EC.presence_of_element_located((
                        By.XPATH,
                        f'//div[ .//div[@data-testid="like"] and contains(@style, "position: absolute") and .//div[ contains( @class, "css-bfa6kz r-1re7ezh")]//span[text() != "{self.username}" ] ]'
                    )))
            except Exception as error:
                if (tries > 2):
                    # Repeated misses: alert the operator and ask whether
                    # to keep going or quit.
                    playsound('./alert_sound.mp3')
                    print(error)
                    if input("Do you want to quitr? y/n").lower() == 'n':
                        tries = 0
                        continue
                    self.utils.quit()
                # Load more of the timeline and retry.
                self.utils.scroll_to_end()
                continue
            else:
                return post

    def __find_required_elements(self, post):
        # Extract author name plus the like/reply buttons from a post.
        try:
            username = post.find_element_by_xpath(
                './/div[ contains( @class, "css-bfa6kz r-1re7ezh")]//span'
            ).text
            like_btn = post.find_element_by_xpath(
                './/div[@data-testid="like"]')
            reply_btn = post.find_element_by_xpath(
                './/div[@data-testid="reply"]')
        except:
            self.utils.handle_error(
                "Home: username, likebtn or reply btn xpath outdated")
        # NOTE(review): if handle_error() returns instead of aborting, the
        # locals above are unbound here -- confirm handle_error's contract.
        return (username, like_btn, reply_btn)

    def __reply(self, reply_btn, comment):
        self.utils.click_js(reply_btn)
        try:
            reply_input = self.driver.find_element_by_xpath(
                '(//div[@data-testid="tweetTextarea_0"])[1]')
        except:
            self.utils.handle_error("Home: Reply input field xpath outdated")
        else:
            # CTRL+ENTER submits the reply.
            reply_input.send_keys(comment, Keys.CONTROL + Keys.ENTER)

    def __get_comment(self):
        # A canned reply, chosen uniformly at random.
        comments = [
            f"Nice post", f"Awesome work", f"Impressive work", f"Coool"
        ]
        return comments[random.randint(0, len(comments) - 1)]

    def __handle_like_error(self, like_btn):
        print(like_btn.get_attribute('data-testid'))
        # After a successful like the button's testid flips to 'unlike';
        # anything else means the like failed -- try to read the toast.
        if like_btn.get_attribute('data-testid') != 'unlike':
            try:
                error = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located(
                        (By.XPATH,
                         '//div[@data-testid="toast" and @role="alert"]//span'
                         ))).text
            except NoSuchElementException:
                error = """Home: Like error occured...\nPossible Reasons:\n1: Slow internet\n2: Xpath for error text of like 
is outdated OR error text is not displayed by twitter"""
            except:
                error = 'Home: Unexpected error'
            self.utils.handle_error(error)

    def like_and_comment(self, like_only=True, limit=1000):
        """Like (and, unless like_only, reply to) up to *limit* posts."""
        self.utils.navigate(TwitterUrls.twitter_home_url)
        current_iteration = 1
        while current_iteration <= limit:
            post = self.__find_post()
            username, like_btn, reply_btn = self.__find_required_elements(post)
            print(":" * 20 + f"POST OF user {username} found" + ":" * 20)
            self.utils.click_js(like_btn)
            print("-->post liked")
            time.sleep(random.uniform(.5, .8))
            self.__handle_like_error(like_btn)
            if not like_only:
                self.__reply(reply_btn, self.__get_comment())
                print("-->replied to the post")
            # Bug fix: the counter was never incremented, so `limit` was
            # ignored and the loop ran forever.
            current_iteration += 1
            # Randomized pause between posts to look less bot-like.
            delay = random.uniform(2, 4)
            print(f"Waiting {delay} seconds.")
            time.sleep(delay)
def getSomeArticlesPageSoup(self, index=0, skipfail=False):
    '''
    1. Determine the total page count of the current programme.
    2. Persist the programme's info (once per programme).
    3. Fetch and parse page *index* of the programme's article list.
    :return: (full url, BeautifulSoup) for the fetched page, or None when
             the page/programme was already visited or index is invalid.
    '''
    # No programme currently being visited -> pick the next unvisited one.
    if self.currentSoup is None:
        try:
            # Pop candidates until we find one whose url we have not seen;
            # pop() on an empty list raises and sends us to the except arm.
            while True:
                self.currentSoup = self.currentSoupList.pop()
                itemUrl = Utils.listenHost + self.currentSoup["href"]
                if itemUrl not in self.itemUrls:
                    self.itemUrls.add(itemUrl)
                    break
        except Exception:
            # Candidate list exhausted: try the next "from" page, if any.
            self.currentSoup = None
            if self.fromUrlsIndex >= len(self.fromUrls):
                return None
            currentFromUrl = self.fromUrls[self.fromUrlsIndex]
            self.fromUrlsIndex += 1
            try:
                self.currentSoupList = self.getItemsFromUrl(currentFromUrl)
                # Recurse to retry with the refilled candidate list.
                return self.getSomeArticlesPageSoup(index=index, skipfail=skipfail)
            except Exception:
                self.logger.error('节目包含页面访问失败,地址: ' + currentFromUrl)
                # On failure, unless skipfail, rewind so the next call
                # retries the same "from" page.
                if skipfail == False:
                    self.fromUrlsIndex -= 1
                raise Exception
        # New programme selected: reset per-programme state.
        self.currentItemInit()
    else:
        itemUrl = Utils.listenHost + self.currentSoup["href"]
    # We have the programme's base url; resolve which page to fetch.
    # index == 0 means "auto-increment": use the stored next-page index.
    index = max(0, int(index))
    index = self.currentPageIndex if index == 0 else index
    itemFullUrl = itemUrl + 'page' + str(index) + '/'
    try:
        articlesContent = requests.get(itemFullUrl, headers=Utils.headers)
        resSoup = BeautifulSoup(articlesContent.text, "lxml")
    except Exception as e:
        self.logger.error('获取某节目某页失败: ' + itemFullUrl)
        raise Exception
    # First visit to this programme (whatever the page): persist its info.
    if self.hasBeenSaved is False:
        # Even with an invalid index the programme info is still parseable.
        try:
            self.currentItemInfo = self.getListenItemInfo(
                resSoup, itemFullUrl)
        except Exception:
            self.logger.error('节目信息存储失败: ' + itemFullUrl)
            raise Exception
        self.hasBeenSaved = True
    # Determine the total page count once per programme.
    if self.currentTotalPageCounts == 0:
        # An oversized index yields a page without a count -> treat the
        # index as invalid and restore the "unknown" (0) marker.
        self.currentTotalPageCounts = Utils.getPageCount(resSoup)
        if index > self.currentTotalPageCounts:
            self.currentTotalPageCounts = 0
            return None
    # Next call (in auto mode) visits the page after this one.
    self.currentPageIndex = index + 1
    if self.currentPageIndex > self.currentTotalPageCounts:
        # Programme exhausted; a new one will be picked next call.
        self.currentSoup = None
    # If this programme is done and we hit the item limit, flag overflow.
    if self.currentSoup is None and self.getItemsSize() == self.limit:
        self.isOverLimited = True
    # Full page url (with page number) plus its parsed soup.
    return (itemFullUrl, resSoup)
def go(self):
    """Log in to the Toutiao media platform, upload a video to Xigua with
    title/description/tags/category, publish it, and scrape the new
    item's ids into self.aid / self.vid. Sets self.status on success.
    """
    # --- Log in ---
    self.browser.get("https://sso.toutiao.com/login/?service=https://mp.toutiao.com/sso_confirm/?redirect_url=JTJG")
    self.browser.implicitly_wait(30)
    self.browser.find_element_by_xpath("//div[@id='login-type-account']").click()
    time.sleep(2)
    self.browser.find_element_by_xpath("//input[@id='user-name']").send_keys(self.account)
    self.browser.find_element_by_xpath("//input[@id='password']").send_keys(self.password)
    time.sleep(0.1)
    self.browser.find_element_by_xpath("//button[@id='bytedance-login-submit']").click()
    self.handler_slider_verify()  # solve the slider captcha
    time.sleep(2)
    self.browser.maximize_window()
    # --- Login done ---
    self.cookies = self.get_cookies()
    # Enter the Xigua Video section, then go to the upload page.
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//span[contains(string(), '西瓜视频')]"))).click()
    self.browser.implicitly_wait(20)
    self.browser.get("https://mp.toutiao.com/profile_v3/xigua/upload-video")
    # Video upload starts here.
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='undefined upload-handler ']"))).click()
    time.sleep(1)  # wait for the upload dialog to appear
    Utils.upload(self.video_path)
    # Wait for the "upload succeeded" marker.
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//span[contains(string(), '上传成功')]")))
    # Title: clear, flush with backspaces, then type char by char.
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='article-title-wrap-new']/input"))).clear()
    self.browser.find_element_by_xpath("//div[@class='article-title-wrap-new']/input").send_keys([Keys.BACKSPACE for i in range(1, 100)] + [i for i in self.title])
    # Description.
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//span[@class='tui-input-wrapper']/textarea"))).clear()
    self.browser.find_element_by_xpath("//span[@class='tui-input-wrapper']/textarea").send_keys([Keys.BACKSPACE for i in range(1, 100)] + [i for i in self.content])
    if self.is_origin:  # declare the video as original content?
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//span[@class='tui2-radio-text' and contains(string(), '声明原创')]/..//input"))).click()
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//button[contains(string(.), '确 定')]"))).click()
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//span[@class='tui2-radio-text' and contains(string(), '非独家')]/..//input"))).click()
        if self.is_first:  # first-publish declaration
            # NOTE(review): when is_first is true this ticks "非首发"
            # (NOT first publish) and fills the original-source fields --
            # the flag's polarity looks inverted; confirm intent.
            WebDriverWait(self.browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//span[@class='tui2-radio-text' and contains(string(), '非首发')]/..//input"))).click()
            WebDriverWait(self.browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='m-video-first']/div[@class='edit-cell-new'][1]//input"))).send_keys(self.first_url)
            WebDriverWait(self.browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='m-video-first']/div[@class='edit-cell-new'][2]//input"))).send_keys(self.first_platform)
            WebDriverWait(self.browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='m-video-first']/div[@class='edit-cell-new'][3]//input"))).send_keys(self.first_uname)
        else:
            WebDriverWait(self.browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//span[@class='tui2-radio-text' and contains(string(), '首发')]/..//input"))).click()
    else:
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//span[@class='tui2-radio-text' and contains(string(), '非原创')]/..//input"))).click()
    # --- originality section done ---
    self.build_tags(self.tags)
    # Category dropdown.  NOTE(review): this XPath string was split by the
    # chunking; reconstructed with a single space between the class names.
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located(
            (By.XPATH, "//div[@class='Select tui-select Select--single']/div[@class='Select-control']"))).click()
    time.sleep(0.1)
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@aria-label='{}']".format(self.video_type)))).click()
    # Submit, then wait for the management page to confirm publication.
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='submit btn ']"))).click()
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='m-pgc-video-manage']")))
    self.status = True
    # Scrape the new item's link to derive article and video ids.
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='m-articles no-count']")))  # video list loaded
    href = WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='m-articles no-count']/div[1]//a[@class='title-wrap']"))).get_attribute("href")
    self.aid = self.get_aid(href)
    self.vid = self.get_vid(href)
def __init__(self, driver):
    # Keep the webdriver and a Utils helper bound to it.
    self.driver = driver
    self.utils = Utils(driver)
files.sort()
# For each data file: strip outliers, z-score the remainder, and save both
# intermediate stages next to the working directory.
for i in xrange(len(files)):
    filename = files[i]
    data = np.loadtxt(filename)
    pwo = removeoutliers(data)
    postfix = filename[-4:]
    fileout = filename[:-4] + "_wo_outliers" + postfix
    np.savetxt(os.path.basename(fileout), pwo)
    print "wo outliers saved: %s" % os.path.basename(fileout)
    # For the last file, record avg and std so the z-scores can later be
    # converted back to original-scale values.
    if i == len(files) - 1:
        writeoutStdAndAvg(pwo)
    zscoredata = Utils.zscore(pwo)
    fileout = filename[:-4] + "_zscore_wo_outliers" + postfix
    np.savetxt(os.path.basename(fileout), zscoredata)
    print "wo outliers saved: %s" % os.path.basename(fileout)
    if i == 0:
        # Keep the first file's points and sample k of them as initial
        # centroids for the k-means run.
        print zscoredata
        firstpoints = zscoredata
        k = 3  # 3 features
        initialc = np.array(random.sample(zscoredata, k))
        fileoutcentroids = filename[:-4] + "_zscore_wo_outliers.centroids" + postfix
        np.savetxt(os.path.basename(fileoutcentroids), initialc)
        # NOTE(review): this message echoes `fileout`, but the file just
        # written is `fileoutcentroids` -- misleading log line.
        print "wo outliers saved: %s" % os.path.basename(fileout)
        #
# Hyper-parameter defaults.  NOTE(review): assumes `params` is a namespace
# object created earlier in the file -- confirm against the full script.
params.locnet = '10,10,10'
params.locnet2 = None
params.locnet3 = None
params.st = True
params.resume = False
params.dropout = 0.5
params.use_pickle = True
params.save_loc = "."
params.outfile = 'gtsrb_kaggle.csv'
#params.train_pickle = params.save_loc + '/train_balanced_preprocessed.p'
params.train_pickle = params.save_loc + '/train.p'
params.extra_debug = False

from util import Utils

utils = Utils()

# Defaults used before CLI parsing; overridden by the argparse values below.
fixedindex = 3  #3
epsilon = 0.05

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--fixedindex", type=int, default=1, help="choose the data to display")
parser.add_argument("--epsilon", type=float, default=0.0, help="perturb level")
# NOTE(review): --method reuses the "perturb level" help text -- looks like
# a copy-paste; it selects the training method (e.g. ERM).
parser.add_argument("--method", type=str, default='ERM', help="perturb level")
args = parser.parse_args()
def removeoutliers(points):
    # Drop every row whose z-score exceeds 3 (in absolute value) in any
    # column, and report how many rows were removed.
    print "total number of points: %s" % len(points)
    tmpzscoredata = Utils.zscore(points)  # To find outliers
    po = points[~(np.abs(tmpzscoredata) > 3).any(1)]  # filter out outlier rows
    print "removed : %s" % (len(points) - len(po))
    return po
def go(self):
    """Log in to mp.qutoutiao.net, upload a video with title/description/
    category/tags and a custom cover, then publish it.

    Sets self.msg and returns early when an SMS captcha is required or the
    platform rejects the submission; sets self.status on success.
    """
    self.browser.get("https://mp.qutoutiao.net/login")
    # --- Log in ---
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located(
            (By.XPATH, "//i[@class='login-icon']/following-sibling::input[1]"))).send_keys(self.account)
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//i[@class='pwd-icon']/following-sibling::input[1]"))).send_keys(
        self.password)
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//button[@id='submit-login']"))).click()
    # --- Login done ---
    time.sleep(2)
    self.browser.maximize_window()
    # Navigate: publish content -> publish video.
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//span[contains(string(), '发布内容')]"))).click()
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//p[contains(string(), '发布视频')]"))).click()
    # SMS captcha gate: if the dialog appears we cannot proceed headlessly.
    try:
        WebDriverWait(self.browser, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="app"]/div[2]/div[2]/div/div[2]/div/div[2]/div/div[3]/div/div[3]/span/button[1]/span'))
        )
        self.msg = '需要手机验证码,请延时提交'
        return
    except Exception:
        pass
    # *** Dismiss the posting-guidelines dialog ***
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//i[@class='el-message-box__close el-icon-close']"))).click()
    time.sleep(1)
    # --- Navigation done ---
    # Upload the video file.
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//input[@id='inp-video-file']"))).click()
    time.sleep(1)  # wait for the upload dialog to appear
    Utils.upload(self.video_path)
    WebDriverWait(self.browser, 100).until(
        EC.presence_of_element_located((By.XPATH, "//span[contains(string(), '上传成功')]")))
    # Title (backspaces flush any auto-filled text).
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located(
            (By.XPATH, "//form[@class='el-form el-form--label-left']/div[1]//input"))).send_keys(
        [Keys.BACKSPACE for i in range(1, 100)] + [i for i in self.title])
    # Description.
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//textarea[@class='el-textarea__inner']"))).send_keys(
        self.content)
    # Category dropdown.
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//input[@placeholder='请选择分类']"))).click()
    time.sleep(1)  # wait for categories to load
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//dd[contains(string(), '{}')]".format(self.video_type)))).click()
    time.sleep(1)
    self.build_tag(self.tags)
    time.sleep(0.5)
    # -- Cover selection: upload a custom cover image.
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='el-upload']"))).click()
    time.sleep(1)  # wait for the cover dialog
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//p[contains(string(), '自定义封面')]"))).click()
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//span[contains(string(), '选择图片')]"))).click()
    time.sleep(1)  # wait for the file dialog
    Utils.upload(self.image_path)
    time.sleep(3)  # wait for the image upload to finish
    # TODO pending cleanup: wait-then-click could be a single clickable wait.
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='el-dialog__wrapper dialog-img-cropper']//span[contains(string(), '确 定')]")))
    time.sleep(1)
    self.browser.find_element_by_xpath("//div[@class='el-dialog__wrapper dialog-img-cropper']//span[contains(string(), '确 定')]").click()
    # -- Cover selection done
    # Video metadata complete: publish.
    time.sleep(3)
    WebDriverWait(self.browser, 10).until(
        EC.presence_of_element_located(
            (By.XPATH, "//button[@class='el-button el-button--primary']//span[contains(string(), '发布')]"))).click()
    # If the platform raises an objection dialog, capture its message and
    # abort; otherwise fall through to the final confirmation.
    try:
        WebDriverWait(self.browser, 3).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='el-message-box__message']")))
        time.sleep(2)
        self.msg = self.browser.find_element_by_xpath("//div[@class='el-message-box__message']").text
        if self.msg:
            return
    except TimeoutException:
        pass
    WebDriverWait(self.browser, 10).until(EC.presence_of_element_located((By.XPATH, "//button[@class='el-button el-button--primary el-button--medium']//span[contains(string(), '确认发布')]")))  # TODO
    time.sleep(1)
    self.browser.find_element_by_xpath("//button[@class='el-button el-button--primary el-button--medium']//span[contains(string(), '确认发布')]").click()
    time.sleep(2)
    self.status = True
def connect_to_wifi(device_serial: str, wifi_name, wifi_password):
    """Drive an Android device (via uiautomator2) to join *wifi_name*.

    Opens the system Wi-Fi settings, scrolls until the network is visible,
    enters the password, taps the locale-appropriate Connect button, and
    finally verifies the connected SSID. Best-effort: failures are printed,
    not raised.
    """
    print("the device " + device_serial + " try to connect the wifi " + wifi_name)
    # android.settings.WIFI_SETTINGS
    Utils.switch_on_wifi(device_serial)
    uiAuto = uiautomator2.connect_usb(serial=device_serial)
    # Even after `am start -a android.settings.WIFI_SETTINGS` we may land on
    # a sub-screen (e.g. the currently-connected network's details), so kill
    # the Settings app first to guarantee a fresh entry. The 20s wait also
    # gives auto-connect a chance to settle.
    uiAuto.app_stop("com.android.settings")
    time.sleep(2)
    wifi_intent = "android.settings.WIFI_SETTINGS"
    start_wifi_activity = "adb -s " + device_serial + " shell am start -a " + wifi_intent
    subprocess.getstatusoutput(start_wifi_activity)
    time.sleep(20)
    # Scroll the network list (up to 3 swipes) until the SSID shows up;
    # replaces the original's three copy-pasted if-blocks.
    for _ in range(3):
        if uiAuto(text=wifi_name).exists:
            break
        uiAuto.swipe(300, 900, 300, 200)
    if not uiAuto(text=wifi_name).exists:
        print("finally we have not found the wifi: " + wifi_name + ", for device: " + device_serial)
        return
    uiAuto(text=wifi_name).click()
    # If the device already knows this network, send_keys would raise --
    # only attempt the password entry when the dialog (detected via its
    # "Advanced options" label, English or Chinese) is showing.
    if uiAuto(text="Advanced options").exists() or uiAuto(
            text="高级选项").exists():
        try:
            uiAuto.send_keys(wifi_password, clear=True)
            # Connect button label varies by locale/OEM.
            if uiAuto(text="连接").exists():
                uiAuto(text="连接").click()
            elif uiAuto(text="Connect").exists():
                uiAuto(text="Connect").click()
            elif uiAuto(text="CONNECT").exists():
                uiAuto(text="CONNECT").click()
        except UiObjectNotFoundError as error:
            print(error)
            # A visible "Frequency" field suggests we are on the details
            # screen of an already-connected network.
            if uiAuto(text="Frequency").exists():
                print("the device had been connected to the wifi " + wifi_name)
    # Give the device a few seconds so the SSID can be dumped reliably.
    time.sleep(5)
    device_wifi_name = Utils.get_wifi_ssid(device_serial)
    if device_wifi_name == wifi_name:  # was: .__eq__() -- plain == is idiomatic
        print("hi, the device switch to " + wifi_name + ", successfully! ")
    time.sleep(5)
def __init__(self):
    """Load DB settings through the Utils config helper and open the Mysql pool."""
    self.cfg = Utils().pathToConfig()
    db_host = self.cfg.get('DB', 'DBHOST')
    db_port = int(self.cfg.get('DB', 'DBPORT'))
    db_user = self.cfg.get('DB', 'DBUSER')
    db_pwd = self.cfg.get('DB', 'DBPWD')
    self.mysql = Mysql(db_host, db_port, db_user, db_pwd, 3, 5)
    print('start get ip')
def save_to_disc():
    """Persist the module-level known_messages cache under the 'known_messages' key."""
    Utils.saveToDisk("known_messages", known_messages)
print( " Arg2: ProjectKey en SonarQube correspondiente al elemento a analizar" ) sys.exit(0) #myPropertiesFile = "D://Alfonso//AAProyectos//qamera_python//application.properties" myPropertiesFile = sys.argv[1] myProjectKey = sys.argv[2] print("Parametros de ejecucion:") print("Fichero de propiedades:", myPropertiesFile) print("Parametros de ejecucion:", myProjectKey) print("...") ### Read Program Properties from configuration file myprops = {} myprops = Utils.loadPropertiesFile(myPropertiesFile) ### Read Quality Model from configuration file myQualityModel = QualityModel(myPropertiesFile) print("Quality Model:", myQualityModel.max_violations_rate) ### Connecto to SonarQube mySonarqubeConector = SonarqubeConector(myprops["sonarURL"], myprops["sonarUser"], myprops["sonarCredentials"]) ### Get ProjectId From SonarQube myProjectId = mySonarqubeConector.loadProjectIdFromSonar(myProjectKey) ### Get Metrics from SonarQube metricList = myprops["metrics"] qualityCodeResults = mySonarqubeConector.loadProjectMetricsFromSonar( myProjectKey, metricList)
./chkp_models//model_-00082-0.00764.hdf5 """ max_len_doc_sents = 20 # avg 27 # MEJOR 10 max_len_summ_sents = 4 max_len_doc_sent_words = 25 # avg 29 # MEJOR 25 max_len_summ_sent_words = 15 padding_val = 0. pos_pairs = 32 neg_pairs = 32 path_models = "./best_model/" name_models = "model_" output_file_sents = "./cnndaily_shann_3_sents.out" output_file_words = "./cnndaily_shann_3_words.out" path_weights = "./best_model//model_-00082-0.00764.hdf5" w2v = ut.load_word2vec("../../Embeddings/cnndaily_w2v.model") d = w2v.vector_size topk_sentences = 3 shann_obj = SHANN(max_len_doc_sents, max_len_doc_sent_words, max_len_summ_sents, max_len_summ_sent_words, d, path_models, name_models) shann_obj._set_model() shann_obj.load_weights(path_weights) decoder = Decoder(max_len_doc_sents, max_len_doc_sent_words, w2v, d, shann_obj.get_all_att_model(), topk_sentences=topk_sentences) test_file = "../../Corpora/CNNDM/test.csv"
class ipProxy():
    """Scrapes free proxy IPs from xicidaili.com, validates each one and
    stores the usable entries through self.insert_data.

    Fixes applied: `except BaseException, e` (Python-2-only syntax) replaced
    with `except BaseException as e` (valid on Python 2.6+ and 3); `while
    1 == 1` replaced with `while True`; the local named `list` renamed so it
    no longer shadows the builtin.
    """

    # Class-level defaults; __init__ replaces them with per-instance values.
    user_agents = []
    headers = {}
    utils = Utils()

    def __init__(self):
        self.Loggers = Logger('ipProxy')
        self.user_agents = Headers().user_agents
        self.headers = Headers().headers
        self.cfg = self.utils.pathToConfig()
        self.mysql = Mysql(self.cfg.get('DB', 'DBHOST'),
                           int(self.cfg.get('DB', 'DBPORT')),
                           self.cfg.get('DB', 'DBUSER'),
                           self.cfg.get('DB', 'DBPWD'), 3, 5)

    def get_ip_from_xici(self):
        """Endless scrape loop: fetch the proxy list page, extract ip:port
        pairs, check each with utils.checkIpForAJK and insert the working
        ones.  All failures are logged and the loop continues after a 10s
        pause."""
        Loggers = Logger(special_log_file='getProxyXiCi')
        while True:
            try:
                avalibleIpsOneWeb = []
                startGetIpTime = time.time()
                startGetIpTimeFormat = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
                title = u'西祠代理'
                Loggers.Info('>>>>> ' + startGetIpTimeFormat + '|' + title + u'|开始抓取ip <<<<<')
                url = 'http://www.xicidaili.com/nn/'
                head = self.headers
                head['user-agent'] = random.choice(self.user_agents)
                try:
                    Loggers.Info('>>>>> ' + title + u'|开始请求url ' + url + ' <<<<<')
                    r = requests.get(url, timeout=10, headers=head)
                    soup = BeautifulSoup(r.text, "html.parser")
                    # Renamed from `list`: that shadowed the builtin.
                    cells = soup.find('table', attrs={
                        'id': 'ip_list'
                    }).find_all('td')
                    strText = ''
                    ips = []
                    for cell in cells:
                        content = cell.get_text().strip()
                        # An IPv4 cell starts a new "ip:port" entry ...
                        if re.match(r'^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$', content):
                            strText = content
                        # ... the following port cell (0-65535) completes it.
                        if re.match(
                                r'^([0-9]|[1-9]\d{1,3}|[1-5]\d{4}|6[0-4]\d{4}|65[0-4]\d{2}|655[0-2]\d|6553[0-5])$',
                                content):
                            strText = strText + ':' + content
                            ips.append(strText)
                    endGetIpTime = time.time()
                    endGetIpTimeFormat = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
                    Loggers.Info('>>>>> ' + endGetIpTimeFormat + '|' + title +
                                 u'|结束抓取ip,共抓取' + str(len(ips)) + '条 <<<<<')
                    Loggers.Info('>>>>> ' + endGetIpTimeFormat + '|' + title +
                                 u'|开始检查ip是否可用,抓取共耗时' +
                                 str(endGetIpTime - startGetIpTime) + ' <<<<<')
                    for ip in ips:
                        Loggers.Info(u'>>>>> 开始检查ip:' + str(ip) + ' <<<<<')
                        start = time.time()
                        if self.utils.checkIpForAJK(ip):
                            end = time.time()
                            avalibleIpsOneWeb.append({
                                'source': 'xici',
                                'ip': ip,
                                'time': str(end - start)
                            })
                            Loggers.Info('>>>>> ip:' + str(ip) + u' 可用<<<<<')
                        else:
                            Loggers.Info('>>>>> ip:' + str(ip) + u' 不可用<<<<<')
                    endCheckIpTime = time.time()
                    endCheckIpTimeFormat = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
                    Loggers.Info('>>>>> ' + endCheckIpTimeFormat + '|' + title +
                                 u'|结束检查ip是否可用,检查共耗时' +
                                 str(endCheckIpTime - endGetIpTime) + ' <<<<<')
                    Loggers.Info('>>>>> ' + title + u'|成功率:' +
                                 str(len(avalibleIpsOneWeb)) + '-' +
                                 str(len(ips)) + ' <<<<<')
                    Loggers.Info('>>>>> ' + endCheckIpTimeFormat + '|' + title +
                                 u'|结束,抓取到' + str(len(avalibleIpsOneWeb)) +
                                 u'条可用ip,共耗时' +
                                 str(endCheckIpTime - startGetIpTime) + ' <<<<<')
                    # self.avalibleIps.append(avalibleIpsOneWeb)
                    self.insert_data(Loggers, avalibleIpsOneWeb)
                except BaseException as e:  # FIX: was py2-only `except BaseException, e`
                    Loggers.Error(u'>>>>> 请求url出错 ' + str(e) + '<<<<<')
            except BaseException as e:  # FIX: was py2-only `except BaseException, e`
                Loggers.Error(u'>>>>> 抓取ip循环出错 ' + str(e) + '<<<<<')
            time.sleep(10)
def getCategoryPartLists(parent, response):
    """Parse the XML `response` and return the category list found under `parent`."""
    parser = Utils(response)
    xml_root = parser.getStringXml()
    return parser.getCategoryList(xml_root, parent)
def load_from_disc():
    """Replace the module-level known_messages cache with the copy stored on disk."""
    global known_messages
    known_messages = Utils.fetchFromDisk("known_messages")
def to_json(config, array):
    """Aggregate one package's (timestamp, 'start'/'stop') events into a JSON-ready dict.

    `array` is a (package, event_dict) pair whose dict values are
    (datetime, 'start'|'stop') tuples in chronological order.  Start/stop
    events are paired positionally; the mean duration (converted to the
    configured unit) is reported together with the last stop timestamp.

    Fixes applied (the original was Python-2-only):
      * `array[1].values()` is materialised with list() — py3 dict views
        cannot be indexed;
      * the tuple-unpacking lambda inside map() (removed from py3) became a
        list comprehension, which also makes len() valid on the result;
      * `time_functions.__contains__` became a dict .get with default;
      * the result dict no longer shadows the `json` module name.

    Returns None when there are no complete start/stop pairs, or when every
    pair exceeded the expiry delay.
    """
    id_field = config.property('result.id_field', 'id')
    value_field = config.property('result.value_field', 'value')
    value_unit = config.property('result.value_unit', 's')
    # Seconds -> target-unit converters; unknown units fall back to identity.
    time_functions = {
        'ns': lambda x: x * 1000000000,
        'nanosecond': lambda x: x * 1000000000,
        'mcs': lambda x: x * 1000000,
        'microsecond': lambda x: x * 1000000,
        'ms': lambda x: x * 1000,
        'millisecond': lambda x: x * 1000,
        'm': lambda x: x / 60,
        'minute': lambda x: x / 60,
        'h': lambda x: x / 3600,
        'hour': lambda x: x / 3600,
        'd': lambda x: x / 86400,
        'day': lambda x: x / 86400,
    }
    time_function = time_functions.get(value_unit, lambda x: x)

    expired_time = config.property('expire.delay')
    event_time_expire = config.property('expire.event_time', True)
    expired_behavior = config.property('expire.behavior', 'fail')

    # Flag to fail the package when it ends with an unmatched 'start'.
    failed = False
    package, values = array[0], list(array[1].values())
    if values[-1][1] == 'start':
        if expired_behavior == 'keep':
            # Synthesise a stop event `expired_time` seconds after the dangling start.
            from util import Utils
            values += [(Utils.datetime_add_seconds(values[-1][0], expired_time),
                        'stop')]
        elif expired_behavior == 'fail':
            failed = True

    start_values = [value[0] for value in values if value[1] == 'start']
    stop_values = [value[0] for value in values if value[1] == 'stop']
    durations = [(stop - start).total_seconds()
                 for start, stop in zip(start_values, stop_values)]
    if len(durations) == 0:
        return None

    result = {
        '@timestamp': stop_values[-1].isoformat()
        if len(stop_values) > 0 else start_values[-1].isoformat(),
        value_field: time_function(sum(durations) / len(durations)),
        id_field: package
    }
    if expired_behavior == 'fail':
        result['failed'] = 'true' if failed else 'false'
    if event_time_expire and expired_behavior != 'keep':
        # Report only the pairs within the expiry delay; fail if any exceeded it.
        expired_durations = [d for d in durations if d > expired_time]
        unexpired_durations = [d for d in durations if d <= expired_time]
        if len(unexpired_durations) == 0:
            return None
        result[value_field] = time_function(
            sum(unexpired_durations) / len(unexpired_durations))
        result['failed'] = 'true' if len(expired_durations) > 0 \
            and expired_behavior == 'fail' else 'false'
    static_fields = config.property('result.static_fields')
    if static_fields is not None:
        result.update(static_fields)
    return result
def getUserInfo(self, uid, frequentAdd=1, frequentReduce=2):
    """Fetch one user's profile page, fill a ListenUser and persist it.

    Public profiles are parsed from the HTML; private profiles (detected by a
    302 redirect whose message starts with '用户') fall back to the
    GetUserFace.ashx JSONP endpoint.  Site throttling is tracked in
    self.tooFrequent: raised by `frequentAdd` when throttled, lowered by
    `frequentReduce` after a successful fetch.

    Fix applied: `gender` was guarded by ``'PunchCount' in infoRespondJson``
    (copy-paste error) instead of ``'Gender'``.

    Returns the saved ListenUser, or None when the page cannot be fetched,
    the site throttles us, or the uid is queued for retry.
    """
    full_url = Utils.userHost + '/u/' + uid + '/'
    user = ListenUser(full_url)
    try:
        content = self.session.get(full_url,
                                   headers=Utils.headers,
                                   allow_redirects=False)
    except Exception:
        self.logger.error('获取用户页面失败: ' + full_url)
        return None
    # Private profiles are not reachable from outside: the server
    # 302-redirects to an error page whose query string carries the reason.
    if content.status_code != 200:
        self.logger.warning(full_url + ': redirect ' + str(content.status_code))
        responseText = urlparse(unquote(
            content.headers['Location'])).query.split('=', maxsplit=1)[1]
        self.logger.warning('提示: ' + responseText)
        if responseText[0:2] == '用户':
            # Private profile: part of the info is still exposed by a JSONP
            # service, e.g. (URL-decoded):
            # http://bulo.hujiang.com/service/GetUserFace.ashx?ver=2016/11/23 下午8:29:28&userId=5326257&callback=jQuery17202552129787287378_1479904148908&_=1479904168992
            queryParams = {}
            nt = datetime.datetime.now()
            currHour = int(datetime.datetime.now().hour)
            # The service expects a Chinese 上午/下午 (AM/PM) marker in `ver`.
            verTime = nt.strftime('%Y/%m/%d {half}%I:%M:%S').format(
                half=('上午' if currHour < 12 else '下午'))
            queryParams['ver'] = verTime
            queryParams['userId'] = uid
            # 13-digit millisecond-style timestamp for `callback` and `_`.
            timeStamp = ''.join(str(time.time()).split('.'))[:13]
            queryParams['callback'] = 'jQuery17202552129787287378_' + str(
                timeStamp)
            queryParams['_'] = str(timeStamp)
            info_url = Utils.urlCreate(
                Utils.userHost + '/service/GetUserFace.ashx?', queryParams)
            try:
                infoRespond = requests.get(info_url, headers=Utils.headers)
            except Exception:
                self.logger.error('隐私用户信息获取失败: ' + uid)
                return None
            # Strip the JSONP wrapper `callback(...)` and parse the payload.
            infoRespondStr = infoRespond.text.split('(', maxsplit=1)[1][:-1]
            infoRespondJson = json.loads(infoRespondStr)
            user.name = infoRespondJson[
                'UserName'] if 'UserName' in infoRespondJson else ''
            user.nickName = infoRespondJson['NickName'][
                1:-1] if 'NickName' in infoRespondJson else ''
            user.signature = infoRespondJson[
                'UserSign'] if 'UserSign' in infoRespondJson else ''
            user.city = infoRespondJson[
                'city'] if 'city' in infoRespondJson else ''
            user.signinLast = infoRespondJson[
                'PunchCount'] if 'PunchCount' in infoRespondJson else ''
            # FIX: the guard previously tested 'PunchCount' instead of 'Gender'.
            gender = infoRespondJson[
                'Gender'] if 'Gender' in infoRespondJson else ''
            if gender == '1' or gender == '0':
                user.gender = '男' if gender == '1' else '女'
            try:
                user.save(self.mysql_session)
            except Exception:
                self.logger.error('存储隐私用户信息失败')
                raise Exception
            # Private user handled: count it and remember the uid.
            self.privateUids += 1
            self.userUids.add(uid)
            if self.tooFrequent > 0:
                self.tooFrequent -= frequentReduce
                self.tooFrequent = 0 if self.tooFrequent < 0 else self.tooFrequent
            self.logger.debug(user)
            return user
        else:
            # Throttled: count consecutive failed visits of the same uid.
            if self.lastUserVisitInfo[0] == uid:
                self.lastUserVisitInfo[1] += 1
            else:
                self.lastUserVisitInfo[0] = uid
                self.lastUserVisitInfo[1] = 0
            if self.lastUserVisitInfo[1] <= self.failedToVisitCountLimit:
                # Under the retry limit: queue the uid for another attempt
                # instead of storing it.
                self.appendUidPriority(uid)
            else:
                # Give up on this uid and remember it so we never revisit.
                self.failedToVisit.append(uid)
                self.userUids.add(uid)
            # First throttle: jump straight to 4; afterwards grow slowly.
            if self.tooFrequent == 0:
                self.tooFrequent = 4
            else:
                self.tooFrequent += frequentAdd
            return None
    # Request succeeded, so shrink the throttle counter again.
    if self.tooFrequent > 0:
        self.tooFrequent -= frequentReduce
        self.tooFrequent = 0 if self.tooFrequent < 0 else self.tooFrequent
    try:
        soup = BeautifulSoup(content.text, "lxml")
    except Exception:
        self.logger.error('解析用户页面失败: ' + full_url)
        return
    # Counter block (visitors / messages / posts ...).
    countList = soup.find(attrs={'id': 'LeftCnt_divUserCount'})
    if countList:
        # visitor count
        viewCount = countList.find(attrs={'id': 'li_viewCount'})
        if viewCount and len(viewCount) != 0:
            user.viewCount = viewCount.string
        # message count
        msgCount = countList.find(attrs={'id': 'li_msgCount'})
        if msgCount and len(msgCount) != 0:
            user.msgCount = msgCount.find('a').string
        # micro-post count
        ingCount = countList.find(attrs={'id': 'li_ingCount'})
        if ingCount and len(ingCount) != 0:
            user.ingCount = ingCount.find('a').string
        # blog count
        blogCount = countList.find(attrs={'id': 'li_blogCount'})
        if blogCount and len(blogCount) != 0:
            user.blogCount = blogCount.find('a').string
        # dictation count
        listenCount = countList.find(attrs={'id': 'li_listenCount'})
        if listenCount and len(listenCount) != 0:
            user.listenCount = listenCount.find('a').string
        # speaking count
        talkCount = countList.find(attrs={'id': 'li_talkCount'})
        if talkCount and len(talkCount) != 0:
            user.talkCount = talkCount.find('a').string
        # gift count
        giftCount = countList.find(attrs={'id': 'li_giftCount'})
        if giftCount and len(giftCount) != 0:
            user.giftCount = giftCount.find('a').string
    # Personal profile list (gender / city / nickname / ...).
    profileList = soup.find(id='u_profile').find('ul')
    if profileList:
        for child in profileList.children:
            if child.name != 'li':
                continue
            text = child.get_text(strip=True)
            if re.compile(r'性别').search(text):  # gender
                user.gender = child.find_all('span')[1].string
            if re.compile(r'城市').search(text):  # city
                user.city = child.find_all('span')[1].string
            if re.compile(r'昵称').search(text):  # nickname
                child.span.replace_with('')
                user.nickName = child.get_text(strip=True)
            if re.compile(r'签名').search(text):  # signature
                child.span.replace_with('')
                user.signature = child.get_text(strip=True)
            if re.compile(r'沪龄').search(text):  # membership age / regist date
                # user.yearLast = child.find_all('span')[1].string
                user.registDate = child.find_all('span')[1]['title'][5:]
            if re.compile(r'打卡').search(text):  # check-in streak
                child.span.replace_with('')
                user.signinLast = int(child.get_text(strip=True)[0:-1])
            if re.compile(r'登录').search(text):  # last login
                user.lastSignin = child.find_all('span')[1].string
    # Self introduction: the sibling right before the "report" link.
    selfIntroPre = soup.find(id='user_Profile_span_reportIt')
    selfIntro = None
    if selfIntroPre:
        selfIntro = selfIntroPre.find_previous_sibling()
    if selfIntro and selfIntro.name == 'div':
        user.selfIntroduction = selfIntro.get_text(strip=True)
    # The city <li> is an HTML comment, invisible to bs4, so fall back to a
    # regex over the raw page text.
    cityMatch = re.compile(
        r'<li id="user_Profile_span_city.*?<span>(.*?)</span></li>',
        re.S).search(content.text)
    if cityMatch:
        user.city = cityMatch.group(1)
    # User name: strip the trailing link/span decorations from the heading.
    userNameHtml = soup.find(id='cont_h1')
    userNameHtml.a.replace_with('')
    userNameHtml.span.replace_with('')
    user.name = userNameHtml.get_text(strip=True)[0:-5].strip()
    try:
        user.save(self.mysql_session)
    except Exception:
        self.logger.error('存储用户信息失败')
        raise Exception
    self.userUids.add(uid)
    self.logger.debug(user)
    return user
def __init__(self, driver):
    """Keep the WebDriver handle and build the shared helper objects."""
    self.driver = driver
    self.twitter_common = TwitterCommon(driver)
    self.utils = Utils(driver)
def sgd_classify(df_gender):
    """Train an SGD hinge-loss (linear SVM) classifier on the gender data
    split and print its test accuracy."""
    features_train, features_test, labels_train, labels_test = Utils.split_data(df_gender)
    model = SGDClassifier(loss="hinge", penalty="l2")
    model.fit(features_train, labels_train)
    predictions = model.predict(features_test)
    print("sgd acc: ", accuracy_score(labels_test, predictions))
print(station + '|' + str(len(data['results']))) Utils.write_csv('statistics.csv', [station, str(len(data['results']))], 'a+') if not os.path.exists(path): os.mkdir(path) Utils.write_file( path + '/' + station + '.txt', json.dumps(data['results'], ensure_ascii=False, indent=4), 'a+') else: Utils.write_csv('no_place_' + path + '.csv', [station], 'a+') def get_bus_place(self, station): self.__get_place(self.__REGION, station, '长途汽车站,公交车站', 'bus') def get_railway_place(self, station): self.__get_place(self.__REGION, station, '火车站', 'railway') if __name__ == '__main__': api = LbsApi() # bus_config = 'config/busStationNoGroup.txt' # stations = Utils.get_txt_config(bus_config) # Utils.async_task(api.get_bus_place, stations) railway_config = '../config/trainStationNoGroup.txt' stations = Utils.get_txt_config(railway_config) Utils.async_task(api.get_railway_place, stations) # 合并文件
class Followers:
    """Follows Twitter accounts from a user's followers page or from a
    people-search page, via a Selenium WebDriver and a project Utils helper.

    Fixes applied: ``__find_username_and_follow_btn`` pre-initialises its
    result variables so the return no longer raises NameError after a lookup
    failure, and the bare ``except:`` clauses became ``except Exception`` so
    KeyboardInterrupt/SystemExit are not swallowed.
    """

    def __init__(self, driver):
        self.driver = driver
        self.utils = Utils(driver)

    def __find_followable_people_element(self):
        """Return the next card containing a '-follow' button, scrolling the
        page on lookup timeouts; reports via utils.handle_error after the
        third failed attempt."""
        tries = 0
        while True:
            tries += 1
            try:
                follow_element = WebDriverWait(self.driver, 2).until(EC.presence_of_element_located((By.XPATH, '//div[contains(@style,"position: absolute;") and .//div[contains(@data-testid, "-follow")]]')))
            except Exception as error:
                if tries > 2:
                    self.utils.handle_error(error)
                self.utils.scroll_to_end()
                continue
            else:
                return follow_element

    def __find_username_and_follow_btn(self, follow_element):
        """Extract (username, follow_button) from a follower card; either may
        be None when the XPaths no longer match."""
        username = None
        follow_btn = None
        try:
            username = follow_element.find_element_by_xpath('.//div[contains(@class, "r-1re7ezh r-18u37iz")]/span').text
            follow_btn = follow_element.find_element_by_xpath('.//div[contains(@data-testid, "-follow")]')
        except Exception:  # FIX: was a bare except
            self.utils.handle_error("Followers: username or follow_btn xpath outdated")
        return (username, follow_btn)

    def __handle_if_error_occured(self, username, last_followed_user):
        """If the same user shows up twice in a row, Twitter displayed an
        error toast; read its text and report it."""
        if username == last_followed_user:
            try:
                error = WebDriverWait(self.driver, 2).until(EC.presence_of_element_located((By.XPATH, '//div[@data-testid="toast"]//div[contains(@class, "r-16dba41")]//span'))).text
            except NoSuchElementException:
                error = "Followers: Xpath for error text of followers is outdated"
            except Exception:  # FIX: was a bare except (also catches the wait timeout)
                error = "Followers: Unexpected error"
            self.utils.handle_error(error)

    def __wait_before_next_follow(self):
        # Randomised delay so the activity looks less bot-like.
        delay = round(random.uniform(2, 4), 3)
        print(f"Waiting {delay} seconds before next follow")
        time.sleep(delay)

    def follow(self, username_or_query, is_username=True, limit=400):
        """Follow up to `limit` accounts.

        username_or_query -- a username (its followers page is used) when
        is_username is True, otherwise a people-search query.
        """
        if is_username:
            self.utils.navigate(TwitterUrls.get_link_of_user_followers(username_or_query))
        else:
            self.utils.navigate(TwitterUrls.get_twitter_users_link(username_or_query))
        current_iteration = 1
        last_followed_user = None
        while current_iteration <= limit:
            current_iteration += 1
            follow_element = self.__find_followable_people_element()
            username, follow_btn = self.__find_username_and_follow_btn(follow_element)
            self.__handle_if_error_occured(username, last_followed_user)
            self.utils.click_js(follow_btn)
            print(f"Just followed {username}")
            last_followed_user = username
            self.__wait_before_next_follow()
def load():
    """Read the persisted global config into INFO and flag the module as loaded."""
    global LOADED, INFO
    INFO = Utils.fetchFromDisk("config/global")
    LOADED = True
from models import User, Article, Item from datetime import datetime from util import Utils utils = Utils() class UserInfo(object): def __init__(self, uid): # 用户ID self.__uid = uid # 访客数量 #self.viewCount = 0 # 留言数量 #self.msgCount = 0 # 留言列表 # 留言的评论时间有问题,一年以内(365天以内)没有年份 #self.msgList = [] # 碎碎数量(http://t.hujiang.com/u/3566854/) # 不要被显示的页数欺骗,其实有很多页,可以通过ul标签是否有li元素判断这一页是否有碎碎,没有则表示结束 #self.ingCount = 0 # 碎碎列表 #self.ingList = [] # 日志数量 #self.blogCount = 0 # 听写数量 #self.listenCount = 0 # 听写列表 #self.listenList = [] # 口语数量 #self.talkCount = 0 # 礼物数量
# NOTE(review): fragment of a Python 2 preprocessing script (print statements,
# dict.iteritems); `d`, `filebyvalues` and `filespath` are defined above this
# chunk and are not visible here.
# Write each value vector of d as one space-separated line — presumably d maps
# a key to a numeric feature vector; TODO confirm against the code above.
for k,v in d.iteritems():
    # filebyvalues.write('%s %s\n' % (str(k), ' '.join(map(str, v))))
    filebyvalues.write('%s\n' % ' '.join(map(str, v)))
    # i += 1
    # if i == 10:
    #     break
print 'done.'
print 'Alle done!'
#Normalize files and pick random distinct initial centroids
files = glob.glob(filespath + '.dat')
for filename in files:
    data = np.loadtxt(filename)
    # z-score normalise the raw matrix before running k-means
    zscoredata = Utils.zscore(data)
    # postfix is the '.dat' extension; reused for the derived file names
    postfix = filename[-4:]
    fileout = filename[:-4] + '_zscore' + postfix
    np.savetxt(fileout, zscoredata)
    k = 3 #3 features
    # pick k distinct random rows of the normalised data as initial centroids
    centroids = Utils.getInitialMeans(zscoredata, k)
    fileoutcentroids = filename[:-4] + '_zscore.centroids' + postfix
    np.savetxt(fileoutcentroids, centroids)
    print 'zscore saved: %s' % fileout