Пример #1
0
def render(categoryId):
	"""Render the HTML page for *categoryId* if the category exists.

	Looks the category up via Management; on success builds the query for
	the configured level limit, fetches the category rows and writes them
	out with the configured folder template.  Otherwise prints the lookup
	message returned by existCategory.
	"""
	management = Management()
	message = management.existCategory(categoryId)
	if message == 1:
		util = Utils('')
		# level-limit bounds the category-tree depth used by the query
		level = int(config.get('keys', 'level-limit'))
		template = config.get('template', 'folder')
		query = util.getQuery(level)
		all_rows = management.getCategory(categoryId, query)
		util.setHtml(categoryId, all_rows, level, template)
	else:
		print(message)
Пример #2
0
def __rebuild():
	"""Rebuild the category table.

	Crawls the category tree starting from the configured parent id (when
	the configured depth allows it), flattens the result and bulk-inserts
	it through Management.
	"""
	categoryList = []
	# config.get() returns a string; the depth comparison below needs an
	# int (str > int raises TypeError on Python 3).
	level_limit = int(config.get('keys', 'level-limit'))
	parent = config.get('keys', 'parent-id')
	if level_limit > 1:
		# Fetch the root category synchronously via the Request thread.
		thread = Request(parent, 1, 0)
		thread.start()
		thread.join()
		if thread.response is not None:
			categoryPartList = getCategoryPartLists(parent, thread.response)
			thread.response = None  # release the response early
			if categoryPartList:
				categoryList.extend(getFullList(categoryPartList, parent, level_limit))
	managment = Management()
	managment.createTable()
	util = Utils('')
	categories = util.listToInsert(categoryList)
	# NOTE(review): 'isertList' looks like a typo in Management's API;
	# kept as-is because the method is defined elsewhere.
	managment.isertList(categories)
Пример #3
0
                diff = 0
                for i in xrange(k):
                    diff += distance.euclidean(newCentroids[i], oldCentroids[i])
                    
                logger.debug( 'means total diff %f:' % diff)
                if diff < delta:
                    break
                else:
                    oldCentroids[:] = newCentroids
                    np.savetxt(centroidsinputfile, newCentroids)
                    
            
            logger.debug( 'total time: %f' %total)
    
            #Calculate SSE
            SSE = Utils.calcSSE(points, newCentroids)
            logger.info('%.f' % SSE)
            logger.debug( 'Sum of Squared Error: %s' % SSE)
            
        
            #per data batch
            #finds the distance to nearest cluster        
            distmatrix = distance.cdist(points, newCentroids, metric='euclidean')
            labels = distmatrix.argmin(axis=1)

        #Local
        #points = np.loadtxt(inputfile1)
#        Plot.plotPoints(points, labels, title='final kmeans2')
#        Plot.plotMeans(newCentroids)
#        strnow = datetime.now().strftime("%Y-%m-%d_%H%M%S")
#        plt.savefig('%s_%s.png' % (strnow,fileoutpostfix))
 def svm_estimation(df_gender):
     """Fit an SVC on the gender dataset split and print the test accuracy."""
     train_x, test_x, train_y, test_y = Utils.split_data(df_gender)
     model = svm.SVC(gamma='scale')
     model.fit(train_x, train_y)
     predictions = model.predict(test_x)
     print("SVM acc: ", accuracy_score(test_y, predictions))
Пример #5
0
 def onEdit(self, event):
     """Write the textual info of *event* (via util.obtainEventInfo) to the control."""
     event_info = util.obtainEventInfo(event)
     self.control.write(str(event_info))
Пример #6
0
else:
    # Fallback configuration: a single epoch with no patience.
    params.epochs = 1
    params.patience = 1

# Common hyper-parameters / output locations for the GTSRB run.
params.dropout = 0.5
params.use_pickle = True
params.save_loc = "."
params.outfile = 'gtsrb_kaggle.csv'
#params.train_pickle = params.save_loc + '/train_balanced_preprocessed.p'
params.train_pickle = params.save_loc + '/train.p'
params.extra_debug = False


from util import Utils
utils = Utils()



from model import IDSIANetwork
# In[16]:





class Trainer:
    """Bundles training hyper-parameters with the train/validation datasets."""

    def __init__(self, params, train_data=None, val_data=None):
        """Store *params* and the optional datasets on the instance."""
        self.params = params
        self.val_data = val_data
        self.train_data = train_data
Пример #7
0
from util import Utils as ut
from shann import SHANN

# Sequence-length limits for documents/summaries (inline comments note corpus averages).
max_len_doc_sents = 20  # avg 29
max_len_summ_sents = 4
max_len_doc_sent_words = 25  # avg 28
max_len_summ_sent_words = 15
padding_val = 0.
# Number of positive/negative pairs per batch.
pos_pairs = 32
neg_pairs = 32
path_models = "./chkp_models/"
name_models = "model_"
# Word2vec embeddings trained on CNN/DailyMail; d is the embedding size.
w2v = ut.load_word2vec("../../Embeddings/cnndaily_w2v.model")
d = w2v.vector_size
# Label values for similar / non-similar pairs — presumably targets for the
# similarity objective; confirm against the training code.
similar_val = 0.9999
non_similar_val = 0.0001
steps_per_epoch = 500  # 196,961 samples
epochs = 200
validation_steps = 150

shann_obj = SHANN(max_len_doc_sents, max_len_doc_sent_words,
                  max_len_summ_sents, max_len_summ_sent_words, d, path_models,
                  name_models)

shann_obj._set_model()
# NOTE(review): train_file points at dev.csv, identical to dev_file — confirm
# this is intentional (e.g. a smoke-test run) and not a copy/paste slip.
train_file = "../../Corpora/CNNDM/dev.csv"
dev_file = "../../Corpora/CNNDM/dev.csv"

x_tr, y_tr = ut.load_csv_samples(train_file)
x_dv, y_dv = ut.load_csv_samples(dev_file)
Пример #8
0
        X = Personality.normalize(df)
        y = df[df.columns[-1:]]
        reg.fit(X, y)
        pickle.dump(reg, open("resources/LinearRegression_ext_v2.sav", 'wb'))

    @staticmethod
    def normalize(df):
        """Log-scale and min-max normalise the feature columns of *df*.

        The last column (the target) is dropped; every remaining column is
        mapped through log(x + 1) and rescaled to [0, 1].  Constant columns
        yield NaNs in the rescale step and are zero-filled.
        """
        features = df.iloc[:, :-1]        # drop the target column
        features = np.log(features + 1)   # compress heavy-tailed counts
        span = features.max() - features.min()
        features = (features - features.min()) / span
        features.fillna(0, inplace=True)
        return features


if __name__ == '__main__':
    # Train a linear regression predicting the 'ext' (extraversion) score
    # from the selected emotion/LIWC feature columns.
    util = Utils()
    PERSONALITY = Personality()
    df = PERSONALITY.get_data(labels=['userid', 'ext'])
    df = df.filter(
        ['positive', 'negative', 'anger_x', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust',
         'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they', 'ipron', 'future', 'affect', 'posemo', 'negemo', 'anx',
         'incl', 'work', 'death', 'assent', 'nonfl', 'Quote', 'Apostro', 'ext'], axis=1)
    reg = linear_model.LinearRegression()

    # Same normalisation as Personality.normalize: log-scale, then min-max
    # to [0, 1], zero-filling NaNs from constant columns.
    X = df.iloc[:, 0:-1]  # independent columns
    X = np.log(X + 1)
    X = (X - X.min()) / (X.max() - X.min())
    X.fillna(0, inplace=True)

    # Target: the last column ('ext').
    y = df[df.columns[-1:]]
Пример #9
0
class ajkLoadDataAndInsert():
    """Scrapes second-hand-house listings from Anjuke (ajk) through rotating
    proxies and inserts/updates them via MySQL.

    NOTE: these are class-level attributes, shared by all instances.
    """
    city_list = []     # cities to crawl; entries carry city_name/ajk_sec_url/... keys
    user_agents = []   # user-agent strings to rotate per request
    headers = {}       # base request headers
    utils = Utils()
    list_data = []
    ips = []           # proxy pool; self.ip is the proxy currently in use
    ipIndex = 0
    ip = {}
    PROXYNAME = 'ipProxy'
    COLUMENAME = 'active_ajk_sec'

    def __init__(self):
        # Wire up per-instance logging, request headers and the DB pool.
        self.Logger = Logger('getAjkData')
        self.user_agents = Headers().user_agents
        self.headers = Headers().headers
        self.cfg = self.utils.pathToConfig()
        # Trailing 3, 5 presumably bound the connection pool size — confirm
        # against Mysql's constructor signature.
        self.mysql = Mysql(self.cfg.get('DB', 'DBHOST'),
                           int(self.cfg.get('DB', 'DBPORT')),
                           self.cfg.get('DB', 'DBUSER'),
                           self.cfg.get('DB', 'DBPWD'), 3, 5)

    def load_detail_info_sec(self):
        self.Logger.Info(u'>>>>> 开始抓取详细数据 <<<<<')
        self.ip = self.ips[0]
        for city in self.city_list:
            for page in range(0, int(city['ajk_sec_pages'])):
                city_list_url = city['ajk_sec_url'].replace(
                    '?from=navigation',
                    'p' + str(int(page) + 1) + '/#filtersort')
                self.Logger.Info(u'>>>>> 开始抓取:' + city['city_name'] + '|url:' +
                                 str(city_list_url) + '|ip:' + self.ip['ip'] +
                                 '<<<<<')
                oneCityGetDown = True
                while oneCityGetDown:
                    try:
                        self.Logger.Info(u'>>>>> 使用ip:' + str(self.ip['ip']) +
                                         '<<<<<')
                        proxies = {
                            'http': self.ip['ip'],
                            'https': self.ip['ip']
                        }
                        head = self.headers
                        head['user-agent'] = random.choice(self.user_agents)
                        r = requests.get(city_list_url,
                                         timeout=10,
                                         proxies=proxies,
                                         headers=head)
                        time.sleep(random.random() * 10)
                        soup = BeautifulSoup(r.text, "html.parser")
                        title = soup.find('title').get_text()
                        if '二手房' in title:
                            self.Logger.Info(u'>>>>> ip:' +
                                             str(self.ip['ip']) + u'可用|' +
                                             title + '<<<<<')
                            list = soup.find(attrs={
                                'id': 'houselist-mod-new'
                            }).find_all('li')
                            for l in list[0:]:
                                oneDetailGetDown = True
                                while oneDetailGetDown:
                                    house_title = l.find(attrs={
                                        'class': 'house-title'
                                    }).find('a').attrs['title'].strip()
                                    price = l.find(attrs={
                                        'class': 'price-det'
                                    }).get_text().strip()
                                    try:
                                        detail_url = l.find(
                                            attrs={
                                                'class': 'house-title'
                                            }).find('a').attrs['href']
                                        self.Logger.Info(
                                            u'>>>>> 开始抓取:' + house_title +
                                            '|' + detail_url.split('view/')
                                            [1].split('?')[0] + '|ip:' +
                                            self.ip['ip'] + u'|数据<<<<<')
                                        proxies = {
                                            'http': self.ip['ip'],
                                            'https': self.ip['ip']
                                        }
                                        head['user-agent'] = random.choice(
                                            self.user_agents)
                                        r_detail = requests.get(
                                            detail_url.split('now_time')[0],
                                            timeout=10,
                                            proxies=proxies,
                                            headers=head)
                                        time.sleep(random.random() * 20)
                                        soup_detail = BeautifulSoup(
                                            r_detail.text, "html.parser")
                                        title_detail = soup_detail.find(
                                            'title').get_text()
                                        if '58安居客' in title_detail and '访问验证' not in title_detail:
                                            try:
                                                self.Logger.Info(
                                                    u'>>>>> 开始从列表页获取详情中需要的数据|'
                                                    + title_detail + '<<<<<')
                                                detail_dict = self.get_data(
                                                    soup_detail)
                                                detail_dict['city_id'] = city[
                                                    'city_id']
                                                detail_dict[
                                                    'city_name'] = city[
                                                        'city_name']
                                                detail_dict['source'] = 'ajk'
                                                detail_dict[
                                                    'house_id'] = detail_url.split(
                                                        'view/')[1].split(
                                                            '?')[0]
                                                detail_dict[
                                                    'link_url'] = detail_url.split(
                                                        '?')[0]
                                                detail_dict[
                                                    'title'] = house_title
                                                detail_dict[
                                                    'price'] = self.utils.str_to_num(
                                                        price)
                                                oneDetailGetDown = False
                                                self.insert_update_data(
                                                    detail_dict)
                                            except BaseException, e:
                                                self.Logger.Info(
                                                    u'>>>>> 从列表页获取详情中需要的数据出错' +
                                                    str(e) + '<<<<<')
                                        elif '可能被删除' in title_detail:
                                            self.Logger.Info(u'>>>>> 该链接失效|' +
                                                             title_detail +
                                                             '<<<<<')
                                            oneDetailGetDown = False
                                        else:
                                            self.Logger.Info(
                                                u'>>>>> ip for detail:' +
                                                str(self.ip['ip']) + u'不可用|' +
                                                str(title_detail) + '<<<<<')
                                            result_ip = self.utils.get_active_ip(
                                                self.ips, self.ip, self.Logger,
                                                self.PROXYNAME, self.mysql)
                                            self.ip = result_ip['active_ip']
                                            self.ips = result_ip['ips']
                                    except BaseException, e:
                                        self.Logger.Info(
                                            u'>>>>> ip for detail:' +
                                            str(self.ip['ip']) + u'不可用,超时|' +
                                            str(e) + '<<<<<')
                                        result_ip = self.utils.get_active_ip(
                                            self.ips, self.ip, self.Logger,
                                            self.PROXYNAME, self.mysql)
                                        self.ip = result_ip['active_ip']
                                        self.ips = result_ip['ips']
                            oneCityGetDown = False
                            self.Logger.Info(u'>>>>> ========== city:' +
                                             city['city_name'] + u'第' +
                                             str(int(page) + 1) + u'页' +
                                             u'抓取完成 ========== <<<<<')
                        else:
Пример #10
0
 def onEdit(self, event):
     # Dump the event's details (gathered by util.obtainEventInfo) into the control widget.
     self.control.write(str(util.obtainEventInfo(event)))
Пример #11
0
 def __init__(self, driver, username):
     """Remember the webdriver and username; wrap the driver in a Utils helper."""
     self.username = username
     self.driver = driver
     self.utils = Utils(driver)
Пример #12
0
    def go(self):
        """Log in, upload the video and cover image, fill in the metadata
        and publish on mp.dayu.com.

        Accounts containing '@' log in through the mp.dayu.com iframe form
        (with a slider captcha); others go through the youku partner login.
        Reads: self.account, self.password, self.video_path, self.image_path,
        self.title, self.tags, self.video_type.
        Writes: self.cookies, self.msg, self.status, self.aid, self.vid.
        """
        user_type = self.account.split('@')
        self.browser.maximize_window()
        if len(user_type) > 1:
            # Email-style account: log in through the dayu iframe form.
            self.browser.get("https://mp.dayu.com/")
            time.sleep(3)
            iframe = self.browser.find_element_by_tag_name("iframe")
            self.browser.switch_to.frame(iframe)
            self.browser.find_element_by_xpath("//input[@id='login_name']").send_keys(self.account)
            self.browser.find_element_by_xpath("//input[@id='password']").send_keys(self.password)
            slide = self.browser.find_element_by_xpath("//span[@id='nc_1_n1z']")

            # Drag the slider captcha in small random steps, then wait for
            # the "verification passed" banner before submitting.
            action_chain = self.action_list(300)
            ActionChains(self.browser).click_and_hold(slide).perform()
            for action in action_chain:
                time.sleep(0.1)
                ActionChains(self.browser).move_by_offset(xoffset=action, yoffset=0).perform()
            ActionChains(self.browser).release().perform()
            WebDriverWait(self.browser, 10).until(
                EC.presence_of_element_located((By.XPATH, "//b[contains(string(), '验证通过')]")))
            self.browser.find_element_by_xpath("//input[@id='submit_btn']").click()
        else:
            # Plain account: youku partner login redirecting back to dayu.
            self.browser.get('https://account.youku.com/partnerLogin.htm?pid=20170512PLF000867&callback=https%3A%2F%2Fmp.dayu.com%2Fyt-login-callback%3Fredirect_url%3D')
            time.sleep(3)
            self.browser.find_element_by_xpath('//*[@id="YT-ytaccount"]').send_keys(self.account)
            self.browser.find_element_by_xpath('//*[@id="YT-ytpassword"]').send_keys(self.password)
            self.browser.find_element_by_xpath('//*[@id="YT-nloginSubmit"]').click()
            time.sleep(3)
        try:
            # Dismiss the post-login popup if one appears (best effort).
            self.browser.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div/button[1]').click()
            time.sleep(2)
        except Exception:
            pass

        self.browser.maximize_window()
        self.cookies = self.get_cookies()

        # Switch to the video-upload page
        time.sleep(1)
        self.browser.implicitly_wait(30)
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//a[@id='w-menu-']"))).click()
        time.sleep(2)
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//a[@data-path='/dashboard/video/write']"))).click()
        time.sleep(1)
        # ----- end -----

        try:
            # Switch to the video-upload page again (best-effort retry).
            time.sleep(1)
            self.browser.implicitly_wait(30)
            WebDriverWait(self.browser, 10).until(
                EC.presence_of_element_located((By.XPATH, "//a[@id='w-menu-']"))).click()
            time.sleep(2)
            WebDriverWait(self.browser, 10).until(
                EC.presence_of_element_located((By.XPATH, "//a[@data-path='/dashboard/video/write']"))).click()
            time.sleep(1)
            # ----- end -----
        except Exception as e:
            pass

        # Start the video upload
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='article-write_video-container-upload-local']"))).click()
        time.sleep(1)
        Utils.upload(self.video_path)
        time.sleep(2)
        try:
            # Second attempt at sending the video path (best effort).
            Utils.upload(self.video_path)
        except Exception as e:
            print(e)
            print("第二次打视频地址")
        # Abort when the remaining upload quota shown on the page is zero.
        if int(self.browser.find_element_by_xpath(
                '/html/body/div[1]/div[4]/div/div[2]/div/div/div[2]/div/div[2]/span').text) == 0:
            self.msg = "次数不足"
            return
        # ---- end -----
        # Wait (up to 3 minutes) for "upload succeeded, processing".
        WebDriverWait(self.browser, 180).until(
            EC.presence_of_element_located((By.XPATH, "//p[contains(string(), '视频上传成功,处理中')]")))
        time.sleep(2)

        # Fill in the title (backspaces clear any leftover text first).
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='w-form-field-content']/input"))).clear()
        self.browser.find_element_by_xpath("//div[@class='w-form-field-content']/input").clear()
        time.sleep(0.1)
        self.browser.find_element_by_xpath("//div[@class='w-form-field-content']/input").send_keys([Keys.BACKSPACE for i in range(1, 100)] + [i for i in self.title])
        time.sleep(0.1)
        try:
            # Scroll to the bottom of the page.
            js = "var q=document.documentElement.scrollTop=100000"
            self.browser.execute_script(js)
        except Exception as e:
            print(e)
            print("滑动至底部失败")
        # NOTE(review): the description textarea is filled with self.title as
        # well — confirm a separate description was not intended.
        self.browser.find_element_by_xpath("//div[@class='w-form-field-content']/textarea").clear()
        time.sleep(0.1)
        self.browser.find_element_by_xpath("//div[@class='w-form-field-content']/textarea").send_keys([Keys.BACKSPACE for i in range(1, 100)] + [i for i in self.title])
        time.sleep(0.1)
        self.build_tags(self.tags)

        try:
            # Scroll to the bottom again before picking the category.
            js = "var q=document.documentElement.scrollTop=100000"
            self.browser.execute_script(js)
        except Exception as e:
            print(e)
            print("滑动至底部失败")

            # Select the category
        try:
            self.browser.find_element_by_xpath("//div[@class='widgets-selects_container']").click()
        except Exception as e:
            print("选择分类失败")
        # try:
        #     self.browser.find_element_by_xpath('/html/body/div[1]/div[4]/div/div[2]/div/div/div[1]/div[7]/div/div/div[1]/i').click()
        # except Exception as e:
        #     print(e)
        #     print("第二次选择分分类失败")
        # self.browser.find_element_by_xpath("//div[@class='widgets-selects_container']").click()
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='widgets-selects_select_container']/a[contains(string(), '{}')]".format(self.video_type)))).click()
        # Upload the cover image: scroll the cover box into view and hover it.
        image_div = self.browser.find_element_by_xpath("//div[@class='article-write_box-form-coverImg']")
        self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        ActionChains(self.browser).move_to_element(image_div).perform()
        try:
            ActionChains(self.browser).move_to_element(image_div).perform()
            ActionChains(self.browser).move_to_element(image_div).perform()
            ActionChains(self.browser).move_to_element(image_div).perform()
        except Exception as e:
            print("这是去点击上传封面图片的框,")


        time.sleep(0.1)
        image_div.find_element_by_xpath("//button[contains(string(), '从本地选择')]").click()
        time.sleep(0.5)
        print(self.image_path)

        Utils.upload(self.image_path)
        print("打图片地址上去")
        try:
            # Second attempt at sending the image path (best effort).
            Utils.upload(self.image_path)
        except Exception as e:
            print("丢图片地址异常")
        time.sleep(2)

        # Save the cover image dialog once its button is enabled.
        WebDriverWait(self.browser, 30).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='article-material-image-dialog_btn']//button[contains(string(), '保存') and not(@disabled)]"))).click()
        time.sleep(2)
        # try:
        #     WebDriverWait(self.browser, 2).until(
        #         EC.presence_of_element_located((By.XPATH, "//div[@class='w-radio w-radio_checked iconfont wm-icon-yes']"))).click()
        # except TimeoutException:
        #     pass
        time.sleep(5)
        # Click "publish".
        WebDriverWait(self.browser, 30).until(
            EC.presence_of_element_located((By.XPATH, "//button[contains(string(), '发表')]"))).click()
        time.sleep(1)

        # Check whether the platform flagged the post as clickbait
        try:
            WebDriverWait(self.browser, 3).until(
                EC.text_to_be_present_in_element((By.XPATH, "/html/body/div[4]/div/div[2]/div/div[1]"), '平台')
            )
            self.msg = "标题党嫌疑,请再改改"
            return
        except Exception as e:
            print("暂无标题党嫌疑")
        try:
            WebDriverWait(self.browser, 4).until(
                EC.element_to_be_clickable((By.XPATH, '/html/body/div[4]/div/div[2]/div/div[2]/div/button[1]'))
            )
            self.msg = "标题党嫌疑,请你改下,并重新提交"
            return
        except Exception as e:
            logger.info("无标题党嫌疑")
        # ---- end ----
        # Confirm publishing
        time.sleep(2)
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//button[contains(string(), '确认发表')]"))).click()
        # ---- end ----
        # Fetch the published video's info from the preview iframes
        time.sleep(2)
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//ul[@class='w-list']/li[1]//div[@class='w-list-item-content-detail']/h3/a"))).click()
        self.status = True
        time.sleep(2)
        ahref = self.browser.find_element_by_xpath("//div[@class='contents-publish-article-preview']/iframe").get_attribute("src")
        info_frame = self.browser.find_element_by_xpath("//div[@class='contents-publish-article-preview']/iframe")
        self.browser.switch_to.frame(info_frame)
        vhref = self.browser.find_element_by_xpath("//div[@class='article-content simple-ui']//iframe").get_attribute("src")
        self.aid = self.get_aid(ahref)
        self.vid = self.get_vid(vhref)
Пример #13
0
class Home:
    """Automates liking (and optionally replying to) posts on the Twitter
    home timeline via a Selenium webdriver, skipping the bot's own posts."""

    def __init__(self, driver, username):
        self.driver = driver
        self.utils = Utils(driver)
        self.username = username  # own handle, used to skip our own posts

    def __find_post(self):
        """Scroll the timeline until a likeable post by another user appears.

        After more than two consecutive failed lookups, play an alert sound
        and ask the operator whether to keep trying or quit.
        """
        tries = 0

        while True:
            tries += 1

            try:
                post = WebDriverWait(self.driver, 5).until(
                    EC.presence_of_element_located((
                        By.XPATH,
                        f'//div[ .//div[@data-testid="like"] and contains(@style, "position: absolute") and .//div[ contains( @class, "css-bfa6kz r-1re7ezh")]//span[text() != "{self.username}" ] ]'
                    )))
            except Exception as error:
                if tries > 2:
                    playsound('./alert_sound.mp3')
                    print(error)

                    # Prompt typo fixed ("quitr" -> "quit").
                    if input("Do you want to quit? y/n").lower() == 'n':
                        tries = 0
                        continue

                    self.utils.quit()

                self.utils.scroll_to_end()
                continue
            else:
                return post

    def __find_required_elements(self, post):
        """Extract (username, like button, reply button) from *post*."""
        try:
            username = post.find_element_by_xpath(
                './/div[ contains( @class, "css-bfa6kz r-1re7ezh")]//span'
            ).text
            like_btn = post.find_element_by_xpath(
                './/div[@data-testid="like"]')
            reply_btn = post.find_element_by_xpath(
                './/div[@data-testid="reply"]')
        except Exception:  # was bare except: don't swallow KeyboardInterrupt/SystemExit
            # NOTE(review): assumes handle_error() does not return; otherwise
            # the names below would be unbound.
            self.utils.handle_error(
                "Home: username, likebtn or reply btn xpath outdated")

        return (username, like_btn, reply_btn)

    def __reply(self, reply_btn, comment):
        """Open the reply box via *reply_btn* and submit *comment*."""
        self.utils.click_js(reply_btn)
        try:
            reply_input = self.driver.find_element_by_xpath(
                '(//div[@data-testid="tweetTextarea_0"])[1]')
        except Exception:  # narrowed from bare except
            self.utils.handle_error("Home: Reply input field xpath outdated")
        else:
            reply_input.send_keys(comment, Keys.CONTROL + Keys.ENTER)

    def __get_comment(self):
        """Return a random canned comment."""
        # Placeholder-free f-strings removed; random.choice replaces the
        # manual randint indexing.
        comments = [
            "Nice post", "Awesome work", "Impressive work", "Coool"
        ]
        return random.choice(comments)

    def __handle_like_error(self, like_btn):
        """If the like did not register, surface Twitter's toast error."""
        print(like_btn.get_attribute('data-testid'))

        if like_btn.get_attribute('data-testid') != 'unlike':
            try:
                error = WebDriverWait(self.driver, 3).until(
                    EC.presence_of_element_located(
                        (By.XPATH,
                         '//div[@data-testid="toast" and @role="alert"]//span'
                         ))).text
            except NoSuchElementException:
                error = """Home: Like error occured...\nPossible Reasons:\n1: Slow internet\n2: Xpath for error text of like is outdated OR error text is not displayed by twitter"""
            except Exception:  # narrowed from bare except (catches TimeoutException too)
                error = 'Home: Unexpected error'

            self.utils.handle_error(error)

    def like_and_comment(self, like_only=True, limit=1000):
        """Like (and, unless *like_only*, reply to) up to *limit* posts."""
        self.utils.navigate(TwitterUrls.twitter_home_url)

        current_iteration = 1
        while current_iteration <= limit:
            post = self.__find_post()
            username, like_btn, reply_btn = self.__find_required_elements(post)

            print(":" * 20 + f"POST OF user {username} found" + ":" * 20)

            self.utils.click_js(like_btn)
            print("-->post liked")
            time.sleep(random.uniform(.5, .8))
            self.__handle_like_error(like_btn)

            if not like_only:
                self.__reply(reply_btn, self.__get_comment())
                print("-->replied to the post")

                delay = random.uniform(2, 4)
                print(f"Waiting {delay} seconds.")
                time.sleep(delay)

            # BUG FIX: the counter was never incremented, so *limit* was
            # never honored and the loop ran forever.
            current_iteration += 1
Пример #14
0
    def getSomeArticlesPageSoup(self, index=0, skipfail=False):
        '''
        1. Determine the total page count of the current programme.
        2. Persist the programme's info.
        3. Fetch and parse page *index* of the programme's article list.
        :param index: page to fetch; 0 means auto-increment from
                      currentPageIndex.
        :param skipfail: if True, a failed listing page is not retried on
                         the next call.
        :return: (full page url, BeautifulSoup of that page); None when the
                 programme was already visited or *index* is invalid.
        '''
        # No programme is currently being visited: pick the next one.
        if self.currentSoup is None:
            try:
                # Pop one unvisited programme anchor from the parsed listing.
                while True:
                    # pop() raises when the list is exhausted.
                    self.currentSoup = self.currentSoupList.pop()
                    itemUrl = Utils.listenHost + self.currentSoup["href"]
                    if itemUrl not in self.itemUrls:
                        self.itemUrls.add(itemUrl)
                        break
            except Exception:
                # No programmes left; see whether fromUrls has another
                # listing page we can harvest programmes from.
                self.currentSoup = None
                if self.fromUrlsIndex >= len(self.fromUrls): return None
                currentFromUrl = self.fromUrls[self.fromUrlsIndex]
                self.fromUrlsIndex += 1
                try:
                    self.currentSoupList = self.getItemsFromUrl(currentFromUrl)
                    return self.getSomeArticlesPageSoup(index=index,
                                                        skipfail=skipfail)
                except Exception:
                    self.logger.error('节目包含页面访问失败,地址: ' + currentFromUrl)
                    # After a failure, the next call retries this same page
                    # unless skipfail was requested.
                    if skipfail == False:
                        self.fromUrlsIndex -= 1
                    raise Exception

            # Switched to a new programme: reset the per-programme state.
            self.currentItemInit()
        else:
            itemUrl = Utils.listenHost + self.currentSoup["href"]

        # The programme's home page is known; pick the page to fetch.
        # index 0 means auto-increment (continue from currentPageIndex).
        index = max(0, int(index))
        index = self.currentPageIndex if index == 0 else index

        itemFullUrl = itemUrl + 'page' + str(index) + '/'

        try:
            articlesContent = requests.get(itemFullUrl, headers=Utils.headers)
            resSoup = BeautifulSoup(articlesContent.text, "lxml")
        except Exception as e:
            self.logger.error('获取某节目某页失败: ' + itemFullUrl)
            raise Exception

        # On the first successful fetch for this programme (any page), do
        # the work that was deferred:
        # 1. persist the programme info
        if self.hasBeenSaved is False:
            # The programme info is available even when index is invalid.
            try:
                self.currentItemInfo = self.getListenItemInfo(
                    resSoup, itemFullUrl)
            except Exception:
                self.logger.error('节目信息存储失败: ' + itemFullUrl)
                raise Exception

            self.hasBeenSaved = True

        # 2. determine the total page count
        if self.currentTotalPageCounts == 0:
            # If the requested index is too large the page count cannot be
            # read from the page, i.e. the index is considered invalid.
            self.currentTotalPageCounts = Utils.getPageCount(resSoup)
            # Invalid index: this fetch failed; restore currentTotalPageCounts.
            if index > self.currentTotalPageCounts:
                self.currentTotalPageCounts = 0
                return None

        # The next call visits the page after *index* (by default, the next
        # page); past the last page the programme is marked finished.
        self.currentPageIndex = index + 1
        if self.currentPageIndex > self.currentTotalPageCounts:
            self.currentSoup = None

        # Programme finished and the item limit reached: set the overflow bit.
        if self.currentSoup is None and self.getItemsSize() == self.limit:
            self.isOverLimited = True

        # Full page url (with page number) and that page's soup.
        return (itemFullUrl, resSoup)
Пример #15
0
    def go(self):
        """Log in to Toutiao, upload the video, fill in its metadata and publish.

        Side effects only: drives ``self.browser`` (Selenium), stores cookies,
        and on success sets ``self.status``, ``self.aid`` and ``self.vid``.
        """
        # --- login ---
        self.browser.get("https://sso.toutiao.com/login/?service=https://mp.toutiao.com/sso_confirm/?redirect_url=JTJG")
        self.browser.implicitly_wait(30)
        self.browser.find_element_by_xpath("//div[@id='login-type-account']").click()
        time.sleep(2)
        self.browser.find_element_by_xpath("//input[@id='user-name']").send_keys(self.account)
        self.browser.find_element_by_xpath("//input[@id='password']").send_keys(self.password)
        time.sleep(0.1)
        self.browser.find_element_by_xpath("//button[@id='bytedance-login-submit']").click()
        self.handler_slider_verify()  # solve the slider captcha
        time.sleep(2)
        self.browser.maximize_window()
        # --- login done ----
        self.cookies = self.get_cookies()

        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//span[contains(string(), '西瓜视频')]"))).click()
        self.browser.implicitly_wait(20)
        # Jump to the video upload page.
        self.browser.get("https://mp.toutiao.com/profile_v3/xigua/upload-video")
        # Video upload starts here.
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='undefined upload-handler ']"))).click()
        time.sleep(1)  # wait for the upload dialog to render
        Utils.upload(self.video_path)
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//span[contains(string(), '上传成功')]")))  # wait until the video upload finishes
        # Clear the auto-filled title, then type ours (the backspace burst
        # guards against leftover characters that clear() missed).
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='article-title-wrap-new']/input"))).clear()
        self.browser.find_element_by_xpath("//div[@class='article-title-wrap-new']/input").send_keys([Keys.BACKSPACE for i in range(1, 100)] + [i for i in self.title])
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//span[@class='tui-input-wrapper']/textarea"))).clear()
        self.browser.find_element_by_xpath("//span[@class='tui-input-wrapper']/textarea").send_keys([Keys.BACKSPACE for i in range(1, 100)] + [i for i in self.content])

        if self.is_origin:  # declare as original content?
            # WebDriverWait(Browser, 10).until(
            #     EC.presence_of_element_located((By.XPATH, "//div[@class='edit-cell-new add-origin']//div[@class='tui2-radio']/input"))).click()
            WebDriverWait(self.browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//span[@class='tui2-radio-text' and contains(string(), '声明原创')]/..//input"))).click()
            WebDriverWait(self.browser, 10).until(
                EC.presence_of_element_located((By.XPATH, "//button[contains(string(.), '确 定')]"))).click()
            WebDriverWait(self.browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//span[@class='tui2-radio-text' and contains(string(), '非独家')]/..//input"))).click()
            if self.is_first:  # first publication elsewhere? fill in the source
                WebDriverWait(self.browser, 10).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//span[@class='tui2-radio-text' and contains(string(), '非首发')]/..//input"))).click()
                WebDriverWait(self.browser, 10).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//div[@class='m-video-first']/div[@class='edit-cell-new'][1]//input"))).send_keys(self.first_url)
                WebDriverWait(self.browser, 10).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//div[@class='m-video-first']/div[@class='edit-cell-new'][2]//input"))).send_keys(self.first_platform)
                WebDriverWait(self.browser, 10).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//div[@class='m-video-first']/div[@class='edit-cell-new'][3]//input"))).send_keys(self.first_uname)
            else:
                WebDriverWait(self.browser, 10).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//span[@class='tui2-radio-text' and contains(string(), '首发')]/..//input"))).click()
        else:
            WebDriverWait(self.browser, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//span[@class='tui2-radio-text' and contains(string(), '非原创')]/..//input"))).click()
        # originality section done

        self.build_tags(self.tags)
        # self.browser.find_element_by_xpath("//div[@class='edit-cell-new video-tag show-short']//input").send_keys("gsdfg")
        # self.browser.find_element_by_xpath("//div[@class='edit-cell-new video-tag show-short']//input").send_keys(Keys.ENTER)

        # Pick the video category from the dropdown, then submit.
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[@class='Select tui-select Select--single']/div[@class='Select-control']"))).click()
        time.sleep(0.1)
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@aria-label='{}']".format(self.video_type)))).click()
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='submit btn ']"))).click()
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='m-pgc-video-manage']")))
        self.status = True
        # self.browser.close()
        # Collect the ids of the freshly published video.
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='m-articles no-count']")))  # confirm the video list loaded
        href = WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='m-articles no-count']/div[1]//a[@class='title-wrap']"))).get_attribute("href")
        self.aid = self.get_aid(href)
        self.vid = self.get_vid(href)
Пример #16
0
 def __init__(self, driver):
     """Remember the Selenium driver and build a Utils helper bound to it."""
     self.utils = Utils(driver)
     self.driver = driver
Пример #17
0
    # Python 2 fragment (print statements, xrange): for each data file, strip
    # outliers, z-score the survivors, and write the intermediate files.
    files.sort()
    for i in xrange(len(files)):
        filename = files[i]
        data = np.loadtxt(filename)
        pwo = removeoutliers(data)
        postfix = filename[-4:]
        fileout = filename[:-4] + "_wo_outliers" + postfix
        np.savetxt(os.path.basename(fileout), pwo)
        print "wo outliers saved: %s" % os.path.basename(fileout)

        # For the last file, record mean and std so z-scores can later be
        # converted back to original-scale values.
        if i == len(files) - 1:
            writeoutStdAndAvg(pwo)

        zscoredata = Utils.zscore(pwo)

        fileout = filename[:-4] + "_zscore_wo_outliers" + postfix
        np.savetxt(os.path.basename(fileout), zscoredata)
        print "wo outliers saved: %s" % os.path.basename(fileout)

        if i == 0:  # store the points from first file, for centroid generation
            print zscoredata
            firstpoints = zscoredata

        # Sample k random rows as initial k-means centroids for this file.
        k = 3  # 3 features
        initialc = np.array(random.sample(zscoredata, k))
        fileoutcentroids = filename[:-4] + "_zscore_wo_outliers.centroids" + postfix
        np.savetxt(os.path.basename(fileoutcentroids), initialc)
        # NOTE(review): this message prints `fileout`, not `fileoutcentroids`;
        # it looks like a copy-paste slip — confirm before relying on the log.
        print "wo outliers saved: %s" % os.path.basename(fileout)
    #
Пример #18
0
# Module-level experiment configuration.  NOTE(review): `params` is assumed to
# be a namespace-like object created earlier in the original script — confirm.
params.locnet = '10,10,10'
params.locnet2 = None
params.locnet3 = None
params.st = True            # spatial-transformer variant enabled
params.resume = False
params.dropout = 0.5
params.use_pickle = True
params.save_loc = "."
params.outfile = 'gtsrb_kaggle.csv'
#params.train_pickle = params.save_loc + '/train_balanced_preprocessed.p'
params.train_pickle = params.save_loc + '/train.p'
params.extra_debug = False

from util import Utils

utils = Utils()

# Module defaults; the argparse values below are parsed into `args` and do
# not overwrite these two variables.
fixedindex = 3  #3
epsilon = 0.05

import argparse

# Command-line overrides for the sample index, perturbation level and method.
parser = argparse.ArgumentParser()
parser.add_argument("--fixedindex",
                    type=int,
                    default=1,
                    help="choose the data to display")
parser.add_argument("--epsilon", type=float, default=0.0, help="perturb level")
parser.add_argument("--method", type=str, default='ERM', help="perturb level")
args = parser.parse_args()
Пример #19
0
def removeoutliers(points):
    """Return *points* with outlier rows removed.

    A row is an outlier when any of its columns has an absolute z-score
    greater than 3.

    Args:
        points: 2-D numpy array, samples (rows) x features (columns).

    Returns:
        A filtered array containing only the non-outlier rows.
    """
    # print() with a single parenthesised argument emits identical output on
    # Python 2 and 3; the original Py2-only print statements broke on Py3.
    print("total number of points: %s" % len(points))
    tmpzscoredata = Utils.zscore(points)  # z-scores used only to find outliers
    po = points[~(np.abs(tmpzscoredata) > 3).any(1)]  # keep rows with all |z| <= 3
    print("removed : %s" % (len(points) - len(po)))
    return po
Пример #20
0
    def go(self):
        """Log in to Qutoutiao, upload the video with metadata and publish it.

        Side effects only: drives ``self.browser`` (Selenium).  On failure
        paths sets ``self.msg`` and returns early; on success sets
        ``self.status = True``.
        """
        self.browser.get("https://mp.qutoutiao.net/login")
        # --- login ---
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//i[@class='login-icon']/following-sibling::input[1]"))).send_keys(self.account)
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//i[@class='pwd-icon']/following-sibling::input[1]"))).send_keys(
            self.password)
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//button[@id='submit-login']"))).click()
        # --- login done ---
        time.sleep(2)
        self.browser.maximize_window()

        # Navigate to the publish section.
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//span[contains(string(), '发布内容')]"))).click()
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//p[contains(string(), '发布视频')]"))).click()
        # If a phone verification-code dialog pops up we cannot proceed
        # automatically; record the reason and bail out.
        try:
            WebDriverWait(self.browser, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="app"]/div[2]/div[2]/div/div[2]/div/div[2]/div/div[3]/div/div[3]/span/button[1]/span'))
            )
            self.msg = '需要手机验证码,请延时提交'
            return
        except Exception:
            pass
        # *** dismiss the publishing-guidelines popup ***
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//i[@class='el-message-box__close el-icon-close']"))).click()
        time.sleep(1)
        # navigation done

        # Start uploading the video.
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//input[@id='inp-video-file']"))).click()
        time.sleep(1)  # wait for the upload dialog to render
        Utils.upload(self.video_path)
        WebDriverWait(self.browser, 100).until(
            EC.presence_of_element_located((By.XPATH, "//span[contains(string(), '上传成功')]")))
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//form[@class='el-form el-form--label-left']/div[1]//input"))).send_keys(
            [Keys.BACKSPACE for i in range(1, 100)] + [i for i in self.title])  # send the title
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//textarea[@class='el-textarea__inner']"))).send_keys(
            self.content)  # send the description
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//input[@placeholder='请选择分类']"))).click()
        time.sleep(1)  # wait for the category list to load
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//dd[contains(string(), '{}')]".format(self.video_type)))).click()
        time.sleep(1)
        self.build_tag(self.tags)
        time.sleep(0.5)
        # -- choose the cover image --
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='el-upload']"))).click()
        time.sleep(1)  # wait for the cover dialog to render
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//p[contains(string(), '自定义封面')]"))).click()
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//span[contains(string(), '选择图片')]"))).click()
        time.sleep(1)  # wait for the upload dialog to render
        Utils.upload(self.image_path)
        time.sleep(3)  # wait for the image upload to finish  # TODO replace fixed sleep with an explicit wait
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located((By.XPATH,
                                            "//div[@class='el-dialog__wrapper dialog-img-cropper']//span[contains(string(), '确 定')]")))
        time.sleep(1)
        self.browser.find_element_by_xpath("//div[@class='el-dialog__wrapper dialog-img-cropper']//span[contains(string(), '确 定')]").click()
        # cover selection done
        # video metadata complete
        time.sleep(3)
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//button[@class='el-button el-button--primary']//span[contains(string(), '发布')]"))).click()
        # If the site rejects the submission it shows a message box; capture
        # the text as the failure reason and return.
        try:
            WebDriverWait(self.browser, 3).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='el-message-box__message']")))
            time.sleep(2)
            self.msg = self.browser.find_element_by_xpath("//div[@class='el-message-box__message']").text
            if self.msg:
                return
        except TimeoutException:
            pass
        WebDriverWait(self.browser, 10).until(EC.presence_of_element_located((By.XPATH, "//button[@class='el-button el-button--primary el-button--medium']//span[contains(string(), '确认发布')]")))   # TODO
        time.sleep(1)
        self.browser.find_element_by_xpath("//button[@class='el-button el-button--primary el-button--medium']//span[contains(string(), '确认发布')]").click()
        time.sleep(2)
        self.status = True
Пример #21
0
def connect_to_wifi(device_serial: str, wifi_name, wifi_password):
    """Connect the Android device *device_serial* to the wifi *wifi_name*.

    Drives the stock Settings app through uiautomator2 plus adb shell
    commands.  Prints progress; returns None whether or not the connection
    ultimately succeeds.

    Args:
        device_serial: adb serial of the target device.
        wifi_name: SSID to connect to.
        wifi_password: password typed into the connect dialog when shown.
    """
    print("the device " + device_serial + " try to connect the wifi " +
          wifi_name)
    # android.settings.WIFI_SETTINGS
    Utils.switch_on_wifi(device_serial)
    uiAuto = uiautomator2.connect_usb(serial=device_serial)
    # Even when launched via `am start -a android.settings.WIFI_SETTINGS`,
    # Settings may open on another screen (e.g. the currently connected
    # network's detail page), so kill the process first for a clean entry.
    # Wait 20 s afterwards: auto-(re)connection can take a while.
    uiAuto.app_stop("com.android.settings")
    time.sleep(2)
    wifi_intent = "android.settings.WIFI_SETTINGS"
    start_wifi_activity = "adb -s " + device_serial + " shell am start -a " + wifi_intent
    subprocess.getstatusoutput(start_wifi_activity)
    time.sleep(20)

    # Scroll the wifi list up to three times looking for the target SSID
    # (replaces the original triple-nested if/swipe copies — same behavior).
    for _ in range(3):
        if uiAuto(text=wifi_name).exists:
            break
        uiAuto.swipe(300, 900, 300, 200)

    if not uiAuto(text=wifi_name).exists:
        print("finally we have not found the wifi: " + wifi_name +
              ", for device: " + device_serial)
        return

    uiAuto(text=wifi_name).click()

    # If the device already knows this network, send_keys can raise because
    # no password dialog appears; only type when the dialog is showing.
    if uiAuto(text="Advanced options").exists() or uiAuto(
            text="高级选项").exists():
        try:
            uiAuto.send_keys(wifi_password, clear=True)
            # The connect-button label varies by ROM and locale; click the
            # first one present (same precedence as the original chain).
            for label in ("连接", "Connect", "CONNECT"):
                if uiAuto(text=label).exists():
                    uiAuto(text=label).click()
                    break
        except UiObjectNotFoundError as error:
            print(error)
            if uiAuto(text="Frequency").exists():
                print("the device had been connected to the wifi " + wifi_name)

    # Give the device a few seconds so the wifi info can be dumped.
    time.sleep(5)

    device_wifi_name = Utils.get_wifi_ssid(device_serial)
    # BUG FIX: was `device_wifi_name.__eq__(wifi_name)`.  str.__eq__ returns
    # NotImplemented (truthy) when get_wifi_ssid yields a non-str (e.g. None),
    # which would falsely report success.  Plain `==` is correct.
    if device_wifi_name == wifi_name:
        print("hi, the device switch to " + wifi_name + ", successfully! ")
    time.sleep(5)
Пример #22
0
	def __init__(self):
		"""Load the config via Utils and open a small MySQL connection pool."""
		utils = Utils()
		self.cfg = utils.pathToConfig()
		# Connection pool (min 3, max 5) built from the [DB] config section.
		self.mysql = Mysql(self.cfg.get('DB', 'DBHOST'), int(self.cfg.get('DB', 'DBPORT')), self.cfg.get('DB', 'DBUSER'), self.cfg.get('DB', 'DBPWD'), 3, 5)
		print('start get ip')
Пример #23
0
def save_to_disc():
    """Persist the module-level ``known_messages`` cache via Utils.saveToDisk."""
    Utils.saveToDisk("known_messages", known_messages)
Пример #24
0
        print(
            "    Arg2: ProjectKey en SonarQube correspondiente al elemento a analizar"
        )
        sys.exit(0)

    # Positional CLI arguments: properties file path and SonarQube project key.
    #myPropertiesFile = "D://Alfonso//AAProyectos//qamera_python//application.properties"
    myPropertiesFile = sys.argv[1]
    myProjectKey = sys.argv[2]
    print("Parametros de ejecucion:")
    print("Fichero de propiedades:", myPropertiesFile)
    print("Parametros de ejecucion:", myProjectKey)
    print("...")

    ### Read Program Properties from configuration file
    myprops = {}
    myprops = Utils.loadPropertiesFile(myPropertiesFile)

    ### Read Quality Model from configuration file
    myQualityModel = QualityModel(myPropertiesFile)
    print("Quality Model:", myQualityModel.max_violations_rate)

    ### Connect to SonarQube
    mySonarqubeConector = SonarqubeConector(myprops["sonarURL"],
                                            myprops["sonarUser"],
                                            myprops["sonarCredentials"])
    ### Get ProjectId From SonarQube
    myProjectId = mySonarqubeConector.loadProjectIdFromSonar(myProjectKey)
    ### Get Metrics from SonarQube
    # `metrics` is assumed to be a comma-separated list of metric keys from
    # the properties file — TODO confirm against loadPropertiesFile.
    metricList = myprops["metrics"]
    qualityCodeResults = mySonarqubeConector.loadProjectMetricsFromSonar(
        myProjectKey, metricList)
Пример #25
0
./chkp_models//model_-00082-0.00764.hdf5
"""

# Sequence-length hyperparameters for the hierarchical attention model.
max_len_doc_sents = 20 # avg 27 # MEJOR 10
max_len_summ_sents = 4
max_len_doc_sent_words = 25 # avg 29 # MEJOR 25
max_len_summ_sent_words = 15
padding_val = 0.
pos_pairs = 32
neg_pairs = 32
path_models = "./best_model/"
name_models = "model_"
output_file_sents = "./cnndaily_shann_3_sents.out"
output_file_words = "./cnndaily_shann_3_words.out"
path_weights = "./best_model//model_-00082-0.00764.hdf5"
# NOTE(review): `ut` is assumed to be the project utils module imported
# earlier in the original file — confirm.
w2v = ut.load_word2vec("../../Embeddings/cnndaily_w2v.model")
d = w2v.vector_size
topk_sentences = 3

# Build the SHANN model, load pretrained weights, and create the decoder
# that extracts the top-k attended sentences per document.
shann_obj = SHANN(max_len_doc_sents, max_len_doc_sent_words,
		  max_len_summ_sents, max_len_summ_sent_words,
		  d, path_models, name_models)

shann_obj._set_model()
shann_obj.load_weights(path_weights)

decoder = Decoder(max_len_doc_sents, max_len_doc_sent_words,
                  w2v, d, shann_obj.get_all_att_model(),
                  topk_sentences=topk_sentences)

test_file = "../../Corpora/CNNDM/test.csv"
Пример #26
0
class ipProxy():
    """Python 2 proxy harvester: scrapes free proxies, checks them, stores them.

    NOTE(review): the class attributes below are shared across all instances
    (including the eagerly constructed Utils()) — confirm that is intended.
    """

    user_agents = []
    headers = {}
    utils = Utils()

    def __init__(self):
        """Set up logging, request headers and a MySQL connection pool."""
        self.Loggers = Logger('ipProxy')
        self.user_agents = Headers().user_agents
        self.headers = Headers().headers
        self.cfg = self.utils.pathToConfig()
        self.mysql = Mysql(self.cfg.get('DB', 'DBHOST'),
                           int(self.cfg.get('DB', 'DBPORT')),
                           self.cfg.get('DB', 'DBUSER'),
                           self.cfg.get('DB', 'DBPWD'), 3, 5)

    def get_ip_from_xici(self):
        """Endless loop: scrape xicidaili.com, validate each proxy, store hits.

        Runs forever (``while 1 == 1``) with a 10 s pause per round; all
        failures are logged and swallowed so the loop keeps going.
        """
        Loggers = Logger(special_log_file='getProxyXiCi')
        while 1 == 1:
            try:
                avalibleIpsOneWeb = []
                startGetIpTime = time.time()
                startGetIpTimeFormat = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
                title = u'西祠代理'
                Loggers.Info('>>>>> ' + startGetIpTimeFormat + '|' + title +
                             u'|开始抓取ip <<<<<')
                url = 'http://www.xicidaili.com/nn/'
                head = self.headers
                head['user-agent'] = random.choice(self.user_agents)
                try:
                    Loggers.Info('>>>>> ' + title + u'|开始请求url ' + url +
                                 ' <<<<<')
                    r = requests.get(url, timeout=10, headers=head)
                    soup = BeautifulSoup(r.text, "html.parser")
                    # Walk every table cell; an IP-looking cell starts an
                    # entry, a port-looking cell completes it as "ip:port".
                    list = soup.find('table', attrs={
                        'id': 'ip_list'
                    }).find_all('td')
                    strText = ''
                    ips = []
                    for l in list:
                        content = l.get_text().strip()
                        if re.match(r'^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$',
                                    content):
                            strText = content
                        if re.match(
                                r'^([0-9]|[1-9]\d{1,3}|[1-5]\d{4}|6[0-4]\d{4}|65[0-4]\d{2}|655[0-2]\d|6553[0-5])$',
                                content):
                            strText = strText + ':' + content
                            ips.append(strText)
                    endGetIpTime = time.time()
                    endGetIpTimeFormat = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
                    Loggers.Info('>>>>> ' + endGetIpTimeFormat + '|' + title +
                                 u'|结束抓取ip,共抓取' + str(len(ips)) + '条 <<<<<')
                    Loggers.Info('>>>>> ' + endGetIpTimeFormat + '|' + title +
                                 u'|开始检查ip是否可用,抓取共耗时' +
                                 str(endGetIpTime - startGetIpTime) + ' <<<<<')

                    # Probe each candidate proxy and keep the working ones
                    # together with their response time.
                    for ip in ips:
                        Loggers.Info(u'>>>>> 开始检查ip:' + str(ip) + ' <<<<<')
                        start = time.time()
                        if self.utils.checkIpForAJK(ip):
                            end = time.time()
                            avalibleIpsOneWeb.append({
                                'source': 'xici',
                                'ip': ip,
                                'time': str(end - start)
                            })
                            Loggers.Info('>>>>> ip:' + str(ip) + u' 可用<<<<<')
                        else:
                            Loggers.Info('>>>>> ip:' + str(ip) + u' 不可用<<<<<')
                    endCheckIpTime = time.time()
                    endCheckIpTimeFormat = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
                    Loggers.Info('>>>>> ' + endCheckIpTimeFormat + '|' +
                                 title + u'|结束检查ip是否可用,检查共耗时' +
                                 str(endCheckIpTime - endGetIpTime) + ' <<<<<')
                    Loggers.Info('>>>>> ' + title + u'|成功率:' +
                                 str(len(avalibleIpsOneWeb)) + '-' +
                                 str(len(ips)) + ' <<<<<')
                    Loggers.Info('>>>>> ' + endCheckIpTimeFormat + '|' +
                                 title + u'|结束,抓取到' +
                                 str(len(avalibleIpsOneWeb)) + u'条可用ip,共耗时' +
                                 str(endCheckIpTime - startGetIpTime) +
                                 ' <<<<<')
                    # self.avalibleIps.append(avalibleIpsOneWeb)
                    self.insert_data(Loggers, avalibleIpsOneWeb)
                except BaseException, e:
                    Loggers.Error(u'>>>>> 请求url出错 ' + str(e) + '<<<<<')
            except BaseException, e:
                Loggers.Error(u'>>>>> 抓取ip循环出错 ' + str(e) + '<<<<<')
            time.sleep(10)
Пример #27
0
def getCategoryPartLists(parent, response):
	"""Parse the XML *response* and return the category list under *parent*.

	Args:
		parent: id of the parent category node.
		response: raw XML payload previously fetched for that parent.

	Returns:
		list: categories extracted by Utils.getCategoryList.
	"""
	util = Utils(response)
	root = util.getStringXml()
	# The original assigned an empty list that was immediately overwritten
	# (dead store); return the parsed list directly instead.
	return util.getCategoryList(root, parent)
Пример #28
0
def load_from_disc():
    """Restore the module-level ``known_messages`` cache from disk."""
    global known_messages
    known_messages = Utils.fetchFromDisk("known_messages")
Пример #29
0
    def to_json(config, array):
        """Aggregate start/stop event pairs into one duration JSON document.

        Python 2 only (uses tuple-unpacking lambdas and list-returning map()).

        Args:
            config: config object with a ``property(key, default)`` accessor.
            array: pair of (package id, mapping whose values are
                (timestamp, 'start'|'stop') tuples) — assumed ordered by
                time; TODO confirm against the caller.

        Returns:
            dict ready for indexing, or None when no usable durations remain.
        """
        id_field = config.property('result.id_field', 'id')
        value_field = config.property('result.value_field', 'value')

        # Durations are computed in seconds, then converted to the configured
        # output unit via the matching lambda (identity if unit unknown).
        value_unit = config.property('result.value_unit', 's')
        time_functions = {
            'ns': lambda x: x * 1000000000,
            'nanosecond': lambda x: x * 1000000000,
            'mcs': lambda x: x * 1000000,
            'microsecond': lambda x: x * 1000000,
            'ms': lambda x: x * 1000,
            'millisecond': lambda x: x * 1000,
            'm': lambda x: x / 60,
            'minute': lambda x: x / 60,
            'h': lambda x: x / 3600,
            'hour': lambda x: x / 3600,
            'd': lambda x: x / 86400,
            'day': lambda x: x / 86400
        }
        time_function = time_functions[value_unit] \
            if time_functions.__contains__(value_unit) \
            else lambda x: x

        expired_time = config.property('expire.delay')
        event_time_expire = config.property('expire.event_time', True)
        expired_behavior = config.property('expire.behavior', 'fail')

        # Flag to fail package
        failed = False

        # A trailing 'start' means the last interval never closed; either
        # synthesize a stop after `expired_time` ('keep') or mark failed.
        package, values = (array[0], array[1].values())
        if values[-1][1] == 'start':
            if expired_behavior == 'keep':
                from util import Utils
                values += [(Utils.datetime_add_seconds(values[-1][0], expired_time), 'stop')]
            elif expired_behavior == 'fail':
                failed = True

        # Pair starts with stops positionally and take the interval lengths.
        start_values = [value[0] for value in values if value[1] == 'start']
        stop_values = [value[0] for value in values if value[1] == 'stop']
        durations = map(lambda (start, stop): (stop - start).total_seconds(),
                        zip(start_values, stop_values))

        if len(durations) == 0:
            return None

        json = {
            '@timestamp':
                stop_values[-1].isoformat() if len(stop_values) > 0
                else start_values[-1].isoformat(),
            value_field: time_function(sum(durations) / len(durations)),
            id_field: package
        }

        if expired_behavior == 'fail':
            json['failed'] = 'true' if failed else 'false'

        # When event-time expiry applies, average only the intervals that
        # finished within the expiry window; recompute the failed flag.
        if event_time_expire and expired_behavior != 'keep':
            expired_durations = [duration for duration in durations if duration > expired_time]
            unexpired_durations = [duration for duration in durations if duration <= expired_time]

            if len(unexpired_durations) == 0:
                return None

            json[value_field] = time_function(sum(unexpired_durations) / len(unexpired_durations))
            json['failed'] = 'true' if len(expired_durations) > 0 \
                                       and expired_behavior == 'fail' else 'false'

        # Merge any statically configured fields into the document.
        static_fields = config.property('result.static_fields')
        if static_fields is not None:
            json.update(static_fields)

        return json
Пример #30
0
    def getUserInfo(self, uid, frequentAdd=1, frequentReduce=2):
        """Fetch, parse and persist the profile page of user *uid*.

        Three outcomes are handled: private profiles (partial info recovered
        through a JSONP endpoint), rate-limit redirects (the uid is queued
        for retry and a backoff counter raised), and normal public profile
        pages (parsed with BeautifulSoup and saved).

        Args:
            uid: user id string.
            frequentAdd: increment applied to the rate-limit backoff counter.
            frequentReduce: decrement applied after a successful request.

        Returns:
            ListenUser on success, None on every failure path.
        """
        full_url = Utils.userHost + '/u/' + uid + '/'
        user = ListenUser(full_url)

        try:
            content = self.session.get(full_url,
                                       headers=Utils.headers,
                                       allow_redirects=False)
        except Exception:
            self.logger.error('获取用户页面失败: ' + full_url)
            return None

        # Some users set their profile private; the site then 302-redirects
        # to an error page instead of serving the profile.
        if content.status_code != 200:
            self.logger.warning(full_url + ': redirect ' +
                                str(content.status_code))
            responseText = urlparse(unquote(
                content.headers['Location'])).query.split('=', maxsplit=1)[1]
            self.logger.warning('提示: ' + responseText)

            if responseText[0:2] == '用户':
                # Private profile: fall back to the GetUserFace JSONP endpoint,
                # which still exposes partial info, e.g. (URI-decoded):
                # http://bulo.hujiang.com/service/GetUserFace.ashx?ver=2016/11/23 下午8:29:28&userId=5326257&callback=jQuery17202552129787287378_1479904148908&_=1479904168992
                queryParams = {}
                nt = datetime.datetime.now()
                currHour = int(datetime.datetime.now().hour)
                # The `ver` parameter embeds a Chinese AM/PM marker.
                verTime = nt.strftime('%Y/%m/%d {half}%I:%M:%S').format(
                    half=('上午' if currHour < 12 else '下午'))
                queryParams['ver'] = verTime
                queryParams['userId'] = uid
                timeStamp = ''.join(str(time.time()).split('.'))[:13]
                queryParams['callback'] = 'jQuery17202552129787287378_' + str(
                    timeStamp)
                queryParams['_'] = str(timeStamp)
                info_url = Utils.urlCreate(
                    Utils.userHost + '/service/GetUserFace.ashx?', queryParams)

                try:
                    infoRespond = requests.get(info_url, headers=Utils.headers)
                except Exception:
                    self.logger.error('隐私用户信息获取失败: ' + uid)
                    return None

                # Strip the JSONP wrapper "callback(...)" to get the JSON body.
                infoRespondStr = infoRespond.text.split('(',
                                                        maxsplit=1)[1][:-1]
                infoRespondJson = json.loads(infoRespondStr)

                user.name = infoRespondJson[
                    'UserName'] if 'UserName' in infoRespondJson else ''
                user.nickName = infoRespondJson['NickName'][
                    1:-1] if 'NickName' in infoRespondJson else ''
                user.signature = infoRespondJson[
                    'UserSign'] if 'UserSign' in infoRespondJson else ''
                user.city = infoRespondJson[
                    'city'] if 'city' in infoRespondJson else ''
                user.signinLast = infoRespondJson[
                    'PunchCount'] if 'PunchCount' in infoRespondJson else ''
                # BUG FIX: the guard checked 'PunchCount' (copy-paste from the
                # line above) while reading 'Gender', which raised KeyError
                # whenever PunchCount was present but Gender was not.
                gender = infoRespondJson[
                    'Gender'] if 'Gender' in infoRespondJson else ''
                if gender == '1' or gender == '0':
                    user.gender = '男' if gender == '1' else '女'

                try:
                    user.save(self.mysql_session)
                except Exception:
                    self.logger.error('存储隐私用户信息失败')
                    raise Exception

                # Private profile handled: count it and mark the uid as seen.
                self.privateUids += 1
                self.userUids.add(uid)
                if self.tooFrequent > 0:
                    self.tooFrequent -= frequentReduce
                self.tooFrequent = 0 if self.tooFrequent < 0 else self.tooFrequent

                self.logger.debug(user)
                return user
            else:
                # Rate-limited: track consecutive failures for this uid.
                if self.lastUserVisitInfo[0] == uid:
                    self.lastUserVisitInfo[1] += 1
                else:
                    self.lastUserVisitInfo[0] = uid
                    self.lastUserVisitInfo[1] = 0

                # Under the retry budget: requeue with priority instead of
                # recording it as visited; otherwise record it as failed.
                if self.lastUserVisitInfo[1] <= self.failedToVisitCountLimit:
                    self.appendUidPriority(uid)
                else:
                    # Remember the failed uid...
                    self.failedToVisit.append(uid)
                    # ...and prevent further visits.
                    self.userUids.add(uid)

                # First rate-limit hit grows the backoff fast, later ones slowly.
                if self.tooFrequent == 0:
                    self.tooFrequent = 4
                else:
                    self.tooFrequent += frequentAdd

                return None

        # Successful response: relax the backoff counter.
        if self.tooFrequent > 0:
            self.tooFrequent -= frequentReduce
        self.tooFrequent = 0 if self.tooFrequent < 0 else self.tooFrequent

        try:
            soup = BeautifulSoup(content.text, "lxml")
        except Exception:
            self.logger.error('解析用户页面失败: ' + full_url)
            return None

        # Activity counters block.
        countList = soup.find(attrs={'id': 'LeftCnt_divUserCount'})

        if countList:
            # visitor count
            viewCount = countList.find(attrs={'id': 'li_viewCount'})
            if viewCount and len(viewCount) != 0:
                user.viewCount = viewCount.string

            # message count
            msgCount = countList.find(attrs={'id': 'li_msgCount'})
            if msgCount and len(msgCount) != 0:
                user.msgCount = msgCount.find('a').string

            # micro-post count
            ingCount = countList.find(attrs={'id': 'li_ingCount'})
            if ingCount and len(ingCount) != 0:
                user.ingCount = ingCount.find('a').string

            # blog count
            blogCount = countList.find(attrs={'id': 'li_blogCount'})
            if blogCount and len(blogCount) != 0:
                user.blogCount = blogCount.find('a').string

            # dictation count
            listenCount = countList.find(attrs={'id': 'li_listenCount'})
            if listenCount and len(listenCount) != 0:
                user.listenCount = listenCount.find('a').string

            # spoken-practice count
            talkCount = countList.find(attrs={'id': 'li_talkCount'})
            if talkCount and len(talkCount) != 0:
                user.talkCount = talkCount.find('a').string

            # gift count
            giftCount = countList.find(attrs={'id': 'li_giftCount'})
            if giftCount and len(giftCount) != 0:
                user.giftCount = giftCount.find('a').string

        # Personal profile list.
        profileList = soup.find(id='u_profile').find('ul')

        if profileList:
            for child in profileList.children:

                if child.name != 'li':
                    continue

                # Each <li> carries one labelled field; match on the label.
                text = child.get_text(strip=True)
                if re.compile(r'性别').search(text):
                    user.gender = child.find_all('span')[1].string

                if re.compile(r'城市').search(text):
                    user.city = child.find_all('span')[1].string

                if re.compile(r'昵称').search(text):
                    child.span.replace_with('')
                    user.nickName = child.get_text(strip=True)

                if re.compile(r'签名').search(text):
                    child.span.replace_with('')
                    user.signature = child.get_text(strip=True)

                if re.compile(r'沪龄').search(text):
                    # user.yearLast = child.find_all('span')[1].string
                    user.registDate = child.find_all('span')[1]['title'][5:]

                if re.compile(r'打卡').search(text):
                    child.span.replace_with('')
                    user.signinLast = int(child.get_text(strip=True)[0:-1])

                if re.compile(r'登录').search(text):
                    user.lastSignin = child.find_all('span')[1].string

        # Self-introduction sits in the sibling just before the report link.
        selfIntroPre = soup.find(id='user_Profile_span_reportIt')
        selfIntro = None

        if selfIntroPre:
            selfIntro = selfIntroPre.find_previous_sibling()

        if selfIntro and selfIntro.name == 'div':
            user.selfIntroduction = selfIntro.get_text(strip=True)

        # The city node is inside an HTML comment, invisible to bs4, so pull
        # it straight out of the raw HTML with a regex.
        cityMatch = re.compile(
            r'<li id="user_Profile_span_city.*?<span>(.*?)</span></li>',
            re.S).search(content.text)
        if cityMatch:
            user.city = cityMatch.group(1)

        # User name: strip the link and badge, then the page-title suffix.
        userNameHtml = soup.find(id='cont_h1')
        userNameHtml.a.replace_with('')
        userNameHtml.span.replace_with('')
        user.name = userNameHtml.get_text(strip=True)[0:-5].strip()

        try:
            user.save(self.mysql_session)
        except Exception:
            self.logger.error('存储用户信息失败')
            raise Exception

        self.userUids.add(uid)
        self.logger.debug(user)
        return user
Пример #31
0
 def __init__(self, driver):
     """Keep the Selenium driver plus the shared Twitter and Utils helpers."""
     self.driver = driver
     self.twitter_common = TwitterCommon(driver)
     self.utils = Utils(driver)
 def sgd_classify(df_gender):
     """Train an SGD hinge-loss (linear-SVM) classifier and print test accuracy.

     NOTE(review): defined at method indentation but without `self`; calling
     it on an instance would fail — confirm it is used as a plain function.
     """
     X_train, X_test, y_train, y_test = Utils.split_data(df_gender)
     clf = SGDClassifier(loss="hinge", penalty="l2")
     clf.fit(X_train, y_train)
     y_pred = clf.predict(X_test)
     print("sgd acc: ", accuracy_score(y_test, y_pred))
Пример #33
0
            print(station + '|' + str(len(data['results'])))
            Utils.write_csv('statistics.csv',
                            [station, str(len(data['results']))], 'a+')
            if not os.path.exists(path):
                os.mkdir(path)
            Utils.write_file(
                path + '/' + station + '.txt',
                json.dumps(data['results'], ensure_ascii=False, indent=4),
                'a+')
        else:
            # No hits for this station: record it in a per-category "missing" CSV.
            Utils.write_csv('no_place_' + path + '.csv', [station], 'a+')

    def get_bus_place(self, station):
        """Query coach/bus stops around `station` into the 'bus' output dir."""
        self.__get_place(self.__REGION, station, '长途汽车站,公交车站', 'bus')

    def get_railway_place(self, station):
        """Query railway stations around `station` into the 'railway' output dir."""
        self.__get_place(self.__REGION, station, '火车站', 'railway')


if __name__ == '__main__':
    api = LbsApi()

    # bus_config = 'config/busStationNoGroup.txt'
    # stations = Utils.get_txt_config(bus_config)
    # Utils.async_task(api.get_bus_place, stations)

    # Fetch railway-station places for every configured station, in parallel.
    railway_config = '../config/trainStationNoGroup.txt'
    stations = Utils.get_txt_config(railway_config)
    Utils.async_task(api.get_railway_place, stations)
    # merge result files (left as a TODO in the original)
Пример #34
0
class Followers:
    """Automates mass-following accounts on Twitter through a Selenium driver.

    Fixes over the original: the two bare ``except:`` clauses are narrowed to
    ``except Exception:`` so KeyboardInterrupt/SystemExit are no longer
    swallowed, and the inconsistent 12-space indentation inside
    ``__find_followable_people_element`` is normalised.
    """

    def __init__(self, driver):
        self.driver = driver
        self.utils = Utils(driver)

    def __find_followable_people_element(self):
        """Return the next visible people-card that carries a "-follow" button.

        Scrolls the page and retries until one appears; after two failed
        waits the error is forwarded to utils.handle_error.
        """
        tries = 0

        while True:
            tries += 1
            try:
                follow_element = WebDriverWait(self.driver, 2).until(EC.presence_of_element_located((By.XPATH, '//div[contains(@style,"position: absolute;") and .//div[contains(@data-testid, "-follow")]]')))
            except Exception as error:
                # No card located yet: report after repeated misses,
                # otherwise scroll further down and try again.
                if tries > 2:
                    self.utils.handle_error(error)

                self.utils.scroll_to_end()
                continue
            else:
                return follow_element

    def __find_username_and_follow_btn(self, follow_element):
        """Extract the (username, follow_button) pair from one people card."""
        try:
            username = follow_element.find_element_by_xpath('.//div[contains(@class, "r-1re7ezh r-18u37iz")]/span').text
            follow_btn = follow_element.find_element_by_xpath('.//div[contains(@data-testid, "-follow")]')
        except Exception:  # was a bare except: keep KeyboardInterrupt/SystemExit propagating
            # NOTE(review): assumes handle_error raises or exits; if it ever
            # returns, the return below hits unbound locals -- confirm.
            self.utils.handle_error("Followers: username or follow_btn xpath outdated")

        return (username, follow_btn)

    def __handle_if_error_occured(self, username, last_followed_user):
        """If the same user shows up twice in a row, read Twitter's toast error and abort."""
        if username == last_followed_user:
            try:
                error = WebDriverWait(self.driver, 2).until(EC.presence_of_element_located((By.XPATH, '//div[@data-testid="toast"]//div[contains(@class, "r-16dba41")]//span'))).text
            except NoSuchElementException:
                # NOTE(review): WebDriverWait timeouts raise TimeoutException,
                # not NoSuchElementException, so this branch may be dead --
                # confirm against the selenium docs.
                error = "Followers: Xpath for error text of followers is outdated"
            except Exception:  # was a bare except
                error = "Followers: Unexpected error"

            self.utils.handle_error(error)

    def __wait_before_next_follow(self):
        """Sleep a random 2-4 seconds so the follow cadence looks human."""
        delay = round(random.uniform(2, 4), 3)
        print(f"Waiting {delay} seconds before next follow")
        time.sleep(delay)

    def follow(self, username_or_query, is_username=True, limit=400):
        """Follow up to *limit* accounts.

        When is_username is True, follows the followers of that user;
        otherwise follows accounts matching the search query.
        """
        if is_username:
            self.utils.navigate(TwitterUrls.get_link_of_user_followers(username_or_query))
        else:
            self.utils.navigate(TwitterUrls.get_twitter_users_link(username_or_query))

        current_iteration = 1
        last_followed_user = None

        # Incrementing before the work keeps the original semantics:
        # exactly `limit` follow attempts are made.
        while current_iteration <= limit:
            current_iteration += 1

            follow_element = self.__find_followable_people_element()
            username, follow_btn = self.__find_username_and_follow_btn(follow_element)

            self.__handle_if_error_occured(username, last_followed_user)
            self.utils.click_js(follow_btn)

            print(f"Just followed {username}")
            last_followed_user = username

            self.__wait_before_next_follow()
Пример #35
0
def load():
    """Read the global config from disk into INFO and flag it as loaded."""
    global INFO, LOADED
    INFO = Utils.fetchFromDisk("config/global")
    LOADED = True
Пример #36
0
from models import User, Article, Item
from datetime import datetime
from util import Utils

utils = Utils()

class UserInfo(object):
    def __init__(self, uid):
        # 用户ID
        self.__uid = uid
        # 访客数量
        #self.viewCount = 0
        # 留言数量
        #self.msgCount = 0
        # 留言列表
        # 留言的评论时间有问题,一年以内(365天以内)没有年份
        #self.msgList = []
        # 碎碎数量(http://t.hujiang.com/u/3566854/)
        # 不要被显示的页数欺骗,其实有很多页,可以通过ul标签是否有li元素判断这一页是否有碎碎,没有则表示结束
        #self.ingCount = 0
        # 碎碎列表
        #self.ingList = []
        # 日志数量
        #self.blogCount = 0
        # 听写数量
        #self.listenCount = 0
        # 听写列表
        #self.listenList = []
        # 口语数量
        #self.talkCount = 0
        # 礼物数量
            for k,v in d.iteritems():
    #            filebyvalues.write('%s %s\n' % (str(k), ' '.join(map(str, v))))
                filebyvalues.write('%s\n' % ' '.join(map(str, v)))
    #            i += 1
    #            if i == 10:
    #                break
        
    
        print 'done.'
    print 'Alle done!'
    
    

    #Normalize files and pick random distinct initial centroids
    files = glob.glob(filespath + '.dat')
    for filename in files:
        data = np.loadtxt(filename)
        zscoredata = Utils.zscore(data)
        postfix = filename[-4:]     
        
        fileout = filename[:-4] + '_zscore' + postfix
        np.savetxt(fileout, zscoredata)
        
        k = 3 #3 features
        centroids = Utils.getInitialMeans(zscoredata, k)
        fileoutcentroids = filename[:-4] + '_zscore.centroids' + postfix
        np.savetxt(fileoutcentroids, centroids)
        print 'zscore saved: %s' % fileout