Exemplo n.º 1
0
 def __init__(self):
     """
     Prepare a Chrome session for searching Taobao without logging in.
     """
     # Search page that is reachable without a login; the ``{}``
     # placeholder is left in the template for later formatting
     search_template = "https://s.taobao.com/search?q={}"
     self.index_url = search_template
     # Browser object
     driver = webdriver.Chrome()
     self.browser = driver
     # Explicit-wait helper bound to the browser (timeout argument: 20)
     self.wait = WebDriverWait(driver, 20)
     # Record object
     self.record = Record()
Exemplo n.º 2
0
 def __init__(self, username, password):
     """
     Create the record object and Chrome session and store the credentials.

     :param username: Taobao account name
     :param password: Taobao login password
     """
     # Record object
     self.record = Record()
     # Login page for typing the account and password by hand; per the
     # redirectURL parameter a successful login lands on www.taobao.com
     login_page = "https://login.taobao.com/member/login.jhtml?redirectURL=https%3A%2F%2Fwww.taobao.com%2F"
     self.url = login_page
     # Browser object
     chrome = webdriver.Chrome()
     self.browser = chrome
     # Explicit-wait helper bound to the browser (timeout argument: 20)
     self.wait = WebDriverWait(chrome, 20)
     # Taobao account and login password
     self.username, self.password = username, password
    def process(self):
        """
        Close out a round: attach results to its records and write them out.

        Reads the table and player sections of ``self.log['data']``, pops
        the records stored in ``RecordWarehouse`` under the table id, lets
        the two ``_assign_*`` helpers update them, and writes one feature
        string per record to the configured output path.
        """
        payload = self.log['data']
        table = payload['table']
        players = payload['players']

        # A throwaway record is used only to derive the table id
        end_record = Record()
        end_record.set_table_info(table)
        table_id = end_record.get_table_id()

        history = RecordWarehouse.pop_records(table_id)
        self._assign_money_won(players, history)
        self._assign_action_history(history)

        # One output line per record, in the order the records were popped
        feature_lines = [entry.to_feature_string() for entry in history]
        IOUtil.write_file_line_by_line(Config.get_output_path(),
                                       feature_lines)
Exemplo n.º 4
0
    def build_function_str_pairs(self, ast) -> None:
        """
        Construct a list of tuples of functions: strings for an AST.

        Before adding anything to the master `Record` list of tuples
        the entire AST has to be traversed and any `FuncDef` nodes
        captured. Then, iterating each node and recursively searching
        for constants (strings) results in a list of all strings used
        by that function.

        Every string, regardless of its status as unique, is added to the
        list of tuples. Uniqueness only becomes a factor once all files
        have been parsed and it's time to add things to the final
        dictionary.

        :param ast: top-level AST generated as a result of parse_file
        :return: returns nothing
        """
        function_list = self.locate_functions(ast)
        function_strings = []

        for function_node in function_list:

            # Explicitly clearing the list of strings using clear() is
            # unfortunately the most elegant way to ensure that no
            # strings from the last function_node persist to the second
            # NOTE(review): clear() empties the list object obtained on the
            # previous iteration (or the initial []) just before the name is
            # rebound, so it only matters if locate_func_strings() keeps
            # handing back one shared list -- confirm before removing it.
            function_strings.clear()
            function_strings = self.locate_func_strings(function_node)

            for function_str in function_strings:
                # Each (function name, string) pair is recorded, duplicates
                # included
                Record.add_func_str_to_list(function_node.decl.name,
                                            function_str)

        # While dictionaries are not inherently sortable, they do preserve
        # their insertion order. Thus, keeping the list of tuples pre-sorted
        # ensures that the final dictionary is at least somewhat in order
        Record.sort_tmp_list()

        # Clear all of the nodes and values for the ConstantVisitor
        # and FuncDefVisitor instances
        # NOTE(review): this re-runs __init__ on the live instance as a
        # reset; presumably __init__ takes no required arguments -- confirm.
        self.__init__()
Exemplo n.º 5
0
 def process(self):
     """
     Turn one action log entry into a Record and store it by table id.

     Nothing is stored when the acting player appears in the player list
     with ``isSurvive`` equal to False.
     """
     payload = self.log['data']
     table_info = payload['table']
     action_info = payload['action']
     players = payload['players']

     acting_player = action_info['playerName']

     # Skip actions of a player flagged as not surviving. The explicit
     # ``== False`` comparison is kept as written so that values such as
     # None are not treated as "not surviving".
     acting_player_is_out = any(
         acting_player == entry['playerName'] and entry['isSurvive'] == False
         for entry in players)
     if acting_player_is_out:
         return

     new_record = Record()
     new_record.set_table_info(table_info)
     new_record.set_action_info(action_info)
     new_record.set_player_info(acting_player, players)

     # Push the record into the warehouse under its table id
     RecordWarehouse.append(new_record.get_table_id(), new_record)
Exemplo n.º 6
0
    def process_files(self, files: list) -> None:
        """
        Load and process the AST of every given file, then merge results.

        The ``name`` of each entry in `files` is used to load a fresh AST,
        which is then handed to the AST processor. Once every file has
        been handled, the module-level `Record` is asked to integrate its
        list into its dictionary.

        :param files: list of argparser IO wrappers
        :raises NoFilesSpecifiedError: if `files` is empty (or otherwise
            falsy)
        :return: returns nothing
        """
        # Guard: there is nothing to do without at least one file
        if not files:
            raise NoFilesSpecifiedError()

        for file_wrapper in files:
            tree = self._intr.load_new_ast(file_wrapper.name)
            self._astp.process_ast(tree)

        # Integrating once per run, instead of after every file, avoids
        # repeating the list-to-dict merge for each input
        Record.integrate_list_to_dict()
Exemplo n.º 7
0
class TaoBaoStartRequest(object):
    """
    Simulate a Taobao login with selenium + chrome and scrape search results.

    Strategy noted by the original author: when the current account is
    detected, switch to another account and log in again.
    """

    def __init__(self, username, password):
        """
        Open a Chrome session and remember the credentials to log in with.

        :param username: Taobao account name
        :param password: Taobao login password
        """
        # Record object; used below to store products, breakpoints and
        # keywords without results
        self.record = Record()
        # Login page for typing the account and password by hand; per the
        # redirectURL parameter a successful login lands on www.taobao.com
        self.url = "https://login.taobao.com/member/login.jhtml?redirectURL=https%3A%2F%2Fwww.taobao.com%2F"
        # Browser object
        self.browser = webdriver.Chrome()
        # Shared explicit wait for the browser (timeout argument: 20)
        self.wait = WebDriverWait(self.browser, 20)
        # Taobao account
        self.username = username
        # Taobao login password
        self.password = password

    def login(self):
        """
        Load the login page, fill in the credentials and submit the form.

        :return: None
        """
        # Drop all cookies, then request the login page
        self.browser.delete_all_cookies()
        self.browser.get(self.url)
        # Switch to the manual account/password form. ``until`` either
        # returns the element or raises TimeoutException, so the result
        # can be clicked without a truthiness check.
        self.wait.until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, 'login-switch'))).click()
        time.sleep(1)
        # Account name input
        username = self.wait.until(
            EC.presence_of_element_located((By.ID, 'TPL_username_1')))
        # Password input
        password = self.wait.until(
            EC.presence_of_element_located((By.ID, 'TPL_password_1')))
        # Login button
        submit = self.wait.until(
            EC.element_to_be_clickable((By.ID, 'J_SubmitStatic')))
        # Fill in the form
        username.send_keys(self.username)
        password.send_keys(self.password)
        time.sleep(1)
        # Submit the form
        submit.click()

    def password_error(self):
        """
        Tell whether the page reports a wrong account/password combination.

        :return: truthy when the mismatch message shows up in the element
            with class ``error`` within the 5 second wait, otherwise False
        """
        try:
            # The message is the element's visible text, so the text
            # condition is used; the ``..._value`` variant inspects the
            # ``value`` attribute instead and cannot match a message node.
            return WebDriverWait(self.browser, 5).until(
                EC.text_to_be_present_in_element(
                    (By.CLASS_NAME, 'error'), '你输入的密码和账户名不匹配'))
        except TimeoutException:
            return False

    def login_successfully(self):
        """
        Tell whether the login succeeded.

        :return: True when the nickname element is found within the
            5 second wait, otherwise False
        """
        try:
            return bool(
                WebDriverWait(self.browser, 5).until(
                    EC.presence_of_element_located(
                        (By.CLASS_NAME, 'site-nav-login-info-nick '))))
        except TimeoutException:
            return False

    def get_input(self, keyword):
        """
        Type the keyword into the search box, search, and scrape all pages.

        Does nothing when the login is not detected as successful.

        :param keyword: search keyword
        :return: None
        """
        if not self.login_successfully():
            return
        # Search box
        search_key = self.wait.until(
            EC.presence_of_element_located((By.ID, 'q')))
        # Empty the box first, then type the keyword
        search_key.clear()
        search_key.send_keys(keyword)
        time.sleep(1)
        # The search button could not be located reliably (it kept raising
        # errors), so the form is submitted with the ENTER key instead
        search_key.send_keys(Keys.ENTER)
        # is_result() is True when the tips block is present, which this
        # code treats as "no search result"
        if self.is_result():
            # No result: write the keyword to the record file
            print("关键字[%s]无搜索结果,写入文件" % keyword)
            self.record.no_search_result(keyword)
            return
        self._collect_pages(keyword)

    def _collect_pages(self, keyword):
        """
        Visit every result page of the current search and scrape each one.

        :param keyword: keyword being searched; used for progress output
            and breakpoint records
        :return: None
        """
        # Total page count: the part after "/" in the pager text
        # (find_element_by_xpath() would work here as well)
        total = self.wait.until(
            EC.presence_of_element_located((
                By.XPATH,
                '//div[@id="J_relative"]/div/div/div[@class="pager"]/ul/li[2]'
            ))).text.split("/")[1]
        for page in range(1, int(total) + 1):
            try:
                if page > 1:
                    self.skip_page(page)
                print("正在采集[%s]关键字的[%s]页" % (keyword, page))
                self._collect_current_page()
            except TimeoutException:
                # Page jump or load failed: record keyword and page number
                self.record.record_breakpoint(keyword, page)
            except InvalidElementStateException:
                # Refresh the page and retry once
                self.browser.refresh()
                try:
                    self._collect_current_page()
                except TimeoutException:
                    # A timeout during the retry must not abort the whole
                    # crawl; record it like any other failed page
                    self.record.record_breakpoint(keyword, page)

    def _collect_current_page(self):
        """Wait until product items are present, then scrape the page."""
        self.wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.m-itemlist .items .item')))
        self.get_products()

    def skip_page(self, num):
        """
        Jump to the given result page through the pager form.

        :param num: page number to jump to
        :return: None
        """
        # Page-number input of the pager
        input_ = self.wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager div.form> input')))
        # Submit button of the pager
        submit = self.wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager div.form> span.btn.J_Submit')))
        # Empty the input, type the target page and submit
        input_.clear()
        input_.send_keys(num)
        submit.click()
        # The jump counts as done once the active pager item shows the number
        self.wait.until(
            EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'),
                str(num)))

    def is_result(self):
        """
        Tell whether the search-tips block is present on the page.

        Despite the name, callers treat True as "the search returned
        nothing" (first case noted by the original author: the search
        directly has no result). The exception caught must be
        selenium.common.exceptions.TimeoutException; catching the builtin
        TimeoutError would still let the error propagate.

        :return: True when the tips list is found before the wait times
            out, otherwise False
        """
        try:
            self.wait.until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@id="mainsrp-tips"]//ul')))
            return True
        except TimeoutException:
            return False

    def get_products(self):
        """
        Extract the product data of the current page and record each item.

        :return: None
        """
        # Source of the loaded page
        html = self.browser.page_source
        # Parse the page with pyquery
        doc = pq(html)
        # Items that hold the product information
        items = doc('#mainsrp-itemlist .items .item').items()
        for item in items:
            product = {
                'image': item.find('.pic .img').attr('data-src'),
                'price': item.find('.price').text(),
                'deal': item.find('.deal-cnt').text(),
                'title': item.find('.title').text(),
                'product_detail':
                item.find('.title .J_ClickStat').attr('href'),
                'shop': item.find('.shop').text(),
                'shop_detail': item.find('.shop .shopname').attr('href'),
                'location': item.find('.location').text()
            }
            print("查看商品信息:", product)
            # Write the data to the record file
            self.record.record_result(product)
Exemplo n.º 8
0
    'userName': '******',
    'password': '******'
}

# Amount of data processed each time
everyCount = 10000

# Path of the file that stores the history of processed data
recordPath = 'migrate.db'

tableName = 'media_collection'

if __name__ == '__main__':
    mongDbPro = MongoClient(mongoConnPro)
    mongoDb = MongoClient(mongoConn)
    record = Record(recordPath)

    print("开始迁移表:%s" % tableName)
    total = mongoDb.getCount(tableName)
    print("%s数据总量:%d" % (tableName, total))

    # Counters
    # NOTE(review): `sum` shadows the builtin of the same name
    startIndex = 0
    sum = 0

    # Check the record file for an earlier run
    if record.findKeyExist('migrate'):
        # A record exists: fetch the amount of data already handled
        handleNum = record.getHandledNum('migrate')
        if handleNum != total:
            # Continue from the stored count instead of from zero
            startIndex = handleNum
Exemplo n.º 9
0
class TaoBaoStartSearch(object):
    """
    Try to search Taobao by keyword directly, without logging in.
    """

    def __init__(self):
        """Open a Chrome session and create the record object."""
        # Taobao search page that is reachable without a login; ``{}`` is
        # replaced by the keyword
        self.index_url = "https://s.taobao.com/search?q={}"
        # Browser object
        self.browser = webdriver.Chrome()
        # Shared explicit wait for the browser (timeout argument: 20)
        self.wait = WebDriverWait(self.browser, 20)
        # Record object; used below to store products, breakpoints and
        # keywords without results
        self.record = Record()

    def index_page(self, keyword):
        """
        Search for the keyword without logging in and scrape all pages.

        :param keyword: search keyword
        :return: None
        """
        from urllib.parse import quote

        print("正在抓取的关键字:", keyword)
        # Percent-encode the keyword so that characters such as "&", "#",
        # "+" or "%" stay part of the query value instead of changing the
        # structure of the URL
        self.browser.get(self.index_url.format(quote(str(keyword), safe='')))
        # is_result() is True when the tips block is present, which this
        # code treats as "no search result"
        if self.is_result():
            # No result: write the keyword to the record file
            print("关键字[%s]无搜索结果,写入文件" % keyword)
            self.record.no_search_result(keyword)
            return
        self._collect_pages(keyword)

    def _collect_pages(self, keyword):
        """
        Visit every result page of the current search and scrape each one.

        :param keyword: keyword being searched; used for progress output
            and breakpoint records
        :return: None
        """
        # Total page count: the part after "/" in the pager text
        # (find_element_by_xpath() would work here as well)
        total = self.wait.until(
            EC.presence_of_element_located((
                By.XPATH,
                '//div[@id="J_relative"]/div/div/div[@class="pager"]/ul/li[2]'
            ))).text.split("/")[1]
        for page in range(1, int(total) + 1):
            try:
                if page > 1:
                    self.skip_page(page)
                print("正在采集[%s]关键字的[%s]页" % (keyword, page))
                self._collect_current_page()
            except TimeoutException:
                # Page jump or load failed: record keyword and page number
                self.record.record_breakpoint(keyword, page)
            except InvalidElementStateException:
                # Refresh the page and retry once
                self.browser.refresh()
                try:
                    self._collect_current_page()
                except TimeoutException:
                    # A timeout during the retry must not abort the whole
                    # crawl; record it like any other failed page
                    self.record.record_breakpoint(keyword, page)

    def _collect_current_page(self):
        """Wait until product items are present, then scrape the page."""
        self.wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.m-itemlist .items .item')))
        self.get_products()

    def skip_page(self, num):
        """
        Jump to the given result page through the pager form.

        :param num: page number to jump to
        :return: None
        """
        # Page-number input of the pager
        input_ = self.wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager div.form> input')))
        # Submit button of the pager
        submit = self.wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager div.form> span.btn.J_Submit')))
        # Empty the input, type the target page and submit
        input_.clear()
        input_.send_keys(num)
        submit.click()
        # The jump counts as done once the active pager item shows the number
        self.wait.until(
            EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'),
                str(num)))

    def is_result(self):
        """
        Tell whether the search-tips block is present on the page.

        Despite the name, callers treat True as "the search returned
        nothing" (first case noted by the original author: the search
        directly has no result). The exception caught must be
        selenium.common.exceptions.TimeoutException; catching the builtin
        TimeoutError would still let the error propagate.

        :return: True when the tips list is found before the wait times
            out, otherwise False
        """
        try:
            self.wait.until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@id="mainsrp-tips"]//ul')))
            return True
        except TimeoutException:
            return False

    def get_products(self):
        """
        Extract the product data of the current page and record each item.

        :return: None
        """
        # Source of the loaded page
        html = self.browser.page_source
        # Parse the page with pyquery
        doc = pq(html)
        # Items that hold the product information
        items = doc('#mainsrp-itemlist .items .item').items()
        for item in items:
            product = {
                'image': item.find('.pic .img').attr('data-src'),
                'price': item.find('.price').text(),
                'deal': item.find('.deal-cnt').text(),
                'title': item.find('.title').text(),
                'product_detail':
                item.find('.title .J_ClickStat').attr('href'),
                'shop': item.find('.shop').text(),
                'shop_detail': item.find('.shop .shopname').attr('href'),
                'location': item.find('.location').text()
            }
            print("查看商品信息:", product)
            # Write the data to the record file
            self.record.record_result(product)
Exemplo n.º 10
0
from record.record import Record

if __name__ == '__main__':
    # Manual smoke run against a local record file: for each count, show
    # whether the key exists, store the count, then read it back
    r = Record('test.db')
    key = 'tb_vehicle_status_his_20_05_16'

    for handled in (10, 20):
        print(r.findKeyExist(key))
        r.updateHandledNum(key, handled)
        print(r.getHandledNum(key))

Exemplo n.º 11
0
#     'port': 27017,
#     'db': 'gpshisdata',
#     'authDb': 'admin',
#     'userName': '******',
#     'password': '******'
# }

# Amount of data processed each time
everyCount = 10000

# Path of the file that stores the history of processed data
recordPath = 'update.db'

if __name__ == '__main__':
    mongoDb = MongoClient(mongoConn)
    record = Record(recordPath)

    name_list = []
    # 后期记录表中的记录
    if record.findKeyExist("record"):
        # 存在该记录, 取出处理的数据量
        name_list.extend(record.getHandledNum("record"))

    collection_names = mongoDb.getDocument("tb_vehicle_status_his")
    print("all collections: {}".format(collection_names))

    last_list = []

    for name in collection_names:
        if name in collection_names:
            if name.startswith('tb_vehicle_status_his_20_06_'
Exemplo n.º 12
0
    'db': 'gpshisdata',
    'authDb': 'gpshisdata',
    'userName': '******',
    'password': '******'
}

# Amount of data processed each time
everyCount = 10000

# Path of the file that stores the history of processed data
recordPath = 'record.db'

if __name__ == '__main__':
    sqlDb = MysqlClient(mysqlConn)
    mongoDb = MongoClient(mongoConn)
    record = Record(recordPath)

    # 获取tableName
    tableNameList = sqlDb.getAllTableName('tb_vehicle_status_his_')
    for tableName in tableNameList:
        print("开始迁移表:%s" % tableName)
        total = sqlDb.getAllRowCount(tableName)
        print("%s数据总量:%d" % (tableName, total))

        # 计数
        startIndex = 0
        sum = 0

        # 后期记录表中的记录
        if record.findKeyExist(tableName.replace('_', '')):
            # 存在该记录, 取出处理的数据量