Example #1
class BingSearch:
    def __init__(self, parent=None):
        self.robot = Hrobot(None)
        self.robot.set_skip_image_loading(True)
        self.page = 3
        self.parent = parent  # type: BaseModule

    def update_progressbar(self, message, percent):
        """
        :param message: message of new state
        :param percent: total percent
        update progressbar value of request
        """
        self.parent.progress = {'state': message, 'percent': percent}

    def search(self, query, pages=None):
        """

        :param query: this parameter show searching term
        :param pages: the number of requested page for result of searching term
        :return:list of searching result in form of entity_property
        """
        final_result = []
        return_result = {}
        if pages:
            self.page = pages
        try:
            # open bing.com and set query value
            try:
                self.robot.go_to('https://www.bing.com')
            except InvalidResponseError:
                raise NetworkError('Unable to find the server at www.bing.com')
            query_field = self.robot.find_by_xpath('//*[@id="sb_form_q"]')
            if query_field:
                query_field.set_value(query)
                search_button = self.robot.find_by_xpath('//*[@id="sb_form_go"]')
                search_button.click()
                # iterate over the requested number of pages, saving one page per iteration
                for i in range(self.page):
                    self.parent.check_point()
                    result = self.robot.find_by_xpath('/html/body/div[1]')
                    res = result.find_all_by_css('li[class="b_algo"]')
                    final_result.extend(self.parse_result(res))
                    pagination = self.robot.find_by_xpath("//a[@title='Next page']")
                    # update the progress bar
                    self.update_progressbar("Pages have been searched: " + str(i + 1),
                                            (100 * (i + 1) / self.page))
                    if pagination:
                        pagination.click()
                    else:
                        # no further pages; stop to avoid re-scraping the same page
                        break

                # no result found
                if len(final_result) == 0:
                    return_result["results"] = [
                        {"data": " ", "properties": [{'title': '', 'type': 0}, {'description': '', 'type': 0}],
                         "type": 1}]
                    return return_result
            return_result["results"] = final_result
            return return_result

        finally:
            self.robot.cleanup()

    @staticmethod
    def parse_result(unstructured_data):
        """

        :param unstructured_data:list of result to parse
        :return:list of parsed result
        """
        i = 0
        final_result = []
        resul = {}
        try:
            # build each result entry in the expected JSON-like format
            for res in unstructured_data:
                properties = []
                resul[i] = {'type': 1}
                # add the result URL under the "data" key
                link = res.find_by_css('h2 a')
                if link and link.get_attr("href"):
                    resul[i]['data'] = link.get_attr("href")
                else:
                    resul[i]['data'] = ""

                # add title and description as type-0 properties
                title = res.find_by_css('h2')
                if title and title.get_text():
                    properties.append({'title': title.get_text(), "type": 0})
                else:
                    properties.append({'title': "", "type": 0})
                description = res.find_by_css('div p')
                if description and description.get_text():
                    properties.append({'description': description.get_text(), "type": 0})
                else:
                    properties.append({'description': "", "type": 0})

                resul[i]["properties"] = properties
                final_result.append(resul[i])
                i = i + 1

            return final_result
        except Exception as e:
            raise InternalModuleError('bad content: ' + str(e))
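A minimal usage sketch for the class above. The DummyParent stub, import context, and query string are hypothetical; only the BingSearch API itself (a parent exposing check_point() and a progress attribute, and search() returning a {"results": [...]} dict) comes from the example.

# Hypothetical caller sketch: DummyParent stands in for BaseModule and is
# not part of the example; only the BingSearch API shown above is assumed.
class DummyParent:
    progress = None

    def check_point(self):
        pass  # a real BaseModule may raise here to cancel the search

searcher = BingSearch(parent=DummyParent())
response = searcher.search("site:python.org asyncio", pages=2)
for entry in response["results"]:
    print(entry["data"])            # result URL ("" when missing)
    for prop in entry["properties"]:
        print("   ", prop)          # {'title': ...} / {'description': ...}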
Example #2
class GoogleSearch(object):
    def __init__(self, parent=None):
        self.robot = Hrobot(None)
        self.robot.set_skip_image_loading(True)
        self.page = 3
        self.parent = parent  # type: BaseModule
        self.progressbar = {'state': 'initializing', 'percent': 0.0}

    # update the progress bar message and percent
    def update_progressbar(self, message, percent):
        """
        Update the progress bar of the current request.

        :param message: message describing the new state
        :param percent: total progress in percent
        """
        self.parent.progress = {'state': message, 'percent': percent}

    def search(self, query, pages=None):
        """
        :param query: the search term
        :param pages: the number of result pages to fetch
        :return: list of search results in entity_property form
        """
        result_list = []
        return_result = {}

        if pages:
            self.page = pages
        try:
            self.parent.check_point()
            # open google.com and set query value
            try:
                self.robot.go_to('https://www.google.com/ncr')
            except InvalidResponseError:
                raise NetworkError(
                    'Unable to find the server at www.google.com')
            query_field = self.robot.find_by_css("input[name='q']")
            if query_field is not None:
                query_field.set_value(query)
                query_field.get_form().submit()
                # iterate over the requested number of pages, saving one page per iteration
                for i in range(self.page):
                    self.parent.check_point()
                    result1 = self.robot.find_by_xpath('//div[@id="search"]')
                    result = result1.find_all_by_css('div[class="g"]')
                    pagination = self.robot.find_by_xpath(
                        "//*[contains(text(), 'Next')]")
                    result_list.extend(self.parse_result(result))
                    # update progressbar value
                    self.update_progressbar(
                        'Pages have been searched: ' + str(i + 1),
                        (100 * (i + 1) / self.page))
                    if pagination is not None:
                        pagination.click()
                    else:
                        break
            if len(result_list) == 0:
                # if either captcha field is present, the search has hit a captcha
                captcha_field1 = self.robot.find_by_xpath(
                    '/html/body/div[1]/form/input[3]')
                captcha_field2 = self.robot.find_by_css(
                    '#ctl00_ContentPlaceHolder1_TextBoxCode')
                if captcha_field1 or captcha_field2:
                    raise CaptchaNeededError(
                        "it is needed to resolve  captcha")
                else:
                    # no result found
                    return_result["results"] = [{
                        "data":
                        " ",
                        "properties": [{
                            'title': '',
                            'type': 0
                        }, {
                            'description': '',
                            'type': 0
                        }],
                        "type":
                        1
                    }]
                    return return_result

            return_result['results'] = result_list
            return return_result
        finally:
            self.robot.cleanup()

    @staticmethod
    def parse_result(unstructured_data):
        """

       :param unstructured_data: list of result to parse
       :return: list of parsed result
       """

        i = 0
        final_result = []
        resul = {}
        try:
            # build each result entry in the expected JSON-like format
            for res in unstructured_data:
                properties = []
                resul[i] = {'type': 1}
                # add the result URL under the "data" key
                link = res.find_by_css('h3 a')
                if link and link.get_attr("href"):
                    resul[i]['data'] = link.get_attr("href").replace("/url?q=", "")
                else:
                    resul[i]['data'] = ''

                # add title and description as type-0 properties
                title = res.find_by_css('h3')
                if title and title.get_text():
                    properties.append({'title': title.get_text(), 'type': 0})
                else:
                    properties.append({'title': '', 'type': 0})
                description = res.find_by_css('span[class="st"]')
                if description and description.get_text():
                    properties.append({'description': description.get_text(), 'type': 0})
                else:
                    properties.append({'description': '', 'type': 0})

                resul[i]['properties'] = properties
                final_result.append(resul[i])
                i = i + 1

            return final_result
        except Exception as e:
            raise InternalModuleError('bad content to parse: ' + str(e))
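A similar sketch for GoogleSearch. The exception names come from the example above; DummyParent is the same hypothetical stub used after Example #1, and the query string is a placeholder.

# Hypothetical caller sketch; CaptchaNeededError and NetworkError are the
# exceptions raised in the example, DummyParent is the stub from Example #1.
try:
    searcher = GoogleSearch(parent=DummyParent())
    response = searcher.search("hrobot headless browser", pages=1)
except CaptchaNeededError:
    print("Google served a captcha; retry later")
except NetworkError as err:
    print("network problem:", err)
else:
    for entry in response["results"]:
        print(entry["data"])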
Example #3
class Hash(object):
    def __init__(self, username, password):
        self.email = username
        self.password = password
        self.current_milli_time = lambda: int(round(time.time() * 1000))
        self.unique_time = self.current_milli_time()
        cookie_path = os.path.dirname(__file__)
        self.robot = Hrobot(cookie_path, "http://cmd5.org")

    def is_logeed_in(self):
        self.unique_time = self.current_milli_time()
        if self.robot.find_by_xpath('//a[@href="exit.aspx"]') is not None:
            self.robot.save_cookies_to_file(self.robot.get_cookies())
            return True
        else:
            self.set_cookie()
            if self.robot.find_by_xpath('//a[@href="exit.aspx"]') is not None:
                self.robot.save_cookies_to_file(self.robot.get_cookies())
                return True
        return False

    def set_cookie(self):
        for cookie in self.robot.load_cookies_from_file():
            self.robot.set_cookie(cookie)
        self.robot.set_timeout(30)
        self.robot.go_to('/')

    def login(self):
        if self.is_logeed_in():
            ApiLogging.info('cookie login')
            return True
        else:
            ApiLogging.info('captcha login')
            self.robot.go_to('/login.aspx')
            email_field = self.robot.find_by_css(
                '#ctl00_ContentPlaceHolder1_TextBoxCmd5_E')
            password_field = self.robot.find_by_css(
                '#ctl00_ContentPlaceHolder1_TextBoxCmd5_P')
            email_field.set_value(self.email)
            password_field.set_value(self.password)
            self.fill_captcha_if_needed()
            submit_button = self.robot.find_by_css(
                "#ctl00_ContentPlaceHolder1_Button1")
            submit_button.click()
            self.robot.save_cookies_to_file(self.robot.get_cookies())
            if self.is_logeed_in():
                ApiLogging.info('logged in')
                return True
        return False

    def decode(self, hash_type, hash_code):
        if self.login():
            hash_field = self.robot.find_by_css(
                '#ctl00_ContentPlaceHolder1_TextBoxInput')
            if hash_field is not None:
                type_field = self.robot.find_by_css(
                    '#ctl00_ContentPlaceHolder1_InputHashType')
                hash_field.set_value(hash_code)
                type_field.set_value(hash_type)
                self.fill_captcha_if_needed()
                submit_button = self.robot.find_by_css(
                    "#ctl00_ContentPlaceHolder1_Button1")
                submit_button.click()
                result = self.robot.find_by_css(
                    '#ctl00_ContentPlaceHolder1_LabelAnswer')
                ApiLogging.info("result in hash: %s" % result.get_text())
                ApiLogging.info('type: ' + str(hash_type) + ' code: ' +
                                str(hash_code))
                chk_result = self.check_result(result)
                if chk_result == VERIFY:
                    return self.decode(hash_type, hash_code)
                elif chk_result == PAYMENT:
                    pr = self.robot.find_by_contain_text('a', 'Purchase')
                    if pr:
                        ApiLogging.info('click payment ' + str(pr.get_text()))
                        pr.click()
                    result = self.robot.find_by_css(
                        '#ctl00_ContentPlaceHolder1_LabelAnswer')
                    chk_result = self.check_result(result)
                    if chk_result is None:
                        return result.get_text()
                elif chk_result == NOT_FOUND:
                    return None
                else:
                    return result.get_text().split('\n')[0]

        else:
            ApiLogging.warning('login fail')

    def check_result(self, result):
        if result.get_text() == 'Verify code error!':
            return VERIFY
        elif 'payment' in result.get_text():
            ApiLogging.info('found payment')
            return PAYMENT
        elif 'Not Found' in result.get_text():
            return NOT_FOUND
        else:
            return None

    def fill_captcha_if_needed(self):
        captcha_field = self.robot.find_by_css(
            '#ctl00_ContentPlaceHolder1_TextBoxCode')
        if captcha_field is not None:
            ApiLogging.warning('captcha needed')
            self.robot.set_viewport_size(1280, 800)
            img = self.robot.find_by_css("#Image1")
            rect = img.get_position()
            box = (int(rect['left']), int(rect['top']), int(rect['right']),
                   int(rect['bottom']))
            filename = tempfile.mktemp('.png')
            self.robot.save_as_png(filename, 1280, 800)
            image = Image.open(filename)
            os.unlink(filename)
            captcha_image = image.crop(box)
            captcha_image.save('%s.png' % self.unique_time, 'png')
            captcha_field.set_value(
                self.resolve_captcha('%s.png' % self.unique_time))
            os.remove('%s.png' % self.unique_time)

    def resolve_captcha(self, file):
        api_key = "2632143214b9b24e9dc7590396f1dd22"
        captcha_object = CaptchaUpload(key=api_key, waittime=3)
        captcha = captcha_object.solve(file)
        ApiLogging.info('solved captcha: ' + str(captcha))
        return captcha

    @staticmethod
    def get_result_by_api(api_key, email, hash_code):
        url = 'https://www.cmd5.org/api.ashx?email=' + email + '&key=' + api_key + '&hash=' + hash_code
        result = Qhttp.get(url)
        if result.status_code == 200:
            if ':' in result.content.decode():
                error_code = result.content.decode().split(':')[-1]
                if error_code == '-1':
                    raise InvalidInputError('invalid input')
                if error_code == '-2':
                    raise InsufficientCredit('insufficient credit')
                if error_code == '-3':
                    raise NetworkError('server failed on cmd5.org')
                if error_code == '-4':
                    raise InvalidInputError('unknown cipher text')
                if error_code == '-7':
                    raise InvalidInputError('hash type not supported')
                if error_code == '-999':
                    raise NetworkError('something went wrong on cmd5.org')
            try:
                return_result = {'results': result.json()}
                return return_result
            except Exception:
                raise ResultNotFoundError('unknown result format')
        else:
            raise NetworkError(result.status_code)
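A usage sketch for the Hash class. The credentials, API key, and hash value are placeholders; the exception names come from get_result_by_api in the example above.

# Hypothetical usage: credentials, API key and hash value are placeholders.
cracker = Hash("user@example.com", "password")
plain = cracker.decode("md5", "5f4dcc3b5aa765d61d8327deb882cf99")
if plain is None:
    print("hash not found on cmd5.org")
else:
    print("recovered plaintext:", plain)

# The static API helper works without a browser session:
try:
    api_result = Hash.get_result_by_api("API_KEY", "user@example.com",
                                        "5f4dcc3b5aa765d61d8327deb882cf99")
    print(api_result["results"])
except InsufficientCredit:
    print("cmd5.org account has no credit left")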