Example No. 1
def getallcheatinfo(page_request):
    cd_strain = bs4.SoupStrainer(class_="arcode-header")
    cheat_strain = bs4.SoupStrainer(class_="arcode-float")
    cd_soup = bs4.BeautifulSoup(page_request.data,
                                'lxml',
                                parse_only=cd_strain).find_all('a')
    cheat_soup = bs4.BeautifulSoup(page_request.data,
                                   'lxml',
                                   parse_only=cheat_strain)

    # Creates | delimited string containing all cheat codes
    cheatcodes = str(cheat_soup) \
        .replace("<!DOCTYPE HTML>", "") \
        .replace("</div>", "") \
        .replace('<div class="arcode-float">', "") \
        .replace("<textarea readonly=\"\">", "") \
        .replace("</textarea>", "|") \
        .replace("\r", "") \
        .replace("\n", " ") \
        .rstrip("|") \
        .strip()

    i = 0
    q = ""
    y = 0
    while i < len(cd_soup):
        p = re.search("(?<=>)(.*)(?=<)", str(cd_soup[i]))
        q += str(p.group()) + "^" + cheatcodes.split("|")[y] + "|"
        i += 2
        y += 1
    allcheatinfo = (q.rstrip("|"))
    return allcheatinfo
Example No. 2
    def boosted_creature_from_header(cls, content):
        """Get the boosted creature from any Tibia.com page.

        Parameters
        ----------
        content: :class:`str`
            The HTML content of a Tibia.com page.

        Returns
        -------
        :class:`CreatureEntry`
            The boosted creature of the day.

        Raises
        ------
        InvalidContent
            If content is not the HTML of a Tibia.com page.
        """
        try:
            parsed_content = bs4.BeautifulSoup(content.replace('ISO-8859-1', 'utf-8'), "lxml",
                                               parse_only=bs4.SoupStrainer("div", attrs={"id": "RightArtwork"}))
            img = parsed_content.find("img", attrs={"id": "Monster"})
            name = img["title"].replace(BOOSTED_ALT, "").strip()
            image_url = img["src"]
            identifier = image_url.split("/")[-1].replace(".gif", "")
            return CreatureEntry(name, identifier)
        except TypeError as e:
            raise InvalidContent("content is not from Tibia.com", e)
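A minimal usage sketch for the classmethod above. The owning class name (BoostedCreatures), the fetched URL, and the .name attribute on the returned entry are assumptions for illustration; only boosted_creature_from_header itself comes from the example.

import requests

html = requests.get("https://www.tibia.com/news/?subtopic=latestnews").text
try:
    # BoostedCreatures is a hypothetical owner class for the classmethod above
    creature = BoostedCreatures.boosted_creature_from_header(html)
    print(creature.name)  # assumed attribute of the returned CreatureEntry
except InvalidContent:
    print("The response was not a Tibia.com page")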
Example No. 3
    async def get_max_page(self):
        async with aiohttp.ClientSession() as session:
            task = asyncio.create_task(fetch(session, self.url))
            page = await asyncio.gather(task)

        for one in tqdm(page,
                        desc="Fetching max page",
                        bar_format=self.printstring):
            strainer = bs4.SoupStrainer("div",
                                        {'class': ['jix_pagination_pages']})
            soup = bs4.BeautifulSoup(one.decode('utf-8'),
                                     "lxml",
                                     parse_only=strainer)
            pages = soup.find("div", {'class': ['jix_pagination_pages']})

            hrefs = []

            for element in pages:
                if 'href' in str(element):
                    try:
                        hrefs.append(int(element.contents[0]))
                    except Exception:
                        # Skip pagination elements that are not page numbers
                        pass

            self.max_page = max(hrefs) + 1
Example No. 4
async def update_menus(day):
    html = await _fetch.fetch_data_get_text(_menu_url)
    if html == -1:
        return -1
    html = bs4.BeautifulSoup(html,
                             "html.parser",
                             parse_only=bs4.SoupStrainer(id="menu-repas"))
    for menu in html.find_all("h3"):
        embed = discord.Embed()
        embed.type = "rich"
        embed.title = "Resto’ U de l’Illberg"
        embed.url = _menu_url
        embed.colour = random.randint(0, 16581375)
        embed.description = menu.text.strip()
        menu = menu.next_element.next_element.next_element
        menu = menu.find("h4", string=re.compile(
            "Déjeuner")).next_element.next_element.next_element
        for group in menu.find_all('span'):
            name = group.text.strip()
            value = ""
            group = group.next_element.next_element
            try:
                for field in group.find_all('li'):
                    value += field.text.strip() + "\n"
            except AttributeError:
                pass
            if value == "":
                value = r"¯\_(ツ)_/¯"
            embed.add_field(name=name, value=value, inline=False)
        _menus[embed.description] = embed
    try:
        return _menus[day]
    except KeyError:
        return -1
Example No. 5
def find_form_request(html: str):
    soup = bs4.BeautifulSoup(html,
                             "html.parser",
                             parse_only=bs4.SoupStrainer("form"))

    form = soup.form
    if not form:
        raise _exception.ParseError("Could not find form to submit", data=html)

    url = form.get("action")
    if not url:
        raise _exception.ParseError("Could not find url to submit to",
                                    data=form)

    # From what I've seen, it'll always do this!
    if url.startswith("/"):
        url = "https://www.facebook.com" + url

    # It's okay to set missing values to something crap, the values are localized, and
    # hence are not available in the raw HTML
    data = {
        x["name"]: x.get("value", "[missing]")
        for x in form.find_all(["input", "button"])
    }
    return url, data
Example No. 6
def _data_from_xml(
        fpath_xml: str) -> Dict[str, Union[str, Dict[str, np.ndarray]]]:
    ecg_data = dict()

    # define tags that we want to find and use SoupStrainer to speed up search
    tags = [
        "patientdemographics",
        "testdemographics",
        "order",
        "restingecgmeasurements",
        "originalrestingecgmeasurements",
        "diagnosis",
        "originaldiagnosis",
        "intervalmeasurementtimeresolution",
        "intervalmeasurementamplituderesolution",
        "intervalmeasurementfilter",
        "waveform",
    ]
    strainer = bs4.SoupStrainer(tags)

    # lxml parser makes all tags lower case
    with open(fpath_xml, "r") as f:
        soup = bs4.BeautifulSoup(f, "lxml", parse_only=strainer)

    for tag in tags:
        tag_suffix = ""
        if tag == "restingecgmeasurements":
            tag_suffix = "_md"
        elif tag == "originalrestingecgmeasurements":
            tag_suffix = "_pc"
        elif tag == "diagnosis":
            soup_tag = soup.find(tag)
            if soup_tag is not None:
                ecg_data["diagnosis_md"] = _parse_soup_diagnosis(soup_tag)
            continue
        elif tag == "originaldiagnosis":
            soup_tag = soup.find(tag)
            if soup_tag is not None:
                ecg_data["diagnosis_pc"] = _parse_soup_diagnosis(soup_tag)
            continue
        elif tag == "waveform":
            voltage_data = _get_voltage_from_waveform_tags(soup.find_all(tag))
            ecg_data.update(voltage_data)
            continue

        soup_tag = soup.find(tag)

        if soup_tag is not None:
            # find sub tags
            soup_sub_tags = soup_tag.find_all()

            # if there are no sub tags, use original tag
            if len(soup_sub_tags) == 0:
                soup_sub_tags = [soup_tag]

            ecg_data.update(
                {st.name + tag_suffix: st.text
                 for st in soup_sub_tags})

    return ecg_data
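A usage sketch for _data_from_xml. The XML path is an illustrative assumption; per the type annotation, the returned dict holds string fields plus the waveform data extracted by _get_voltage_from_waveform_tags.

# Hedged usage sketch: the file path below is made up.
ecg = _data_from_xml("records/example_resting_ecg.xml")
for key in sorted(ecg):
    print(key)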
Example No. 7
    def from_content(cls, content):
        """
        Gets the boosted creature from any Tibia.com page.

        Parameters
        ----------
        content: :class:`str`
            The HTML content of a Tibia.com page.

        Returns
        -------
        :class:`News`
            The boosted article shown.

        Raises
        ------
        InvalidContent
            If content is not the HTML of a Tibia.com page.
        """
        try:
            parsed_content = bs4.BeautifulSoup(content.replace('ISO-8859-1', 'utf-8'), "lxml",
                                               parse_only=bs4.SoupStrainer("div", attrs={"id": "RightArtwork"}))
            img = parsed_content.find("img", attrs={"id": "Monster"})
            name = img["title"].replace(BOOSTED_ALT, "").strip()
            image_url = img["src"]
            return cls(name, image_url)
        except TypeError:
            raise InvalidContent("content is not from Tibia.com")
Example No. 8
def get_error_data(html: str) -> Optional[str]:
    """Get error message from a request."""
    soup = bs4.BeautifulSoup(
        html, "html.parser", parse_only=bs4.SoupStrainer("form", id="login_form")
    )
    # Attempt to extract and format the error string
    return " ".join(list(soup.stripped_strings)[1:3]) or None
Example No. 9
def submitForms(url, session, value):
    html = session.get(url).text
    soup = bs4.BeautifulSoup(html,
                             "html.parser",
                             parse_only=bs4.SoupStrainer("form"))
    responses = list()
    for form in soup:
        if form.get('method') is None:
            method = "GET"
        else:
            method = form['method'].upper()

        if form.get('action') is None:
            submiturl = url
        else:
            submiturl = url[0:url.rfind("/")] + "/" + form['action']
        parameters = setFormParams(form, value)
        start = time.time()
        if method == "POST":
            response = session.post(url=submiturl, data=parameters)
        else:
            response = session.request(method=method,
                                       url=submiturl,
                                       params=parameters)
        end = time.time()
        responses.append([response, end - start, parameters])
    return responses
Example No. 10
    def __init__(self, address, ssl=True, testString=None):

        self.address = address.strip()
        if "http" not in self.address:
            if ssl:
                self.address = "https://%s" % (self.address)
            else:
                self.address = "http://%s" % (self.address)
        self.http = httplib2.Http()
        self.testString = testString
        self.testStatus = None
        try:
            """
            If self.status and self.response, go ahead and get the "body" part
                of the page and do runCheck
            """
            self.status, self.response = self.http.request(self.address,
                                                           redirections=0)
            self.page = bs4.BeautifulSoup(self.response,
                                          parse_only=bs4.SoupStrainer("body"),
                                          features="html.parser").text
        except TimeoutError:
            """
            If there's a timeout error, indicate that in the response and
                testStatus
            We want to do this instead of just setting self.testStatus to False
                because we can't even run the test to see if the string is
                on the page
            """
            self.status, self.response, self.page, self.testStatus = False, "Timeout Error", False, "Timeout Error"

        self.runCheck()
Example No. 11
def rand_page(user_id):
    link = 'https://vk.com/id'
    url = str(random.randint(1, 1000000000))
    not_created = []

    only_content = bs4.SoupStrainer("div", id='content')
    r = requests.get(link+url, stream=True, headers={'User-Agent': UserAgent().chrome})
    soup = bs4.BeautifulSoup(r.text, "lxml", parse_only=only_content)

    blocked = soup.find_all('h5', class_="profile_blocked")
    not_created = soup.find_all('div', class_='message_page page_block')
    if blocked != []:
        bot.send_message(user_id,  '*User is blocked :(*\n' + link+url, parse_mode='Markdown', reply_markup=main_menu)
        return
    if not_created != []:
        rand_page(user_id)
        return

    try:
        name = soup.find('h1', class_='page_name').text
        bot.send_message(user_id, '*User alive!*\nName: ' + str(name) + '\n' + link+url, parse_mode='Markdown', reply_markup=callback)
        print(str(user_id), 'Page found!')
        return
    except AttributeError:
        print('Error: page name not found')
Example No. 12
def get_soup(school_id, page_id):
    """Requests a page from CollegeData.com corresponding to the provided
    school_id and page_id and converts the response to a BeautifulSoup object
    """

    # Build URL
    url = url_pt1 + str(page_id) + url_pt2 + str(school_id)

    # Request the url and raise exception if something strange returned.
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        msg = url + ' gave status code ' + str(response.status_code)
        logging.warning(msg)
        raise IOError

    # Limit HTML parsing to only <h1> tags or the tag <div id='tabcontwrap'>.
    strainer = bs4.SoupStrainer(
        lambda name, attrs: name == 'h1' or attrs.get('id') == 'tabcontwrap')

    # Parse response text into a BeautifulSoup object.
    soup = bs4.BeautifulSoup(markup=response.text,
                             features="lxml",
                             parse_only=strainer)

    # Raise an error if the <h1> tag contained the empty page string.
    # It's not really an error, as it's expected that many school_id will
    # not correspond to a page with actual school information, but this will
    # allow the scraper to skip any further attempts at more pages for this
    # school_id, saving time.
    if soup.h1.string == empty_h1_string:
        msg = 'School ID ' + str(school_id) + ' has no info.'
        logging.info(msg)
        raise LookupError

    return soup
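A hedged calling sketch for get_soup. The module-level names it relies on (url_pt1, url_pt2, headers, empty_h1_string) are defined elsewhere, and the IDs below are made up.

try:
    soup = get_soup(school_id=59, page_id=1)
except (IOError, LookupError):
    # bad response, or a school_id with no information page
    soup = None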
Example No. 13
def parse_tibiacom_content(content,
                           *,
                           html_class="BoxContent",
                           tag="div",
                           builder="lxml"):
    """Parses HTML content from Tibia.com into a BeautifulSoup object.

    Parameters
    ----------
    content: :class:`str`
        The raw HTML content from Tibia.com.
    html_class: :class:`str`
        The HTML class of the parsed element. The default value is ``BoxContent``.
    tag: :class:`str`
        The HTML tag to select. The default value is ``div``.
    builder: :class:`str`
        The builder to use. The default value is ``lxml``.

    Returns
    -------
    :class:`bs4.BeautifulSoup`, optional
        The parsed content.
    """
    strainer = bs4.SoupStrainer(
        tag, class_=html_class) if builder != "html5lib" else None
    return bs4.BeautifulSoup(content.replace('ISO-8859-1', 'utf-8'),
                             builder,
                             parse_only=strainer)
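A minimal usage sketch, assuming requests for the download; the URL is illustrative and the call relies on the documented defaults (div tag, BoxContent class, lxml builder).

import requests

raw = requests.get("https://www.tibia.com/community/?subtopic=worlds").text
parsed = parse_tibiacom_content(raw)
print(parsed.get_text()[:200])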
Example No. 14
def dlp(p):
    url = "https://movie.douban.com/celebrity/" + str(
        celebrity_id) + "/photos/?type=C&start=" + str(
            p * 30) + "&sortby=like&size=a&subtype=a"
    req = requests.get(url)
    if req.status_code != 200:
        print("NETWORK ERROR")
        return 1
    content = req.text
    covers = bs4.BeautifulSoup(
        content, "html.parser",
        parse_only=bs4.SoupStrainer(class_="cover")).find_all("a")
    img_id = [cover["href"].split("/")[-2] for cover in covers]
    if len(img_id) == 0:
        print("No image in page #%s, please check!" % (str(p)))
        return 1
    img_link = [
        "https://img3.doubanio.com/view/photo/raw/public/p" + str(id) + ".jpg"
        for id in img_id
    ]
    print(
        "========================= START Page %d: =========================" %
        p)
    for im, l in zip(img_id, img_link):
        img_dl(l, im)
    print(
        "========================= FINISH Page %d: =========================" %
        p)
Example No. 15
    def run_atc(self):
        self.log.info('Starting ATC')
        start = get_time()
        # get product page
        item_page = self.session.get(self.url)
        while not item_page or item_page.status_code != 200:
            self.log.info("Failed to get item page " + str(item_page.status_code))
            item_page = self.session.get(self.url)

        self.log.info("Retrieved product page - %dms" % (get_time()-start))

        start = get_time()
        strain = bs4.SoupStrainer(id='product_addtocart_form')
        item_parse = bs4.BeautifulSoup(item_page.content, "lxml", parse_only=strain)
        form = item_parse.find('form')
        select = form.find('select')

        self.log.info("Page parsed - %dms" % (get_time() - start))

        start = get_time()
        # find add url
        action = form['action'].replace("checkout/cart", "ajax/index")
        # form payload
        payload = {'qty': '1', 'isAjax': '1'}
        for item in form.find_all('input'):
            payload[item['name']] = item['value']
        opts = form.find(id='options_502').contents

        if hasattr(self, 'size'):
            size_id = sizes[self.size]
            size = self.size
            for item in form.find(id='options_502').contents:
                if type(item) == element.Tag and item['data-simplesku'].split('-', 1)[-1] == self.size:
                    size = item['data-simplesku'].split('-')[-1]
                    size_id = item['id'].split('_')[-1]
                    break
        else:
            rand = random.choice(opts[:-2])
            size = rand['data-simplesku'].split('-', 1)[-1]
            size_id = rand['id'].split('_')[-1]

        payload[select['name']] = size_id
        self.log.info('Selected size %s' % size)
        print("POST request created - {}ms {}".format((get_time() - start), str(payload)))

        # stdin.readline()

        start = get_time()
        start_atc = get_time()
        atc_resp = self.session.post(action, data=payload)
        while atc_resp.status_code != 200 or json.loads(atc_resp.content)['status'] != 'SUCCESS':
            self.log.info('POST atc failed - {} - {}'.format(atc_resp.status_code, json.loads(atc_resp.content)['status']))
            time.sleep(1)
            start = get_time()
            atc_resp = self.session.post(action, data=payload)

        print("Added - %dms" % (get_time() - start_atc))
        self.queue.put(self.session.cookies['frontend'])
        self.log.info('Added cookie to queue')
Example No. 16
def getpagecount(page_request):
    page_strain = bs4.SoupStrainer(
        onchange="$(this).closest('form').submit();")
    page_soup = bs4.BeautifulSoup(page_request.data,
                                  'lxml',
                                  parse_only=page_strain).find_all("option")
    pagecount = int(len(page_soup) - 1)
    return pagecount
Example No. 17
    def scrape_all_pages(self):
        for job_page in tqdm(self.html_pages,
                             desc="Scraping:",
                             bar_format=self.printstring):
            strainer = bs4.SoupStrainer("div", {'class': ['PaidJob']})
            soup = bs4.BeautifulSoup(job_page.decode('utf-8'),
                                     'lxml',
                                     parse_only=strainer)
            self.scrape_page(soup)
Example No. 18
def file_parse(filepath):
    link_list = []
    with open(filepath, 'r') as file_object:
        for link in bs4.BeautifulSoup(file_object.read(),
                                      "html.parser",
                                      parse_only=bs4.SoupStrainer('a')):
            if link.has_attr('href'):
                link_list.append(link['href'])
    return link_list
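A short usage sketch; the file name is an assumption.

for href in file_parse("saved_page.html"):
    print(href)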
Example No. 19
def scrape_items():
    from src.gather_files import populate_cache

    populate_cache()
    strainer = bs4.SoupStrainer("table")
    items_html = bs4.BeautifulSoup(ITEMS_LIST.read_text(), "lxml", parse_only=strainer)

    items = items_html.select("tbody tr")

    return [parse_item(item) for item in items]
Example No. 20
async def fetch_tibia_bosses_world(world: str):
    url = f"https://www.tibiabosses.com/{world}/"

    try:
        bosses = CACHE_BOSSES[world]
        return bosses
    except KeyError:
        bosses = defaultdict(list)

    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                content = await resp.text()
    except (aiohttp.ClientError, asyncio.TimeoutError):
        raise errors.NetworkError(f"get_world_bosses({world})")

    try:
        parsed_content = bs4.BeautifulSoup(content,
                                           "lxml",
                                           parse_only=bs4.SoupStrainer(
                                               "div", class_="panel-layout"))
        _sections = parsed_content.find_all('div', class_="widget_execphp")
        for section in _sections:
            heading = section.find('h3')
            if heading is None:
                continue
            title = heading.text
            section_content = section.find('div', class_="execphpwidget")
            m = boss_pattern.findall(str(section_content))
            if m:
                for (chance, link, image, expect_last, days) in m:
                    name = link.split("/")[-1].replace("-", " ").lower()
                    bosses[title].append(
                        dict(name=name,
                             chance=chance.strip(),
                             url=link,
                             image=image,
                             type=expect_last,
                             days=int(days)))
            else:
                # This regex is for bosses without prediction
                m = unpredicted_pattern.findall(str(section_content))
                for (link, image, expect_last, days) in m:
                    name = link.split("/")[-1].replace("-", " ").lower()
                    bosses[title].append(
                        dict(name=name,
                             chance="Unpredicted",
                             url=link,
                             image=image,
                             type=expect_last,
                             days=int(days)))
    except Exception:
        # If the page layout changes, fall through with whatever was collected.
        pass
    CACHE_BOSSES[world] = bosses
    return bosses
Example No. 21
def init_bs(html, head_only=False):
    features = "html5lib"
    parse_only = None

    if LXML_AVAILABLE:
        features = "lxml"

        if head_only:
            parse_only = bs4.SoupStrainer("head")

    return bs4.BeautifulSoup(html, features=features, parse_only=parse_only)
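A usage sketch, assuming the module defines LXML_AVAILABLE elsewhere (for example by attempting to import lxml at load time).

soup = init_bs("<html><head><title>Demo</title></head><body></body></html>",
               head_only=True)
print(soup.title)  # with the "head" strainer, only head content is parsed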
Example No. 22
def scrape_secret():
    """
    hit the website and scrape the first page
    """
    url = "https://jobs.secrettelaviv.com/"
    req = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    page = req.text

    # jobs are in spans
    parse_only = bs4.SoupStrainer('span')
    return bs4.BeautifulSoup(page, "lxml", parse_only=parse_only)
Example No. 23
def add_sentiment():
    url = request.args.get('a', 0, type=str)
    url = 'https://www.' + re.sub(r'https://|www\.', "", url)
    source = urllib.request.urlopen(url)
    soup = bs.BeautifulSoup(source,
                            'html.parser',
                            parse_only=bs.SoupStrainer('div'))
    txt = soup.text
    txt = re.findall("[A-Za-z]+", txt)
    txt = " ".join(txt)
    return jsonify(result=txt)
Example No. 24
def gettitleinfo(page_request):
    table_strain = bs4.SoupStrainer(style="width:753px;")
    table_soup = bs4.BeautifulSoup(page_request.data,
                                   'lxml',
                                   parse_only=table_strain)
    titleinfo = table_soup.text \
        .replace("Publisher: ", "|") \
        .replace("Title ID: ", "|") \
        .replace("Serial: ", "|") \
        .replace("\n", "") \
        .strip()
    return titleinfo
Example No. 25
def get_error_data(html: str, url: str) -> Tuple[Optional[int], Optional[str]]:
    """Get error code and message from a request."""
    code = None
    try:
        code = int(_util.get_url_parameter(url, "e"))
    except (TypeError, ValueError):
        pass

    soup = bs4.BeautifulSoup(
        html, "html.parser", parse_only=bs4.SoupStrainer("div", id="login_error")
    )
    return code, soup.get_text() or None
Example No. 26
def getcheatdesc(page_request):
    hdr_strain = bs4.SoupStrainer(class_="arcode-header")
    hdr_soup = bs4.BeautifulSoup(page_request.data,
                                 'lxml',
                                 parse_only=hdr_strain).find_all('a')
    i = 0
    q = ""
    while i < len(hdr_soup):
        p = re.search("(?<=>)(.*)(?=<)", str(hdr_soup[i]))
        q += str(p.group()) + "|"
        i += 2
    cheatdesc = q.rstrip("|")
    return cheatdesc
Example No. 27
def get_error_data(html, url):
    """Get error code and message from a request."""
    try:
        code = _util.get_url_parameter(url, "e")
    except IndexError:
        code = None

    soup = bs4.BeautifulSoup(
        html,
        "html.parser",
        parse_only=bs4.SoupStrainer("div", id="login_error"),
    )
    return code, soup.get_text() or None
Example No. 28
def getFormInputs(session, url):
    foundInputs = list()
    html = session.get(url).text
    soup = bs4.BeautifulSoup(html,
                             "html.parser",
                             parse_only=bs4.SoupStrainer('input'))
    inputLines = soup.prettify()
    for line in inputLines.splitlines(keepends=False):
        curLine = line.strip()
        #check if it is an input opening tag that is not a submit button
        if curLine.startswith("<input") and 'type="submit"' not in curLine:
            foundInputs.append(curLine)

    return foundInputs
Example No. 29
def download(domain):
    if not path.isdir(base_dir):
        mkdir(base_dir)

    url = build_url(domain)
    soup = bs4.BeautifulSoup(requests.get(url).content,
                             'html.parser',
                             parse_only=bs4.SoupStrainer(
                                 ['p', 'a', 'ul', 'ol', 'li']))

    text = build_text(soup)
    with open(path.join(base_dir, build_filename(url)), 'w') as f:
        f.write(text)

    return text
Example No. 30
def getcheatcodes(page_request):
    cheat_strain = bs4.SoupStrainer(class_="arcode-float")
    cheat_soup = bs4.BeautifulSoup(page_request.data,
                                   'lxml',
                                   parse_only=cheat_strain)
    cheatcodes = str(cheat_soup) \
        .replace("<!DOCTYPE HTML>", "") \
        .replace("</div>", "") \
        .replace('<div class="arcode-float">', "") \
        .replace("<textarea readonly=\"\">", "") \
        .replace("</textarea>", "|") \
        .replace("\r", "") \
        .replace("\n", " ") \
        .rstrip("|") \
        .strip()
    return cheatcodes