Example No. 1
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)%22%7D"
    }
    url = "https://www.ynet.co.il/home/0,7340,L-8,00.html"

    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('div',
                         {"class": "str3s str3s_small str3s_type_small"})
    Titles = soup.find_all('div', {"class": "title"})
    TitlesText = []
    for title in Titles:
        t = title.text
        TitlesText.append(t)

    i = 0
    # titles already stored, so the loop over News below can stop at the
    # first previously scraped article
    existing_titles = []
    for article in Headline.objects.all():
        existing_titles.append(article.title)

    for article in News:
        main = article.find_all('a')[0]

        link = main['href']
        image_src = str(main.find('img')['src']).split(" ")[0]

        if TitlesText[i] in existing_titles:
            break

        if "https" in link:
            link2 = link
        else:
            link2 = "https://www.ynet.co.il/" + link

        link2 = link2.replace('#autoplay', '')
        articleContent = session.get(link2, verify=False).content
        print(link2)
        soup = BSoup(articleContent, "html.parser")

        new_headline = Headline()

        ok = "פורסם:"  # Hebrew label meaning "Published:"
        #header = soup.find_all('div', {"class":"element B3 ghcite noBottomPadding"})[0]
        dates = soup.find_all('span', string=ok)
        print(dates)

        new_headline.date = dates[1].text
        new_headline.title = TitlesText[i]
        new_headline.url = link2
        new_headline.image = image_src
        #if (new_headline.date != 'error#'):
        #    new_headline.save()
        new_headline.save()
        i = i + 1

    return redirect("../")
Example No. 2
def get_query_results(url):
    # Cargo query results arrive as a list under 'cargoquery'; each entry's
    # fields sit under its 'title' key.
    results = get_page(url)['cargoquery'][0]['title']
    # For every field: strip/unescape the HTML once, turn any remaining <br>
    # tags into newlines, strip markup a second time, then collapse blank lines.
    return {
        k: re.sub(
            '\n+', '\n',
            BSoup(re.sub('<br[^>]*>', '\n',
                         BSoup(results[k], 'lxml').text), 'lxml').text)
        for k in results
    }
Example No. 3
def main(postcodes):
    
    uklogin_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "DeliveryStatus/ukmail_login.json")
    
    with open(uklogin_path) as ukmail_login:    
        ukmail_login = json.load(ukmail_login)
    
    login_url = 'https://iconsign.ukmail.com/iconsignv5/Login.aspx'
    
    with requests.Session() as s:
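        # The login page is an ASP.NET form: scrape the hidden _VIEWSTATE_Login
        # value from the GET response and echo it back in the login POST.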
        get_viewstate = s.get(login_url)
        get_soup = BSoup(get_viewstate.content, 'html.parser')
        viewstate_login = get_soup.find('input', id='_VIEWSTATE_Login').get('value')
        
        login_payload = {
          '_VIEWSTATE_Login' : viewstate_login,
          '__VIEWSTATE' : '',
          'btnLogin': '******',
          'txtUsername' : ukmail_login['Username'],
          'txtPassword' : ukmail_login['Password']
        }

        r = s.post(login_url, data=login_payload)
        
        consignment_data = []
        
        for i in range(30):
            consignments_url = 'https://iconsign.ukmail.com/iconsignv5/FindConsignments.aspx?pn=%d' % (i + 1)
            resp = s.get(consignments_url)

            soup = BSoup(resp.content, 'html.parser')
            consignment_list = soup.find('table', id='ctl00_mainContent_consignmentGridView')
            
            for consignment in consignment_list.find_all('tr'):
                data = []
                
                for consignment_td in consignment.find_all('td'):
                    data.append(consignment_td.text)

                for postcode in postcodes:
                    try:
                        if postcode in data:
                            consignment_data.append(data)
                    except Exception:
                        # Skip the non-order <td> element at the start of each table
                        pass

            if len(consignment_data) == len(postcodes):
                break
    
    if consignment_data:
        for consignment in consignment_data:
            status = get_status(consignment[6])
            print_html(consignment, status)
Example No. 4
def change_img_2(pPokemon):
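    # Fetch the Pokémon's page and read the thumbnail link's alt text,
    # which names the image file.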
    urlCmp = 'https://www.pokepedia.fr/' + str(pPokemon)
    requete = rq.get(urlCmp)
    page = requete.content
    soup = BSoup(page, "lxml")
    recup = soup.find("a", {"class": "image"}).contents[0]
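    # Open the 'Fichier:' (file) description page and return the image URL it links to.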
    imgPage = 'https://www.pokepedia.fr/Fichier:' + str(recup.attrs['alt'])
    requete = rq.get(imgPage)
    page = requete.content
    soup = BSoup(page, "lxml")
    recup = soup.find("div", {"id": "file"}).contents[0]
    imgURL = 'https://www.pokepedia.fr' + str(recup.attrs['href'])

    return imgURL
Example No. 5
def get_definition(word, lang):
    if lang.lower() == 'en':
        definition_url = 'https://www.wordreference.com/definition/' + str(
            word)
    elif lang.lower() == 'es':
        definition_url = 'https://www.wordreference.com/definicion/' + str(
            word)
    else:
        return ['Please provide a valid language']
    content = requests.get(definition_url)
    soup = BSoup(content.text, 'html.parser')
    def_result = soup.find_all('ol')
    if def_result:
        results = []
        no = 1
        for search_result in def_result[:2]:
            results.append(str(no))
            listed_elements = search_result.find_all('li')
            for element in listed_elements[:3]:
                def_string = str(element)
                def_string = re.sub(r'<.*?>|\[.*?\]|:.*', '', def_string)
                results.append(def_string.replace('.', '. \n'))
            results.append('\n')
            no += 1
        results.append('From: ' + str(definition_url))
        return results
    else:
        return ['Word not found']
Example No. 6
def lookup_cik_ticker(ticker):
    import requests
    import sys
    from bs4 import BeautifulSoup as BSoup

    req = requests.get(
        "https://www.sec.gov/cgi-bin/browse-edgar?CIK={:s}&owner=exclude&action=getcompany&Find=Search"
        .format(ticker.lower()))
    ## Check for errors encountered in trying to get that url.
    try:
        req.raise_for_status()
    except Exception:
        print(" -- {}:\n\t\t{}".format(sys.exc_info()[0], req.url))
        return None
    soup = BSoup(req.content, "lxml")
    ## Search for the tag that contains the company name.
    conmTag = soup.find("span", {"class": "companyName"})
    if not conmTag:
        print(
            "Unable to find the company name for ticker {:s}.".format(ticker))
        return None
    ## Search for the a-ref tag that links to "all company filings". Its text contains the CIK.
    atags = soup.findAll("a")
    atagCik = None
    for t in atags:
        if "see all company filings" in t.text:
            atagCik = t
    if not atagCik:
        print("Unable to find the a-ref tag with the CIK for ticker {:s}.".
              format(ticker))
        return None
    cik = atagCik.text.split(" ")[0]
    conm = conmTag.text.split("CIK")[0].strip()
    return (str(cik), ticker, str(conm))
Example No. 7
def post_request(query, session):
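    ## form data for the BOLD COX1 identification request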

    seq_data = {
        'tabtype': 'animalTabPane',
        'historicalDB': '',
        'searchdb': 'COX1',
        'sequence': query
    }

    ## send search request
    r = session.post(
        'https://boldsystems.org/index.php/IDS_IdentificationRequest',
        data=seq_data,
        timeout=300)

    ## extract Top20 table links from the BOLD Result page
    soup = BSoup(r.text, 'html5lib')
    data = soup.find_all('span', style='text-decoration: none')
    data = ['http://boldsystems.org' + tag.get('result') for tag in data]

    ## return the data
    return data
Example No. 8
def login(username, password, certificate, remember=False):
    ## start a new html session
    session = requests_html.HTMLSession(verify=certificate)

    ## data to push into the post request
    data = {
        'name': username,
        'password': password,
        'destination': 'MAS_Management_UserConsole',
        'loginType': ''
    }

    ## send a post request to log into boldsystems.org
    session.post('https://boldsystems.org/index.php/Login', data=data)

    ## test if the login was successful
    url = session.get('https://boldsystems.org/')
    soup = BSoup(url.text, 'html.parser')
    content = soup.find(class_='site-navigation nav navbar-nav')
    tags = content.find_all('a')
    if tags[5].text != 'Log out':
        sg.popup('Unable to login.\nPlease check your userdata.')
    else:
        sg.popup('Login successful.')
        ## save userdata only if login is successful and mark is set
        if remember:
            userdata = {"username": username, "password": password}
            abs_path = os.path.dirname(__file__)
            rel_path = os.path.join(abs_path, 'data/userdata')
            json.dump(userdata, open(rel_path, 'w'))

        ## return the session, not necessary for this check but
        ## useful if you want to do other things with the login
        return session
Example No. 9
def req_pkg_details(url, info, proxy_url):
    try:
        headers['User-Agent'] = random.choice(user_agents)
        time.sleep(3)
        r = requests.get(url, headers=headers, proxies=proxy_url)
        soup = BSoup(r.text, 'html.parser')
        p_tags = soup.find_all(
            'p', attrs={'class': compile('fw6 mb3 mt2 truncate black-80 f4')})
        ul_tag = soup.find('ul', attrs={'class': 'list pl0 cf'})
        repo_link = p_tags[3].a['href']
        repo_api_link = urljoin(github_api, urlparse(repo_link).path)
        pkg_version = p_tags[0].text
        pkg_license = p_tags[1].text
        pkg_homepage = p_tags[2].a['href']
        pkg_repo = {'main': repo_link, 'api': repo_api_link}
        pkg_collaborator = [
            handle_author_info(a['href']) for a in ul_tag.find_all('a')
        ]
        pkg_last_update = soup.find('time').text
        pkg_name = info['pkg_name']

        logger.info('StatusCode:(' + str(r.status_code) + ')  ' + 'Package: ' +
                    pkg_name + ' --- ' + url)
        # save the result to MySQL
        save_to_mysql(pkg_name, url, pkg_version, dumps(info['pkg_author']),
                      pkg_license, pkg_homepage, pkg_last_update,
                      dumps(info['pkg_judge']), dumps(pkg_collaborator),
                      dumps(pkg_repo))
    except Exception as e:
        logger.error('Error(150):' + str(e))
        return
    return
Example No. 10
def new_mark():
    form = MarkForm()
    if form.validate_on_submit():
        if g.user.murl(form.url.data):
            flash('Mark with this url ({}) already '
                  'exists'.format(form.url.data),
                  category='error')
            return redirect(url_for('marks'))
        m = Mark()
        form.populate_obj(m)
        m.owner_id = g.user.id
        m.created = datetime.utcnow()
        if form.tags.data:
            m.tags = ' '.join(
                t.strip() for t in form.tags.data.strip().split(',')).lower()
        m.clicks = 0
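        # If no title was supplied, fall back to the page's own <title> tag.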
        if not form.title.data:
            soup = BSoup(urlopen(form.url.data), 'html.parser')
            m.title = soup.title.string
        db.session.add(m)
        db.session.commit()
        flash('New mark {} added'.format(m.title), category='info')
        return redirect(url_for('marks'))
    if request.args.get('url'):
        form.url.data = request.args.get('url')
    if request.args.get('title'):
        form.title.data = request.args.get('title')
    if request.args.get('type') == 'feed':
        form.type.data = 'feed'
    return render_template('mark/new.html', title='New mark', form=form)
Example No. 11
def scrape_article(request):
    if request.method == 'POST':
        url = request.POST['URL']
        session = requests.Session()
        session.headers = {
            "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
        }
        content = session.get(url, verify=True).content
        soup = BSoup(content, "html.parser")
        article = soup.find('meta', {"name": "description"})['content']
        title = soup.title.string
        images = soup.find('img')
        print("IMAGES ", images)
        print('                                 ')
        print('                                 ')
        print('                                 ')

        if images is not None:
            if images.has_attr('data-srcset'):
                txt = str(soup.find('img')['data-srcset'])
                img_list = re.findall("(?<=https).*?(?=jpg)", txt)
                img_list = 'https' + img_list[1] + 'jpg'
                print('image link: ', img_list)
        print('                                 ')
        print('                                 ')
        print('                                 ')

        print('article', article)
        print('title', title)

    return redirect("news")
Example No. 12
def get_ip_lists_from_66():
  r = requests.get(initial_url)
  soup = BSoup(r.text, 'html.parser')
  c = soup.find_all('a', href=re.compile('areaindex'))
  for alink in c:
    c_link = urljoin(initial_url, alink.get('href'))
    get_details(c_link)
Example No. 13
    def __get_redemption_form(self, code, platform):
        """Get Form data for code redemption"""
        the_url = "{}/code_redemptions/new".format(base_url)
        status_code, token = self.__get_token(the_url)
        if not token:
            _L.debug("no token")
            return False, status_code, "Could not retrieve Token"

        r = self.client.get("{base_url}/entitlement_offer_codes?code={code}"
                            .format(base_url=base_url, code=code),
                            headers=json_headers(token))

        _L.debug("{} {} {}".format(r.request.method, r.url, r.status_code))
        soup = BSoup(r.text, "html.parser")
        if not soup.find("form", class_="new_archway_code_redemption"):
            return False, r.status_code, r.text.strip()

        inp = soup.find_all("input", attrs=dict(name="authenticity_token"))
        form_code = soup.find_all(id="archway_code_redemption_code")
        check = soup.find_all(id="archway_code_redemption_check")
        service = soup.find_all(id="archway_code_redemption_service")

        ind = None
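        # Pick the redemption form whose service <input> value matches the requested platform.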
        for i, s in enumerate(service):
            if platform in s["value"]:
                ind = i
                break
        if ind is None:
            return False, r.status_code, "This code is not available for your platform"

        form_data = {"authenticity_token": inp[ind]["value"],
                     "archway_code_redemption[code]": form_code[ind]["value"],
                     "archway_code_redemption[check]": check[ind]["value"],
                     "archway_code_redemption[service]": service[ind]["value"]}
        return True, r.status_code, form_data
Example No. 14
 def get_retried_jobs(self, workflow):
     retry_workflow_search_url = f"https://app.cloudsnap.com/workflow_instances?utf8=%E2%9C%93&utf8=%E2%9C%93&workflow_id={workflow.workflow_option}&q%5Bc%5D%5B0%5D%5Ba%5D%5B0%5D%5Bname%5D=relaunched&q%5Bc%5D%5B0%5D%5Bp%5D=true&q%5Bc%5D%5B0%5D%5Bv%5D%5B0%5D%5Bvalue%5D=true"
     self.get_by_url(retry_workflow_search_url)
     bs_obj = BSoup(self.driver.page_source, "html.parser")
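     # Each row's first cell holds the job name with the entity in parentheses; the second holds the run timestamp.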
     try:
         rows = bs_obj.find_all("table")[0].find("tbody").find_all("tr")
         for row in rows:
             cells = row.find_all("td")
             if len(cells) == 0:
                 continue
             job_name = cells[0].get_text()
             entity_name = job_name[job_name.find("(") +
                                    1:job_name.find(")")]
             string_datetime = time.strptime(cells[1].get_text(),
                                             "%m/%d/%Y %H:%M:%S %Z")
             job_date = datetime.datetime.fromtimestamp(
                 time.mktime(string_datetime))
             week = datetime.datetime.now() - datetime.timedelta(days=7)
             if job_date < week:
                 continue
             job_state = "Retried"
             workflow_url = ("https://app.cloudsnap.com" +
                             cells[0].find_all("a", href=True)[0]["href"])
             retry = RetryJobs()
             retry.timestamp = job_date
             retry.url = workflow_url
             retry.state = job_state
             retry.name = job_name
             retry.entity = entity_name
             db.session.add(retry)
             db.session.commit()
         return StepFailure.query.all()
     except Exception as e:
         print(e)
         return None
Example No. 15
    def _parse_personal_info(self, personal_info, dic):

        soup = BSoup(personal_info, "lxml")
        for th, td in zip(soup.find_all('th'), soup.find_all('td')):
            if th.get_text() == 'IELTS:':
                dic['test_score']['IELTS'] = td.get_text().strip()
            if th.get_text() == 'TOEFL:':
                dic['test_score']['TOEFL'] = td.get_text().strip()
            if th.get_text() == 'GRE:':
                dic['test_score']['GRE'] = td.get_text().strip()
            if th.get_text() == 'SAT:':
                dic['test_score']['SAT'] = td.get_text().strip()
            if th.get_text() == 'GMAT:':
                dic['test_score']['GMAT'] = td.get_text().strip()
            if th.get_text() == 'ACT:':
                dic['test_score']['ACT'] = td.get_text().strip()
            if th.get_text() == 'LSAT:':
                dic['test_score']['LSAT'] = td.get_text().strip()
            if th.get_text() == 'MCAT:':
                dic['test_score']['MCAT'] = td.get_text().strip()
            if th.get_text() == 'sub:':
                dic['test_score']['sub'] = td.get_text().strip()
            if th.get_text() == '本科专业:':  # undergraduate major
                dic['current_major'] = td.get_text().strip()
            if th.get_text() == '其他说明:':  # additional notes
                dic['notes'] = td.get_text().strip()
            if th.get_text() == '本科学校档次:':  # tier of undergraduate school
                dic['current_school'] = td.get_text().strip()
            if th.get_text() == '本科成绩和算法、排名:':  # undergraduate GPA, grading scheme and ranking
                dic['gpa'] = td.get_text().strip()
        return dic
Example No. 16
def get_sensors_status(config, driver):

    sensors = list()
    sensor_link = driver.find_element_by_css_selector('a#nav-link-sensors')
    logger.info('Visiting sensors page link...')

    driver.get(sensor_link.get_attribute('href'))
    logger.info('Waiting for sensors page...')

    res = wait_for_element(driver, '#table-sensors-list', timeout=15)
    if res and res.get('exit'):
        message = 'Giving up after 3rd retry ' \
            'of waiting for element.'
        logger.info(message)
        return sensors

    logger.info('Finding disconnected sensors from page...')
    soup = BSoup(driver.page_source, 'html.parser')
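    # Each sensor row exposes its id in the element id and its status, IP and name in dedicated columns.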
    for row in soup.select('#table-sensors-list tr.result-row'):
        _id = row.get('id').replace('result-row-', '')
        text = row.select('.result-column-sensor-status')[0] \
            .get_text().strip('\n').split('\n')[0].strip()
        ip = row.select('.result-column-sensor-ip')[0] \
            .get_text().strip('\n').split('\n')[0].strip()
        name = row.select('.result-column-sensor-name')[0] \
            .get_text().strip('\n').split('\n')[0]
        sensors.append({'id': _id, 'name': name, 'text': text, 'ip': ip})

    return sensors
Example No. 17
def fetch_depts():
    """
    Fetch list of departments from the site
    :return: list of departments at UT Austin
    :rtype: list[str]
    """
    c_html = fetch_html('https://registrar.utexas.edu/staff/fos')

    if c_html is None:
        return []

    c_soup = BSoup(c_html, "html.parser")

    dept_dl_group = c_soup.find("div", {"class": "field body"}).findAll("dl")
    dept_abrs = [dt.text.strip() for dl in dept_dl_group for dt in dl.findAll("dt")]
    dept_names = [
        dd.text.strip().replace('-', ' ') 
        for dl in dept_dl_group 
        for dd in dl.findAll("dd")
        ]
    dept_names = [titlecase(name) for name in dept_names]

    if len(dept_abrs) != len(dept_names):
        # print("Unexpected Error for Dept: number of abr does not equal number of names. Failed fetch")
        return None
    
    depts = [(dept_abrs[i], dept_names[i]) for i in range(len(dept_abrs))]

    return depts
Example No. 18
def fravega_check(url):
    bs_obj = BSoup(requests.get(url).content, "lxml")
    quantity = int(
        bs_obj.find("h4", {
            "class": "even"
        }).text.split(" ")[1].strip(string.punctuation))
    return quantity
Example No. 19
def fetch_prof_info(depts, sem="spring", year=2020):

    f_profs = []

    # fetching courses for each department
    for dept in depts:

        c_html = fetch_html(get_course_url(sem=sem, year=year, dept=dept))

        if c_html is not None:

            c_soup = BSoup(c_html, "html.parser")

            courses = c_soup.findAll("tr", {"class": ["tboff", "tbon"]})

            # fetching information for each course in the department
            for course in courses:

                info = course.findAll("td")

                my_info = collapse_prof_info(info, profs=f_profs)

                if my_info is not None:
                    f_profs.append(my_info)

    return f_profs
Example No. 20
def get_apt_info(htmlpage):

    apt = {}
    soup = BSoup(htmlpage.text, 'html.parser')
    res = soup.find('div', {'class': 'col-md-4 detalhes-imovel'})
    infos = res.find_all('p')
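    # Each <p> holds a 'Label value' pair; labels are Portuguese listing fields
    # (Código = code, Finalidade = purpose, Bairro = neighbourhood,
    #  Dormitórios = bedrooms, Cozinha = kitchen, Lavanderia = laundry).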

    apt_code = None
    for i in infos:
        text = i.text.strip()
        stext = text.split()
        first_part = stext[0]
        sec_part = ' '.join(stext[1:])

        if 'Código' in text:
            apt_code = sec_part
        elif 'R$' in text:
            apt['preco'] = float(sec_part.replace(',', '.'))
            #teste = 1
        elif 'Finalidade' in text:
            apt['finalidade'] = sec_part
        elif 'Tipo' in text:
            apt['tipo'] = sec_part
        elif 'Bairro' in text:
            apt['bairro'] = sec_part
        elif 'Dorm' in text:
            apt['Dormitorios'] = find_value(first_part)
        elif 'Cozinha' in text:
            apt['Cozinha'] = find_value(first_part)
        elif 'Lavanderia' in text:
            apt['Lavanderia'] = find_value(first_part)
    return apt_code, apt
Example No. 21
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"
    content = session.get(url, verify=True).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('article',
                         {"class": "sc-1pw4fyi-7 gDJTEP js_post_item"})
    for article in News:
        main = article.find_all('a')[0]
        title = article.find_all('h4')[0]
        link = main['href']
        images = main.find('img')
        if images is not None:
            if images.has_attr('srcset'):
                #print(images)
                image_src = str(main.find('img')['srcset']).split(".jpg")[0]
                print('title: ', title.text)
                print('link: ', link)

                titlet = str(title.text)
                image_src = image_src + '.jpg'
                print('image_src: ', image_src)
                if link is not None and image_src is not None and title is not None:
                    new_headline = Headline(title=titlet,
                                            image=image_src,
                                            url=link)
                    new_headline.save()
    return redirect('news')
Example No. 22
    def get_proxy(self, ind=True):
        if not ind:
            return None
        exe_path=r"/usr/local/bin/chromedriver"
        chrome_options = Options()  
        chrome_options.add_argument("--headless")
        driver=webdriver.Chrome(executable_path=exe_path, chrome_options=chrome_options)

        driver.get('https://free-proxy-list.net/')
#        driver.find_element_by_xpath("//*[@class='ui-state-default']//*[text()='US']").click()
        driver.find_element_by_xpath("//*[@class='ui-state-default']//*[text()='anonymous']").click()
        driver.find_element_by_xpath("//*[@class='hx ui-state-default']//*[text()='yes']").click()
        html = driver.page_source
        
        bs_obj = BSoup(html, 'html.parser')
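        # The first HTML table holds the proxy rows; the first two columns are IP and port.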
        rows = bs_obj.find_all('table')[0].find('tbody').find_all('tr')
    
        trans = []
        
        for row in rows:
            t=[]
            cells = row.find_all('td')
            for i in range(5):
                t.append(cells[i].get_text())
            trans.append(t)
    
        df_trans = pd.DataFrame(trans)
        PROXIES = {}
        PROXIES['http'] = PROXIES['https'] = 'http://%s:%s' % (df_trans.iloc[0, 0], df_trans.iloc[0, 1])
        driver.close()
        print(PROXIES)
        return PROXIES
Example No. 23
def garbarino_check(url):
    bs_obj = BSoup(requests.get(url).content, "lxml")
    quantity = int(
        bs_obj.find("li", {
            "class": "breadcrumb-item--active"
        }).span.text.strip(string.punctuation).split(" ")[0])
    return quantity
Example No. 24
 def get_curr_trans(self, exchange_add):
     logger.info('get exchange transfers for %s' % exchange_add)
     bs_obj = BSoup(requests.get(exchange_add, proxies=self.PROXIES).content, 'html.parser')
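     # The transfer rows live in the first <table> on the page.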
     rows = bs_obj.find_all('table')[0].find('tbody').find_all('tr')
 
     trans = []
 
     for row in rows:
         cells = row.find_all('td')
         block = cells[1].get_text()
         age = cells[2].get_text()
         fromadd = cells[3].get_text()
         toadd = cells[5].get_text()
         val = cells[6].get_text()
 
         trans.append([
             block, age, fromadd, toadd, val
         ])
 
     df_trans = pd.DataFrame(trans)
     df_trans.columns = ['block', 'age', 'from', 'to', 'value']
     df_trans['time'] = df_trans['age'].apply(lambda x: datetime.today() - self.get_time(x, timedelay=[0, 0, 0]))
     df_trans['value'] = df_trans['value'].apply(lambda x: x.split(' Ether')[0])
     df_trans['value'] = df_trans['value'].apply(pd.to_numeric, errors='coerce')
     return df_trans
Example No. 25
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"

    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('article', {"class": "js_post_item"})
    for article in News:
        title = article.find_all('a', {"class": "js_link"})[-1].text
        link = article.find("a", {"class": "js_link"}).attrs["href"]
        image_src = article.find("a", {"class": "js_link"}).find("img")
        if image_src:
            try:
                image_src = image_src.attrs["srcset"]
                image_src = image_src.split(" ")[-4]
            except Exception:
                # fall back to the data-expanded-srcset attribute when srcset is missing
                try:
                    image_src = image_src.attrs["data-expanded-srcset"]
                    image_src = image_src.split(" ")[-4]
                except Exception:
                    continue
        else:
            continue
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("../")
Example No. 26
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://metrowatch.com.pk/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all(
        'div', {
            "class":
            "pb-5 mb-10 block-post-smallrow col-lg-7 col-md-7 col-sm-16 col-xs-16 pl-0 pr-0"
        })
    #News = soup.find_all('li')
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        image_src = str(main.find('img')['srcset']).split(" ")[-4]
        title = article.find_all('a')[1]
        if not Headline.objects.filter(url=link).exists():
            new_headline = Headline()
            new_headline.title = title.text
            new_headline.url = link
            new_headline.image = image_src
            new_headline.save()
    messages.info(request, "Your records were saved successfully")
    return redirect("../")
Example No. 27
def return_reading(hexagram_number, hex_cast_number):
    global hex1_lines
    global hex2_lines
    global hex1_url
    global hex2_url
    res = get(f'https://divination.com/iching/lookup/{hexagram_number}-2/')
    soup = BSoup(res.text, 'html.parser')
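    # The page heading holds the hexagram name; .entry-content paragraphs hold the reading and .movinglines the changing-line texts.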
    hex_name = soup.select('.entry-header > h1')[0].text
    text = soup.select('.entry-content > p')
    moving_lines_head = soup.select('.movinglines > h4')
    moving_lines_body= soup.select('.movinglines > p')
    for i in hexagrams[hexagram_number]:
        hex1_lines.append(print_line(line=i))
    if hex_cast_number == 1:
        hex1_url = res.url
        print(f'{hex_name} \n')
        for i in text:
            print(f'{i.text} \n\n')
        for head,body in list(zip(moving_lines_head, moving_lines_body))[::-1]:
            print(f'\t{head.text}')
            print(f'{body.text}')
        print('\n')
    else:
        print(f'{hex_name} \n')
        for i in text:
            print(f'{i.text} \n')
        hex2_url = res.url
        for i in hexagrams[hexagram_number]:
            hex2_lines.append(print_line(line=i))
Example No. 28
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('article', {"class": "sc-1pw4fyi-5 RkwFH"})
    temp = ''  # fallback image for articles that have no <img> of their own
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        title = article.find('h4', {"class": "sc-1qoge05-0 eoIfRA"}).text
        News3 = article.find('img', {"class": "dv4r5q-2 iaqrWM"})
        if News3 is None:
            image_src = temp
        else:
            image_src = News3['srcset'].split(' ')[0]
            temp = image_src
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()

    return redirect("../")
Example No. 29
 def __init__(self, url, encoding='utf-8', ip=None, timeout=8):
     print(url)
     self.url = url
     self.page = requests.get(url)
     self.page.encoding = encoding
     self.html = self.page.text
     self.bsObj = BSoup(self.html, 'html.parser')
Example No. 30
    def extract_kqed_info(self, **kwargs):
        request = requests.get(
            "https://projects-api.kqed.org/posts/news?&page[size]=100&page[from]=0"
        )

        content = json.loads(request.text)
        info = {}
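        # Key the results by each post's Disqus URL, one metadata dict per post.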
        for post in content["data"]:

            info[post["attributes"]["disqusUrl"]] = {}
            cur_dict = info[post["attributes"]["disqusUrl"]]
            cur_dict["date"] = datetime.date.today()  # need to convert from epoch
            cur_dict["source"] = "KQED"
            cur_dict["source_url"] = "https://www.kqed.org/"
            cur_dict["status_code"] = request.status_code
            cur_dict["title"] = post["attributes"]["title"]
            #            cur_dict["author"] = "unknown"
            result = self.browser.open(post["attributes"]["disqusUrl"])
            page = self.browser.get_current_page()
            # author = page.find("span", {"class":"src-routes-Site-routes-Post-components-Post-___Post__post_Author___3vn-d"})
            # if author is not None:
            #     cur_dict["author"] = author.text
            cur_dict["media"] = post["attributes"].get("nprAudio", None)
            text = post["attributes"].get("content", None)
            soup = BSoup(text, "lxml")
            text = soup.get_text()
            cur_dict["text"] = text.encode('ascii', 'ignore').decode("utf-8")
            cur_dict["labels"] = get_labels(cur_dict["text"],
                                            cur_dict["title"])
        return info