Example #1
from urllib.request import urlopen

from bs4 import BeautifulSoup


def get_response_2():
    raw_html = urlopen(
        'https://mutualfund.wishfin.com/reliance-mutual-fund').read()
    html = BeautifulSoup(raw_html, 'html.parser')
    response = []
    for a in html.select('li'):
        data = dict()
        link = a.select("a")

        if len(link) > 0:
            link = link[0]
            try:
                # keep only links that point at Reliance fund pages
                if 'reliance' in link['href']:
                    data['url'] = link['href']
                    data['fund_name'] = link.text
            except Exception:
                pass
        # nested <li> elements carry the return percentages
        percent = a.select("li")
        if len(percent) > 1:
            percent = percent[1]
            span = percent.select("span")
            if len(span) > 1:
                data["1Y"] = float(span[1].text.split(" ")[0])
                data["3Y"] = 0
            else:
                print("NA")
        if len(data) > 0:
            response.append(data)
    response.pop(-1)  # the last <li> is not a fund entry
    return response
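
A minimal call sketch (assuming the page still serves the same markup); each returned dictionary may carry url, fund_name, 1Y and 3Y keys:

if __name__ == '__main__':
    for fund in get_response_2():
        print(fund.get('fund_name'), fund.get('1Y'))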
Example #2
def searchGenders(browser: mc.Browser, gender: str, page=1) -> None:
    """Collect book links from a Goodreads shelf page into links.html.

    `gender` is the shelf (genre) name. Assumes module-level imports such
    as `import mechanize as mc` and `from bs4 import BeautifulSoup as bs4`.
    """
    url = "https://www.goodreads.com/shelf/show/" + str(gender) + "?page=" + str(page)
    res = browser.open(url)
    html = bs4(res.read(), "html.parser")

    # the pagination block exposes a max_num_pages attribute; the last
    # numbered link (excluding the "next" control) is the page count
    pageCount = html.select("div[max_num_pages]")
    pageCount = pageCount[0].select(":not(:last-child)")
    maxPage = int(pageCount[len(pageCount) - 1].get_text())

    with open("links.html", "a") as linkdata:
        for i in range(1, maxPage + 1):
            print("===> getting page {}\n".format(i))
            url = "https://www.goodreads.com/shelf/show/" + str(gender) + "?page=" + str(i)
            res = browser.open(url)

            html = bs4(res.read(), "html.parser")
            booklinks = html.find_all('a', {'class': 'bookTitle'})

            for link in booklinks:
                # book hrefs are site-relative, so prefix the site root
                linkdata.write("<a href='https://www.goodreads.com" + str(link['href']) + "'></a>\n")

    print("Ready!")
Example #3
def get_sectors():
    """Scrape sector ids and names from The Star's Market Watch page.

    Assumes module-level imports (from selenium import webdriver,
    from bs4 import BeautifulSoup, import re) and module-level lists
    sector_elements, sector_list and sector_name_list that this
    function appends to.
    """
    # set up browser
    chrome_options = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    browser = webdriver.Chrome(options=chrome_options)
    # browser = webdriver.Chrome() # open web page
    browser.implicitly_wait(10)  # wait for web page to load

    url = 'https://www.thestar.com.my/business/marketwatch/'
    browser.get(url)
    r = browser.page_source
    html = BeautifulSoup(r, 'html.parser')
    # print(html)
    browser.close()

    # sector elements
    htmlPart = html.find(class_=re.compile("stocks"))
    linkPart = [
        x.get_attribute_list('id')
        for x in htmlPart.find_all('a', {"id": True})
    ]
    for i in range(len(linkPart)):
        sector_elements.extend(linkPart[i])
    # print(linkPart)
    # print(sector_elements)
    # print(len(sector_elements))

    # sector_list
    sector = html.find_all('strong')
    for i in sector:
        sector_list.append(i.text.strip(':'))
        # print(i.text)
    # print(sector_list)

    # sector_name_list
    sector_n = html.select('div.text a')
    for i in sector_n:
        sector_name_list.append(i.text)
    # print(sector_name_list)

    return
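
A sketch of the module-level setup the function relies on; the imports and empty lists mirror the names used in the body above:

import re

from bs4 import BeautifulSoup
from selenium import webdriver

sector_elements = []
sector_list = []
sector_name_list = []

get_sectors()
print(len(sector_elements), sector_list[:3], sector_name_list[:3])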
Example #4
    def parseByTag(self, html, tag):
        '''Parse HTML by tag.

        Keyword arguments:
        html -- BeautifulSoup object
        tag  -- tag name or CSS selector string

        Requires `from string import punctuation` at module level.
        '''
        total_words = 0
        words = []

        for t in html.select(tag):
            for word in t.text.split(' '):
                for w in word.split('\n'):
                    # strip trailing punctuation and spaces
                    w = w.rstrip(punctuation).rstrip(' ')
                    if 1 < len(w) < 255:
                        total_words += 1
                        words.append(w)

        return total_words, words
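
The method is essentially one html.select(tag) call followed by whitespace and punctuation cleanup; a small standalone sketch of that logic on made-up markup:

from string import punctuation

from bs4 import BeautifulSoup

html = BeautifulSoup("<p>Hello, world!</p><p>Another\nparagraph.</p>", "html.parser")

words = []
for t in html.select("p"):  # same selection parseByTag performs
    for word in t.text.split(' '):
        for w in word.split('\n'):
            w = w.rstrip(punctuation).rstrip(' ')
            if 1 < len(w) < 255:
                words.append(w)

print(len(words), words)  # 4 ['Hello', 'world', 'Another', 'paragraph']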
Example #5
def response(flow):
    # mitmproxy "response" event hook: removes elements matching the CSS
    # selectors in the module-level list `blist` from HTML responses.
    # Assumes `import time` and `from bs4 import BeautifulSoup` at module level.
    content_type = flow.response.headers.get('Content-Type', '')
    if content_type.startswith('text/html'):
        charset = 'utf8'
        if '=' in content_type:
            charset = content_type.split('=')[1]
        print("delete ads:" + content_type + "->" + charset + ": " +
              flow.request.url)
        t1 = time.time()
        html = BeautifulSoup(flow.response.content,
                             "html.parser",
                             from_encoding=charset)
        print("           BeautifulSoup:" + str(round(time.time() - t1, 1)) +
              "sec")
        if html.head:
            delcount = 0
            t1 = time.time()
            for bkey in blist:
                for col in html.select(bkey):
                    col.extract()
                    delcount += 1
            flow.response.content = str(html).encode(charset)
            print("           " + str(round(time.time() - t1, 1)) +
                  "sec: delcount=" + str(delcount))
Example #6
import requests
import lxml.html
from bs4 import BeautifulSoup as BS
import xml.etree.ElementTree as ET
from xml.dom import minidom

r = requests.get('https://www.citrus.ua/smartfony/')
html = BS(r.content, 'html.parser')
k = -1

tree = lxml.html.fromstring(r.text)
content = tree.xpath('//meta[@itemprop="sku"]/@content')

for el in html.select('.product-card__overview'):
    k += 1

    def main():
        ti = el.select('.product-card__name > a')
        pr = el.select('.prices__price')
        hr = ti[0]['href']

        new = ET.Element('all_data')
        phone = ET.SubElement(new, 'phone')
        phone.set('id', content[k])
        name = ET.SubElement(phone, 'name')
        name.text = ti[0]['title']
        price = ET.SubElement(phone, 'price')
        price.text = pr[0].text
        characteristics = ET.SubElement(phone, 'characteristics')

        r2 = requests.get('https://www.citrus.ua' + hr)
Example #7
import requests
import lxml.html
from bs4 import BeautifulSoup as BS
import xml.etree.ElementTree as ET
from xml.dom import minidom

r = requests.get(
    'https://www.ebay.com/b/Football-Clothing-Shoes-Accessories/159113/bn_1941036'
)
html = BS(r.content, 'html.parser')
z = -1

tree = lxml.html.fromstring(r.text)
shipping = tree.xpath(
    '//span[@class="s-item__shipping s-item__logisticsCost"]/text()')

for vd in html.select('.s-item__wrapper.clearfix'):
    z += 1

    def main():
        n = vd.select('.s-item__title > h3')
        p = vd.select('.s-item__price')
        b = vd.select('.s-item__dynamic.s-item__dynamicAttributes1')
        hr = n[0]['href']

        elem = ET.Element('all_data')
        product = ET.SubElement(elem, 'product')  # attach to the root element built above
        product.set('id', shipping[z])            # z is the loop counter defined above
        name = ET.SubElement(product, 'name')
        name.text = n[0].text                     # the listing title text
        price = ET.SubElement(product, 'price')
        price.text = p[0].text
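
Both this excerpt and the previous one import xml.dom.minidom but stop before serializing the tree; a common way to finish (elem is the root element built above) would be:

import xml.etree.ElementTree as ET
from xml.dom import minidom


def prettify(elem):
    """Return a pretty-printed XML string for an ElementTree element."""
    rough = ET.tostring(elem, encoding='unicode')
    return minidom.parseString(rough).toprettyxml(indent='  ')

# e.g. print(prettify(elem)) after the sub-elements have been attached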
Example #8
def scapper_tsv(n_links, path_html, path_tsv):
    """Parse saved Wikipedia film articles into one .tsv file per article.

    Assumes module-level imports: from bs4 import BeautifulSoup,
    from tqdm import tqdm, import csv.
    """
    # loading bar
    with tqdm(total=n_links) as pbar:

        for article in range(n_links):  # range of files
            try:
                html = BeautifulSoup(
                    open(path_html + "/article_{}.html".format(article)),
                    'html.parser')
            except Exception as e:  # if the article doesn't exist
                print(article, e)
                continue

            title = html.select("h1")[0].text

            # initialize tmp as intro
            tmp = 'intro'
            sections = {'intro': '', 'plot': ''}

            # take all paragraphs section by section and save only intro and plot
            # iterate only over the top-level nodes of the article body
            for section in html.select('div.mw-parser-output > *'):
                if (section.name == 'p' and tmp == 'intro'):
                    sections['intro'] += section.text.strip()

                # change tmp when a new section heading is reached
                if (section.name in ['h2', 'h3']):
                    tmp = section.span['id']

                # take only the sections we are interested in
                if (section.name == 'p' and tmp in [
                        'Plot', 'Plot_summary', 'Premise'
                ]):  # check different names for plot sections
                    sections['plot'] += section.text.strip()

            # skip pages without a Plot section
            if (sections['plot'] == ''):
                print(article, 'No Plot')
                continue

            # dictionary for infobox
            d = {
                'film_name': title,
                'Directed by': 'NA',
                'Produced by': 'NA',
                'Written by': 'NA',
                'Starring': 'NA',
                'Music by': 'NA',
                'Release date': 'NA',
                'Running time': 'NA',
                'Country': 'NA',
                'Language': 'NA',
                'Budget': 'NA'
            }

            # take elem from infobox
            info_box = html.find_all(['th', 'td'])
            for elem in info_box:
                info = elem.text.strip('\n')  # take text from the table
                if info in d:
                    d[info] = info_box[info_box.index(elem) +
                                       1].text.strip('\n')

            # select values in order as a list to save in the .tsv
            ld = list(d.values())
            columns = [
                'title', 'intro', 'plot', 'film_name', 'Directed by',
                'Produced by', 'Written by', 'Starring', 'Music by',
                'Release date', 'Running time', 'Country', 'Language', 'Budget'
            ]
            data = [title, sections['intro'], sections['plot']] + ld[0:]

            # create and save a tsv
            with open(path_tsv + '/article_{}.tsv'.format(article),
                      'w',
                      newline='',
                      encoding='utf-8') as f_output:
                tsv_output = csv.writer(f_output, delimiter='\t')
                tsv_output.writerow(columns)
                tsv_output.writerow(data)

            pbar.update(1)
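
A minimal call sketch; the counts and directory names here are placeholders:

import csv

from bs4 import BeautifulSoup
from tqdm import tqdm

# expects files named article_0.html, article_1.html, ... under path_html
scapper_tsv(n_links=100, path_html='./html', path_tsv='./tsv')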